diff --git a/README.md b/README.md
index 50379d6..57743e1 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,8 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
 ## Table Content
 - [Supported Models](#supported-models-quick-start)
-  - [Onnxruntime Models](./docs/model/onnxruntime_models.md)
+  - [Onnxruntime DirectML Models](./docs/model/onnxruntime_directml_models.md)
+  - [Onnxruntime CPU Models](./docs/model/onnxruntime_cpu_models.md)
   - [Ipex-LLM Models](./docs/model/ipex_models.md)
 - [Getting Started](#getting-started)
   - [Installation From Source](#installation)
@@ -39,7 +40,7 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
 | Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
 | Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
 | Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
-| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
+| Llama-3-8b-chat | 8B | 8192 | [luweigen/Llama-3-8B-Instruct-int4-onnx-directml](https://huggingface.co/luweigen/Llama-3-8B-Instruct-int4-onnx-directml) |
 | Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
 | Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) |
 | Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..f09ffc3
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,89 @@
+# Benchmark
+Benchmark model(s) on different backends and get a statistical analysis of their token in / token out throughput.
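+
+A typical single run looks like the following (the flags are documented in the next section; the model name and repo used here are only illustrative — any supported model works):
+
+```shell
+python ellm_benchmark.py --backend directml --model_name Phi-3-mini-4k-instruct-onnx-directml --model_path EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml --token_in 1024 --token_out 256 --input_token_bias 0 --output_token_bias 0 --loop_count 10
+```
+
+Each run writes its timings to `profile_model_timing/profile_model_timing_<model_name>_<token_in>_<token_out>.log`, which the analysis scripts below read.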
+
+## Benchmark a Model
+To benchmark a model, run the command below with the following arguments:
+* --backend `cpu` | `ipex` | `openvino` | `directml`
+* --model_name `Name of the model`
+* --model_path `Path to the model` | `Model repo ID`
+* --token_in `Number of input tokens (max 2048)`
+* --token_out `Number of output tokens`
+* --input_token_bias `Adjustment applied to the number of input tokens`
+* --output_token_bias `Adjustment applied to the number of output tokens`
+* --loop_count `Number of benchmark iterations`
+
+```shell
+python ellm_benchmark.py --backend <backend> --model_name <model_name> --model_path <model_path> --token_in <token_in> --token_out <token_out> --input_token_bias <input_token_bias> --output_token_bias <output_token_bias> --loop_count <loop_count>
+```
+
+
+## Loop to benchmark the models
+Customise your benchmarking configuration in `loop_ellm_benchmark.py`:
+```python
+# Define the models
+model_names = [
+    # model names
+
+]
+
+# Define the model paths
+model_paths = [
+    # model paths or model repo IDs, in the same order as model_names
+
+]
+
+# Define the token lengths (token_in, token_out)
+token_in_out = [
+    (1024, 1024),
+    (1024, 512),
+    (1024, 256),
+    (1024, 128),
+    (512, 1024),
+    (512, 512),
+    (512, 256),
+    (512, 128),
+    (256, 1024),
+    (256, 512),
+    (256, 256),
+    (256, 128),
+    (128, 1024),
+    (128, 512),
+    (128, 256),
+    (128, 128),
+]
+
+# Choose the backend (keep exactly one uncommented)
+backend = "cpu"
+# backend = "directml"
+# backend = "ipex"
+# backend = "openvino"
+
+# Number of loops
+loop_count = 20
+
+# Input and output token bias
+input_token_bias = 0
+output_token_bias = 0
+```
+```shell
+python loop_ellm_benchmark.py
+```
+
+## Generate a Report (`XLSX`) of a Model's Benchmark
+To generate a report for a model, run the command below with:
+* --model_name `Name of the model`
+```shell
+python analyse_detailed_benchmark.py --model_name <model_name>
+```
+
+## Generate Reports (`XLSX`) of Models' Benchmarks
+List the models you want reports for in `loop_analyse_detailed_benchmark.py`:
+```python
+model_names = [
+    # model names
+
+]
+```
+```shell
+python loop_analyse_detailed_benchmark.py
+```
diff --git a/benchmark/analyse_detailed_benchmark.py b/benchmark/analyse_detailed_benchmark.py
new file mode 100644
index 0000000..ca45d30
--- /dev/null
+++ b/benchmark/analyse_detailed_benchmark.py
@@ -0,0 +1,124 @@
+import os
+import re
+import numpy as np
+import pandas as pd
+import argparse
+
+def extract_data_from_log(log_file):
+    average_tps_list = []
+    prompt_tokens_per_second_list = []
+    new_tokens_per_second_list = []
+    error_count = 0
+    error_state = False
+
+    if not os.path.exists(log_file):
+        print(f"Log file does not exist: {log_file}")
+        return average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count
+
+    with open(log_file, 'r') as file:
+        for line in file:
+            if "ERROR" in line:
+                error_count += 1
+                error_state = True
+                continue
+
+            if "Average tps" in line and error_state == True:
+                error_state = False
+                continue
+
+            if "Average tps" in line:
+                average_tps = float(re.search(r"Average tps: ([\d.]+)", line).group(1))
+                average_tps_list.append(average_tps)
+                continue
+
+            if "Prompt tokens per second" in line:
+                prompt_tokens_per_second = float(re.search(r"Prompt tokens per second: ([\d.]+)", line).group(1))
+                prompt_tokens_per_second_list.append(prompt_tokens_per_second)
+            if "New tokens per second" in line:
+                new_tokens_per_second = float(re.search(r"New tokens per second: ([\d.]+)", line).group(1))
+                new_tokens_per_second_list.append(new_tokens_per_second)
+
+    return average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count
+
+def calculate_statistics(data):
+    data_np = np.array(data)
+    stats = {
+        "std": np.std(data_np, ddof=1),  # Sample standard deviation +
"mean": np.mean(data_np), + "min": np.min(data_np), + "1%": np.percentile(data_np, 1), + "25%": np.percentile(data_np, 25), + "50%": np.percentile(data_np, 50), # Median + "75%": np.percentile(data_np, 75), + "99%": np.percentile(data_np, 99), + "max": np.max(data_np) + } + return stats + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Process log files and generate statistics.") + parser.add_argument('--model_name', type=str, required=True, help='Name of the model') + return parser.parse_args() + +def main(model_name): + token_ins = [128, 256, 512, 1024] + token_outs = [128, 256, 512, 1024] + + statistics = [] + + # Create the profile_model_timing directory if it doesn't exist + log_dir = "profile_model_timing" + os.makedirs(log_dir, exist_ok=True) + + for input_token_length in token_ins: + for output_token_length in token_outs: + log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{input_token_length}_{output_token_length}.log') + average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count = extract_data_from_log(log_file) + + if not average_tps_list and not prompt_tokens_per_second_list and not new_tokens_per_second_list: + # Log file does not exist or is empty, append "-" for each statistical value + statistics.append([ + model_name, input_token_length, output_token_length, + "-", "-", "-", "-", "-", "-", "-", "-", "-", + "-", "-", "-", "-", "-", "-", "-", "-", "-", + "-", "-", "-", "-", "-", "-", "-", "-", "-", + error_count + ]) + else: + min_len = min(len(average_tps_list), len(prompt_tokens_per_second_list), len(new_tokens_per_second_list)) + + if min_len > 0: + prompt_stats = calculate_statistics(prompt_tokens_per_second_list[5:min_len]) + new_token_stats = calculate_statistics(new_tokens_per_second_list[5:min_len]) + average_tps_stats = calculate_statistics(average_tps_list[5:min_len]) + + statistics.append([ + model_name, input_token_length, output_token_length, + prompt_stats["std"], prompt_stats["mean"], prompt_stats["min"], prompt_stats["1%"], prompt_stats["25%"], prompt_stats["50%"], prompt_stats["75%"], prompt_stats["99%"], prompt_stats["max"], + new_token_stats["std"], new_token_stats["mean"], new_token_stats["min"], new_token_stats["1%"], new_token_stats["25%"], new_token_stats["50%"], new_token_stats["75%"], new_token_stats["99%"], new_token_stats["max"], + average_tps_stats["std"], average_tps_stats["mean"], average_tps_stats["min"], average_tps_stats["1%"], average_tps_stats["25%"], average_tps_stats["50%"], average_tps_stats["75%"], average_tps_stats["99%"], average_tps_stats["max"], + error_count + ]) + + # Create a DataFrame + columns = [ + "Model", "Token In", "Token Out", + "Token In / sec std", "Token In / sec mean", "Token In / sec min", "Token In / sec 1%", "Token In / sec 25%", "Token In / sec 50%", "Token In / sec 75%", "Token In / sec 99%", "Token In / sec max", + "Token Out / sec std", "Token Out / sec mean", "Token Out / sec min", "Token Out / sec 1%", "Token Out / sec 25%", "Token Out / sec 50%", "Token Out / sec 75%", "Token Out / sec 99%", "Token Out / sec max", + "Average Token / sec std", "Average Token / sec mean", "Average Token / sec min", "Average Token / sec 1%", "Average Token / sec 25%", "Average Token / sec 50%", "Average Token / sec 75%", "Average Token / sec 99%", "Average Token / sec max", + "No of Fail" + ] + df = pd.DataFrame(statistics, columns=columns) + + # Create the statistics directory if it doesn't exist + output_dir = "statistics" + os.makedirs(output_dir, 
exist_ok=True) + + # Write to Excel + output_file = os.path.join(output_dir, f"{model_name}_statistics.xlsx") + df.to_excel(output_file, index=False) + print(f"Statistics written to {output_file}") + +if __name__ == "__main__": + args = parse_arguments() + main(args.model_name) diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py new file mode 100644 index 0000000..12a2822 --- /dev/null +++ b/benchmark/ellm_benchmark.py @@ -0,0 +1,132 @@ +import sys +import os +import time +import asyncio +import argparse +from loguru import logger + +# Add the 'src' directory to sys.path +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src'))) + +# Import the engine module +from embeddedllm import engine +from embeddedllm import sampling_params + +async def benchmark(model, input_token_length, output_token_length, model_name, input_token_bias=0, output_token_bias=0): + + logger.info(f"Model: {model_name}") + + model.tokenizer.chat_template = "{% for message in messages %}{{ message['content']}}{% endfor %}" # Override + + prompt_text = """ + + """ + # Define the path to the file + file_path = "sampleText.txt" + + # Open the file and read its contents into the variable + with open(file_path, 'r') as file: + prompt_text = file.read() + + input_tokens = model.tokenizer.encode(prompt_text)[:(input_token_length + input_token_bias)] + input_text = model.tokenizer.decode(input_tokens) + print(input_text) + input_tokens = model.tokenizer.encode(input_text) + + PromptInputs = { + "prompt": input_text + } + + sampling_params_config = sampling_params.SamplingParams( + max_tokens=(output_token_length + output_token_bias), + top_p=0.1, + top_k=1, + temperature=1, + repetition_penalty=0.01, + ) + + start = time.perf_counter() + + async def generate(): + results = [] + async for response in model.generate( + inputs=PromptInputs, + sampling_params=sampling_params_config, + request_id="benchmark", + stream=True, + ): + results.append(response) + return results + + response = await generate() + end = time.perf_counter() + + logger.info(response[0]) # Access the generated text from the response + + total_time_taken = end - start + logger.info(f"Total time taken: {total_time_taken:.2f} seconds") + + average_tps = (input_token_length + output_token_length) / total_time_taken + logger.info("Average tps: "+ str(average_tps)) + + + +def main(): + parser = argparse.ArgumentParser(description="Benchmark EmbeddedLLM models.") + parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'npu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, npu, ipex, openvino or directml)') + parser.add_argument('--model_name', type=str, required=True, help='Name of the model') + parser.add_argument('--model_path', type=str, required=True, help='Path to the model or model repo id') + parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)') + parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens') + parser.add_argument('--input_token_bias', type=int, required=False, help='Adjust the input token length') + parser.add_argument('--output_token_bias', type=int, required=False, help='Adjust the output token length') + parser.add_argument('--loop_count', type=int, required=False, help='Adjust the loop count') + + args = parser.parse_args() + + backend = args.backend + model_path = args.model_path + model_name = args.model_name + token_in = args.token_in + token_out = args.token_out + 
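+    # The bias and loop-count flags are optional (declared with required=False and no
+    # default), so they arrive as None when omitted; normalise them below before use.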
+    input_token_bias = args.input_token_bias if args.input_token_bias is not None else 0
+    output_token_bias = args.output_token_bias if args.output_token_bias is not None else 0
+    loop_count = args.loop_count if args.loop_count is not None else 1  # default to a single run
+
+    # Cap the input tokens to 2048 (apply the cap to the value actually used below)
+    if token_in > 2048:
+        print("Input tokens capped to 2048.")
+        token_in = 2048
+
+    # Create the profile_model_timing directory if it doesn't exist
+    log_dir = "profile_model_timing"
+    os.makedirs(log_dir, exist_ok=True)
+
+    log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{token_in}_{token_out}.log')
+
+    # Add the log file to the logger
+    logger.add(log_file, mode='w')
+
+    # Map the chosen backend to the device string expected by the engine
+    if backend == "cpu":
+        device = "cpu"
+    elif backend == "npu":
+        device = "npu"
+    elif backend == "ipex":
+        device = "xpu"
+    elif backend == "openvino":
+        device = "gpu"
+    elif backend == "directml":
+        device = ""
+
+    model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend)
+
+    for _ in range(loop_count):
+        # Run the async function using asyncio.run()
+        asyncio.run(benchmark(model, token_in, token_out, model_name, input_token_bias, output_token_bias))
+
+    # Remove the logger to close the log file
+    logger.remove()
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/loop_analyse_detailed_benchmark.py b/benchmark/loop_analyse_detailed_benchmark.py
new file mode 100644
index 0000000..e01bdda
--- /dev/null
+++ b/benchmark/loop_analyse_detailed_benchmark.py
@@ -0,0 +1,20 @@
+import subprocess
+
+model_names = [
+    # model names
+
+]
+
+
+# Path to the analyse_detailed_benchmark.py script
+analyse_detailed_benchmark_script = "analyse_detailed_benchmark.py"
+
+for model_name in model_names:
+    # Construct the command
+    command = [
+        "python", analyse_detailed_benchmark_script,
+        "--model_name", model_name,
+    ]
+
+    # Execute the command
+    subprocess.run(command)
\ No newline at end of file
diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py
new file mode 100644
index 0000000..f78c50f
--- /dev/null
+++ b/benchmark/loop_ellm_benchmark.py
@@ -0,0 +1,68 @@
+import subprocess
+
+# Define the models
+model_names = [
+    # model names
+
+]
+
+# Define the model paths
+model_paths = [
+    # model paths or model repo IDs, in the same order as model_names
+
+]
+
+# Define the token lengths (token_in, token_out)
+token_in_out = [
+    (1024, 1024),
+    (1024, 512),
+    (1024, 256),
+    (1024, 128),
+    (512, 1024),
+    (512, 512),
+    (512, 256),
+    (512, 128),
+    (256, 1024),
+    (256, 512),
+    (256, 256),
+    (256, 128),
+    (128, 1024),
+    (128, 512),
+    (128, 256),
+    (128, 128),
+]
+
+# Choose the backend (keep exactly one uncommented)
+backend = "cpu"
+# backend = "directml"
+# backend = "ipex"
+# backend = "openvino"
+# backend = "npu"
+
+# Number of loops
+loop_count = 3
+
+# Input and output token bias
+input_token_bias = 0
+output_token_bias = 0
+
+# Path to the ellm_benchmark.py script
+ellm_benchmark_script = "ellm_benchmark.py"
+
+for model_name, model_path in zip(model_names, model_paths):
+    for input_token_length, output_token_length in token_in_out:
+        # Construct the command
+        command = [
+            "python", ellm_benchmark_script,
+            "--backend", backend,
+            "--model_name", model_name,
+            "--model_path", model_path,
+            "--token_in", str(input_token_length),
+            "--token_out", str(output_token_length),
+            "--input_token_bias", str(input_token_bias),
+            "--output_token_bias", str(output_token_bias),
+            "--loop_count", str(loop_count)
+        ]
+
+        # Execute the command
+        subprocess.run(command)
diff --git a/benchmark/sampleText.txt b/benchmark/sampleText.txt
new file mode 100644
index 0000000..3da3fbb
--- /dev/null
+++ b/benchmark/sampleText.txt
@@ -0,0
+1,91 @@ +A large language model (LLM) is a computational model notable for its ability to achieve general-purpose language +generation and other natural language processing tasks such as classification. Based on language models, LLMs acquire +these abilities by learning statistical relationships from vast amounts of text during a computationally intensive +self-supervised and semi-supervised training process.[1] LLMs can be used for text generation, a form of generative AI, +by taking an input text and repeatedly predicting the next token or word.[2] + +LLMs are artificial neural networks that utilize the transformer architecture, invented in 2017. The largest and +most capable LLMs, as of June 2024, are built with a decoder-only transformer-based architecture, which enables +efficient processing and generation of large-scale text data. + +Historically, up to 2020, fine-tuning was the primary method used to adapt a model for specific tasks. However, +larger models such as GPT-3 have demonstrated the ability to achieve similar results through prompt engineering, +which involves crafting specific input prompts to guide the model's responses.[3] These models acquire knowledge +about syntax, semantics, and ontologies[4] inherent in human language corpora, but they also inherit inaccuracies +and biases present in the data they are trained on.[5] + +Some notable LLMs are OpenAI's GPT series of models (e.g., GPT-3.5 and GPT-4, used in ChatGPT and Microsoft Copilot), +Google's Gemini (the latter of which is currently used in the chatbot of the same name), Meta's LLaMA family of models, +Anthropic's Claude models, and Mistral AI's models. + +History +Before 2017, there were a few language models that were large as compared to capacities then available. In the 1990s, +the IBM alignment models pioneered statistical language modelling. A smoothed n-gram model in 2001 trained on 0.3 +billion words achieved then-SOTA perplexity.[6] In the 2000s, as Internet use became prevalent, some researchers +constructed Internet-scale language datasets ("web as corpus"[7]), upon which they trained statistical language +models.[8][9] In 2009, in most language processing tasks, statistical language models dominated over symbolic +language models, as they can usefully ingest large datasets.[10] + +After neural networks became dominant in image processing around 2012, they were applied to language modelling as +well. Google converted its translation service to Neural Machine Translation in 2016. As it was before Transformers, +it was done by seq2seq deep LSTM networks. + + +An illustration of main components of the transformer model from the original paper, where layers were normalized +after (instead of before) multiheaded attention At the 2017 NeurIPS conference, Google researchers introduced the +transformer architecture in their landmark paper "Attention Is All You Need". This paper's goal was to improve upon +2014 Seq2seq technology,[11] and was based mainly on the attention mechanism developed by Bahdanau et al. in 2014. +[12] The following year in 2018, BERT was introduced and quickly became "ubiquitous".[13] Though the original +transformer has both encoder and decoder blocks, BERT is an encoder-only model. 
+ +Although decoder-only GPT-1 was introduced in 2018, it was GPT-2 in 2019 that caught widespread attention because +OpenAI at first deemed it too powerful to release publicly, out of fear of malicious use.[14] GPT-3 in 2020 went +a step further and as of 2024 is available only via API with no offering of downloading the model to execute locally. +But it was the 2022 consumer-facing browser-based ChatGPT that captured the imaginations of the general population +and caused some media hype and online buzz.[15] The 2023 GPT-4 was praised for its increased accuracy and as a +"holy grail" for its multimodal capabilities.[16] OpenAI did not reveal high-level architecture and the number +of parameters of GPT-4. + +Competing language models have for the most part been attempting to equal the GPT series, at least in terms of +number of parameters.[17] + +Since 2022, source-available models have been gaining popularity, especially at first with BLOOM and LLaMA, though +both have restrictions on the field of use. Mistral AI's models Mistral 7B and Mixtral 8x7b have the more permissive +Apache License. As of June 2024, The Instruction fine tuned variant of the Llama 3 70 billion parameter model is +the most powerful open LLM according to the LMSYS Chatbot Arena Leaderboard, being more powerful than GPT-3.5 but +not as powerful as GPT-4.[18] + +As of 2024, the largest and most capable models are all based on the Transformer architecture. Some recent +implementations are based on other architectures, such as recurrent neural network variants and Mamba +(a state space model).[19][20][21] + +Dataset preprocessing +See also: List of datasets for machine-learning research ยง Internet +Probabilistic tokenization +Because machine learning algorithms process numbers rather than text, the text must be converted to numbers. +In the first step, a vocabulary is decided upon, then integer indexes are arbitrarily but uniquely assigned +to each vocabulary entry, and finally, an embedding is associated to the integer index. Algorithms include +byte-pair encoding and WordPiece. + +Probabilistic tokenization also compresses the datasets. Because LLMs generally require input to be an array +that is not jagged, the shorter texts must be "padded" until they match the length of the longest one. How many +tokens are, on average, needed per word depends on the language of the dataset.[22][23] + +BPE +Using a modification of byte-pair encoding, in the first step, all unique characters (including blanks and +punctuation marks) are treated as an initial set of n-grams (i.e. initial set of uni-grams). Successively +the most frequent pair of adjacent characters is merged into a bi-gram and all instances of the pair are +replaced by it. All occurrences of adjacent pairs of (previously merged) n-grams that most frequently occur +together are then again merged into even lengthier n-gram repeatedly until a vocabulary of prescribed size +is obtained (in case of GPT-3, the size is 50257).[24] Token vocabulary consists of integers, spanning from +zero up to the size of the token vocabulary. New words can always be interpreted as combinations of the +tokens and the initial-set uni-grams.[25] + +A token vocabulary based on the frequencies extracted from mainly English corpora uses as few tokens as +possible for an average English word. An average word in another language encoded by such an English-optimized +tokenizer is however split into suboptimal amount of tokens. 
GPT-2 tokenizer can use up to 15 times more tokens +per word for some languages, for example for the Shan language from Myanmar. Even more widespread languages +such as Portuguese and German have "a premium of 50%" compared to English.[26] + +For example, here is how tokenizer used by GPT-3 (Legacy) split the following sentence tokenizer: texts -> +series of numerical "tokens". \ No newline at end of file diff --git a/docs/model/onnxruntime_cpu_models.md b/docs/model/onnxruntime_cpu_models.md new file mode 100644 index 0000000..6951ac8 --- /dev/null +++ b/docs/model/onnxruntime_cpu_models.md @@ -0,0 +1,14 @@ +# Model Powered by Onnxruntime CPU GenAI + +## Supported Models + +| Model Name | Parameters | Context Length | Size (GB) | Link | +|-------------------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) | +| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) | +| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main) | +| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main) | +| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | diff --git a/docs/model/onnxruntime_directml_models.md b/docs/model/onnxruntime_directml_models.md new file mode 100644 index 0000000..0f6a3a3 --- /dev/null +++ b/docs/model/onnxruntime_directml_models.md @@ -0,0 +1,19 @@ +# Model Powered by Onnxruntime DirectML GenAI + +## Supported Models + +| Model Name | Parameters | Context Length | Size (GB) | Link | 
+|--------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| Phi-3-mini-4k-instruct-onnx-directml | 3.8B | 4096 | 1.989 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml) | +| Phi-3-mini-128k-instruct-onnx-directml | 3.8B | 131072 | 2.018 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml) | +| Phi-3-medium-4k-instruct-onnx-directml | 17B | 4096 | 6.987 | [EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml) | +| Phi-3-medium-128k-instruct-onnx-directml | 17B | 131072 | 7.025 | [EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml) | +| Phi-3-mini-4k-instruct-062024-int4-onnx-directml | 3.8B | 4096 | 2.137 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml) | +| mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml | 7B | 32768 | 3.988 | [EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml) | +| gemma-2b-it-int4-onnx-directml | 2B | 8192 | 2.314 | [EmbeddedLLM/gemma-2b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml) | +| gemma-7b-it-int4-onnx-directml | 7B | 8192 | 5.958 | [EmbeddedLLM/gemma-7b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml) | +| llama-2-7b-chat-int4-onnx-directml | 7B | 4096 | 3.708 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) | +| Starling-LM-7b-beta-int4-onnx-directml | 7B | 8192 | 3.974 | [EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml) | +| openchat-3.6-8b-20240522-int4-onnx-directml | 8B | 8192 | 4.922 | [EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml) | +| Yi-1.5-6B-Chat-int4-onnx-directml | 6B | 32768 | 3.532 | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml) | + diff --git a/docs/model/onnxruntime_models.md b/docs/model/onnxruntime_models.md deleted file mode 100644 index 4d61ffe..0000000 --- a/docs/model/onnxruntime_models.md +++ /dev/null @@ -1,19 +0,0 @@ -# Model Powered by Onnxruntime GenAI - -## Supported Models - -| Models | Parameters | Context Length | Link | -| --- | --- | --- | --- | -| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) | -| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) | -| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) | -| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | -| Mistral-7b-v0.3-instruct | 7B | 32768 | 
[EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | -| Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) | -| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) | -| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) | -| Phi3-medium-4k-instruct | 17B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) | -| Phi3-medium-128k-instruct | 17B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) | -| Openchat-3.6-8b | 8B | 8192 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx) | -| Yi-1.5-6b-chat | 6B | 32k | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx) | -| Phi-3-vision-128k-instruct | | 128k | [EmbeddedLLM/Phi-3-vision-128k-instruct-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-vision-128k-instruct-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4) | diff --git a/src/embeddedllm/backend/onnxruntime_engine.py b/src/embeddedllm/backend/onnxruntime_engine.py index 82b5dca..95d13c3 100644 --- a/src/embeddedllm/backend/onnxruntime_engine.py +++ b/src/embeddedllm/backend/onnxruntime_engine.py @@ -1,9 +1,11 @@ # from embeddedllm.transformers_utils.image_processing_phi3v import Phi3VImageProcessor import contextlib import time +import os from pathlib import Path from tempfile import TemporaryDirectory from typing import AsyncIterator, List, Optional +from huggingface_hub import snapshot_download import onnxruntime_genai as og from loguru import logger @@ -39,6 +41,15 @@ def onnx_generator_context(model, params): class OnnxruntimeEngine(BaseLLMEngine): def __init__(self, model_path: str, vision: bool, device: str = "cpu"): self.model_path = model_path + + if not os.path.exists(model_path): + snapshot_path = snapshot_download( + repo_id=model_path, + allow_patterns=None, + repo_type="model", + ) + model_path = snapshot_path + self.model_config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True) self.device = device diff --git a/src/embeddedllm/engine.py b/src/embeddedllm/engine.py index 3eac11c..86f589c 100644 --- a/src/embeddedllm/engine.py +++ b/src/embeddedllm/engine.py @@ -80,7 +80,7 @@ def __init__(self, model_path: str, vision: bool, device: str = "xpu", backend: else: raise ValueError( - f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda` and `directml`." + f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda`, `openvino` and `directml`." 
) self.tokenizer = self.engine.tokenizer diff --git a/src/embeddedllm/entrypoints/api_server.py b/src/embeddedllm/entrypoints/api_server.py index 9385f24..efc2916 100644 --- a/src/embeddedllm/entrypoints/api_server.py +++ b/src/embeddedllm/entrypoints/api_server.py @@ -28,9 +28,9 @@ class Config(BaseSettings): ) port: int = Field(default=6979, description="Server port.") host: str = Field(default="0.0.0.0", description="Server host.") - device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`") + device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`, `gpu`") backend: str = Field( - default="directml", description="Backend engine: `cpu`, `ipex` and `directml`" + default="directml", description="Backend engine: `cpu`, `ipex`, `openvino` and `directml`" ) response_role: str = Field(default="assistant", description="Server response role.") uvicorn_log_level: str = Field( diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index ca1da44..cc1e15c 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -20,7 +20,7 @@ def get_embeddedllm_backend(): version = importlib.metadata.version("embeddedllm") # Use regex to extract the backend - match = re.search(r"\+(directml|cpu|cuda|ipex)$", version) + match = re.search(r"\+(directml|cpu|cuda|ipex|openvino)$", version) if match: backend = match.group(1) @@ -65,44 +65,170 @@ class ModelCard(BaseModel): size: Optional[int] = 0 -dml_model_dict_list = { +openvino_model_dict_list = { + # "OpenVINO/Phi-3-mini-128k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-128k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-mini-128k-instruct-int4-ov", + # model_name="Phi-3-mini-128k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=131072, + # ), + "OpenVINO/Phi-3-mini-128k-instruct-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-128k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-mini-128k-instruct-int8-ov", + model_name="Phi-3-mini-128k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=131072, + ), + # "OpenVINO/Phi-3-mini-4k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-mini-4k-instruct-int4-ov", + # model_name="Phi-3-mini-4k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=4096, + # ), + "OpenVINO/Phi-3-mini-4k-instruct-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-mini-4k-instruct-int8-ov", + model_name="Phi-3-mini-4k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=4096, + ), + # "OpenVINO/Phi-3-medium-4k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-medium-4k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-medium-4k-instruct-int4-ov", + # model_name="Phi-3-medium-4k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=4096, + # ), + "OpenVINO/Phi-3-medium-4k-instruct-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/Phi-3-medium-4k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-medium-4k-instruct-int8-ov", + model_name="Phi-3-medium-4k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "OpenVINO/open_llama_7b_v2-int8-ov": ModelCard( + 
hf_url="https://huggingface.co/OpenVINO/open_llama_7b_v2-int8-ov/tree/main/", + repo_id="OpenVINO/open_llama_7b_v2-int8-ov", + model_name="open_llama_7b_v2-int8-ov", + subfolder=".", + repo_type="model", + context_length=2048, + ), + "OpenVINO/open_llama_3b_v2-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/open_llama_3b_v2-int8-ov/tree/main/", + repo_id="OpenVINO/open_llama_3b_v2-int8-ov", + model_name="open_llama_3b_v2-int8-ov", + subfolder=".", + repo_type="model", + context_length=2048, + ), +} + +ipex_model_dict_list = { "microsoft/Phi-3-mini-4k-instruct": ModelCard( - hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/directml/directml-int4-awq-block-128", - repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", - model_name="Phi-3-mini-4k-instruct-onnx", - subfolder="directml/directml-int4-awq-block-128", + hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/", + repo_id="microsoft/Phi-3-mini-4k-instruct", + model_name="Phi-3-mini-4k-instruct", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4", - repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx", - model_name="Phi-3-mini-4k-instruct-062024-onnx", - subfolder="onnx/directml/Phi-3-mini-4k-instruct-062024-int4", + "microsoft/Phi-3-mini-128k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/tree/main", + repo_id="microsoft/Phi-3-mini-128k-instruct", + model_name="Phi-3-mini-128k-instruct", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "microsoft/Phi-3-medium-4k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-4k-instruct", + model_name="Phi-3-medium-4k-instruct", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "microsoft/Phi-3-medium-128k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-128k-instruct", + model_name="Phi-3-medium-128k-instruct", + subfolder=".", + repo_type="model", + context_length=131072, + ), +} + +dml_model_dict_list = { + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml", + model_name="Phi-3-mini-4k-instruct-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-onnx", - subfolder="onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml", + model_name="Phi-3-mini-128k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml": ModelCard( + 
hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml", + model_name="Phi-3-medium-4k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml", + model_name="Phi-3-medium-128k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + model_name="Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", + model_name="mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/gemma-2b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx/tree/main/onnx/directml/gemma-2b-it-int4", - repo_id="EmbeddedLLM/gemma-2b-it-onnx", - model_name="gemma-2b-it-int4", - subfolder="onnx/directml/gemma-2b-it-int4", + "EmbeddedLLM/gemma-2b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-2b-it-int4-onnx-directml", + model_name="gemma-2b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/gemma-7b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-onnx/tree/main/onnx/directml/gemma-7b-it-int4", - repo_id="EmbeddedLLM/gemma-7b-it-onnx", - model_name="gemma-7b-it-int4", - subfolder="onnx/directml/gemma-7b-it-int4", + "EmbeddedLLM/gemma-7b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-7b-it-int4-onnx-directml", + model_name="gemma-7b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), @@ -114,70 +240,94 @@ class ModelCard(BaseModel): repo_type="model", context_length=4096, ), - "EmbeddedLLM/Starling-LM-7b-beta-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-onnx/tree/main/onnx/directml/Starling-LM-7b-beta-int4", - repo_id="EmbeddedLLM/Starling-LM-7b-beta-onnx", - model_name="Starling-LM-7b-beta-int4", - subfolder="onnx/directml/Starling-LM-7b-beta-int4", + "EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml", + model_name="Starling-LM-7b-beta-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/directml/openchat-3.6-8b-20240522-int4", - 
repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-int4", - subfolder="onnx/directml/openchat-3.6-8b-20240522-int4", + "EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml", + model_name="openchat-3.6-8b-20240522-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx/tree/main/onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", - repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx", - model_name="01-ai_Yi-1.5-6B-Chat-int4", - subfolder="onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", + "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + model_name="01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), } cpu_model_dict_list = { - "microsoft/Phi-3-mini-4k-instruct": ModelCard( - hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", - model_name="Phi-3-mini-4k-instruct-onnx", - subfolder="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + 
repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=8192, ), @@ -231,8 +381,18 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"): repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type ) +for k, v in ipex_model_dict_list.items(): + v.size = compute_memory_size( + repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type + ) -def convert_to_dataframe(dml_model_dict_list): +for k, v in openvino_model_dict_list.items(): + 
v.size = compute_memory_size( + repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type + ) + + +def convert_to_dataframe(model_dict_list): # Create lists to store the data model_names = [] hf_urls = [] @@ -244,7 +404,7 @@ def convert_to_dataframe(dml_model_dict_list): context_lengths = [] # Iterate through the dictionary and extract the data - for key, model_card in dml_model_dict_list.items(): + for key, model_card in model_dict_list.items(): model_names.append(key) hf_urls.append(model_card.hf_url) repo_ids.append(model_card.repo_id) @@ -318,6 +478,12 @@ def update_model_list(engine_type): if engine_type == "DirectML": models = sorted(list(dml_model_dict_list.keys())) models_pandas = convert_to_dataframe(dml_model_dict_list) + elif engine_type == "Ipex": + models = sorted(list(ipex_model_dict_list.keys())) + models_pandas = convert_to_dataframe(ipex_model_dict_list) + elif engine_type == 'OpenVino': + models = sorted(list(openvino_model_dict_list.keys())) + models_pandas = convert_to_dataframe(openvino_model_dict_list) else: models = sorted(list(cpu_model_dict_list.keys())) models_pandas = convert_to_dataframe(cpu_model_dict_list) @@ -340,28 +506,51 @@ def deploy_model(engine_type, model_name, port_number): if engine_type == "DirectML": llm_model_card = dml_model_dict_list[model_name] + elif engine_type == "Ipex": + llm_model_card = ipex_model_dict_list[model_name] + elif engine_type == "OpenVino": + llm_model_card = openvino_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*", + allow_patterns=( + f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None + ), repo_type="model", ) - model_path = os.path.join(snapshot_path, llm_model_card.subfolder) + if llm_model_card.subfolder != ".": + model_path = os.path.join(snapshot_path, llm_model_card.subfolder) + else: + model_path = snapshot_path + + print("Model path:", model_path) + + if engine_type == "Ipex": + device = "xpu" + elif engine_type == "OpenVino": + device = "gpu" + else: + device = "cpu" deployed_model.process = subprocess.Popen( [ "ellm_server", "--model_path", model_path, + "--backend", + backend, + "--device", + device, "--port", f"{port_number}", - "--served_model_name", - model_name, + # "--served_model_name", + # model_name ] ) + deployed_model.model_name = model_name while True: @@ -375,6 +564,7 @@ def deploy_model(engine_type, model_name, port_number):

Model: {model_name}

Engine: {engine_type}

Port: {port_number}

+

Model Path: {model_path}

""" @@ -402,6 +592,10 @@ def download_model(engine_type, model_name): if engine_type == "DirectML": llm_model_card = dml_model_dict_list[model_name] + elif engine_type == "Ipex": + llm_model_card = ipex_model_dict_list[model_name] + elif engine_type == "OpenVino": + llm_model_card = openvino_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] @@ -412,7 +606,9 @@ def download_model(engine_type, model_name): yield "Downloading ..." snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*", + allow_patterns=( + f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None + ), repo_type="model", ) yield snapshot_path @@ -443,9 +639,16 @@ def main(): with gr.Accordion("See More Model Details", open=False): model_info_pandas_frame = gr.Dataframe(value=None) + default_value = "CPU" # Default value + if backend == "directml": + default_value = "DirectML" + elif backend == "ipex": + default_value = "Ipex" + elif backend == "openvino": + default_value = "OpenVino" selected_engine_type = gr.Dropdown( - choices=["DirectML", "CPU"], - value="DirectML" if backend == "directml" else "CPU", + choices=["DirectML", "Ipex", "OpenVino", "CPU"], + value=default_value, multiselect=False, label="LLM Engine", show_label=True, diff --git a/src/embeddedllm/inputs.py b/src/embeddedllm/inputs.py index 9797d05..8f05498 100644 --- a/src/embeddedllm/inputs.py +++ b/src/embeddedllm/inputs.py @@ -23,13 +23,13 @@ class ImagePixelData(TypedDict): # https://github.com/vllm-project/vllm/pull/4028 @overload -def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: - ... +def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: ... @overload -def parse_and_batch_prompt(prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: - ... +def parse_and_batch_prompt( + prompt: Union[List[int], List[List[int]]] +) -> Sequence[ParsedTokens]: ... def parse_and_batch_prompt(