diff --git a/README.md b/README.md
index 50379d6..57743e1 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,8 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
 ## Table Content
 - [Supported Models](#supported-models-quick-start)
-  - [Onnxruntime Models](./docs/model/onnxruntime_models.md)
+  - [Onnxruntime DirectML Models](./docs/model/onnxruntime_directml_models.md)
+  - [Onnxruntime CPU Models](./docs/model/onnxruntime_cpu_models.md)
   - [Ipex-LLM Models](./docs/model/ipex_models.md)
 - [Getting Started](#getting-started)
   - [Installation From Source](#installation)
@@ -39,7 +40,7 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
 | Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
 | Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
 | Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
-| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
+| Llama-3-8b-chat | 8B | 8192 | [luweigen/Llama-3-8B-Instruct-int4-onnx-directml](https://huggingface.co/luweigen/Llama-3-8B-Instruct-int4-onnx-directml) |
 | Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
 | Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) |
 | Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..f09ffc3
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,89 @@
+# Benchmark
+Benchmark model(s) on different backends and get a statistical analysis of their token in / token out throughput.
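+
+A typical single run looks like the following (the flags are documented in the next section; the model name and repo used here are only illustrative — any supported model works):
+
+```shell
+python ellm_benchmark.py --backend directml --model_name Phi-3-mini-4k-instruct-onnx-directml --model_path EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml --token_in 1024 --token_out 256 --input_token_bias 0 --output_token_bias 0 --loop_count 10
+```
+
+Each run writes its timings to `profile_model_timing/profile_model_timing_<model_name>_<token_in>_<token_out>.log`, which the analysis scripts below read.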
+
+## Benchmark a Model
+To benchmark a model, run the command below with the following arguments:
+* --backend `cpu` | `ipex` | `openvino` | `directml`
+* --model_name `Name of the model`
+* --model_path `Path to the model` | `Model repo ID`
+* --token_in `Number of input tokens (max 2048)`
+* --token_out `Number of output tokens`
+* --input_token_bias `Adjustment applied to the number of input tokens`
+* --output_token_bias `Adjustment applied to the number of output tokens`
+* --loop_count `Number of benchmark iterations`
+
+```shell
+python ellm_benchmark.py --backend <backend> --model_name <model_name> --model_path <model_path> --token_in <token_in> --token_out <token_out> --input_token_bias <input_token_bias> --output_token_bias <output_token_bias> --loop_count <loop_count>
+```
+
+
+## Loop to benchmark the models
+Customise your benchmarking configuration in `loop_ellm_benchmark.py`:
+```python
+# Define the models
+model_names = [
+    # model names
+
+]
+
+# Define the model paths
+model_paths = [
+    # model paths or model repo IDs, in the same order as model_names
+
+]
+
+# Define the token lengths (token_in, token_out)
+token_in_out = [
+    (1024, 1024),
+    (1024, 512),
+    (1024, 256),
+    (1024, 128),
+    (512, 1024),
+    (512, 512),
+    (512, 256),
+    (512, 128),
+    (256, 1024),
+    (256, 512),
+    (256, 256),
+    (256, 128),
+    (128, 1024),
+    (128, 512),
+    (128, 256),
+    (128, 128),
+]
+
+# Choose the backend (keep exactly one uncommented)
+backend = "cpu"
+# backend = "directml"
+# backend = "ipex"
+# backend = "openvino"
+
+# Number of loops
+loop_count = 20
+
+# Input and output token bias
+input_token_bias = 0
+output_token_bias = 0
+```
+```shell
+python loop_ellm_benchmark.py
+```
+
+## Generate a Report (`XLSX`) of a Model's Benchmark
+To generate a report for a model, run the command below with:
+* --model_name `Name of the model`
+```shell
+python analyse_detailed_benchmark.py --model_name <model_name>
+```
+
+## Generate Reports (`XLSX`) of Models' Benchmarks
+List the models you want reports for in `loop_analyse_detailed_benchmark.py`:
+```python
+model_names = [
+    # model names
+
+]
+```
+```shell
+python loop_analyse_detailed_benchmark.py
+```
diff --git a/benchmark/analyse_detailed_benchmark.py b/benchmark/analyse_detailed_benchmark.py
new file mode 100644
index 0000000..ca45d30
--- /dev/null
+++ b/benchmark/analyse_detailed_benchmark.py
@@ -0,0 +1,124 @@
+import os
+import re
+import numpy as np
+import pandas as pd
+import argparse
+
+def extract_data_from_log(log_file):
+    average_tps_list = []
+    prompt_tokens_per_second_list = []
+    new_tokens_per_second_list = []
+    error_count = 0
+    error_state = False
+
+    if not os.path.exists(log_file):
+        print(f"Log file does not exist: {log_file}")
+        return average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count
+
+    with open(log_file, 'r') as file:
+        for line in file:
+            if "ERROR" in line:
+                error_count += 1
+                error_state = True
+                continue
+
+            if "Average tps" in line and error_state == True:
+                error_state = False
+                continue
+
+            if "Average tps" in line:
+                average_tps = float(re.search(r"Average tps: ([\d.]+)", line).group(1))
+                average_tps_list.append(average_tps)
+                continue
+
+            if "Prompt tokens per second" in line:
+                prompt_tokens_per_second = float(re.search(r"Prompt tokens per second: ([\d.]+)", line).group(1))
+                prompt_tokens_per_second_list.append(prompt_tokens_per_second)
+            if "New tokens per second" in line:
+                new_tokens_per_second = float(re.search(r"New tokens per second: ([\d.]+)", line).group(1))
+                new_tokens_per_second_list.append(new_tokens_per_second)
+
+    return average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count
+
+def calculate_statistics(data):
+    data_np = np.array(data)
+    stats = {
+        "std": np.std(data_np, ddof=1),  # Sample standard deviation +
"mean": np.mean(data_np), + "min": np.min(data_np), + "1%": np.percentile(data_np, 1), + "25%": np.percentile(data_np, 25), + "50%": np.percentile(data_np, 50), # Median + "75%": np.percentile(data_np, 75), + "99%": np.percentile(data_np, 99), + "max": np.max(data_np) + } + return stats + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Process log files and generate statistics.") + parser.add_argument('--model_name', type=str, required=True, help='Name of the model') + return parser.parse_args() + +def main(model_name): + token_ins = [128, 256, 512, 1024] + token_outs = [128, 256, 512, 1024] + + statistics = [] + + # Create the profile_model_timing directory if it doesn't exist + log_dir = "profile_model_timing" + os.makedirs(log_dir, exist_ok=True) + + for input_token_length in token_ins: + for output_token_length in token_outs: + log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{input_token_length}_{output_token_length}.log') + average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count = extract_data_from_log(log_file) + + if not average_tps_list and not prompt_tokens_per_second_list and not new_tokens_per_second_list: + # Log file does not exist or is empty, append "-" for each statistical value + statistics.append([ + model_name, input_token_length, output_token_length, + "-", "-", "-", "-", "-", "-", "-", "-", "-", + "-", "-", "-", "-", "-", "-", "-", "-", "-", + "-", "-", "-", "-", "-", "-", "-", "-", "-", + error_count + ]) + else: + min_len = min(len(average_tps_list), len(prompt_tokens_per_second_list), len(new_tokens_per_second_list)) + + if min_len > 0: + prompt_stats = calculate_statistics(prompt_tokens_per_second_list[5:min_len]) + new_token_stats = calculate_statistics(new_tokens_per_second_list[5:min_len]) + average_tps_stats = calculate_statistics(average_tps_list[5:min_len]) + + statistics.append([ + model_name, input_token_length, output_token_length, + prompt_stats["std"], prompt_stats["mean"], prompt_stats["min"], prompt_stats["1%"], prompt_stats["25%"], prompt_stats["50%"], prompt_stats["75%"], prompt_stats["99%"], prompt_stats["max"], + new_token_stats["std"], new_token_stats["mean"], new_token_stats["min"], new_token_stats["1%"], new_token_stats["25%"], new_token_stats["50%"], new_token_stats["75%"], new_token_stats["99%"], new_token_stats["max"], + average_tps_stats["std"], average_tps_stats["mean"], average_tps_stats["min"], average_tps_stats["1%"], average_tps_stats["25%"], average_tps_stats["50%"], average_tps_stats["75%"], average_tps_stats["99%"], average_tps_stats["max"], + error_count + ]) + + # Create a DataFrame + columns = [ + "Model", "Token In", "Token Out", + "Token In / sec std", "Token In / sec mean", "Token In / sec min", "Token In / sec 1%", "Token In / sec 25%", "Token In / sec 50%", "Token In / sec 75%", "Token In / sec 99%", "Token In / sec max", + "Token Out / sec std", "Token Out / sec mean", "Token Out / sec min", "Token Out / sec 1%", "Token Out / sec 25%", "Token Out / sec 50%", "Token Out / sec 75%", "Token Out / sec 99%", "Token Out / sec max", + "Average Token / sec std", "Average Token / sec mean", "Average Token / sec min", "Average Token / sec 1%", "Average Token / sec 25%", "Average Token / sec 50%", "Average Token / sec 75%", "Average Token / sec 99%", "Average Token / sec max", + "No of Fail" + ] + df = pd.DataFrame(statistics, columns=columns) + + # Create the statistics directory if it doesn't exist + output_dir = "statistics" + os.makedirs(output_dir, 
exist_ok=True) + + # Write to Excel + output_file = os.path.join(output_dir, f"{model_name}_statistics.xlsx") + df.to_excel(output_file, index=False) + print(f"Statistics written to {output_file}") + +if __name__ == "__main__": + args = parse_arguments() + main(args.model_name) diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py new file mode 100644 index 0000000..12a2822 --- /dev/null +++ b/benchmark/ellm_benchmark.py @@ -0,0 +1,132 @@ +import sys +import os +import time +import asyncio +import argparse +from loguru import logger + +# Add the 'src' directory to sys.path +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src'))) + +# Import the engine module +from embeddedllm import engine +from embeddedllm import sampling_params + +async def benchmark(model, input_token_length, output_token_length, model_name, input_token_bias=0, output_token_bias=0): + + logger.info(f"Model: {model_name}") + + model.tokenizer.chat_template = "{% for message in messages %}{{ message['content']}}{% endfor %}" # Override + + prompt_text = """ + + """ + # Define the path to the file + file_path = "sampleText.txt" + + # Open the file and read its contents into the variable + with open(file_path, 'r') as file: + prompt_text = file.read() + + input_tokens = model.tokenizer.encode(prompt_text)[:(input_token_length + input_token_bias)] + input_text = model.tokenizer.decode(input_tokens) + print(input_text) + input_tokens = model.tokenizer.encode(input_text) + + PromptInputs = { + "prompt": input_text + } + + sampling_params_config = sampling_params.SamplingParams( + max_tokens=(output_token_length + output_token_bias), + top_p=0.1, + top_k=1, + temperature=1, + repetition_penalty=0.01, + ) + + start = time.perf_counter() + + async def generate(): + results = [] + async for response in model.generate( + inputs=PromptInputs, + sampling_params=sampling_params_config, + request_id="benchmark", + stream=True, + ): + results.append(response) + return results + + response = await generate() + end = time.perf_counter() + + logger.info(response[0]) # Access the generated text from the response + + total_time_taken = end - start + logger.info(f"Total time taken: {total_time_taken:.2f} seconds") + + average_tps = (input_token_length + output_token_length) / total_time_taken + logger.info("Average tps: "+ str(average_tps)) + + + +def main(): + parser = argparse.ArgumentParser(description="Benchmark EmbeddedLLM models.") + parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'npu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, npu, ipex, openvino or directml)') + parser.add_argument('--model_name', type=str, required=True, help='Name of the model') + parser.add_argument('--model_path', type=str, required=True, help='Path to the model or model repo id') + parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)') + parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens') + parser.add_argument('--input_token_bias', type=int, required=False, help='Adjust the input token length') + parser.add_argument('--output_token_bias', type=int, required=False, help='Adjust the output token length') + parser.add_argument('--loop_count', type=int, required=False, help='Adjust the loop count') + + args = parser.parse_args() + + backend = args.backend + model_path = args.model_path + model_name = args.model_name + token_in = args.token_in + token_out = args.token_out + 
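+    # The bias and loop-count flags are optional (declared with required=False and no
+    # default), so they arrive as None when omitted; normalise them below before use.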
+    input_token_bias = args.input_token_bias if args.input_token_bias is not None else 0
+    output_token_bias = args.output_token_bias if args.output_token_bias is not None else 0
+    loop_count = args.loop_count if args.loop_count is not None else 1  # default to a single run
+
+    # Cap the input tokens to 2048 (apply the cap to the value actually used below)
+    if token_in > 2048:
+        print("Input tokens capped to 2048.")
+        token_in = 2048
+
+    # Create the profile_model_timing directory if it doesn't exist
+    log_dir = "profile_model_timing"
+    os.makedirs(log_dir, exist_ok=True)
+
+    log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{token_in}_{token_out}.log')
+
+    # Add the log file to the logger
+    logger.add(log_file, mode='w')
+
+    # Map the chosen backend to the device string expected by the engine
+    if backend == "cpu":
+        device = "cpu"
+    elif backend == "npu":
+        device = "npu"
+    elif backend == "ipex":
+        device = "xpu"
+    elif backend == "openvino":
+        device = "gpu"
+    elif backend == "directml":
+        device = ""
+
+    model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend)
+
+    for _ in range(loop_count):
+        # Run the async function using asyncio.run()
+        asyncio.run(benchmark(model, token_in, token_out, model_name, input_token_bias, output_token_bias))
+
+    # Remove the logger to close the log file
+    logger.remove()
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/loop_analyse_detailed_benchmark.py b/benchmark/loop_analyse_detailed_benchmark.py
new file mode 100644
index 0000000..e01bdda
--- /dev/null
+++ b/benchmark/loop_analyse_detailed_benchmark.py
@@ -0,0 +1,20 @@
+import subprocess
+
+model_names = [
+    # model names
+
+]
+
+
+# Path to the analyse_detailed_benchmark.py script
+analyse_detailed_benchmark_script = "analyse_detailed_benchmark.py"
+
+for model_name in model_names:
+    # Construct the command
+    command = [
+        "python", analyse_detailed_benchmark_script,
+        "--model_name", model_name,
+    ]
+
+    # Execute the command
+    subprocess.run(command)
\ No newline at end of file
diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py
new file mode 100644
index 0000000..f78c50f
--- /dev/null
+++ b/benchmark/loop_ellm_benchmark.py
@@ -0,0 +1,68 @@
+import subprocess
+
+# Define the models
+model_names = [
+    # model names
+
+]
+
+# Define the model paths
+model_paths = [
+    # model paths or model repo IDs, in the same order as model_names
+
+]
+
+# Define the token lengths (token_in, token_out)
+token_in_out = [
+    (1024, 1024),
+    (1024, 512),
+    (1024, 256),
+    (1024, 128),
+    (512, 1024),
+    (512, 512),
+    (512, 256),
+    (512, 128),
+    (256, 1024),
+    (256, 512),
+    (256, 256),
+    (256, 128),
+    (128, 1024),
+    (128, 512),
+    (128, 256),
+    (128, 128),
+]
+
+# Choose the backend (keep exactly one uncommented)
+backend = "cpu"
+# backend = "directml"
+# backend = "ipex"
+# backend = "openvino"
+# backend = "npu"
+
+# Number of loops
+loop_count = 3
+
+# Input and output token bias
+input_token_bias = 0
+output_token_bias = 0
+
+# Path to the ellm_benchmark.py script
+ellm_benchmark_script = "ellm_benchmark.py"
+
+for model_name, model_path in zip(model_names, model_paths):
+    for input_token_length, output_token_length in token_in_out:
+        # Construct the command
+        command = [
+            "python", ellm_benchmark_script,
+            "--backend", backend,
+            "--model_name", model_name,
+            "--model_path", model_path,
+            "--token_in", str(input_token_length),
+            "--token_out", str(output_token_length),
+            "--input_token_bias", str(input_token_bias),
+            "--output_token_bias", str(output_token_bias),
+            "--loop_count", str(loop_count)
+        ]
+
+        # Execute the command
+        subprocess.run(command)
diff --git a/benchmark/sampleText.txt b/benchmark/sampleText.txt
new file mode 100644
index 0000000..3da3fbb
--- /dev/null
+++ b/benchmark/sampleText.txt
@@ -0,0
+1,91 @@ +A large language model (LLM) is a computational model notable for its ability to achieve general-purpose language +generation and other natural language processing tasks such as classification. Based on language models, LLMs acquire +these abilities by learning statistical relationships from vast amounts of text during a computationally intensive +self-supervised and semi-supervised training process.[1] LLMs can be used for text generation, a form of generative AI, +by taking an input text and repeatedly predicting the next token or word.[2] + +LLMs are artificial neural networks that utilize the transformer architecture, invented in 2017. The largest and +most capable LLMs, as of June 2024, are built with a decoder-only transformer-based architecture, which enables +efficient processing and generation of large-scale text data. + +Historically, up to 2020, fine-tuning was the primary method used to adapt a model for specific tasks. However, +larger models such as GPT-3 have demonstrated the ability to achieve similar results through prompt engineering, +which involves crafting specific input prompts to guide the model's responses.[3] These models acquire knowledge +about syntax, semantics, and ontologies[4] inherent in human language corpora, but they also inherit inaccuracies +and biases present in the data they are trained on.[5] + +Some notable LLMs are OpenAI's GPT series of models (e.g., GPT-3.5 and GPT-4, used in ChatGPT and Microsoft Copilot), +Google's Gemini (the latter of which is currently used in the chatbot of the same name), Meta's LLaMA family of models, +Anthropic's Claude models, and Mistral AI's models. + +History +Before 2017, there were a few language models that were large as compared to capacities then available. In the 1990s, +the IBM alignment models pioneered statistical language modelling. A smoothed n-gram model in 2001 trained on 0.3 +billion words achieved then-SOTA perplexity.[6] In the 2000s, as Internet use became prevalent, some researchers +constructed Internet-scale language datasets ("web as corpus"[7]), upon which they trained statistical language +models.[8][9] In 2009, in most language processing tasks, statistical language models dominated over symbolic +language models, as they can usefully ingest large datasets.[10] + +After neural networks became dominant in image processing around 2012, they were applied to language modelling as +well. Google converted its translation service to Neural Machine Translation in 2016. As it was before Transformers, +it was done by seq2seq deep LSTM networks. + + +An illustration of main components of the transformer model from the original paper, where layers were normalized +after (instead of before) multiheaded attention At the 2017 NeurIPS conference, Google researchers introduced the +transformer architecture in their landmark paper "Attention Is All You Need". This paper's goal was to improve upon +2014 Seq2seq technology,[11] and was based mainly on the attention mechanism developed by Bahdanau et al. in 2014. +[12] The following year in 2018, BERT was introduced and quickly became "ubiquitous".[13] Though the original +transformer has both encoder and decoder blocks, BERT is an encoder-only model. 
+ +Although decoder-only GPT-1 was introduced in 2018, it was GPT-2 in 2019 that caught widespread attention because +OpenAI at first deemed it too powerful to release publicly, out of fear of malicious use.[14] GPT-3 in 2020 went +a step further and as of 2024 is available only via API with no offering of downloading the model to execute locally. +But it was the 2022 consumer-facing browser-based ChatGPT that captured the imaginations of the general population +and caused some media hype and online buzz.[15] The 2023 GPT-4 was praised for its increased accuracy and as a +"holy grail" for its multimodal capabilities.[16] OpenAI did not reveal high-level architecture and the number +of parameters of GPT-4. + +Competing language models have for the most part been attempting to equal the GPT series, at least in terms of +number of parameters.[17] + +Since 2022, source-available models have been gaining popularity, especially at first with BLOOM and LLaMA, though +both have restrictions on the field of use. Mistral AI's models Mistral 7B and Mixtral 8x7b have the more permissive +Apache License. As of June 2024, The Instruction fine tuned variant of the Llama 3 70 billion parameter model is +the most powerful open LLM according to the LMSYS Chatbot Arena Leaderboard, being more powerful than GPT-3.5 but +not as powerful as GPT-4.[18] + +As of 2024, the largest and most capable models are all based on the Transformer architecture. Some recent +implementations are based on other architectures, such as recurrent neural network variants and Mamba +(a state space model).[19][20][21] + +Dataset preprocessing +See also: List of datasets for machine-learning research ยง Internet +Probabilistic tokenization +Because machine learning algorithms process numbers rather than text, the text must be converted to numbers. +In the first step, a vocabulary is decided upon, then integer indexes are arbitrarily but uniquely assigned +to each vocabulary entry, and finally, an embedding is associated to the integer index. Algorithms include +byte-pair encoding and WordPiece. + +Probabilistic tokenization also compresses the datasets. Because LLMs generally require input to be an array +that is not jagged, the shorter texts must be "padded" until they match the length of the longest one. How many +tokens are, on average, needed per word depends on the language of the dataset.[22][23] + +BPE +Using a modification of byte-pair encoding, in the first step, all unique characters (including blanks and +punctuation marks) are treated as an initial set of n-grams (i.e. initial set of uni-grams). Successively +the most frequent pair of adjacent characters is merged into a bi-gram and all instances of the pair are +replaced by it. All occurrences of adjacent pairs of (previously merged) n-grams that most frequently occur +together are then again merged into even lengthier n-gram repeatedly until a vocabulary of prescribed size +is obtained (in case of GPT-3, the size is 50257).[24] Token vocabulary consists of integers, spanning from +zero up to the size of the token vocabulary. New words can always be interpreted as combinations of the +tokens and the initial-set uni-grams.[25] + +A token vocabulary based on the frequencies extracted from mainly English corpora uses as few tokens as +possible for an average English word. An average word in another language encoded by such an English-optimized +tokenizer is however split into suboptimal amount of tokens. 
GPT-2 tokenizer can use up to 15 times more tokens +per word for some languages, for example for the Shan language from Myanmar. Even more widespread languages +such as Portuguese and German have "a premium of 50%" compared to English.[26] + +For example, here is how tokenizer used by GPT-3 (Legacy) split the following sentence tokenizer: texts -> +series of numerical "tokens". \ No newline at end of file diff --git a/docs/model/onnxruntime_cpu_models.md b/docs/model/onnxruntime_cpu_models.md new file mode 100644 index 0000000..6951ac8 --- /dev/null +++ b/docs/model/onnxruntime_cpu_models.md @@ -0,0 +1,14 @@ +# Model Powered by Onnxruntime CPU GenAI + +## Supported Models + +| Model Name | Parameters | Context Length | Size (GB) | Link | +|-------------------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) | +| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) | +| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main) | +| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main) | +| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | diff --git a/docs/model/onnxruntime_directml_models.md b/docs/model/onnxruntime_directml_models.md new file mode 100644 index 0000000..0f6a3a3 --- /dev/null +++ b/docs/model/onnxruntime_directml_models.md @@ -0,0 +1,19 @@ +# Model Powered by Onnxruntime DirectML GenAI + +## Supported Models + +| Model Name | Parameters | Context Length | Size (GB) | Link | 
+|--------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| Phi-3-mini-4k-instruct-onnx-directml | 3.8B | 4096 | 1.989 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml) | +| Phi-3-mini-128k-instruct-onnx-directml | 3.8B | 131072 | 2.018 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml) | +| Phi-3-medium-4k-instruct-onnx-directml | 17B | 4096 | 6.987 | [EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml) | +| Phi-3-medium-128k-instruct-onnx-directml | 17B | 131072 | 7.025 | [EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml) | +| Phi-3-mini-4k-instruct-062024-int4-onnx-directml | 3.8B | 4096 | 2.137 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml) | +| mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml | 7B | 32768 | 3.988 | [EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml) | +| gemma-2b-it-int4-onnx-directml | 2B | 8192 | 2.314 | [EmbeddedLLM/gemma-2b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml) | +| gemma-7b-it-int4-onnx-directml | 7B | 8192 | 5.958 | [EmbeddedLLM/gemma-7b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml) | +| llama-2-7b-chat-int4-onnx-directml | 7B | 4096 | 3.708 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) | +| Starling-LM-7b-beta-int4-onnx-directml | 7B | 8192 | 3.974 | [EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml) | +| openchat-3.6-8b-20240522-int4-onnx-directml | 8B | 8192 | 4.922 | [EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml) | +| Yi-1.5-6B-Chat-int4-onnx-directml | 6B | 32768 | 3.532 | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml) | + diff --git a/docs/model/onnxruntime_models.md b/docs/model/onnxruntime_models.md deleted file mode 100644 index 4d61ffe..0000000 --- a/docs/model/onnxruntime_models.md +++ /dev/null @@ -1,19 +0,0 @@ -# Model Powered by Onnxruntime GenAI - -## Supported Models - -| Models | Parameters | Context Length | Link | -| --- | --- | --- | --- | -| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) | -| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) | -| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) | -| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | -| Mistral-7b-v0.3-instruct | 7B | 32768 | 
[EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | -| Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) | -| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) | -| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) | -| Phi3-medium-4k-instruct | 17B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) | -| Phi3-medium-128k-instruct | 17B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) | -| Openchat-3.6-8b | 8B | 8192 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx) | -| Yi-1.5-6b-chat | 6B | 32k | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx) | -| Phi-3-vision-128k-instruct | | 128k | [EmbeddedLLM/Phi-3-vision-128k-instruct-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-vision-128k-instruct-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4) | diff --git a/src/embeddedllm/backend/onnxruntime_engine.py b/src/embeddedllm/backend/onnxruntime_engine.py index 82b5dca..95d13c3 100644 --- a/src/embeddedllm/backend/onnxruntime_engine.py +++ b/src/embeddedllm/backend/onnxruntime_engine.py @@ -1,9 +1,11 @@ # from embeddedllm.transformers_utils.image_processing_phi3v import Phi3VImageProcessor import contextlib import time +import os from pathlib import Path from tempfile import TemporaryDirectory from typing import AsyncIterator, List, Optional +from huggingface_hub import snapshot_download import onnxruntime_genai as og from loguru import logger @@ -39,6 +41,15 @@ def onnx_generator_context(model, params): class OnnxruntimeEngine(BaseLLMEngine): def __init__(self, model_path: str, vision: bool, device: str = "cpu"): self.model_path = model_path + + if not os.path.exists(model_path): + snapshot_path = snapshot_download( + repo_id=model_path, + allow_patterns=None, + repo_type="model", + ) + model_path = snapshot_path + self.model_config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True) self.device = device diff --git a/src/embeddedllm/engine.py b/src/embeddedllm/engine.py index 3eac11c..86f589c 100644 --- a/src/embeddedllm/engine.py +++ b/src/embeddedllm/engine.py @@ -80,7 +80,7 @@ def __init__(self, model_path: str, vision: bool, device: str = "xpu", backend: else: raise ValueError( - f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda` and `directml`." + f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda`, `openvino` and `directml`." 
) self.tokenizer = self.engine.tokenizer diff --git a/src/embeddedllm/entrypoints/api_server.py b/src/embeddedllm/entrypoints/api_server.py index 9385f24..efc2916 100644 --- a/src/embeddedllm/entrypoints/api_server.py +++ b/src/embeddedllm/entrypoints/api_server.py @@ -28,9 +28,9 @@ class Config(BaseSettings): ) port: int = Field(default=6979, description="Server port.") host: str = Field(default="0.0.0.0", description="Server host.") - device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`") + device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`, `gpu`") backend: str = Field( - default="directml", description="Backend engine: `cpu`, `ipex` and `directml`" + default="directml", description="Backend engine: `cpu`, `ipex`, `openvino` and `directml`" ) response_role: str = Field(default="assistant", description="Server response role.") uvicorn_log_level: str = Field( diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index ca1da44..cc1e15c 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -20,7 +20,7 @@ def get_embeddedllm_backend(): version = importlib.metadata.version("embeddedllm") # Use regex to extract the backend - match = re.search(r"\+(directml|cpu|cuda|ipex)$", version) + match = re.search(r"\+(directml|cpu|cuda|ipex|openvino)$", version) if match: backend = match.group(1) @@ -65,44 +65,170 @@ class ModelCard(BaseModel): size: Optional[int] = 0 -dml_model_dict_list = { +openvino_model_dict_list = { + # "OpenVINO/Phi-3-mini-128k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-128k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-mini-128k-instruct-int4-ov", + # model_name="Phi-3-mini-128k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=131072, + # ), + "OpenVINO/Phi-3-mini-128k-instruct-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-128k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-mini-128k-instruct-int8-ov", + model_name="Phi-3-mini-128k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=131072, + ), + # "OpenVINO/Phi-3-mini-4k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-mini-4k-instruct-int4-ov", + # model_name="Phi-3-mini-4k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=4096, + # ), + "OpenVINO/Phi-3-mini-4k-instruct-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-mini-4k-instruct-int8-ov", + model_name="Phi-3-mini-4k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=4096, + ), + # "OpenVINO/Phi-3-medium-4k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-medium-4k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-medium-4k-instruct-int4-ov", + # model_name="Phi-3-medium-4k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=4096, + # ), + "OpenVINO/Phi-3-medium-4k-instruct-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/Phi-3-medium-4k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-medium-4k-instruct-int8-ov", + model_name="Phi-3-medium-4k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "OpenVINO/open_llama_7b_v2-int8-ov": ModelCard( + 
hf_url="https://huggingface.co/OpenVINO/open_llama_7b_v2-int8-ov/tree/main/", + repo_id="OpenVINO/open_llama_7b_v2-int8-ov", + model_name="open_llama_7b_v2-int8-ov", + subfolder=".", + repo_type="model", + context_length=2048, + ), + "OpenVINO/open_llama_3b_v2-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/open_llama_3b_v2-int8-ov/tree/main/", + repo_id="OpenVINO/open_llama_3b_v2-int8-ov", + model_name="open_llama_3b_v2-int8-ov", + subfolder=".", + repo_type="model", + context_length=2048, + ), +} + +ipex_model_dict_list = { "microsoft/Phi-3-mini-4k-instruct": ModelCard( - hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/directml/directml-int4-awq-block-128", - repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", - model_name="Phi-3-mini-4k-instruct-onnx", - subfolder="directml/directml-int4-awq-block-128", + hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/", + repo_id="microsoft/Phi-3-mini-4k-instruct", + model_name="Phi-3-mini-4k-instruct", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4", - repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx", - model_name="Phi-3-mini-4k-instruct-062024-onnx", - subfolder="onnx/directml/Phi-3-mini-4k-instruct-062024-int4", + "microsoft/Phi-3-mini-128k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/tree/main", + repo_id="microsoft/Phi-3-mini-128k-instruct", + model_name="Phi-3-mini-128k-instruct", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "microsoft/Phi-3-medium-4k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-4k-instruct", + model_name="Phi-3-medium-4k-instruct", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "microsoft/Phi-3-medium-128k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-128k-instruct", + model_name="Phi-3-medium-128k-instruct", + subfolder=".", + repo_type="model", + context_length=131072, + ), +} + +dml_model_dict_list = { + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml", + model_name="Phi-3-mini-4k-instruct-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-onnx", - subfolder="onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml", + model_name="Phi-3-mini-128k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml": ModelCard( + 
hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml", + model_name="Phi-3-medium-4k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml", + model_name="Phi-3-medium-128k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + model_name="Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", + model_name="mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/gemma-2b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx/tree/main/onnx/directml/gemma-2b-it-int4", - repo_id="EmbeddedLLM/gemma-2b-it-onnx", - model_name="gemma-2b-it-int4", - subfolder="onnx/directml/gemma-2b-it-int4", + "EmbeddedLLM/gemma-2b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-2b-it-int4-onnx-directml", + model_name="gemma-2b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/gemma-7b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-onnx/tree/main/onnx/directml/gemma-7b-it-int4", - repo_id="EmbeddedLLM/gemma-7b-it-onnx", - model_name="gemma-7b-it-int4", - subfolder="onnx/directml/gemma-7b-it-int4", + "EmbeddedLLM/gemma-7b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-7b-it-int4-onnx-directml", + model_name="gemma-7b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), @@ -114,70 +240,94 @@ class ModelCard(BaseModel): repo_type="model", context_length=4096, ), - "EmbeddedLLM/Starling-LM-7b-beta-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-onnx/tree/main/onnx/directml/Starling-LM-7b-beta-int4", - repo_id="EmbeddedLLM/Starling-LM-7b-beta-onnx", - model_name="Starling-LM-7b-beta-int4", - subfolder="onnx/directml/Starling-LM-7b-beta-int4", + "EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml", + model_name="Starling-LM-7b-beta-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/directml/openchat-3.6-8b-20240522-int4", - 
repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-int4", - subfolder="onnx/directml/openchat-3.6-8b-20240522-int4", + "EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml", + model_name="openchat-3.6-8b-20240522-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx/tree/main/onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", - repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx", - model_name="01-ai_Yi-1.5-6B-Chat-int4", - subfolder="onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", + "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + model_name="01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), } cpu_model_dict_list = { - "microsoft/Phi-3-mini-4k-instruct": ModelCard( - hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", - model_name="Phi-3-mini-4k-instruct-onnx", - subfolder="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + 
repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", + repo_type="model", + context_length=131072, + ), + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=8192, ), @@ -231,8 +381,18 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"): repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type ) +for k, v in ipex_model_dict_list.items(): + v.size = compute_memory_size( + repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type + ) -def convert_to_dataframe(dml_model_dict_list): +for k, v in openvino_model_dict_list.items(): + 
v.size = compute_memory_size( + repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type + ) + + +def convert_to_dataframe(model_dict_list): # Create lists to store the data model_names = [] hf_urls = [] @@ -244,7 +404,7 @@ def convert_to_dataframe(dml_model_dict_list): context_lengths = [] # Iterate through the dictionary and extract the data - for key, model_card in dml_model_dict_list.items(): + for key, model_card in model_dict_list.items(): model_names.append(key) hf_urls.append(model_card.hf_url) repo_ids.append(model_card.repo_id) @@ -318,6 +478,12 @@ def update_model_list(engine_type): if engine_type == "DirectML": models = sorted(list(dml_model_dict_list.keys())) models_pandas = convert_to_dataframe(dml_model_dict_list) + elif engine_type == "Ipex": + models = sorted(list(ipex_model_dict_list.keys())) + models_pandas = convert_to_dataframe(ipex_model_dict_list) + elif engine_type == 'OpenVino': + models = sorted(list(openvino_model_dict_list.keys())) + models_pandas = convert_to_dataframe(openvino_model_dict_list) else: models = sorted(list(cpu_model_dict_list.keys())) models_pandas = convert_to_dataframe(cpu_model_dict_list) @@ -340,28 +506,51 @@ def deploy_model(engine_type, model_name, port_number): if engine_type == "DirectML": llm_model_card = dml_model_dict_list[model_name] + elif engine_type == "Ipex": + llm_model_card = ipex_model_dict_list[model_name] + elif engine_type == "OpenVino": + llm_model_card = openvino_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*", + allow_patterns=( + f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None + ), repo_type="model", ) - model_path = os.path.join(snapshot_path, llm_model_card.subfolder) + if llm_model_card.subfolder != ".": + model_path = os.path.join(snapshot_path, llm_model_card.subfolder) + else: + model_path = snapshot_path + + print("Model path:", model_path) + + if engine_type == "Ipex": + device = "xpu" + elif engine_type == "OpenVino": + device = "gpu" + else: + device = "cpu" deployed_model.process = subprocess.Popen( [ "ellm_server", "--model_path", model_path, + "--backend", + backend, + "--device", + device, "--port", f"{port_number}", - "--served_model_name", - model_name, + # "--served_model_name", + # model_name ] ) + deployed_model.model_name = model_name while True: @@ -375,6 +564,7 @@ def deploy_model(engine_type, model_name, port_number):

Model: {model_name}

Engine: {engine_type}

Port: {port_number}

+

Model Path: {model_path}

""" @@ -402,6 +592,10 @@ def download_model(engine_type, model_name): if engine_type == "DirectML": llm_model_card = dml_model_dict_list[model_name] + elif engine_type == "Ipex": + llm_model_card = ipex_model_dict_list[model_name] + elif engine_type == "OpenVino": + llm_model_card = openvino_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] @@ -412,7 +606,9 @@ def download_model(engine_type, model_name): yield "Downloading ..." snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*", + allow_patterns=( + f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None + ), repo_type="model", ) yield snapshot_path @@ -443,9 +639,16 @@ def main(): with gr.Accordion("See More Model Details", open=False): model_info_pandas_frame = gr.Dataframe(value=None) + default_value = "CPU" # Default value + if backend == "directml": + default_value = "DirectML" + elif backend == "ipex": + default_value = "Ipex" + elif backend == "openvino": + default_value = "OpenVino" selected_engine_type = gr.Dropdown( - choices=["DirectML", "CPU"], - value="DirectML" if backend == "directml" else "CPU", + choices=["DirectML", "Ipex", "OpenVino", "CPU"], + value=default_value, multiselect=False, label="LLM Engine", show_label=True, diff --git a/src/embeddedllm/inputs.py b/src/embeddedllm/inputs.py index 9797d05..8f05498 100644 --- a/src/embeddedllm/inputs.py +++ b/src/embeddedllm/inputs.py @@ -23,13 +23,13 @@ class ImagePixelData(TypedDict): # https://github.com/vllm-project/vllm/pull/4028 @overload -def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: - ... +def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: ... @overload -def parse_and_batch_prompt(prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: - ... +def parse_and_batch_prompt( + prompt: Union[List[int], List[List[int]]] +) -> Sequence[ParsedTokens]: ... def parse_and_batch_prompt(