
Commit dc5edc3

Support Phi-4-mini and Phi-4-multimodal-instruct in LLM text-generation comps on gaudi mode (opea-project#1335)
Signed-off-by: Xinyao Wang <[email protected]>
1 parent d51a136 commit dc5edc3

15 files changed, +5761 -9 lines changed

.github/workflows/docker/compose/llms-compose.yaml
Lines changed: 4 additions & 0 deletions

@@ -11,6 +11,10 @@ services:
     build:
       dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu
     image: ${REGISTRY:-opea}/llm-textgen-gaudi:${TAG:-latest}
+  llm-textgen-phi4-gaudi:
+    build:
+      dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu_phi4
+    image: ${REGISTRY:-opea}/llm-textgen-phi4-gaudi:${TAG:-latest}
   llm-docsum:
     build:
       dockerfile: comps/llms/src/doc-summarization/Dockerfile

comps/cores/proto/api_protocol.py
Lines changed: 2 additions & 0 deletions

@@ -177,6 +177,8 @@ class ChatCompletionRequest(BaseModel):
     parallel_tool_calls: Optional[bool] = True
     user: Optional[str] = None
     language: str = "auto"  # can be "en", "zh"
+    image_path: Optional[str] = None
+    audio_path: Optional[str] = None
 
 # Ordered by official OpenAI API documentation
 # default values are same with
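The two new optional fields let a request reference a local image or audio file by path while the rest of the OpenAI-style schema stays unchanged. A minimal sketch of constructing such a request (illustrative values; it assumes the package import path matches the file location shown above):

```python
from comps.cores.proto.api_protocol import ChatCompletionRequest

# Text-only request: existing behavior is unaffected.
req = ChatCompletionRequest(messages="What is Deep Learning?", max_tokens=17)

# Multimodal request: the new fields simply carry file paths; the path below
# is a placeholder, not a real asset.
mm_req = ChatCompletionRequest(
    messages="What is shown in this image?",
    image_path="/path/to/image.png",
    max_tokens=17,
)
print(mm_req.image_path, mm_req.audio_path)  # audio_path defaults to None
```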

comps/llms/deployment/docker_compose/compose_text-generation.yaml
Lines changed: 35 additions & 0 deletions

@@ -46,6 +46,29 @@ services:
       - SYS_NICE
     restart: unless-stopped
 
+  textgen-phi4-gaudi:
+    image: ${REGISTRY:-opea}/llm-textgen-phi4-gaudi:${TAG:-latest}
+    container_name: llm-textgen-phi4-gaudi-server
+    ports:
+      - ${TEXTGEN_PORT:-9000}:9000
+    volumes:
+      - "${DATA_PATH:-./data}:/data"
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      HF_TOKEN: ${HF_TOKEN}
+      HABANA_VISIBLE_DEVICES: all
+      OMPI_MCA_btl_vader_single_copy_mechanism: none
+      TOKENIZERS_PARALLELISM: False
+      LOGFLAG: ${LOGFLAG:-False}
+    runtime: habana
+    cap_add:
+      - SYS_NICE
+    restart: unless-stopped
+
   textgen-service-tgi:
     extends: textgen
     container_name: textgen-service-tgi
@@ -101,6 +124,18 @@ services:
     environment:
       LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}
 
+  textgen-native-phi4-gaudi:
+    extends: textgen-phi4-gaudi
+    container_name: textgen-native-phi4-gaudi
+    environment:
+      LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}
+
+  textgen-native-phi4-multimodal-gaudi:
+    extends: textgen-phi4-gaudi
+    container_name: textgen-native-phi4-multimodal-gaudi
+    environment:
+      LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNativePhi4Multimodal}
+
   textgen-service-ovms:
     extends: textgen
     container_name: textgen-service-ovms

comps/llms/src/text-generation/Dockerfile.intel_hpu
Lines changed: 3 additions & 1 deletion

@@ -15,6 +15,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
 
 RUN useradd -m -s /bin/bash user && \
     mkdir -p /home/user && \
+    mkdir -p /home/user/logs && \
    chown -R user /home/user/
 
 RUN git lfs install
@@ -29,9 +30,10 @@ RUN git clone ${REPO} /home/user/optimum-habana && \
     cd /home/user/optimum-habana && git checkout ${REPO_VER} && \
     cd examples/text-generation && pip install --no-cache-dir -r requirements.txt && \
     cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir --upgrade --force-reinstall pydantic numpy==1.23.5
+    pip install --no-cache-dir --upgrade --force-reinstall pydantic numpy==1.25
 
 ENV PYTHONPATH=/root:/home/user
+ENV HABANA_LOGS=/home/user/logs
 
 USER user
 
comps/llms/src/text-generation/Dockerfile.intel_hpu_phi4 (new file)
Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# HABANA environment
+FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1 AS hpu
+
+ENV LANG=en_US.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+    git-lfs \
+    libgl1-mesa-glx \
+    libjemalloc-dev
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    mkdir -p /home/user/logs && \
+    chown -R user /home/user/
+
+RUN git lfs install
+
+COPY comps /home/user/comps
+
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
+    pip install --no-cache-dir git+https://github.com/HabanaAI/[email protected]
+
+RUN pip install git+https://github.com/huggingface/optimum-habana.git@transformers_future && \
+    cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
+    pip install soundfile peft backoff
+
+ENV PYTHONPATH=/root:/home/user
+ENV HABANA_LOGS=/home/user/logs
+
+WORKDIR /home/user/comps/llms/src/text-generation/
+
+ENTRYPOINT ["bash", "entrypoint_phi4.sh"]

comps/llms/src/text-generation/README_native.md
Lines changed: 38 additions & 4 deletions

@@ -8,20 +8,36 @@ LLM Native microservice uses [optimum-habana](https://github.com/huggingface/opt
 
 In order to start Native LLM service, you need to setup the following environment variables first.
 
-For LLM model, both `Qwen` and `Falcon3` models are supported. Users can set different models by changing the `LLM_MODEL_ID` below.
+For the LLM model, `Qwen`, `Falcon3`, and `Phi4` models are supported. Users can select a different model by changing `LLM_MODEL_ID` below.
 
 ```bash
 export LLM_MODEL_ID="Qwen/Qwen2-7B-Instruct"
 export HF_TOKEN="your_huggingface_token"
 export TEXTGEN_PORT=10512
+export LLM_COMPONENT_NAME="OpeaTextGenNative"
 export host_ip=${host_ip}
 ```
 
+Note: to run "microsoft/Phi-4-multimodal-instruct", download the [model weights](https://huggingface.co/microsoft/Phi-4-multimodal-instruct/tree/main) manually, place them at `/path/to/Phi-4-multimodal-instruct` locally, and then set the following environment variables.
+
+```bash
+export LLM_MODEL_ID="/path/to/Phi-4-multimodal-instruct"
+export LLM_COMPONENT_NAME="OpeaTextGenNativePhi4Multimodal"
+```
+
 ### 1.2 Build Docker Image
 
 ```bash
+## For `Qwen` and `Falcon`
+dockerfile_path="comps/llms/src/text-generation/Dockerfile.intel_hpu"
+export image_name="opea/llm-textgen-gaudi:latest"
+
+## For `Phi4`
+# dockerfile_path="comps/llms/src/text-generation/Dockerfile.intel_hpu_phi4"
+# export image_name="opea/llm-textgen-phi4-gaudi:latest"
+
 cd ../../../../../
-docker build -t opea/llm-textgen-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile.intel_hpu .
+docker build -t $image_name --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $dockerfile_path .
 ```
 
 To start a docker container, you have two options:
@@ -34,13 +50,15 @@ You can choose one as needed.
 ### 1.3 Run Docker with CLI (Option A)
 
 ```bash
-docker run -d --runtime=habana --name="llm-native-server" -p 9000:9000 -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e TOKENIZERS_PARALLELISM=false -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e LLM_MODEL_ID=${LLM_MODEL_ID} opea/llm-textgen-gaudi:latest
+docker run -d --runtime=habana --name="llm-native-server" -p $TEXTGEN_PORT:9000 -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e TOKENIZERS_PARALLELISM=false -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e LLM_MODEL_ID=${LLM_MODEL_ID} -e LLM_COMPONENT_NAME=$LLM_COMPONENT_NAME $image_name
 ```
 
 ### 1.4 Run Docker with Docker Compose (Option B)
 
 ```bash
 export service_name="textgen-native-gaudi"
+# export service_name="textgen-native-phi4-gaudi"            # For Phi-4-mini-instruct
+# export service_name="textgen-native-phi4-multimodal-gaudi" # For Phi-4-multimodal-instruct
 cd comps/llms/deployment/docker_compose
 docker compose -f compose_text-generation.yaml up ${service_name} -d
 ```
@@ -60,6 +78,22 @@ curl http://${your_ip}:9000/v1/health_check\
 ```bash
 curl http://${your_ip}:9000/v1/chat/completions\
   -X POST \
-  -d '{"messages":"What is Deep Learning?"}' \
+  -d '{"messages":"What is Deep Learning?", "max_tokens":17}' \
+  -H 'Content-Type: application/json'
+```
+
+If you run a multimodal model such as `Phi-4-multimodal-instruct`, you can also send image or audio input.
+
+```bash
+# image
+curl http://${your_ip}:9000/v1/chat/completions\
+  -X POST \
+  -d '{"messages":"What is shown in this image?", "image_path":"/path/to/image", "max_tokens":17}' \
+  -H 'Content-Type: application/json'
+
+# audio
+curl http://${your_ip}:9000/v1/chat/completions\
+  -X POST \
+  -d '{"messages":"Based on the attached audio, generate a comprehensive text transcription of the spoken content.", "audio_path":"/path/to/audio", "max_tokens":17}' \
   -H 'Content-Type: application/json'
 ```
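For readers who prefer Python over curl, here is a minimal client-side sketch of the same multimodal call (host, port, and file path are placeholders; it assumes the third-party `requests` package is installed and that the path is readable by the service):

```python
import requests

# Placeholder endpoint; substitute the host and TEXTGEN_PORT of your deployment.
url = "http://localhost:9000/v1/chat/completions"

payload = {
    "messages": "What is shown in this image?",
    "image_path": "/path/to/image",  # path the service can read, e.g. a mounted volume
    "max_tokens": 17,
}

response = requests.post(url, json=payload, timeout=300)
print(response.json())
```
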
comps/llms/src/text-generation/entrypoint_phi4.sh (new file)
Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#!/bin/bash
+
+# LLM_MODEL_ID must be a model path
+llm_name=$LLM_MODEL_ID
+WORKPATH="/home/user/comps/llms/src/text-generation/"
+
+if [[ $llm_name == *"Phi-4-multimodal-instruct"* ]]; then
+    cd $WORKPATH
+    echo -e "Patching into the multimodal models"
+    cp patch/phi4-multimodal-patch/*.py $llm_name/
+    export PT_HPU_LAZY_MODE=1
+elif [[ $llm_name == *"Phi-4-mini-instruct"* ]]; then
+    cd $WORKPATH
+    git clone -b transformers_future https://github.com/huggingface/optimum-habana
+    cd optimum-habana
+    cp ../patch/optimum-habana-phi4.patch .
+    git apply optimum-habana-phi4.patch
+    pip install -e .
+    cd examples/text-generation/
+    pip install -r requirements.txt
+    cd phi-4-mini-instruct/
+    bash ./01-patch-transformer.sh
+fi
+
+cd $WORKPATH
+python opea_llm_microservice.py

comps/llms/src/text-generation/integrations/native.py
Lines changed: 20 additions & 3 deletions

@@ -48,8 +48,8 @@
     "device": "hpu",
     "model_name_or_path": MODEL_NAME,
     "bf16": True,
-    "max_new_tokens": 100,
-    "max_input_tokens": 0,
+    "max_new_tokens": 32,
+    "max_input_tokens": 128,
     "batch_size": 1,
     "warmup": 3,
     "n_iterations": 5,
@@ -105,6 +105,21 @@
     "penalty_alpha": None,
 }
 
+if "Phi-4-mini-instruct" in MODEL_NAME:
+    args_dict_phi4 = {
+        "use_kv_cache": False,
+        "attn_softmax_bf16": True,
+        "limit_hpu_graphs": True,
+        "use_flash_attention": True,
+        "flash_attention_recompute": True,
+        "flash_attention_causal_mask": True,
+        "flash_attention_fast_softmax": True,
+    }
+    args_dict.update(args_dict_phi4)
+
+if logflag:
+    logger.info(args_dict)
+
 
 class Args:
     def __init__(self, **entries):
@@ -123,6 +138,7 @@ def __init__(self, **entries):
 def generate(
     input_query: list,
     device="hpu",
+    max_new_tokens=32,
     use_lazy_mode=True,
     use_hpu_graphs=True,
     profiling_steps=0,
@@ -159,6 +175,7 @@
         **input_tokens,
         generation_config=generation_config,
         assistant_model=assistant_model,
+        max_new_tokens=max_new_tokens,
         lazy_mode=use_lazy_mode,
         hpu_graphs=use_hpu_graphs,
         profiling_steps=profiling_steps,
@@ -262,7 +279,7 @@ async def invoke(self, input: ChatCompletionRequest):
         else:
             if input.documents:
                 prompt = ChatTemplate.generate_rag_prompt(message, input.documents)
-            res = generate([prompt])
+            res = generate([prompt], max_new_tokens=input.max_tokens)
 
             if logflag:
                 logger.info(f"[llm - native] inference result: {res}")
