
Commit dc5edc3

Support Phi-4-mini and Phi-4-multimodal-instruct in LLM text-generation comps on gaudi mode (opea-project#1335)
Signed-off-by: Xinyao Wang <[email protected]>
1 parent d51a136 commit dc5edc3

15 files changed, +5761 -9 lines changed

.github/workflows/docker/compose/llms-compose.yaml
Lines changed: 4 additions & 0 deletions

@@ -11,6 +11,10 @@ services:
     build:
       dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu
     image: ${REGISTRY:-opea}/llm-textgen-gaudi:${TAG:-latest}
+  llm-textgen-phi4-gaudi:
+    build:
+      dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu_phi4
+    image: ${REGISTRY:-opea}/llm-textgen-phi4-gaudi:${TAG:-latest}
   llm-docsum:
     build:
       dockerfile: comps/llms/src/doc-summarization/Dockerfile

comps/cores/proto/api_protocol.py
Lines changed: 2 additions & 0 deletions

@@ -177,6 +177,8 @@ class ChatCompletionRequest(BaseModel):
     parallel_tool_calls: Optional[bool] = True
     user: Optional[str] = None
     language: str = "auto"  # can be "en", "zh"
+    image_path: Optional[str] = None
+    audio_path: Optional[str] = None
 
 # Ordered by official OpenAI API documentation
 # default values are same with
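The two new optional fields let a request reference a local image or audio file by path while the rest of the OpenAI-style schema stays unchanged. A minimal sketch of constructing such a request (illustrative values; it assumes the package import path matches the file location shown above):

```python
from comps.cores.proto.api_protocol import ChatCompletionRequest

# Text-only request: existing behavior is unaffected.
req = ChatCompletionRequest(messages="What is Deep Learning?", max_tokens=17)

# Multimodal request: the new fields simply carry file paths; the path below
# is a placeholder, not a real asset.
mm_req = ChatCompletionRequest(
    messages="What is shown in this image?",
    image_path="/path/to/image.png",
    max_tokens=17,
)
print(mm_req.image_path, mm_req.audio_path)  # audio_path defaults to None
```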

comps/llms/deployment/docker_compose/compose_text-generation.yaml
Lines changed: 35 additions & 0 deletions

@@ -46,6 +46,29 @@ services:
       - SYS_NICE
     restart: unless-stopped
 
+  textgen-phi4-gaudi:
+    image: ${REGISTRY:-opea}/llm-textgen-phi4-gaudi:${TAG:-latest}
+    container_name: llm-textgen-phi4-gaudi-server
+    ports:
+      - ${TEXTGEN_PORT:-9000}:9000
+    volumes:
+      - "${DATA_PATH:-./data}:/data"
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      LLM_MODEL_ID: ${LLM_MODEL_ID}
+      HF_TOKEN: ${HF_TOKEN}
+      HABANA_VISIBLE_DEVICES: all
+      OMPI_MCA_btl_vader_single_copy_mechanism: none
+      TOKENIZERS_PARALLELISM: False
+      LOGFLAG: ${LOGFLAG:-False}
+    runtime: habana
+    cap_add:
+      - SYS_NICE
+    restart: unless-stopped
+
   textgen-service-tgi:
     extends: textgen
     container_name: textgen-service-tgi
@@ -101,6 +124,18 @@ services:
     environment:
       LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}
 
+  textgen-native-phi4-gaudi:
+    extends: textgen-phi4-gaudi
+    container_name: textgen-native-phi4-gaudi
+    environment:
+      LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNative}
+
+  textgen-native-phi4-multimodal-gaudi:
+    extends: textgen-phi4-gaudi
+    container_name: textgen-native-phi4-multimodal-gaudi
+    environment:
+      LLM_COMPONENT_NAME: ${LLM_COMPONENT_NAME:-OpeaTextGenNativePhi4Multimodal}
+
   textgen-service-ovms:
     extends: textgen
     container_name: textgen-service-ovms

comps/llms/src/text-generation/Dockerfile.intel_hpu
Lines changed: 3 additions & 1 deletion

@@ -15,6 +15,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
 
 RUN useradd -m -s /bin/bash user && \
     mkdir -p /home/user && \
+    mkdir -p /home/user/logs && \
    chown -R user /home/user/
 
 RUN git lfs install
@@ -29,9 +30,10 @@ RUN git clone ${REPO} /home/user/optimum-habana && \
     cd /home/user/optimum-habana && git checkout ${REPO_VER} && \
     cd examples/text-generation && pip install --no-cache-dir -r requirements.txt && \
     cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir --upgrade --force-reinstall pydantic numpy==1.23.5
+    pip install --no-cache-dir --upgrade --force-reinstall pydantic numpy==1.25
 
 ENV PYTHONPATH=/root:/home/user
+ENV HABANA_LOGS=/home/user/logs
 
 USER user
 
comps/llms/src/text-generation/Dockerfile.intel_hpu_phi4 (new file)
Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# HABANA environment
+FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1 AS hpu
+
+ENV LANG=en_US.UTF-8
+
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+    git-lfs \
+    libgl1-mesa-glx \
+    libjemalloc-dev
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    mkdir -p /home/user/logs && \
+    chown -R user /home/user/
+
+RUN git lfs install
+
+COPY comps /home/user/comps
+
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
+    pip install --no-cache-dir git+https://github.com/HabanaAI/[email protected]
+
+RUN pip install git+https://github.com/huggingface/optimum-habana.git@transformers_future && \
+    cd /home/user/comps/llms/src/text-generation/ && pip install --no-cache-dir -r requirements.txt && \
+    pip install soundfile peft backoff
+
+ENV PYTHONPATH=/root:/home/user
+ENV HABANA_LOGS=/home/user/logs
+
+WORKDIR /home/user/comps/llms/src/text-generation/
+
+ENTRYPOINT ["bash", "entrypoint_phi4.sh"]

comps/llms/src/text-generation/README_native.md
Lines changed: 38 additions & 4 deletions

@@ -8,20 +8,36 @@ LLM Native microservice uses [optimum-habana](https://github.com/huggingface/opt
 
 In order to start Native LLM service, you need to setup the following environment variables first.
 
-For LLM model, both `Qwen` and `Falcon3` models are supported. Users can set different models by changing the `LLM_MODEL_ID` below.
+For the LLM model, `Qwen`, `Falcon3`, and `Phi4` models are supported. Users can select a different model by changing `LLM_MODEL_ID` below.
 
 ```bash
 export LLM_MODEL_ID="Qwen/Qwen2-7B-Instruct"
 export HF_TOKEN="your_huggingface_token"
 export TEXTGEN_PORT=10512
+export LLM_COMPONENT_NAME="OpeaTextGenNative"
 export host_ip=${host_ip}
 ```
 
+Note: to run "microsoft/Phi-4-multimodal-instruct", download the [model weights](https://huggingface.co/microsoft/Phi-4-multimodal-instruct/tree/main) manually, place them at `/path/to/Phi-4-multimodal-instruct` locally, and then set the following environment variables.
+
+```bash
+export LLM_MODEL_ID="/path/to/Phi-4-multimodal-instruct"
+export LLM_COMPONENT_NAME="OpeaTextGenNativePhi4Multimodal"
+```
+
 ### 1.2 Build Docker Image
 
 ```bash
+## For `Qwen` and `Falcon`
+dockerfile_path="comps/llms/src/text-generation/Dockerfile.intel_hpu"
+export image_name="opea/llm-textgen-gaudi:latest"
+
+## For `Phi4`
+# dockerfile_path="comps/llms/src/text-generation/Dockerfile.intel_hpu_phi4"
+# export image_name="opea/llm-textgen-phi4-gaudi:latest"
+
 cd ../../../../../
-docker build -t opea/llm-textgen-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile.intel_hpu .
+docker build -t $image_name --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $dockerfile_path .
 ```
 
 To start a docker container, you have two options:
@@ -34,13 +50,15 @@ You can choose one as needed.
 ### 1.3 Run Docker with CLI (Option A)
 
 ```bash
-docker run -d --runtime=habana --name="llm-native-server" -p 9000:9000 -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e TOKENIZERS_PARALLELISM=false -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e LLM_MODEL_ID=${LLM_MODEL_ID} opea/llm-textgen-gaudi:latest
+docker run -d --runtime=habana --name="llm-native-server" -p $TEXTGEN_PORT:9000 -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e TOKENIZERS_PARALLELISM=false -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e LLM_MODEL_ID=${LLM_MODEL_ID} -e LLM_COMPONENT_NAME=$LLM_COMPONENT_NAME $image_name
 ```
 
 ### 1.4 Run Docker with Docker Compose (Option B)
 
 ```bash
 export service_name="textgen-native-gaudi"
+# export service_name="textgen-native-phi4-gaudi"            # For Phi-4-mini-instruct
+# export service_name="textgen-native-phi4-multimodal-gaudi" # For Phi-4-multimodal-instruct
 cd comps/llms/deployment/docker_compose
 docker compose -f compose_text-generation.yaml up ${service_name} -d
 ```
@@ -60,6 +78,22 @@ curl http://${your_ip}:9000/v1/health_check\
 ```bash
 curl http://${your_ip}:9000/v1/chat/completions\
   -X POST \
-  -d '{"messages":"What is Deep Learning?"}' \
+  -d '{"messages":"What is Deep Learning?", "max_tokens":17}' \
+  -H 'Content-Type: application/json'
+```
+
+If you run a multimodal model such as `Phi-4-multimodal-instruct`, you can also send image or audio input.
+
+```bash
+# image
+curl http://${your_ip}:9000/v1/chat/completions\
+  -X POST \
+  -d '{"messages":"What is shown in this image?", "image_path":"/path/to/image", "max_tokens":17}' \
+  -H 'Content-Type: application/json'
+
+# audio
+curl http://${your_ip}:9000/v1/chat/completions\
+  -X POST \
+  -d '{"messages":"Based on the attached audio, generate a comprehensive text transcription of the spoken content.", "audio_path":"/path/to/audio", "max_tokens":17}' \
   -H 'Content-Type: application/json'
 ```
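For readers who prefer Python over curl, here is a minimal client-side sketch of the same multimodal call (host, port, and file path are placeholders; it assumes the third-party `requests` package is installed and that the path is readable by the service):

```python
import requests

# Placeholder endpoint; substitute the host and TEXTGEN_PORT of your deployment.
url = "http://localhost:9000/v1/chat/completions"

payload = {
    "messages": "What is shown in this image?",
    "image_path": "/path/to/image",  # path the service can read, e.g. a mounted volume
    "max_tokens": 17,
}

response = requests.post(url, json=payload, timeout=300)
print(response.json())
```
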
comps/llms/src/text-generation/entrypoint_phi4.sh (new file)
Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#!/bin/bash
+
+# LLM_MODEL_ID must be a model path
+llm_name=$LLM_MODEL_ID
+WORKPATH="/home/user/comps/llms/src/text-generation/"
+
+if [[ $llm_name == *"Phi-4-multimodal-instruct"* ]]; then
+    cd $WORKPATH
+    echo -e "Patching into the multimodal models"
+    cp patch/phi4-multimodal-patch/*.py $llm_name/
+    export PT_HPU_LAZY_MODE=1
+elif [[ $llm_name == *"Phi-4-mini-instruct"* ]]; then
+    cd $WORKPATH
+    git clone -b transformers_future https://github.com/huggingface/optimum-habana
+    cd optimum-habana
+    cp ../patch/optimum-habana-phi4.patch .
+    git apply optimum-habana-phi4.patch
+    pip install -e .
+    cd examples/text-generation/
+    pip install -r requirements.txt
+    cd phi-4-mini-instruct/
+    bash ./01-patch-transformer.sh
+fi
+
+cd $WORKPATH
+python opea_llm_microservice.py

comps/llms/src/text-generation/integrations/native.py
Lines changed: 20 additions & 3 deletions

@@ -48,8 +48,8 @@
     "device": "hpu",
     "model_name_or_path": MODEL_NAME,
     "bf16": True,
-    "max_new_tokens": 100,
-    "max_input_tokens": 0,
+    "max_new_tokens": 32,
+    "max_input_tokens": 128,
     "batch_size": 1,
     "warmup": 3,
     "n_iterations": 5,
@@ -105,6 +105,21 @@
     "penalty_alpha": None,
 }
 
+if "Phi-4-mini-instruct" in MODEL_NAME:
+    args_dict_phi4 = {
+        "use_kv_cache": False,
+        "attn_softmax_bf16": True,
+        "limit_hpu_graphs": True,
+        "use_flash_attention": True,
+        "flash_attention_recompute": True,
+        "flash_attention_causal_mask": True,
+        "flash_attention_fast_softmax": True,
+    }
+    args_dict.update(args_dict_phi4)
+
+if logflag:
+    logger.info(args_dict)
+
 
 class Args:
     def __init__(self, **entries):
@@ -123,6 +138,7 @@ def __init__(self, **entries):
 def generate(
     input_query: list,
     device="hpu",
+    max_new_tokens=32,
     use_lazy_mode=True,
     use_hpu_graphs=True,
     profiling_steps=0,
@@ -159,6 +175,7 @@
         **input_tokens,
         generation_config=generation_config,
         assistant_model=assistant_model,
+        max_new_tokens=max_new_tokens,
         lazy_mode=use_lazy_mode,
         hpu_graphs=use_hpu_graphs,
         profiling_steps=profiling_steps,
@@ -262,7 +279,7 @@ async def invoke(self, input: ChatCompletionRequest):
         else:
             if input.documents:
                 prompt = ChatTemplate.generate_rag_prompt(message, input.documents)
-            res = generate([prompt])
+            res = generate([prompt], max_new_tokens=input.max_tokens)
 
             if logflag:
                 logger.info(f"[llm - native] inference result: {res}")
