
Commit 9adf7a6

Add support for latest deepseek models on Gaudi (#1491)
Signed-off-by: lvliang-intel <[email protected]>
1 parent a4d028e · commit 9adf7a6

10 files changed, +14 −5 lines

ChatQnA/docker_compose/intel/hpu/gaudi/README.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ Quick Start:
 2. Run Docker Compose.
 3. Consume the ChatQnA Service.
 
-Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). We now support running the latest DeepSeek models, including [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) and [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on Gaudi accelerators. To run `deepseek-ai/DeepSeek-R1-Distill-Llama-70B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 8 in the [set_env.sh](./set_env.sh) script. To run `deepseek-ai/DeepSeek-R1-Distill-Qwen-32B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 4 in the [set_env.sh](./set_env.sh) script.
 
 ## Quick Start: 1.Setup Environment Variable
 
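Per that note, switching models is just a matter of editing two variables in set_env.sh. A minimal sketch of the override (the model IDs and card counts below are taken verbatim from the README note above):

# Sketch: point ChatQnA on Gaudi at a DeepSeek distill model.
# Pairings per the README note:
#   deepseek-ai/DeepSeek-R1-Distill-Llama-70B -> NUM_CARDS=8
#   deepseek-ai/DeepSeek-R1-Distill-Qwen-32B  -> NUM_CARDS=4
export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
export NUM_CARDS=4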

ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml

Lines changed: 2 additions & 1 deletion
@@ -92,6 +92,7 @@ services:
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
     healthcheck:
       test: ["CMD-SHELL", "curl -f http://$host_ip:8007/health || exit 1"]
@@ -102,7 +103,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
     container_name: chatqna-gaudi-backend-server
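This is the substantive change for the vLLM path: `--tensor-parallel-size` now follows `NUM_CARDS` instead of being hard-coded to 1, so the same compose file serves both single-card and sharded deployments. A hedged end-to-end sketch (the directory, file, and variable names are from this commit; the 70B/8-card pairing is from the README note):

# Sketch: deploy ChatQnA with the 70B DeepSeek distill sharded over 8 Gaudi cards.
cd ChatQnA/docker_compose/intel/hpu/gaudi
source set_env.sh                                  # defaults, incl. NUM_CARDS=1
export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
export NUM_CARDS=8                                 # 8 cards for the 70B distill, per the README
docker compose -f compose.yaml up -d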

ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml

Lines changed: 2 additions & 1 deletion
@@ -133,12 +133,13 @@ services:
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest}
     container_name: chatqna-gaudi-guardrails-server

ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml

Lines changed: 2 additions & 1 deletion
@@ -101,11 +101,12 @@ services:
       LIMIT_HPU_GRAPH: true
       USE_FLASH_ATTENTION: true
       FLASH_ATTENTION_RECOMPUTE: true
+      NUM_CARDS: ${NUM_CARDS}
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+    command: --model-id ${LLM_MODEL_ID} --num-shard ${NUM_CARDS} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
   jaeger:
     image: jaegertracing/all-in-one:latest
     container_name: jaeger
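On the TGI path the same knob has a different spelling: text-generation-inference shards a model across devices with `--num-shard`, which this diff wires to `NUM_CARDS` in place of vLLM's `--tensor-parallel-size`. A rough standalone equivalent of the compose command, for illustration only (the flags are the ones visible in the diff; `text-generation-launcher` is TGI's entry point, and the OTLP tracing flag is omitted here):

# Sketch: roughly what the TGI service runs inside the container.
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export NUM_CARDS=1
text-generation-launcher \
  --model-id ${LLM_MODEL_ID} \
  --num-shard ${NUM_CARDS} \
  --max-input-length 2048 \
  --max-total-tokens 4096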

ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml

Lines changed: 2 additions & 1 deletion
@@ -73,12 +73,13 @@ services:
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
     container_name: chatqna-gaudi-backend-server

ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
 export RERANK_MODEL_ID="BAAI/bge-reranker-base"
 export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
 export INDEX_NAME="rag-redis"
+export NUM_CARDS=1
 # Set it as a non-null string, such as true, if you want to enable logging facility,
 # otherwise, keep it as "" to disable it.
 export LOGFLAG=""
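The default of 1 preserves the single-card behaviour that the compose files previously hard-coded. Before raising it for the DeepSeek models, it is worth checking that the host actually exposes enough accelerators; a hedged sketch (this assumes `hl-smi`, the device tool shipped with the Gaudi driver stack, and assumes its `-L` listing prints one "Module ID" entry per card):

# Sketch: sanity-check NUM_CARDS against the cards hl-smi can see.
source set_env.sh
visible=$(hl-smi -L | grep -c "Module ID")   # assumption about hl-smi -L output format
if [ "${NUM_CARDS}" -gt "${visible}" ]; then
  echo "NUM_CARDS=${NUM_CARDS}, but only ${visible} Gaudi cards are visible" >&2
fi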

ChatQnA/tests/test_compose_guardrails_on_gaudi.sh

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ function start_services() {
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export RERANK_MODEL_ID="BAAI/bge-reranker-base"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export GURADRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"

ChatQnA/tests/test_compose_on_gaudi.sh

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ function start_services() {
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export RERANK_MODEL_ID="BAAI/bge-reranker-base"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export host_ip=${ip_address}

ChatQnA/tests/test_compose_tgi_on_gaudi.sh

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ function start_services() {
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export RERANK_MODEL_ID="BAAI/bge-reranker-base"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')

ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ function start_services() {
     cd $WORKPATH/docker_compose/intel/hpu/gaudi
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 

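All four CI scripts pin NUM_CARDS=1, so the default single-card path stays covered and the multi-card DeepSeek configurations remain opt-in. Running one of them by hand looks roughly like this (the script path is from this diff; the token variable is the one the scripts already consume):

# Sketch: run the updated Gaudi smoke test locally.
export HUGGINGFACEHUB_API_TOKEN="<your Hugging Face token>"
bash ChatQnA/tests/test_compose_on_gaudi.sh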