
Commit 9adf7a6

Add support for latest deepseek models on Gaudi (#1491)
Signed-off-by: lvliang-intel <[email protected]>
1 parent a4d028e · commit 9adf7a6

10 files changed, +14 −5 lines

ChatQnA/docker_compose/intel/hpu/gaudi/README.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ Quick Start:
 2. Run Docker Compose.
 3. Consume the ChatQnA Service.
 
-Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models).
+Note: The default LLM is `meta-llama/Meta-Llama-3-8B-Instruct`. Before deploying the application, please make sure either you've requested and been granted the access to it on [Huggingface](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) or you've downloaded the model locally from [ModelScope](https://www.modelscope.cn/models). We now support running the latest DeepSeek models, including [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) and [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on Gaudi accelerators. To run `deepseek-ai/DeepSeek-R1-Distill-Llama-70B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 8 in the [set_env.sh](./set_env.sh) script. To run `deepseek-ai/DeepSeek-R1-Distill-Qwen-32B`, update the `LLM_MODEL_ID` and configure `NUM_CARDS` to 4 in the [set_env.sh](./set_env.sh) script.
 
 ## Quick Start: 1.Setup Environment Variable
 
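Per that note, switching models is just a matter of editing two variables in set_env.sh. A minimal sketch of the override (the model IDs and card counts below are taken verbatim from the README note above):

# Sketch: point ChatQnA on Gaudi at a DeepSeek distill model.
# Pairings per the README note:
#   deepseek-ai/DeepSeek-R1-Distill-Llama-70B -> NUM_CARDS=8
#   deepseek-ai/DeepSeek-R1-Distill-Qwen-32B  -> NUM_CARDS=4
export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
export NUM_CARDS=4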

ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml

Lines changed: 2 additions & 1 deletion
@@ -92,6 +92,7 @@ services:
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
     healthcheck:
       test: ["CMD-SHELL", "curl -f http://$host_ip:8007/health || exit 1"]
@@ -102,7 +103,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
     container_name: chatqna-gaudi-backend-server
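This is the substantive change for the vLLM path: `--tensor-parallel-size` now follows `NUM_CARDS` instead of being hard-coded to 1, so the same compose file serves both single-card and sharded deployments. A hedged end-to-end sketch (the directory, file, and variable names are from this commit; the 70B/8-card pairing is from the README note):

# Sketch: deploy ChatQnA with the 70B DeepSeek distill sharded over 8 Gaudi cards.
cd ChatQnA/docker_compose/intel/hpu/gaudi
source set_env.sh                                  # defaults, incl. NUM_CARDS=1
export LLM_MODEL_ID="deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
export NUM_CARDS=8                                 # 8 cards for the 70B distill, per the README
docker compose -f compose.yaml up -d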

ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml

Lines changed: 2 additions & 1 deletion
@@ -133,12 +133,13 @@ services:
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest}
     container_name: chatqna-gaudi-guardrails-server

ChatQnA/docker_compose/intel/hpu/gaudi/compose_tgi.yaml

Lines changed: 2 additions & 1 deletion
@@ -101,11 +101,12 @@ services:
       LIMIT_HPU_GRAPH: true
       USE_FLASH_ATTENTION: true
       FLASH_ATTENTION_RECOMPUTE: true
+      NUM_CARDS: ${NUM_CARDS}
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
+    command: --model-id ${LLM_MODEL_ID} --num-shard ${NUM_CARDS} --max-input-length 2048 --max-total-tokens 4096 --otlp-endpoint $OTEL_EXPORTER_OTLP_TRACES_ENDPOINT
   jaeger:
     image: jaegertracing/all-in-one:latest
     container_name: jaeger
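On the TGI path the same knob has a different spelling: text-generation-inference shards a model across devices with `--num-shard`, which this diff wires to `NUM_CARDS` in place of vLLM's `--tensor-parallel-size`. A rough standalone equivalent of the compose command, for illustration only (the flags are the ones visible in the diff; `text-generation-launcher` is TGI's entry point, and the OTLP tracing flag is omitted here):

# Sketch: roughly what the TGI service runs inside the container.
export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
export NUM_CARDS=1
text-generation-launcher \
  --model-id ${LLM_MODEL_ID} \
  --num-shard ${NUM_CARDS} \
  --max-input-length 2048 \
  --max-total-tokens 4096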

ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml

Lines changed: 2 additions & 1 deletion
@@ -73,12 +73,13 @@ services:
       HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       LLM_MODEL_ID: ${LLM_MODEL_ID}
+      NUM_CARDS: ${NUM_CARDS}
       VLLM_TORCH_PROFILER_DIR: "/mnt"
     runtime: habana
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+    command: --model ${LLM_MODEL_ID} --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
   chatqna-gaudi-backend-server:
     image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest}
     container_name: chatqna-gaudi-backend-server

ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
 export RERANK_MODEL_ID="BAAI/bge-reranker-base"
 export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
 export INDEX_NAME="rag-redis"
+export NUM_CARDS=1
 # Set it as a non-null string, such as true, if you want to enable logging facility,
 # otherwise, keep it as "" to disable it.
 export LOGFLAG=""
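The default of 1 preserves the single-card behaviour that the compose files previously hard-coded. Before raising it for the DeepSeek models, it is worth checking that the host actually exposes enough accelerators; a hedged sketch (this assumes `hl-smi`, the device tool shipped with the Gaudi driver stack, and assumes its `-L` listing prints one "Module ID" entry per card):

# Sketch: sanity-check NUM_CARDS against the cards hl-smi can see.
source set_env.sh
visible=$(hl-smi -L | grep -c "Module ID")   # assumption about hl-smi -L output format
if [ "${NUM_CARDS}" -gt "${visible}" ]; then
  echo "NUM_CARDS=${NUM_CARDS}, but only ${visible} Gaudi cards are visible" >&2
fi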

ChatQnA/tests/test_compose_guardrails_on_gaudi.sh

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ function start_services() {
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export RERANK_MODEL_ID="BAAI/bge-reranker-base"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export GURADRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"

ChatQnA/tests/test_compose_on_gaudi.sh

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ function start_services() {
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export RERANK_MODEL_ID="BAAI/bge-reranker-base"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export host_ip=${ip_address}

ChatQnA/tests/test_compose_tgi_on_gaudi.sh

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ function start_services() {
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export RERANK_MODEL_ID="BAAI/bge-reranker-base"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
     export JAEGER_IP=$(ip route get 8.8.8.8 | grep -oP 'src \K[^ ]+')

ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ function start_services() {
     cd $WORKPATH/docker_compose/intel/hpu/gaudi
     export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
     export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct"
+    export NUM_CARDS=1
     export INDEX_NAME="rag-redis"
     export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
 

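All four CI scripts pin NUM_CARDS=1, so the default single-card path stays covered and the multi-card DeepSeek configurations remain opt-in. Running one of them by hand looks roughly like this (the script path is from this diff; the token variable is the one the scripts already consume):

# Sketch: run the updated Gaudi smoke test locally.
export HUGGINGFACEHUB_API_TOKEN="<your Hugging Face token>"
bash ChatQnA/tests/test_compose_on_gaudi.sh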