Skip to content

Commit f50dd75

Browse files
committed
codetrans: add vLLM as default inference engine
Signed-off-by: Lianhao Lu <[email protected]>
1 parent 10f85d2 commit f50dd75

File tree

8 files changed

+96
-27
lines changed

8 files changed

+96
-27
lines changed

helm-charts/codetrans/Chart.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ dependencies:
99
- name: tgi
1010
version: 0-latest
1111
repository: "file://../common/tgi"
12+
condition: tgi.enabled
13+
- name: vllm
14+
version: 0-latest
15+
repository: "file://../common/vllm"
16+
condition: vllm.enabled
1217
- name: llm-uservice
1318
version: 0-latest
1419
repository: "file://../common/llm-uservice"

helm-charts/codetrans/README.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,14 @@ helm dependency update codetrans
1515
export HFTOKEN="insert-your-huggingface-token-here"
1616
export MODELDIR="/mnt/opea-models"
1717
export MODELNAME="mistralai/Mistral-7B-Instruct-v0.3"
18-
helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME}
19-
# To use Gaudi device
20-
# helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values codetrans/gaudi-values.yaml
18+
# To use CPU with vLLM
19+
helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} -f codetrans/cpu-values.yaml
20+
# To use CPU with TGI
21+
# helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set tgi.LLM_MODEL_ID=${MODELNAME} -f codetrans/cpu-tgi-values.yaml
22+
# To use Gaudi device with vLLM
23+
# helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} -f codetrans/gaudi-values.yaml
24+
# To use Gaudi device with TGI
25+
# helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set tgi.LLM_MODEL_ID=${MODELNAME} -f codetrans/gaudi-tgi-values.yaml
2126
```
2227

2328
### IMPORTANT NOTE
helm-charts/codetrans/cpu-tgi-values.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Copyright (C) 2024 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
tgi:
5+
enabled: true
6+
vllm:
7+
enabled: false
8+
llm-uservice:
9+
TEXTGEN_BACKEND: TGI

helm-charts/codetrans/cpu-values.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,8 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
tgi:
5-
LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
5+
enabled: false
6+
vllm:
7+
enabled: true
8+
llm-uservice:
9+
TEXTGEN_BACKEND: vLLM
helm-charts/codetrans/gaudi-tgi-values.yaml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright (C) 2024 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
tgi:
5+
enabled: true
6+
accelDevice: "gaudi"
7+
image:
8+
repository: ghcr.io/huggingface/tgi-gaudi
9+
tag: "2.3.1"
10+
resources:
11+
limits:
12+
habana.ai/gaudi: 1
13+
MAX_INPUT_LENGTH: "2048"
14+
MAX_TOTAL_TOKENS: "4096"
15+
CUDA_GRAPHS: ""
16+
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
17+
ENABLE_HPU_GRAPH: "true"
18+
LIMIT_HPU_GRAPH: "true"
19+
USE_FLASH_ATTENTION: "true"
20+
FLASH_ATTENTION_RECOMPUTE: "true"
21+
livenessProbe:
22+
initialDelaySeconds: 5
23+
periodSeconds: 5
24+
timeoutSeconds: 1
25+
readinessProbe:
26+
initialDelaySeconds: 5
27+
periodSeconds: 5
28+
timeoutSeconds: 1
29+
startupProbe:
30+
initialDelaySeconds: 5
31+
periodSeconds: 5
32+
timeoutSeconds: 1
33+
failureThreshold: 120
34+
vllm:
35+
enabled: false
36+
llm-uservice:
37+
TEXTGEN_BACKEND: TGI
helm-charts/codetrans/gaudi-values.yaml

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,33 @@
11
# Copyright (C) 2024 Intel Corporation
22
# SPDX-License-Identifier: Apache-2.0
33

4+
# Accelerate inferencing in heaviest components to improve performance
5+
# by overriding their subchart values
6+
47
tgi:
8+
enabled: false
9+
10+
vllm:
11+
enabled: true
512
accelDevice: "gaudi"
613
image:
7-
repository: ghcr.io/huggingface/tgi-gaudi
8-
tag: "2.3.1"
14+
repository: opea/vllm-gaudi
15+
startupProbe:
16+
failureThreshold: 360
17+
18+
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
19+
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
20+
921
resources:
1022
limits:
1123
habana.ai/gaudi: 1
12-
MAX_INPUT_LENGTH: "1024"
13-
MAX_TOTAL_TOKENS: "2048"
14-
CUDA_GRAPHS: ""
15-
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
16-
ENABLE_HPU_GRAPH: "true"
17-
LIMIT_HPU_GRAPH: "true"
18-
USE_FLASH_ATTENTION: "true"
19-
FLASH_ATTENTION_RECOMPUTE: "true"
20-
livenessProbe:
21-
initialDelaySeconds: 5
22-
periodSeconds: 5
23-
timeoutSeconds: 1
24-
readinessProbe:
25-
initialDelaySeconds: 5
26-
periodSeconds: 5
27-
timeoutSeconds: 1
28-
startupProbe:
29-
initialDelaySeconds: 5
30-
periodSeconds: 5
31-
timeoutSeconds: 1
32-
failureThreshold: 120
24+
extraCmdArgs: [
25+
"--tensor-parallel-size", "1",
26+
"--block-size", "128",
27+
"--max-num-seqs", "256",
28+
"--max-seq_len-to-capture", "2048"
29+
]
30+
31+
llm-uservice:
32+
TEXTGEN_BACKEND: vLLM
33+
retryTimeoutSeconds: 720

helm-charts/codetrans/values.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,15 @@ affinity: {}
5959

6060
# To override values in subchart tgi
6161
tgi:
62+
enabled: false
63+
LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
64+
65+
vllm:
66+
enabled: true
6267
LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
6368

6469
llm-uservice:
70+
TEXTGEN_BACKEND: vLLM
6571
LLM_MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3
6672

6773
nginx:

helm-charts/valuefiles.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,9 @@ codetrans:
5353
dest_dir: CodeTrans/kubernetes/helm
5454
values:
5555
- cpu-values.yaml
56+
- cpu-tgi-values.yaml
5657
- gaudi-values.yaml
58+
- gaudi-tgi-values.yaml
5759
docsum:
5860
src_repo: GenAIInfra
5961
src_dir: helm-charts/docsum

0 commit comments

Comments
 (0)