File tree Expand file tree Collapse file tree 8 files changed +96
-27
lines changed Expand file tree Collapse file tree 8 files changed +96
-27
lines changed Original file line number Diff line number Diff line change @@ -9,6 +9,11 @@ dependencies:
9
9
- name : tgi
10
10
version : 0-latest
11
11
repository : " file://../common/tgi"
12
+ condition : tgi.enabled
13
+ - name : vllm
14
+ version : 0-latest
15
+ repository : " file://../common/vllm"
16
+ condition : vllm.enabled
12
17
- name : llm-uservice
13
18
version : 0-latest
14
19
repository : " file://../common/llm-uservice"
Original file line number Diff line number Diff line change @@ -15,9 +15,14 @@ helm dependency update codetrans
15
15
export HFTOKEN="insert-your-huggingface-token-here"
16
16
export MODELDIR="/mnt/opea-models"
17
17
export MODELNAME="mistralai/Mistral-7B-Instruct-v0.3"
18
- helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME}
19
- # To use Gaudi device
20
- # helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values codetrans/gaudi-values.yaml
18
+ # To use CPU with vLLM
19
+ helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} -f cpu-values.yaml
20
+ # To use CPU with TGI
21
+ # helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set tgi.LLM_MODEL_ID=${MODELNAME} -f cpu-tgi-values.yaml
22
+ # To use Gaudi device with vLLM
23
+ # helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set vllm.LLM_MODEL_ID=${MODELNAME} -f gaudi-values.yaml
24
+ # To use Gaudi device with TGI
25
+ # helm install codetrans codetrans --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set llm-uservice.LLM_MODEL_ID=${MODELNAME} --set tgi.LLM_MODEL_ID=${MODELNAME} -f gaudi-tgi-values.yaml
21
26
```
22
27
23
28
### IMPORTANT NOTE
Original file line number Diff line number Diff line change
1
+ # Copyright (C) 2024 Intel Corporation
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ tgi :
5
+ enabled : true
6
+ vllm :
7
+ enabled : false
8
+ llm-uservice :
9
+ TEXTGEN_BACKEND : TGI
Original file line number Diff line number Diff line change 2
2
# SPDX-License-Identifier: Apache-2.0
3
3
4
4
tgi :
5
- LLM_MODEL_ID : mistralai/Mistral-7B-Instruct-v0.3
5
+ enabled : false
6
+ vllm :
7
+ enabled : true
8
+ llm-uservice :
9
+ TEXTGEN_BACKEND : vLLM
Original file line number Diff line number Diff line change
1
+ # Copyright (C) 2024 Intel Corporation
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ tgi :
5
+ enabled : true
6
+ accelDevice : " gaudi"
7
+ image :
8
+ repository : ghcr.io/huggingface/tgi-gaudi
9
+ tag : " 2.3.1"
10
+ resources :
11
+ limits :
12
+ habana.ai/gaudi : 1
13
+ MAX_INPUT_LENGTH : " 2048"
14
+ MAX_TOTAL_TOKENS : " 4096"
15
+ CUDA_GRAPHS : " "
16
+ OMPI_MCA_btl_vader_single_copy_mechanism : " none"
17
+ ENABLE_HPU_GRAPH : " true"
18
+ LIMIT_HPU_GRAPH : " true"
19
+ USE_FLASH_ATTENTION : " true"
20
+ FLASH_ATTENTION_RECOMPUTE : " true"
21
+ livenessProbe :
22
+ initialDelaySeconds : 5
23
+ periodSeconds : 5
24
+ timeoutSeconds : 1
25
+ readinessProbe :
26
+ initialDelaySeconds : 5
27
+ periodSeconds : 5
28
+ timeoutSeconds : 1
29
+ startupProbe :
30
+ initialDelaySeconds : 5
31
+ periodSeconds : 5
32
+ timeoutSeconds : 1
33
+ failureThreshold : 120
34
+ vllm :
35
+ enabled : false
36
+ llm-uservice :
37
+ TEXTGEN_BACKEND : TGI
Original file line number Diff line number Diff line change 1
1
# Copyright (C) 2024 Intel Corporation
2
2
# SPDX-License-Identifier: Apache-2.0
3
3
4
+ # Accelerate inferencing in heaviest components to improve performance
5
+ # by overriding their subchart values
6
+
4
7
tgi :
8
+ enabled : false
9
+
10
+ vllm :
11
+ enabled : true
5
12
accelDevice : " gaudi"
6
13
image :
7
- repository : ghcr.io/huggingface/tgi-gaudi
8
- tag : " 2.3.1"
14
+ repository : opea/vllm-gaudi
15
+ startupProbe :
16
+ failureThreshold : 360
17
+
18
+ PT_HPU_ENABLE_LAZY_COLLECTIVES : " true"
19
+ OMPI_MCA_btl_vader_single_copy_mechanism : " none"
20
+
9
21
resources :
10
22
limits :
11
23
habana.ai/gaudi : 1
12
- MAX_INPUT_LENGTH : " 1024"
13
- MAX_TOTAL_TOKENS : " 2048"
14
- CUDA_GRAPHS : " "
15
- OMPI_MCA_btl_vader_single_copy_mechanism : " none"
16
- ENABLE_HPU_GRAPH : " true"
17
- LIMIT_HPU_GRAPH : " true"
18
- USE_FLASH_ATTENTION : " true"
19
- FLASH_ATTENTION_RECOMPUTE : " true"
20
- livenessProbe :
21
- initialDelaySeconds : 5
22
- periodSeconds : 5
23
- timeoutSeconds : 1
24
- readinessProbe :
25
- initialDelaySeconds : 5
26
- periodSeconds : 5
27
- timeoutSeconds : 1
28
- startupProbe :
29
- initialDelaySeconds : 5
30
- periodSeconds : 5
31
- timeoutSeconds : 1
32
- failureThreshold : 120
24
+ extraCmdArgs : [
25
+ " --tensor-parallel-size" , "1",
26
+ " --block-size" , "128",
27
+ " --max-num-seqs" , "256",
28
+ " --max-seq_len-to-capture" , "2048"
29
+ ]
30
+
31
+ llm-uservice :
32
+ TEXTGEN_BACKEND : vLLM
33
+ retryTimeoutSeconds : 720
Original file line number Diff line number Diff line change @@ -59,9 +59,15 @@ affinity: {}
59
59
60
60
# To override values in subchart tgi
61
61
tgi :
62
+ enabled : false
63
+ LLM_MODEL_ID : mistralai/Mistral-7B-Instruct-v0.3
64
+
65
+ vllm :
66
+ enabled : true
62
67
LLM_MODEL_ID : mistralai/Mistral-7B-Instruct-v0.3
63
68
64
69
llm-uservice :
70
+ TEXTGEN_BACKEND : vLLM
65
71
LLM_MODEL_ID : mistralai/Mistral-7B-Instruct-v0.3
66
72
67
73
nginx :
Original file line number Diff line number Diff line change @@ -53,7 +53,9 @@ codetrans:
53
53
dest_dir : CodeTrans/kubernetes/helm
54
54
values :
55
55
- cpu-values.yaml
56
+ - cpu-tgi-values.yaml
56
57
- gaudi-values.yaml
58
+ - gaudi-tgi-values.yaml
57
59
docsum :
58
60
src_repo : GenAIInfra
59
61
src_dir : helm-charts/docsum
You can’t perform that action at this time.
0 commit comments