Skip to content

Commit cdd3585

Browse files
authored
Support alternative metrics on accelerated TGI / TEI instances (#454)
* Add tgi.accelDevice to rest of top-level gaudi-values.yaml files DocSum defaults to same model as ChatQnA, and default model used by CodeGen + CodeTrans is also 7b one, so tgi.accelDevice impact is assumed to be close enough. Signed-off-by: Eero Tamminen <[email protected]> * Different TGI/TEI custom metrics & HPA rules for accelerated devices Signed-off-by: Eero Tamminen <[email protected]> --------- Signed-off-by: Eero Tamminen <[email protected]>
1 parent cdd47a5 commit cdd3585

File tree

7 files changed

+122
-71
lines changed

7 files changed

+122
-71
lines changed

helm-charts/chatqna/templates/custom-metrics-configmap.yaml

Lines changed: 71 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -12,54 +12,75 @@ metadata:
1212
app.kubernetes.io/name: prometheus-adapter
1313
data:
1414
config.yaml: |
15-
rules:
16-
{{- if .Values.tgi.horizontalPodAutoscaler.enabled }}
17-
# check metric with:
18-
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency | jq
19-
#
20-
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
21-
# Average request latency from TGI histograms, over 1 min
22-
# (0.001 divider add is to make sure there's always a valid value)
23-
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
24-
name:
25-
matches: ^tgi_request_inference_duration_sum
26-
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
27-
resources:
28-
# HPA needs both namespace + suitable object resource for its query paths:
29-
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
30-
# (pod is not suitable object type for matching as each instance has different name)
31-
overrides:
32-
namespace:
33-
resource: namespace
34-
service:
35-
resource: service
36-
{{- end }}
37-
{{- if .Values.teirerank.horizontalPodAutoscaler.enabled }}
38-
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
39-
# Average request latency from TEI histograms, over 1 min
40-
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
41-
name:
42-
matches: ^te_request_inference_duration_sum
43-
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
44-
resources:
45-
overrides:
46-
namespace:
47-
resource: namespace
48-
service:
49-
resource: service
50-
{{- end }}
51-
{{- if .Values.tei.horizontalPodAutoscaler.enabled }}
52-
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
53-
# Average request latency from TEI histograms, over 1 min
54-
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
55-
name:
56-
matches: ^te_request_inference_duration_sum
57-
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
58-
resources:
59-
overrides:
60-
namespace:
61-
resource: namespace
62-
service:
63-
resource: service
64-
{{- end }}
15+
rules:
16+
{{- if .Values.tgi.horizontalPodAutoscaler.enabled }}
17+
# check metric with:
18+
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric> | jq
19+
#
20+
{{- if .Values.tgi.accelDevice }}
21+
- seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
22+
# TGI instances queue_size sum
23+
metricsQuery: 'sum by (namespace,service) (tgi_queue_size{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>})'
24+
name:
25+
matches: ^tgi_queue_size
26+
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_queue_size_sum"
27+
{{- else }}
28+
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
29+
# Average request latency from TGI histograms, over 1 min
30+
# (0.001 divider add is to make sure there's always a valid value)
31+
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
32+
name:
33+
matches: ^tgi_request_inference_duration_sum
34+
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
35+
{{- end }}
36+
resources:
37+
# HPA needs both namespace + suitable object resource for its query paths:
38+
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric>
39+
# (pod is not suitable object type for matching as each instance has different name)
40+
overrides:
41+
namespace: {resource: "namespace"}
42+
service: {resource: "service"}
43+
{{- end }}
44+
{{- if .Values.teirerank.horizontalPodAutoscaler.enabled }}
45+
{{- if .Values.teirerank.accelDevice }}
46+
- seriesQuery: '{__name__="te_queue_size",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
47+
# TEI instances queue_size sum
48+
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>})'
49+
name:
50+
matches: ^te_queue_size
51+
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_queue_size_sum"
52+
{{- else }}
53+
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
54+
# Average request latency from TEI histograms, over 1 min
55+
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
56+
name:
57+
matches: ^te_request_inference_duration_sum
58+
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
59+
{{- end }}
60+
resources:
61+
overrides:
62+
namespace: {resource: "namespace"}
63+
service: {resource: "service"}
64+
{{- end }}
65+
{{- if .Values.tei.horizontalPodAutoscaler.enabled }}
66+
{{- if .Values.tei.accelDevice }}
67+
- seriesQuery: '{__name__="te_queue_size",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
68+
# TEI instances queue_size sum
69+
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>})'
70+
name:
71+
matches: ^te_queue_size
72+
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_queue_size_sum"
73+
{{- else }}
74+
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
75+
# Average request latency from TEI histograms, over 1 min
76+
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
77+
name:
78+
matches: ^te_request_inference_duration_sum
79+
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
80+
{{- end }}
81+
resources:
82+
overrides:
83+
namespace: {resource: "namespace"}
84+
service: {resource: "service"}
85+
{{- end }}
6586
{{- end }}

helm-charts/codegen/gaudi-values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
tgi:
5+
accelDevice: "gaudi"
56
image:
67
repository: ghcr.io/huggingface/tgi-gaudi
78
tag: "2.0.1"

helm-charts/codetrans/gaudi-values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
tgi:
5+
accelDevice: "gaudi"
56
image:
67
repository: ghcr.io/huggingface/tgi-gaudi
78
tag: "2.0.1"

helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,30 @@ spec:
1616
metrics:
1717
- type: Object
1818
object:
19-
metric:
20-
# TEI time metrics are in seconds
21-
name: {{ include "tei.metricPrefix" . }}_request_latency
2219
describedObject:
2320
apiVersion: v1
2421
# get metric for named object of given type (in same namespace)
2522
kind: Service
2623
name: {{ include "tei.fullname" . }}
2724
target:
28-
# embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when
29-
# TEI startup + request processing takes longer than HPA evaluation period, this uses
30-
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
25+
{{- if .Values.accelDevice }}
26+
# Metric is sum from all pods. "AverageValue" divides value returned from
27+
# the custom metrics API by the number of Pods before comparing to the target:
3128
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
29+
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
30+
type: AverageValue
31+
averageValue: 15
32+
metric:
33+
name: {{ include "tei.metricPrefix" . }}_queue_size_sum
34+
{{- else }}
35+
# Metric is average for all the pods. To avoid replica fluctuation when pod
36+
# startup + request processing takes longer than HPA evaluation period, this uses
37+
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
3238
type: Value
33-
value: 4
39+
value: 4 # seconds
40+
metric:
41+
name: {{ include "tei.metricPrefix" . }}_request_latency
42+
{{- end }}
3443
behavior:
3544
scaleDown:
3645
stabilizationWindowSeconds: 180

helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,30 @@ spec:
1616
metrics:
1717
- type: Object
1818
object:
19-
metric:
20-
# TEI time metrics are in seconds
21-
name: {{ include "teirerank.metricPrefix" . }}_request_latency
2219
describedObject:
2320
apiVersion: v1
2421
# get metric for named object of given type (in same namespace)
2522
kind: Service
2623
name: {{ include "teirerank.fullname" . }}
2724
target:
28-
# reranking_request_latency is average for all TEI pods. To avoid replica fluctuations when
29-
# TEI startup + request processing takes longer than HPA evaluation period, this uses
30-
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
25+
{{- if .Values.accelDevice }}
26+
# Metric is sum from all pods. "AverageValue" divides value returned from
27+
# the custom metrics API by the number of Pods before comparing to the target:
3128
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
29+
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
30+
type: AverageValue
31+
averageValue: 15
32+
metric:
33+
name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum
34+
{{- else }}
35+
# Metric is average for all the pods. To avoid replica fluctuation when pod
36+
# startup + request processing takes longer than HPA evaluation period, this uses
37+
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
3238
type: Value
33-
value: 4
39+
value: 4 # seconds
40+
metric:
41+
name: {{ include "teirerank.metricPrefix" . }}_request_latency
42+
{{- end }}
3443
behavior:
3544
scaleDown:
3645
stabilizationWindowSeconds: 180

helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,30 @@ spec:
1616
metrics:
1717
- type: Object
1818
object:
19-
metric:
20-
# TGI time metrics are in seconds
21-
name: {{ include "tgi.metricPrefix" . }}_request_latency
2219
describedObject:
2320
apiVersion: v1
2421
# get metric for named object of given type (in same namespace)
2522
kind: Service
2623
name: {{ include "tgi.fullname" . }}
2724
target:
28-
# tgi_request_latency is average for all the TGI pods. To avoid replica fluctuations when
29-
# TGI startup + request processing takes longer than HPA evaluation period, this uses
30-
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
25+
{{- if .Values.accelDevice }}
26+
# Metric is sum from all pods. "AverageValue" divides value returned from
27+
# the custom metrics API by the number of Pods before comparing to the target:
3128
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
29+
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
30+
type: AverageValue
31+
averageValue: 15
32+
metric:
33+
name: {{ include "tgi.metricPrefix" . }}_queue_size_sum
34+
{{- else }}
35+
# Metric is average for all the pods. To avoid replica fluctuation when pod
36+
# startup + request processing takes longer than HPA evaluation period, this uses
37+
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
3238
type: Value
33-
value: 4
39+
value: 4 # seconds
40+
metric:
41+
name: {{ include "tgi.metricPrefix" . }}_request_latency
42+
{{- end }}
3443
behavior:
3544
scaleDown:
3645
stabilizationWindowSeconds: 180

helm-charts/docsum/gaudi-values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
tgi:
5+
accelDevice: "gaudi"
56
image:
67
repository: ghcr.io/huggingface/tgi-gaudi
78
tag: "2.0.1"

0 commit comments

Comments (0)