Skip to content

Commit cdd3585

Browse files
authored
Support alternative metrics on accelerated TGI / TEI instances (#454)
* Add tgi.accelDevice to rest of top-level gaudi-values.yaml files DocSum defaults to same model as ChatQnA, and default model used by CodeGen + CodeTrans is also 7b one, so tgi.accelDevice impact is assumed to be close enough. Signed-off-by: Eero Tamminen <[email protected]> * Different TGI/TEI custom metrics & HPA rules for accelerated devices Signed-off-by: Eero Tamminen <[email protected]> --------- Signed-off-by: Eero Tamminen <[email protected]>
1 parent cdd47a5 commit cdd3585

File tree

7 files changed

+122
-71
lines changed

7 files changed

+122
-71
lines changed

helm-charts/chatqna/templates/custom-metrics-configmap.yaml

Lines changed: 71 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -12,54 +12,75 @@ metadata:
1212
app.kubernetes.io/name: prometheus-adapter
1313
data:
1414
config.yaml: |
15-
rules:
16-
{{- if .Values.tgi.horizontalPodAutoscaler.enabled }}
17-
# check metric with:
18-
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency | jq
19-
#
20-
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
21-
# Average request latency from TGI histograms, over 1 min
22-
# (0.001 divider add is to make sure there's always a valid value)
23-
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
24-
name:
25-
matches: ^tgi_request_inference_duration_sum
26-
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
27-
resources:
28-
# HPA needs both namespace + suitable object resource for its query paths:
29-
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/tgi_request_latency
30-
# (pod is not suitable object type for matching as each instance has different name)
31-
overrides:
32-
namespace:
33-
resource: namespace
34-
service:
35-
resource: service
36-
{{- end }}
37-
{{- if .Values.teirerank.horizontalPodAutoscaler.enabled }}
38-
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
39-
# Average request latency from TEI histograms, over 1 min
40-
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
41-
name:
42-
matches: ^te_request_inference_duration_sum
43-
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
44-
resources:
45-
overrides:
46-
namespace:
47-
resource: namespace
48-
service:
49-
resource: service
50-
{{- end }}
51-
{{- if .Values.tei.horizontalPodAutoscaler.enabled }}
52-
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
53-
# Average request latency from TEI histograms, over 1 min
54-
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
55-
name:
56-
matches: ^te_request_inference_duration_sum
57-
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
58-
resources:
59-
overrides:
60-
namespace:
61-
resource: namespace
62-
service:
63-
resource: service
64-
{{- end }}
15+
rules:
16+
{{- if .Values.tgi.horizontalPodAutoscaler.enabled }}
17+
# check metric with:
18+
# kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric> | jq
19+
#
20+
{{- if .Values.tgi.accelDevice }}
21+
- seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
22+
# TGI instances queue_size sum
23+
metricsQuery: 'sum by (namespace,service) (tgi_queue_size{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>})'
24+
name:
25+
matches: ^tgi_queue_size
26+
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_queue_size_sum"
27+
{{- else }}
28+
- seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
29+
# Average request latency from TGI histograms, over 1 min
30+
# (0.001 divider add is to make sure there's always a valid value)
31+
metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
32+
name:
33+
matches: ^tgi_request_inference_duration_sum
34+
as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
35+
{{- end }}
36+
resources:
37+
# HPA needs both namespace + suitable object resource for its query paths:
38+
# /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/<metric>
39+
# (pod is not suitable object type for matching as each instance has different name)
40+
overrides:
41+
namespace: {resource: "namespace"}
42+
service: {resource: "service"}
43+
{{- end }}
44+
{{- if .Values.teirerank.horizontalPodAutoscaler.enabled }}
45+
{{- if .Values.teirerank.accelDevice }}
46+
- seriesQuery: '{__name__="te_queue_size",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
47+
# TEI instances queue_size sum
48+
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>})'
49+
name:
50+
matches: ^te_queue_size
51+
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_queue_size_sum"
52+
{{- else }}
53+
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "teirerank.fullname" .Subcharts.teirerank }}"}'
54+
# Average request latency from TEI histograms, over 1 min
55+
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "teirerank.fullname" .Subcharts.teirerank }}",<<.LabelMatchers>>}[1m]))'
56+
name:
57+
matches: ^te_request_inference_duration_sum
58+
as: "{{ include "teirerank.metricPrefix" .Subcharts.teirerank }}_request_latency"
59+
{{- end }}
60+
resources:
61+
overrides:
62+
namespace: {resource: "namespace"}
63+
service: {resource: "service"}
64+
{{- end }}
65+
{{- if .Values.tei.horizontalPodAutoscaler.enabled }}
66+
{{- if .Values.tei.accelDevice }}
67+
- seriesQuery: '{__name__="te_queue_size",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
68+
# TEI instances queue_size sum
69+
metricsQuery: 'sum by (namespace,service) (te_queue_size{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>})'
70+
name:
71+
matches: ^te_queue_size
72+
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_queue_size_sum"
73+
{{- else }}
74+
- seriesQuery: '{__name__="te_request_inference_duration_sum",service="{{ include "tei.fullname" .Subcharts.tei }}"}'
75+
# Average request latency from TEI histograms, over 1 min
76+
metricsQuery: 'rate(te_request_inference_duration_sum{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(te_request_inference_duration_count{service="{{ include "tei.fullname" .Subcharts.tei }}",<<.LabelMatchers>>}[1m]))'
77+
name:
78+
matches: ^te_request_inference_duration_sum
79+
as: "{{ include "tei.metricPrefix" .Subcharts.tei }}_request_latency"
80+
{{- end }}
81+
resources:
82+
overrides:
83+
namespace: {resource: "namespace"}
84+
service: {resource: "service"}
85+
{{- end }}
6586
{{- end }}

helm-charts/codegen/gaudi-values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
tgi:
5+
accelDevice: "gaudi"
56
image:
67
repository: ghcr.io/huggingface/tgi-gaudi
78
tag: "2.0.1"

helm-charts/codetrans/gaudi-values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
tgi:
5+
accelDevice: "gaudi"
56
image:
67
repository: ghcr.io/huggingface/tgi-gaudi
78
tag: "2.0.1"

helm-charts/common/tei/templates/horizontal-pod-autoscaler.yaml

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,30 @@ spec:
1616
metrics:
1717
- type: Object
1818
object:
19-
metric:
20-
# TEI time metrics are in seconds
21-
name: {{ include "tei.metricPrefix" . }}_request_latency
2219
describedObject:
2320
apiVersion: v1
2421
# get metric for named object of given type (in same namespace)
2522
kind: Service
2623
name: {{ include "tei.fullname" . }}
2724
target:
28-
# embedding_request_latency is average for all TEI pods. To avoid replica fluctuations when
29-
# TEI startup + request processing takes longer than HPA evaluation period, this uses
30-
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
25+
{{- if .Values.accelDevice }}
26+
# Metric is sum from all pods. "AverageValue" divides value returned from
27+
# the custom metrics API by the number of Pods before comparing to the target:
3128
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
29+
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
30+
type: AverageValue
31+
averageValue: 15
32+
metric:
33+
name: {{ include "tei.metricPrefix" . }}_queue_size_sum
34+
{{- else }}
35+
# Metric is average for all the pods. To avoid replica fluctuation when pod
36+
# startup + request processing takes longer than HPA evaluation period, this uses
37+
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
3238
type: Value
33-
value: 4
39+
value: 4 # seconds
40+
metric:
41+
name: {{ include "tei.metricPrefix" . }}_request_latency
42+
{{- end }}
3443
behavior:
3544
scaleDown:
3645
stabilizationWindowSeconds: 180

helm-charts/common/teirerank/templates/horizontal-pod-autoscaler.yaml

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,30 @@ spec:
1616
metrics:
1717
- type: Object
1818
object:
19-
metric:
20-
# TEI time metrics are in seconds
21-
name: {{ include "teirerank.metricPrefix" . }}_request_latency
2219
describedObject:
2320
apiVersion: v1
2421
# get metric for named object of given type (in same namespace)
2522
kind: Service
2623
name: {{ include "teirerank.fullname" . }}
2724
target:
28-
# reranking_request_latency is average for all TEI pods. To avoid replica fluctuations when
29-
# TEI startup + request processing takes longer than HPA evaluation period, this uses
30-
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
25+
{{- if .Values.accelDevice }}
26+
# Metric is sum from all pods. "AverageValue" divides value returned from
27+
# the custom metrics API by the number of Pods before comparing to the target:
3128
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
29+
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
30+
type: AverageValue
31+
averageValue: 15
32+
metric:
33+
name: {{ include "teirerank.metricPrefix" . }}_queue_size_sum
34+
{{- else }}
35+
# Metric is average for all the pods. To avoid replica fluctuation when pod
36+
# startup + request processing takes longer than HPA evaluation period, this uses
37+
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
3238
type: Value
33-
value: 4
39+
value: 4 # seconds
40+
metric:
41+
name: {{ include "teirerank.metricPrefix" . }}_request_latency
42+
{{- end }}
3443
behavior:
3544
scaleDown:
3645
stabilizationWindowSeconds: 180

helm-charts/common/tgi/templates/horizontal-pod-autoscaler.yaml

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,30 @@ spec:
1616
metrics:
1717
- type: Object
1818
object:
19-
metric:
20-
# TGI time metrics are in seconds
21-
name: {{ include "tgi.metricPrefix" . }}_request_latency
2219
describedObject:
2320
apiVersion: v1
2421
# get metric for named object of given type (in same namespace)
2522
kind: Service
2623
name: {{ include "tgi.fullname" . }}
2724
target:
28-
# tgi_request_latency is average for all the TGI pods. To avoid replica fluctuations when
29-
# TGI startup + request processing takes longer than HPA evaluation period, this uses
30-
# "Value" (replicas = metric.value / target.value), instead of "averageValue" type:
25+
{{- if .Values.accelDevice }}
26+
# Metric is sum from all pods. "AverageValue" divides value returned from
27+
# the custom metrics API by the number of Pods before comparing to the target:
3128
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
29+
# https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
30+
type: AverageValue
31+
averageValue: 15
32+
metric:
33+
name: {{ include "tgi.metricPrefix" . }}_queue_size_sum
34+
{{- else }}
35+
# Metric is average for all the pods. To avoid replica fluctuation when pod
36+
# startup + request processing takes longer than HPA evaluation period, this uses
37+
# "Value" (replicas = metric.value / target.value), instead of "AverageValue" type.
3238
type: Value
33-
value: 4
39+
value: 4 # seconds
40+
metric:
41+
name: {{ include "tgi.metricPrefix" . }}_request_latency
42+
{{- end }}
3443
behavior:
3544
scaleDown:
3645
stabilizationWindowSeconds: 180

helm-charts/docsum/gaudi-values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
tgi:
5+
accelDevice: "gaudi"
56
image:
67
repository: ghcr.io/huggingface/tgi-gaudi
78
tag: "2.0.1"

0 commit comments

Comments (0)