9 changes: 4 additions & 5 deletions README.md
@@ -67,9 +67,9 @@ spec:
modelHub:
modelID: facebook/opt-125m
inferenceFlavors:
- name: t4 # GPU type
requests:
nvidia.com/gpu: 1
- name: t4 # GPU type
requests:
nvidia.com/gpu: 1
```

#### Inference Playground
@@ -124,12 +124,11 @@ If you want to learn more about this project, please refer to [develop.md](./doc
- CLI tool support
- Model training, fine tuning in the long-term


## Community

Join us for more discussions:

* **Slack Channel**: [#llmaz](https://inftyai.slack.com/archives/C06D0BGEQ1G)
- **Slack Channel**: [#llmaz](https://inftyai.slack.com/archives/C06D0BGEQ1G)

## Contributions

16 changes: 16 additions & 0 deletions api/inference/v1alpha1/backendruntime_types.go
@@ -63,6 +63,22 @@ type BackendRuntimeSpec struct {
// accelerators like GPU should not be defined here, but at the model flavors,
// or the values here will be overwritten.
Resources ResourceRequirements `json:"resources"`
// Periodic probe of backend liveness.
// Backend will be restarted if the probe fails.
// Cannot be updated.
// +optional
LivenessProbe *corev1.Probe `json:"livenessProbe,omitempty"`
// Periodic probe of backend readiness.
// Backend will be removed from service endpoints if the probe fails.
// +optional
ReadinessProbe *corev1.Probe `json:"readinessProbe,omitempty"`
// StartupProbe indicates that the Backend has successfully initialized.
// If specified, no other probes are executed until this completes successfully.
// If this probe fails, the backend will be restarted, just as if the livenessProbe failed.
// This can be used to provide different probe parameters at the beginning of a backend's lifecycle,
// when it might take a long time to load data or warm a cache, than during steady-state operation.
// +optional
StartupProbe *corev1.Probe `json:"startupProbe,omitempty"`
}

// BackendRuntimeStatus defines the observed state of BackendRuntime
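For orientation, here is a minimal sketch of how the new fields could be set on a `BackendRuntime` manifest. Only the probe keys (`startupProbe`, `livenessProbe`, `readinessProbe`) and their bodies come from the additions above and the standard `corev1.Probe` schema; the apiVersion, metadata, and everything else in the snippet are illustrative assumptions, and required fields unrelated to probes are omitted:

```yaml
# Hypothetical BackendRuntime snippet: only the three probe fields come from
# this change; apiVersion, name, and the omitted required fields are assumptions.
apiVersion: inference.llmaz.io/v1alpha1
kind: BackendRuntime
metadata:
  name: example-backend
spec:
  startupProbe:            # gates the other probes until the model is loaded
    periodSeconds: 10
    failureThreshold: 30   # roughly 300s budget before the backend is restarted
    httpGet:
      path: /health
      port: 8080
  livenessProbe:           # restart the backend if it stops responding
    initialDelaySeconds: 15
    periodSeconds: 10
    failureThreshold: 3
    httpGet:
      path: /health
      port: 8080
  readinessProbe:          # keep unready replicas out of the service endpoints
    initialDelaySeconds: 5
    periodSeconds: 5
    failureThreshold: 3
    httpGet:
      path: /health
      port: 8080
```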
21 changes: 21 additions & 0 deletions chart/templates/backends/llamacpp.yaml
@@ -23,6 +23,7 @@ spec:
- "0.0.0.0"
- --port
- "8080"
# TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240.
- name: speculative-decoding
flags:
- -m
@@ -40,4 +41,24 @@ spec:
limits:
cpu: 2
memory: 4Gi
startupProbe:
periodSeconds: 10
failureThreshold: 30
httpGet:
path: /health
port: 8080
livenessProbe:
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 3
httpGet:
path: /health
port: 8080
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 3
httpGet:
path: /health
port: 8080
{{- end }}
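With the defaults above, the server gets up to periodSeconds × failureThreshold = 10 s × 30 = 300 s to come up before the startup probe is treated as failed, and the liveness and readiness probes only begin once the startup probe succeeds. For models that take longer to load, the window can be widened without touching the steady-state probes; the values below are illustrative, not part of this change:

```yaml
# Illustrative tuning only (not part of this change): widen the startup window
# to 20s × 60 = 1200s for very large models; liveness/readiness stay as above.
startupProbe:
  periodSeconds: 20
  failureThreshold: 60
  httpGet:
    path: /health
    port: 8080
```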
20 changes: 20 additions & 0 deletions chart/templates/backends/sglang.yaml
@@ -34,4 +34,24 @@ spec:
limits:
cpu: 4
memory: 8Gi
startupProbe:
periodSeconds: 10
failureThreshold: 30
httpGet:
path: /health
port: 8080
livenessProbe:
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 3
httpGet:
path: /health
port: 8080
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 3
httpGet:
path: /health_generate
port: 8080
{{- end }}
20 changes: 20 additions & 0 deletions chart/templates/backends/tgi.yaml
@@ -26,4 +26,24 @@ spec:
limits:
cpu: 4
memory: 8Gi
startupProbe:
periodSeconds: 10
failureThreshold: 30
httpGet:
path: /health
port: 8080
livenessProbe:
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 3
httpGet:
path: /health
port: 8080
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 3
httpGet:
path: /health
port: 8080
{{- end }}
20 changes: 20 additions & 0 deletions chart/templates/backends/vllm.yaml
@@ -107,4 +107,24 @@ spec:
limits:
cpu: 4
memory: 8Gi
startupProbe:
periodSeconds: 10
failureThreshold: 30
httpGet:
path: /health
port: 8080
livenessProbe:
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 3
httpGet:
path: /health
port: 8080
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
failureThreshold: 3
httpGet:
path: /health
port: 8080
{{- end }}