Commit c55e645

Generalize BaseStatistics code a bit + document it

Signed-off-by: Eero Tamminen <[email protected]>

1 parent 6419ace, commit c55e645

2 files changed (+38, -46 lines)

comps/cores/mega/base_statistics.py

Lines changed: 18 additions & 44 deletions
```diff
@@ -21,47 +21,23 @@ def append_latency(self, latency, first_token_latency=None):
         if first_token_latency:
             self.first_token_latencies.append(first_token_latency)
 
-    def calculate_statistics(self):
-        if not self.response_times:
-            return {
-                "p50_latency": None,
-                "p99_latency": None,
-                "average_latency": None,
-            }
-        # Calculate the P50 (median)
-        p50 = np.percentile(self.response_times, 50)
-
-        # Calculate the P99
-        p99 = np.percentile(self.response_times, 99)
-
-        avg = np.average(self.response_times)
-
-        return {
-            "p50_latency": p50,
-            "p99_latency": p99,
-            "average_latency": avg,
-        }
-
-    def calculate_first_token_statistics(self):
-        if not self.first_token_latencies:
-            return {
-                "p50_latency_first_token": None,
-                "p99_latency_first_token": None,
-                "average_latency_first_token": None,
-            }
-        # Calculate the P50 (median)
-        p50 = np.percentile(self.first_token_latencies, 50)
-
-        # Calculate the P99
-        p99 = np.percentile(self.first_token_latencies, 99)
-
-        avg = np.average(self.first_token_latencies)
-
-        return {
-            "p50_latency_first_token": p50,
-            "p99_latency_first_token": p99,
-            "average_latency_first_token": avg,
-        }
+    def _add_statistics(self, result, stats, suffix):
+        "add P50 (median), P99 and average values for 'stats' array to 'result' dict"
+        if stats:
+            result[f"p50_{suffix}"] = np.percentile(stats, 50)
+            result[f"p99_{suffix}"] = np.percentile(stats, 99)
+            result[f"average_{suffix}"] = np.average(stats)
+        else:
+            result[f"p50_{suffix}"] = None
+            result[f"p99_{suffix}"] = None
+            result[f"average_{suffix}"] = None
+
+    def get_statistics(self):
+        "return stats dict with P50, P99 and average values for first token and response timings"
+        result = {}
+        self._add_statistics(result, self.response_times, "latency")
+        self._add_statistics(result, self.first_token_latencies, "latency_first_token")
+        return result
 
 
 def register_statistics(
@@ -79,7 +55,5 @@ def collect_all_statistics():
     results = {}
     if statistics_dict:
         for name, statistic in statistics_dict.items():
-            tmp_dict = statistic.calculate_statistics()
-            tmp_dict.update(statistic.calculate_first_token_statistics())
-            results.update({name: tmp_dict})
+            results[name] = statistic.get_statistics()
     return results
```
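For reference, below is a minimal self-contained sketch of how the refactored statistics code behaves. The `BaseStatisticsSketch` class name, its constructor, and the example latency values are assumptions made for illustration; `append_latency()`, `_add_statistics()` and `get_statistics()` follow the diff above.

```python
# Minimal sketch (not the actual OPEA class); the constructor below is assumed,
# the statistics methods mirror the refactored code in this commit.
import numpy as np


class BaseStatisticsSketch:
    def __init__(self):
        self.response_times = []         # end-to-end response latencies
        self.first_token_latencies = []  # first-token latencies (streaming requests)

    def append_latency(self, latency, first_token_latency=None):
        self.response_times.append(latency)
        if first_token_latency:
            self.first_token_latencies.append(first_token_latency)

    def _add_statistics(self, result, stats, suffix):
        "add P50 (median), P99 and average values for 'stats' array to 'result' dict"
        if stats:
            result[f"p50_{suffix}"] = np.percentile(stats, 50)
            result[f"p99_{suffix}"] = np.percentile(stats, 99)
            result[f"average_{suffix}"] = np.average(stats)
        else:
            result[f"p50_{suffix}"] = None
            result[f"p99_{suffix}"] = None
            result[f"average_{suffix}"] = None

    def get_statistics(self):
        "return stats dict with P50, P99 and average values for first token and response timings"
        result = {}
        self._add_statistics(result, self.response_times, "latency")
        self._add_statistics(result, self.first_token_latencies, "latency_first_token")
        return result


if __name__ == "__main__":
    stats = BaseStatisticsSketch()
    stats.append_latency(0.9, 0.2)  # streaming request: response + first-token latency
    stats.append_latency(1.1, 0.3)
    stats.append_latency(1.0)       # non-streaming request: no first-token latency
    # -> keys: p50_latency, p99_latency, average_latency,
    #          p50_latency_first_token, p99_latency_first_token, average_latency_first_token
    print(stats.get_statistics())
```

The `suffix` parameter is what lets a single helper serve both metric families: `"latency"` for response times and `"latency_first_token"` for first-token latencies.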

comps/cores/telemetry/README.md

Lines changed: 20 additions & 2 deletions
````diff
@@ -4,6 +4,19 @@ OPEA Comps currently provides telemetry functionalities for metrics and tracing
 
 ![opea telemetry](https://raw.githubusercontent.com/Spycsh/assets/main/OPEA%20Telemetry.jpg)
 
+Contents:
+
+- [Metrics](#metrics)
+  - [HTTP metrics](#http-metrics)
+  - [Megaservice E2E metrics](#megaservice-e2e-metrics)
+  - [Inferencing metrics](#inferencing-metrics)
+  - [Metrics collection](#metrics-collection)
+- [Statistics](#statistics)
+- [Tracing](#tracing)
+- [Visualization](#visualization)
+  - [Visualize metrics](#visualize-metrics)
+  - [Visualize tracing](#visualize-tracing)
+
 ## Metrics
 
 OPEA microservice metrics are exported in Prometheus format under `/metrics` endpoint.
@@ -20,7 +33,7 @@ They can be fetched e.g. with `curl`:
 curl localhost:{port of your service}/metrics
 ```
 
-### HTTP Metrics
+### HTTP metrics
 
 Metrics output looks following:
 
@@ -54,7 +67,7 @@ Latency ones are histogram metrics i.e. include count, total value and set of va
 
 They are available only for _streaming_ requests using LLM. Pending count accounts for all requests.
 
-### Inferencing Metrics
+### Inferencing metrics
 
 For example, you can `curl localhost:6006/metrics` to retrieve the TEI embedding metrics, and the output should look like follows:
 
@@ -95,6 +108,11 @@ Below are some default metrics endpoints for specific microservices:
 | TEI embedding | 6006 | /metrics | [link](https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/metrics) |
 | TEI reranking | 8808 | /metrics | [link](https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/metrics) |
 
+## Statistics
+
+Additionally, GenAIComps microservices provide separate `/v1/statistics` endpoint, which outputs P50, P99 and average metrics
+for response times, and first token latencies, if microservice processes them.
+
 ## Tracing
 
 OPEA use OpenTelemetry to trace function call stacks. To trace a function, add the `@opea_telemetry` decorator to either an async or sync function. The call stacks and time span data will be exported by OpenTelemetry. You can use Jaeger UI to visualize this tracing data.
````
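To illustrate the newly documented `/v1/statistics` endpoint, here is a small hedged sketch of querying it. The port value is a placeholder, and the exact JSON envelope of the response is not shown in this commit; the per-service keys listed in the comment follow `get_statistics()` above.

```python
# Hypothetical query of a microservice's /v1/statistics endpoint (stdlib only).
import json
import urllib.request

PORT = 8000  # placeholder: use the port of your OPEA microservice

with urllib.request.urlopen(f"http://localhost:{PORT}/v1/statistics") as resp:
    stats = json.load(resp)

# Per the base_statistics.py change, each registered service is expected to report:
#   p50_latency, p99_latency, average_latency,
#   p50_latency_first_token, p99_latency_first_token, average_latency_first_token
print(json.dumps(stats, indent=2))
```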
