
Commit 4ede405

eero-t and mkbhanda authored
Create token metrics only when they are available (#1092)
* Create token metrics only when they are available

  This avoids generating useless token/request histogram metrics for services
  that use the Orchestrator class but never call its token processing
  functionality. (It helps in differentiating frontend megaservice metrics
  from backend megaservice ones, especially when multiple OPEA applications
  run in the same cluster.)

  Also change the Orchestrator CI test workaround to use a unique prefix for
  each metric instance, instead of the metrics being (singleton) class
  variables.

  Signed-off-by: Eero Tamminen <[email protected]>

* Add locking for latency metric creation / method change

  As that could be called from multiple request handling threads.

  Signed-off-by: Eero Tamminen <[email protected]>

---------

Signed-off-by: Eero Tamminen <[email protected]>
Co-authored-by: Malini Bhandaru <[email protected]>
1 parent 119acf2 commit 4ede405
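
The pattern behind the first change, creating a metric on first use under a
lock and then swapping the update method so later calls skip the check, can be
shown in miniature. Below is a minimal sketch using only the Python standard
library; LazyCounter and its plain-integer "metric" are hypothetical stand-ins,
not code from the commit:

    import threading


    class LazyCounter:
        def __init__(self) -> None:
            self._lock = threading.Lock()
            self.count = None
            # Point the public method at the creator; it replaces itself
            # with the real updater on first call.
            self.update = self._update_create

        def _update_create(self) -> None:
            with self._lock:
                # Re-check under the lock, in case another thread already
                # created the counter while we were waiting for it.
                if self.update == self._update_create:
                    # A plain int stands in for a metric object; a real one
                    # (e.g. from prometheus_client) handles its own locking.
                    self.count = 0
                    self.update = self._update_real
            self.update()

        def _update_real(self) -> None:
            self.count += 1


    c = LazyCounter()
    c.update()  # first call creates the counter, then updates it
    c.update()
    assert c.count == 2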

File tree

1 file changed: +50 -11 lines changed


comps/cores/mega/orchestrator.py

Lines changed: 50 additions & 11 deletions
@@ -7,6 +7,7 @@
 import json
 import os
 import re
+import threading
 import time
 from typing import Dict, List
 
@@ -27,28 +28,66 @@
 
 
 class OrchestratorMetrics:
-    # Because:
+    # Need an instance ID for metric prefix because:
+    # - Orchestror instances are not named
     # - CI creates several orchestrator instances
-    # - Prometheus requires metrics to be singletons
-    # - Oorchestror instances are not provided their own names
-    # Metrics are class members with "megaservice" name prefix
-    first_token_latency = Histogram("megaservice_first_token_latency", "First token latency (histogram)")
-    inter_token_latency = Histogram("megaservice_inter_token_latency", "Inter-token latency (histogram)")
-    request_latency = Histogram("megaservice_request_latency", "Whole request/reply latency (histogram)")
-    request_pending = Gauge("megaservice_request_pending", "Count of currently pending requests (gauge)")
+    # - Prometheus requires metrics (their names) to be unique
+    _instance_id = 0
 
     def __init__(self) -> None:
-        pass
+        self._instance_id += 1
+        if self._instance_id > 1:
+            self._prefix = f"megaservice{self._instance_id}"
+        else:
+            self._prefix = "megaservice"
+
+        self.request_pending = Gauge(f"{self._prefix}_request_pending", "Count of currently pending requests (gauge)")
+
+        # locking for latency metric creation / method change
+        self._lock = threading.Lock()
+
+        # Metrics related to token processing are created on demand,
+        # to avoid bogus ones for services that never handle tokens
+        self.first_token_latency = None
+        self.inter_token_latency = None
+        self.request_latency = None
+
+        # initial methods to create the metrics
+        self.token_update = self._token_update_create
+        self.request_update = self._request_update_create
+
+    def _token_update_create(self, token_start: float, is_first: bool) -> float:
+        with self._lock:
+            # in case another thread already got here
+            if self.token_update == self._token_update_create:
+                self.first_token_latency = Histogram(
+                    f"{self._prefix}_first_token_latency", "First token latency (histogram)"
+                )
+                self.inter_token_latency = Histogram(
+                    f"{self._prefix}_inter_token_latency", "Inter-token latency (histogram)"
+                )
+                self.token_update = self._token_update_real
+        return self.token_update(token_start, is_first)
+
+    def _request_update_create(self, req_start: float) -> None:
+        with self._lock:
+            # in case another thread already got here
+            if self.request_update == self._request_update_create:
+                self.request_latency = Histogram(
+                    f"{self._prefix}_request_latency", "Whole LLM request/reply latency (histogram)"
+                )
+                self.request_update = self._request_update_real
+        self.request_update(req_start)
 
-    def token_update(self, token_start: float, is_first: bool) -> float:
+    def _token_update_real(self, token_start: float, is_first: bool) -> float:
         now = time.time()
         if is_first:
             self.first_token_latency.observe(now - token_start)
         else:
             self.inter_token_latency.observe(now - token_start)
         return now
 
-    def request_update(self, req_start: float) -> None:
+    def _request_update_real(self, req_start: float) -> None:
         self.request_latency.observe(time.time() - req_start)
 
     def pending_update(self, increase: bool) -> None:
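
To see the effect of the change, here is a short usage sketch. It assumes the
Histogram and Gauge used above come from prometheus_client (the observe()
calls match that API, but the imports are outside this hunk) and that the
class is importable from comps.cores.mega.orchestrator:

    import time

    # Assumed dependency; the diff hunk does not show the metric imports.
    from prometheus_client import generate_latest

    from comps.cores.mega.orchestrator import OrchestratorMetrics

    metrics = OrchestratorMetrics()

    # Before any token is processed, only the pending-requests gauge exists;
    # no first/inter-token histograms have been registered yet.
    assert b"megaservice_first_token_latency" not in generate_latest()

    # The first token_update() call creates the histograms, then records.
    start = time.time()
    metrics.token_update(start, is_first=True)
    assert b"megaservice_first_token_latency" in generate_latest()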
