Commit 525cdad

Spycsh authored and pre-commit-ci[bot] committed

add microservice level perf statistics (opea-project#135)

* add statistics

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Xinyao Wang <[email protected]>
1 parent 986fe6b commit 525cdad

7 files changed: +156 −4 lines changed

comps/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -29,3 +29,6 @@
 
 # Telemetry
 from comps.cores.telemetry.opea_telemetry import opea_telemetry
+
+# Statistics
+from comps.cores.mega.base_statistics import statistics_dict, register_statistics

comps/cores/mega/base_statistics.py

Lines changed: 85 additions & 0 deletions

@@ -0,0 +1,85 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import numpy as np
+
+# name => statistic dict
+statistics_dict = {}
+
+
+class BaseStatistics:
+    """Base class to store in-memory statistics of an entity for measurement in one service."""
+
+    def __init__(
+        self,
+    ):
+        self.response_times = []  # store response times for all requests
+        self.first_token_latencies = []  # store first token latencies for all requests
+
+    def append_latency(self, latency, first_token_latency=None):
+        self.response_times.append(latency)
+        if first_token_latency:
+            self.first_token_latencies.append(first_token_latency)
+
+    def calculate_statistics(self):
+        if not self.response_times:
+            return {
+                "p50_latency": None,
+                "p99_latency": None,
+                "average_latency": None,
+            }
+        # Calculate the P50 (median)
+        p50 = np.percentile(self.response_times, 50)
+
+        # Calculate the P99
+        p99 = np.percentile(self.response_times, 99)
+
+        avg = np.average(self.response_times)
+
+        return {
+            "p50_latency": p50,
+            "p99_latency": p99,
+            "average_latency": avg,
+        }
+
+    def calculate_first_token_statistics(self):
+        if not self.first_token_latencies:
+            return {
+                "p50_latency_first_token": None,
+                "p99_latency_first_token": None,
+                "average_latency_first_token": None,
+            }
+        # Calculate the P50 (median)
+        p50 = np.percentile(self.first_token_latencies, 50)
+
+        # Calculate the P99
+        p99 = np.percentile(self.first_token_latencies, 99)
+
+        avg = np.average(self.first_token_latencies)
+
+        return {
+            "p50_latency_first_token": p50,
+            "p99_latency_first_token": p99,
+            "average_latency_first_token": avg,
+        }
+
+
+def register_statistics(
+    names,
+):
+    def decorator(func):
+        for name in names:
+            statistics_dict[name] = BaseStatistics()
+        return func
+
+    return decorator
+
+
+def collect_all_statistics():
+    results = {}
+    if statistics_dict:
+        for name, statistic in statistics_dict.items():
+            tmp_dict = statistic.calculate_statistics()
+            tmp_dict.update(statistic.calculate_first_token_statistics())
+            results.update({name: tmp_dict})
+    return results
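
Taken together: register_statistics pre-creates an empty BaseStatistics entry per service name, each decorated handler appends one latency sample per request, and collect_all_statistics merges the total-latency and first-token percentile dictionaries per name. A minimal, self-contained sketch of that flow; the service name opea_service@example and the simulated work are made up for illustration:

import random
import time

from comps.cores.mega.base_statistics import (
    collect_all_statistics,
    register_statistics,
    statistics_dict,
)


# Registering pre-creates an empty BaseStatistics entry under each name.
@register_statistics(names=["opea_service@example"])
def handle_request():
    start = time.time()
    time.sleep(random.uniform(0.01, 0.05))  # stand-in for real work
    # Second argument is the first-token latency; None for non-streaming.
    statistics_dict["opea_service@example"].append_latency(time.time() - start, None)


for _ in range(20):
    handle_request()

# Prints {"opea_service@example": {"p50_latency": ..., "p99_latency": ...,
# "average_latency": ..., "p50_latency_first_token": None, ...}}
print(collect_all_statistics())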

comps/cores/mega/http_service.py

Lines changed: 11 additions & 0 deletions

@@ -7,6 +7,7 @@
 from uvicorn import Config, Server
 
 from .base_service import BaseService
+from .base_statistics import collect_all_statistics
 
 
 class HTTPService(BaseService):
@@ -66,6 +67,16 @@ async def _health_check():
             """Get the health status of this GenAI microservice."""
             return {"Service Title": self.title, "Service Description": self.description}
 
+        @app.get(
+            path="/v1/statistics",
+            summary="Get the statistics of GenAI services",
+            tags=["Debug"],
+        )
+        async def _get_statistics():
+            """Get the statistics of GenAI services."""
+            result = collect_all_statistics()
+            return result
+
         return app
 
     async def initialize_server(self):
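
With this route in place, every microservice built on HTTPService exposes its collected numbers over HTTP. A small sketch of polling it with Python's requests library; the host and port are assumptions, so substitute whatever port the target microservice listens on (e.g. 9000 for the TGI LLM service below):

import requests

# Assumed host and port; substitute the port of the deployed microservice.
resp = requests.get("http://localhost:9000/v1/statistics", timeout=5)
resp.raise_for_status()

# One entry per registered service name, e.g. "opea_service@llm_tgi",
# each with p50/p99/average latencies plus *_first_token variants.
for name, stats in resp.json().items():
    print(name, stats)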

comps/embeddings/langchain/embedding_tei_gaudi.py

Lines changed: 13 additions & 1 deletion

@@ -2,11 +2,20 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+import time
 
 from langchain_community.embeddings import HuggingFaceHubEmbeddings
 from langsmith import traceable
 
-from comps import EmbedDoc768, ServiceType, TextDoc, opea_microservices, register_microservice
+from comps import (
+    EmbedDoc768,
+    ServiceType,
+    TextDoc,
+    opea_microservices,
+    register_microservice,
+    register_statistics,
+    statistics_dict,
+)
@@ -19,10 +28,13 @@
     output_datatype=EmbedDoc768,
 )
 @traceable(run_type="embedding")
+@register_statistics(names=["opea_service@embedding_tgi_gaudi"])
 def embedding(input: TextDoc) -> EmbedDoc768:
+    start = time.time()
     embed_vector = embeddings.embed_query(input.text)
     embed_vector = embed_vector[:768]  # Keep only the first 768 elements
     res = EmbedDoc768(text=input.text, embedding=embed_vector)
+    statistics_dict["opea_service@embedding_tgi_gaudi"].append_latency(time.time() - start, None)
     return res

comps/llms/text-generation/tgi/llm.py

Lines changed: 17 additions & 1 deletion

@@ -2,12 +2,21 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+import time
 
 from fastapi.responses import StreamingResponse
 from langchain_community.llms import HuggingFaceEndpoint
 from langsmith import traceable
 
-from comps import GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
+from comps import (
+    GeneratedDoc,
+    LLMParamsDoc,
+    ServiceType,
+    opea_microservices,
+    register_microservice,
+    register_statistics,
+    statistics_dict,
+)
@@ -18,7 +27,9 @@
     port=9000,
 )
 @traceable(run_type="llm")
+@register_statistics(names=["opea_service@llm_tgi"])
 def llm_generate(input: LLMParamsDoc):
+    start = time.time()
     llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
     llm = HuggingFaceEndpoint(
         endpoint_url=llm_endpoint,
@@ -34,19 +45,24 @@ def llm_generate(input: LLMParamsDoc):
 
     if input.streaming:
 
+        stream_gen_time = []
+
         async def stream_generator():
             chat_response = ""
             async for text in llm.astream(input.query):
+                stream_gen_time.append(time.time() - start)
                 chat_response += text
                 chunk_repr = repr(text.encode("utf-8"))
                 print(f"[llm - chat_stream] chunk:{chunk_repr}")
                 yield f"data: {chunk_repr}\n\n"
             print(f"[llm - chat_stream] stream response: {chat_response}")
+            statistics_dict["opea_service@llm_tgi"].append_latency(stream_gen_time[-1], stream_gen_time[0])
             yield "data: [DONE]\n\n"
 
         return StreamingResponse(stream_generator(), media_type="text/event-stream")
     else:
         response = llm.invoke(input.query)
+        statistics_dict["opea_service@llm_tgi"].append_latency(time.time() - start, None)
         return GeneratedDoc(text=response, prompt=input.query)
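The streaming branch records one timestamp per chunk relative to start, so stream_gen_time[0] is the time to first token and stream_gen_time[-1] is the total generation time; both reach append_latency in a single call once the stream finishes. A stripped-down sketch of that pattern, with a hypothetical fake_stream generator standing in for llm.astream and a made-up service name:

import asyncio
import time

from comps import register_statistics, statistics_dict


async def fake_stream():
    # Hypothetical stand-in for llm.astream(): yields tokens with a delay.
    for token in ["Hello", " ", "world"]:
        await asyncio.sleep(0.02)
        yield token


@register_statistics(names=["opea_service@streaming_example"])
async def generate():
    start = time.time()
    stream_gen_time = []
    async for _ in fake_stream():
        # One timestamp per chunk, measured from request start.
        stream_gen_time.append(time.time() - start)
    # Total latency from the last chunk, first-token latency from the first.
    statistics_dict["opea_service@streaming_example"].append_latency(
        stream_gen_time[-1], stream_gen_time[0]
    )


asyncio.run(generate())
print(statistics_dict["opea_service@streaming_example"].calculate_first_token_statistics())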

comps/reranks/langchain/reranking_tei_xeon.py

Lines changed: 13 additions & 1 deletion

@@ -3,12 +3,21 @@
 
 import json
 import os
+import time
 
 import requests
 from langchain_core.prompts import ChatPromptTemplate
 from langsmith import traceable
 
-from comps import LLMParamsDoc, SearchedDoc, ServiceType, opea_microservices, register_microservice
+from comps import (
+    LLMParamsDoc,
+    SearchedDoc,
+    ServiceType,
+    opea_microservices,
+    register_microservice,
+    register_statistics,
+    statistics_dict,
+)
@@ -21,7 +30,9 @@
     output_datatype=LLMParamsDoc,
 )
 @traceable(run_type="llm")
+@register_statistics(names=["opea_service@reranking_tgi_gaudi"])
 def reranking(input: SearchedDoc) -> LLMParamsDoc:
+    start = time.time()
     docs = [doc.text for doc in input.retrieved_docs]
     url = tei_reranking_endpoint + "/rerank"
     data = {"query": input.initial_query, "texts": docs}
@@ -36,6 +47,7 @@ def reranking(input: SearchedDoc) -> LLMParamsDoc:
     prompt = ChatPromptTemplate.from_template(template)
     doc = input.retrieved_docs[best_response["index"]]
     final_prompt = prompt.format(context=doc.text, question=input.initial_query)
+    statistics_dict["opea_service@reranking_tgi_gaudi"].append_latency(time.time() - start, None)
     return LLMParamsDoc(query=final_prompt.strip())

comps/retrievers/langchain/retriever_redis.py

Lines changed: 14 additions & 1 deletion

@@ -2,13 +2,23 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+import time
 
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
 from langchain_community.vectorstores import Redis
 from langsmith import traceable
 from redis_config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL
 
-from comps import EmbedDoc768, SearchedDoc, ServiceType, TextDoc, opea_microservices, register_microservice
+from comps import (
+    EmbedDoc768,
+    SearchedDoc,
+    ServiceType,
+    TextDoc,
+    opea_microservices,
+    register_microservice,
+    register_statistics,
+    statistics_dict,
+)
 
 tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT")
@@ -21,12 +31,15 @@
     port=7000,
 )
 @traceable(run_type="retriever")
+@register_statistics(names=["opea_service@retriever_redis"])
 def retrieve(input: EmbedDoc768) -> SearchedDoc:
+    start = time.time()
     search_res = vector_db.similarity_search_by_vector(embedding=input.embedding)
     searched_docs = []
     for r in search_res:
         searched_docs.append(TextDoc(text=r.page_content))
     result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text)
+    statistics_dict["opea_service@retriever_redis"].append_latency(time.time() - start, None)
     return result
