
Commit 626d269

Support microservice level benchmark (#95)
* Support microservice level benchmark
Signed-off-by: lvliang-intel <[email protected]>

Parent: 77bb66c

File tree: 9 files changed (+98, -33 lines)


evals/benchmark/benchmark.py

Lines changed: 16 additions & 16 deletions
@@ -11,25 +11,25 @@
 service_endpoints = {
     "chatqna": {
         "embedding": "/v1/embeddings",
-        "embedding_serving": "/v1/embeddings",
+        "embedserve": "/v1/embeddings",
         "retriever": "/v1/retrieval",
         "reranking": "/v1/reranking",
-        "reranking_serving": "/rerank",
+        "rerankserve": "/rerank",
         "llm": "/v1/chat/completions",
-        "llm_serving": "/v1/chat/completions",
+        "llmserve": "/v1/chat/completions",
         "e2e": "/v1/chatqna",
     },
-    "codegen": {"llm": "/generate_stream", "llm_serving": "/v1/chat/completions", "e2e": "/v1/codegen"},
-    "codetrans": {"llm": "/generate", "llm_serving": "/v1/chat/completions", "e2e": "/v1/codetrans"},
-    "faqgen": {"llm": "/v1/chat/completions", "llm_serving": "/v1/chat/completions", "e2e": "/v1/faqgen"},
+    "codegen": {"llm": "/generate_stream", "llmserve": "/v1/chat/completions", "e2e": "/v1/codegen"},
+    "codetrans": {"llm": "/generate", "llmserve": "/v1/chat/completions", "e2e": "/v1/codetrans"},
+    "faqgen": {"llm": "/v1/chat/completions", "llmserve": "/v1/chat/completions", "e2e": "/v1/faqgen"},
     "audioqna": {
         "asr": "/v1/audio/transcriptions",
         "llm": "/v1/chat/completions",
-        "llm_serving": "/v1/chat/completions",
+        "llmserve": "/v1/chat/completions",
         "tts": "/v1/audio/speech",
         "e2e": "/v1/audioqna",
     },
-    "visualqna": {"lvm": "/v1/chat/completions", "lvm_serving": "/v1/chat/completions", "e2e": "/v1/visualqna"},
+    "visualqna": {"lvm": "/v1/chat/completions", "lvmserve": "/v1/chat/completions", "e2e": "/v1/visualqna"},
 }


@@ -200,19 +200,19 @@ def process_service(example, service_type, case_data, test_suite_config):
 example_service_map = {
     "chatqna": [
         "embedding",
-        "embedding_serving",
+        "embedserve",
         "retriever",
         "reranking",
-        "reranking_serving",
+        "rerankserve",
         "llm",
-        "llm_serving",
+        "llmserve",
         "e2e",
     ],
-    "codegen": ["llm", "llm_serving", "e2e"],
-    "codetrans": ["llm", "llm_serving", "e2e"],
-    "faqgen": ["llm", "llm_serving", "e2e"],
-    "audioqna": ["asr", "llm", "llm_serving", "tts", "e2e"],
-    "visualqna": ["lvm", "lvm_serving", "e2e"],
+    "codegen": ["llm", "llmserve", "e2e"],
+    "codetrans": ["llm", "llmserve", "e2e"],
+    "faqgen": ["llm", "llmserve", "e2e"],
+    "audioqna": ["asr", "llm", "llmserve", "tts", "e2e"],
+    "visualqna": ["lvm", "lvmserve", "e2e"],
 }

 # Process each example's services
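
Editor's note: the rename from `*_serving` to `*serve` keeps these service-type keys in step with the stresscli bench-target names touched elsewhere in this commit (`embedserve` pairs with `embedservefixed`, `rerankserve` with `rerankservefixed`, and so on). A minimal sketch of how such an endpoint map can be resolved into a request URL; `endpoint_for` and `base_url` are illustrative names, not part of the repository:

```python
# Illustrative sketch only: resolve the API path for one service of one
# example and join it with the deployment's base URL.
service_endpoints = {
    "chatqna": {"embedserve": "/v1/embeddings", "e2e": "/v1/chatqna"},
}


def endpoint_for(example: str, service_type: str, base_url: str) -> str:
    path = service_endpoints[example][service_type]
    return base_url.rstrip("/") + path


print(endpoint_for("chatqna", "embedserve", "http://localhost:6006"))
# http://localhost:6006/v1/embeddings
```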

evals/benchmark/benchmark.yaml

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ test_cases:
         top_p: 0.95
         repetition_penalty: 1.03
         streaming: true
-    llm_serving:
+    llmserve:
       run_test: false
       service_name: "faq-micro-svc"  # Replace with your service name
     e2e:

evals/benchmark/stresscli/locust/aistress.py

Lines changed: 21 additions & 1 deletion
@@ -73,19 +73,39 @@ def bench_main(self):
         self.environment.runner.send_message("worker_reqsent", 1)
         reqData = bench_package.getReqData()
         url = bench_package.getUrl()
+        streaming_bench_target = [
+            "llmfixed",
+            "llmbench",
+            "chatqnafixed",
+            "chatqnabench",
+            "codegenfixed",
+            "codegenbench",
+            "faqgenfixed",
+            "faqgenbench",
+        ]
         try:
             start_ts = time.perf_counter()
             with self.client.post(
                 url,
                 json=reqData,
-                stream=True,
+                stream=True if self.environment.parsed_options.bench_target in streaming_bench_target else False,
                 catch_response=True,
                 timeout=self.environment.parsed_options.http_timeout,
             ) as resp:
                 logging.debug("Got response...........................")

                 if resp.status_code >= 200 and resp.status_code < 400:
                     if self.environment.parsed_options.bench_target in [
+                        "embedservefixed",
+                        "embeddingfixed",
+                        "retrieverfixed",
+                        "rerankservefixed",
+                        "rerankingfixed",
+                    ]:
+                        respData = {
+                            "total_latency": time.perf_counter() - start_ts,
+                        }
+                    elif self.environment.parsed_options.bench_target in [
                         "audioqnafixed",
                         "audioqnabench",
                     ]:  # non-stream case
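
Editor's note: `stream=True` is now reserved for the chat-style targets that emit token streams; the new microservice targets get a fully buffered response and a single end-to-end timing, which is what the added `respData` branch records. A condensed sketch of the same dispatch, using `requests` in place of Locust's client; the constant names are hypothetical:

```python
import time

import requests  # stand-in for Locust's self.client

STREAMING_TARGETS = {"llmfixed", "chatqnafixed", "codegenfixed", "faqgenfixed"}
MICRO_TARGETS = {"embedservefixed", "embeddingfixed", "retrieverfixed",
                 "rerankservefixed", "rerankingfixed"}


def timed_post(bench_target: str, url: str, payload: dict) -> dict:
    start = time.perf_counter()
    resp = requests.post(
        url,
        json=payload,
        stream=bench_target in STREAMING_TARGETS,  # buffer microservice replies
        timeout=600,
    )
    resp.raise_for_status()
    if bench_target in MICRO_TARGETS:
        # One latency number per request; token statistics do not apply.
        return {"total_latency": time.perf_counter() - start}
    # Streaming targets would instead walk resp.iter_lines() to time tokens.
    return {}
```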

evals/benchmark/stresscli/locust/embeddingfixed.py

Lines changed: 5 additions & 3 deletions
@@ -14,9 +14,11 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
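
Editor's note: every locust target module exposes the same trio (`getReqData`, `respStatics`, `staticsOutput`). For the microservice targets, `respStatics` gains a `reqData` argument to match the updated call site and converts the driver's seconds-based `total_latency` into milliseconds; the same three-line change repeats in the four files below. A toy invocation, with made-up numbers:

```python
def respStatics(environment, reqData, resp):
    # perf_counter delta arrives in seconds; report milliseconds.
    return {"total_latency": resp["total_latency"] * 1000}


record = respStatics(None, {"input": "hello"}, {"total_latency": 0.5})
print(record)  # {'total_latency': 500.0}
```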

evals/benchmark/stresscli/locust/embedservefixed.py

Lines changed: 5 additions & 3 deletions
@@ -14,9 +14,11 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)

evals/benchmark/stresscli/locust/rerankingfixed.py

Lines changed: 5 additions & 3 deletions
@@ -17,9 +17,11 @@ def getReqData():
     return {"initial_query": my_query, "retrieved_docs": [{"text": query_rerank_1}, {"text": query_rerank_2}]}


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)

evals/benchmark/stresscli/locust/rerankservefixed.py

Lines changed: 5 additions & 3 deletions
@@ -17,9 +17,11 @@ def getReqData():
     return {"query": my_query, "texts": [query_rerank_1, query_rerank_2]}


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)

evals/benchmark/stresscli/locust/retrieverfixed.py

Lines changed: 5 additions & 3 deletions
@@ -786,9 +786,11 @@ def getReqData():
     return ({"text": my_query, "embedding": my_embedding},)


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)

evals/benchmark/stresscli/locust/tokenresponse.py

Lines changed: 35 additions & 0 deletions
@@ -119,3 +119,38 @@ def staticsOutput(environment, reqlist):
     console_logger.warning(average_msg.format(numpy.average(avg_token)))
     console_logger.warning("======================================================\n\n")
     logging.shutdown()
+
+
+def staticsOutputForMicroservice(environment, reqlist):
+    e2e_lat = []
+    duration = environment.runner.stats.last_request_timestamp - environment.runner.stats.start_time
+
+    if len(reqlist) == 0:
+        logging.debug(f"len(reqlist): {len(reqlist)}, skip printing")
+        return
+    for req in iter(reqlist):
+        e2e_lat.append(req["total_latency"])
+
+    # Statistics for success response data only
+    req_msg = "Succeed Response: {} (Total {}, {:.1%} Success), Duration: {:.2f}s, RPS: {:.2f}"
+    e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
+    console_logger.warning("\n=================Total statistics=====================")
+    console_logger.warning(
+        req_msg.format(
+            len(reqlist),
+            environment.runner.stats.num_requests,
+            len(reqlist) / environment.runner.stats.num_requests,
+            duration,
+            len(reqlist) / duration,
+        )
+    )
+    console_logger.warning(
+        e2e_msg.format(
+            numpy.percentile(e2e_lat, 50),
+            numpy.percentile(e2e_lat, 90),
+            numpy.percentile(e2e_lat, 99),
+            numpy.average(e2e_lat),
+        )
+    )
+    console_logger.warning("======================================================\n\n")
+    logging.shutdown()
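
Editor's note: `staticsOutputForMicroservice` mirrors the existing `staticsOutput`, but aggregates only the per-request `total_latency` values (already in milliseconds) into a success rate, RPS, and latency percentiles. The same arithmetic on made-up data, runnable standalone:

```python
import numpy

# Hypothetical per-request records, shaped like the micro-target
# respStatics output above (latencies in milliseconds).
reqlist = [{"total_latency": ms} for ms in (12.0, 13.3, 14.2, 15.5, 80.1)]
e2e_lat = [r["total_latency"] for r in reqlist]

num_requests = 6  # assume one request failed and was never recorded
duration = 2.0    # assumed wall-clock duration of the run, in seconds

print(
    "Succeed Response: {} (Total {}, {:.1%} Success), Duration: {:.2f}s, RPS: {:.2f}".format(
        len(reqlist), num_requests, len(reqlist) / num_requests, duration, len(reqlist) / duration
    )
)
print(
    "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}".format(
        numpy.percentile(e2e_lat, 50),
        numpy.percentile(e2e_lat, 90),
        numpy.percentile(e2e_lat, 99),
        numpy.average(e2e_lat),
    )
)
```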
