
Commit 626d269

Support microservice level benchmark (#95)
* Support microservice level benchmark
Signed-off-by: lvliang-intel <[email protected]>

Parent: 77bb66c

File tree: 9 files changed (+98, -33 lines)


evals/benchmark/benchmark.py

Lines changed: 16 additions & 16 deletions
@@ -11,25 +11,25 @@
 service_endpoints = {
     "chatqna": {
         "embedding": "/v1/embeddings",
-        "embedding_serving": "/v1/embeddings",
+        "embedserve": "/v1/embeddings",
         "retriever": "/v1/retrieval",
         "reranking": "/v1/reranking",
-        "reranking_serving": "/rerank",
+        "rerankserve": "/rerank",
         "llm": "/v1/chat/completions",
-        "llm_serving": "/v1/chat/completions",
+        "llmserve": "/v1/chat/completions",
         "e2e": "/v1/chatqna",
     },
-    "codegen": {"llm": "/generate_stream", "llm_serving": "/v1/chat/completions", "e2e": "/v1/codegen"},
-    "codetrans": {"llm": "/generate", "llm_serving": "/v1/chat/completions", "e2e": "/v1/codetrans"},
-    "faqgen": {"llm": "/v1/chat/completions", "llm_serving": "/v1/chat/completions", "e2e": "/v1/faqgen"},
+    "codegen": {"llm": "/generate_stream", "llmserve": "/v1/chat/completions", "e2e": "/v1/codegen"},
+    "codetrans": {"llm": "/generate", "llmserve": "/v1/chat/completions", "e2e": "/v1/codetrans"},
+    "faqgen": {"llm": "/v1/chat/completions", "llmserve": "/v1/chat/completions", "e2e": "/v1/faqgen"},
     "audioqna": {
         "asr": "/v1/audio/transcriptions",
         "llm": "/v1/chat/completions",
-        "llm_serving": "/v1/chat/completions",
+        "llmserve": "/v1/chat/completions",
         "tts": "/v1/audio/speech",
         "e2e": "/v1/audioqna",
     },
-    "visualqna": {"lvm": "/v1/chat/completions", "lvm_serving": "/v1/chat/completions", "e2e": "/v1/visualqna"},
+    "visualqna": {"lvm": "/v1/chat/completions", "lvmserve": "/v1/chat/completions", "e2e": "/v1/visualqna"},
 }


@@ -200,19 +200,19 @@ def process_service(example, service_type, case_data, test_suite_config):
 example_service_map = {
     "chatqna": [
         "embedding",
-        "embedding_serving",
+        "embedserve",
         "retriever",
         "reranking",
-        "reranking_serving",
+        "rerankserve",
         "llm",
-        "llm_serving",
+        "llmserve",
         "e2e",
     ],
-    "codegen": ["llm", "llm_serving", "e2e"],
-    "codetrans": ["llm", "llm_serving", "e2e"],
-    "faqgen": ["llm", "llm_serving", "e2e"],
-    "audioqna": ["asr", "llm", "llm_serving", "tts", "e2e"],
-    "visualqna": ["lvm", "lvm_serving", "e2e"],
+    "codegen": ["llm", "llmserve", "e2e"],
+    "codetrans": ["llm", "llmserve", "e2e"],
+    "faqgen": ["llm", "llmserve", "e2e"],
+    "audioqna": ["asr", "llm", "llmserve", "tts", "e2e"],
+    "visualqna": ["lvm", "lvmserve", "e2e"],
 }

 # Process each example's services
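
Editor's note: the rename from `*_serving` to `*serve` keeps these service-type keys in step with the stresscli bench-target names touched elsewhere in this commit (`embedserve` pairs with `embedservefixed`, `rerankserve` with `rerankservefixed`, and so on). A minimal sketch of how such an endpoint map can be resolved into a request URL; `endpoint_for` and `base_url` are illustrative names, not part of the repository:

```python
# Illustrative sketch only: resolve the API path for one service of one
# example and join it with the deployment's base URL.
service_endpoints = {
    "chatqna": {"embedserve": "/v1/embeddings", "e2e": "/v1/chatqna"},
}


def endpoint_for(example: str, service_type: str, base_url: str) -> str:
    path = service_endpoints[example][service_type]
    return base_url.rstrip("/") + path


print(endpoint_for("chatqna", "embedserve", "http://localhost:6006"))
# http://localhost:6006/v1/embeddings
```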

evals/benchmark/benchmark.yaml

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ test_cases:
         top_p: 0.95
         repetition_penalty: 1.03
         streaming: true
-    llm_serving:
+    llmserve:
       run_test: false
       service_name: "faq-micro-svc"  # Replace with your service name
     e2e:

evals/benchmark/stresscli/locust/aistress.py

Lines changed: 21 additions & 1 deletion
@@ -73,19 +73,39 @@ def bench_main(self):
         self.environment.runner.send_message("worker_reqsent", 1)
         reqData = bench_package.getReqData()
         url = bench_package.getUrl()
+        streaming_bench_target = [
+            "llmfixed",
+            "llmbench",
+            "chatqnafixed",
+            "chatqnabench",
+            "codegenfixed",
+            "codegenbench",
+            "faqgenfixed",
+            "faqgenbench",
+        ]
         try:
             start_ts = time.perf_counter()
             with self.client.post(
                 url,
                 json=reqData,
-                stream=True,
+                stream=True if self.environment.parsed_options.bench_target in streaming_bench_target else False,
                 catch_response=True,
                 timeout=self.environment.parsed_options.http_timeout,
             ) as resp:
                 logging.debug("Got response...........................")

                 if resp.status_code >= 200 and resp.status_code < 400:
                     if self.environment.parsed_options.bench_target in [
+                        "embedservefixed",
+                        "embeddingfixed",
+                        "retrieverfixed",
+                        "rerankservefixed",
+                        "rerankingfixed",
+                    ]:
+                        respData = {
+                            "total_latency": time.perf_counter() - start_ts,
+                        }
+                    elif self.environment.parsed_options.bench_target in [
                         "audioqnafixed",
                         "audioqnabench",
                     ]:  # non-stream case
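
Editor's note: `stream=True` is now reserved for the chat-style targets that emit token streams; the new microservice targets get a fully buffered response and a single end-to-end timing, which is what the added `respData` branch records. A condensed sketch of the same dispatch, using `requests` in place of Locust's client; the constant names are hypothetical:

```python
import time

import requests  # stand-in for Locust's self.client

STREAMING_TARGETS = {"llmfixed", "chatqnafixed", "codegenfixed", "faqgenfixed"}
MICRO_TARGETS = {"embedservefixed", "embeddingfixed", "retrieverfixed",
                 "rerankservefixed", "rerankingfixed"}


def timed_post(bench_target: str, url: str, payload: dict) -> dict:
    start = time.perf_counter()
    resp = requests.post(
        url,
        json=payload,
        stream=bench_target in STREAMING_TARGETS,  # buffer microservice replies
        timeout=600,
    )
    resp.raise_for_status()
    if bench_target in MICRO_TARGETS:
        # One latency number per request; token statistics do not apply.
        return {"total_latency": time.perf_counter() - start}
    # Streaming targets would instead walk resp.iter_lines() to time tokens.
    return {}
```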

evals/benchmark/stresscli/locust/embeddingfixed.py

Lines changed: 5 additions & 3 deletions
@@ -14,9 +14,11 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
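
Editor's note: every locust target module exposes the same trio (`getReqData`, `respStatics`, `staticsOutput`). For the microservice targets, `respStatics` gains a `reqData` argument to match the updated call site and converts the driver's seconds-based `total_latency` into milliseconds; the same three-line change repeats in the four files below. A toy invocation, with made-up numbers:

```python
def respStatics(environment, reqData, resp):
    # perf_counter delta arrives in seconds; report milliseconds.
    return {"total_latency": resp["total_latency"] * 1000}


record = respStatics(None, {"input": "hello"}, {"total_latency": 0.5})
print(record)  # {'total_latency': 500.0}
```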

evals/benchmark/stresscli/locust/embedservefixed.py

Lines changed: 5 additions & 3 deletions
@@ -14,9 +14,11 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)

evals/benchmark/stresscli/locust/rerankingfixed.py

Lines changed: 5 additions & 3 deletions
@@ -17,9 +17,11 @@ def getReqData():
     return {"initial_query": my_query, "retrieved_docs": [{"text": query_rerank_1}, {"text": query_rerank_2}]}


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)

evals/benchmark/stresscli/locust/rerankservefixed.py

Lines changed: 5 additions & 3 deletions
@@ -17,9 +17,11 @@ def getReqData():
     return {"query": my_query, "texts": [query_rerank_1, query_rerank_2]}


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)

evals/benchmark/stresscli/locust/retrieverfixed.py

Lines changed: 5 additions & 3 deletions
@@ -786,9 +786,11 @@ def getReqData():
     return ({"text": my_query, "embedding": my_embedding},)


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)

evals/benchmark/stresscli/locust/tokenresponse.py

Lines changed: 35 additions & 0 deletions
@@ -119,3 +119,38 @@ def staticsOutput(environment, reqlist):
     console_logger.warning(average_msg.format(numpy.average(avg_token)))
     console_logger.warning("======================================================\n\n")
     logging.shutdown()
+
+
+def staticsOutputForMicroservice(environment, reqlist):
+    e2e_lat = []
+    duration = environment.runner.stats.last_request_timestamp - environment.runner.stats.start_time
+
+    if len(reqlist) == 0:
+        logging.debug(f"len(reqlist): {len(reqlist)}, skip printing")
+        return
+    for req in iter(reqlist):
+        e2e_lat.append(req["total_latency"])
+
+    # Statistics for success response data only
+    req_msg = "Succeed Response: {} (Total {}, {:.1%} Success), Duration: {:.2f}s, RPS: {:.2f}"
+    e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
+    console_logger.warning("\n=================Total statistics=====================")
+    console_logger.warning(
+        req_msg.format(
+            len(reqlist),
+            environment.runner.stats.num_requests,
+            len(reqlist) / environment.runner.stats.num_requests,
+            duration,
+            len(reqlist) / duration,
+        )
+    )
+    console_logger.warning(
+        e2e_msg.format(
+            numpy.percentile(e2e_lat, 50),
+            numpy.percentile(e2e_lat, 90),
+            numpy.percentile(e2e_lat, 99),
+            numpy.average(e2e_lat),
+        )
+    )
+    console_logger.warning("======================================================\n\n")
+    logging.shutdown()
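
Editor's note: `staticsOutputForMicroservice` mirrors the existing `staticsOutput`, but aggregates only the per-request `total_latency` values (already in milliseconds) into a success rate, RPS, and latency percentiles. The same arithmetic on made-up data, runnable standalone:

```python
import numpy

# Hypothetical per-request records, shaped like the micro-target
# respStatics output above (latencies in milliseconds).
reqlist = [{"total_latency": ms} for ms in (12.0, 13.3, 14.2, 15.5, 80.1)]
e2e_lat = [r["total_latency"] for r in reqlist]

num_requests = 6  # assume one request failed and was never recorded
duration = 2.0    # assumed wall-clock duration of the run, in seconds

print(
    "Succeed Response: {} (Total {}, {:.1%} Success), Duration: {:.2f}s, RPS: {:.2f}".format(
        len(reqlist), num_requests, len(reqlist) / num_requests, duration, len(reqlist) / duration
    )
)
print(
    "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}".format(
        numpy.percentile(e2e_lat, 50),
        numpy.percentile(e2e_lat, 90),
        numpy.percentile(e2e_lat, 99),
        numpy.average(e2e_lat),
    )
)
```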
