sgl-project · zhyncs · May 11, 2025 · May 10, 2025
diff --git a/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py b/benchmark/bench_in_batch_prefix/bench_in_batch_prefix.py
@@ -64,11 +64,11 @@ def test_batch_by_batch(all_prompts, gen_len):
 
     tot_time = 0
     for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
         text_qa.run_batch(
             list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))),
         )
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic
 
     return tot_time
 
@@ -78,13 +78,13 @@ def test_batch_by_batch_with_hint(all_prompts, gen_len):
 
     tot_time = 0
     for i in range(len(all_prompts)):
-        tic = time.time()
+        tic = time.perf_counter()
         # Send a hint to cache the prefix
         text_qa.run_batch(list(zip(all_prompts[i][:1], [gen_len])))
         # Send the batch
         text_qa.run_batch(list(zip(all_prompts[i], [gen_len] * len(all_prompts[i]))))
 
-        tot_time += time.time() - tic
+        tot_time += time.perf_counter() - tic
 
     return tot_time
 
@@ -94,11 +94,11 @@ def test_send_all(all_prompts, gen_len):
 
     all_prompts = [x for prompt_list in all_prompts for x in prompt_list]
 
-    tic = time.time()
+    tic = time.perf_counter()
     text_qa.run_batch(
         list(zip(all_prompts, [gen_len] * len(all_prompts))),
     )
-    tot_time = time.time() - tic
+    tot_time = time.perf_counter() - tic
 
     return tot_time
 

diff --git a/benchmark/benchmark_batch/benchmark_batch.py b/benchmark/benchmark_batch/benchmark_batch.py
@@ -81,7 +81,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
     }
     data = {"text": prompts, "sampling_params": sampling_params}
 
-    start_time = time.time()
+    start_time = time.perf_counter()
     try:
         response = requests.post(
             endpoint.base_url + "/generate", json=data, timeout=3600
@@ -90,7 +90,7 @@ def send_batch_request(endpoint, prompts, gen_tokens, request_id):
             error = response.json()
             raise RuntimeError(f"Request {request_id} failed: {error}")
         result = response.json()
-        elapsed_time = (time.time() - start_time) * 1000  # Convert to ms
+        elapsed_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
         avg_per_prompt = elapsed_time / len(prompts) if prompts else 0
         return request_id, elapsed_time, avg_per_prompt, True, len(prompts)
     except Exception as e:
@@ -104,7 +104,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
     num_requests = len(batched_prompts)
 
     # Record start time for total latency
-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
 
     for i, batch_prompts in enumerate(batched_prompts):
         request_id = i + 1
@@ -119,7 +119,7 @@ def run_benchmark(endpoint, batched_prompts, batch_size, gen_tokens):
         results.append(result)
 
     # Calculate total latency
-    total_latency = (time.time() - benchmark_start_time) * 1000  # Convert to ms
+    total_latency = (time.perf_counter() - benchmark_start_time) * 1000  # Convert to ms
 
     return results, total_latency
 

diff --git a/benchmark/benchmark_batch/benchmark_tokenizer.py b/benchmark/benchmark_batch/benchmark_tokenizer.py
@@ -44,20 +44,20 @@ def benchmark_sequential_vs_batch(prompts, batch_size, tokenizer):
     for run in range(NUM_RUNS):
         batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
 
-        start_time = time.time()
+        start_time = time.perf_counter()
         for prompt in batch_prompts:
             tokens = tokenizer.encode(prompt)
-        sequential_time = (time.time() - start_time) * 1000
+        sequential_time = (time.perf_counter() - start_time) * 1000
         sequential_times.append(sequential_time)
 
     # Batch tokenization using tokenizer()
     batch_times = []
     for run in range(NUM_RUNS):
         batch_prompts = prompts[:batch_size]  # Use same prompts for fair comparison
 
-        start_time = time.time()
+        start_time = time.perf_counter()
         tokens = tokenizer(batch_prompts)
-        batch_time = (time.time() - start_time) * 1000
+        batch_time = (time.perf_counter() - start_time) * 1000
         batch_times.append(batch_time)
 
     return {

diff --git a/benchmark/generative_agents/bench_other.py b/benchmark/generative_agents/bench_other.py
@@ -39,7 +39,7 @@ async def get_one_answer_async(arg):
         answer = await call_generate(**arg, temperature=0)
         states.append(answer)
 
-    tic = time.time()
+    tic = time.perf_counter()
     # we always sequentially execute agent calls to maintain its dependency
     if args.backend != "lmql":
         for arg in tqdm(arguments):
@@ -50,7 +50,7 @@ async def get_one_answer_async(arg):
         loop = asyncio.get_event_loop()
         for arg in tqdm(arguments):
             loop.run_until_complete(get_one_answer_async(arg))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     print(f"Latency: {latency:.3f}")
 

diff --git a/benchmark/generative_agents/bench_sglang.py b/benchmark/generative_agents/bench_sglang.py
@@ -35,14 +35,14 @@ def main(args):
 
     states = []
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     for a in arguments:
         # only a single key in the dict
         for func, arg in a.items():
             result = func.run(**arg)
         result.sync()
         states.append(result)
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     print(f"Latency: {latency:.3f}")

diff --git a/benchmark/gsm8k/bench_other.py b/benchmark/gsm8k/bench_other.py
@@ -75,7 +75,7 @@ def get_one_answer(i):
             )
             states[i] = answer
 
-        tic = time.time()
+        tic = time.perf_counter()
         if args.parallel == 1:
             for i in tqdm(range(len(questions))):
                 get_one_answer(i)
@@ -106,9 +106,9 @@ async def batched_call(batch_size):
                 for j in range(len(rets)):
                     states[i + j] = rets[j]
 
-        tic = time.time()
+        tic = time.perf_counter()
         asyncio.run(batched_call(batch_size=args.parallel))
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     preds = []
     for i in range(len(states)):

diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py
@@ -84,14 +84,14 @@ def few_shot_gsm8k(s, question):
     #####################################
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = few_shot_gsm8k.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     preds = []
     for i in range(len(states)):

diff --git a/benchmark/hellaswag/bench_other.py b/benchmark/hellaswag/bench_other.py
@@ -57,7 +57,7 @@ def get_one_answer(i):
                 context=few_shot_examples + questions[i], choices=choices[i]
             )
 
-        tic = time.time()
+        tic = time.perf_counter()
         if args.parallel == 1:
             for i in tqdm(range(len(questions))):
                 get_one_answer(i)
@@ -82,10 +82,10 @@ async def batched_call(batch_size):
                 for j in range(len(rets)):
                     preds[i + j] = rets[j]
 
-        tic = time.time()
+        tic = time.perf_counter()
         asyncio.run(batched_call(batch_size=args.parallel))
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     acc = np.mean(np.array(preds) == np.array(labels))

diff --git a/benchmark/hellaswag/bench_sglang.py b/benchmark/hellaswag/bench_sglang.py
@@ -68,15 +68,15 @@ def few_shot_hellaswag(s, question, choices):
     #####################################
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     rets = few_shot_hellaswag.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
     preds = [choices[i].index(rets[i]["answer"]) for i in range(len(rets))]
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     acc = np.mean(np.array(preds) == np.array(labels))

diff --git a/benchmark/hicache/bench_multiturn.py b/benchmark/hicache/bench_multiturn.py
@@ -261,7 +261,7 @@ async def handle_request(self, item):
             client_id, payload = item
             response = await async_request_sglang_generate(payload, self.url, self.pbar)
             if self.pbar.n == self.pbar.total:
-                self.finished_time = time.time()
+                self.finished_time = time.perf_counter()
             self.response_queue.put((client_id, response))
         except Exception as e:
             print(f"Request failed: {e}")
@@ -334,7 +334,7 @@ def run(self):
         request_thread = threading.Thread(target=self.request_sender, daemon=True)
         response_thread = threading.Thread(target=self.response_handler, daemon=True)
 
-        self.start_time = time.time()
+        self.start_time = time.perf_counter()
         request_thread.start()
         response_thread.start()
 

diff --git a/benchmark/json_decode_regex/bench_other.py b/benchmark/json_decode_regex/bench_other.py
@@ -53,7 +53,7 @@ def main(args):
     def get_one_answer(i):
         states[i] = json_decode(generate=call_generate, **arguments[i])
 
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)
@@ -68,7 +68,7 @@ def get_one_answer(i):
             for _ in rets:
                 pass
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     print(f"Latency: {latency:.3f}")

diff --git a/benchmark/json_decode_regex/bench_sglang.py b/benchmark/json_decode_regex/bench_sglang.py
@@ -63,11 +63,11 @@ def main(args):
     json_warm_up.run().sync()
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = json_decode.run_batch(
         arguments, temperature=0, num_threads=args.parallel, progress_bar=True
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Compute accuracy
     print(f"Latency: {latency:.3f}")

diff --git a/benchmark/json_jump_forward/bench_other.py b/benchmark/json_jump_forward/bench_other.py
@@ -175,7 +175,7 @@ async def get_one_answer_async(i):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")
 
-    tic = time.time()
+    tic = time.perf_counter()
 
     if args.backend != "lmql":
         if args.parallel == 1:
@@ -202,7 +202,7 @@ async def get_one_answer_async(i):
                 asyncio.gather(*[get_one_answer_async(i) for i in bt])
             )
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     return states, latency
 
@@ -236,7 +236,7 @@ def get_one_answer(i):
     else:
         raise ValueError(f"Invalid backend: {args.backend}")
 
-    tic = time.time()
+    tic = time.perf_counter()
     if args.parallel == 1:
         for i in tqdm(range(len(arguments))):
             get_one_answer(i)
@@ -246,7 +246,7 @@ def get_one_answer(i):
             for _ in rets:
                 pass
 
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     return states, latency
 

diff --git a/benchmark/json_jump_forward/bench_sglang.py b/benchmark/json_jump_forward/bench_sglang.py
@@ -67,14 +67,14 @@ def bench_city_doc(args):
     sgl.set_default_backend(backend)
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = city_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     return states, latency
 
@@ -91,14 +91,14 @@ def bench_character(args):
     sgl.set_default_backend(backend)
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = character_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     return states, latency
 

diff --git a/benchmark/json_schema/bench_sglang.py b/benchmark/json_schema/bench_sglang.py
@@ -85,14 +85,14 @@ def bench_schema(args):
     sgl.set_default_backend(backend)
 
     # Run requests
-    tic = time.time()
+    tic = time.perf_counter()
     states = schema_gen.run_batch(
         arguments,
         temperature=0,
         num_threads=args.parallel,
         progress_bar=True,
     )
-    latency = time.time() - tic
+    latency = time.perf_counter() - tic
 
     # Check if the outputs are valid
     indexs = []

diff --git a/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py b/benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
@@ -487,7 +487,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
             ]
         print(f"Start tuning over {len(search_space)} configurations...")
 
-        start = time.time()
+        start = time.perf_counter()
         configs = _distribute(
             "tune",
             [
@@ -522,7 +522,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
             use_int8_w8a16,
             block_shape,
         )
-        end = time.time()
+        end = time.perf_counter()
         print(f"Tuning took {end - start:.2f} seconds")
     else:
         outputs = _distribute(

diff --git a/benchmark/kernels/quantization/tuning_block_wise_kernel.py b/benchmark/kernels/quantization/tuning_block_wise_kernel.py
@@ -359,7 +359,7 @@ def tune_on_gpu(args_dict):
         config for config in search_space if block_k % config["BLOCK_SIZE_K"] == 0
     ]
 
-    start = time.time()
+    start = time.perf_counter()
     results = {}
     for shape in tqdm(weight_shapes, desc=f"GPU {gpu_id} - Shapes"):
         N, K = shape[0], shape[1]
@@ -379,7 +379,7 @@ def tune_on_gpu(args_dict):
         best_configs = {M: config for M, config in zip(batch_sizes, benchmark_results)}
         save_configs(N, K, block_n, block_k, best_configs, save_path, input_type)
 
-    end = time.time()
+    end = time.perf_counter()
     print(f"Tuning on GPU {gpu_id} took {end - start:.2f} seconds")