forked from ggml-org/llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmeasurement.py
More file actions
139 lines (109 loc) · 4.33 KB
/
Copy pathmeasurement.py
File metadata and controls
139 lines (109 loc) · 4.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from statistics import mean, stdev
import time
from llama_cpp import Llama
# Benchmark configuration (consumed by main()).

# Location of the GGUF model file handed to Llama(model_path=...).
MODEL_PATH: str = "models/Qwen3-0.6B-Q4_K_M.gguf"
# Prompt text sent unchanged on every warm-up and measured run.
PROMPT: str = "You are a helpful assistant."
# Upper bound on generated tokens per run (forwarded as max_tokens).
MAX_TOKENS: int = 128
# Number of runs whose results are recorded and summarized.
RUNS: int = 10
# Number of untimed-in-summary runs executed first; their results are discarded.
WARMUP_RUNS: int = 3
def measure_once(model: Llama, prompt: str, max_tokens: int) -> dict:
    """Run one streamed completion and return wall-clock latency/throughput stats.

    The completion is consumed in streaming mode so that the arrival time of
    the first non-empty text chunk can be recorded as time-to-first-token
    (TTFT).

    Args:
        model: Object exposing ``reset()`` and
            ``create_completion(prompt=..., max_tokens=..., stream=True)``
            that yields chunks shaped like ``{"choices": [{"text": ...}]}``.
        prompt: Prompt text passed through to ``create_completion``.
        max_tokens: Generation limit passed through to ``create_completion``.

    Returns:
        A dict with these keys:

        * ``ttft_ms`` -- milliseconds from start until the first non-empty
          chunk; equals the total run time if no text was produced.
        * ``total_time_s`` -- seconds for the whole streamed call.
        * ``generated_tokens`` -- number of non-empty chunks (each non-empty
          chunk is counted as one token).
        * ``total_chunks`` / ``empty_chunks`` -- all chunks seen, and those
          whose text was empty or missing.
        * ``throughput_tps`` -- ``generated_tokens / total_time_s``
          (includes the time before the first token).
        * ``generation_throughput_tps`` -- tokens produced *after* the first
          one, divided by the time elapsed after the first one.
        * ``response_preview`` -- first 120 characters of the joined text
          with newlines replaced by spaces.
    """
    # Reset the model state so that repeating the same prompt does not reuse
    # cached state from the previous run (original author's note: KV cache).
    model.reset()

    start = time.perf_counter()
    first_token_time = None
    total_chunks = 0
    pieces = []

    stream = model.create_completion(
        prompt=prompt,
        max_tokens=max_tokens,
        stream=True,
    )
    for chunk in stream:
        text = chunk["choices"][0].get("text", "")
        total_chunks += 1
        if not text:
            # Empty chunks are tallied but never count as a token or as TTFT.
            continue
        if first_token_time is None:
            # TTFT is taken at the first chunk that actually carries text.
            first_token_time = time.perf_counter()
        pieces.append(text)

    end = time.perf_counter()

    generated_tokens = len(pieces)
    empty_chunks = total_chunks - generated_tokens
    total_s = end - start

    if first_token_time is None:
        ttft_s = total_s
        gen_time_s = 0.0
    else:
        ttft_s = first_token_time - start
        gen_time_s = end - first_token_time

    throughput = generated_tokens / total_s if total_s > 0 else 0.0

    # The first token has already arrived when the generation window opens,
    # so only the remaining (N - 1) tokens were produced inside gen_time_s.
    # Dividing all N tokens by that window would overstate the decode rate.
    decoded_after_first = generated_tokens - 1
    if decoded_after_first > 0 and gen_time_s > 0:
        generation_throughput = decoded_after_first / gen_time_s
    else:
        generation_throughput = 0.0

    return {
        "ttft_ms": ttft_s * 1000.0,
        "total_time_s": total_s,
        "generated_tokens": generated_tokens,
        "total_chunks": total_chunks,
        "empty_chunks": empty_chunks,
        "throughput_tps": throughput,
        "generation_throughput_tps": generation_throughput,
        "response_preview": "".join(pieces)[:120].replace("\n", " "),
    }
def summarize(values: list[float]) -> tuple[float, float]:
    """Return ``(mean, sample standard deviation)`` for *values*.

    An empty input yields ``(0.0, 0.0)``. A single value is returned as the
    mean with a deviation of ``0.0``, because ``statistics.stdev`` needs at
    least two data points.
    """
    count = len(values)
    if count >= 2:
        return mean(values), stdev(values)
    if count == 1:
        return values[0], 0.0
    return 0.0, 0.0
def main() -> None:
    """Load the model, run warm-up and measured passes, and print a report.

    The model at ``MODEL_PATH`` is loaded once. ``WARMUP_RUNS`` completions
    are executed and discarded, then ``RUNS`` completions are measured with
    ``measure_once``. One line is printed per measured run, followed by a
    mean ± standard-deviation summary of TTFT and both throughput figures,
    plus the average share of empty stream chunks.
    """
    print("Loading model...")
    model = Llama(
        model_path=MODEL_PATH,
        # NOTE(review): -1 is understood to mean "offload all layers to the
        # GPU" in llama-cpp-python; confirm against the installed version.
        n_gpu_layers=-1,
        n_threads=6,
    )

    print(f"\nPrompt: {PROMPT}")
    print(f"Max tokens: {MAX_TOKENS}")
    print(f"Warm-up runs: {WARMUP_RUNS}")
    print(f"Measured runs: {RUNS}\n")

    # Warm-up results are intentionally thrown away.
    for i in range(WARMUP_RUNS):
        measure_once(model, PROMPT, MAX_TOKENS)
        print(f"Warm-up {i + 1}/{WARMUP_RUNS} done")
    print()

    results = []
    for i in range(RUNS):
        result = measure_once(model, PROMPT, MAX_TOKENS)
        results.append(result)
        empty_ratio = result['empty_chunks'] / result['total_chunks'] if result['total_chunks'] > 0 else 0.0
        print(
            f"Run {i + 1:02d}: "
            f"TTFT={result['ttft_ms']:.2f} ms, "
            f"Throughput={result['throughput_tps']:.2f} t/s, "
            f"GenThroughput={result['generation_throughput_tps']:.2f} t/s, "
            f"Tokens={result['generated_tokens']}, "
            f"EmptyChunks={result['empty_chunks']}/{result['total_chunks']} ({empty_ratio:.1%})"
        )

    ttft_mean, ttft_std = summarize([r["ttft_ms"] for r in results])
    thr_mean, thr_std = summarize([r["throughput_tps"] for r in results])
    gen_thr_mean, gen_thr_std = summarize(
        [r["generation_throughput_tps"] for r in results]
    )

    # Use summarize() for the chunk averages as well: unlike a bare mean(),
    # it returns 0.0 for an empty result list (RUNS == 0) instead of raising.
    avg_total_chunks, _ = summarize([r["total_chunks"] for r in results])
    avg_empty_chunks, _ = summarize([r["empty_chunks"] for r in results])
    # Same zero guard as the per-run ratio above, so a run set that produced
    # no chunks at all cannot raise ZeroDivisionError.
    avg_empty_ratio = (
        avg_empty_chunks / avg_total_chunks if avg_total_chunks > 0 else 0.0
    )

    print(f"\n===== Summary ({RUNS} runs) =====")
    print(f"TTFT : {ttft_mean:.2f} ± {ttft_std:.2f} ms")
    print(f"Throughput : {thr_mean:.2f} ± {thr_std:.2f} tokens/s (prefill 포함)")
    print(f"Generation Throughput: {gen_thr_mean:.2f} ± {gen_thr_std:.2f} tokens/s (decode only)")
    print(f"Avg Empty Chunks : {avg_empty_chunks:.1f} / {avg_total_chunks:.1f} ({avg_empty_ratio:.1%})")
# Run the benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()