-
Notifications
You must be signed in to change notification settings - Fork 751
[benchmark] Enhance benchmark metrics with ITL aggregation details #8063
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -63,6 +63,12 @@ class BenchmarkMetrics: | |
| request_goodput: float | ||
| output_throughput: float | ||
| total_token_throughput: float | ||
| # 全局聚合解码速度(过滤 burst 与 preemption 后) | ||
| s_decode_clean: float # tok/s | ||
| n_itls_total: int # 全部 itl 样本数 | ||
| n_itls_burst: int # itl < 1ms 的数量 | ||
| n_itls_preempt: int # itl > 500ms 的数量 | ||
| n_itls_clean: int # 1ms <= itl <= 500ms 的数量 | ||
| mean_s_decode: float | ||
| median_s_decode: float | ||
| std_s_decode: float | ||
|
|
@@ -289,6 +295,22 @@ def calculate_metrics( | |
| "All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.", | ||
| stacklevel=2, | ||
| ) | ||
|
|
||
| # === Cleaned ITL aggregation === | ||
| BURST_THRESHOLD_S = 0.001 # 1 ms | ||
| PREEMPT_THRESHOLD_S = 0.5 # 500 ms | ||
| all_itls_flat: list[float] = [] | ||
| for o in outputs: | ||
| if o.success: | ||
| all_itls_flat.extend(o.itl) | ||
| _arr = np.asarray(all_itls_flat, dtype=float) if all_itls_flat else np.empty(0) | ||
| n_itls_total = int(_arr.size) | ||
| n_itls_burst = int((_arr < BURST_THRESHOLD_S).sum()) | ||
| n_itls_preempt = int((_arr > PREEMPT_THRESHOLD_S).sum()) | ||
| _clean = _arr[(_arr >= BURST_THRESHOLD_S) & (_arr <= PREEMPT_THRESHOLD_S)] | ||
| n_itls_clean = int(_clean.size) | ||
| s_decode_clean = float(_clean.size / _clean.sum()) if _clean.sum() > 0 else 0.0 | ||
|
|
||
| metrics = BenchmarkMetrics( | ||
| completed=completed, | ||
| total_input=total_input, | ||
|
|
@@ -349,6 +371,11 @@ def calculate_metrics( | |
| std_res_ttft_ms=np.std(res_ttfts or 0) * 1000, | ||
| median_res_ttft_ms=np.median(res_ttfts or 0) * 1000, | ||
| percentiles_res_ttft_ms=[(p, np.percentile(res_ttfts or 0, p) * 1000) for p in selected_percentiles], | ||
| s_decode_clean=s_decode_clean, | ||
| n_itls_total=n_itls_total, | ||
| n_itls_burst=n_itls_burst, | ||
| n_itls_preempt=n_itls_preempt, | ||
| n_itls_clean=n_itls_clean, | ||
| ) | ||
|
|
||
| return metrics, actual_output_lens | ||
|
|
@@ -441,8 +468,8 @@ async def benchmark( | |
| out_list, metrics = test_output | ||
| test_output = out_list[0] | ||
|
|
||
| print("test_output:", test_output, flush=True) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🔴 安全 这里把 warmup 成功响应也无条件打印到日志,会泄露完整 `test_output`。
建议修复方式:恢复为仅失败时打印完整 `test_output`。 |
||
| if not test_output.success: | ||
| print("test_output:", test_output, flush=True) | ||
| raise ValueError( | ||
| f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. Error: {test_output.error}" | ||
| ) | ||
|
|
@@ -750,6 +777,11 @@ async def limited_request_func_per_ip(req_input, semaphore, pbar): | |
| "reasoning_contents": [output.reasoning_content for output in outputs], | ||
| "errors": [output.error for output in outputs], | ||
| "metrics": [output.metrics for output in outputs], | ||
| "s_decode_clean": metrics.s_decode_clean, | ||
| "n_itls_total": metrics.n_itls_total, | ||
| "n_itls_burst": metrics.n_itls_burst, | ||
| "n_itls_preempt": metrics.n_itls_preempt, | ||
| "n_itls_clean": metrics.n_itls_clean, | ||
| } | ||
|
|
||
| def process_one_metric( | ||
|
|
@@ -898,6 +930,25 @@ def process_one_length( | |
| print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value)) | ||
| result[f"p{p_word}_{metric_attribute_name}"] = value | ||
|
|
||
| print("{s:{c}^{n}}".format(s="解码速度 (ITL全局聚合)", n=50, c="-")) | ||
| _tot = max(metrics.n_itls_total, 1) | ||
| print("{:<40} {:<10d}".format("Total ITLs:", metrics.n_itls_total)) | ||
| print( | ||
| "{:<40} {:<10d} ({:.2f}%)".format( | ||
| "ITL < 1ms (burst):", metrics.n_itls_burst, 100 * metrics.n_itls_burst / _tot | ||
| ) | ||
| ) | ||
| print( | ||
| "{:<40} {:<10d} ({:.2f}%)".format( | ||
| "ITL > 500ms (preempt):", metrics.n_itls_preempt, 100 * metrics.n_itls_preempt / _tot | ||
| ) | ||
| ) | ||
| print( | ||
| "{:<40} {:<10d} ({:.2f}%)".format( | ||
| "ITL clean [1ms,500ms]:", metrics.n_itls_clean, 100 * metrics.n_itls_clean / _tot | ||
| ) | ||
| ) | ||
| print("{:<40} {:<10.2f}".format("Decode speed (clean, tok/s):", metrics.s_decode_clean)) | ||
| process_one_length("s_decode", "Decode", "解码速度(tok/s)") | ||
| process_one_metric("ttft", "TTFT", "Time to First Token") | ||
| process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🔴 Bug 新增 clean ITL 聚合只按 `o.success` 收集样本,会把主统计已经跳过的 `output_tokens == 0` 响应重新计入。`calculate_metrics()` 前面的主循环在 `output_len = outputs[i].output_tokens` 后会对空输出直接 `continue`,因此这些请求不会进入 `completed`、`total_output`、普通 `itl`/`s_decode` 等指标。这里重新遍历 `outputs` 并只判断 `o.success`,会让 `s_decode_clean`、`n_itls_*` 与同一份结果中的主指标使用不同样本集;对某些流式后端,成功返回文本但没有 usage/`completion_tokens` 时,已有代码正是走这个跳过路径。

建议修复方式:复用主循环已经确认有效的 `itls` 样本,例如直接基于 `itls` 构造 `_arr`,或把 clean 聚合移动到 `if not output_len: continue` 之后,保证与 `completed`/`total_output`/普通 ITL 指标口径一致。