From 20e675ab4179edcdb9b5830e7055f50e8d678405 Mon Sep 17 00:00:00 2001 From: Zhang Yulong <35552275+ZhangYulongg@users.noreply.github.com> Date: Wed, 17 Jun 2026 16:27:39 +0800 Subject: [PATCH] [benchmark] Enhance benchmark metrics with ITL aggregation details Added metrics for cleaned ITL aggregation including decode speed and counts for different ITL categories. --- benchmarks/benchmark_serving.py | 53 ++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index d48edb7df4b..7f5faf56688 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -63,6 +63,12 @@ class BenchmarkMetrics: request_goodput: float output_throughput: float total_token_throughput: float + # Globally aggregated decode speed (after filtering out burst and preemption samples) + s_decode_clean: float # tok/s + n_itls_total: int # total number of ITL samples + n_itls_burst: int # number of ITLs < 1ms + n_itls_preempt: int # number of ITLs > 500ms + n_itls_clean: int # number of ITLs with 1ms <= itl <= 500ms mean_s_decode: float median_s_decode: float std_s_decode: float @@ -289,6 +295,22 @@ def calculate_metrics( "All requests failed. 
This is likely due to a misconfiguration " "on the benchmark arguments.", stacklevel=2, ) + + # === Cleaned ITL aggregation === + BURST_THRESHOLD_S = 0.001 # 1 ms + PREEMPT_THRESHOLD_S = 0.5 # 500 ms + all_itls_flat: list[float] = [] + for o in outputs: + if o.success: + all_itls_flat.extend(o.itl) + _arr = np.asarray(all_itls_flat, dtype=float) if all_itls_flat else np.empty(0) + n_itls_total = int(_arr.size) + n_itls_burst = int((_arr < BURST_THRESHOLD_S).sum()) + n_itls_preempt = int((_arr > PREEMPT_THRESHOLD_S).sum()) + _clean = _arr[(_arr >= BURST_THRESHOLD_S) & (_arr <= PREEMPT_THRESHOLD_S)] + n_itls_clean = int(_clean.size) + s_decode_clean = float(_clean.size / _clean.sum()) if _clean.sum() > 0 else 0.0 + + metrics = BenchmarkMetrics( completed=completed, total_input=total_input, @@ -349,6 +371,11 @@ def calculate_metrics( std_res_ttft_ms=np.std(res_ttfts or 0) * 1000, median_res_ttft_ms=np.median(res_ttfts or 0) * 1000, percentiles_res_ttft_ms=[(p, np.percentile(res_ttfts or 0, p) * 1000) for p in selected_percentiles], + s_decode_clean=s_decode_clean, + n_itls_total=n_itls_total, + n_itls_burst=n_itls_burst, + n_itls_preempt=n_itls_preempt, + n_itls_clean=n_itls_clean, ) return metrics, actual_output_lens @@ -441,8 +468,8 @@ async def benchmark( out_list, metrics = test_output test_output = out_list[0] + print("test_output:", test_output, flush=True) if not test_output.success: - print("test_output:", test_output, flush=True) raise ValueError( f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. 
Error: {test_output.error}" ) @@ -750,6 +777,11 @@ async def limited_request_func_per_ip(req_input, semaphore, pbar): "reasoning_contents": [output.reasoning_content for output in outputs], "errors": [output.error for output in outputs], "metrics": [output.metrics for output in outputs], + "s_decode_clean": metrics.s_decode_clean, + "n_itls_total": metrics.n_itls_total, + "n_itls_burst": metrics.n_itls_burst, + "n_itls_preempt": metrics.n_itls_preempt, + "n_itls_clean": metrics.n_itls_clean, } def process_one_metric( @@ -898,6 +930,25 @@ def process_one_length( print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value)) result[f"p{p_word}_{metric_attribute_name}"] = value + print("{s:{c}^{n}}".format(s="解码速度 (ITL全局聚合)", n=50, c="-")) + _tot = max(metrics.n_itls_total, 1) + print("{:<40} {:<10d}".format("Total ITLs:", metrics.n_itls_total)) + print( + "{:<40} {:<10d} ({:.2f}%)".format( + "ITL < 1ms (burst):", metrics.n_itls_burst, 100 * metrics.n_itls_burst / _tot + ) + ) + print( + "{:<40} {:<10d} ({:.2f}%)".format( + "ITL > 500ms (preempt):", metrics.n_itls_preempt, 100 * metrics.n_itls_preempt / _tot + ) + ) + print( + "{:<40} {:<10d} ({:.2f}%)".format( + "ITL clean [1ms,500ms]:", metrics.n_itls_clean, 100 * metrics.n_itls_clean / _tot + ) + ) + print("{:<40} {:<10.2f}".format("Decode speed (clean, tok/s):", metrics.s_decode_clean)) process_one_length("s_decode", "Decode", "解码速度(tok/s)") process_one_metric("ttft", "TTFT", "Time to First Token") process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token")