From 20e675ab4179edcdb9b5830e7055f50e8d678405 Mon Sep 17 00:00:00 2001
From: Zhang Yulong <35552275+ZhangYulongg@users.noreply.github.com>
Date: Wed, 17 Jun 2026 16:27:39 +0800
Subject: [PATCH] [benchmark] Enhance benchmark metrics with ITL aggregation
 details

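Add globally aggregated inter-token latency (ITL) metrics to the serving
benchmark. ITL samples from all successful requests are pooled and split
into three categories: burst (< 1 ms), preempt (> 500 ms), and clean
(1 ms <= ITL <= 500 ms). The benchmark now reports the count of each
category, the total sample count, and a "clean" decode speed in tok/s
computed as the number of clean samples divided by their summed latency.

Also print the initial test request output unconditionally instead of
only when the test request fails.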
---
 benchmarks/benchmark_serving.py | 53 ++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index d48edb7df4b..7f5faf56688 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -63,6 +63,12 @@ class BenchmarkMetrics:
     request_goodput: float
     output_throughput: float
     total_token_throughput: float
+    # Globally aggregated decode speed (after filtering out burst and preemption samples)
+    s_decode_clean: float  # tok/s
+    n_itls_total: int  # total number of ITL samples
+    n_itls_burst: int  # number of ITL samples with itl < 1 ms
+    n_itls_preempt: int  # number of ITL samples with itl > 500 ms
+    n_itls_clean: int  # number of ITL samples with 1 ms <= itl <= 500 ms
     mean_s_decode: float
     median_s_decode: float
     std_s_decode: float
@@ -289,6 +295,22 @@ def calculate_metrics(
             "All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.",
             stacklevel=2,
         )
+
+    # === Cleaned ITL aggregation ===
+    BURST_THRESHOLD_S = 0.001  # 1 ms
+    PREEMPT_THRESHOLD_S = 0.5  # 500 ms
+    all_itls_flat: list[float] = []
+    for o in outputs:
+        if o.success:
+            all_itls_flat.extend(o.itl)
+    _arr = np.asarray(all_itls_flat, dtype=float) if all_itls_flat else np.empty(0)
+    n_itls_total = int(_arr.size)
+    n_itls_burst = int((_arr < BURST_THRESHOLD_S).sum())
+    n_itls_preempt = int((_arr > PREEMPT_THRESHOLD_S).sum())
+    _clean = _arr[(_arr >= BURST_THRESHOLD_S) & (_arr <= PREEMPT_THRESHOLD_S)]
+    n_itls_clean = int(_clean.size)
+    s_decode_clean = float(_clean.size / _clean.sum()) if _clean.sum() > 0 else 0.0
+
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
@@ -349,6 +371,11 @@ def calculate_metrics(
         std_res_ttft_ms=np.std(res_ttfts or 0) * 1000,
         median_res_ttft_ms=np.median(res_ttfts or 0) * 1000,
         percentiles_res_ttft_ms=[(p, np.percentile(res_ttfts or 0, p) * 1000) for p in selected_percentiles],
+        s_decode_clean=s_decode_clean,
+        n_itls_total=n_itls_total,
+        n_itls_burst=n_itls_burst,
+        n_itls_preempt=n_itls_preempt,
+        n_itls_clean=n_itls_clean,
     )
 
     return metrics, actual_output_lens
@@ -441,8 +468,8 @@ async def benchmark(
             out_list, metrics = test_output
             test_output = out_list[0]
 
+        print("test_output:", test_output, flush=True)
         if not test_output.success:
-            print("test_output:", test_output, flush=True)
             raise ValueError(
                 f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. Error: {test_output.error}"
             )
@@ -750,6 +777,11 @@ async def limited_request_func_per_ip(req_input, semaphore, pbar):
         "reasoning_contents": [output.reasoning_content for output in outputs],
         "errors": [output.error for output in outputs],
         "metrics": [output.metrics for output in outputs],
+        "s_decode_clean": metrics.s_decode_clean,
+        "n_itls_total": metrics.n_itls_total,
+        "n_itls_burst": metrics.n_itls_burst,
+        "n_itls_preempt": metrics.n_itls_preempt,
+        "n_itls_clean": metrics.n_itls_clean,
     }
 
     def process_one_metric(
@@ -898,6 +930,25 @@ def process_one_length(
             print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value))
             result[f"p{p_word}_{metric_attribute_name}"] = value
 
+    print("{s:{c}^{n}}".format(s="解码速度 (ITL全局聚合)", n=50, c="-"))
+    _tot = max(metrics.n_itls_total, 1)
+    print("{:<40} {:<10d}".format("Total ITLs:", metrics.n_itls_total))
+    print(
+        "{:<40} {:<10d} ({:.2f}%)".format(
+            "ITL < 1ms (burst):", metrics.n_itls_burst, 100 * metrics.n_itls_burst / _tot
+        )
+    )
+    print(
+        "{:<40} {:<10d} ({:.2f}%)".format(
+            "ITL > 500ms (preempt):", metrics.n_itls_preempt, 100 * metrics.n_itls_preempt / _tot
+        )
+    )
+    print(
+        "{:<40} {:<10d} ({:.2f}%)".format(
+            "ITL clean [1ms,500ms]:", metrics.n_itls_clean, 100 * metrics.n_itls_clean / _tot
+        )
+    )
+    print("{:<40} {:<10.2f}".format("Decode speed (clean, tok/s):", metrics.s_decode_clean))
     process_one_length("s_decode", "Decode", "解码速度(tok/s)")
     process_one_metric("ttft", "TTFT", "Time to First Token")
     process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token")