Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 52 additions & 1 deletion benchmarks/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ class BenchmarkMetrics:
request_goodput: float
output_throughput: float
total_token_throughput: float
# 全局聚合解码速度(过滤 burst 与 preemption 后)
s_decode_clean: float # tok/s
n_itls_total: int # 全部 itl 样本数
n_itls_burst: int # itl < 1ms 的数量
n_itls_preempt: int # itl > 500ms 的数量
n_itls_clean: int # 1ms <= itl <= 500ms 的数量
mean_s_decode: float
median_s_decode: float
std_s_decode: float
Expand Down Expand Up @@ -289,6 +295,22 @@ def calculate_metrics(
"All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.",
stacklevel=2,
)

# === Cleaned ITL aggregation ===
BURST_THRESHOLD_S = 0.001 # 1 ms
PREEMPT_THRESHOLD_S = 0.5 # 500 ms
all_itls_flat: list[float] = []
for o in outputs:
if o.success:

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Bug:新增的 clean ITL 聚合只按 `o.success` 收集样本,会把主统计已经跳过的 `output_tokens == 0` 响应重新计入。

`calculate_metrics()` 前面的主循环在 `output_len = outputs[i].output_tokens` 之后会对空输出直接 `continue`,因此这些请求不会进入 `completed`、`total_output`、普通 `itl`/`s_decode` 等指标。这里重新遍历 `outputs` 并只判断 `o.success`,会让 `s_decode_clean`、`n_itls_*` 与同一份结果中的主指标使用不同的样本集;对某些流式后端,成功返回文本但没有 `usage`/`completion_tokens` 时,已有代码正是走这个跳过路径。

建议修复方式:复用主循环已经确认有效的 `itls` 样本,例如直接基于 `itls` 构造 `_arr`,或把 clean 聚合移动到 `if not output_len: continue` 之后,保证与 `completed`/`total_output`/普通 ITL 指标口径一致。

all_itls_flat.extend(o.itl)
_arr = np.asarray(all_itls_flat, dtype=float) if all_itls_flat else np.empty(0)
n_itls_total = int(_arr.size)
n_itls_burst = int((_arr < BURST_THRESHOLD_S).sum())
n_itls_preempt = int((_arr > PREEMPT_THRESHOLD_S).sum())
_clean = _arr[(_arr >= BURST_THRESHOLD_S) & (_arr <= PREEMPT_THRESHOLD_S)]
n_itls_clean = int(_clean.size)
s_decode_clean = float(_clean.size / _clean.sum()) if _clean.sum() > 0 else 0.0

metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
Expand Down Expand Up @@ -349,6 +371,11 @@ def calculate_metrics(
std_res_ttft_ms=np.std(res_ttfts or 0) * 1000,
median_res_ttft_ms=np.median(res_ttfts or 0) * 1000,
percentiles_res_ttft_ms=[(p, np.percentile(res_ttfts or 0, p) * 1000) for p in selected_percentiles],
s_decode_clean=s_decode_clean,
n_itls_total=n_itls_total,
n_itls_burst=n_itls_burst,
n_itls_preempt=n_itls_preempt,
n_itls_clean=n_itls_clean,
)

return metrics, actual_output_lens
Expand Down Expand Up @@ -441,8 +468,8 @@ async def benchmark(
out_list, metrics = test_output
test_output = out_list[0]

print("test_output:", test_output, flush=True)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 安全:这里把 warmup 成功响应也无条件打印到日志,会泄露完整的 `RequestFuncOutput`。

`RequestFuncOutput` 的 repr 包含 `generated_text`、`reasoning_content`、`output_ids`、`tool_calls` 等字段。此前只在 warmup 失败时打印,主要用于排错;现在成功请求也会进入 stdout,在 benchmark 日志被采集或共享时会暴露生成内容和 token ids。

建议修复方式:恢复为仅在失败时打印完整的 `test_output`;如成功路径确实需要调试信息,请放到 `if debug:` 分支下,或只打印 request id、latency、`output_tokens` 等非内容字段。

if not test_output.success:
print("test_output:", test_output, flush=True)
raise ValueError(
f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. Error: {test_output.error}"
)
Expand Down Expand Up @@ -750,6 +777,11 @@ async def limited_request_func_per_ip(req_input, semaphore, pbar):
"reasoning_contents": [output.reasoning_content for output in outputs],
"errors": [output.error for output in outputs],
"metrics": [output.metrics for output in outputs],
"s_decode_clean": metrics.s_decode_clean,
"n_itls_total": metrics.n_itls_total,
"n_itls_burst": metrics.n_itls_burst,
"n_itls_preempt": metrics.n_itls_preempt,
"n_itls_clean": metrics.n_itls_clean,
}

def process_one_metric(
Expand Down Expand Up @@ -898,6 +930,25 @@ def process_one_length(
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value))
result[f"p{p_word}_{metric_attribute_name}"] = value

print("{s:{c}^{n}}".format(s="解码速度 (ITL全局聚合)", n=50, c="-"))
_tot = max(metrics.n_itls_total, 1)
print("{:<40} {:<10d}".format("Total ITLs:", metrics.n_itls_total))
print(
"{:<40} {:<10d} ({:.2f}%)".format(
"ITL < 1ms (burst):", metrics.n_itls_burst, 100 * metrics.n_itls_burst / _tot
)
)
print(
"{:<40} {:<10d} ({:.2f}%)".format(
"ITL > 500ms (preempt):", metrics.n_itls_preempt, 100 * metrics.n_itls_preempt / _tot
)
)
print(
"{:<40} {:<10d} ({:.2f}%)".format(
"ITL clean [1ms,500ms]:", metrics.n_itls_clean, 100 * metrics.n_itls_clean / _tot
)
)
print("{:<40} {:<10.2f}".format("Decode speed (clean, tok/s):", metrics.s_decode_clean))
process_one_length("s_decode", "Decode", "解码速度(tok/s)")
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
Expand Down
Loading