fix(agentic): fail jobs with excessive aiperf errors

cquil11 · cquil11 · commit 76a3f09b4d4f · 2026-06-03T12:40:35.000-05:00
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -899,6 +899,7 @@ run_eval() {
 INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}"
 AGENTIC_DIR="${AGENTIC_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/agentic-benchmark}"
 AIPERF_DIR="${AIPERF_DIR:-${INFMAX_CONTAINER_WORKSPACE}/utils/aiperf}"
+AIPERF_FAILED_REQUEST_THRESHOLD=0.10
 
 agentic_pip_install() {
     local pip_install=(python3 -m pip install)
@@ -1034,7 +1035,7 @@ build_replay_cmd() {
     # transient low-rate failures from killing long sweeps while still
     # catching malformed payloads or server crashes before they get aggregated
     # as benchmarkable data.
-    REPLAY_CMD+=" --failed-request-threshold 0.10"
+    REPLAY_CMD+=" --failed-request-threshold $AIPERF_FAILED_REQUEST_THRESHOLD"
     # Sample each trajectory's warmup start position uniformly from
     # [25%, 75%] of the trace's turn count (was hardcoded 0%-70% upstream).
     # Avoids starting trajectories right at turn 0 where the KV cache is
@@ -1095,8 +1096,9 @@ build_replay_cmd() {
 
 write_agentic_result_json() {
     # Aggregate aiperf's profile_export.{json,jsonl} + server_metrics_export.json
-    # into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow's existing
-    # retry-based existence check is the single success gate.
+    # into $AGENTIC_OUTPUT_DIR/$RESULT_FILENAME.json. The workflow checks that
+    # this file exists; run_agentic_replay_and_write_outputs separately rejects
+    # aggregates whose request error rate exceeds the configured limit.
     local result_dir="$1"
     RESULT_DIR="$result_dir" AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-$INFMAX_CONTAINER_WORKSPACE}" \
         python3 "$INFMAX_CONTAINER_WORKSPACE/utils/process_agentic_result.py"
@@ -1110,6 +1112,7 @@ write_agentic_result_json() {
 run_agentic_replay_and_write_outputs() {
     local result_dir="$1"
     local replay_rc
+    local validation_rc
 
     echo "$REPLAY_CMD" > "$result_dir/benchmark_command.txt"
 
@@ -1125,8 +1128,20 @@ run_agentic_replay_and_write_outputs() {
     python3 "$AGENTIC_DIR/scripts/analyze_benchmark_distributions.py" \
         "$result_dir/aiperf_artifacts" -o "$result_dir" 2>&1 || true
 
+    set +e
+    python3 "$INFMAX_CONTAINER_WORKSPACE/utils/validate_agentic_result.py" \
+        "$result_dir/aiperf_artifacts" \
+        --failed-request-threshold "$AIPERF_FAILED_REQUEST_THRESHOLD"
+    validation_rc=$?
+    set -e
+
     if [ "$replay_rc" -ne 0 ]; then
         echo "ERROR: agentic trace replay exited with code $replay_rc after writing available results" >&2
         return "$replay_rc"
     fi
+
+    if [ "$validation_rc" -ne 0 ]; then
+        echo "ERROR: agentic trace replay produced invalid results after writing available artifacts" >&2
+        return "$validation_rc"
+    fi
 }
diff --git a/utils/test_validate_agentic_result.py b/utils/test_validate_agentic_result.py
@@ -0,0 +1,73 @@
+"""Tests for the agentic aiperf result validity gate."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from validate_agentic_result import validate_result
+
+
+def _write_aggregate(tmp_path: Path, aggregate: dict, *, per_run: bool = False) -> Path:
+    """Write ``aggregate`` as aiperf's JSON export; return the artifact directory."""
+    artifact_dir = tmp_path / "aiperf_artifacts"
+    target_dir = artifact_dir / "run_0" if per_run else artifact_dir
+    target_dir.mkdir(parents=True)
+    (target_dir / "profile_export_aiperf.json").write_text(json.dumps(aggregate))
+    return artifact_dir
+
+
+def test_passes_when_request_error_rate_is_within_limit(tmp_path: Path):
+    artifact_dir = _write_aggregate(
+        tmp_path,
+        {
+            "request_count": {"avg": 90},
+            "error_request_count": {"avg": 10},
+            "completed_request_count": {"avg": 100},
+        },
+    )
+    # 10 errors out of 100 completed is exactly 10%; the limit is inclusive.
+    assert validate_result(artifact_dir, 0.10) == []
+
+
+def test_fails_when_request_error_rate_exceeds_limit(tmp_path: Path):
+    artifact_dir = _write_aggregate(
+        tmp_path,
+        {
+            "request_count": {"avg": 2},
+            "error_request_count": {"avg": 65},
+            "completed_request_count": {"avg": 67},
+        },
+    )
+    # The message reports errors/completed, the observed rate, and the limit.
+    errors = validate_result(artifact_dir, 0.10)
+    assert errors == [
+        "aiperf request error rate exceeded the benchmark limit: "
+        "65/67 = 97.015% > 10.000%"
+    ]
+
+
+def test_treats_missing_error_count_as_zero(tmp_path: Path):
+    artifact_dir = _write_aggregate(
+        tmp_path,
+        {"request_count": {"avg": 12}},
+    )
+    # A missing error_request_count counts as zero errors, so the run passes.
+    assert validate_result(artifact_dir, 0.10) == []
+
+
+def test_supports_per_run_artifact_layout(tmp_path: Path):
+    artifact_dir = _write_aggregate(
+        tmp_path,
+        {"request_count": {"avg": 12}},
+        per_run=True,
+    )
+    # per_run=True writes the aggregate under aiperf_artifacts/run_0/ instead.
+    assert validate_result(artifact_dir, 0.10) == []
+
+
+def test_fails_when_aggregate_is_missing(tmp_path: Path):
+    errors = validate_result(tmp_path / "aiperf_artifacts", 0.10)
+    # Nothing is written under tmp_path, so the aggregate cannot be found.
+    assert len(errors) == 1
+    assert errors[0].endswith("profile_export_aiperf.json not found")
diff --git a/utils/validate_agentic_result.py b/utils/validate_agentic_result.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""Validate whether an aiperf agentic replay produced benchmarkable results."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+from typing import Any
+
+
+def _resolve_aggregate_path(artifact_dir: Path) -> Path:
+    """Find aiperf's aggregate JSON directly under artifact_dir or one level down."""
+    filename = "profile_export_aiperf.json"
+    top_level = artifact_dir / filename
+    if top_level.is_file() or not artifact_dir.is_dir():
+        # Found directly, or nothing to scan: either way report the direct path.
+        return top_level
+
+    # Per-run layout: the aggregate sits in a subdirectory; take the first one,
+    # in sorted order, that actually holds it.
+    subdirs = (entry for entry in sorted(artifact_dir.iterdir()) if entry.is_dir())
+    nested = (entry / filename for entry in subdirs)
+    return next((path for path in nested if path.is_file()), top_level)
+
+
+def _metric_avg(aggregate: dict[str, Any], name: str) -> float | None:
+    """Return the named metric's ``avg`` as a float, or None when it is absent."""
+    metric = aggregate.get(name)
+    if metric is None:
+        return None
+    if not isinstance(metric, dict):
+        raise ValueError(f"{name} must be an object")
+    value = metric.get("avg")
+    if value is None:
+        return None
+    # bool is an int subclass, so it is rejected explicitly. The tuple form of
+    # isinstance (rather than ``int | float``) also works before Python 3.10.
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        raise ValueError(f"{name}.avg must be numeric")
+    value = float(value)
+    if not math.isfinite(value) or value < 0:
+        raise ValueError(f"{name}.avg must be a finite non-negative number")
+    return value
+
+
+def validate_result(artifact_dir: Path, failed_request_threshold: float) -> list[str]:
+    """Return validation errors for an aiperf artifact directory ([] when valid)."""
+    aggregate_path = _resolve_aggregate_path(artifact_dir)
+    if not aggregate_path.is_file():
+        return [f"{aggregate_path} not found"]
+    # JSON is UTF-8 text: decode it explicitly rather than via the locale default.
+    try:
+        with open(aggregate_path, encoding="utf-8") as f:
+            aggregate = json.load(f)
+        if not isinstance(aggregate, dict):
+            return [f"{aggregate_path} must contain a JSON object"]
+
+        successes = _metric_avg(aggregate, "request_count")
+        errors = _metric_avg(aggregate, "error_request_count") or 0.0
+        completed = _metric_avg(aggregate, "completed_request_count")
+    except (OSError, json.JSONDecodeError, ValueError) as exc:
+        return [f"failed to read {aggregate_path}: {exc}"]
+
+    if successes is None:
+        return ["request_count.avg is missing"]
+    if completed is None:
+        completed = successes + errors
+    if completed <= 0:
+        return ["aiperf completed zero requests"]
+    # The threshold is inclusive: only a strictly greater error rate is rejected.
+    error_rate = errors / completed
+    if error_rate > failed_request_threshold:
+        return [
+            "aiperf request error rate exceeded the benchmark limit: "
+            f"{errors:g}/{completed:g} = {error_rate:.3%} > "
+            f"{failed_request_threshold:.3%}"
+        ]
+
+    print(
+        "Validated aiperf request error rate: "
+        f"{errors:g}/{completed:g} = {error_rate:.3%} <= "
+        f"{failed_request_threshold:.3%}"
+    )
+    return []
+
+
+def main() -> int:
+    """Parse the CLI arguments, validate the artifacts, and return the exit status."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("artifact_dir", type=Path)
+    cli.add_argument(
+        "--failed-request-threshold",
+        type=float,
+        required=True,
+        help="Maximum accepted error fraction, inclusive",
+    )
+    opts = cli.parse_args()
+    threshold = opts.failed_request_threshold
+    if not 0 <= threshold <= 1:
+        cli.error("--failed-request-threshold must be between 0 and 1")
+
+    problems = validate_result(opts.artifact_dir, threshold)
+    for problem in problems:
+        print(f"ERROR: {problem}", file=sys.stderr)
+    return int(bool(problems))
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())