Skip to content

Commit 5fa9848

Browse files
aryguptclaude
andcommitted
feat(sweep): add benchmarks-only changelog option; use it for power re-run
Adds `benchmarks-only: true` to a changelog entry to skip the eval pass (symmetric with the existing `evals-only`; the two are mutually exclusive). Power telemetry comes from the benchmark runs, so the MiniMax power re-run doesn't need evals — sets the flag, dropping 14 unnecessary eval runs. - validation.py: new `benchmarks_only` field + mutual-exclusion validator - process_changelog.py: skip eval generation when benchmarks_only is set - test_validation.py: ChangelogEntry coverage (aliases, exclusivity, forbid) Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent b271a56 commit 5fa9848

4 files changed

Lines changed: 61 additions & 2 deletions

File tree

perf-changelog.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3484,4 +3484,6 @@
34843484
description:
34853485
- "Re-run MiniMax-M2.5 single-node vLLM sweeps (H100/H200 FP8, B200/B300/MI355X FP4) with no recipe change, to capture per-GPU power telemetry (avg_power_w) added in #1558 for the power/energy canvas"
34863486
- "Source rows for the canvas predate the 2026-05-27 power-capture merge, so they carry throughput/latency but no measured power; this re-run replaces the modeled power layer with measured power"
3487+
- "benchmarks-only: power comes from the benchmark runs, evals add nothing here"
3488+
benchmarks-only: true
34873489
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1666

utils/matrix_logic/test_validation.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Comprehensive tests for validation.py"""
22
import pytest
33
from validation import (
4+
ChangelogEntry,
45
Fields,
56
SingleNodeMatrixEntry,
67
SingleNodeAgenticMatrixEntry,
@@ -932,3 +933,44 @@ def test_validation_runs_by_default(self, tmp_path):
932933
with pytest.raises(ValueError) as exc_info:
933934
load_runner_file(str(runner_file))
934935
assert "must be a list" in str(exc_info.value)
936+
937+
938+
class TestChangelogEntry:
    """Tests for ChangelogEntry, incl. the benchmarks-only / evals-only options."""

    def _base(self, **extra):
        """Return a minimal changelog entry dict, overlaid with ``extra``.

        Keys are the hyphenated aliases (e.g. ``"pr-link"``), so ``extra``
        must be passed via ``**{...}`` when a key contains a hyphen.
        """
        entry = {
            "config-keys": ["minimaxm2.5-fp4-b200-vllm"],
            "description": ["re-run for power capture"],
            "pr-link": "https://github.com/SemiAnalysisAI/InferenceX/pull/1666",
        }
        entry.update(extra)
        return entry

    def test_defaults(self):
        """Both opt-out flags default to False."""
        entry = ChangelogEntry.model_validate(self._base())
        assert entry.evals_only is False
        assert entry.benchmarks_only is False

    def test_benchmarks_only_alias(self):
        """benchmarks-only YAML key maps to benchmarks_only."""
        entry = ChangelogEntry.model_validate(self._base(**{"benchmarks-only": True}))
        assert entry.benchmarks_only is True

    def test_evals_only_alias(self):
        """evals-only YAML key maps to evals_only."""
        entry = ChangelogEntry.model_validate(self._base(**{"evals-only": True}))
        assert entry.evals_only is True

    def test_evals_and_benchmarks_only_mutually_exclusive(self):
        """Setting both opt-out flags is rejected."""
        with pytest.raises(ValueError) as exc_info:
            ChangelogEntry.model_validate(
                self._base(**{"evals-only": True, "benchmarks-only": True})
            )
        assert "mutually exclusive" in str(exc_info.value)

    def test_unknown_field_forbidden(self):
        """extra='forbid' rejects typos like a singular 'benchmark-only'."""
        with pytest.raises(ValueError) as exc_info:
            ChangelogEntry.model_validate(self._base(**{"benchmark-only": True}))
        # Pin the failure to the unknown key: a bare ValueError check would
        # also pass if the base entry failed validation for another reason.
        message = str(exc_info.value)
        assert "benchmark-only" in message
        assert "extra_forbidden" in message

utils/matrix_logic/validation.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,11 +486,23 @@ class ChangelogEntry(BaseModel):
486486
description: list[str] = Field(min_length=1)
487487
pr_link: str = Field(alias="pr-link")
488488
evals_only: bool = Field(alias="evals-only", default=False)
489+
benchmarks_only: bool = Field(
490+
alias="benchmarks-only", default=False,
491+
description="Skip the eval pass; generate benchmarks only (e.g. power-only re-runs)."
492+
)
489493
scenario_type: Optional[List[str]] = Field(
490494
alias="scenario-type", default=None,
491495
description="Restrict to specific scenario types (e.g., ['fixed-seq-len', 'agentic-coding'])"
492496
)
493497

498+
@model_validator(mode='after')
def check_evals_benchmarks_exclusive(self) -> "ChangelogEntry":
    """Reject entries that set both 'evals-only' and 'benchmarks-only'.

    Returns:
        The entry itself, unchanged, when at most one flag is set.

    Raises:
        ValueError: If both ``evals_only`` and ``benchmarks_only`` are true.
    """
    both_flags_set = self.evals_only and self.benchmarks_only
    if not both_flags_set:
        return self
    raise ValueError(
        "'evals-only' and 'benchmarks-only' are mutually exclusive; set at most one."
    )
505+
494506

495507
class ChangelogMetadata(BaseModel):
496508
"""Pydantic model for validating changelog metadata structure."""

utils/process_changelog.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,11 @@ def main():
175175
raise
176176
all_benchmark_results.extend(json.loads(result.stdout))
177177

178-
# Generate eval entries separately
179-
eval_configs = [c for c in all_configs if c not in eval_configs_seen]
178+
# Generate eval entries separately (skipped when the entry opts out via
179+
# benchmarks-only, e.g. power-only re-runs that don't need eval scoring).
180+
eval_configs = [] if entry.benchmarks_only else [
181+
c for c in all_configs if c not in eval_configs_seen
182+
]
180183
if eval_configs:
181184
eval_configs_seen.update(eval_configs)
182185
base_cmd = [

0 commit comments

Comments
 (0)