From c77238791cab289bfa10700a5e4dfbbbe9eeeb15 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 4 Jun 2026 14:11:10 -0700 Subject: [PATCH 1/3] chore(sweep): re-run MiniMax-M2.5 vLLM sweeps to capture power telemetry Re-runs the MiniMax-M2.5 single-node vLLM configs (H100/H200 FP8, B200/B300/MI355X FP4) with no recipe change, so the new rows carry the per-GPU power telemetry (avg_power_w) added in #1558. The power/energy canvas currently models power because its source rows predate the 2026-05-27 capture merge; this re-run lets it use measured power. Co-Authored-By: Claude Opus 4.8 --- perf-changelog.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a5f3f3478..ed492df16 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3474,3 +3474,14 @@ - "Use scheduler-recv-interval values 2/60/30/1200/600/1920 for conc 1-4/8/16/32/64/128-256" - "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544 + +- config-keys: + - minimaxm2.5-fp8-h100-vllm + - minimaxm2.5-fp8-h200-vllm + - minimaxm2.5-fp4-b200-vllm + - minimaxm2.5-fp4-b300-vllm + - minimaxm2.5-fp4-mi355x-vllm + description: + - "Re-run MiniMax-M2.5 single-node vLLM sweeps (H100/H200 FP8, B200/B300/MI355X FP4) with no recipe change, to capture per-GPU power telemetry (avg_power_w) added in #1558 for the power/energy canvas" + - "Source rows for the canvas predate the 2026-05-27 power-capture merge, so they carry throughput/latency but no measured power; this re-run replaces the modeled power layer with measured power" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX From b271a5617b4720eb1008ffc456768d416fdec9df Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 4 Jun 2026 14:11:50 -0700 Subject: [PATCH 2/3] chore(sweep): point changelog pr-link at #1666 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ed492df16..936b874c0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3484,4 +3484,4 @@ description: - "Re-run MiniMax-M2.5 single-node vLLM sweeps (H100/H200 FP8, B200/B300/MI355X FP4) with no recipe change, to capture per-GPU power telemetry (avg_power_w) added in #1558 for the power/energy canvas" - "Source rows for the canvas predate the 2026-05-27 power-capture merge, so they carry throughput/latency but no measured power; this re-run replaces the modeled power layer with measured power" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1666 From 5fa9848fc8d661177b13f8270ea1e0a2bf541a21 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 4 Jun 2026 14:33:37 -0700 Subject: [PATCH 3/3] feat(sweep): add benchmarks-only changelog option; use it for power re-run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `benchmarks-only: true` to a changelog entry to skip the eval pass (symmetric with the existing `evals-only`; the two are mutually exclusive). Power telemetry comes from the benchmark runs, so the MiniMax power re-run doesn't need evals — sets the flag, dropping 14 unnecessary eval runs. - validation.py: new `benchmarks_only` field + mutual-exclusion validator - process_changelog.py: skip eval generation when benchmarks_only is set - test_validation.py: ChangelogEntry coverage (aliases, exclusivity, forbid) Co-Authored-By: Claude Opus 4.8 --- perf-changelog.yaml | 2 ++ utils/matrix_logic/test_validation.py | 42 +++++++++++++++++++++++++++ utils/matrix_logic/validation.py | 12 ++++++++ utils/process_changelog.py | 7 +++-- 4 files changed, 61 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 936b874c0..4d1e9e7c0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3484,4 +3484,6 @@ description: - "Re-run MiniMax-M2.5 single-node vLLM sweeps (H100/H200 FP8, B200/B300/MI355X FP4) with no recipe change, to capture per-GPU power telemetry (avg_power_w) added in #1558 for the power/energy canvas" - "Source rows for the canvas predate the 2026-05-27 power-capture merge, so they carry throughput/latency but no measured power; this re-run replaces the modeled power layer with measured power" + - "benchmarks-only: power comes from the benchmark runs, evals add nothing here" + benchmarks-only: true pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1666 diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py index c385017b1..bff53d86d 100644 --- a/utils/matrix_logic/test_validation.py +++ b/utils/matrix_logic/test_validation.py @@ -1,6 +1,7 @@ """Comprehensive tests for validation.py""" import pytest from validation import ( + ChangelogEntry, Fields, SingleNodeMatrixEntry, SingleNodeAgenticMatrixEntry, @@ -932,3 +933,44 @@ def test_validation_runs_by_default(self, tmp_path): with pytest.raises(ValueError) as exc_info: load_runner_file(str(runner_file)) assert "must be a list" in str(exc_info.value) + + +class TestChangelogEntry: + """Tests for ChangelogEntry, incl. the benchmarks-only / evals-only options.""" + + def _base(self, **extra): + entry = { + "config-keys": ["minimaxm2.5-fp4-b200-vllm"], + "description": ["re-run for power capture"], + "pr-link": "https://github.com/SemiAnalysisAI/InferenceX/pull/1666", + } + entry.update(extra) + return entry + + def test_defaults(self): + """Both opt-out flags default to False.""" + entry = ChangelogEntry.model_validate(self._base()) + assert entry.evals_only is False + assert entry.benchmarks_only is False + + def test_benchmarks_only_alias(self): + """benchmarks-only YAML key maps to benchmarks_only.""" + entry = ChangelogEntry.model_validate(self._base(**{"benchmarks-only": True})) + assert entry.benchmarks_only is True + + def test_evals_only_alias(self): + entry = ChangelogEntry.model_validate(self._base(**{"evals-only": True})) + assert entry.evals_only is True + + def test_evals_and_benchmarks_only_mutually_exclusive(self): + """Setting both opt-out flags is rejected.""" + with pytest.raises(ValueError) as exc_info: + ChangelogEntry.model_validate( + self._base(**{"evals-only": True, "benchmarks-only": True}) + ) + assert "mutually exclusive" in str(exc_info.value) + + def test_unknown_field_forbidden(self): + """extra='forbid' rejects typos like a singular 'benchmark-only'.""" + with pytest.raises(ValueError): + ChangelogEntry.model_validate(self._base(**{"benchmark-only": True})) diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py index 4e3f0bbd7..c33db6421 100644 --- a/utils/matrix_logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -486,11 +486,23 @@ class ChangelogEntry(BaseModel): description: list[str] = Field(min_length=1) pr_link: str = Field(alias="pr-link") evals_only: bool = Field(alias="evals-only", default=False) + benchmarks_only: bool = Field( + alias="benchmarks-only", default=False, + description="Skip the eval pass; generate benchmarks only (e.g. power-only re-runs)." + ) scenario_type: Optional[List[str]] = Field( alias="scenario-type", default=None, description="Restrict to specific scenario types (e.g., ['fixed-seq-len', 'agentic-coding'])" ) + @model_validator(mode='after') + def check_evals_benchmarks_exclusive(self) -> "ChangelogEntry": + if self.evals_only and self.benchmarks_only: + raise ValueError( + "'evals-only' and 'benchmarks-only' are mutually exclusive; set at most one." + ) + return self + class ChangelogMetadata(BaseModel): """Pydantic model for validating changelog metadata structure.""" diff --git a/utils/process_changelog.py b/utils/process_changelog.py index 1514f8d36..d0f3bbd42 100644 --- a/utils/process_changelog.py +++ b/utils/process_changelog.py @@ -175,8 +175,11 @@ def main(): raise all_benchmark_results.extend(json.loads(result.stdout)) - # Generate eval entries separately - eval_configs = [c for c in all_configs if c not in eval_configs_seen] + # Generate eval entries separately (skipped when the entry opts out via + # benchmarks-only, e.g. power-only re-runs that don't need eval scoring). + eval_configs = [] if entry.benchmarks_only else [ + c for c in all_configs if c not in eval_configs_seen + ] if eval_configs: eval_configs_seen.update(eval_configs) base_cmd = [