Skip to content

Commit cd71d63

Browse files
committed
Harden mp-300k gate handoff
1 parent af546b8 commit cd71d63

5 files changed

Lines changed: 267 additions & 13 deletions

File tree

.github/workflows/mp300k-artifact-gates.yml

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -225,7 +225,13 @@ jobs:
225225
handle.extractall(extract_root, filter="data")
226226
elif zipfile.is_zipfile(archive):
227227
with zipfile.ZipFile(archive) as handle:
228-
handle.extractall(extract_root)
228+
for member in handle.infolist():
229+
destination = (extract_root / member.filename).resolve()
230+
if not destination.is_relative_to(extract_root.resolve()):
231+
raise SystemExit(
232+
f"zip archive member escapes artifact root: {member.filename}"
233+
)
234+
handle.extract(member, extract_root)
229235
else:
230236
raise SystemExit("artifact_archive_url must point to a tar or zip archive")
231237

src/microplex_us/pipelines/mp300k_artifact_gates.py

Lines changed: 39 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,7 @@ def build_mp300k_artifact_gate_report(
6363
baseline_dataset = (
6464
Path(baseline_dataset_path).expanduser()
6565
if baseline_dataset_path is not None
66-
else _manifest_baseline_dataset(manifest)
66+
else _manifest_baseline_dataset(artifact_root, manifest)
6767
)
6868

6969
candidate_gate = _candidate_artifact_gate(
@@ -191,12 +191,17 @@ def _resolve_candidate_dataset_path(
191191
return dataset_path
192192

193193

194-
def _manifest_baseline_dataset(manifest: dict[str, Any]) -> Path | None:
194+
def _manifest_baseline_dataset(
195+
artifact_root: Path, manifest: dict[str, Any]
196+
) -> Path | None:
195197
config = dict(manifest.get("config", {}))
196198
value = config.get("policyengine_baseline_dataset")
197199
if value is None:
198200
return None
199-
return Path(value).expanduser()
201+
baseline_path = Path(value).expanduser()
202+
if not baseline_path.is_absolute():
203+
baseline_path = artifact_root / baseline_path
204+
return baseline_path
200205

201206

202207
def _candidate_artifact_gate(
@@ -368,9 +373,27 @@ def _ecps_comparison_gate(
368373
candidate_loss = summary.get("candidate_enhanced_cps_native_loss")
369374
baseline_loss = summary.get("baseline_enhanced_cps_native_loss")
370375
loss_delta = summary.get("enhanced_cps_native_loss_delta")
371-
candidate_beats = summary.get("candidate_beats_baseline")
372-
if candidate_beats is None and loss_delta is not None:
376+
reported_candidate_beats = summary.get("candidate_beats_baseline")
377+
details: dict[str, Any] = {}
378+
if candidate_loss is not None and baseline_loss is not None:
379+
computed_loss_delta = float(candidate_loss) - float(baseline_loss)
380+
if (
381+
loss_delta is not None
382+
and abs(float(loss_delta) - computed_loss_delta) > 1e-12
383+
):
384+
details["reported_loss_delta"] = loss_delta
385+
details["computed_loss_delta"] = computed_loss_delta
386+
loss_delta = computed_loss_delta
387+
candidate_beats = None
388+
if loss_delta is not None:
373389
candidate_beats = float(loss_delta) < 0.0
390+
if (
391+
reported_candidate_beats is not None
392+
and candidate_beats is not None
393+
and bool(reported_candidate_beats) != candidate_beats
394+
):
395+
details["reported_candidate_beats_baseline"] = reported_candidate_beats
396+
details["computed_candidate_beats_baseline"] = candidate_beats
374397
status: GateStatus
375398
if candidate_beats is None:
376399
status = "unmeasured"
@@ -393,6 +416,7 @@ def _ecps_comparison_gate(
393416
"enhanced_cps_native_loss_delta": loss_delta,
394417
"n_targets_kept": summary.get("n_targets_kept"),
395418
},
419+
details=details,
396420
)
397421

398422

@@ -447,24 +471,27 @@ def _runtime_gate(
447471
if ratio is None and candidate_seconds is not None and baseline_seconds:
448472
ratio = float(candidate_seconds) / float(baseline_seconds)
449473
passes = payload.get("passes_runtime_gate")
450-
if passes is None and ratio is not None:
451-
passes = float(ratio) <= threshold
452-
if passes is None:
474+
details: dict[str, Any] = {}
475+
if ratio is None:
453476
return _gate(
454477
"unmeasured",
455-
"runtime smoke payload is missing ratio or pass/fail result",
478+
"runtime smoke payload is missing ratio or candidate/baseline seconds",
456479
metrics={
457480
"candidate_seconds": candidate_seconds,
458481
"baseline_seconds": baseline_seconds,
459482
"runtime_ratio": ratio,
460483
"runtime_ratio_threshold": threshold,
461484
},
462485
)
486+
derived_passes = float(ratio) <= threshold
487+
if passes is not None and bool(passes) != derived_passes:
488+
details["reported_passes_runtime_gate"] = passes
489+
details["computed_passes_runtime_gate"] = derived_passes
463490
return _gate(
464-
"pass" if bool(passes) else "fail",
491+
"pass" if derived_passes else "fail",
465492
(
466493
"candidate runtime is inside the smoke benchmark threshold"
467-
if bool(passes)
494+
if derived_passes
468495
else "candidate runtime exceeds the smoke benchmark threshold"
469496
),
470497
metrics={
@@ -473,6 +500,7 @@ def _runtime_gate(
473500
"runtime_ratio": ratio,
474501
"runtime_ratio_threshold": threshold,
475502
},
503+
details=details,
476504
)
477505

478506

src/microplex_us/pipelines/mp300k_gate_inputs.py

Lines changed: 70 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ def package_mp300k_gate_inputs(
1717
output_dir: str | Path,
1818
*,
1919
candidate_dataset_path: str | Path | None = None,
20+
baseline_dataset_path: str | Path | None = None,
2021
ecps_comparison_path: str | Path | None = None,
2122
runtime_smoke_path: str | Path | None = None,
2223
benchmark_manifest_path: str | Path | None = None,
@@ -39,6 +40,11 @@ def package_mp300k_gate_inputs(
3940
)
4041
if not candidate_dataset.exists():
4142
raise FileNotFoundError(f"candidate dataset not found: {candidate_dataset}")
43+
baseline_dataset = _resolve_baseline_dataset_path(
44+
artifact_root,
45+
manifest,
46+
baseline_dataset_path,
47+
)
4248

4349
output_root.mkdir(parents=True, exist_ok=True)
4450
archive_path = output_root / archive_name
@@ -56,12 +62,26 @@ def package_mp300k_gate_inputs(
5662
staged_candidate = stage_root / candidate_relpath
5763
staged_candidate.parent.mkdir(parents=True, exist_ok=True)
5864
shutil.copy2(candidate_dataset, staged_candidate)
65+
baseline_relpath = None
66+
if baseline_dataset is not None:
67+
if not baseline_dataset.exists():
68+
raise FileNotFoundError(f"baseline dataset not found: {baseline_dataset}")
69+
baseline_relpath = _baseline_archive_relpath(
70+
manifest,
71+
baseline_dataset=baseline_dataset,
72+
explicit_baseline_path=baseline_dataset_path,
73+
)
74+
staged_baseline = stage_root / baseline_relpath
75+
staged_baseline.parent.mkdir(parents=True, exist_ok=True)
76+
shutil.copy2(baseline_dataset, staged_baseline)
5977

6078
staged_manifest = _manifest_for_archive(
6179
manifest,
6280
source_artifact_dir=artifact_root,
6381
source_candidate_dataset=candidate_dataset,
6482
candidate_relpath=candidate_relpath,
83+
source_baseline_dataset=baseline_dataset,
84+
baseline_relpath=baseline_relpath,
6585
)
6686
_write_json(stage_root / "manifest.json", staged_manifest)
6787
_write_archive(archive_path, stage_root)
@@ -86,6 +106,9 @@ def package_mp300k_gate_inputs(
86106
"source_artifact_dir": str(artifact_root.resolve()),
87107
"source_manifest": _file_descriptor(manifest_path),
88108
"source_candidate_dataset": _file_descriptor(candidate_dataset),
109+
"source_baseline_dataset": (
110+
_file_descriptor(baseline_dataset) if baseline_dataset is not None else None
111+
),
89112
"artifact_archive": _file_descriptor(archive_path),
90113
"evidence": evidence,
91114
"workflow_call": {
@@ -124,6 +147,24 @@ def _resolve_candidate_dataset_path(
124147
return dataset_path
125148

126149

150+
def _resolve_baseline_dataset_path(
151+
artifact_root: Path,
152+
manifest: dict[str, Any],
153+
explicit_path: str | Path | None,
154+
) -> Path | None:
155+
if explicit_path is not None:
156+
return Path(explicit_path).expanduser()
157+
value = dict(manifest.get("config", {})).get("policyengine_baseline_dataset")
158+
if value is None:
159+
return None
160+
if not isinstance(value, str) or not value:
161+
raise ValueError("config.policyengine_baseline_dataset must be a path string")
162+
baseline_path = Path(value).expanduser()
163+
if not baseline_path.is_absolute():
164+
baseline_path = artifact_root / baseline_path
165+
return baseline_path
166+
167+
127168
def _candidate_archive_relpath(
128169
manifest: dict[str, Any],
129170
*,
@@ -140,21 +181,48 @@ def _candidate_archive_relpath(
140181
return Path(candidate_dataset.name)
141182

142183

184+
def _baseline_archive_relpath(
185+
manifest: dict[str, Any],
186+
*,
187+
baseline_dataset: Path,
188+
explicit_baseline_path: str | Path | None,
189+
) -> Path:
190+
if explicit_baseline_path is not None:
191+
return Path("baseline") / baseline_dataset.name
192+
value = dict(manifest.get("config", {})).get("policyengine_baseline_dataset")
193+
if isinstance(value, str) and value:
194+
relpath = Path(value)
195+
if not relpath.is_absolute():
196+
return relpath
197+
return Path("baseline") / baseline_dataset.name
198+
199+
143200
def _manifest_for_archive(
144201
manifest: dict[str, Any],
145202
*,
146203
source_artifact_dir: Path,
147204
source_candidate_dataset: Path,
148205
candidate_relpath: Path,
206+
source_baseline_dataset: Path | None,
207+
baseline_relpath: Path | None,
149208
) -> dict[str, Any]:
150209
updated = dict(manifest)
151210
artifacts = dict(updated.get("artifacts", {}))
152211
artifacts["policyengine_dataset"] = str(candidate_relpath)
153212
updated["artifacts"] = artifacts
213+
config = dict(updated.get("config", {}))
214+
if baseline_relpath is not None:
215+
config["policyengine_baseline_dataset"] = str(baseline_relpath)
216+
updated["config"] = config
154217
updated["mp300k_gate_inputs"] = {
155218
"packaged_at": datetime.now(UTC).isoformat(),
156219
"source_artifact_dir": str(source_artifact_dir.resolve()),
157220
"source_candidate_dataset": str(source_candidate_dataset.resolve()),
221+
"source_baseline_dataset": (
222+
str(source_baseline_dataset.resolve())
223+
if source_baseline_dataset is not None
224+
else None
225+
),
158226
}
159227
return updated
160228

@@ -205,6 +273,7 @@ def main(argv: list[str] | None = None) -> int:
205273
parser.add_argument("--artifact-dir", required=True)
206274
parser.add_argument("--output-dir", required=True)
207275
parser.add_argument("--candidate-dataset")
276+
parser.add_argument("--baseline-dataset")
208277
parser.add_argument("--ecps-comparison-json")
209278
parser.add_argument("--runtime-smoke-json")
210279
parser.add_argument("--benchmark-manifest")
@@ -215,6 +284,7 @@ def main(argv: list[str] | None = None) -> int:
215284
args.artifact_dir,
216285
args.output_dir,
217286
candidate_dataset_path=args.candidate_dataset,
287+
baseline_dataset_path=args.baseline_dataset,
218288
ecps_comparison_path=args.ecps_comparison_json,
219289
runtime_smoke_path=args.runtime_smoke_json,
220290
benchmark_manifest_path=args.benchmark_manifest,

tests/pipelines/test_mp300k_artifact_gates.py

Lines changed: 72 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77

88
import h5py
99
import numpy as np
10+
import pytest
1011

1112
from microplex_us.pipelines.mp300k_artifact_gates import (
1213
main,
@@ -258,3 +259,74 @@ def test_ecps_comparison_accepts_existing_broad_loss_array_payload(tmp_path):
258259
]
259260
== 0.25
260261
)
262+
263+
264+
def test_runtime_gate_ignores_contradictory_producer_verdict(tmp_path):
    """The runtime gate fails on ratio > threshold even if the payload says pass.

    The payload reports a ratio of 10.0 against a 1.25 threshold while also
    claiming ``passes_runtime_gate`` is true; the report must fail the gate
    and record both the reported and the computed verdicts.
    """
    artifact_root = tmp_path / "artifact"
    artifact_root.mkdir()
    _write_minimal_policyengine_dataset(artifact_root / "candidate.h5")
    baseline_h5 = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5")
    manifest_file = tmp_path / "benchmark_manifest.json"
    manifest_file.write_text(json.dumps({"schema_version": 1}))
    _write_artifact_manifest(artifact_root, baseline_dataset=baseline_h5)

    ecps_payload = {
        "summary": {
            "candidate_enhanced_cps_native_loss": 0.1,
            "baseline_enhanced_cps_native_loss": 0.2,
        }
    }
    # Producer verdict deliberately contradicts the ratio/threshold pair.
    runtime_payload = {
        "runtime_ratio": 10.0,
        "runtime_ratio_threshold": 1.25,
        "passes_runtime_gate": True,
    }

    written = write_mp300k_artifact_gate_report(
        artifact_root,
        ecps_comparison_payload=ecps_payload,
        runtime_smoke_payload=runtime_payload,
        benchmark_manifest_path=manifest_file,
        compute_native_scores=False,
        update_manifest=False,
    )

    report = json.loads(written.read_text())
    gate = report["gates"]["runtime"]
    gate_details = gate["details"]

    assert report["summary"]["status"] == "failed"
    assert gate["status"] == "fail"
    assert gate_details["reported_passes_runtime_gate"] is True
    assert gate_details["computed_passes_runtime_gate"] is False
298+
299+
300+
def test_ecps_gate_derives_verdict_from_losses_not_producer_flag(tmp_path):
    """The eCPS gate recomputes the delta from the two losses.

    The summary reports losses of 0.3 (candidate) and 0.2 (baseline) next to
    a contradictory delta of -0.1 and a true ``candidate_beats_baseline``
    flag; the report must fail the gate, expose the recomputed delta, and
    record both the reported and computed verdicts.
    """
    artifact_root = tmp_path / "artifact"
    artifact_root.mkdir()
    _write_minimal_policyengine_dataset(artifact_root / "candidate.h5")
    baseline_h5 = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5")
    manifest_file = tmp_path / "benchmark_manifest.json"
    manifest_file.write_text(json.dumps({"schema_version": 1}))
    _write_artifact_manifest(artifact_root, baseline_dataset=baseline_h5)

    # Reported delta and flag disagree with the two loss values.
    inconsistent_summary = {
        "candidate_enhanced_cps_native_loss": 0.3,
        "baseline_enhanced_cps_native_loss": 0.2,
        "enhanced_cps_native_loss_delta": -0.1,
        "candidate_beats_baseline": True,
    }

    written = write_mp300k_artifact_gate_report(
        artifact_root,
        ecps_comparison_payload={"summary": inconsistent_summary},
        runtime_smoke_payload={"runtime_ratio": 1.0},
        benchmark_manifest_path=manifest_file,
        compute_native_scores=False,
        update_manifest=False,
    )

    report = json.loads(written.read_text())
    gate = report["gates"]["ecps_comparison"]
    gate_details = gate["details"]

    assert report["summary"]["status"] == "failed"
    assert gate["status"] == "fail"
    assert gate["metrics"]["enhanced_cps_native_loss_delta"] == pytest.approx(0.1)
    assert gate_details["reported_candidate_beats_baseline"] is True
    assert gate_details["computed_candidate_beats_baseline"] is False

0 commit comments

Comments
 (0)