Skip to content

Commit 9398ea5

Browse files
committed
Track validation report identity in release candidates
1 parent f8b27ce commit 9398ea5

3 files changed

Lines changed: 213 additions & 5 deletions

File tree

docs/engineering/stages/release_promotion.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,12 @@ Production Stage 5 code should not depend on Stage 4 contracts until the
4141
contract and inventory are canonical, complete, and populated with semantic
4242
artifact identity plus checksum/size material.
4343

44+
Candidate bundles may record validation reports as path-only
45+
`validation_report_paths` for compatibility. When Stage 4 or another upstream
46+
producer can provide report checksums, prefer `validation_report_refs` with
47+
canonical `DiagnosticRef` / `ArtifactRef` identity so rerun comparison can
48+
distinguish an overwritten report at the same diagnostics path.
49+
4450
## Validation Reports
4551

4652
Stage 5 must use the shared validation schema for durable validation output:
@@ -64,7 +70,8 @@ comparison material should include:
6470
- run ID, candidate version, release version, HF repository, and GCS bucket;
6571
- Stage 4 output contract fingerprint when available;
6672
- output inventory paths/checksums when available;
67-
- validation report paths and their identities when available;
73+
- validation report paths and `DiagnosticRef` checksum identities when
74+
available;
6875
- expected production-relative artifact paths;
6976
- the Stage 5 candidate bundle fingerprint.
7077

policyengine_us_data/release_promotion/candidate.py

Lines changed: 116 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from policyengine_us_data.pipeline_metadata import pipeline_node
1313
from policyengine_us_data.stage_contracts import (
1414
ArtifactRef,
15+
DiagnosticRef,
1516
StageContract,
1617
read_contract,
1718
)
@@ -80,6 +81,7 @@ class ReleaseCandidateInputBundle:
8081
source_output_contract_path: str | None = None
8182
release_candidate_fingerprint: str | None = None
8283
validation_report_paths: tuple[str, ...] = ()
84+
validation_report_refs: tuple[DiagnosticRef, ...] = ()
8385
diagnostics_manifest_path: str | None = None
8486
metadata: Mapping[str, Any] = field(default_factory=dict)
8587
bundle_type: str = RELEASE_CANDIDATE_BUNDLE_TYPE
@@ -131,6 +133,18 @@ def __post_init__(self) -> None:
131133
for path in self.validation_report_paths
132134
),
133135
)
136+
validation_report_refs = freeze_sequence(
137+
self.validation_report_refs,
138+
"validation_report_refs",
139+
DiagnosticRef,
140+
)
141+
for ref in validation_report_refs:
142+
_validation_report_ref_path(ref, self.context)
143+
object.__setattr__(
144+
self,
145+
"validation_report_refs",
146+
validation_report_refs,
147+
)
134148
object.__setattr__(
135149
self,
136150
"diagnostics_manifest_path",
@@ -161,6 +175,9 @@ def to_dict(self) -> dict[str, Any]:
161175
"release_candidate_fingerprint": self.release_candidate_fingerprint,
162176
"artifacts": [artifact.to_dict() for artifact in self.artifacts],
163177
"validation_report_paths": list(self.validation_report_paths),
178+
"validation_report_refs": [
179+
ref.to_dict() for ref in self.validation_report_refs
180+
],
164181
"diagnostics_manifest_path": self.diagnostics_manifest_path,
165182
"metadata": jsonable_value(self.metadata),
166183
}
@@ -187,6 +204,10 @@ def from_dict(cls, data: Mapping[str, Any]) -> "ReleaseCandidateInputBundle":
187204
required_string({"path": item}, "path")
188205
for item in data.get("validation_report_paths", ())
189206
),
207+
validation_report_refs=tuple(
208+
DiagnosticRef.from_dict(item)
209+
for item in data.get("validation_report_refs", ())
210+
),
190211
diagnostics_manifest_path=optional_string(
191212
data,
192213
"diagnostics_manifest_path",
@@ -216,6 +237,7 @@ def build_legacy_release_candidate_bundle(
216237
rel_paths: Sequence[str],
217238
artifact_metadata_by_path: Mapping[str, Mapping[str, Any]] | None = None,
218239
validation_report_paths: Sequence[str] = (),
240+
validation_report_refs: Sequence[DiagnosticRef] = (),
219241
source_output_contract_path: str | None = None,
220242
diagnostics_manifest_path: str | None = None,
221243
) -> ReleaseCandidateInputBundle:
@@ -240,6 +262,7 @@ def build_legacy_release_candidate_bundle(
240262
artifacts=artifacts,
241263
source_output_contract_path=source_output_contract_path,
242264
validation_report_paths=validation_report_paths,
265+
validation_report_refs=validation_report_refs,
243266
diagnostics_manifest_path=diagnostics_manifest_path,
244267
reader="legacy_staged_paths",
245268
)
@@ -264,6 +287,7 @@ def build_release_candidate_bundle_from_stage4_contract(
264287
inventory_records: Iterable[Mapping[str, Any]] = (),
265288
source_output_contract_path: str | None = None,
266289
validation_report_paths: Sequence[str] = (),
290+
validation_report_refs: Sequence[DiagnosticRef] = (),
267291
diagnostics_manifest_path: str | None = None,
268292
) -> ReleaseCandidateInputBundle:
269293
"""Build a candidate bundle from a Stage 4 output contract shape."""
@@ -317,6 +341,7 @@ def build_release_candidate_bundle_from_stage4_contract(
317341
artifacts=tuple(sorted(artifacts, key=lambda item: item.relative_path)),
318342
source_output_contract_path=source_output_contract_path,
319343
validation_report_paths=validation_report_paths,
344+
validation_report_refs=validation_report_refs,
320345
diagnostics_manifest_path=derived_diagnostics_manifest_path,
321346
reader="stage4_contract",
322347
extra_fingerprint_material=extra_fingerprint_material,
@@ -342,6 +367,7 @@ def read_stage4_release_candidate_bundle(
342367
output_inventory_path: str | Path | None = None,
343368
source_output_contract_path: str | None = None,
344369
validation_report_paths: Sequence[str] = (),
370+
validation_report_refs: Sequence[DiagnosticRef] = (),
345371
diagnostics_manifest_path: str | None = None,
346372
) -> ReleaseCandidateInputBundle:
347373
"""Read a candidate bundle from Stage 4 contract and optional inventory files."""
@@ -356,6 +382,7 @@ def read_stage4_release_candidate_bundle(
356382
inventory_records=inventory_records,
357383
source_output_contract_path=source_output_contract_path,
358384
validation_report_paths=validation_report_paths,
385+
validation_report_refs=validation_report_refs,
359386
diagnostics_manifest_path=diagnostics_manifest_path,
360387
)
361388

@@ -366,6 +393,7 @@ def _candidate_bundle_with_fingerprint(
366393
artifacts: tuple[ReleaseArtifactSpec, ...],
367394
source_output_contract_path: str | None,
368395
validation_report_paths: Sequence[str],
396+
validation_report_refs: Sequence[DiagnosticRef],
369397
diagnostics_manifest_path: str | None,
370398
reader: str,
371399
extra_fingerprint_material: Mapping[str, Any] | None = None,
@@ -380,14 +408,35 @@ def _candidate_bundle_with_fingerprint(
380408
_normalize_run_diagnostic_path(path, context)
381409
for path in validation_report_paths
382410
)
411+
normalized_validation_report_refs = freeze_sequence(
412+
validation_report_refs,
413+
"validation_report_refs",
414+
DiagnosticRef,
415+
)
416+
normalized_validation_report_ref_paths = tuple(
417+
_validation_report_ref_path(ref, context)
418+
for ref in normalized_validation_report_refs
419+
)
383420
normalized_diagnostics_manifest_path = (
384421
_normalize_run_diagnostic_path(diagnostics_manifest_path, context)
385422
if diagnostics_manifest_path is not None
386423
else None
387424
)
388-
fingerprint_status, missing_identity_paths = _fingerprint_identity_status(
389-
sorted_artifacts
425+
fingerprint_status, missing_artifact_identity_paths = (
426+
_release_artifact_identity_status(sorted_artifacts)
390427
)
428+
missing_validation_report_identity_paths = (
429+
_validation_report_identity_missing_paths(
430+
normalized_validation_report_refs,
431+
context=context,
432+
)
433+
)
434+
if missing_validation_report_identity_paths:
435+
fingerprint_status = (
436+
"path_only_missing_identity"
437+
if missing_artifact_identity_paths
438+
else "path_only_missing_validation_report_identity"
439+
)
391440
fingerprint = None
392441
if fingerprint_status == "complete":
393442
fingerprint = fingerprint_material(
@@ -400,6 +449,17 @@ def _candidate_bundle_with_fingerprint(
400449
],
401450
"source_output_contract_path": normalized_source_output_contract_path,
402451
"validation_report_paths": sorted(normalized_validation_report_paths),
452+
"validation_report_refs": sorted(
453+
(
454+
_validation_report_ref_fingerprint_material(ref, context)
455+
for ref in normalized_validation_report_refs
456+
),
457+
key=lambda item: (
458+
item["path"],
459+
item["name"],
460+
item["kind"],
461+
),
462+
),
403463
"diagnostics_manifest_path": normalized_diagnostics_manifest_path,
404464
**(extra_fingerprint_material or {}),
405465
}
@@ -410,11 +470,16 @@ def _candidate_bundle_with_fingerprint(
410470
source_output_contract_path=normalized_source_output_contract_path,
411471
release_candidate_fingerprint=fingerprint,
412472
validation_report_paths=normalized_validation_report_paths,
473+
validation_report_refs=normalized_validation_report_refs,
413474
diagnostics_manifest_path=normalized_diagnostics_manifest_path,
414475
metadata={
415476
"reader": reader,
416477
"fingerprint_status": fingerprint_status,
417-
"missing_fingerprint_identity_paths": missing_identity_paths,
478+
"missing_fingerprint_identity_paths": missing_artifact_identity_paths,
479+
"missing_validation_report_identity_paths": (
480+
missing_validation_report_identity_paths
481+
),
482+
"validation_report_ref_paths": normalized_validation_report_ref_paths,
418483
},
419484
)
420485

@@ -540,7 +605,7 @@ def _merge_duplicate_artifact_spec(
540605
)
541606

542607

543-
def _fingerprint_identity_status(
608+
def _release_artifact_identity_status(
544609
artifacts: Sequence[ReleaseArtifactSpec],
545610
) -> tuple[str, tuple[str, ...]]:
546611
missing_identity_paths = tuple(
@@ -554,6 +619,20 @@ def _fingerprint_identity_status(
554619
return "complete", ()
555620

556621

622+
def _validation_report_identity_missing_paths(
623+
refs: Sequence[DiagnosticRef],
624+
*,
625+
context: ReleasePromotionContext,
626+
) -> tuple[str, ...]:
627+
missing_paths: list[str] = []
628+
for ref in refs:
629+
path = _validation_report_ref_path(ref, context)
630+
artifact = ref.artifact
631+
if artifact is None or artifact.sha256 is None or artifact.size_bytes is None:
632+
missing_paths.append(path)
633+
return tuple(missing_paths)
634+
635+
557636
def _artifact_spec_from_inventory_record(
558637
record: Mapping[str, Any],
559638
*,
@@ -725,6 +804,39 @@ def _diagnostic_artifact_identity(
725804
}
726805

727806

807+
def _validation_report_ref_path(
808+
ref: DiagnosticRef,
809+
context: ReleasePromotionContext,
810+
) -> str:
811+
artifact = ref.artifact
812+
if artifact is None:
813+
raise ValueError("validation_report_refs entries must include artifacts")
814+
path = _diagnostic_artifact_path(artifact, context)
815+
if path is None:
816+
raise ValueError(
817+
"validation_report_refs artifacts must live under run diagnostics"
818+
)
819+
return path
820+
821+
822+
def _validation_report_ref_fingerprint_material(
823+
ref: DiagnosticRef,
824+
context: ReleasePromotionContext,
825+
) -> dict[str, Any]:
826+
artifact = ref.artifact
827+
path = _validation_report_ref_path(ref, context)
828+
if artifact is None:
829+
raise ValueError("validation_report_refs entries must include artifacts")
830+
return {
831+
"name": ref.name,
832+
"kind": ref.kind,
833+
"path": path,
834+
"logical_name": artifact.logical_name,
835+
"sha256": artifact.sha256,
836+
"size_bytes": artifact.size_bytes,
837+
}
838+
839+
728840
def _optional_record_string(record: Mapping[str, Any], key: str) -> str | None:
729841
value = record.get(key)
730842
return value if isinstance(value, str) and value else None

tests/unit/release_promotion/test_candidate.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,25 @@ def _legacy_identity_metadata() -> dict[str, dict]:
134134
}
135135

136136

137+
def _validation_report_ref(
138+
*,
139+
path: str = "calibration/runs/run-123/diagnostics/validation_report.json",
140+
sha256: str | None = "sha256:validation-report",
141+
size_bytes: int | None = 128,
142+
) -> DiagnosticRef:
143+
return DiagnosticRef(
144+
name="stage4_validation_report",
145+
kind="validation_report",
146+
artifact=ArtifactRef(
147+
logical_name="stage4_validation_report",
148+
uri=f"hf://policyengine/policyengine-us-data/{path}",
149+
sha256=sha256,
150+
size_bytes=size_bytes,
151+
metadata={"relative_path": path},
152+
),
153+
)
154+
155+
137156
def test_release_path_normalization_rejects_parent_paths() -> None:
138157
assert normalize_release_path("./states//AL.h5") == "states/AL.h5"
139158

@@ -389,6 +408,74 @@ def test_candidate_fingerprint_uses_normalized_paths() -> None:
389408
)
390409

391410

411+
def test_candidate_fingerprint_tracks_validation_report_ref_identity() -> None:
412+
first = build_legacy_release_candidate_bundle(
413+
context=_context(),
414+
rel_paths=["states/AL.h5"],
415+
artifact_metadata_by_path={
416+
"states/AL.h5": _legacy_identity_metadata()["states/AL.h5"]
417+
},
418+
validation_report_paths=[
419+
"calibration/runs/run-123/diagnostics/validation_report.json"
420+
],
421+
validation_report_refs=[
422+
_validation_report_ref(sha256="sha256:first", size_bytes=128)
423+
],
424+
)
425+
overwritten = build_legacy_release_candidate_bundle(
426+
context=_context(),
427+
rel_paths=["states/AL.h5"],
428+
artifact_metadata_by_path={
429+
"states/AL.h5": _legacy_identity_metadata()["states/AL.h5"]
430+
},
431+
validation_report_paths=[
432+
"calibration/runs/run-123/diagnostics/validation_report.json"
433+
],
434+
validation_report_refs=[
435+
_validation_report_ref(sha256="sha256:second", size_bytes=128)
436+
],
437+
)
438+
439+
assert first.release_candidate_fingerprint != (
440+
overwritten.release_candidate_fingerprint
441+
)
442+
assert first.metadata["validation_report_ref_paths"] == (
443+
"calibration/runs/run-123/diagnostics/validation_report.json",
444+
)
445+
446+
447+
def test_candidate_fingerprint_requires_validation_report_ref_identity() -> None:
448+
bundle = build_legacy_release_candidate_bundle(
449+
context=_context(),
450+
rel_paths=["states/AL.h5"],
451+
artifact_metadata_by_path={
452+
"states/AL.h5": _legacy_identity_metadata()["states/AL.h5"]
453+
},
454+
validation_report_refs=[_validation_report_ref(sha256=None, size_bytes=128)],
455+
)
456+
457+
assert bundle.release_candidate_fingerprint is None
458+
assert bundle.metadata["fingerprint_status"] == (
459+
"path_only_missing_validation_report_identity"
460+
)
461+
assert bundle.metadata["missing_validation_report_identity_paths"] == (
462+
"calibration/runs/run-123/diagnostics/validation_report.json",
463+
)
464+
465+
466+
def test_candidate_validation_report_refs_are_run_scoped() -> None:
467+
with pytest.raises(ValueError, match="context.run_id"):
468+
build_legacy_release_candidate_bundle(
469+
context=_context(),
470+
rel_paths=["states/AL.h5"],
471+
validation_report_refs=[
472+
_validation_report_ref(
473+
path="calibration/runs/other-run/diagnostics/validation_report.json"
474+
)
475+
],
476+
)
477+
478+
392479
def test_stage4_candidate_reader_accepts_inventory_records() -> None:
393480
bundle = build_release_candidate_bundle_from_stage4_contract(
394481
context=_context(),
@@ -1015,6 +1102,7 @@ def test_release_candidate_bundle_round_trips_through_dict_and_json() -> None:
10151102
validation_report_paths=(
10161103
"calibration/runs/run-123/diagnostics/validation_report.json",
10171104
),
1105+
validation_report_refs=(_validation_report_ref(),),
10181106
diagnostics_manifest_path="calibration/runs/run-123/diagnostics/manifest.json",
10191107
metadata={"reader": "fixture"},
10201108
)
@@ -1025,6 +1113,7 @@ def test_release_candidate_bundle_round_trips_through_dict_and_json() -> None:
10251113
assert payload["bundle_type"] == "release_candidate_input_bundle"
10261114
assert payload["stage_id"] == "5_validate_and_promote_release"
10271115
assert payload["schema_version"]
1116+
assert len(payload["validation_report_refs"]) == 1
10281117
assert ReleaseArtifactSpec.from_dict(artifact.to_dict()) == artifact
10291118
assert restored.to_dict() == payload
10301119

0 commit comments

Comments
 (0)