Skip to content

Commit fb478fd

Browse files
committed
Add Stage 5 published artifact index
1 parent 4544761 commit fb478fd

12 files changed

Lines changed: 1089 additions & 27 deletions

File tree

changelog.d/1044.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added the Stage 5 published artifact index (`published_artifact_index.jsonl`).

docs/engineering/pipeline-map.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1364,6 +1364,22 @@ def normalize_worker_response(*, worker_index: int, result: object) -> Coordinat
13641364

13651365
Normalize worker JSON into explicit fatal and nonfatal coordinator issues.
13661366

1367+
### `policyengine_us_data.release_promotion.published_index.build_published_artifact_index`
1368+
1369+
```python
1370+
def build_published_artifact_index(*, candidate_bundle: ReleaseCandidateInputBundle, promotion_result: FullPromotionResult, release_manifest: Mapping[str, Any] | None = None, diagnostic_artifacts: Sequence[ArtifactRef] = ()) -> tuple[PublishedArtifactIndexRow, ...]
1371+
```
1372+
1373+
Build deterministic published artifact rows for a promoted release.
1374+
1375+
### `policyengine_us_data.release_promotion.published_index.PublishedArtifactIndexRow`
1376+
1377+
```python
1378+
class PublishedArtifactIndexRow
1379+
```
1380+
1381+
One row in the Stage 5 published artifact JSONL index.
1382+
13671383
### `policyengine_us_data.release_promotion.artifacts.ReleaseArtifactSpec`
13681384

13691385
```python

docs/engineering/stages/release_promotion.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,3 +147,18 @@ Runtime step manifests for `5_validate_and_promote_release` should include the
147147
contract as a JSON `contract` output. They may still record legacy validated
148148
input artifacts for compatibility, but the contract is the preferred semantic
149149
entry point for Stage 5 status and lineage.
150+
151+
## Published Artifact Index
152+
153+
Stage 5 also writes `published_artifact_index.jsonl` under the run-local
154+
`diagnostics/` directory. Each JSONL row describes one promoted artifact or
155+
release metadata artifact with its canonical `run_id`, candidate version,
156+
release version, source-stage metadata, final Hugging Face URI, and GCS URI
157+
when the artifact is mirrored to GCS.
158+
159+
Build index rows from typed release candidate and promotion-result objects, not
160+
from console logs. Release manifest entries may supply final checksum, size,
161+
revision, and kind fields for promoted data artifacts; building the index should
162+
not change the release manifest schema. The release promotion contract must
163+
reference the index as a `published_artifact_index` output so dashboards and AI
164+
systems can discover the per-artifact rows from the Stage 5 contract.

docs/generated/pipeline_api.json

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3086,7 +3086,7 @@
30863086
"docstring": "Promote a completed pipeline run to production.\n\n1. Verify run status is \"completed\"\n2. Promote every staged artifact in one Hugging Face commit\n3. Upload/copy every artifact to GCS\n4. Finalize release_manifest.json, tag the release, and update\n version_manifest.json\n5. Update run status to \"promoted\"\n\nArgs:\n run_id: The run ID to promote.\n candidate_version: Candidate staging scope used for staged source files.\n release_version: Stable version used for final release metadata.\n\nReturns:\n Summary message.",
30873087
"id": "promote_pipeline_run",
30883088
"kind": "function",
3089-
"line": 2091,
3089+
"line": 2136,
30903090
"metadata": {
30913091
"api_refs": [
30923092
"modal_app.pipeline.promote_run"
@@ -3117,6 +3117,74 @@
31173117
"signature": "def promote_run(run_id: str, candidate_version: str = '', release_version: str = '') -> str",
31183118
"source_file": "modal_app/pipeline.py"
31193119
},
3120+
"published_artifact_index_builder": {
3121+
"docstring": "Build deterministic published artifact rows for a promoted release.",
3122+
"id": "published_artifact_index_builder",
3123+
"kind": "function",
3124+
"line": 247,
3125+
"metadata": {
3126+
"api_refs": [
3127+
"policyengine_us_data.release_promotion.published_index.build_published_artifact_index"
3128+
],
3129+
"artifacts_in": [
3130+
"release candidate bundle",
3131+
"typed promotion result"
3132+
],
3133+
"artifacts_out": [
3134+
"published_artifact_index.jsonl"
3135+
],
3136+
"description": "Build the Stage 5 published artifact JSONL index.",
3137+
"id": "published_artifact_index_builder",
3138+
"label": "Published Artifact Index Builder",
3139+
"node_type": "library",
3140+
"pathways": [
3141+
"5_validate_and_promote_release"
3142+
],
3143+
"source_file": "policyengine_us_data/release_promotion/published_index.py",
3144+
"stability": "moving",
3145+
"status": "transitional",
3146+
"validation_commands": [
3147+
"uv run pytest tests/unit/release_promotion/test_published_index.py"
3148+
]
3149+
},
3150+
"object_path": "policyengine_us_data.release_promotion.published_index.build_published_artifact_index",
3151+
"signature": "def build_published_artifact_index(*, candidate_bundle: ReleaseCandidateInputBundle, promotion_result: FullPromotionResult, release_manifest: Mapping[str, Any] | None = None, diagnostic_artifacts: Sequence[ArtifactRef] = ()) -> tuple[PublishedArtifactIndexRow, ...]",
3152+
"source_file": "policyengine_us_data/release_promotion/published_index.py"
3153+
},
3154+
"published_artifact_index_row": {
3155+
"docstring": "One row in the Stage 5 published artifact JSONL index.",
3156+
"id": "published_artifact_index_row",
3157+
"kind": "class",
3158+
"line": 95,
3159+
"metadata": {
3160+
"api_refs": [
3161+
"policyengine_us_data.release_promotion.published_index.PublishedArtifactIndexRow"
3162+
],
3163+
"artifacts_in": [
3164+
"release candidate bundle",
3165+
"release manifest"
3166+
],
3167+
"artifacts_out": [
3168+
"published_artifact_index.jsonl"
3169+
],
3170+
"description": "One published HF/GCS artifact row emitted by Stage 5.",
3171+
"id": "published_artifact_index_row",
3172+
"label": "PublishedArtifactIndexRow",
3173+
"node_type": "library",
3174+
"pathways": [
3175+
"5_validate_and_promote_release"
3176+
],
3177+
"source_file": "policyengine_us_data/release_promotion/published_index.py",
3178+
"stability": "moving",
3179+
"status": "transitional",
3180+
"validation_commands": [
3181+
"uv run pytest tests/unit/release_promotion/test_published_index.py"
3182+
]
3183+
},
3184+
"object_path": "policyengine_us_data.release_promotion.published_index.PublishedArtifactIndexRow",
3185+
"signature": "class PublishedArtifactIndexRow",
3186+
"source_file": "policyengine_us_data/release_promotion/published_index.py"
3187+
},
31203188
"puf_qrf_pass": {
31213189
"docstring": "Run QRF imputation for PUF variables.\n\nStratified-subsamples PUF records (top 0.5% by AGI kept,\nrest randomly sampled to ~20K total), trains QRF, and\npredicts on CPS data.\n\nArgs:\n data: CPS data dict.\n time_period: Tax year.\n puf_dataset: PUF dataset class or path.\n dataset_path: Path to CPS h5 for computing\n demographic predictors via Microsimulation.\n\nReturns:\n Tuple of (y_full_imputations, y_override_imputations)\n as dicts of {variable: np.ndarray}.",
31223190
"id": "puf_qrf_pass",
@@ -3541,7 +3609,7 @@
35413609
"docstring": "Run the full pipeline end-to-end.\n\nArgs:\n branch: Git branch to build from.\n gpu: GPU type for regional calibration.\n epochs: Training epochs for regional calibration.\n national_gpu: GPU type for national calibration.\n national_epochs: Training epochs for national.\n num_workers: Number of parallel H5 workers.\n n_clones: Number of clones for H5 building.\n skip_national: Skip national calibration/H5.\n resume_run_id: Resume a previously failed run.\n clear_checkpoints: Wipe ALL checkpoints before building\n (default False). Normally not needed \u2014 checkpoints are\n scoped by commit SHA, so stale ones from other commits\n are cleaned automatically. Use True only to force a\n full rebuild of the current commit.\n candidate_version: Candidate staging scope used for HF staging.\n release_version: Final stable release version. Usually empty until\n promotion.\n base_release_version: Stable release current when this candidate was\n built.\n release_bump: Intended SemVer bump for this candidate.\n sha_override: Exact source SHA deployed by GitHub Actions. When\n provided, this is recorded instead of reading the current\n branch tip.\n run_id: Cross-system run ID created by GitHub.\n run_context: Serialized run context from the launcher workflow.\n modal_app_name: Deployed Modal app name for this run.\n modal_environment: Modal environment used for this run.\n chunked_matrix: Build the calibration matrix in clone-household\n chunks instead of the non-chunked path. Opt-in; default off.\n chunk_size: Clone-household columns per chunk when\n ``chunked_matrix`` is True.\n parallel_matrix: Fan chunked matrix building across Modal\n workers via ``build_matrix_chunk_worker``. Only meaningful\n when ``chunked_matrix`` is True; ignored otherwise.\n num_matrix_workers: Number of Modal workers when\n ``parallel_matrix`` is True.\n\nReturns:\n The run ID for use with promote.",
35423610
"id": "run_modal_pipeline",
35433611
"kind": "function",
3544-
"line": 1113,
3612+
"line": 1158,
35453613
"metadata": {
35463614
"api_refs": [
35473615
"modal_app.pipeline.run_pipeline"
@@ -4479,7 +4547,7 @@
44794547
"docstring": "Verify deployed-image imports and subprocess seams.",
44804548
"id": "verify_runtime_seams",
44814549
"kind": "function",
4482-
"line": 739,
4550+
"line": 784,
44834551
"metadata": {
44844552
"api_refs": [
44854553
"modal_app.pipeline.verify_runtime_seams"

docs/generated/pipeline_map.json

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1407,6 +1407,56 @@
14071407
"uv run pytest tests/unit/build_outputs/test_worker_responses.py"
14081408
]
14091409
},
1410+
{
1411+
"api_refs": [
1412+
"policyengine_us_data.release_promotion.published_index.build_published_artifact_index"
1413+
],
1414+
"artifacts_in": [
1415+
"release candidate bundle",
1416+
"typed promotion result"
1417+
],
1418+
"artifacts_out": [
1419+
"published_artifact_index.jsonl"
1420+
],
1421+
"description": "Build the Stage 5 published artifact JSONL index.",
1422+
"id": "published_artifact_index_builder",
1423+
"label": "Published Artifact Index Builder",
1424+
"node_type": "library",
1425+
"pathways": [
1426+
"5_validate_and_promote_release"
1427+
],
1428+
"source_file": "policyengine_us_data/release_promotion/published_index.py",
1429+
"stability": "moving",
1430+
"status": "transitional",
1431+
"validation_commands": [
1432+
"uv run pytest tests/unit/release_promotion/test_published_index.py"
1433+
]
1434+
},
1435+
{
1436+
"api_refs": [
1437+
"policyengine_us_data.release_promotion.published_index.PublishedArtifactIndexRow"
1438+
],
1439+
"artifacts_in": [
1440+
"release candidate bundle",
1441+
"release manifest"
1442+
],
1443+
"artifacts_out": [
1444+
"published_artifact_index.jsonl"
1445+
],
1446+
"description": "One published HF/GCS artifact row emitted by Stage 5.",
1447+
"id": "published_artifact_index_row",
1448+
"label": "PublishedArtifactIndexRow",
1449+
"node_type": "library",
1450+
"pathways": [
1451+
"5_validate_and_promote_release"
1452+
],
1453+
"source_file": "policyengine_us_data/release_promotion/published_index.py",
1454+
"stability": "moving",
1455+
"status": "transitional",
1456+
"validation_commands": [
1457+
"uv run pytest tests/unit/release_promotion/test_published_index.py"
1458+
]
1459+
},
14101460
{
14111461
"api_refs": [
14121462
"policyengine_us_data.release_promotion.artifacts.ReleaseArtifactSpec"
@@ -1996,9 +2046,9 @@
19962046
}
19972047
],
19982048
"metadata": {
1999-
"api_node_count": 96,
2049+
"api_node_count": 98,
20002050
"canonical_stage_count": 5,
2001-
"decorated_object_count": 156,
2051+
"decorated_object_count": 158,
20022052
"mapped_decorated_node_count": 60,
20032053
"stage_count": 17,
20042054
"substage_count": 17

modal_app/pipeline.py

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -671,19 +671,26 @@ def _write_release_promotion_contract_for_run(
671671
run_context: RunContext,
672672
rel_paths: list[str],
673673
promotion_result,
674-
) -> ArtifactReference:
675-
"""Write Stage 5's run-local contract and return its manifest reference."""
674+
) -> tuple[ArtifactReference, ...]:
675+
"""Write Stage 5's run-local index/contract and return manifest references."""
676676

677677
from policyengine_us_data.release_promotion import (
678678
build_legacy_release_candidate_bundle,
679+
build_published_artifact_index,
680+
published_artifact_index_artifact_ref,
681+
published_artifact_index_path,
682+
release_promotion_contract_repo_path,
679683
release_promotion_contract_path,
684+
write_published_artifact_index,
680685
write_release_promotion_contract,
681686
)
687+
from policyengine_us_data.stage_contracts import ArtifactRef
682688

683689
run_dir = _run_dir(run_context.run_id)
690+
release_context = _release_promotion_context_from_run_context(run_context)
684691
contract_path = release_promotion_contract_path(run_dir)
685692
candidate_bundle = build_legacy_release_candidate_bundle(
686-
context=_release_promotion_context_from_run_context(run_context),
693+
context=release_context,
687694
rel_paths=rel_paths,
688695
artifact_metadata_by_path=_release_artifact_metadata_by_path(
689696
run_context.run_id,
@@ -699,23 +706,61 @@ def _write_release_promotion_contract_for_run(
699706
run_context.run_id
700707
),
701708
)
709+
contract_artifact = ArtifactRef(
710+
logical_name="release_promotion_contract",
711+
uri=(
712+
f"hf://{release_context.hf_repo_name}/"
713+
f"{release_promotion_contract_repo_path(release_context.run_id)}"
714+
),
715+
media_type="application/json",
716+
metadata={
717+
"artifact_family": "stage_contract",
718+
"source_stage_id": "5_validate_and_promote_release",
719+
"relative_path": release_promotion_contract_repo_path(
720+
release_context.run_id
721+
),
722+
},
723+
)
724+
published_index_path = published_artifact_index_path(run_dir)
725+
published_index_rows = build_published_artifact_index(
726+
candidate_bundle=candidate_bundle,
727+
promotion_result=promotion_result,
728+
diagnostic_artifacts=(contract_artifact,),
729+
)
730+
write_published_artifact_index(published_index_rows, published_index_path)
731+
published_index_manifest_ref = ArtifactReference.from_path(
732+
published_index_path,
733+
role="index",
734+
base_dir=run_dir,
735+
media_type="application/jsonl",
736+
)
737+
published_index_artifact = published_artifact_index_artifact_ref(
738+
release_context,
739+
row_count=len(published_index_rows),
740+
sha256=f"sha256:{published_index_manifest_ref.sha256}",
741+
size_bytes=published_index_manifest_ref.size_bytes,
742+
)
702743
write_release_promotion_contract(
703744
contract_path=contract_path,
704745
candidate_bundle=candidate_bundle,
705746
promotion_result=promotion_result,
706747
created_at=datetime.now(timezone.utc).isoformat(),
707748
code_sha=meta.sha,
708749
package_version=meta.version,
750+
published_artifact_index=published_index_artifact,
709751
metadata={
710752
"writer": "modal_app.pipeline.promote_run",
711753
"branch": meta.branch,
712754
},
713755
)
714-
return ArtifactReference.from_path(
715-
contract_path,
716-
role="contract",
717-
base_dir=run_dir,
718-
media_type="application/json",
756+
return (
757+
ArtifactReference.from_path(
758+
contract_path,
759+
role="contract",
760+
base_dir=run_dir,
761+
media_type="application/json",
762+
),
763+
published_index_manifest_ref,
719764
)
720765

721766

@@ -2221,7 +2266,7 @@ def promote_run(
22212266
)
22222267
print(f" {promotion_stdout}")
22232268
promotion_result = _promotion_result_from_stdout(promotion_stdout)
2224-
release_promotion_contract_ref = _write_release_promotion_contract_for_run(
2269+
release_promotion_refs = _write_release_promotion_contract_for_run(
22252270
meta=meta,
22262271
run_context=promotion_context,
22272272
rel_paths=rel_paths,
@@ -2239,7 +2284,7 @@ def promote_run(
22392284
ArtifactReference.from_dict(artifact)
22402285
for artifact in promote_inputs["validated_step_outputs"]
22412286
],
2242-
release_promotion_contract_ref,
2287+
*release_promotion_refs,
22432288
],
22442289
reuse_decision="computed",
22452290
vol=pipeline_volume,

policyengine_us_data/release_promotion/__init__.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,19 @@
3333
release_promotion_contract_repo_path,
3434
write_release_promotion_contract,
3535
)
36+
from .published_index import (
37+
PUBLISHED_ARTIFACT_INDEX_FILENAME,
38+
PUBLISHED_ARTIFACT_INDEX_MEDIA_TYPE,
39+
PublishedArtifactIndexRow,
40+
build_published_artifact_index,
41+
published_artifact_index_artifact_ref,
42+
published_artifact_index_from_jsonl,
43+
published_artifact_index_path,
44+
published_artifact_index_repo_path,
45+
published_artifact_index_to_jsonl,
46+
read_published_artifact_index,
47+
write_published_artifact_index,
48+
)
3649
from .results import (
3750
CleanupPromotionResult,
3851
CompletionMarkerPromotionResult,
@@ -60,11 +73,14 @@
6073
"RELEASE_VALIDATION_SUBSTAGE_ID",
6174
"RELEASE_PROMOTION_CONTRACT_FILENAME",
6275
"RELEASE_PROMOTION_CONTRACT_TYPE",
76+
"PUBLISHED_ARTIFACT_INDEX_FILENAME",
77+
"PUBLISHED_ARTIFACT_INDEX_MEDIA_TYPE",
6378
"CleanupPromotionResult",
6479
"CompletionMarkerPromotionResult",
6580
"FullPromotionResult",
6681
"GcsPromotionResult",
6782
"HuggingFacePromotionResult",
83+
"PublishedArtifactIndexRow",
6884
"ReleaseArtifactSpec",
6985
"ReleaseCandidateInputBundle",
7086
"ReleasePromotionContractBuilder",
@@ -76,6 +92,7 @@
7692
"VALIDATION_REPORT_POLICY_PRESENCE_ONLY",
7793
"VALIDATION_REPORT_POLICY_REQUIRE_PASSING",
7894
"build_legacy_release_candidate_bundle",
95+
"build_published_artifact_index",
7996
"build_release_promotion_contract",
8097
"build_release_candidate_bundle_from_stage4_contract",
8198
"build_release_candidate_shape_report",
@@ -86,9 +103,16 @@
86103
"logical_name_for_release_path",
87104
"normalize_release_path",
88105
"parse_full_promotion_result_json",
106+
"published_artifact_index_artifact_ref",
107+
"published_artifact_index_from_jsonl",
108+
"published_artifact_index_path",
109+
"published_artifact_index_repo_path",
110+
"published_artifact_index_to_jsonl",
89111
"release_promotion_contract_path",
90112
"release_promotion_contract_repo_path",
113+
"read_published_artifact_index",
91114
"read_stage4_release_candidate_bundle",
92115
"strip_staging_prefix",
116+
"write_published_artifact_index",
93117
"write_release_promotion_contract",
94118
]

0 commit comments

Comments
 (0)