Skip to content

Commit ba204cf

Browse files
committed
Add Stage 5 published artifact index
1 parent 981624d commit ba204cf

12 files changed

Lines changed: 1089 additions & 27 deletions

File tree

changelog.d/1044.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added the Stage 5 published artifact index, written as a JSONL artifact.

docs/engineering/pipeline-map.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1355,6 +1355,22 @@ def normalize_worker_response(*, worker_index: int, result: object) -> Coordinat
13551355

13561356
Normalize worker JSON into explicit fatal and nonfatal coordinator issues.
13571357

1358+
### `policyengine_us_data.release_promotion.published_index.build_published_artifact_index`
1359+
1360+
```python
1361+
def build_published_artifact_index(*, candidate_bundle: ReleaseCandidateInputBundle, promotion_result: FullPromotionResult, release_manifest: Mapping[str, Any] | None = None, diagnostic_artifacts: Sequence[ArtifactRef] = ()) -> tuple[PublishedArtifactIndexRow, ...]
1362+
```
1363+
1364+
Build deterministic published artifact rows for a promoted release.
1365+
1366+
### `policyengine_us_data.release_promotion.published_index.PublishedArtifactIndexRow`
1367+
1368+
```python
1369+
class PublishedArtifactIndexRow
1370+
```
1371+
1372+
One row in the Stage 5 published artifact JSONL index.
1373+
13581374
### `policyengine_us_data.release_promotion.artifacts.ReleaseArtifactSpec`
13591375

13601376
```python

docs/engineering/stages/release_promotion.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,3 +147,18 @@ Runtime step manifests for `5_validate_and_promote_release` should include the
147147
contract as a JSON `contract` output. They may still record legacy validated
148148
input artifacts for compatibility, but the contract is the preferred semantic
149149
entry point for Stage 5 status and lineage.
150+
151+
## Published Artifact Index
152+
153+
Stage 5 also writes `published_artifact_index.jsonl` under the run-local
154+
`diagnostics/` directory. Each JSONL row describes one promoted artifact or
155+
release metadata artifact with its canonical `run_id`, candidate version,
156+
release version, source-stage metadata, final Hugging Face URI, and GCS URI
157+
when the artifact is mirrored to GCS.
158+
159+
Build index rows from typed release candidate and promotion-result objects, not
160+
from console logs. Release manifest entries may supply final checksum, size,
161+
revision, and kind fields for promoted data artifacts; building the index should
162+
leave the release manifest schema unchanged. The release promotion contract must
163+
reference the index as a `published_artifact_index` output so dashboards and AI
164+
systems can discover the per-artifact rows from the Stage 5 contract.

docs/generated/pipeline_api.json

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3086,7 +3086,7 @@
30863086
"docstring": "Promote a completed pipeline run to production.\n\n1. Verify run status is \"completed\"\n2. Promote every staged artifact in one Hugging Face commit\n3. Upload/copy every artifact to GCS\n4. Finalize release_manifest.json, tag the release, and update\n version_manifest.json\n5. Update run status to \"promoted\"\n\nArgs:\n run_id: The run ID to promote.\n candidate_version: Candidate staging scope used for staged source files.\n release_version: Stable version used for final release metadata.\n\nReturns:\n Summary message.",
30873087
"id": "promote_pipeline_run",
30883088
"kind": "function",
3089-
"line": 2079,
3089+
"line": 2124,
30903090
"metadata": {
30913091
"api_refs": [
30923092
"modal_app.pipeline.promote_run"
@@ -3117,6 +3117,74 @@
31173117
"signature": "def promote_run(run_id: str, candidate_version: str = '', release_version: str = '') -> str",
31183118
"source_file": "modal_app/pipeline.py"
31193119
},
3120+
"published_artifact_index_builder": {
3121+
"docstring": "Build deterministic published artifact rows for a promoted release.",
3122+
"id": "published_artifact_index_builder",
3123+
"kind": "function",
3124+
"line": 247,
3125+
"metadata": {
3126+
"api_refs": [
3127+
"policyengine_us_data.release_promotion.published_index.build_published_artifact_index"
3128+
],
3129+
"artifacts_in": [
3130+
"release candidate bundle",
3131+
"typed promotion result"
3132+
],
3133+
"artifacts_out": [
3134+
"published_artifact_index.jsonl"
3135+
],
3136+
"description": "Build the Stage 5 published artifact JSONL index.",
3137+
"id": "published_artifact_index_builder",
3138+
"label": "Published Artifact Index Builder",
3139+
"node_type": "library",
3140+
"pathways": [
3141+
"5_validate_and_promote_release"
3142+
],
3143+
"source_file": "policyengine_us_data/release_promotion/published_index.py",
3144+
"stability": "moving",
3145+
"status": "transitional",
3146+
"validation_commands": [
3147+
"uv run pytest tests/unit/release_promotion/test_published_index.py"
3148+
]
3149+
},
3150+
"object_path": "policyengine_us_data.release_promotion.published_index.build_published_artifact_index",
3151+
"signature": "def build_published_artifact_index(*, candidate_bundle: ReleaseCandidateInputBundle, promotion_result: FullPromotionResult, release_manifest: Mapping[str, Any] | None = None, diagnostic_artifacts: Sequence[ArtifactRef] = ()) -> tuple[PublishedArtifactIndexRow, ...]",
3152+
"source_file": "policyengine_us_data/release_promotion/published_index.py"
3153+
},
3154+
"published_artifact_index_row": {
3155+
"docstring": "One row in the Stage 5 published artifact JSONL index.",
3156+
"id": "published_artifact_index_row",
3157+
"kind": "class",
3158+
"line": 95,
3159+
"metadata": {
3160+
"api_refs": [
3161+
"policyengine_us_data.release_promotion.published_index.PublishedArtifactIndexRow"
3162+
],
3163+
"artifacts_in": [
3164+
"release candidate bundle",
3165+
"release manifest"
3166+
],
3167+
"artifacts_out": [
3168+
"published_artifact_index.jsonl"
3169+
],
3170+
"description": "One published HF/GCS artifact row emitted by Stage 5.",
3171+
"id": "published_artifact_index_row",
3172+
"label": "PublishedArtifactIndexRow",
3173+
"node_type": "library",
3174+
"pathways": [
3175+
"5_validate_and_promote_release"
3176+
],
3177+
"source_file": "policyengine_us_data/release_promotion/published_index.py",
3178+
"stability": "moving",
3179+
"status": "transitional",
3180+
"validation_commands": [
3181+
"uv run pytest tests/unit/release_promotion/test_published_index.py"
3182+
]
3183+
},
3184+
"object_path": "policyengine_us_data.release_promotion.published_index.PublishedArtifactIndexRow",
3185+
"signature": "class PublishedArtifactIndexRow",
3186+
"source_file": "policyengine_us_data/release_promotion/published_index.py"
3187+
},
31203188
"puf_qrf_pass": {
31213189
"docstring": "Run QRF imputation for PUF variables.\n\nStratified-subsamples PUF records (top 0.5% by AGI kept,\nrest randomly sampled to ~20K total), trains QRF, and\npredicts on CPS data.\n\nArgs:\n data: CPS data dict.\n time_period: Tax year.\n puf_dataset: PUF dataset class or path.\n dataset_path: Path to CPS h5 for computing\n demographic predictors via Microsimulation.\n\nReturns:\n Tuple of (y_full_imputations, y_override_imputations)\n as dicts of {variable: np.ndarray}.",
31223190
"id": "puf_qrf_pass",
@@ -3541,7 +3609,7 @@
35413609
"docstring": "Run the full pipeline end-to-end.\n\nArgs:\n branch: Git branch to build from.\n gpu: GPU type for regional calibration.\n epochs: Training epochs for regional calibration.\n national_gpu: GPU type for national calibration.\n national_epochs: Training epochs for national.\n num_workers: Number of parallel H5 workers.\n n_clones: Number of clones for H5 building.\n skip_national: Skip national calibration/H5.\n resume_run_id: Resume a previously failed run.\n clear_checkpoints: Wipe ALL checkpoints before building\n (default False). Normally not needed \u2014 checkpoints are\n scoped by commit SHA, so stale ones from other commits\n are cleaned automatically. Use True only to force a\n full rebuild of the current commit.\n candidate_version: Candidate staging scope used for HF staging.\n release_version: Final stable release version. Usually empty until\n promotion.\n base_release_version: Stable release current when this candidate was\n built.\n release_bump: Intended SemVer bump for this candidate.\n sha_override: Exact source SHA deployed by GitHub Actions. When\n provided, this is recorded instead of reading the current\n branch tip.\n run_id: Cross-system run ID created by GitHub.\n run_context: Serialized run context from the launcher workflow.\n modal_app_name: Deployed Modal app name for this run.\n modal_environment: Modal environment used for this run.\n chunked_matrix: Build the calibration matrix in clone-household\n chunks instead of the non-chunked path. Opt-in; default off.\n chunk_size: Clone-household columns per chunk when\n ``chunked_matrix`` is True.\n parallel_matrix: Fan chunked matrix building across Modal\n workers via ``build_matrix_chunk_worker``. Only meaningful\n when ``chunked_matrix`` is True; ignored otherwise.\n num_matrix_workers: Number of Modal workers when\n ``parallel_matrix`` is True.\n\nReturns:\n The run ID for use with promote.",
35423610
"id": "run_modal_pipeline",
35433611
"kind": "function",
3544-
"line": 1112,
3612+
"line": 1157,
35453613
"metadata": {
35463614
"api_refs": [
35473615
"modal_app.pipeline.run_pipeline"
@@ -4421,7 +4489,7 @@
44214489
"docstring": "Verify deployed-image imports and subprocess seams.",
44224490
"id": "verify_runtime_seams",
44234491
"kind": "function",
4424-
"line": 738,
4492+
"line": 783,
44254493
"metadata": {
44264494
"api_refs": [
44274495
"modal_app.pipeline.verify_runtime_seams"

docs/generated/pipeline_map.json

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1407,6 +1407,56 @@
14071407
"uv run pytest tests/unit/build_outputs/test_worker_responses.py"
14081408
]
14091409
},
1410+
{
1411+
"api_refs": [
1412+
"policyengine_us_data.release_promotion.published_index.build_published_artifact_index"
1413+
],
1414+
"artifacts_in": [
1415+
"release candidate bundle",
1416+
"typed promotion result"
1417+
],
1418+
"artifacts_out": [
1419+
"published_artifact_index.jsonl"
1420+
],
1421+
"description": "Build the Stage 5 published artifact JSONL index.",
1422+
"id": "published_artifact_index_builder",
1423+
"label": "Published Artifact Index Builder",
1424+
"node_type": "library",
1425+
"pathways": [
1426+
"5_validate_and_promote_release"
1427+
],
1428+
"source_file": "policyengine_us_data/release_promotion/published_index.py",
1429+
"stability": "moving",
1430+
"status": "transitional",
1431+
"validation_commands": [
1432+
"uv run pytest tests/unit/release_promotion/test_published_index.py"
1433+
]
1434+
},
1435+
{
1436+
"api_refs": [
1437+
"policyengine_us_data.release_promotion.published_index.PublishedArtifactIndexRow"
1438+
],
1439+
"artifacts_in": [
1440+
"release candidate bundle",
1441+
"release manifest"
1442+
],
1443+
"artifacts_out": [
1444+
"published_artifact_index.jsonl"
1445+
],
1446+
"description": "One published HF/GCS artifact row emitted by Stage 5.",
1447+
"id": "published_artifact_index_row",
1448+
"label": "PublishedArtifactIndexRow",
1449+
"node_type": "library",
1450+
"pathways": [
1451+
"5_validate_and_promote_release"
1452+
],
1453+
"source_file": "policyengine_us_data/release_promotion/published_index.py",
1454+
"stability": "moving",
1455+
"status": "transitional",
1456+
"validation_commands": [
1457+
"uv run pytest tests/unit/release_promotion/test_published_index.py"
1458+
]
1459+
},
14101460
{
14111461
"api_refs": [
14121462
"policyengine_us_data.release_promotion.artifacts.ReleaseArtifactSpec"
@@ -1996,9 +2046,9 @@
19962046
}
19972047
],
19982048
"metadata": {
1999-
"api_node_count": 96,
2049+
"api_node_count": 98,
20002050
"canonical_stage_count": 5,
2001-
"decorated_object_count": 154,
2051+
"decorated_object_count": 156,
20022052
"mapped_decorated_node_count": 58,
20032053
"stage_count": 17,
20042054
"substage_count": 17

modal_app/pipeline.py

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -670,19 +670,26 @@ def _write_release_promotion_contract_for_run(
670670
run_context: RunContext,
671671
rel_paths: list[str],
672672
promotion_result,
673-
) -> ArtifactReference:
674-
"""Write Stage 5's run-local contract and return its manifest reference."""
673+
) -> tuple[ArtifactReference, ...]:
674+
"""Write Stage 5's run-local index/contract and return manifest references."""
675675

676676
from policyengine_us_data.release_promotion import (
677677
build_legacy_release_candidate_bundle,
678+
build_published_artifact_index,
679+
published_artifact_index_artifact_ref,
680+
published_artifact_index_path,
681+
release_promotion_contract_repo_path,
678682
release_promotion_contract_path,
683+
write_published_artifact_index,
679684
write_release_promotion_contract,
680685
)
686+
from policyengine_us_data.stage_contracts import ArtifactRef
681687

682688
run_dir = _run_dir(run_context.run_id)
689+
release_context = _release_promotion_context_from_run_context(run_context)
683690
contract_path = release_promotion_contract_path(run_dir)
684691
candidate_bundle = build_legacy_release_candidate_bundle(
685-
context=_release_promotion_context_from_run_context(run_context),
692+
context=release_context,
686693
rel_paths=rel_paths,
687694
artifact_metadata_by_path=_release_artifact_metadata_by_path(
688695
run_context.run_id,
@@ -698,23 +705,61 @@ def _write_release_promotion_contract_for_run(
698705
run_context.run_id
699706
),
700707
)
708+
contract_artifact = ArtifactRef(
709+
logical_name="release_promotion_contract",
710+
uri=(
711+
f"hf://{release_context.hf_repo_name}/"
712+
f"{release_promotion_contract_repo_path(release_context.run_id)}"
713+
),
714+
media_type="application/json",
715+
metadata={
716+
"artifact_family": "stage_contract",
717+
"source_stage_id": "5_validate_and_promote_release",
718+
"relative_path": release_promotion_contract_repo_path(
719+
release_context.run_id
720+
),
721+
},
722+
)
723+
published_index_path = published_artifact_index_path(run_dir)
724+
published_index_rows = build_published_artifact_index(
725+
candidate_bundle=candidate_bundle,
726+
promotion_result=promotion_result,
727+
diagnostic_artifacts=(contract_artifact,),
728+
)
729+
write_published_artifact_index(published_index_rows, published_index_path)
730+
published_index_manifest_ref = ArtifactReference.from_path(
731+
published_index_path,
732+
role="index",
733+
base_dir=run_dir,
734+
media_type="application/jsonl",
735+
)
736+
published_index_artifact = published_artifact_index_artifact_ref(
737+
release_context,
738+
row_count=len(published_index_rows),
739+
sha256=f"sha256:{published_index_manifest_ref.sha256}",
740+
size_bytes=published_index_manifest_ref.size_bytes,
741+
)
701742
write_release_promotion_contract(
702743
contract_path=contract_path,
703744
candidate_bundle=candidate_bundle,
704745
promotion_result=promotion_result,
705746
created_at=datetime.now(timezone.utc).isoformat(),
706747
code_sha=meta.sha,
707748
package_version=meta.version,
749+
published_artifact_index=published_index_artifact,
708750
metadata={
709751
"writer": "modal_app.pipeline.promote_run",
710752
"branch": meta.branch,
711753
},
712754
)
713-
return ArtifactReference.from_path(
714-
contract_path,
715-
role="contract",
716-
base_dir=run_dir,
717-
media_type="application/json",
755+
return (
756+
ArtifactReference.from_path(
757+
contract_path,
758+
role="contract",
759+
base_dir=run_dir,
760+
media_type="application/json",
761+
),
762+
published_index_manifest_ref,
718763
)
719764

720765

@@ -2209,7 +2254,7 @@ def promote_run(
22092254
)
22102255
print(f" {promotion_stdout}")
22112256
promotion_result = _promotion_result_from_stdout(promotion_stdout)
2212-
release_promotion_contract_ref = _write_release_promotion_contract_for_run(
2257+
release_promotion_refs = _write_release_promotion_contract_for_run(
22132258
meta=meta,
22142259
run_context=promotion_context,
22152260
rel_paths=rel_paths,
@@ -2227,7 +2272,7 @@ def promote_run(
22272272
ArtifactReference.from_dict(artifact)
22282273
for artifact in promote_inputs["validated_step_outputs"]
22292274
],
2230-
release_promotion_contract_ref,
2275+
*release_promotion_refs,
22312276
],
22322277
reuse_decision="computed",
22332278
vol=pipeline_volume,

policyengine_us_data/release_promotion/__init__.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,19 @@
3333
release_promotion_contract_repo_path,
3434
write_release_promotion_contract,
3535
)
36+
from .published_index import (
37+
PUBLISHED_ARTIFACT_INDEX_FILENAME,
38+
PUBLISHED_ARTIFACT_INDEX_MEDIA_TYPE,
39+
PublishedArtifactIndexRow,
40+
build_published_artifact_index,
41+
published_artifact_index_artifact_ref,
42+
published_artifact_index_from_jsonl,
43+
published_artifact_index_path,
44+
published_artifact_index_repo_path,
45+
published_artifact_index_to_jsonl,
46+
read_published_artifact_index,
47+
write_published_artifact_index,
48+
)
3649
from .results import (
3750
CleanupPromotionResult,
3851
CompletionMarkerPromotionResult,
@@ -60,11 +73,14 @@
6073
"RELEASE_VALIDATION_SUBSTAGE_ID",
6174
"RELEASE_PROMOTION_CONTRACT_FILENAME",
6275
"RELEASE_PROMOTION_CONTRACT_TYPE",
76+
"PUBLISHED_ARTIFACT_INDEX_FILENAME",
77+
"PUBLISHED_ARTIFACT_INDEX_MEDIA_TYPE",
6378
"CleanupPromotionResult",
6479
"CompletionMarkerPromotionResult",
6580
"FullPromotionResult",
6681
"GcsPromotionResult",
6782
"HuggingFacePromotionResult",
83+
"PublishedArtifactIndexRow",
6884
"ReleaseArtifactSpec",
6985
"ReleaseCandidateInputBundle",
7086
"ReleasePromotionContractBuilder",
@@ -76,6 +92,7 @@
7692
"VALIDATION_REPORT_POLICY_PRESENCE_ONLY",
7793
"VALIDATION_REPORT_POLICY_REQUIRE_PASSING",
7894
"build_legacy_release_candidate_bundle",
95+
"build_published_artifact_index",
7996
"build_release_promotion_contract",
8097
"build_release_candidate_bundle_from_stage4_contract",
8198
"build_release_candidate_shape_report",
@@ -86,9 +103,16 @@
86103
"logical_name_for_release_path",
87104
"normalize_release_path",
88105
"parse_full_promotion_result_json",
106+
"published_artifact_index_artifact_ref",
107+
"published_artifact_index_from_jsonl",
108+
"published_artifact_index_path",
109+
"published_artifact_index_repo_path",
110+
"published_artifact_index_to_jsonl",
89111
"release_promotion_contract_path",
90112
"release_promotion_contract_repo_path",
113+
"read_published_artifact_index",
91114
"read_stage4_release_candidate_bundle",
92115
"strip_staging_prefix",
116+
"write_published_artifact_index",
93117
"write_release_promotion_contract",
94118
]

0 commit comments

Comments
 (0)