Skip to content

Commit c802a08

Browse files
authored
Merge pull request #741 from PolicyEngine/codex/release-manifest
Publish canonical release manifests for US data releases
2 parents 1a5239f + a737d1f commit c802a08

16 files changed

Lines changed: 2603 additions & 162 deletions

.github/workflows/pr.yaml

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,27 @@ jobs:
143143
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
144144
steps:
145145
- uses: actions/checkout@v4
146+
with:
147+
fetch-depth: 0
146148
- uses: actions/setup-python@v5
147149
with:
148150
python-version: "3.14"
149151
- run: pip install modal
150152
- name: Build datasets and run integration tests on Modal
151153
run: |
154+
STAGE_ARGS=""
155+
if git diff --name-only origin/main...HEAD | grep -qx 'pyproject.toml'; then
156+
VERSION=$(python -c "from pathlib import Path; import tomllib; print(tomllib.load(Path('pyproject.toml').open('rb'))['project']['version'])")
157+
STAGE_ARGS="--upload --stage-only --run-id=${VERSION}"
158+
{
159+
echo "## Release Artifact Staging"
160+
echo ""
161+
echo "- package version: \`${VERSION}\`"
162+
echo "- staged HF prefix: \`staging/${VERSION}/\`"
163+
echo "- promote with: \`uv run python policyengine_us_data/storage/upload_completed_datasets.py --promote-only --run-id=${VERSION} --version=${VERSION}\`"
164+
} >> "$GITHUB_STEP_SUMMARY"
165+
fi
166+
152167
modal run modal_app/data_build.py \
153-
--branch=${{ github.head_ref || github.ref_name }}
168+
--branch=${{ github.head_ref || github.ref_name }} \
169+
${STAGE_ARGS}

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ validate-package:
126126
python -m policyengine_us_data.calibration.validate_package
127127

128128
publish-local-area:
129-
python policyengine_us_data/calibration/publish_local_area.py --upload
129+
python -m policyengine_us_data.calibration.promote_local_h5s --local-dir local_area_build
130130

131131
build-h5s:
132132
python -m policyengine_us_data.calibration.publish_local_area \
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Preflight release-manifest finalization before promoting staged artifacts so failed finalization checks cannot partially publish a release.

modal_app/data_build.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,8 @@ def validate_and_maybe_upload_datasets(
238238
upload: bool,
239239
skip_enhanced_cps: bool,
240240
env: dict,
241+
stage_only: bool = False,
242+
run_id: str = "",
241243
) -> None:
242244
validation_args = ["--validate-only"]
243245
if skip_enhanced_cps:
@@ -254,6 +256,10 @@ def validate_and_maybe_upload_datasets(
254256
upload_args = []
255257
if skip_enhanced_cps:
256258
upload_args.append("--no-require-enhanced-cps")
259+
if stage_only:
260+
upload_args.append("--stage-only")
261+
if run_id:
262+
upload_args.append(f"--run-id={run_id}")
257263
run_script(
258264
"policyengine_us_data/storage/upload_completed_datasets.py",
259265
args=upload_args,
@@ -375,6 +381,7 @@ def build_datasets(
375381
clear_checkpoints: bool = False,
376382
skip_tests: bool = False,
377383
skip_enhanced_cps: bool = False,
384+
stage_only: bool = False,
378385
run_id: str = "",
379386
):
380387
"""Build all datasets with preemption-resilient checkpointing.
@@ -387,6 +394,7 @@ def build_datasets(
387394
skip_tests: Skip running the test suite (useful for calibration runs).
388395
skip_enhanced_cps: Skip enhanced_cps.py and small_enhanced_cps.py
389396
(useful for calibration runs that only need source_imputed H5).
397+
stage_only: Upload to HF staging only, without promoting a release.
390398
"""
391399
setup_gcp_credentials()
392400

@@ -673,6 +681,8 @@ def build_datasets(
673681
upload=upload,
674682
skip_enhanced_cps=skip_enhanced_cps,
675683
env=env,
684+
stage_only=stage_only,
685+
run_id=run_id,
676686
)
677687

678688
# Clean up checkpoints after successful completion
@@ -689,6 +699,8 @@ def main(
689699
clear_checkpoints: bool = False,
690700
skip_tests: bool = False,
691701
skip_enhanced_cps: bool = False,
702+
stage_only: bool = False,
703+
run_id: str = "",
692704
):
693705
result = build_datasets.remote(
694706
upload=upload,
@@ -697,5 +709,7 @@ def main(
697709
clear_checkpoints=clear_checkpoints,
698710
skip_tests=skip_tests,
699711
skip_enhanced_cps=skip_enhanced_cps,
712+
stage_only=stage_only,
713+
run_id=run_id,
700714
)
701715
print(result)

modal_app/local_area.py

Lines changed: 176 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
if _p not in sys.path:
2929
sys.path.insert(0, _p)
3030

31-
from modal_app.images import cpu_image as image
32-
from modal_app.resilience import reconcile_run_dir_fingerprint
31+
from modal_app.images import cpu_image as image # noqa: E402
32+
from modal_app.resilience import reconcile_run_dir_fingerprint # noqa: E402
3333

3434
app = modal.App("policyengine-us-data-local-area")
3535

@@ -70,6 +70,170 @@ def setup_repo(branch: str):
7070
os.chdir("/root/policyengine-us-data")
7171

7272

73+
def _build_promote_national_publish_script(
    *,
    version: str,
    run_id: str,
    rel_paths: list[str],
) -> str:
    """Return the source of the script that promotes the national H5.

    The returned text is a standalone Python program (the caller passes it
    to ``python -c``). In order, the generated program:

    1. promotes ``rel_paths`` from HuggingFace staging to production,
    2. uploads ``<VOLUME_MOUNT>/<run_id>/national/US.h5`` to GCS, raising
       ``RuntimeError`` if that file does not exist,
    3. publishes the release manifest, creating a tag and uploading the
       version manifest only when ``should_finalize_local_area_release``
       reports the release as complete,
    4. cleans up the staged copies.

    Args:
        version: Release version embedded in the generated script.
        run_id: Run identifier; also the sub-directory of ``VOLUME_MOUNT``
            that holds the built files.
        rel_paths: Repo-relative paths of the staged files to promote.

    Returns:
        The script source as a string.
    """
    # Embed every value as a repr() literal instead of pasting it between
    # hand-written quotes. repr() always yields a valid Python string literal,
    # so quotes, backslashes, or newlines in a value can neither break the
    # generated script nor inject code into it. For the JSON payload this also
    # keeps JSON escapes (\" and \\) intact until json.loads sees them; a
    # non-raw '''...''' literal would consume them first.
    version_literal = repr(str(version))
    run_id_literal = repr(str(run_id))
    rel_paths_literal = repr(json.dumps(rel_paths))
    volume_literal = repr(str(VOLUME_MOUNT))
    return f"""
import json
from pathlib import Path
from policyengine_us_data.utils.data_upload import (
    promote_staging_to_production_hf,
    cleanup_staging_hf,
    upload_local_area_file,
    publish_release_manifest_to_hf,
    should_finalize_local_area_release,
)
from policyengine_us_data.utils.version_manifest import (
    HFVersionInfo,
    build_manifest,
    upload_manifest,
)

version = {version_literal}
run_id = {run_id_literal}
rel_paths = json.loads({rel_paths_literal})
run_dir = Path({volume_literal}) / run_id

print(f"Promoting national H5 from staging to production (run_id={{run_id!r}})...")
promoted = promote_staging_to_production_hf(rel_paths, version, run_id=run_id)
print(f"Promoted {{promoted}} files to HuggingFace production")

national_h5 = run_dir / "national" / "US.h5"
if national_h5.exists():
    print("Uploading national H5 to GCS...")
    upload_local_area_file(
        str(national_h5), "national", version=version, skip_hf=True
    )
    print("Uploaded national H5 to GCS")
else:
    raise RuntimeError(f"Expected national H5 at {{national_h5}}")

print("Updating release manifest...")
should_finalize, missing_prefixes = should_finalize_local_area_release(
    version=version,
    new_repo_paths=["national/US.h5"],
)
manifest = publish_release_manifest_to_hf(
    [(national_h5, "national/US.h5")],
    version=version,
    create_tag=should_finalize,
)
if should_finalize:
    upload_manifest(
        build_manifest(
            version=version,
            blob_names=sorted(
                artifact["path"] for artifact in manifest["artifacts"].values()
            ),
            hf_info=HFVersionInfo(
                repo="policyengine/policyengine-us-data",
                commit=version,
            ),
        )
    )
    print("Updated release manifest and created tag")
else:
    print(
        "Updated release manifest without creating a tag; "
        f"missing prefixes: {{', '.join(missing_prefixes)}}"
    )

print("Cleaning up staging...")
cleaned = cleanup_staging_hf(rel_paths, version, run_id=run_id)
print(f"Cleaned up {{cleaned}} files from staging")
print(f"Successfully promoted national H5 for version {{version}}")
"""
150+
151+
152+
def _build_promote_publish_script(
    *,
    version: str,
    run_id: str,
    rel_paths: list[str],
) -> str:
    """Return the source of the script that promotes local-area files.

    The returned text is a standalone Python program (the caller passes it
    to ``python -c``). In order, the generated program:

    1. promotes ``rel_paths`` from HuggingFace staging to production,
    2. uploads each ``<VOLUME_MOUNT>/<run_id>/<rel_path>`` to GCS under the
       path's parent directory,
    3. publishes the release manifest, creating a tag and uploading the
       version manifest only when ``should_finalize_local_area_release``
       reports the release as complete,
    4. cleans up the staged copies.

    Args:
        version: Release version embedded in the generated script.
        run_id: Run identifier; also the sub-directory of ``VOLUME_MOUNT``
            that holds the built files.
        rel_paths: Repo-relative paths of the staged files to promote.

    Returns:
        The script source as a string.
    """
    # Embed every value as a repr() literal instead of pasting it between
    # hand-written quotes. repr() always yields a valid Python string literal,
    # so quotes, backslashes, or newlines in a value can neither break the
    # generated script nor inject code into it. For the JSON payload this also
    # keeps JSON escapes (\" and \\) intact until json.loads sees them; a
    # non-raw '''...''' literal would consume them first.
    version_literal = repr(str(version))
    run_id_literal = repr(str(run_id))
    rel_paths_literal = repr(json.dumps(rel_paths))
    volume_literal = repr(str(VOLUME_MOUNT))
    return f"""
import json
from pathlib import Path
from policyengine_us_data.utils.data_upload import (
    promote_staging_to_production_hf,
    cleanup_staging_hf,
    upload_local_area_file,
    publish_release_manifest_to_hf,
    should_finalize_local_area_release,
)
from policyengine_us_data.utils.version_manifest import (
    HFVersionInfo,
    build_manifest,
    upload_manifest,
)

rel_paths = json.loads({rel_paths_literal})
version = {version_literal}
run_id = {run_id_literal}
run_dir = Path({volume_literal}) / run_id

print(f"Promoting {{len(rel_paths)}} files from staging/ to production (run_id={{run_id!r}})...")
promoted = promote_staging_to_production_hf(rel_paths, version, run_id=run_id)
print(f"Promoted {{promoted}} files to HuggingFace production")

print(f"Uploading {{len(rel_paths)}} files to GCS...")
gcs_count = 0
for rel_path in rel_paths:
    local_path = run_dir / rel_path
    subdirectory = str(Path(rel_path).parent)
    upload_local_area_file(
        str(local_path),
        subdirectory,
        version=version,
        skip_hf=True,
    )
    gcs_count += 1
print(f"Uploaded {{gcs_count}} files to GCS")

print("Updating release manifest...")
should_finalize, missing_prefixes = should_finalize_local_area_release(
    version=version,
    new_repo_paths=rel_paths,
)
manifest = publish_release_manifest_to_hf(
    [(run_dir / rel_path, rel_path) for rel_path in rel_paths],
    version=version,
    create_tag=should_finalize,
)
if should_finalize:
    upload_manifest(
        build_manifest(
            version=version,
            blob_names=sorted(
                artifact["path"] for artifact in manifest["artifacts"].values()
            ),
            hf_info=HFVersionInfo(
                repo="policyengine/policyengine-us-data",
                commit=version,
            ),
        )
    )
    print("Updated release manifest and created tag")
else:
    print(
        "Updated release manifest without final tag; missing local-area prefixes: "
        + ", ".join(missing_prefixes)
    )
    print("Deferring version_manifest.json update until release finalization")

print("Cleaning up staging/...")
cleaned = cleanup_staging_hf(rel_paths, version, run_id=run_id)
print(f"Cleaned up {{cleaned}} files from staging/")

print(f"Successfully published version {{version}}")
"""
235+
236+
73237
def validate_artifacts(
74238
config_path: Path,
75239
artifact_dir: Path,
@@ -556,52 +720,17 @@ def promote_publish(branch: str = "main", version: str = "", run_id: str = "") -
556720
with open(manifest_path) as f:
557721
manifest = json.load(f)
558722

559-
rel_paths_json = json.dumps(list(manifest["files"].keys()))
560-
561723
result = subprocess.run(
562724
[
563725
"uv",
564726
"run",
565727
"python",
566728
"-c",
567-
f"""
568-
import json
569-
from pathlib import Path
570-
from policyengine_us_data.utils.data_upload import (
571-
promote_staging_to_production_hf,
572-
cleanup_staging_hf,
573-
upload_local_area_file,
574-
)
575-
576-
rel_paths = json.loads('''{rel_paths_json}''')
577-
version = "{version}"
578-
run_id = "{run_id}"
579-
run_dir = Path("{VOLUME_MOUNT}") / run_id
580-
581-
print(f"Promoting {{len(rel_paths)}} files from staging/ to production (run_id={{run_id!r}})...")
582-
promoted = promote_staging_to_production_hf(rel_paths, version, run_id=run_id)
583-
print(f"Promoted {{promoted}} files to HuggingFace production")
584-
585-
print(f"Uploading {{len(rel_paths)}} files to GCS...")
586-
gcs_count = 0
587-
for rel_path in rel_paths:
588-
local_path = run_dir / rel_path
589-
subdirectory = str(Path(rel_path).parent)
590-
upload_local_area_file(
591-
str(local_path),
592-
subdirectory,
593-
version=version,
594-
skip_hf=True,
595-
)
596-
gcs_count += 1
597-
print(f"Uploaded {{gcs_count}} files to GCS")
598-
599-
print("Cleaning up staging/...")
600-
cleaned = cleanup_staging_hf(rel_paths, version, run_id=run_id)
601-
print(f"Cleaned up {{cleaned}} files from staging/")
602-
603-
print(f"Successfully published version {{version}}")
604-
""",
729+
_build_promote_publish_script(
730+
version=version,
731+
run_id=run_id,
732+
rel_paths=list(manifest["files"].keys()),
733+
),
605734
],
606735
text=True,
607736
env=os.environ.copy(),
@@ -1133,39 +1262,11 @@ def promote_national_publish(
11331262
"run",
11341263
"python",
11351264
"-c",
1136-
f"""
1137-
import json
1138-
from pathlib import Path
1139-
from policyengine_us_data.utils.data_upload import (
1140-
promote_staging_to_production_hf,
1141-
cleanup_staging_hf,
1142-
upload_local_area_file,
1143-
)
1144-
1145-
version = "{version}"
1146-
run_id = "{run_id}"
1147-
rel_paths = {json.dumps(rel_paths)}
1148-
run_dir = Path("{VOLUME_MOUNT}") / run_id
1149-
1150-
print(f"Promoting national H5 from staging to production (run_id={{run_id!r}})...")
1151-
promoted = promote_staging_to_production_hf(rel_paths, version, run_id=run_id)
1152-
print(f"Promoted {{promoted}} files to HuggingFace production")
1153-
1154-
national_h5 = run_dir / "national" / "US.h5"
1155-
if national_h5.exists():
1156-
print("Uploading national H5 to GCS...")
1157-
upload_local_area_file(
1158-
str(national_h5), "national", version=version, skip_hf=True
1159-
)
1160-
print("Uploaded national H5 to GCS")
1161-
else:
1162-
print(f"WARNING: {{national_h5}} not on volume, skipping GCS")
1163-
1164-
print("Cleaning up staging...")
1165-
cleaned = cleanup_staging_hf(rel_paths, version, run_id=run_id)
1166-
print(f"Cleaned up {{cleaned}} files from staging")
1167-
print(f"Successfully promoted national H5 for version {{version}}")
1168-
""",
1265+
_build_promote_national_publish_script(
1266+
version=version,
1267+
run_id=run_id,
1268+
rel_paths=rel_paths,
1269+
),
11691270
],
11701271
text=True,
11711272
env=os.environ.copy(),

0 commit comments

Comments
 (0)