Skip to content

Commit 3cdf5ba

Browse files
baogorek and claude committed
Scope HF staging by run_id, decouple upload from validation, deduplicate Modal image
- Add run_id parameter to staging/promote/cleanup functions in data_upload.py so HF paths become staging/{run_id}/... instead of flat staging/
- Generate run_id in coordinate_publish/coordinate_national_publish if not provided
- Store run_id in manifest.json; promote_publish reads it back as fallback
- Downgrade manifest verification failure from hard error to warning so uploads proceed even if checksums have issues
- Add --run-id CLI arg to validate_staging, check_staging_sums, promote_local_h5s
- Thread run_id through pipeline.py spawn/promote calls
- Consolidate duplicated Modal image definition into images.py (addresses PR #611 review)
- All changes are backward-compatible: run_id="" preserves flat staging/ paths

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b520c2f commit 3cdf5ba

11 files changed

Lines changed: 187 additions & 269 deletions

File tree

Makefile

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -211,11 +211,13 @@ promote:
211211

212212
validate-staging:
213213
python -m policyengine_us_data.calibration.validate_staging \
214-
--area-type states --output validation_results.csv
214+
--area-type states --output validation_results.csv \
215+
$(if $(RUN_ID),--run-id $(RUN_ID))
215216

216217
validate-staging-full:
217218
python -m policyengine_us_data.calibration.validate_staging \
218-
--area-type states,districts --output validation_results.csv
219+
--area-type states,districts --output validation_results.csv \
220+
$(if $(RUN_ID),--run-id $(RUN_ID))
219221

220222
upload-validation:
221223
python -c "from policyengine_us_data.utils.huggingface import upload; \
@@ -224,11 +226,13 @@ upload-validation:
224226
'calibration/logs/validation_results.csv')"
225227

226228
check-staging:
227-
python -m policyengine_us_data.calibration.check_staging_sums
229+
python -m policyengine_us_data.calibration.check_staging_sums \
230+
$(if $(RUN_ID),--run-id $(RUN_ID))
228231

229232
check-sanity:
230233
python -m policyengine_us_data.calibration.validate_staging \
231-
--sanity-only --area-type states --areas NC
234+
--sanity-only --area-type states --areas NC \
235+
$(if $(RUN_ID),--run-id $(RUN_ID))
232236

233237
build-data-modal:
234238
modal run --detach modal_app/data_build.py::main --branch $(BRANCH) --upload --skip-tests

modal_app/data_build.py

Lines changed: 2 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111

1212
import modal
1313

14+
from modal_app.images import cpu_image as image
15+
1416
app = modal.App("policyengine-us-data")
1517

1618
hf_secret = modal.Secret.from_name("huggingface-token")
@@ -29,50 +31,6 @@
2931
)
3032
PIPELINE_MOUNT = "/pipeline"
3133

32-
_REPO_ROOT = Path(__file__).resolve().parent.parent
33-
34-
try:
35-
_LOCAL_SHA = subprocess.check_output(
36-
["git", "rev-parse", "HEAD"],
37-
text=True,
38-
stderr=subprocess.DEVNULL,
39-
cwd=str(_REPO_ROOT),
40-
).strip()
41-
except Exception:
42-
_LOCAL_SHA = None
43-
44-
_IGNORE = [
45-
".git",
46-
"__pycache__",
47-
"*.egg-info",
48-
".pytest_cache",
49-
"*.h5",
50-
"*.npy",
51-
"*.pkl",
52-
"*.db",
53-
"node_modules",
54-
"venv",
55-
".venv",
56-
"docs/_build",
57-
"paper",
58-
"presentations",
59-
]
60-
image = (
61-
modal.Image.debian_slim(python_version="3.13")
62-
.apt_install("git")
63-
.pip_install("uv>=0.8")
64-
.add_local_dir(
65-
str(_REPO_ROOT),
66-
remote_path="/root/policyengine-us-data",
67-
copy=True,
68-
ignore=_IGNORE,
69-
)
70-
.env({"BUILD_COMMIT_SHA": _LOCAL_SHA or ""})
71-
.run_commands(
72-
"cd /root/policyengine-us-data && UV_HTTP_TIMEOUT=300 uv sync --frozen"
73-
)
74-
)
75-
7634
VOLUME_MOUNT = "/checkpoints"
7735
_volume_lock = threading.Lock()
7836

modal_app/images.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,32 @@
55
changes, the image rebuilds; if not, the cached layer is reused.
66
"""
77

8+
import subprocess
89
import modal
910
from pathlib import Path
1011

1112
REPO_ROOT = Path(__file__).resolve().parent.parent
1213

13-
_ignore = [
14+
GIT_ENV = {}
15+
try:
16+
GIT_ENV["GIT_COMMIT"] = (
17+
subprocess.check_output(["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL)
18+
.decode()
19+
.strip()
20+
)
21+
GIT_ENV["GIT_BRANCH"] = (
22+
subprocess.check_output(
23+
["git", "rev-parse", "--abbrev-ref", "HEAD"],
24+
stderr=subprocess.DEVNULL,
25+
)
26+
.decode()
27+
.strip()
28+
)
29+
GIT_ENV["BUILD_COMMIT_SHA"] = GIT_ENV["GIT_COMMIT"]
30+
except Exception:
31+
pass
32+
33+
_IGNORE = [
1434
".git",
1535
"__pycache__",
1636
"*.egg-info",
@@ -38,8 +58,9 @@ def _base_image(extras: list[str] | None = None):
3858
str(REPO_ROOT),
3959
remote_path="/root/policyengine-us-data",
4060
copy=True,
41-
ignore=_ignore,
61+
ignore=_IGNORE,
4262
)
63+
.env(GIT_ENV)
4364
.run_commands(
4465
f"cd /root/policyengine-us-data && "
4566
f"UV_HTTP_TIMEOUT=300 uv sync --frozen {extra_flags}"

0 commit comments

Comments
 (0)