Skip to content

Commit 2d2374f

Browse files
authored
Merge pull request #819 from PolicyEngine/maria/chunk_parallelization
Pipeline environment fixes
2 parents 6b3d2de + 74f59dd commit 2d2374f

29 files changed

Lines changed: 624 additions & 228 deletions

.github/workflows/pr.yaml

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,26 @@ jobs:
8686
env:
8787
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
8888

89+
optimized-integration-tests:
90+
runs-on: ubuntu-latest
91+
needs: [check-fork, lint]
92+
env:
93+
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
94+
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
95+
MODAL_ENVIRONMENT: main
96+
MODAL_APP_NAME: policyengine-us-data-pipeline
97+
steps:
98+
- uses: actions/checkout@v4
99+
- uses: actions/setup-python@v5
100+
with:
101+
python-version: "3.14"
102+
- name: Install optimized test deps
103+
run: pip install modal pytest
104+
- name: Deploy Modal pipeline app
105+
run: modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/pipeline.py
106+
- name: Run optimized integration tests
107+
run: python -m pytest tests/optimized/test_modal_pipeline_seams.py -v
108+
89109
smoke-test:
90110
runs-on: ubuntu-latest
91111
needs: [check-fork, lint]
@@ -140,6 +160,7 @@ jobs:
140160
env:
141161
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
142162
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
163+
MODAL_ENVIRONMENT: main
143164
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
144165
steps:
145166
- uses: actions/checkout@v4
@@ -164,6 +185,6 @@ jobs:
164185
} >> "$GITHUB_STEP_SUMMARY"
165186
fi
166187
167-
modal run modal_app/data_build.py \
188+
modal run --env="${MODAL_ENVIRONMENT}" modal_app/data_build.py \
168189
--branch="${GITHUB_HEAD_REF:-${GITHUB_REF_NAME}}" \
169190
${STAGE_ARGS}

.github/workflows/push.yaml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,29 +12,32 @@ jobs:
1212
- run: pip install "ruff>=0.9.0"  # quoted so the shell does not parse ">=" as an output redirect
1313
- run: ruff format --check .
1414

15-
# ── Build datasets and run integration tests ────────────────
16-
build-and-test:
15+
# ── Build and linear integration tests ──────────────────────
16+
build-and-linear-integration-tests:
17+
name: Build and linear integration tests
1718
runs-on: ubuntu-latest
1819
needs: lint
1920
if: github.event.head_commit.message != 'Update package version'
2021
env:
2122
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
2223
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
24+
MODAL_ENVIRONMENT: main
2325
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
2426
steps:
2527
- uses: actions/checkout@v4
2628
- uses: actions/setup-python@v5
2729
with:
2830
python-version: "3.14"
2931
- run: pip install modal
30-
- name: Build datasets and run integration tests on Modal
32+
- name: Run linear integration tests on Modal
3133
run: |
32-
modal run modal_app/data_build.py \
34+
modal run --env="${MODAL_ENVIRONMENT}" modal_app/data_build.py \
3335
--upload \
3436
--branch="${GITHUB_REF_NAME}"
3537
3638
# ── Documentation ──────────────────────────────────────────
3739
docs:
40+
name: Documentation
3841
runs-on: ubuntu-latest
3942
if: github.event.head_commit.message != 'Update package version'
4043
permissions:
@@ -62,6 +65,7 @@ jobs:
6265

6366
# ── Versioning (bump + changelog on non-version-bump pushes) ──
6467
versioning:
68+
name: Versioning
6569
runs-on: ubuntu-latest
6670
if: github.event.head_commit.message != 'Update package version'
6771
steps:

Makefile

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,10 @@ changelog:
4141
python .github/bump_version.py
4242
towncrier build --yes --version $$(python -c "import re; print(re.search(r'version = \"(.+?)\"', open('pyproject.toml').read()).group(1))")
4343
download:
44-
python policyengine_us_data/storage/download_private_prerequisites.py
44+
python -m policyengine_us_data.storage.download_private_prerequisites
4545

4646
upload:
47-
python policyengine_us_data/storage/upload_completed_datasets.py
47+
python -m policyengine_us_data.storage.upload_completed_datasets
4848

4949
docker:
5050
docker buildx build --platform linux/amd64 . -t policyengine-us-data:latest
@@ -78,19 +78,19 @@ DATABASE_YEAR ?= 2024
7878

7979
database:
8080
rm -f policyengine_us_data/storage/calibration/policy_data.db
81-
python policyengine_us_data/db/create_database_tables.py
82-
python policyengine_us_data/db/create_initial_strata.py --year $(YEAR)
83-
python policyengine_us_data/db/etl_national_targets.py --year $(YEAR)
84-
python policyengine_us_data/db/etl_age.py --year $(YEAR)
85-
python policyengine_us_data/db/etl_medicaid.py --year $(YEAR)
86-
python policyengine_us_data/db/etl_snap.py --year $(YEAR)
87-
python policyengine_us_data/db/etl_tanf.py --year $(YEAR)
88-
python policyengine_us_data/db/etl_state_income_tax.py --year $(YEAR)
89-
python policyengine_us_data/db/etl_irs_soi.py --year $(YEAR)
90-
python policyengine_us_data/db/etl_aca_agi_state_targets.py --year $(YEAR)
91-
python policyengine_us_data/db/etl_aca_marketplace.py --year $(YEAR)
92-
python policyengine_us_data/db/etl_pregnancy.py --year $(YEAR)
93-
python policyengine_us_data/db/validate_database.py
81+
python -m policyengine_us_data.db.create_database_tables
82+
python -m policyengine_us_data.db.create_initial_strata --year $(YEAR)
83+
python -m policyengine_us_data.db.etl_national_targets --year $(YEAR)
84+
python -m policyengine_us_data.db.etl_age --year $(YEAR)
85+
python -m policyengine_us_data.db.etl_medicaid --year $(YEAR)
86+
python -m policyengine_us_data.db.etl_snap --year $(YEAR)
87+
python -m policyengine_us_data.db.etl_tanf --year $(YEAR)
88+
python -m policyengine_us_data.db.etl_state_income_tax --year $(YEAR)
89+
python -m policyengine_us_data.db.etl_irs_soi --year $(YEAR)
90+
python -m policyengine_us_data.db.etl_aca_agi_state_targets --year $(YEAR)
91+
python -m policyengine_us_data.db.etl_aca_marketplace --year $(YEAR)
92+
python -m policyengine_us_data.db.etl_pregnancy --year $(YEAR)
93+
python -m policyengine_us_data.db.validate_database
9494

9595
database-refresh:
9696
rm -f policyengine_us_data/storage/calibration/policy_data.db
@@ -105,18 +105,18 @@ promote-dataset:
105105
@echo "Dataset promoted to HF."
106106

107107
data: download database
108-
python policyengine_us_data/utils/uprating.py
109-
python policyengine_us_data/datasets/acs/acs.py
110-
python policyengine_us_data/datasets/cps/cps.py
111-
python policyengine_us_data/datasets/puf/irs_puf.py
112-
python policyengine_us_data/datasets/puf/puf.py
113-
python policyengine_us_data/datasets/cps/extended_cps.py
114-
python policyengine_us_data/calibration/create_stratified_cps.py
115-
python policyengine_us_data/calibration/create_source_imputed_cps.py
108+
python -m policyengine_us_data.utils.uprating
109+
python -m policyengine_us_data.datasets.acs.acs
110+
python -m policyengine_us_data.datasets.cps.cps
111+
python -m policyengine_us_data.datasets.puf.irs_puf
112+
python -m policyengine_us_data.datasets.puf.puf
113+
python -m policyengine_us_data.datasets.cps.extended_cps
114+
python -m policyengine_us_data.calibration.create_stratified_cps
115+
python -m policyengine_us_data.calibration.create_source_imputed_cps
116116

117117
data-legacy: data
118-
python policyengine_us_data/datasets/cps/enhanced_cps.py
119-
python policyengine_us_data/datasets/cps/small_enhanced_cps.py
118+
python -m policyengine_us_data.datasets.cps.enhanced_cps
119+
python -m policyengine_us_data.datasets.cps.small_enhanced_cps
120120

121121
calibrate: data
122122
python -m policyengine_us_data.calibration.unified_calibration \
@@ -151,7 +151,7 @@ validate-data:
151151
python -c "from policyengine_us_data.storage.upload_completed_datasets import validate_all_datasets; validate_all_datasets()"
152152

153153
refresh-soi-targets:
154-
python policyengine_us_data/storage/calibration_targets/refresh_soi_table_targets.py \
154+
python -m policyengine_us_data.storage.calibration_targets.refresh_soi_table_targets \
155155
--source-year $(SOI_SOURCE_YEAR) \
156156
--target-year $(SOI_TARGET_YEAR)
157157

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Refactor Modal runtime setup to rely on `Image.uv_sync()` and the active Python interpreter rather than manual venv wiring, and add an optimized deployed-image seam test lane to the pull request workflow.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Activate the uv-managed venv inside Modal pipeline containers so module-scope imports from `policyengine_us_data` (notably `pandas` via `geography/__init__.py`) resolve at container boot. `uv sync --frozen` installs dependencies into `/root/policyengine-us-data/.venv/`, but Modal boots the container with the system Python, so `pipeline.py` imports were failing with `ModuleNotFoundError: No module named 'pandas'`. The image now sets `VIRTUAL_ENV`, prepends `.venv/bin` to `PATH`, and adds the venv site-packages to `PYTHONPATH`.

modal_app/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ Produces `source_imputed_stratified_extended_cps_2024.h5` from raw CPS/PUF/ACS/S
145145
| Method | Command |
146146
|--------|---------|
147147
| **Local** | `make data` |
148-
| **Modal (CI)** | `modal run modal_app/data_build.py --branch=<branch>` |
148+
| **Modal (CI)** | `modal run --env=main modal_app/data_build.py --branch=<branch>` |
149149
| **GitHub Actions** | Automatic on merge to `main` via `code_changes.yaml` → `reusable_test.yaml` (with `full_suite: true`). Also triggered by `pr_code_changes.yaml` on PRs. |
150150

151151
Notes:

modal_app/data_build.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,11 @@
8787
]
8888

8989

90+
def _python_cmd(*args: str) -> list[str]:
91+
"""Build a command that uses the current interpreter."""
92+
return [sys.executable, *args]
93+
94+
9095
def setup_gcp_credentials():
9196
"""Write GCP credentials JSON to a temp file for google.auth.default()."""
9297
creds_json = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS_JSON")
@@ -207,7 +212,7 @@ def run_script(
207212
env: Optional[dict] = None,
208213
log_file: IO = None,
209214
) -> str:
210-
"""Run a script with uv and return its path for logging.
215+
"""Run a script with the current interpreter and return its path.
211216
212217
Args:
213218
script_path: Path to the Python script to run.
@@ -220,7 +225,15 @@ def run_script(
220225
Raises:
221226
subprocess.CalledProcessError: If the script fails.
222227
"""
223-
cmd = ["uv", "run", "python", "-u", script_path]
228+
script = Path(script_path)
229+
if (
230+
script.suffix == ".py"
231+
and script.parts
232+
and script.parts[0] in {"policyengine_us_data", "modal_app"}
233+
):
234+
cmd = _python_cmd("-u", "-m", ".".join(script.with_suffix("").parts))
235+
else:
236+
cmd = _python_cmd("-u", script_path)
224237
if args:
225238
cmd.extend(args)
226239
run_env = env or os.environ.copy()
@@ -371,7 +384,7 @@ def run_tests_with_checkpoints(
371384

372385
print(f"Running tests: {module}")
373386
result = subprocess.run(
374-
["uv", "run", "python", "-u", "-m", "pytest", module, "-v"],
387+
_python_cmd("-u", "-m", "pytest", module, "-v"),
375388
env=env,
376389
)
377390

@@ -467,7 +480,7 @@ def build_datasets(
467480
)
468481
# Build policy_data.db from source
469482
subprocess.run(
470-
["uv", "run", "make", "database"],
483+
["make", "database"],
471484
check=True,
472485
cwd="/root/policyengine-us-data",
473486
env=env,

modal_app/images.py

Lines changed: 70 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
"""
77

88
import subprocess
9-
import modal
109
from pathlib import Path
10+
from typing import Callable
11+
12+
import modal
1113

1214
REPO_ROOT = Path(__file__).resolve().parent.parent
1315

@@ -30,41 +32,85 @@
3032
except Exception:
3133
pass
3234

33-
_IGNORE = [
34-
".git",
35-
"__pycache__",
36-
"*.egg-info",
37-
".pytest_cache",
38-
"*.h5",
39-
"*.npy",
40-
"*.pkl",
41-
"*.db",
42-
"node_modules",
43-
"venv",
44-
".venv",
45-
"docs/_build",
46-
"paper",
47-
"presentations",
48-
]
35+
36+
# Extra paths the Modal image must never include, beyond what .gitignore
37+
# already covers. `.git` holds hundreds of MB of pack data that Modal never
38+
# reads; `paper` and `presentations` are authoring directories.
39+
_MODAL_EXTRA_IGNORE = {".git", "paper", "presentations"}
40+
41+
42+
def _build_ignore_callable(repo_root: Path) -> Callable[[Path], bool]:
43+
"""Return an ignore predicate that mirrors .gitignore for Modal.
44+
45+
Modal's `add_local_dir(ignore=...)` uses dockerignore semantics, where
46+
bare patterns like `*.h5` only match root-level files. Our `.gitignore`
47+
mixes gitignore semantics (bare names match at any depth). Translating
48+
patterns by hand is error-prone and drifts over time. Instead, we ask
49+
git directly for the set of locally-ignored paths and exclude those.
50+
Untracked-but-not-ignored files still ship so uncommitted edits to
51+
Modal code (e.g. `modal_app/images.py` itself) make it into the image.
52+
"""
53+
repo_root = repo_root.resolve()
54+
ignored_paths: set[Path] = set()
55+
try:
56+
out = subprocess.check_output(
57+
[
58+
"git",
59+
"-C",
60+
str(repo_root),
61+
"ls-files",
62+
"--others",
63+
"--ignored",
64+
"--exclude-standard",
65+
"--directory",
66+
],
67+
text=True,
68+
stderr=subprocess.DEVNULL,
69+
)
70+
for line in out.splitlines():
71+
entry = line.strip().rstrip("/")
72+
if entry:
73+
ignored_paths.add((repo_root / entry).resolve())
74+
except (subprocess.CalledProcessError, FileNotFoundError):
75+
pass
76+
77+
for name in _MODAL_EXTRA_IGNORE:
78+
ignored_paths.add((repo_root / name).resolve())
79+
80+
def should_ignore(path: Path) -> bool:
81+
try:
82+
resolved = (repo_root / path).resolve()  # anchor relative paths at repo_root, not the process cwd; absolute paths pass through unchanged
83+
except (OSError, ValueError):
84+
return False
85+
if resolved in ignored_paths:
86+
return True
87+
for parent in resolved.parents:
88+
if parent in ignored_paths:
89+
return True
90+
if parent == repo_root:
91+
return False
92+
return False
93+
94+
return should_ignore
4995

5096

5197
def _base_image(extras: list[str] | None = None):
52-
extra_flags = " ".join(f"--extra {e}" for e in (extras or []))
5398
return (
5499
modal.Image.debian_slim(python_version="3.14")
55100
.apt_install("git", "make")
56-
.pip_install("uv>=0.8")
101+
.uv_sync(
102+
uv_project_dir=str(REPO_ROOT),
103+
frozen=True,
104+
extras=extras,
105+
)
57106
.add_local_dir(
58107
str(REPO_ROOT),
59108
remote_path="/root/policyengine-us-data",
60109
copy=True,
61-
ignore=_IGNORE,
110+
ignore=_build_ignore_callable(REPO_ROOT),
62111
)
112+
.workdir("/root/policyengine-us-data")
63113
.env(GIT_ENV)
64-
.run_commands(
65-
f"cd /root/policyengine-us-data && "
66-
f"UV_HTTP_TIMEOUT=300 uv sync --frozen {extra_flags}"
67-
)
68114
)
69115

70116

0 commit comments

Comments
 (0)