[TEST] Add push workflow for dry-run testing

anth-volk · claude · anth-volk · commit 02eb9fa89536 · 2026-03-31T00:10:33.000+02:00
DO NOT MERGE - test-only PR to validate push.yaml workflow.
Contains deliberate lint failure to prevent accidental merge.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml
@@ -0,0 +1,238 @@
+name: "[TEST] Push workflow dry run"
+
+on:
+  pull_request:
+    branches: [main]
+
+jobs:
+  # ── Lint ────────────────────────────────────────────────────
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install ruff  # requirement is quoted so the shell does not treat ">" as an output redirect
+        run: pip install "ruff>=0.9.0"
+      - name: Check formatting  # check-only: reports files that would be reformatted, changes nothing
+        run: ruff format --check .
+
+  # ── Per-dataset build and test on Modal ─────────────────────
+  build-and-test:  # builds each dataset on Modal in dependency order; integration tests run right after their build
+    runs-on: ubuntu-latest
+    needs: lint
+    if: github.event.head_commit.message != 'Update package version'  # NOTE(review): head_commit is a push-event field; presumably always true under pull_request — confirm
+    timeout-minutes: 240
+    env:
+      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+      HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - uses: astral-sh/setup-uv@v5
+      - name: Install Modal CLI
+        run: pip install modal
+      - name: Install package
+        run: uv sync --dev
+
+      - name: Initialize summary  # writes the table header; each later step appends one row on success
+        run: |
+          echo "## Data Build & Integration Tests" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "| Step | Status | Duration |" >> $GITHUB_STEP_SUMMARY
+          echo "|------|--------|----------|" >> $GITHUB_STEP_SUMMARY
+
+      # ── Phase 1: Download prerequisites ───────────────────
+      - name: "Build: download prerequisites"  # --branch (here and in every build step) reads runner env vars: PR head branch if set, else the triggering ref name
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script download_prerequisites \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| download_prerequisites | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      # ── Phase 1: Independent datasets (sequential) ────────
+      - name: "Build: uprating"
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script uprating \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| uprating | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Build: acs"
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script acs \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| acs | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Test: acs"
+        run: |
+          uv run pytest policyengine_us_data/tests/integration/test_acs.py -v
+          echo "| test_acs | :white_check_mark: | - |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Build: irs_puf"
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script irs_puf \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| irs_puf | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      # ── Phase 2: CPS and PUF (depend on Phase 1) ─────────
+      - name: "Build: cps"
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script cps \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| cps | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Test: cps"
+        run: |
+          uv run pytest policyengine_us_data/tests/integration/test_cps.py -v
+          echo "| test_cps | :white_check_mark: | - |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Build: puf"
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script puf \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| puf | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      # ── Phase 3: Extended CPS (depends on CPS + PUF) ─────
+      - name: "Build: extended_cps"
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script extended_cps \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| extended_cps | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Test: extended_cps"
+        run: |
+          uv run pytest policyengine_us_data/tests/integration/test_extended_cps.py -v
+          echo "| test_extended_cps | :white_check_mark: | - |" >> $GITHUB_STEP_SUMMARY
+
+      # ── Phase 4: Enhanced + Stratified CPS ────────────────
+      - name: "Build: enhanced_cps"
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script enhanced_cps \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| enhanced_cps | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Test: enhanced_cps"
+        run: |
+          uv run pytest policyengine_us_data/tests/integration/test_enhanced_cps.py -v
+          echo "| test_enhanced_cps | :white_check_mark: | - |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Build: stratified_cps"
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script stratified_cps \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| stratified_cps | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      # ── Phase 5: Source imputed + Small enhanced CPS ──────
+      - name: "Build: source_imputed_cps"
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script source_imputed_cps \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| source_imputed_cps | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Test: source_imputed_cps"
+        run: |
+          uv run pytest policyengine_us_data/tests/integration/test_source_imputed_cps_masking.py policyengine_us_data/tests/integration/test_source_imputed_cps_consistency.py -v
+          echo "| test_source_imputed_cps | :white_check_mark: | - |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Build: small_enhanced_cps"
+        run: |
+          START=$(date +%s)
+          modal run modal_app/data_build.py --script small_enhanced_cps \
+            --branch="${GITHUB_HEAD_REF:-$GITHUB_REF_NAME}"
+          ELAPSED=$(( $(date +%s) - START ))
+          echo "| small_enhanced_cps | :white_check_mark: | ${ELAPSED}s |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Test: small_enhanced_cps"
+        run: |
+          uv run pytest policyengine_us_data/tests/integration/test_small_enhanced_cps.py -v
+          echo "| test_small_enhanced_cps | :white_check_mark: | - |" >> $GITHUB_STEP_SUMMARY
+
+      # ── Remaining integration tests ───────────────────────
+      - name: "Test: sparse_enhanced_cps"
+        run: |
+          uv run pytest policyengine_us_data/tests/integration/test_sparse_enhanced_cps.py -v
+          echo "| test_sparse_enhanced_cps | :white_check_mark: | - |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Test: sipp_assets"
+        run: |
+          uv run pytest policyengine_us_data/tests/integration/test_sipp_assets.py -v
+          echo "| test_sipp_assets | :white_check_mark: | - |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Test: census_cps"
+        run: |
+          uv run pytest policyengine_us_data/tests/integration/test_census_cps.py -v
+          echo "| test_census_cps | :white_check_mark: | - |" >> $GITHUB_STEP_SUMMARY
+
+      - name: "Test: database_build"
+        run: |
+          uv run pytest policyengine_us_data/tests/integration/test_database_build.py -v
+          echo "| test_database_build | :white_check_mark: | - |" >> $GITHUB_STEP_SUMMARY
+
+  # ── Manual approval gate ────────────────────────────────────
+  approval-gate:
+    needs: build-and-test
+    runs-on: ubuntu-latest
+    environment: pipeline-approval  # NOTE(review): the manual approval presumably comes from this environment's protection rules, configured outside this file — confirm
+    steps:
+      - run: echo "Pipeline approved. Dispatching H5 build."
+
+  # ── Dispatch pipeline ───────────────────────────────────────
+  trigger-pipeline:
+    needs: approval-gate
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger pipeline workflow  # NOTE(review): dispatches pipeline.yaml on main with scope=all even in this pull_request dry run — confirm intended
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}  # NOTE(review): workflow dispatch presumably needs actions: write; no permissions block is set here — confirm repo default
+          script: |
+            await github.rest.actions.createWorkflowDispatch({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              workflow_id: 'pipeline.yaml',
+              ref: 'main',
+              inputs: { scope: 'all' }
+            })
+            console.log('Pipeline dispatched with scope=all')
+
+  # ── PyPI publish (version bump commits only) ────────────────
+  publish:  # builds the package and uploads it to PyPI; gated on lint and the commit-message condition below
+    runs-on: ubuntu-latest
+    needs: lint
+    if: github.event.head_commit.message == 'Update package version'  # NOTE(review): head_commit is a push-event field; under pull_request this job presumably never runs — confirm
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - uses: astral-sh/setup-uv@v5
+      - name: Install package
+        run: uv sync --dev
+      - name: Build package
+        run: uv run python -m build  # assumes the `build` package is installed by `uv sync --dev` — TODO confirm
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI }}
+          skip-existing: true
diff --git a/changelog.d/test-push-workflow.fixed.md b/changelog.d/test-push-workflow.fixed.md
@@ -0,0 +1 @@
+Test-only PR for push workflow dry run. Do not merge.
diff --git a/modal_app/data_build.py b/modal_app/data_build.py
@@ -79,13 +79,25 @@
 
 # Test modules to run individually for checkpoint tracking
 TEST_MODULES = [
-    "policyengine_us_data/tests/test_import.py",
-    "policyengine_us_data/tests/test_database.py",
-    "policyengine_us_data/tests/test_pandas3_compatibility.py",
-    "policyengine_us_data/tests/test_datasets/",
-    "policyengine_us_data/tests/test_calibration/",
+    "policyengine_us_data/tests/unit/",
+    "policyengine_us_data/tests/integration/",
 ]
 
+# --script short names -> script paths (the SCRIPT_OUTPUTS keys; download_prerequisites has no entry there)
+SCRIPT_SHORT_NAMES = {
+    "download_prerequisites": "policyengine_us_data/storage/download_private_prerequisites.py",
+    "uprating": "policyengine_us_data/utils/uprating.py",
+    "acs": "policyengine_us_data/datasets/acs/acs.py",
+    "irs_puf": "policyengine_us_data/datasets/puf/irs_puf.py",
+    "cps": "policyengine_us_data/datasets/cps/cps.py",
+    "puf": "policyengine_us_data/datasets/puf/puf.py",
+    "extended_cps": "policyengine_us_data/datasets/cps/extended_cps.py",
+    "enhanced_cps": "policyengine_us_data/datasets/cps/enhanced_cps.py",
+    "stratified_cps": "policyengine_us_data/calibration/create_stratified_cps.py",
+    "source_imputed_cps": "policyengine_us_data/calibration/create_source_imputed_cps.py",
+    "small_enhanced_cps": "policyengine_us_data/datasets/cps/small_enhanced_cps.py",
+}
+
 
 def setup_gcp_credentials():
     """Write GCP credentials JSON to a temp file for google.auth.default()."""
@@ -654,6 +666,68 @@ def build_datasets(
     return "Data build completed successfully"
 
 
+@app.function(
+    image=image,
+    secrets=[hf_secret, gcp_secret],
+    volumes={
+        VOLUME_MOUNT: checkpoint_volume,
+        PIPELINE_MOUNT: pipeline_volume,
+    },
+    memory=32768,
+    cpu=8.0,
+    timeout=14400,
+    nonpreemptible=True,
+)
+def run_single_script(
+    script_name: str,
+    branch: str = "main",
+) -> str:
+    """Run a single dataset build script with checkpointing.
+
+    Args:
+        script_name: Short name (e.g. 'cps') or full path to the script.
+        branch: Git branch for checkpoint scoping.
+
+    Returns:
+        "Completed <script_name>"; raises ValueError for unknown scripts.
+    """
+    setup_gcp_credentials()
+    os.chdir("/root/policyengine-us-data")
+
+    # Resolve short name to full path (full paths pass through unchanged)
+    script_path = SCRIPT_SHORT_NAMES.get(script_name, script_name)
+
+    # No SCRIPT_OUTPUTS entry; match resolved path so full paths work too
+    if script_path == SCRIPT_SHORT_NAMES["download_prerequisites"]:
+        run_script(script_path)
+        checkpoint_volume.commit()
+        return f"Completed {script_name}"
+
+    output_files = SCRIPT_OUTPUTS.get(script_path)
+    if output_files is None:
+        raise ValueError(
+            f"Unknown script: {script_name}. "
+            f"Valid names: {', '.join(SCRIPT_SHORT_NAMES)}"
+        )
+
+    # Restore checkpointed outputs of every other script (possible inputs)
+    for dep_path, dep_outputs in SCRIPT_OUTPUTS.items():
+        if dep_path == script_path:
+            continue
+        if isinstance(dep_outputs, str):
+            dep_outputs = [dep_outputs]
+        for dep_output in dep_outputs:
+            restore_from_checkpoint(branch, dep_output)
+
+    run_script_with_checkpoint(
+        script_path,
+        output_files,
+        branch,
+        checkpoint_volume,
+    )
+    return f"Completed {script_name}"
+
+
 @app.local_entrypoint()
 def main(
     upload: bool = False,
@@ -662,13 +736,21 @@ def main(
     clear_checkpoints: bool = False,
     skip_tests: bool = False,
     skip_enhanced_cps: bool = False,
+    script: str = "",
 ):
-    result = build_datasets.remote(
-        upload=upload,
-        branch=branch,
-        sequential=sequential,
-        clear_checkpoints=clear_checkpoints,
-        skip_tests=skip_tests,
-        skip_enhanced_cps=skip_enhanced_cps,
-    )
-    print(result)
+    if script:
+        result = run_single_script.remote(
+            script_name=script,
+            branch=branch,
+        )
+        print(result)
+    else:
+        result = build_datasets.remote(
+            upload=upload,
+            branch=branch,
+            sequential=sequential,
+            clear_checkpoints=clear_checkpoints,
+            skip_tests=skip_tests,
+            skip_enhanced_cps=skip_enhanced_cps,
+        )
+        print(result)
diff --git a/policyengine_us_data/__init__.py b/policyengine_us_data/__init__.py
@@ -1,2 +1,3 @@
 from .datasets import *
 from .geography import ZIP_CODE_DATASET
+DO_NOT_MERGE_THIS_PR =        True   # deliberate lint failure

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Test-only PR for push workflow dry run. Do not merge.`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`from .datasets import *`
`2`	`2`	`from .geography import ZIP_CODE_DATASET`
	`3`	`+DO_NOT_MERGE_THIS_PR = True # deliberate lint failure`