Centralize long-run calibration weight access (#1035)

MaxGhenis · web-flow · commit 35321f9b455e · 2026-05-20T08:58:33.000-04:00
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -165,10 +165,20 @@ jobs:
       MODAL_PROXY_TOKEN_SECRET: ${{ secrets.MODAL_PROXY_TOKEN_SECRET }}
       HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
       GOOGLE_APPLICATION_CREDENTIALS: ${{ secrets.GOOGLE_APPLICATION_CREDENTIALS }}
-      MODAL_ENVIRONMENT: staging-us-data-pr-${{ github.event.pull_request.number }}
-      MODAL_APP_NAME: policyengine-us-data-pipeline
-      MODAL_LOCAL_AREA_APP_NAME: policyengine-us-data-local-area
-      MODAL_H5_TEST_HARNESS_APP_NAME: policyengine-us-data-h5-test-harness
+      # Modal PR environments cannot reliably receive secrets with the CI token.
+      # Deploy isolated PR apps and volumes into main, where required secrets
+      # already exist, then stop/delete the PR resources in cleanup steps.
+      MODAL_ENVIRONMENT: main
+      MODAL_APP_NAME: us-data-pipeline-pr-${{ github.event.pull_request.number }}-${{ github.run_id }}-${{ github.run_attempt }}
+      MODAL_LOCAL_AREA_APP_NAME: us-data-local-area-pr-${{ github.event.pull_request.number }}-${{ github.run_id }}-${{ github.run_attempt }}
+      MODAL_H5_TEST_HARNESS_APP_NAME: us-data-h5-pr-${{ github.event.pull_request.number }}-${{ github.run_id }}-${{ github.run_attempt }}
+      US_DATA_PIPELINE_APP_NAME: us-data-pipeline-pr-${{ github.event.pull_request.number }}-${{ github.run_id }}-${{ github.run_attempt }}
+      US_DATA_MODAL_APP_NAME: us-data-pipeline-pr-${{ github.event.pull_request.number }}-${{ github.run_id }}-${{ github.run_attempt }}
+      US_DATA_LOCAL_AREA_APP_NAME: us-data-local-area-pr-${{ github.event.pull_request.number }}-${{ github.run_id }}-${{ github.run_attempt }}
+      US_DATA_H5_HARNESS_APP_NAME: us-data-h5-pr-${{ github.event.pull_request.number }}-${{ github.run_id }}-${{ github.run_attempt }}
+      US_DATA_PIPELINE_VOLUME_NAME: pipeline-artifacts-pr-${{ github.event.pull_request.number }}-${{ github.run_id }}-${{ github.run_attempt }}
+      US_DATA_STAGING_VOLUME_NAME: local-area-staging-pr-${{ github.event.pull_request.number }}-${{ github.run_id }}-${{ github.run_attempt }}
+      US_DATA_CHECKPOINT_VOLUME_NAME: data-build-checkpoints-pr-${{ github.event.pull_request.number }}-${{ github.run_id }}-${{ github.run_attempt }}
     steps:
       - uses: actions/checkout@v6
       - uses: actions/setup-python@v6
@@ -178,15 +188,11 @@ jobs:
       - run: uv sync --dev
       - name: Install integration test deps
         run: uv pip install modal pytest numpy pandas
-      - name: Ensure PR Modal environment exists
-        run: uv run python .github/scripts/ensure_modal_environment.py
-      - name: Sync Modal secrets to PR environment
-        run: uv run python .github/scripts/sync_modal_secrets.py
-      - name: Deploy Modal pipeline app to PR staging
+      - name: Deploy PR Modal pipeline app
         run: uv run modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/pipeline.py
-      - name: Deploy Modal local-area app to PR staging
+      - name: Deploy PR Modal local-area app
         run: uv run modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/local_area.py
-      - name: Deploy Modal H5 test harness to PR staging
+      - name: Deploy PR Modal H5 test harness
         run: uv run modal deploy --env="${MODAL_ENVIRONMENT}" modal_app/h5_test_harness.py
       - name: Run integration tests
         run: >
@@ -204,9 +210,32 @@ jobs:
           tests/integration/test_tiny_h5_pipeline.py
           tests/integration/test_modal_pipeline_e2e.py
           -v
-      - name: Cleanup PR Modal environment
+      - name: Stop PR Modal apps
         if: always()
-        run: uv run python .github/scripts/delete_modal_environment.py
+        run: |
+          for app_name in \
+            "${MODAL_H5_TEST_HARNESS_APP_NAME}" \
+            "${MODAL_LOCAL_AREA_APP_NAME}" \
+            "${MODAL_APP_NAME}"
+          do
+            yes | uv run modal app stop \
+              --env="${MODAL_ENVIRONMENT}" \
+              "${app_name}" || true
+          done
+      - name: Delete PR Modal volumes
+        if: always()
+        run: |
+          for volume_name in \
+            "${US_DATA_STAGING_VOLUME_NAME}" \
+            "${US_DATA_PIPELINE_VOLUME_NAME}" \
+            "${US_DATA_CHECKPOINT_VOLUME_NAME}"
+          do
+            uv run modal volume delete \
+              --env="${MODAL_ENVIRONMENT}" \
+              --allow-missing \
+              --yes \
+              "${volume_name}" || true
+          done
 
   smoke-test:
     runs-on: ubuntu-latest
diff --git a/changelog.d/1033.fixed.md b/changelog.d/1033.fixed.md
@@ -0,0 +1 @@
+Centralized long-run calibration weight access so baseline diagnostics use PolicyEngine weighted operations, hardened PR Modal integration isolation, and added retries with exponential backoff to the Census county lookup used by local-area H5 builds.
diff --git a/policyengine_us_data/calibration/block_assignment.py b/policyengine_us_data/calibration/block_assignment.py
@@ -22,6 +22,7 @@
 """
 
 import random
+import time
 import unicodedata
 from functools import lru_cache
 from io import StringIO
@@ -63,6 +64,35 @@ def get_tract_geoid_from_block(block_geoid: str) -> str:
 
 # === County FIPS to Enum Mapping ===
 
+COUNTY_FIPS_2020_URL = (
+    "https://www2.census.gov/geo/docs/reference/codes2020/national_county2020.txt"
+)
+COUNTY_FIPS_DOWNLOAD_ATTEMPTS = 5
+COUNTY_FIPS_RETRY_BACKOFF_SECONDS = 1.0
+
+
+def _county_fips_session() -> requests.Session:
+    """Build a fresh ``requests.Session`` for the county FIPS download."""
+    county_session = requests.Session()
+    return county_session
+
+
+def _download_county_fips_2020(
+    session: requests.Session | None = None,
+) -> str:
+    """
+    Download the Census 2020 national county FIPS file as UTF-8 text.
+
+    The request is retried up to ``COUNTY_FIPS_DOWNLOAD_ATTEMPTS`` times on
+    any ``requests.RequestException``, sleeping
+    ``COUNTY_FIPS_RETRY_BACKOFF_SECONDS * 2**attempt`` between attempts.
+
+    Args:
+        session: Optional session used to issue the request. When omitted, a
+            session is created with ``_county_fips_session()`` and closed
+            before this function exits; a caller-supplied session is left
+            open for the caller to manage.
+
+    Returns:
+        The decoded body of ``COUNTY_FIPS_2020_URL``.
+
+    Raises:
+        requests.RequestException: The error raised by the final attempt.
+        RuntimeError: If ``COUNTY_FIPS_DOWNLOAD_ATTEMPTS`` permits no attempt.
+    """
+    owns_session = session is None
+    if owns_session:
+        session = _county_fips_session()
+    try:
+        for attempt in range(COUNTY_FIPS_DOWNLOAD_ATTEMPTS):
+            try:
+                response = session.get(COUNTY_FIPS_2020_URL, timeout=(10, 60))
+                response.raise_for_status()
+                return response.content.decode("utf-8")
+            except requests.RequestException:
+                # Out of attempts: surface the most recent failure unchanged.
+                if attempt == COUNTY_FIPS_DOWNLOAD_ATTEMPTS - 1:
+                    raise
+                # Exponential backoff before the next attempt.
+                time.sleep(COUNTY_FIPS_RETRY_BACKOFF_SECONDS * (2**attempt))
+    finally:
+        # Release pooled connections, but only for a session created here.
+        if owns_session:
+            session.close()
+
+    # Reached only when the configured attempt count is zero or negative.
+    raise RuntimeError("Failed to download 2020 county FIPS data")
+
 
 @lru_cache(maxsize=1)
 def _build_county_fips_to_enum() -> Dict[str, str]:
@@ -72,11 +102,8 @@ def _build_county_fips_to_enum() -> Dict[str, str]:
     Downloads Census county FIPS file and matches to County enum names.
     Cached to avoid repeated downloads.
     """
-    url = "https://www2.census.gov/geo/docs/reference/codes2020/national_county2020.txt"
-    response = requests.get(url, timeout=60)
-    response.raise_for_status()
     df = pd.read_csv(
-        StringIO(response.content.decode("utf-8")),
+        StringIO(_download_county_fips_2020()),
         delimiter="|",
         dtype=str,
         usecols=["STATE", "STATEFP", "COUNTYFP", "COUNTYNAME"],
diff --git a/policyengine_us_data/datasets/cps/long_term/assess_publishable_horizon.py b/policyengine_us_data/datasets/cps/long_term/assess_publishable_horizon.py
@@ -23,6 +23,7 @@
     aggregate_household_age_matrix,
     build_age_bins,
     build_household_age_matrix,
+    household_calibration_weights,
 )
 from ssa_data import (
     get_long_term_target_source,
@@ -145,9 +146,9 @@ def assess_years(
     target_matrix = load_ssa_age_projections(start_year=start_year, end_year=end_year)
     n_ages = target_matrix.shape[0]
 
-    sim = Microsimulation(dataset=base_dataset_path)
-    X, _, _ = build_household_age_matrix(sim, n_ages)
-    del sim
+    base_sim = Microsimulation(dataset=base_dataset_path)
+    X, _, _ = build_household_age_matrix(base_sim, n_ages)
+    del base_sim
     gc.collect()
 
     aggregated_age_cache: dict[int, tuple[np.ndarray, np.ndarray]] = {}
@@ -158,8 +159,7 @@ def assess_years(
         year_idx = year - start_year
         sim = Microsimulation(dataset=base_dataset_path)
 
-        household_microseries = sim.calculate("household_id", map_to="household")
-        baseline_weights = household_microseries.weights.values
+        baseline_weights = household_calibration_weights(sim)
 
         ss_values = None
         ss_target = None
@@ -294,7 +294,7 @@ def assess_years(
                     best_case_match.group(2)
                 )
             rows.append(row)
-            del sim
+            sim = None
             gc.collect()
             continue
 
@@ -375,7 +375,7 @@ def assess_years(
 
         rows.append(row)
 
-        del sim
+        sim = None
         gc.collect()
 
     return rows
diff --git a/policyengine_us_data/datasets/cps/long_term/check_calibrated_estimates_interactive.py b/policyengine_us_data/datasets/cps/long_term/check_calibrated_estimates_interactive.py
@@ -1,5 +1,6 @@
 import numpy as np
 
+from policyengine_core.reforms import Reform
 from policyengine_us import Microsimulation
 
 # H5_PATH = 'hf://policyengine/test/'
@@ -32,11 +33,8 @@
 ## Population demographics, total
 
 ### Population count of 6 year olds
-person_weights = sim.calculate("age", map_to="person").weights
-person_ages = sim.calculate("age", map_to="person").values
-person_is_6 = person_ages == 6
-
-total_age6_est = np.sum(person_is_6 * person_weights)
+person_ages = sim.calculate("age", map_to="person")
+total_age6_est = (person_ages == 6).sum()
 
 ### Single Year Age demographic projections - latest published is 2024:
 ### "Mid Year" CSV from https://www.ssa.gov/oact/HistEst/Population/2024/Population2024.html
@@ -73,11 +71,8 @@
 ## Population demographics, total
 
 ### Population count of 6 year olds
-person_weights = sim.calculate("age", map_to="person").weights
-person_ages = sim.calculate("age", map_to="person").values
-person_is_6 = person_ages == 6
-
-total_age6_est = np.sum(person_is_6 * person_weights)
+person_ages = sim.calculate("age", map_to="person")
+total_age6_est = (person_ages == 6).sum()
 
 ### Single Year Age demographic projections - latest published is 2024:
 ### "Mid Year" CSV from https://www.ssa.gov/oact/HistEst/Population/2024/Population2024.html
@@ -101,9 +96,6 @@
 
 # Testing the H6 Reform ------------------------------------------------------
 
-from policyengine_us import Microsimulation
-from policyengine_core.reforms import Reform
-
 
 def create_h6_reform():
     """
diff --git a/policyengine_us_data/datasets/cps/long_term/evaluate_support_augmentation.py b/policyengine_us_data/datasets/cps/long_term/evaluate_support_augmentation.py
@@ -2,7 +2,6 @@
 
 import argparse
 import json
-from pathlib import Path
 
 import numpy as np
 from policyengine_us import Microsimulation
@@ -17,6 +16,7 @@
     aggregate_household_age_matrix,
     build_age_bins,
     build_household_age_matrix,
+    household_calibration_weights,
 )
 from ssa_data import (
     get_long_term_target_source,
@@ -60,8 +60,7 @@ def _evaluate_dataset(
         y_target = target_matrix[:, 0]
         age_bucket_size = 1
 
-    household_series = sim.calculate("household_id", period=year, map_to="household")
-    baseline_weights = household_series.weights.values
+    baseline_weights = household_calibration_weights(sim, period=year)
 
     ss_values = None
     ss_target = None
diff --git a/policyengine_us_data/datasets/cps/long_term/projection_utils.py b/policyengine_us_data/datasets/cps/long_term/projection_utils.py
@@ -32,6 +32,26 @@ def _row_values(series):
     return np.asarray(series)
 
 
+def household_calibration_weights(sim, *, period=None) -> np.ndarray:
+    """
+    Extract the household-level weight vector used as a calibration input.
+
+    This direct weight access is intended only for the long-run calibration
+    optimizer, which solves for adjusted household weights and therefore needs
+    the raw vector. Ordinary weighted aggregates should instead go through
+    MicroSeries/MicroDataFrame methods such as ``sum()`` so that PolicyEngine
+    owns the entity-to-weight mapping.
+
+    ``period`` is forwarded to ``sim.calculate`` only when it is not ``None``.
+    """
+    calculate_kwargs = {"map_to": "household"}
+    if period is not None:
+        calculate_kwargs["period"] = period
+    weighted_ids = sim.calculate("household_id", **calculate_kwargs)
+    return np.asarray(weighted_ids.weights, dtype=float)
+
+
 def _person_level_values(sim, variable, *, period):
     try:
         series = sim.calculate(variable, period=period, map_to="person")
@@ -426,10 +446,7 @@ def calculate_year_statistics(
     income_tax_baseline_total = income_tax_hh.sum()
     income_tax_values = income_tax_hh.values
 
-    household_microseries = sim.calculate("household_id", map_to="household")
-    # Explicit weight access is reserved for the household-level calibration
-    # decision vector; ordinary aggregates should use MicroSeries methods.
-    baseline_weights_actual = household_microseries.weights.values
+    baseline_weights_actual = household_calibration_weights(sim)
 
     ss_values = None
     ss_target = None
diff --git a/policyengine_us_data/datasets/cps/long_term/run_household_projection.py b/policyengine_us_data/datasets/cps/long_term/run_household_projection.py
@@ -83,6 +83,7 @@
     build_age_bins,
     build_household_age_matrix,
     create_household_year_h5,
+    household_calibration_weights,
     validate_projected_social_security_cap,
 )
 from tax_assumptions import (
@@ -1115,25 +1116,24 @@ def _print_support_augmentation_summary(augmentation_report: dict) -> None:
     income_tax_values = income_tax_hh.values
 
     household_microseries = sim.calculate("household_id", map_to="household")
-    # This is the calibrated household-weight decision vector. All ordinary
-    # baseline aggregates should continue to use MicroSeries methods directly.
-    baseline_weights = household_microseries.weights.values
-    household_ids_hh = household_microseries.values
+    baseline_weights = household_calibration_weights(sim)
+    household_ids_hh = np.asarray(household_microseries.array)
 
     income_guard_constraints = {}
     if year >= SUPPORT_AUGMENTATION_START_YEAR:
         for group_name, components in INCOME_GUARD_GROUPS.items():
             group_values = np.zeros(len(baseline_weights), dtype=float)
+            group_target = 0.0
             included_components = []
             for component in components:
                 if component not in sim.tax_benefit_system.variables:
                     continue
                 component_hh = sim.calculate(component, period=year, map_to="household")
                 group_values += np.asarray(component_hh.values, dtype=float)
+                group_target += float(component_hh.sum())
                 included_components.append(component)
             if not included_components:
                 continue
-            group_target = float(np.sum(group_values * baseline_weights))
             if abs(group_target) <= 1e-6:
                 continue
             income_guard_constraints[f"income_guard_{group_name}"] = (
@@ -1164,7 +1164,7 @@ def _print_support_augmentation_summary(augmentation_report: dict) -> None:
         ss_values = ss_hh.values
         ss_target = load_ssa_benefit_projections(year)
         if year in display_years:
-            ss_baseline = np.sum(ss_values * baseline_weights)
+            ss_baseline = ss_hh.sum()
             print(
                 f"  [DEBUG {year}] SS baseline: ${ss_baseline / 1e9:.1f}B, target: ${ss_target / 1e9:.1f}B"
             )
@@ -1190,7 +1190,7 @@ def _print_support_augmentation_summary(augmentation_report: dict) -> None:
         payroll_values = taxable_wages_hh.values + taxable_self_emp_hh.values
         payroll_target = load_taxable_payroll_projections(year)
         if year in display_years:
-            payroll_baseline = np.sum(payroll_values * baseline_weights)
+            payroll_baseline = taxable_wages_hh.sum() + taxable_self_emp_hh.sum()
             print(f"  [DEBUG {year}] Payroll cap: ${payroll_cap:,.0f}")
             print(
                 f"  [DEBUG {year}] Payroll baseline: ${payroll_baseline / 1e9:.1f}B, target: ${payroll_target / 1e9:.1f}B"
@@ -1231,7 +1231,7 @@ def _print_support_augmentation_summary(augmentation_report: dict) -> None:
 
             # Debug output for key years
             if year in display_years:
-                h6_impact_baseline = np.sum(h6_income_values * baseline_weights)
+                h6_impact_baseline = income_tax_reform_hh.sum() - income_tax_hh.sum()
                 print(
                     f"  [DEBUG {year}] H6 baseline revenue: ${h6_impact_baseline / 1e9:.3f}B, target: ${h6_revenue_target / 1e9:.3f}B"
                 )
@@ -1260,8 +1260,8 @@ def _print_support_augmentation_summary(augmentation_report: dict) -> None:
         hi_tob_target = load_hi_tob_projections(year)
 
         if year in display_years:
-            oasdi_baseline = np.sum(oasdi_tob_values * baseline_weights)
-            hi_baseline = np.sum(hi_tob_values * baseline_weights)
+            oasdi_baseline = oasdi_tob_hh.sum()
+            hi_baseline = hi_tob_hh.sum()
             print(
                 f"  [DEBUG {year}] OASDI TOB baseline: ${oasdi_baseline / 1e9:.1f}B, target: ${oasdi_tob_target / 1e9:.1f}B"
             )
diff --git a/tests/unit/calibration/test_block_assignment.py b/tests/unit/calibration/test_block_assignment.py
diff --git a/tests/unit/test_long_term_calibration_contract.py b/tests/unit/test_long_term_calibration_contract.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Centralized long-run calibration weight access so baseline diagnostics use PolicyEngine weighted operations, hardened PR Modal integration isolation, and added retries with exponential backoff to the Census county lookup used by local-area H5 builds.`