Address source imputation review findings

MaxGhenis · MaxGhenis · commit f89bc1a9a0bb · 2026-05-21T15:00:46.000-04:00
diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py
@@ -81,6 +81,7 @@
 from policyengine_us_data.pipeline_metadata import pipeline_node
 from policyengine_us_data.pipeline_schema import PipelineNode
 from policyengine_us_data.utils.source_quality import (
+    require_columns_present,
     target_observed_source_masks,
 )
 
@@ -532,11 +533,20 @@ def _impute_acs(
     acs_df["state_fips"] = acs.calculate("state_fips", map_to="person").values.astype(
         np.float32
     )
+    required_acs_flags = [
+        column
+        for columns in ACS_TARGET_ALLOCATION_COLUMNS.values()
+        for column in columns
+    ]
     with h5py.File(ACS_2022.file_path, "r") as acs_h5:
+        require_columns_present(
+            acs_h5,
+            required_acs_flags,
+            source_name="ACS_2022 artifact",
+        )
         for flag_columns in ACS_TARGET_ALLOCATION_COLUMNS.values():
             for flag_column in flag_columns:
-                if flag_column in acs_h5:
-                    acs_df[flag_column] = np.asarray(acs_h5[flag_column], dtype=bool)
+                acs_df[flag_column] = np.asarray(acs_h5[flag_column], dtype=bool)
 
     train_df = acs_df[acs_df.is_household_head].copy()
     train_df = _encode_tenure_type(train_df)
@@ -654,6 +664,8 @@ def _impute_sipp(
         sipp_df["treasury_tipped_occupation_code"]
     )
 
+    if "MONTHCODE" in sipp_df:
+        sipp_df = sipp_df[sipp_df["MONTHCODE"] == 12].copy()
     sipp_df["is_under_18"] = sipp_df.TAGE < 18
     sipp_df["is_under_6"] = sipp_df.TAGE < 6
     sipp_df["count_under_18"] = (
@@ -662,8 +674,6 @@ def _impute_sipp(
     sipp_df["count_under_6"] = (
         sipp_df.groupby("SSUID")["is_under_6"].sum().loc[sipp_df.SSUID.values].values
     )
-    if "MONTHCODE" in sipp_df:
-        sipp_df = sipp_df[sipp_df["MONTHCODE"] == 12].copy()
 
     tip_target_filters = target_observed_source_masks(
         sipp_df,
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -72,7 +72,10 @@
 )
 from policyengine_us_data.pipeline_metadata import pipeline_node
 from policyengine_us_data.pipeline_schema import PipelineNode
-from policyengine_us_data.utils.source_quality import target_observed_source_masks
+from policyengine_us_data.utils.source_quality import (
+    require_columns_present,
+    target_observed_source_masks,
+)
 
 ACS_RENT_TARGET_ALLOCATION_COLUMNS = {
     "rent": ["rent_is_allocated"],
@@ -415,14 +418,23 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame):
     # H5; for CPS we use the in-memory dict (already populated upstream in
     # add_id_variables). Remove both overrides once pyproject.toml's
     # policyengine-core upper bound is lifted.
+    required_acs_flags = [
+        column
+        for columns in ACS_RENT_TARGET_ALLOCATION_COLUMNS.values()
+        for column in columns
+    ]
     with h5py.File(ACS_2022.file_path, "r") as acs_h5:
         train_df["is_household_head"] = np.asarray(
             acs_h5["is_household_head"], dtype=bool
         )
+        require_columns_present(
+            acs_h5,
+            required_acs_flags,
+            source_name="ACS_2022 artifact",
+        )
         for flag_columns in ACS_RENT_TARGET_ALLOCATION_COLUMNS.values():
             for flag_column in flag_columns:
-                if flag_column in acs_h5:
-                    train_df[flag_column] = np.asarray(acs_h5[flag_column], dtype=bool)
+                train_df[flag_column] = np.asarray(acs_h5[flag_column], dtype=bool)
     train_df.tenure_type = train_df.tenure_type.map(
         {
             "OWNED_OUTRIGHT": "OWNED_WITH_MORTGAGE",
diff --git a/policyengine_us_data/utils/source_quality.py b/policyengine_us_data/utils/source_quality.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import logging
-from collections.abc import Mapping, Sequence
+from collections.abc import Container, Mapping, Sequence
 
 import pandas as pd
 
@@ -17,6 +17,24 @@ def sipp_allocation_flag_for(source_column: str) -> str:
     return f"A{source_column[1:]}"
 
 
+def require_columns_present(
+    available_columns: Container[str],  # only probed with `not in`
+    required_columns: Sequence[str],
+    *,
+    source_name: str,  # label placed at the start of the error message
+) -> None:
+    """Raise if required donor-source provenance columns are unavailable."""
+    missing_columns = sorted(  # set + sorted: deduplicated, stable message order
+        {column for column in required_columns if column not in available_columns}
+    )
+    if missing_columns:
+        raise KeyError(  # one error listing every missing column at once
+            f"{source_name} is missing required source-quality columns: "
+            f"{', '.join(missing_columns)}. Regenerate the donor artifact with "
+            "allocation flag columns before fitting source imputations."
+        )
+
+
 def observed_source_mask(
     df: pd.DataFrame,
     *,
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,7 @@ dependencies = [
     "tqdm>=4.60.0",
     "microdf_python>=1.2.1",
     "setuptools>=60",
-    "microimpute>=2.0.5",
+    "microimpute>=2.1.0",
     "pip-system-certs>=3.0",
     "google-cloud-storage>=2.0.0",
     "google-auth>=2.0.0",
diff --git a/tests/unit/calibration/test_source_impute.py b/tests/unit/calibration/test_source_impute.py
@@ -5,7 +5,9 @@
 
 import numpy as np
 import pandas as pd
+import huggingface_hub
 
+from policyengine_us_data.calibration import source_impute
 from policyengine_us_data.calibration.source_impute import (
     ACS_IMPUTED_VARIABLES,
     ACS_PREDICTORS,
@@ -332,6 +334,62 @@ def test_impute_acs_exists(self):
     def test_impute_sipp_exists(self):
         assert callable(_impute_sipp)
 
+    def test_calibration_sipp_tip_counts_use_reference_month(self, monkeypatch):
+        captured = {}  # FakeQRF.fit stores its training frame here
+
+        columns = {
+            "SSUID": [1, 1, 1, 2],  # three rows for unit 1, one for unit 2
+            "MONTHCODE": [1, 12, 12, 12],  # only the first row is not month 12
+            "TAGE": [5, 40, 10, 30],  # the age-5 row is the MONTHCODE 1 row
+            "TPTOTINC": [1_000.0, 2_000.0, 0.0, 3_000.0],
+            "WPFINWGT": [1.0, 1.0, 1.0, 1.0],
+        }
+        for column in source_impute.SIPP_TIP_AMOUNT_COLUMNS:
+            columns[column] = [0.0, 10.0, 0.0, 5.0]
+        for column in source_impute.SIPP_TIP_ALLOCATION_COLUMNS:
+            columns[column] = [0, 0, 0, 0]
+        for column in source_impute.SIPP_JOB_OCCUPATION_COLUMNS:
+            columns[column] = [0, 0, 0, 0]
+        tip_source = pd.DataFrame(columns)
+
+        read_count = {"count": 0}  # mutable counter shared with fake_read_csv
+
+        def fake_read_csv(*args, **kwargs):
+            read_count["count"] += 1
+            if read_count["count"] == 1:  # only the first read gets data
+                return tip_source.copy()
+            raise FileNotFoundError("stop after tip imputation")
+
+        class FakeQRF:
+            def __init__(self, *args, **kwargs):
+                pass
+
+            def fit(self, X_train, **kwargs):  # record the training frame
+                captured["train"] = X_train.copy()
+                return self
+
+            def predict(self, X_test):  # all-zero tip_income predictions
+                return pd.DataFrame({"tip_income": np.zeros(len(X_test))})
+
+        monkeypatch.setattr(
+            huggingface_hub,
+            "hf_hub_download",
+            lambda *args, **kwargs: None,  # stub: always returns None
+        )
+        monkeypatch.setattr(source_impute.pd, "read_csv", fake_read_csv)
+        monkeypatch.setattr(source_impute, "QRF", FakeQRF)
+
+        data = _make_data_dict(n_persons=4)
+        _impute_sipp(
+            data=data,
+            state_fips=np.array([1, 1], dtype=np.int32),
+            time_period=2024,
+        )
+
+        household_one = captured["train"][captured["train"]["household_id"] == 1]
+        np.testing.assert_array_equal(household_one["count_under_18"], [1, 1])
+        np.testing.assert_array_equal(household_one["count_under_6"], [0, 0])
+
     def test_impute_org_exists(self):
         assert callable(_impute_org)
 
diff --git a/tests/unit/datasets/test_cps_file_handles.py b/tests/unit/datasets/test_cps_file_handles.py
@@ -390,6 +390,8 @@ def recording_hdfstore(path, mode="a", *args, **kwargs):
     acs_fixture_path = tmp_path / "acs_fixture.h5"
     with h5py.File(acs_fixture_path, "w") as acs_fixture:
         acs_fixture["is_household_head"] = np.ones(10_000, dtype=bool)
+        acs_fixture["rent_is_allocated"] = np.zeros(10_000, dtype=bool)
+        acs_fixture["real_estate_taxes_is_allocated"] = np.zeros(10_000, dtype=bool)
 
     real_h5py_file = cps_module.h5py.File
     opened_h5_paths = []
diff --git a/tests/unit/datasets/test_rng_seeding.py b/tests/unit/datasets/test_rng_seeding.py
@@ -130,7 +130,7 @@ def test_select_random_subset_uses_local_generator_only():
 
 def test_sipp_training_samples_use_seeded_rng():
     """N5: the weighted resample for tip and asset training frames
-    must come from a seeded Generator, not the global ``np.random``."""
+    must come from a deterministic sampler, not the global ``np.random``."""
     src = SIPP_SOURCE.read_text()
     assert "seeded_rng(" in src, "sipp.py must import/use seeded_rng()"
     tree = ast.parse(src)
@@ -149,8 +149,9 @@ def test_sipp_training_samples_use_seeded_rng():
         assert "np.random.choice" not in fn_src, (
             f"{fn_name} must not use np.random.choice (use a seeded_rng Generator)"
         )
-        assert "seeded_rng(" in fn_src, (
-            f"{fn_name} must derive its resampler from a seeded generator"
+        assert "seeded_rng(" in fn_src or "max_train_samples=" in fn_src, (
+            f"{fn_name} must derive its resampler from a seeded generator or "
+            "delegate capped sampling to microimpute's deterministic QRF sampler"
         )
 
 
diff --git a/tests/unit/test_source_quality.py b/tests/unit/test_source_quality.py
@@ -3,6 +3,7 @@
 
 from policyengine_us_data.utils.source_quality import (
     observed_source_mask,
+    require_columns_present,
     sipp_allocation_flag_for,
     target_observed_source_masks,
 )
@@ -14,6 +15,30 @@ def test_sipp_allocation_flag_for_source_column():
     assert sipp_allocation_flag_for("TJB1_TXAMT") == "AJB1_TXAMT"
 
 
+def test_require_columns_present_accepts_available_columns():
+    require_columns_present(  # passes by returning without raising
+        {"rent_is_allocated", "real_estate_taxes_is_allocated"},
+        ["rent_is_allocated", "real_estate_taxes_is_allocated"],
+        source_name="ACS",
+    )
+
+
+def test_require_columns_present_raises_for_missing_columns():
+    try:
+        require_columns_present(
+            {"rent_is_allocated"},  # real_estate_taxes_is_allocated is absent
+            ["rent_is_allocated", "real_estate_taxes_is_allocated"],
+            source_name="ACS",
+        )
+    except KeyError as error:  # the expected failure path
+        message = str(error)  # str() of a KeyError is the quoted message
+    else:  # reached only when nothing was raised
+        raise AssertionError("Expected missing source-quality columns to fail")
+
+    assert "real_estate_taxes_is_allocated" in message
+    assert "Regenerate the donor artifact" in message
+
+
 def test_observed_source_mask_excludes_nonzero_allocation_flags():
     df = pd.DataFrame(
         {
diff --git a/uv.lock b/uv.lock