PolicyEngine
diff --git a/AGENTS.md b/AGENTS.md
Lines changed: 3 additions & 0 deletions
diff --git a/docs/engineering/skills/README.md b/docs/engineering/skills/README.md
Lines changed: 2 additions & 0 deletions
diff --git a/docs/engineering/skills/imputation.md b/docs/engineering/skills/imputation.md
Lines changed: 36 additions & 0 deletions
diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py
Lines changed: 93 additions & 70 deletions
@@ -23,6 +23,9 @@ read `docs/engineering/skills/pipeline_operations.md`.
 When adding, changing, or reviewing calibration target definitions, read
 `docs/engineering/skills/calibration_targets.md`.
 
+When adding, changing, or reviewing donor-survey imputations, read
+`docs/engineering/skills/imputation.md`.
+
 ## Calibration targets
 
 Manually sourced national or local-file calibration targets must be registered
 
@@ -14,6 +14,8 @@ Current skills:
   notes.
 - `github-prs.md`: same-repository PR workflow, PR head verification, and title
   conventions.
+- `imputation.md`: donor-survey imputation provenance rules, including
+  target-level exclusion of allocated source values.
 - `pipeline_docs.md`: decorator-backed pipeline map maintenance and generated
   pydoc-style artifacts.
 - `pipeline_operations.md`: model-neutral workflow for diagnosing deployed Modal
 
@@ -0,0 +1,36 @@
+# Imputation
+
+Use this guide when adding, changing, or reviewing donor-survey imputations.
+
+## Source Provenance
+
+Do not train an imputation target on donor rows whose source value for that
+target is itself allocated, hot-decked, edited, or imputed by the source survey.
+Wire source-survey allocation or quality flags into the training frame whenever
+the donor file exposes them.
+
+Apply this rule at the target-variable level, not the donor-row level. A donor
+row with observed tip income but allocated bank-account assets can train
+`tip_income`; the same row must be excluded from the `bank_account_assets`
+training target. Use `policyengine_us_data.utils.source_quality` to build
+target masks, then pass them to `microimpute` through `target_filters` or
+`row_filter` so the filtering logic lives in the imputation library rather than
+in one-off model wrappers.
+
+Do not drop final CPS, ECPS, or calibration records solely because a
+donor-survey target was excluded from training. The exclusion applies to donor
+training rows only; recipient datasets should remain complete.
+
+When a donor source lacks target-level quality flags, document that limitation
+near the imputation code and keep the training surface structured so flags can
+be added later.
+
+## Tests
+
+Add focused regression tests when adding a donor imputation or a source-quality
+flag. The tests should confirm that:
+
+- allocation flags are read from the donor source,
+- allocated source values are excluded for the affected target,
+- unrelated observed targets from the same row can still train, and
+- legacy and current imputation surfaces use the same target provenance rule.
@@ -26,18 +26,26 @@
 """
 
 import gc
+import h5py
 import logging
 from typing import Dict, Optional
 
 import numpy as np
 import pandas as pd
+from microimpute.models.qrf import QRF
 from policyengine_us_data.datasets.cps.tipped_occupation import (
     derive_any_treasury_tipped_occupation_code,
     derive_is_tipped_occupation,
 )
 from policyengine_us_data.datasets.sipp.sipp import (
     ASSET_JOB_EARNINGS_COLUMNS,
     ASSET_PREDICTORS,
+    SIPP_ASSET_ALLOCATION_COLUMNS,
+    SIPP_ASSET_TARGET_ALLOCATION_COLUMNS,
+    SIPP_ASSET_TARGET_SOURCE_COLUMNS,
+    SIPP_TIP_ALLOCATION_COLUMNS,
+    SIPP_TIP_AMOUNT_COLUMNS,
+    SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS,
     SSI_DISABILITY_MODEL_VARIABLE,
     VEHICLE_MODEL_PREDICTORS,
     build_vehicle_training_frame,
@@ -72,13 +80,20 @@
 )
 from policyengine_us_data.pipeline_metadata import pipeline_node
 from policyengine_us_data.pipeline_schema import PipelineNode
+from policyengine_us_data.utils.source_quality import (
+    target_observed_source_masks,
+)
 
 logger = logging.getLogger(__name__)
 
 ACS_IMPUTED_VARIABLES = [
     "rent",
     "real_estate_taxes",
 ]
+ACS_TARGET_ALLOCATION_COLUMNS = {
+    "rent": ["rent_is_allocated"],
+    "real_estate_taxes": ["real_estate_taxes_is_allocated"],
+}
 
 SIPP_IMPUTED_VARIABLES = [
     "tip_income",
@@ -504,7 +519,6 @@ def _impute_acs(
     Returns:
         Updated data dict.
     """
-    from microimpute.models.qrf import QRF
     from policyengine_us import Microsimulation
 
     from policyengine_us_data.datasets.acs.acs import ACS_2022
@@ -518,8 +532,13 @@ def _impute_acs(
     acs_df["state_fips"] = acs.calculate("state_fips", map_to="person").values.astype(
         np.float32
     )
+    with h5py.File(ACS_2022.file_path, "r") as acs_h5:
+        for flag_columns in ACS_TARGET_ALLOCATION_COLUMNS.values():
+            for flag_column in flag_columns:
+                if flag_column in acs_h5:
+                    acs_df[flag_column] = np.asarray(acs_h5[flag_column], dtype=bool)
 
-    train_df = acs_df[acs_df.is_household_head].sample(10_000, random_state=42)
+    train_df = acs_df[acs_df.is_household_head].copy()
     train_df = _encode_tenure_type(train_df)
     del acs
 
@@ -545,17 +564,21 @@ def _impute_acs(
     )
     cps_heads = cps_df[mask]
 
-    qrf = QRF()
     logger.info(
         "ACS QRF: %d train, %d test, %d predictors",
         len(train_df),
         len(cps_heads),
         len(predictors),
     )
-    fitted = qrf.fit(
+    fitted = QRF(max_train_samples=10_000).fit(
         X_train=train_df,
         predictors=predictors,
         imputed_variables=ACS_IMPUTED_VARIABLES,
+        target_filters=target_observed_source_masks(
+            train_df,
+            targets=ACS_IMPUTED_VARIABLES,
+            target_allocation_flag_columns=ACS_TARGET_ALLOCATION_COLUMNS,
+        ),
     )
     predictions = fitted.predict(X_test=cps_heads)
 
@@ -606,12 +629,8 @@ def _impute_sipp(
         Updated data dict.
     """
     from huggingface_hub import hf_hub_download
-    from microimpute.models.qrf import QRF
-
     from policyengine_us_data.storage import STORAGE_FOLDER
 
-    rng = np.random.default_rng(seed=88)
-
     hf_hub_download(
         repo_id="PolicyEngine/policyengine-us-data",
         filename="pu2023_slim.csv",
@@ -620,12 +639,10 @@ def _impute_sipp(
     )
     sipp_df = pd.read_csv(STORAGE_FOLDER / "pu2023_slim.csv")
 
-    sipp_df["tip_income"] = (
-        sipp_df[sipp_df.columns[sipp_df.columns.str.contains("TXAMT")]]
-        .fillna(0)
-        .sum(axis=1)
-        * 12
-    )
+    tip_amount_columns = [
+        column for column in SIPP_TIP_AMOUNT_COLUMNS if column in sipp_df
+    ]
+    sipp_df["tip_income"] = sipp_df[tip_amount_columns].fillna(0).sum(axis=1) * 12
     sipp_df["employment_income"] = sipp_df.TPTOTINC * 12
     sipp_df["age"] = sipp_df.TAGE
     sipp_df["household_weight"] = sipp_df.WPFINWGT
@@ -645,6 +662,16 @@ def _impute_sipp(
     sipp_df["count_under_6"] = (
         sipp_df.groupby("SSUID")["is_under_6"].sum().loc[sipp_df.SSUID.values].values
     )
+    if "MONTHCODE" in sipp_df:
+        sipp_df = sipp_df[sipp_df["MONTHCODE"] == 12].copy()
+
+    tip_target_filters = target_observed_source_masks(
+        sipp_df,
+        targets=["tip_income"],
+        target_source_columns={"tip_income": tip_amount_columns},
+        target_allocation_flag_columns={"tip_income": SIPP_TIP_ALLOCATION_COLUMNS},
+        require_nonmissing_source=False,
+    )
 
     tip_cols = [
         "household_id",
@@ -657,14 +684,6 @@ def _impute_sipp(
         "household_weight",
     ]
     tip_train = sipp_df[tip_cols].dropna()
-    tip_train = tip_train.loc[
-        rng.choice(
-            tip_train.index,
-            size=min(10_000, len(tip_train)),
-            replace=True,
-            p=(tip_train.household_weight / tip_train.household_weight.sum()),
-        )
-    ]
 
     cps_tip_df = _build_cps_receiver(
         data, time_period, dataset_path, ["employment_income", "age"]
@@ -691,16 +710,17 @@ def _impute_sipp(
     else:
         cps_tip_df["is_tipped_occupation"] = 0.0
 
-    qrf = QRF()
     logger.info(
         "SIPP tips QRF: %d train, %d test",
         len(tip_train),
         len(cps_tip_df),
     )
-    fitted = qrf.fit(
+    fitted = QRF(max_train_samples=10_000).fit(
         X_train=tip_train,
         predictors=SIPP_TIPS_PREDICTORS,
         imputed_variables=["tip_income"],
+        target_filters=tip_target_filters,
+        weight_col="household_weight",
     )
     tip_preds = fitted.predict(X_test=cps_tip_df)
     data["tip_income"] = {
@@ -719,24 +739,28 @@ def _impute_sipp(
             repo_type="model",
             local_dir=STORAGE_FOLDER,
         )
-        asset_cols = [
-            "SSUID",
-            "PNUM",
-            "MONTHCODE",
-            "WPFINWGT",
-            "TAGE",
-            "ESEX",
-            "EMS",
-            "TSSSAMT",
-            "TRETINCAMT",
-            "TVAL_BANK",
-            "TVAL_STMF",
-            "TVAL_BOND",
-            "TINC_BANK",
-            "TINC_STMF",
-            "TINC_BOND",
-            "TINC_RENT",
-        ] + ASSET_JOB_EARNINGS_COLUMNS
+        asset_cols = (
+            [
+                "SSUID",
+                "PNUM",
+                "MONTHCODE",
+                "WPFINWGT",
+                "TAGE",
+                "ESEX",
+                "EMS",
+                "TSSSAMT",
+                "TRETINCAMT",
+                "TVAL_BANK",
+                "TVAL_STMF",
+                "TVAL_BOND",
+                "TINC_BANK",
+                "TINC_STMF",
+                "TINC_BOND",
+                "TINC_RENT",
+            ]
+            + ASSET_JOB_EARNINGS_COLUMNS
+            + SIPP_ASSET_ALLOCATION_COLUMNS
+        )
         asset_df = pd.read_csv(
             STORAGE_FOLDER / "pu2023.csv",
             delimiter="|",
@@ -751,16 +775,14 @@ def _impute_sipp(
             "bond_assets",
             "household_weight",
             *SIPP_ASSETS_PREDICTORS,
+            *[
+                column
+                for columns in SIPP_ASSET_TARGET_SOURCE_COLUMNS.values()
+                for column in columns
+            ],
+            *SIPP_ASSET_ALLOCATION_COLUMNS,
         ]
-        asset_train = asset_df[asset_train_cols].dropna()
-        asset_train = asset_train.loc[
-            rng.choice(
-                asset_train.index,
-                size=min(20_000, len(asset_train)),
-                replace=True,
-                p=(asset_train.household_weight / asset_train.household_weight.sum()),
-            )
-        ]
+        asset_train = asset_df[asset_train_cols].copy()
 
         cps_asset_df = _build_cps_receiver(
             data,
@@ -789,16 +811,22 @@ def _impute_sipp(
             "stock_assets",
             "bond_assets",
         ]
-        qrf = QRF()
         logger.info(
             "SIPP assets QRF: %d train, %d test",
             len(asset_train),
             len(cps_asset_df),
         )
-        fitted = qrf.fit(
+        fitted = QRF(max_train_samples=20_000).fit(
             X_train=asset_train,
             predictors=SIPP_ASSETS_PREDICTORS,
             imputed_variables=asset_vars,
+            target_filters=target_observed_source_masks(
+                asset_train,
+                targets=asset_vars,
+                target_source_columns=SIPP_ASSET_TARGET_SOURCE_COLUMNS,
+                target_allocation_flag_columns=SIPP_ASSET_TARGET_ALLOCATION_COLUMNS,
+            ),
+            weight_col="household_weight",
         )
         asset_preds = fitted.predict(X_test=cps_asset_df)
 
@@ -889,17 +917,6 @@ def _impute_sipp(
         logger.info("SIPP SSI disability criteria imputation complete")
 
         vehicle_train = build_vehicle_training_frame()
-        vehicle_train = vehicle_train.loc[
-            rng.choice(
-                vehicle_train.index,
-                size=min(20_000, len(vehicle_train)),
-                replace=True,
-                p=(
-                    vehicle_train.household_weight
-                    / vehicle_train.household_weight.sum()
-                ),
-            )
-        ]
 
         cps_vehicle_df = _build_cps_receiver(
             data,
@@ -943,19 +960,25 @@ def _impute_sipp(
             tenure_type=data.get("tenure_type", {}).get(time_period),
         )
 
-        qrf = QRF()
         logger.info(
             "SIPP vehicle QRF: %d train, %d test",
             len(vehicle_train),
             len(vehicle_receiver),
         )
-        fitted = qrf.fit(
+        vehicle_vars = [
+            "household_vehicles_owned",
+            "household_vehicles_value",
+        ]
+        fitted = QRF(max_train_samples=20_000).fit(
             X_train=vehicle_train,
             predictors=VEHICLE_MODEL_PREDICTORS,
-            imputed_variables=[
-                "household_vehicles_owned",
-                "household_vehicles_value",
-            ],
+            imputed_variables=vehicle_vars,
+            target_filters=target_observed_source_masks(
+                vehicle_train,
+                targets=vehicle_vars,
+                target_allocation_flag_columns=SIPP_VEHICLE_TARGET_ALLOCATION_COLUMNS,
+            ),
+            weight_col="household_weight",
         )
         vehicle_preds = fitted.predict(X_test=vehicle_receiver)
         data["household_vehicles_owned"] = {