Derive clone-half childcare caps deterministically (#704)

MaxGhenis · web-flow · commit 7a79100d6f3c · 2026-04-08T21:15:23.000-04:00
* Derive clone childcare cap from clone inputs

* Add childcare changelog fragment

* Handle missing ACA takeup in enhanced CPS build
diff --git a/changelog.d/704.fixed b/changelog.d/704.fixed
@@ -0,0 +1 @@
+Stop independently QRF-imputing clone-half ``spm_unit_capped_work_childcare_expenses`` and rebuild it deterministically from clone pre-subsidy childcare, donor capping shares, and clone earnings caps.
diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -38,6 +38,36 @@ def _get_period_array(period_values: dict, period: int) -> np.ndarray:
     return np.asarray(value)
 
 
+def _get_base_aca_takeup(
+    data: dict,
+    base_year: int,
+    tax_unit_count: int,
+) -> np.ndarray:
+    """Return stored ACA takeup or the default all-True baseline."""
+    period_values = data.get("takes_up_aca_if_eligible")
+    if period_values is None:
+        logging.info(
+            "takes_up_aca_if_eligible missing from base dataset; using default "
+            "all-True takeup for ACA 2025 override"
+        )
+        return np.ones(tax_unit_count, dtype=bool)
+    return _get_period_array(period_values, base_year).astype(bool, copy=False)
+
+
+def _set_period_array(
+    data: dict,
+    variable: str,
+    period: int,
+    values: np.ndarray,
+) -> None:
+    """Store a time-period array, creating the variable entry if needed."""
+    period_values = data.get(variable)
+    if period_values is None:
+        period_values = {}
+        data[variable] = period_values
+    period_values[period] = values
+
+
 def create_aca_2025_takeup_override(
     base_takeup: np.ndarray,
     person_enrolled_if_takeup: np.ndarray,
@@ -282,32 +312,40 @@ def generate(self):
             )
             sim.delete_arrays("aca_ptc")
 
-            data["takes_up_aca_if_eligible"][2025] = create_aca_2025_takeup_override(
-                base_takeup=_get_period_array(
-                    data["takes_up_aca_if_eligible"],
-                    base_year,
-                ),
-                person_enrolled_if_takeup=np.asarray(
-                    sim.calculate(
-                        "aca_ptc",
-                        map_to="person",
-                        period=2025,
-                        use_weights=False,
+            _set_period_array(
+                data=data,
+                variable="takes_up_aca_if_eligible",
+                period=2025,
+                values=create_aca_2025_takeup_override(
+                    base_takeup=_get_base_aca_takeup(
+                        data=data,
+                        base_year=base_year,
+                        tax_unit_count=len(
+                            _get_period_array(data["tax_unit_id"], base_year)
+                        ),
+                    ),
+                    person_enrolled_if_takeup=np.asarray(
+                        sim.calculate(
+                            "aca_ptc",
+                            map_to="person",
+                            period=2025,
+                            use_weights=False,
+                        )
                     )
-                )
-                > 0,
-                person_weights=np.asarray(
-                    sim.calculate(
-                        "person_weight",
-                        period=2025,
-                        use_weights=False,
-                    )
-                ),
-                person_tax_unit_ids=_get_period_array(
-                    data["person_tax_unit_id"],
-                    base_year,
+                    > 0,
+                    person_weights=np.asarray(
+                        sim.calculate(
+                            "person_weight",
+                            period=2025,
+                            use_weights=False,
+                        )
+                    ),
+                    person_tax_unit_ids=_get_period_array(
+                        data["person_tax_unit_id"],
+                        base_year,
+                    ),
+                    tax_unit_ids=_get_period_array(data["tax_unit_id"], base_year),
                 ),
-                tax_unit_ids=_get_period_array(data["tax_unit_id"], base_year),
             )
 
         logging.info("Post-generation weight validation passed")
diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -76,7 +76,6 @@ def _supports_structural_mortgage_inputs() -> bool:
     "spm_unit_payroll_tax_reported",
     "spm_unit_federal_tax_reported",
     "spm_unit_state_tax_reported",
-    "spm_unit_capped_work_childcare_expenses",
     "spm_unit_spm_threshold",
     "spm_unit_net_income_reported",
     "spm_unit_pre_subsidy_childcare_expenses",
@@ -326,6 +325,125 @@ def reconcile_ss_subcomponents(predictions, total_ss):
 }
 
 
+def derive_clone_capped_childcare_expenses(
+    donor_pre_subsidy: np.ndarray,
+    donor_capped: np.ndarray,
+    clone_pre_subsidy: np.ndarray,
+    clone_person_data: pd.DataFrame,
+    clone_spm_unit_ids: np.ndarray,
+) -> np.ndarray:
+    """Derive clone-half capped childcare from clone inputs.
+
+    The CPS provides both pre-subsidy childcare and the SPM-specific
+    capped childcare deduction. For the clone half, we impute only the
+    pre-subsidy amount, then deterministically rebuild the capped amount
+    instead of letting a second QRF predict it independently.
+
+    We preserve the donor's observed capping share while also respecting
+    the clone's own earnings cap. This keeps the clone-half value
+    consistent with pre-subsidy childcare and avoids impossible outputs
+    such as capped childcare exceeding pre-subsidy childcare.
+    """
+
+    donor_pre_subsidy = np.asarray(donor_pre_subsidy, dtype=float)
+    donor_capped = np.asarray(donor_capped, dtype=float)
+    clone_pre_subsidy = np.asarray(clone_pre_subsidy, dtype=float)
+    clone_spm_unit_ids = np.asarray(clone_spm_unit_ids)
+
+    donor_cap_share = np.divide(
+        donor_capped,
+        donor_pre_subsidy,
+        out=np.zeros_like(donor_capped, dtype=float),
+        where=donor_pre_subsidy > 0,
+    )
+    donor_cap_share = np.clip(donor_cap_share, 0.0, 1.0)
+    capped_from_share = np.maximum(clone_pre_subsidy, 0.0) * donor_cap_share
+
+    if clone_person_data.empty:
+        earnings_cap = np.zeros(len(clone_spm_unit_ids), dtype=float)
+    else:
+        eligible = clone_person_data["is_parent_proxy"].astype(bool)
+        parent_rows = clone_person_data.loc[
+            eligible, ["spm_unit_id", "age", "earnings"]
+        ].copy()
+        if parent_rows.empty:
+            earnings_cap = np.zeros(len(clone_spm_unit_ids), dtype=float)
+        else:
+            parent_rows["earnings"] = parent_rows["earnings"].clip(lower=0.0)
+            parent_rows["age_rank"] = parent_rows.groupby("spm_unit_id")["age"].rank(
+                method="first", ascending=False
+            )
+            top_two = parent_rows[parent_rows["age_rank"] <= 2].sort_values(
+                ["spm_unit_id", "age_rank"]
+            )
+            earnings_cap_by_unit = top_two.groupby("spm_unit_id")["earnings"].agg(
+                lambda values: (
+                    float(values.iloc[0])
+                    if len(values) == 1
+                    else float(np.minimum(values.iloc[0], values.iloc[1]))
+                )
+            )
+            earnings_cap = earnings_cap_by_unit.reindex(
+                clone_spm_unit_ids, fill_value=0.0
+            ).to_numpy(dtype=float)
+
+    return np.minimum(capped_from_share, earnings_cap)
+
+
+def _rebuild_clone_capped_childcare_expenses(
+    data: dict,
+    time_period: int,
+    cps_sim,
+) -> np.ndarray:
+    """Rebuild clone-half capped childcare expenses after stage-2 imputation."""
+
+    n_persons_half = len(data["person_id"][time_period]) // 2
+    n_spm_units_half = len(data["spm_unit_id"][time_period]) // 2
+
+    person_roles = cps_sim.calculate_dataframe(
+        ["age", "is_tax_unit_head", "is_tax_unit_spouse"]
+    )
+    if len(person_roles) != n_persons_half:
+        raise ValueError(
+            "Unexpected person role frame length while rebuilding clone childcare "
+            f"expenses: got {len(person_roles)}, expected {n_persons_half}"
+        )
+
+    clone_person_data = pd.DataFrame(
+        {
+            "spm_unit_id": data["person_spm_unit_id"][time_period][n_persons_half:],
+            "age": person_roles["age"].values,
+            "is_parent_proxy": (
+                person_roles["is_tax_unit_head"].values
+                | person_roles["is_tax_unit_spouse"].values
+            ),
+            "earnings": (
+                data["employment_income"][time_period][n_persons_half:]
+                + data["self_employment_income"][time_period][n_persons_half:]
+            ),
+        }
+    )
+
+    donor_pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period][
+        :n_spm_units_half
+    ]
+    donor_capped = data["spm_unit_capped_work_childcare_expenses"][time_period][
+        :n_spm_units_half
+    ]
+    clone_pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period][
+        n_spm_units_half:
+    ]
+    clone_spm_unit_ids = data["spm_unit_id"][time_period][n_spm_units_half:]
+
+    return derive_clone_capped_childcare_expenses(
+        donor_pre_subsidy=donor_pre_subsidy,
+        donor_capped=donor_capped,
+        clone_pre_subsidy=clone_pre_subsidy,
+        clone_person_data=clone_person_data,
+        clone_spm_unit_ids=clone_spm_unit_ids,
+    )
+
+
 def _apply_post_processing(predictions, X_test, time_period, data):
     """Apply retirement constraints and SS reconciliation."""
     ret_cols = [c for c in predictions.columns if c in _RETIREMENT_VARS]
@@ -430,6 +548,24 @@ def _splice_cps_only_predictions(
         new_values = np.concatenate([cps_half, pred_values])
         data[var] = {time_period: new_values}
 
+    if (
+        "spm_unit_capped_work_childcare_expenses" in data
+        and "spm_unit_pre_subsidy_childcare_expenses" in data
+    ):
+        n_half = entity_half_lengths.get(
+            "spm_unit",
+            len(data["spm_unit_capped_work_childcare_expenses"][time_period]) // 2,
+        )
+        cps_half = data["spm_unit_capped_work_childcare_expenses"][time_period][:n_half]
+        clone_half = _rebuild_clone_capped_childcare_expenses(
+            data=data,
+            time_period=time_period,
+            cps_sim=cps_sim,
+        )
+        data["spm_unit_capped_work_childcare_expenses"] = {
+            time_period: np.concatenate([cps_half, clone_half])
+        }
+
     del cps_sim
     return data
 
diff --git a/tests/unit/test_enhanced_cps.py b/tests/unit/test_enhanced_cps.py
@@ -0,0 +1,36 @@
+import numpy as np
+
+from policyengine_us_data.datasets.cps.enhanced_cps import (
+    _get_base_aca_takeup,
+    _set_period_array,
+)
+
+
+def test_get_base_aca_takeup_uses_stored_values():
+    data = {
+        "takes_up_aca_if_eligible": {
+            2024: np.array([True, False, True], dtype=bool),
+        }
+    }
+
+    result = _get_base_aca_takeup(data=data, base_year=2024, tax_unit_count=3)
+
+    np.testing.assert_array_equal(
+        result,
+        np.array([True, False, True], dtype=bool),
+    )
+
+
+def test_get_base_aca_takeup_defaults_to_true_when_missing():
+    result = _get_base_aca_takeup(data={}, base_year=2024, tax_unit_count=4)
+
+    np.testing.assert_array_equal(result, np.ones(4, dtype=bool))
+
+
+def test_set_period_array_creates_missing_variable_entry():
+    data = {}
+    values = np.array([True, False], dtype=bool)
+
+    _set_period_array(data, "takes_up_aca_if_eligible", 2025, values)
+
+    np.testing.assert_array_equal(data["takes_up_aca_if_eligible"][2025], values)
diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py
@@ -19,6 +19,7 @@
     CPS_ONLY_IMPUTED_VARIABLES,
     CPS_STAGE2_INCOME_PREDICTORS,
     apply_retirement_constraints,
+    derive_clone_capped_childcare_expenses,
     reconcile_ss_subcomponents,
 )
 from policyengine_us_data.datasets.org import ORG_IMPUTED_VARIABLES
@@ -116,6 +117,86 @@ def test_pension_income_not_in_cps_only(self):
             f"Pension income vars should not be in CPS_ONLY: {present}"
         )
 
+    def test_capped_childcare_not_in_cps_only(self):
+        """Capped childcare should be derived from clone-half inputs, not
+        independently QRF-imputed."""
+        assert "spm_unit_capped_work_childcare_expenses" not in set(
+            CPS_ONLY_IMPUTED_VARIABLES
+        )
+
+
+class TestCloneChildcareDerivation:
+    """Clone-half capped childcare should be derived deterministically."""
+
+    def test_caps_at_pre_subsidy_and_clone_earnings(self):
+        donor_pre_subsidy = np.array([10000.0, 4000.0, 6000.0])
+        donor_capped = np.array([4000.0, 4000.0, 0.0])
+        clone_pre_subsidy = np.array([12000.0, 5000.0, 3000.0])
+        person_data = pd.DataFrame(
+            {
+                "spm_unit_id": [1, 1, 2, 2, 3],
+                "age": [40, 38, 35, 33, 29],
+                "is_parent_proxy": [True, True, True, True, True],
+                "earnings": [9000.0, 3000.0, 1500.0, 0.0, 2000.0],
+            }
+        )
+
+        result = derive_clone_capped_childcare_expenses(
+            donor_pre_subsidy=donor_pre_subsidy,
+            donor_capped=donor_capped,
+            clone_pre_subsidy=clone_pre_subsidy,
+            clone_person_data=person_data,
+            clone_spm_unit_ids=np.array([1, 2, 3]),
+        )
+
+        np.testing.assert_allclose(result, np.array([3000.0, 0.0, 0.0]))
+
+    def test_uses_single_parent_earnings_cap_for_single_proxy_units(self):
+        donor_pre_subsidy = np.array([4000.0])
+        donor_capped = np.array([4000.0])
+        clone_pre_subsidy = np.array([6000.0])
+        person_data = pd.DataFrame(
+            {
+                "spm_unit_id": [10],
+                "age": [31],
+                "is_parent_proxy": [True],
+                "earnings": [2500.0],
+            }
+        )
+
+        result = derive_clone_capped_childcare_expenses(
+            donor_pre_subsidy=donor_pre_subsidy,
+            donor_capped=donor_capped,
+            clone_pre_subsidy=clone_pre_subsidy,
+            clone_person_data=person_data,
+            clone_spm_unit_ids=np.array([10]),
+        )
+
+        np.testing.assert_allclose(result, np.array([2500.0]))
+
+    def test_falls_back_to_zero_without_parent_proxies(self):
+        donor_pre_subsidy = np.array([3000.0])
+        donor_capped = np.array([2000.0])
+        clone_pre_subsidy = np.array([3000.0])
+        person_data = pd.DataFrame(
+            {
+                "spm_unit_id": [20, 20],
+                "age": [12, 9],
+                "is_parent_proxy": [False, False],
+                "earnings": [0.0, 0.0],
+            }
+        )
+
+        result = derive_clone_capped_childcare_expenses(
+            donor_pre_subsidy=donor_pre_subsidy,
+            donor_capped=donor_capped,
+            clone_pre_subsidy=clone_pre_subsidy,
+            clone_person_data=person_data,
+            clone_spm_unit_ids=np.array([20]),
+        )
+
+        np.testing.assert_allclose(result, np.array([0.0]))
+
 
 class TestRetirementConstraints:
     """Post-processing retirement constraints enforce IRS caps."""

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Stop independently QRF-imputing clone-half ``spm_unit_capped_work_childcare_expenses`` and rebuild it deterministically from clone pre-subsidy childcare, donor capping shares, and clone earnings caps.