Skip to content

Commit 7a79100

Browse files
authored
Derive clone-half childcare caps deterministically (#704)
* Derive clone childcare cap from clone inputs
* Add childcare changelog fragment
* Handle missing ACA takeup in enhanced CPS build
1 parent d47fe50 commit 7a79100

5 files changed

Lines changed: 317 additions & 25 deletions

File tree

changelog.d/704.fixed

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Stop independently QRF-imputing clone-half ``spm_unit_capped_work_childcare_expenses`` and rebuild it deterministically from clone pre-subsidy childcare, donor capping shares, and clone earnings caps.

policyengine_us_data/datasets/cps/enhanced_cps.py

Lines changed: 62 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,36 @@ def _get_period_array(period_values: dict, period: int) -> np.ndarray:
3838
return np.asarray(value)
3939

4040

41+
def _get_base_aca_takeup(
42+
data: dict,
43+
base_year: int,
44+
tax_unit_count: int,
45+
) -> np.ndarray:
46+
"""Return stored ACA takeup or the default all-True baseline."""
47+
period_values = data.get("takes_up_aca_if_eligible")
48+
if period_values is None:
49+
logging.info(
50+
"takes_up_aca_if_eligible missing from base dataset; using default "
51+
"all-True takeup for ACA 2025 override"
52+
)
53+
return np.ones(tax_unit_count, dtype=bool)
54+
return _get_period_array(period_values, base_year).astype(bool, copy=False)
55+
56+
57+
def _set_period_array(
58+
data: dict,
59+
variable: str,
60+
period: int,
61+
values: np.ndarray,
62+
) -> None:
63+
"""Store a time-period array, creating the variable entry if needed."""
64+
period_values = data.get(variable)
65+
if period_values is None:
66+
period_values = {}
67+
data[variable] = period_values
68+
period_values[period] = values
69+
70+
4171
def create_aca_2025_takeup_override(
4272
base_takeup: np.ndarray,
4373
person_enrolled_if_takeup: np.ndarray,
@@ -282,32 +312,40 @@ def generate(self):
282312
)
283313
sim.delete_arrays("aca_ptc")
284314

285-
data["takes_up_aca_if_eligible"][2025] = create_aca_2025_takeup_override(
286-
base_takeup=_get_period_array(
287-
data["takes_up_aca_if_eligible"],
288-
base_year,
289-
),
290-
person_enrolled_if_takeup=np.asarray(
291-
sim.calculate(
292-
"aca_ptc",
293-
map_to="person",
294-
period=2025,
295-
use_weights=False,
315+
_set_period_array(
316+
data=data,
317+
variable="takes_up_aca_if_eligible",
318+
period=2025,
319+
values=create_aca_2025_takeup_override(
320+
base_takeup=_get_base_aca_takeup(
321+
data=data,
322+
base_year=base_year,
323+
tax_unit_count=len(
324+
_get_period_array(data["tax_unit_id"], base_year)
325+
),
326+
),
327+
person_enrolled_if_takeup=np.asarray(
328+
sim.calculate(
329+
"aca_ptc",
330+
map_to="person",
331+
period=2025,
332+
use_weights=False,
333+
)
296334
)
297-
)
298-
> 0,
299-
person_weights=np.asarray(
300-
sim.calculate(
301-
"person_weight",
302-
period=2025,
303-
use_weights=False,
304-
)
305-
),
306-
person_tax_unit_ids=_get_period_array(
307-
data["person_tax_unit_id"],
308-
base_year,
335+
> 0,
336+
person_weights=np.asarray(
337+
sim.calculate(
338+
"person_weight",
339+
period=2025,
340+
use_weights=False,
341+
)
342+
),
343+
person_tax_unit_ids=_get_period_array(
344+
data["person_tax_unit_id"],
345+
base_year,
346+
),
347+
tax_unit_ids=_get_period_array(data["tax_unit_id"], base_year),
309348
),
310-
tax_unit_ids=_get_period_array(data["tax_unit_id"], base_year),
311349
)
312350

313351
logging.info("Post-generation weight validation passed")

policyengine_us_data/datasets/cps/extended_cps.py

Lines changed: 137 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ def _supports_structural_mortgage_inputs() -> bool:
7676
"spm_unit_payroll_tax_reported",
7777
"spm_unit_federal_tax_reported",
7878
"spm_unit_state_tax_reported",
79-
"spm_unit_capped_work_childcare_expenses",
8079
"spm_unit_spm_threshold",
8180
"spm_unit_net_income_reported",
8281
"spm_unit_pre_subsidy_childcare_expenses",
@@ -326,6 +325,125 @@ def reconcile_ss_subcomponents(predictions, total_ss):
326325
}
327326

328327

328+
def derive_clone_capped_childcare_expenses(
    donor_pre_subsidy: np.ndarray,
    donor_capped: np.ndarray,
    clone_pre_subsidy: np.ndarray,
    clone_person_data: pd.DataFrame,
    clone_spm_unit_ids: np.ndarray,
) -> np.ndarray:
    """Derive clone-half capped childcare from clone inputs.

    The CPS provides both pre-subsidy childcare and the SPM-specific
    capped childcare deduction. For the clone half, we impute only the
    pre-subsidy amount, then deterministically rebuild the capped amount
    instead of letting a second QRF predict it independently.

    We preserve the donor's observed capping share while also respecting
    the clone's own earnings cap. This keeps the clone-half value
    consistent with pre-subsidy childcare and avoids impossible outputs
    such as capped childcare exceeding pre-subsidy childcare.

    Args:
        donor_pre_subsidy: Donor pre-subsidy childcare, one value per SPM unit.
        donor_capped: Donor capped childcare, positionally aligned with
            ``donor_pre_subsidy``.
        clone_pre_subsidy: Clone pre-subsidy childcare, positionally aligned
            with the donor arrays.
        clone_person_data: Person-level frame with columns ``spm_unit_id``,
            ``age``, ``is_parent_proxy`` and ``earnings``.
        clone_spm_unit_ids: SPM unit ids giving the order of the output.

    Returns:
        Float array: element-wise minimum of (non-negative clone pre-subsidy
        amount times the donor capped/pre-subsidy share, clipped to [0, 1])
        and the unit's earnings cap.
    """
    donor_pre_subsidy = np.asarray(donor_pre_subsidy, dtype=float)
    donor_capped = np.asarray(donor_capped, dtype=float)
    clone_pre_subsidy = np.asarray(clone_pre_subsidy, dtype=float)
    clone_spm_unit_ids = np.asarray(clone_spm_unit_ids)

    # Share of pre-subsidy childcare that survived capping for the donor;
    # donors with no positive pre-subsidy amount get a share of zero.
    donor_cap_share = np.divide(
        donor_capped,
        donor_pre_subsidy,
        out=np.zeros_like(donor_capped, dtype=float),
        where=donor_pre_subsidy > 0,
    )
    donor_cap_share = np.clip(donor_cap_share, 0.0, 1.0)
    capped_from_share = np.maximum(clone_pre_subsidy, 0.0) * donor_cap_share

    earnings_cap = _clone_earnings_cap(clone_person_data, clone_spm_unit_ids)
    return np.minimum(capped_from_share, earnings_cap)


def _clone_earnings_cap(
    clone_person_data: pd.DataFrame,
    clone_spm_unit_ids: np.ndarray,
) -> np.ndarray:
    """Return the earnings cap for each id in ``clone_spm_unit_ids``.

    The cap is the non-negative earnings of the oldest parent proxy in the
    unit, or the smaller of the two oldest proxies' earnings when there are
    at least two. Units with no parent proxy get a cap of zero.
    """
    no_cap = np.zeros(len(clone_spm_unit_ids), dtype=float)
    if clone_person_data.empty:
        return no_cap

    eligible = clone_person_data["is_parent_proxy"].astype(bool)
    parent_rows = clone_person_data.loc[
        eligible, ["spm_unit_id", "age", "earnings"]
    ].copy()
    if parent_rows.empty:
        return no_cap

    parent_rows["earnings"] = parent_rows["earnings"].clip(lower=0.0)
    # method="first" breaks age ties by row order, so each unit has at most
    # one row per rank and the indexes built below are unique.
    age_rank = parent_rows.groupby("spm_unit_id")["age"].rank(
        method="first", ascending=False
    )
    oldest = parent_rows.loc[(age_rank == 1).to_numpy()].set_index(
        "spm_unit_id"
    )["earnings"]
    second = parent_rows.loc[(age_rank == 2).to_numpy()].set_index(
        "spm_unit_id"
    )["earnings"]

    # Vectorized replacement for a per-unit Python lambda: start from the
    # oldest proxy's earnings, then lower it where a second proxy exists.
    earnings_cap_by_unit = oldest.astype(float)
    if not second.empty:
        # np.minimum propagates NaN, like a direct pairwise comparison.
        earnings_cap_by_unit.loc[second.index] = np.minimum(
            oldest.loc[second.index].to_numpy(dtype=float),
            second.to_numpy(dtype=float),
        )

    return earnings_cap_by_unit.reindex(
        clone_spm_unit_ids, fill_value=0.0
    ).to_numpy(dtype=float)
391+
392+
393+
def _rebuild_clone_capped_childcare_expenses(
    data: dict,
    time_period: int,
    cps_sim,
) -> np.ndarray:
    """Recompute capped childcare expenses for the clone half of ``data``.

    The second half of each person- and SPM-unit-level array in ``data`` is
    treated as the clone half, the first half as the donor half. Ages and
    tax-unit head/spouse flags come from ``cps_sim`` and are paired
    positionally with the clone-half person arrays.

    Args:
        data: Mapping of variable name to ``{period: array}``.
        time_period: Period key used for every lookup in ``data``.
        cps_sim: Object exposing ``calculate_dataframe``; it must return one
            row per person in a single half.

    Returns:
        The result of ``derive_clone_capped_childcare_expenses`` for the
        clone-half SPM units.

    Raises:
        ValueError: If the frame from ``cps_sim`` does not have exactly half
            as many rows as ``data["person_id"][time_period]``.
    """
    person_half = len(data["person_id"][time_period]) // 2
    spm_half = len(data["spm_unit_id"][time_period]) // 2

    roles = cps_sim.calculate_dataframe(
        ["age", "is_tax_unit_head", "is_tax_unit_spouse"]
    )
    role_count = len(roles)
    if role_count != person_half:
        raise ValueError(
            "Unexpected person role frame length while rebuilding clone childcare "
            f"expenses: got {role_count}, expected {person_half}"
        )

    # Person-level clone inputs: heads and spouses stand in for parents.
    clone_person_spm_ids = data["person_spm_unit_id"][time_period][person_half:]
    ages = roles["age"].values
    head_or_spouse = (
        roles["is_tax_unit_head"].values | roles["is_tax_unit_spouse"].values
    )
    clone_earnings = (
        data["employment_income"][time_period][person_half:]
        + data["self_employment_income"][time_period][person_half:]
    )
    clone_people = pd.DataFrame(
        {
            "spm_unit_id": clone_person_spm_ids,
            "age": ages,
            "is_parent_proxy": head_or_spouse,
            "earnings": clone_earnings,
        }
    )

    # SPM-unit-level inputs, split into donor (first) and clone (second) halves.
    pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period]
    capped = data["spm_unit_capped_work_childcare_expenses"][time_period]
    clone_unit_ids = data["spm_unit_id"][time_period][spm_half:]

    return derive_clone_capped_childcare_expenses(
        donor_pre_subsidy=pre_subsidy[:spm_half],
        donor_capped=capped[:spm_half],
        clone_pre_subsidy=pre_subsidy[spm_half:],
        clone_person_data=clone_people,
        clone_spm_unit_ids=clone_unit_ids,
    )
445+
446+
329447
def _apply_post_processing(predictions, X_test, time_period, data):
330448
"""Apply retirement constraints and SS reconciliation."""
331449
ret_cols = [c for c in predictions.columns if c in _RETIREMENT_VARS]
@@ -430,6 +548,24 @@ def _splice_cps_only_predictions(
430548
new_values = np.concatenate([cps_half, pred_values])
431549
data[var] = {time_period: new_values}
432550

551+
if (
552+
"spm_unit_capped_work_childcare_expenses" in data
553+
and "spm_unit_pre_subsidy_childcare_expenses" in data
554+
):
555+
n_half = entity_half_lengths.get(
556+
"spm_unit",
557+
len(data["spm_unit_capped_work_childcare_expenses"][time_period]) // 2,
558+
)
559+
cps_half = data["spm_unit_capped_work_childcare_expenses"][time_period][:n_half]
560+
clone_half = _rebuild_clone_capped_childcare_expenses(
561+
data=data,
562+
time_period=time_period,
563+
cps_sim=cps_sim,
564+
)
565+
data["spm_unit_capped_work_childcare_expenses"] = {
566+
time_period: np.concatenate([cps_half, clone_half])
567+
}
568+
433569
del cps_sim
434570
return data
435571

tests/unit/test_enhanced_cps.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import numpy as np
2+
3+
from policyengine_us_data.datasets.cps.enhanced_cps import (
4+
_get_base_aca_takeup,
5+
_set_period_array,
6+
)
7+
8+
9+
def test_get_base_aca_takeup_uses_stored_values():
    """Stored base-year takeup flags are returned as they were stored."""
    stored_flags = np.array([True, False, True], dtype=bool)
    dataset = {"takes_up_aca_if_eligible": {2024: stored_flags}}

    takeup = _get_base_aca_takeup(data=dataset, base_year=2024, tax_unit_count=3)

    expected = np.array([True, False, True], dtype=bool)
    np.testing.assert_array_equal(takeup, expected)
22+
23+
24+
def test_get_base_aca_takeup_defaults_to_true_when_missing():
    """With no stored variable, every tax unit defaults to taking up."""
    empty_dataset = {}

    takeup = _get_base_aca_takeup(
        data=empty_dataset,
        base_year=2024,
        tax_unit_count=4,
    )

    expected = np.ones(4, dtype=bool)
    np.testing.assert_array_equal(takeup, expected)
28+
29+
30+
def test_set_period_array_creates_missing_variable_entry():
    """Writing to an unknown variable creates its period mapping."""
    dataset = {}
    flags = np.array([True, False], dtype=bool)

    _set_period_array(dataset, "takes_up_aca_if_eligible", 2025, flags)

    stored = dataset["takes_up_aca_if_eligible"][2025]
    np.testing.assert_array_equal(stored, flags)

tests/unit/test_extended_cps.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
CPS_ONLY_IMPUTED_VARIABLES,
2020
CPS_STAGE2_INCOME_PREDICTORS,
2121
apply_retirement_constraints,
22+
derive_clone_capped_childcare_expenses,
2223
reconcile_ss_subcomponents,
2324
)
2425
from policyengine_us_data.datasets.org import ORG_IMPUTED_VARIABLES
@@ -116,6 +117,86 @@ def test_pension_income_not_in_cps_only(self):
116117
f"Pension income vars should not be in CPS_ONLY: {present}"
117118
)
118119

120+
def test_capped_childcare_not_in_cps_only(self):
    """Capped childcare must be derived from clone-half inputs rather than
    independently QRF-imputed, so it may not be listed as CPS-only."""
    cps_only_names = set(CPS_ONLY_IMPUTED_VARIABLES)
    assert "spm_unit_capped_work_childcare_expenses" not in cps_only_names
126+
127+
128+
class TestCloneChildcareDerivation:
    """Deterministic derivation of clone-half capped childcare."""

    def test_caps_at_pre_subsidy_and_clone_earnings(self):
        """The donor share and the lower-earning parent both limit the result."""
        people = pd.DataFrame(
            {
                "spm_unit_id": [1, 1, 2, 2, 3],
                "age": [40, 38, 35, 33, 29],
                "is_parent_proxy": [True, True, True, True, True],
                "earnings": [9000.0, 3000.0, 1500.0, 0.0, 2000.0],
            }
        )

        capped = derive_clone_capped_childcare_expenses(
            donor_pre_subsidy=np.array([10000.0, 4000.0, 6000.0]),
            donor_capped=np.array([4000.0, 4000.0, 0.0]),
            clone_pre_subsidy=np.array([12000.0, 5000.0, 3000.0]),
            clone_person_data=people,
            clone_spm_unit_ids=np.array([1, 2, 3]),
        )

        expected = np.array([3000.0, 0.0, 0.0])
        np.testing.assert_allclose(capped, expected)

    def test_uses_single_parent_earnings_cap_for_single_proxy_units(self):
        """A lone parent proxy's earnings act as the cap."""
        people = pd.DataFrame(
            {
                "spm_unit_id": [10],
                "age": [31],
                "is_parent_proxy": [True],
                "earnings": [2500.0],
            }
        )

        capped = derive_clone_capped_childcare_expenses(
            donor_pre_subsidy=np.array([4000.0]),
            donor_capped=np.array([4000.0]),
            clone_pre_subsidy=np.array([6000.0]),
            clone_person_data=people,
            clone_spm_unit_ids=np.array([10]),
        )

        expected = np.array([2500.0])
        np.testing.assert_allclose(capped, expected)

    def test_falls_back_to_zero_without_parent_proxies(self):
        """Units with no parent proxy end up with a zero capped amount."""
        people = pd.DataFrame(
            {
                "spm_unit_id": [20, 20],
                "age": [12, 9],
                "is_parent_proxy": [False, False],
                "earnings": [0.0, 0.0],
            }
        )

        capped = derive_clone_capped_childcare_expenses(
            donor_pre_subsidy=np.array([3000.0]),
            donor_capped=np.array([2000.0]),
            clone_pre_subsidy=np.array([3000.0]),
            clone_person_data=people,
            clone_spm_unit_ids=np.array([20]),
        )

        expected = np.array([0.0])
        np.testing.assert_allclose(capped, expected)
199+
119200

120201
class TestRetirementConstraints:
121202
"""Post-processing retirement constraints enforce IRS caps."""

0 commit comments

Comments
 (0)