Skip to content

Commit 3333a7b

Browse files
authored
Validate housing assistance formulas before export (#995)
1 parent dd0957b commit 3333a7b

6 files changed

Lines changed: 294 additions & 5 deletions

File tree

changelog.d/995.changed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Validate Extended CPS housing assistance formulas before dropping formula outputs from the final export.

policyengine_us_data/datasets/cps/extended_cps.py

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
CPS_2024,
1212
CPS_2024_Full,
1313
ESI_POLICYHOLDER_VARIABLE,
14+
_open_dataset_read_only,
1415
)
1516
from policyengine_us_data.datasets.org import (
1617
ORG_IMPUTED_VARIABLES,
@@ -675,6 +676,46 @@ def reconcile_ss_subcomponents(predictions, total_ss):
675676
"weekly_hours_worked": "weekly_hours_worked_before_lsr",
676677
}
677678

679+
# Variables that are stripped from the validation dataset and later removed
# from the exported data by the housing assistance helpers in this module.
_HOUSING_ASSISTANCE_FORMULA_OUTPUTS = {
    "spm_unit_capped_housing_subsidy",
    "housing_assistance",
}

# Minimum ratio of the modeled capped housing subsidy total to the benchmark
# total; anything below this share is treated as a validation failure.
_MIN_MODELED_HOUSING_SHARE_OF_BENCHMARK = 0.01
684+
685+
686+
class _InMemoryTimePeriodDataset(Dataset):
    """Dataset wrapper that serves an already-built ``{variable: {period: values}}`` dict.

    Both ``load`` and ``load_dataset`` hand back the dict passed to the
    constructor, so nothing is read from ``file_path`` by this class itself.
    NOTE(review): whether the ``Dataset`` base class touches ``file_path``
    during ``__init__`` is not visible here -- confirm.
    """

    name = "extended_cps_validation"
    label = "Extended CPS validation"
    data_format = Dataset.TIME_PERIOD_ARRAYS
    file_path = STORAGE_FOLDER / "extended_cps_validation.h5"

    def __init__(self, data: dict, time_period: int):
        """Store the in-memory payload and period, then run the base initialiser."""
        # Attributes are assigned before ``super().__init__()`` so they are
        # already available if the base initialiser reads them.
        self._data = data
        self.time_period = time_period
        super().__init__()

    def load_dataset(self):
        """Return the in-memory data dict supplied at construction."""
        return self._data

    def load(self):
        """Return the in-memory data dict supplied at construction."""
        return self._data
702+
703+
704+
def _load_raw_spm_capped_housing_subsidy(cps_dataset, time_period: int):
    """Load Census SPM capped housing subsidy for validation only.

    Args:
        cps_dataset: Object whose optional ``raw_cps`` attribute identifies
            the raw CPS dataset to open.
        time_period: Key under which the loaded values are returned.

    Returns:
        ``{time_period: values}`` with ``values`` a float array of the
        ``SPM_CAPHOUSESUB`` column, or ``None`` when ``raw_cps`` is missing
        or ``None`` or the ``spm_unit`` table has no such column.
    """

    raw_source = getattr(cps_dataset, "raw_cps", None)
    if raw_source is None:
        return None

    column = "SPM_CAPHOUSESUB"
    with _open_dataset_read_only(raw_source) as store:
        spm_units = store["spm_unit"]
        if column not in spm_units.columns:
            return None
        # Materialise the column while the store is still open.
        subsidy = np.asarray(spm_units[column], dtype=float)

    return {time_period: subsidy}
718+
678719

679720
def _apply_post_processing(predictions, X_test, time_period, data):
680721
"""Apply retirement constraints and SS reconciliation."""
@@ -834,6 +875,22 @@ def generate(self):
834875
data_dict = {}
835876
for var in data:
836877
data_dict[var] = {self.time_period: data[var][...]}
878+
raw_spm_capped_housing_subsidy = _load_raw_spm_capped_housing_subsidy(
879+
self.cps,
880+
self.time_period,
881+
)
882+
if raw_spm_capped_housing_subsidy is not None:
883+
source_values = raw_spm_capped_housing_subsidy[self.time_period]
884+
spm_unit_ids = data_dict.get("spm_unit_id", {}).get(self.time_period)
885+
if spm_unit_ids is not None and len(source_values) == len(spm_unit_ids):
886+
data_dict["spm_unit_capped_housing_subsidy"] = (
887+
raw_spm_capped_housing_subsidy
888+
)
889+
else:
890+
logger.warning(
891+
"Skipping raw SPM capped housing subsidy validation benchmark "
892+
"due to SPM unit length mismatch"
893+
)
837894

838895
state_fips = data_dict["state_fips"][self.time_period]
839896
county_fips = data_dict.get("county_fips", {}).get(self.time_period)
@@ -890,6 +947,11 @@ def generate(self):
890947
new_data = self._impute_aotc_eligibility_inputs(new_data, self.time_period)
891948
new_data = self._impute_llc_eligibility_inputs(new_data, self.time_period)
892949
new_data = self._rename_imputed_to_inputs(new_data)
950+
new_data = self._validate_housing_assistance_microsimulation(
951+
new_data,
952+
self.time_period,
953+
)
954+
new_data = self._drop_housing_assistance_formula_outputs(new_data)
893955
if _supports_structural_mortgage_inputs():
894956
had_positive_mortgage_input = self._has_positive_mortgage_input(
895957
new_data,
@@ -1237,6 +1299,126 @@ def _validate_structural_mortgage_conversion(
12371299
"Structural mortgage conversion lost positive mortgage inputs."
12381300
)
12391301

1302+
@classmethod
@pipeline_node(
    PipelineNode(
        id="housing_assistance_microsim_validation",
        label="Validate Housing Assistance Microsimulation",
        node_type="process",
        description=(
            "Runs a temporary microsimulation before final export to ensure "
            "housing-assistance leaf inputs reconstruct nonzero modeled "
            "housing assistance and capped SPM housing subsidy."
        ),
        status="transitional",
        stability="moving",
        pathways=["data_build"],
        artifacts_in=["extended_cps_stage2"],
        artifacts_out=["housing_validated_extended_cps"],
        pydoc=True,
    )
)
def _validate_housing_assistance_microsimulation(
    cls,
    data,
    time_period,
    microsimulation_cls=None,
):
    """Check formula-reconstructed housing assistance before export.

    The final H5 must not export formula outputs such as ``housing_assistance``.
    This guard verifies that the remaining leaf inputs still make those
    formulas produce nonzero values before the export contract strips or
    rejects computed variables.

    Args:
        data: Mapping of variable name to ``{period: values}``. This method
            does not add or remove keys.
        time_period: Period key used to read ``data`` and to run the
            simulation calculations.
        microsimulation_cls: Optional simulation class, called as
            ``microsimulation_cls(dataset=...)``. Defaults to
            ``policyengine_us.Microsimulation``, imported lazily.

    Returns:
        ``data`` itself. Validation is skipped (and ``data`` returned
        immediately) when neither housing flag is present or no flag is set.

    Raises:
        RuntimeError: If either modeled total is not strictly positive
            (including NaN), or if the modeled capped subsidy falls below
            ``_MIN_MODELED_HOUSING_SHARE_OF_BENCHMARK`` of a positive
            benchmark total.
    """

    flag_names = (
        "receives_housing_assistance",
        "takes_up_housing_assistance_if_eligible",
    )
    flags = [
        np.asarray(values, dtype=bool)
        for values in (data.get(name, {}).get(time_period) for name in flag_names)
        if values is not None
    ]
    # Each flag array is checked on its own, so the two inputs do not need
    # to have broadcast-compatible shapes just to decide whether to validate.
    if not any(flag.any() for flag in flags):
        return data

    # The simulation must recompute the formula outputs from leaf inputs,
    # so any stored values for them are withheld from the dataset.
    validation_data = {
        variable: values
        for variable, values in data.items()
        if variable not in _HOUSING_ASSISTANCE_FORMULA_OUTPUTS
    }
    if microsimulation_cls is None:
        from policyengine_us import Microsimulation

        microsimulation_cls = Microsimulation

    simulation = microsimulation_cls(
        dataset=_InMemoryTimePeriodDataset(validation_data, time_period)
    )
    housing_assistance = simulation.calculate("housing_assistance", time_period)
    capped_housing_subsidy = simulation.calculate(
        "spm_unit_capped_housing_subsidy",
        time_period,
    )
    housing_total = float(housing_assistance.sum())
    capped_total = float(capped_housing_subsidy.sum())
    # Written as ``not (x > 0)`` rather than ``x <= 0``: every ordered
    # comparison with NaN is False, so the positive form also rejects NaN
    # totals instead of letting them slip through the guard.
    if not (housing_total > 0 and capped_total > 0):
        raise RuntimeError(
            "Housing assistance inputs do not reconstruct modeled benefits: "
            f"housing_assistance={housing_total:,.0f}, "
            f"spm_unit_capped_housing_subsidy={capped_total:,.0f}. "
            "Check receives_housing_assistance, "
            "takes_up_housing_assistance_if_eligible, county_fips, rent, "
            "and HUD payment-standard inputs before dropping formula outputs."
        )

    benchmark = data.get("spm_unit_capped_housing_subsidy", {}).get(time_period)
    if benchmark is None:
        return data

    from microdf import MicroSeries

    spm_unit_weight = simulation.calculate(
        "spm_unit_weight",
        time_period,
        use_weights=False,
    )
    # Accept either an object exposing ``.values`` or a plain array.
    weights = np.asarray(getattr(spm_unit_weight, "values", spm_unit_weight))
    benchmark_total = float(
        MicroSeries(np.asarray(benchmark, dtype=float), weights=weights).sum()
    )
    # The benchmark is best-effort: a non-positive or NaN benchmark total
    # gives nothing to compare against, so the comparison is skipped.
    if not (benchmark_total > 0):
        return data

    minimum_total = benchmark_total * _MIN_MODELED_HOUSING_SHARE_OF_BENCHMARK
    if capped_total < minimum_total:
        raise RuntimeError(
            "Modeled capped housing subsidy is implausibly small relative "
            "to the raw ASEC SPM housing subsidy benchmark: "
            f"modeled={capped_total:,.0f}, benchmark={benchmark_total:,.0f}. "
            "This likely means a required formula input is missing before "
            "housing assistance formula outputs are dropped from the final export."
        )
    return data
1413+
1414+
@classmethod
def _drop_housing_assistance_formula_outputs(cls, data):
    """Remove housing assistance formula outputs after validation.

    Deletes every key of ``data`` that is listed in
    ``_HOUSING_ASSISTANCE_FORMULA_OUTPUTS``. The mapping is modified in
    place and the same object is returned.
    """

    present = [name for name in _HOUSING_ASSISTANCE_FORMULA_OUTPUTS if name in data]
    for name in present:
        data.pop(name)
    return data
1421+
12401422
# QRF imputes formula-level variables (e.g. taxable_pension_income)
12411423
# but we must store them under leaf input names. The engine then
12421424
# recomputes the formula var from its adds.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ classifiers = [
2222
"Programming Language :: Python :: 3.14",
2323
]
2424
dependencies = [
25-
"policyengine-us==1.692.1",
25+
"policyengine-us==1.693.1",
2626
# policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for
2727
# PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
2828
# after _invalidate_all_caches) and is required by policyengine-us 1.682.1+.

tests/unit/test_extended_cps.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,22 @@
4242
from policyengine_us_data.utils.dataset_validation import DatasetContractError
4343

4444

45+
class _FakeHousingMicrosimulation:
    """Stand-in simulation that returns canned arrays by variable name.

    ``outputs`` maps variable names to the arrays ``calculate`` returns;
    ``seen_data`` records what the most recently constructed instance got
    from ``dataset.load_dataset()``. Both are stored on the class.
    """

    outputs = {
        "housing_assistance": np.array([5_000.0, 0.0]),
        "spm_unit_capped_housing_subsidy": np.array([4_000.0, 0.0]),
        "spm_unit_weight": np.array([2.0, 3.0]),
    }
    seen_data = None

    def __init__(self, dataset):
        """Keep the dataset and record its loaded contents on the class."""
        self.dataset = dataset
        fake_cls = self.__class__
        fake_cls.seen_data = dataset.load_dataset()

    def calculate(self, variable, period, **kwargs):
        """Return the canned output for ``variable``; other arguments are ignored."""
        canned = self.__class__.outputs
        return canned[variable]
59+
60+
4561
class TestVariableListConsistency:
4662
"""Variable lists should not overlap — no variable should be
4763
imputed by two different mechanisms."""
@@ -237,6 +253,83 @@ def test_finalize_stage2_computed_variables_renames_and_drops(self):
237253
assert "tax_unit_is_joint" not in result
238254
assert "employment_income_last_year" not in result
239255

256+
def test_housing_assistance_validation_removes_formula_outputs_for_microsim(self):
    """Formula outputs stored in ``data`` must not reach the simulation dataset."""

    # A per-test subclass keeps ``outputs`` and ``seen_data`` off the shared
    # fake, so a stale ``seen_data`` left by another test cannot satisfy the
    # ``not in`` assertions below.
    class _Simulation(_FakeHousingMicrosimulation):
        outputs = {
            "housing_assistance": np.array([5_000.0, 0.0]),
            "spm_unit_capped_housing_subsidy": np.array([4_000.0, 0.0]),
            "spm_unit_weight": np.array([2.0, 3.0]),
        }
        seen_data = None

    data = {
        "receives_housing_assistance": {2024: np.array([True, False])},
        "takes_up_housing_assistance_if_eligible": {2024: np.array([True, False])},
        "housing_assistance": {2024: np.array([99_000.0, 99_000.0])},
        "spm_unit_capped_housing_subsidy": {2024: np.array([3_000.0, 0.0])},
    }

    result = ExtendedCPS._validate_housing_assistance_microsimulation(
        data,
        2024,
        microsimulation_cls=_Simulation,
    )

    assert result is data
    # Proves the simulation was actually constructed during this call.
    assert _Simulation.seen_data is not None
    assert "housing_assistance" not in _Simulation.seen_data
    assert "spm_unit_capped_housing_subsidy" not in _Simulation.seen_data
281+
282+
def test_housing_assistance_validation_rejects_zero_modeled_benefits(self):
    """Zero modeled totals must raise before formula outputs are dropped."""

    # Canned outputs live on a per-test subclass so the shared fake's
    # class-level state is not rewritten for tests that run afterwards.
    class _Simulation(_FakeHousingMicrosimulation):
        outputs = {
            "housing_assistance": np.array([0.0]),
            "spm_unit_capped_housing_subsidy": np.array([0.0]),
            "spm_unit_weight": np.array([1.0]),
        }
        seen_data = None

    data = {
        "receives_housing_assistance": {2024: np.array([True])},
        "takes_up_housing_assistance_if_eligible": {2024: np.array([True])},
    }

    with pytest.raises(RuntimeError, match="do not reconstruct modeled benefits"):
        ExtendedCPS._validate_housing_assistance_microsimulation(
            data,
            2024,
            microsimulation_cls=_Simulation,
        )
299+
300+
def test_housing_assistance_validation_rejects_tiny_reported_match(self):
    """A modeled total far below the benchmark total must raise."""

    # Canned outputs live on a per-test subclass so the shared fake's
    # class-level state is not rewritten for tests that run afterwards.
    class _Simulation(_FakeHousingMicrosimulation):
        outputs = {
            "housing_assistance": np.array([100.0]),
            "spm_unit_capped_housing_subsidy": np.array([50.0]),
            "spm_unit_weight": np.array([1.0]),
        }
        seen_data = None

    data = {
        "receives_housing_assistance": {2024: np.array([True])},
        "takes_up_housing_assistance_if_eligible": {2024: np.array([True])},
        # Benchmark total of 10,000 against a modeled total of 50.
        "spm_unit_capped_housing_subsidy": {2024: np.array([10_000.0])},
    }

    with pytest.raises(RuntimeError, match="implausibly small"):
        ExtendedCPS._validate_housing_assistance_microsimulation(
            data,
            2024,
            microsimulation_cls=_Simulation,
        )
318+
319+
def test_drop_housing_assistance_formula_outputs_after_validation(self):
    """Formula outputs are removed in place; leaf inputs are kept."""
    formula_outputs = ("housing_assistance", "spm_unit_capped_housing_subsidy")
    data = {
        "housing_assistance": {2024: np.array([1_000.0])},
        "spm_unit_capped_housing_subsidy": {2024: np.array([800.0])},
        "receives_housing_assistance": {2024: np.array([True])},
    }

    result = ExtendedCPS._drop_housing_assistance_formula_outputs(data)

    # The helper returns the very same mapping it was given.
    assert result is data
    for name in formula_outputs:
        assert name not in result
    assert "receives_housing_assistance" in result
332+
240333

241334
class TestStructuralMortgageValidation:
242335
def test_positive_mortgage_input_ignores_non_mortgage_interest_deduction(self):

tests/unit/test_policyengine_us_dependency_contract.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,16 @@ def test_policyengine_us_defines_housing_assistance_takeup_input():
99
assert not getattr(variable, "formulas", None)
1010
assert not getattr(variable, "adds", None)
1111
assert not getattr(variable, "subtracts", None)
12+
13+
14+
def test_policyengine_us_defines_housing_assistance_formulas():
    """Both housing variables must be SPM-unit variables that carry formulas."""
    variables = CountryTaxBenefitSystem().variables
    expected_formula_variables = (
        "housing_assistance",
        "spm_unit_capped_housing_subsidy",
    )

    for name in expected_formula_variables:
        definition = variables[name]

        assert definition.entity.key == "spm_unit"
        assert getattr(definition, "formulas", None)

uv.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)