|
11 | 11 | CPS_2024, |
12 | 12 | CPS_2024_Full, |
13 | 13 | ESI_POLICYHOLDER_VARIABLE, |
| 14 | + _open_dataset_read_only, |
14 | 15 | ) |
15 | 16 | from policyengine_us_data.datasets.org import ( |
16 | 17 | ORG_IMPUTED_VARIABLES, |
@@ -675,6 +676,46 @@ def reconcile_ss_subcomponents(predictions, total_ss): |
675 | 676 | "weekly_hours_worked": "weekly_hours_worked_before_lsr", |
676 | 677 | } |
677 | 678 |
|
| 679 | +_HOUSING_ASSISTANCE_FORMULA_OUTPUTS = { |
| 680 | + "housing_assistance", |
| 681 | + "spm_unit_capped_housing_subsidy", |
| 682 | +} |
| 683 | +_MIN_MODELED_HOUSING_SHARE_OF_BENCHMARK = 0.01 |
| 684 | + |
| 685 | + |
class _InMemoryTimePeriodDataset(Dataset):
    """Dataset whose contents are an already-built dict held in memory.

    ``load`` and ``load_dataset`` both hand back the dict given to the
    constructor. ``file_path`` is declared as a class attribute —
    presumably because the base class expects one; confirm against
    ``Dataset``.
    """

    data_format = Dataset.TIME_PERIOD_ARRAYS
    name = "extended_cps_validation"
    label = "Extended CPS validation"
    file_path = STORAGE_FOLDER / "extended_cps_validation.h5"

    def __init__(self, data: dict, time_period: int):
        """Store ``data`` and ``time_period``, then run the base initialiser."""
        # Both attributes are assigned before the base ``__init__`` runs,
        # matching the original statement order.
        self._data, self.time_period = data, time_period
        super().__init__()

    def load(self) -> dict:
        """Return the in-memory data dict."""
        return self._data

    def load_dataset(self) -> dict:
        """Return the in-memory data dict."""
        return self._data
| 702 | + |
| 703 | + |
| 704 | +def _load_raw_spm_capped_housing_subsidy(cps_dataset, time_period: int): |
| 705 | + """Load Census SPM capped housing subsidy for validation only.""" |
| 706 | + |
| 707 | + raw_cps = getattr(cps_dataset, "raw_cps", None) |
| 708 | + if raw_cps is None: |
| 709 | + return None |
| 710 | + |
| 711 | + with _open_dataset_read_only(raw_cps) as raw_data: |
| 712 | + spm_unit = raw_data["spm_unit"] |
| 713 | + if "SPM_CAPHOUSESUB" not in spm_unit.columns: |
| 714 | + return None |
| 715 | + values = np.asarray(spm_unit["SPM_CAPHOUSESUB"], dtype=float) |
| 716 | + |
| 717 | + return {time_period: values} |
| 718 | + |
678 | 719 |
|
679 | 720 | def _apply_post_processing(predictions, X_test, time_period, data): |
680 | 721 | """Apply retirement constraints and SS reconciliation.""" |
@@ -834,6 +875,22 @@ def generate(self): |
834 | 875 | data_dict = {} |
835 | 876 | for var in data: |
836 | 877 | data_dict[var] = {self.time_period: data[var][...]} |
| 878 | + raw_spm_capped_housing_subsidy = _load_raw_spm_capped_housing_subsidy( |
| 879 | + self.cps, |
| 880 | + self.time_period, |
| 881 | + ) |
| 882 | + if raw_spm_capped_housing_subsidy is not None: |
| 883 | + source_values = raw_spm_capped_housing_subsidy[self.time_period] |
| 884 | + spm_unit_ids = data_dict.get("spm_unit_id", {}).get(self.time_period) |
| 885 | + if spm_unit_ids is not None and len(source_values) == len(spm_unit_ids): |
| 886 | + data_dict["spm_unit_capped_housing_subsidy"] = ( |
| 887 | + raw_spm_capped_housing_subsidy |
| 888 | + ) |
| 889 | + else: |
| 890 | + logger.warning( |
| 891 | + "Skipping raw SPM capped housing subsidy validation benchmark " |
| 892 | + "due to SPM unit length mismatch" |
| 893 | + ) |
837 | 894 |
|
838 | 895 | state_fips = data_dict["state_fips"][self.time_period] |
839 | 896 | county_fips = data_dict.get("county_fips", {}).get(self.time_period) |
@@ -890,6 +947,11 @@ def generate(self): |
890 | 947 | new_data = self._impute_aotc_eligibility_inputs(new_data, self.time_period) |
891 | 948 | new_data = self._impute_llc_eligibility_inputs(new_data, self.time_period) |
892 | 949 | new_data = self._rename_imputed_to_inputs(new_data) |
| 950 | + new_data = self._validate_housing_assistance_microsimulation( |
| 951 | + new_data, |
| 952 | + self.time_period, |
| 953 | + ) |
| 954 | + new_data = self._drop_housing_assistance_formula_outputs(new_data) |
893 | 955 | if _supports_structural_mortgage_inputs(): |
894 | 956 | had_positive_mortgage_input = self._has_positive_mortgage_input( |
895 | 957 | new_data, |
@@ -1237,6 +1299,126 @@ def _validate_structural_mortgage_conversion( |
1237 | 1299 | "Structural mortgage conversion lost positive mortgage inputs." |
1238 | 1300 | ) |
1239 | 1301 |
|
| 1302 | + @classmethod |
| 1303 | + @pipeline_node( |
| 1304 | + PipelineNode( |
| 1305 | + id="housing_assistance_microsim_validation", |
| 1306 | + label="Validate Housing Assistance Microsimulation", |
| 1307 | + node_type="process", |
| 1308 | + description=( |
| 1309 | + "Runs a temporary microsimulation before final export to ensure " |
| 1310 | + "housing-assistance leaf inputs reconstruct nonzero modeled " |
| 1311 | + "housing assistance and capped SPM housing subsidy." |
| 1312 | + ), |
| 1313 | + status="transitional", |
| 1314 | + stability="moving", |
| 1315 | + pathways=["data_build"], |
| 1316 | + artifacts_in=["extended_cps_stage2"], |
| 1317 | + artifacts_out=["housing_validated_extended_cps"], |
| 1318 | + pydoc=True, |
| 1319 | + ) |
| 1320 | + ) |
| 1321 | + def _validate_housing_assistance_microsimulation( |
| 1322 | + cls, |
| 1323 | + data, |
| 1324 | + time_period, |
| 1325 | + microsimulation_cls=None, |
| 1326 | + ): |
| 1327 | + """Check formula-reconstructed housing assistance before export. |
| 1328 | +
|
| 1329 | + The final H5 must not export formula outputs such as ``housing_assistance``. |
| 1330 | + This guard verifies that the remaining leaf inputs still make those |
| 1331 | + formulas produce nonzero values before the export contract strips or |
| 1332 | + rejects computed variables. |
| 1333 | + """ |
| 1334 | + |
| 1335 | + receives = data.get("receives_housing_assistance", {}).get(time_period) |
| 1336 | + takes_up = data.get("takes_up_housing_assistance_if_eligible", {}).get( |
| 1337 | + time_period |
| 1338 | + ) |
| 1339 | + if receives is None and takes_up is None: |
| 1340 | + return data |
| 1341 | + |
| 1342 | + receives = ( |
| 1343 | + np.asarray(receives, dtype=bool) |
| 1344 | + if receives is not None |
| 1345 | + else np.zeros_like(np.asarray(takes_up, dtype=bool)) |
| 1346 | + ) |
| 1347 | + takes_up = ( |
| 1348 | + np.asarray(takes_up, dtype=bool) |
| 1349 | + if takes_up is not None |
| 1350 | + else np.zeros_like(receives, dtype=bool) |
| 1351 | + ) |
| 1352 | + if not (receives | takes_up).any(): |
| 1353 | + return data |
| 1354 | + |
| 1355 | + validation_data = { |
| 1356 | + variable: values |
| 1357 | + for variable, values in data.items() |
| 1358 | + if variable not in _HOUSING_ASSISTANCE_FORMULA_OUTPUTS |
| 1359 | + } |
| 1360 | + if microsimulation_cls is None: |
| 1361 | + from policyengine_us import Microsimulation |
| 1362 | + |
| 1363 | + microsimulation_cls = Microsimulation |
| 1364 | + |
| 1365 | + simulation = microsimulation_cls( |
| 1366 | + dataset=_InMemoryTimePeriodDataset(validation_data, time_period) |
| 1367 | + ) |
| 1368 | + housing_assistance = simulation.calculate("housing_assistance", time_period) |
| 1369 | + capped_housing_subsidy = simulation.calculate( |
| 1370 | + "spm_unit_capped_housing_subsidy", |
| 1371 | + time_period, |
| 1372 | + ) |
| 1373 | + housing_total = float(housing_assistance.sum()) |
| 1374 | + capped_total = float(capped_housing_subsidy.sum()) |
| 1375 | + if housing_total <= 0 or capped_total <= 0: |
| 1376 | + raise RuntimeError( |
| 1377 | + "Housing assistance inputs do not reconstruct modeled benefits: " |
| 1378 | + f"housing_assistance={housing_total:,.0f}, " |
| 1379 | + f"spm_unit_capped_housing_subsidy={capped_total:,.0f}. " |
| 1380 | + "Check receives_housing_assistance, " |
| 1381 | + "takes_up_housing_assistance_if_eligible, county_fips, rent, " |
| 1382 | + "and HUD payment-standard inputs before dropping formula outputs." |
| 1383 | + ) |
| 1384 | + |
| 1385 | + benchmark = data.get("spm_unit_capped_housing_subsidy", {}).get(time_period) |
| 1386 | + if benchmark is None: |
| 1387 | + return data |
| 1388 | + |
| 1389 | + from microdf import MicroSeries |
| 1390 | + |
| 1391 | + spm_unit_weight = simulation.calculate( |
| 1392 | + "spm_unit_weight", |
| 1393 | + time_period, |
| 1394 | + use_weights=False, |
| 1395 | + ) |
| 1396 | + weights = np.asarray(getattr(spm_unit_weight, "values", spm_unit_weight)) |
| 1397 | + benchmark_total = float( |
| 1398 | + MicroSeries(np.asarray(benchmark, dtype=float), weights=weights).sum() |
| 1399 | + ) |
| 1400 | + if benchmark_total <= 0: |
| 1401 | + return data |
| 1402 | + |
| 1403 | + minimum_total = benchmark_total * _MIN_MODELED_HOUSING_SHARE_OF_BENCHMARK |
| 1404 | + if capped_total < minimum_total: |
| 1405 | + raise RuntimeError( |
| 1406 | + "Modeled capped housing subsidy is implausibly small relative " |
| 1407 | + "to the raw ASEC SPM housing subsidy benchmark: " |
| 1408 | + f"modeled={capped_total:,.0f}, benchmark={benchmark_total:,.0f}. " |
| 1409 | + "This likely means a required formula input is missing before " |
| 1410 | + "housing assistance formula outputs are dropped from the final export." |
| 1411 | + ) |
| 1412 | + return data |
| 1413 | + |
| 1414 | + @classmethod |
| 1415 | + def _drop_housing_assistance_formula_outputs(cls, data): |
| 1416 | + """Remove housing assistance formula outputs after validation.""" |
| 1417 | + |
| 1418 | + for variable in sorted(set(data) & _HOUSING_ASSISTANCE_FORMULA_OUTPUTS): |
| 1419 | + del data[variable] |
| 1420 | + return data |
| 1421 | + |
1240 | 1422 | # QRF imputes formula-level variables (e.g. taxable_pension_income) |
1241 | 1423 | # but we must store them under leaf input names. The engine then |
1242 | 1424 | # recomputes the formula var from its adds. |
|
0 commit comments