Skip to content

Commit d7ad2fd

Browse files
committed
Add housing assistance count calibration targets
1 parent 05a2fa8 commit d7ad2fd

11 files changed

Lines changed: 391 additions & 16 deletions

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ database:
8787
python -m policyengine_us_data.db.etl_age --year $(YEAR)
8888
python -m policyengine_us_data.db.etl_medicaid --year $(YEAR)
8989
python -m policyengine_us_data.db.etl_snap --year $(YEAR)
90+
python -m policyengine_us_data.db.etl_housing_assistance --year $(YEAR)
9091
python -m policyengine_us_data.db.etl_tanf --year $(YEAR)
9192
python -m policyengine_us_data.db.etl_state_income_tax --year $(YEAR)
9293
python -m policyengine_us_data.db.etl_irs_soi --year $(YEAR)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add HUD Picture of Subsidized Households assisted-household count targets for housing assistance calibration.

policyengine_us_data/calibration/target_config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ include:
5151
- variable: household_count
5252
geo_level: state
5353
domain_variable: snap
54+
- variable: household_count
55+
geo_level: state
56+
domain_variable: housing_assistance
5457
- variable: tanf
5558
geo_level: state
5659
- variable: adjusted_gross_income
@@ -199,6 +202,9 @@ include:
199202
- variable: household_count
200203
geo_level: national
201204
domain_variable: spm_unit_energy_subsidy_reported
205+
- variable: household_count
206+
geo_level: national
207+
domain_variable: housing_assistance
202208
- variable: tip_income
203209
geo_level: national
204210
- variable: unemployment_compensation

policyengine_us_data/calibration/unified_matrix_builder.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050

5151
COUNTY_DEPENDENT_VARS = {
5252
"aca_ptc",
53+
"housing_assistance",
5354
}
5455

5556

@@ -1316,13 +1317,18 @@ def _process_single_clone(
13161317
ent_hh_ids = household_ids[ent_hh]
13171318
ent_ci = np.full(n_ent, clone_idx, dtype=np.int64)
13181319

1320+
eligible_mask = None
1321+
if takeup_var == "takes_up_housing_assistance_if_eligible":
1322+
eligible_mask = ent_eligible > 0
1323+
13191324
ent_takeup = compute_block_takeup_for_entities(
13201325
takeup_var,
13211326
precomputed_rates[info["rate_key"]],
13221327
ent_blocks,
13231328
ent_hh_ids,
13241329
ent_ci,
13251330
reported_mask=reported_takeup_anchors.get(takeup_var),
1331+
eligible_mask=eligible_mask,
13261332
)
13271333

13281334
ent_values = (ent_eligible * ent_takeup).astype(np.float32)
@@ -1548,7 +1554,7 @@ def _build_state_values(
15481554
# Identify takeup-affected targets before the state loop
15491555
affected_targets = {}
15501556
if rerandomize_takeup:
1551-
for tvar in target_vars:
1557+
for tvar in target_vars | constraint_vars:
15521558
for key, info in TAKEUP_AFFECTED_TARGETS.items():
15531559
if tvar == key or tvar.startswith(key):
15541560
affected_targets[tvar] = info
@@ -2672,8 +2678,15 @@ def build_matrix(
26722678
for _, row in targets_df.iterrows()
26732679
if int(row.get("reform_id", 0)) > 0
26742680
}
2681+
2682+
# 5a. Collect unique constraint variables
2683+
unique_constraint_vars = set()
2684+
for constraints in non_geo_constraints_list:
2685+
for c in constraints:
2686+
unique_constraint_vars.add(c["variable"])
2687+
26752688
variable_entity_map: Dict[str, str] = {}
2676-
for var in unique_variables:
2689+
for var in unique_variables | unique_constraint_vars:
26772690
if var in sim.tax_benefit_system.variables:
26782691
variable_entity_map[var] = sim.tax_benefit_system.variables[
26792692
var
@@ -2689,12 +2702,6 @@ def build_matrix(
26892702
var
26902703
].entity.key
26912704

2692-
# 5a. Collect unique constraint variables
2693-
unique_constraint_vars = set()
2694-
for constraints in non_geo_constraints_list:
2695-
for c in constraints:
2696-
unique_constraint_vars.add(c["variable"])
2697-
26982705
# 5b. Per-state precomputation (51 sims on one object)
26992706
self._entity_rel_cache = None
27002707
state_values = self._build_state_values(
@@ -2709,7 +2716,9 @@ def build_matrix(
27092716
)
27102717

27112718
# 5b-county. Per-county precomputation for county-dependent vars
2712-
county_dep_targets = unique_variables & COUNTY_DEPENDENT_VARS
2719+
county_dep_targets = (
2720+
unique_variables | unique_constraint_vars
2721+
) & COUNTY_DEPENDENT_VARS
27132722
county_values = self._build_county_values(
27142723
sim,
27152724
county_dep_targets,
@@ -2773,8 +2782,15 @@ def build_matrix(
27732782
reported_takeup_anchors["takes_up_medicaid_if_eligible"] = f[
27742783
"has_medicaid_health_coverage_at_interview"
27752784
][period_key][...].astype(bool)
2785+
if (
2786+
"receives_housing_assistance" in f
2787+
and period_key in f["receives_housing_assistance"]
2788+
):
2789+
reported_takeup_anchors[
2790+
"takes_up_housing_assistance_if_eligible"
2791+
] = f["receives_housing_assistance"][period_key][...].astype(bool)
27762792

2777-
for tvar in unique_variables:
2793+
for tvar in unique_variables | unique_constraint_vars:
27782794
for key, info in TAKEUP_AFFECTED_TARGETS.items():
27792795
if tvar == key:
27802796
affected_target_info[tvar] = info
@@ -3111,13 +3127,18 @@ def build_matrix(
31113127
ent_hh_ids = household_ids[ent_hh]
31123128
ent_ci = np.full(n_ent, clone_idx, dtype=np.int64)
31133129

3130+
eligible_mask = None
3131+
if takeup_var == "takes_up_housing_assistance_if_eligible":
3132+
eligible_mask = ent_eligible > 0
3133+
31143134
ent_takeup = compute_block_takeup_for_entities(
31153135
takeup_var,
31163136
precomputed_rates[info["rate_key"]],
31173137
ent_blocks,
31183138
ent_hh_ids,
31193139
ent_ci,
31203140
reported_mask=reported_takeup_anchors.get(takeup_var),
3141+
eligible_mask=eligible_mask,
31213142
)
31223143

31233144
ent_values = (ent_eligible * ent_takeup).astype(np.float32)

policyengine_us_data/db/DATABASE_GUIDE.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,12 @@ make database-refresh # Force re-download all sources and rebuild
2929
| 4 | `etl_age.py` | Census ACS 1-year | Age distribution: 18 bins x 488 geographies |
3030
| 5 | `etl_medicaid.py` | Census ACS + CMS | Medicaid enrollment (admin state-level, survey district-level) |
3131
| 6 | `etl_snap.py` | USDA FNS + Census ACS | SNAP participation (admin state-level, survey district-level) |
32-
| 7 | `etl_tanf.py` | HHS ACF | TANF caseload families and cash-assistance spending (FY2024) |
33-
| 8 | `etl_state_income_tax.py` | Census STC | State income tax collections (Census STC FY2023 `T40`, downloaded and cached) |
34-
| 9 | `etl_irs_soi.py` | IRS | Tax variables, EITC by child count, AGI brackets, conditional strata |
35-
| 10 | `etl_pregnancy.py` | CDC VSRR + Census ACS | Pregnancy prevalence by state (provisional birth counts) |
36-
| 11 | `validate_database.py` | No | Checks all target variables exist in policyengine-us |
32+
| 7 | `etl_housing_assistance.py` | HUD | HUD-assisted household counts from Picture of Subsidized Households |
33+
| 8 | `etl_tanf.py` | HHS ACF | TANF caseload families and cash-assistance spending (FY2024) |
34+
| 9 | `etl_state_income_tax.py` | Census STC | State income tax collections (Census STC FY2023 `T40`, downloaded and cached) |
35+
| 10 | `etl_irs_soi.py` | IRS | Tax variables, EITC by child count, AGI brackets, conditional strata |
36+
| 11 | `etl_pregnancy.py` | CDC VSRR + Census ACS | Pregnancy prevalence by state (provisional birth counts) |
37+
| 12 | `validate_database.py` | No | Checks all target variables exist in policyengine-us |
3738

3839
### Raw Input Caching
3940

@@ -152,6 +153,7 @@ Strata are categorized by their **constraints**, not by a separate group ID fiel
152153
| `adjusted_gross_income` | Income/AGI brackets |
153154
| `snap` | SNAP recipient strata |
154155
| `medicaid_enrolled` | Medicaid enrollment strata |
156+
| `housing_assistance` | HUD-assisted household strata |
155157
| `is_pregnant` | Pregnancy prevalence strata |
156158
| `eitc_child_count` | EITC recipients by qualifying children |
157159
| `state_income_tax` | State-level income tax collections |

policyengine_us_data/db/create_field_valid_values.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ def populate_field_valid_values(session: Session) -> None:
7474
("source", "CMS Marketplace", "administrative"),
7575
("source", "CMS 2024 OEP state metal status PUF", "administrative"),
7676
("source", "CMS Medicaid", "administrative"),
77+
("source", "HUD Picture of Subsidized Households", "administrative"),
7778
("source", "Census ACS S2704", "survey"),
7879
("source", "USDA FNS SNAP", "administrative"),
7980
("source", "Census ACS S2201", "survey"),
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
"""ETL for HUD-assisted household count calibration targets."""
2+
3+
from __future__ import annotations
4+
5+
import io
6+
import logging
7+
8+
import pandas as pd
9+
import requests
10+
from sqlmodel import Session, create_engine
11+
12+
from policyengine_us_data.db.create_database_tables import (
13+
Stratum,
14+
StratumConstraint,
15+
Target,
16+
)
17+
from policyengine_us_data.storage import STORAGE_FOLDER
18+
from policyengine_us_data.utils.census import STATE_ABBREV_TO_FIPS
19+
from policyengine_us_data.utils.db import (
20+
etl_argparser,
21+
get_geographic_strata,
22+
parse_ucgid,
23+
)
24+
from policyengine_us_data.utils.raw_cache import (
25+
is_cached,
26+
load_bytes,
27+
save_bytes,
28+
)
29+
30+
logger = logging.getLogger(__name__)
31+
32+
HUD_PICTURE_SOURCE = "HUD Picture of Subsidized Households"
33+
34+
35+
def _hud_picture_state_url(year: int) -> str:
36+
return (
37+
"https://www.huduser.gov/portal/datasets/pictures/files/"
38+
f"STATE_{year}_2020census.xlsx"
39+
)
40+
41+
42+
def extract_hud_picture_state_data(year: int) -> bytes:
    """Fetch the HUD Picture of Subsidized Households state workbook.

    A previously cached copy is returned when one exists; otherwise the
    workbook is downloaded, sanity-checked and written to the raw cache.

    Args:
        year: Data year of the state extract to retrieve.

    Returns:
        The raw bytes of the ``.xlsx`` workbook.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the response body does not start with ``PK``.
    """
    cache_name = f"hud_picture_state_{year}_2020census.xlsx"
    if is_cached(cache_name):
        logger.info("Using cached %s", cache_name)
        return load_bytes(cache_name)

    # Browser-like request headers; presumably needed so the HUD site
    # serves the file instead of a bot challenge -- confirm if changed.
    browser_user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    )
    accepted_types = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
        "application/vnd.ms-excel,*/*"
    )
    resp = requests.get(
        _hud_picture_state_url(year),
        headers={
            "User-Agent": browser_user_agent,
            "Accept": accepted_types,
            "Accept-Language": "en-US,en;q=0.9",
        },
        timeout=60,
    )
    resp.raise_for_status()

    payload = resp.content
    # XLSX workbooks are ZIP containers, so a genuine download begins
    # with the bytes "PK"; only then is the payload cached and returned.
    if payload.startswith(b"PK"):
        save_bytes(cache_name, payload)
        return payload

    raise ValueError(
        "HUD Picture state extract did not return an Excel workbook. "
        f"HTTP status={resp.status_code}, "
        f"x-amzn-waf-action={resp.headers.get('x-amzn-waf-action')!r}"
    )
75+
76+
77+
def transform_hud_picture_state_data(workbook_content: bytes) -> pd.DataFrame:
    """Return state assisted-household targets from a HUD Picture workbook.

    Keeps the "Summary of All HUD Programs" rows whose ``State`` value is
    a key of ``STATE_ABBREV_TO_FIPS`` and converts ``number_reported``
    into a numeric ``assisted_households`` column.

    Args:
        workbook_content: Raw bytes of the HUD Picture state workbook.

    Returns:
        A frame with columns ``ucgid_str`` (``"0400000US"`` followed by
        the mapped state FIPS value) and ``assisted_households``, sorted
        by ``ucgid_str`` with a fresh integer index.

    Raises:
        ValueError: If no summary rows match, a state appears more than
            once, or any count is missing, negative or non-numeric.
    """
    raw_df = pd.read_excel(io.BytesIO(workbook_content))

    is_summary = raw_df["program_label"].eq("Summary of All HUD Programs")
    is_known_state = raw_df["State"].isin(STATE_ABBREV_TO_FIPS)
    summary = raw_df.loc[is_summary & is_known_state].copy()

    # An empty selection would otherwise flow downstream and be loaded
    # as a national target of zero households.
    if summary.empty:
        raise ValueError(
            "HUD Picture workbook contains no 'Summary of All HUD Programs' "
            "rows for known states."
        )

    # One row per state is required: each row becomes its own stratum.
    repeated = summary.loc[summary["State"].duplicated(keep=False), "State"]
    if not repeated.empty:
        raise ValueError(
            "HUD Picture workbook has multiple summary rows for state(s): "
            f"{sorted(set(repeated))}"
        )

    summary["STATE_FIPS"] = summary["State"].map(STATE_ABBREV_TO_FIPS)
    summary["ucgid_str"] = "0400000US" + summary["STATE_FIPS"]
    summary["assisted_households"] = pd.to_numeric(
        summary["number_reported"],
        errors="raise",
    )

    # NOTE(review): HUD Picture files reportedly encode missing or
    # suppressed cells as negative sentinel codes -- confirm against the
    # HUD data dictionary. ``~(x >= 0)`` rejects both negatives and NaN,
    # neither of which is a usable household-count target.
    unusable = summary.loc[~(summary["assisted_households"] >= 0), "State"]
    if not unusable.empty:
        raise ValueError(
            "HUD Picture workbook has missing or negative number_reported "
            f"for state(s): {sorted(unusable)}"
        )

    return (
        summary[["ucgid_str", "assisted_households"]]
        .sort_values("ucgid_str")
        .reset_index(drop=True)
    )
95+
96+
97+
def load_housing_assistance_data(state_df: pd.DataFrame, year: int) -> None:
    """Load national and state assisted-household count targets.

    Creates one stratum under the national geographic stratum and one
    under each state geographic stratum, each constrained to
    ``housing_assistance > 0`` and carrying a ``household_count`` target.
    The national value is the sum of the state values in ``state_df``.
    Nothing is committed until every stratum has been staged.

    Args:
        state_df: Frame with ``ucgid_str`` and ``assisted_households``
            columns, one row per state.
        year: Period recorded on every target.

    Raises:
        ValueError: If ``state_df`` has no rows, or the national
            geographic stratum is missing from the database.
        KeyError: If a state in ``state_df`` has no geographic stratum.
    """
    # Summing an empty frame yields 0.0, which would be stored as a real
    # national calibration target; refuse instead.
    if state_df.empty:
        raise ValueError(
            "No HUD-assisted household rows to load; refusing to write a "
            "zero national target."
        )

    target_notes = (
        "HUD Picture of Subsidized Households state extract, "
        "Summary of All HUD Programs, number_reported column. "
        "This is a December point-in-time assisted-household count."
    )

    def _household_count_target(value: float) -> Target:
        # Single definition of the target shared by national and state rows.
        return Target(
            variable="household_count",
            period=year,
            value=value,
            active=True,
            source=HUD_PICTURE_SOURCE,
            notes=target_notes,
        )

    database_url = f"sqlite:///{STORAGE_FOLDER / 'calibration' / 'policy_data.db'}"
    engine = create_engine(database_url)
    national_households = float(state_df["assisted_households"].sum())

    with Session(engine) as session:
        geo_strata = get_geographic_strata(session)
        if geo_strata["national"] is None:
            raise ValueError(
                "National stratum not found. Run create_initial_strata.py first."
            )

        national_stratum = Stratum(
            parent_stratum_id=geo_strata["national"],
            notes="National HUD-assisted households",
        )
        national_stratum.constraints_rel = [
            StratumConstraint(
                constraint_variable="housing_assistance",
                operation=">",
                value="0",
            )
        ]
        national_stratum.targets_rel.append(
            _household_count_target(national_households)
        )
        session.add(national_stratum)
        session.flush()

        for ucgid, households in zip(
            state_df["ucgid_str"], state_df["assisted_households"]
        ):
            state_fips = parse_ucgid(ucgid)["state_fips"]
            try:
                parent_stratum_id = geo_strata["state"][state_fips]
            except KeyError as err:
                # Same exception type as before, now naming the culprit.
                raise KeyError(
                    f"No geographic stratum for state FIPS {state_fips} "
                    f"(ucgid {ucgid}). Run create_initial_strata.py first."
                ) from err

            state_stratum = Stratum(
                parent_stratum_id=parent_stratum_id,
                notes=f"State FIPS {state_fips} HUD-assisted households",
            )
            state_stratum.constraints_rel = [
                StratumConstraint(
                    constraint_variable="state_fips",
                    operation="==",
                    value=str(state_fips),
                ),
                StratumConstraint(
                    constraint_variable="housing_assistance",
                    operation=">",
                    value="0",
                ),
            ]
            state_stratum.targets_rel.append(
                _household_count_target(float(households))
            )
            session.add(state_stratum)

        session.commit()
174+
175+
176+
def main() -> None:
    """Run the HUD housing-assistance ETL: extract, transform, then load.

    The target year is taken from the second value returned by
    ``etl_argparser``; the first value is not used here.
    """
    _, year = etl_argparser("ETL for HUD housing assistance count targets")
    state_targets = transform_hud_picture_state_data(
        extract_hud_picture_state_data(year)
    )
    load_housing_assistance_data(state_targets, year)
181+
182+
183+
if __name__ == "__main__":
184+
main()

policyengine_us_data/utils/takeup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
"variable": "takes_up_housing_assistance_if_eligible",
7777
"entity": "spm_unit",
7878
"rate_key": "housing_assistance",
79-
"target": None,
79+
"target": "housing_assistance",
8080
},
8181
]
8282

tests/unit/calibration/test_target_config.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,28 @@ def test_training_config_includes_national_ctc_agi_targets(self):
206206
"domain_variable": "adjusted_gross_income,non_refundable_ctc",
207207
} in include_rules
208208

209+
def test_training_config_includes_housing_assistance_count_targets(self):
210+
config = load_target_config(
211+
str(
212+
Path(__file__).resolve().parents[3]
213+
/ "policyengine_us_data"
214+
/ "calibration"
215+
/ "target_config.yaml"
216+
)
217+
)
218+
219+
include_rules = config["include"]
220+
assert {
221+
"variable": "household_count",
222+
"geo_level": "state",
223+
"domain_variable": "housing_assistance",
224+
} in include_rules
225+
assert {
226+
"variable": "household_count",
227+
"geo_level": "national",
228+
"domain_variable": "housing_assistance",
229+
} in include_rules
230+
209231
def test_training_config_includes_national_capital_income_agi_targets(self):
210232
config = load_target_config(
211233
str(

0 commit comments

Comments
 (0)