Skip to content
This repository was archived by the owner on Jun 14, 2026. It is now read-only.

Commit 12d2e8d

Browse files
committed
Cover wealth and Part B Arch targets
1 parent e25df05 commit 12d2e8d

6 files changed

Lines changed: 246 additions & 49 deletions

File tree

docs/arch-target-gap-queue.md

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,9 @@ It excludes only cells with explicit reasons in
5353
- SOI multi-domain cells that would require joint AGI, filing status, and
5454
positive income-tax-before-credits facts not currently published by the loaded
5555
SOI packages
56-
- survey-heavy or model-input cells such as rent, net worth, child support,
57-
medical-premium subcomponents, SPM capped expenses, and `ssn_card_type`
56+
- survey-heavy or model-input cells such as rent, child support,
57+
non-Part-B medical premium/expense components, SPM capped expenses, and
58+
`ssn_card_type`
5859
- source-near but non-equivalent rows such as `childcare_expenses`, where IRS
5960
credit expenses and W-2 dependent-care benefits are narrower tax concepts
6061
- pregnancy stock by state, where live births are a flow rather than a direct
@@ -74,11 +75,13 @@ Inputs:
7475
- `/tmp/arch-suite-soi-historic-table-2-state-agi-2022/consumer_facts.jsonl`
7576
- `/tmp/arch-suite-soi-w2-statistics-2020/consumer_facts.jsonl`
7677
- `/tmp/arch-suite-soi-table-1-4-2023/consumer_facts.jsonl`
78+
- `/tmp/arch-suite-federal-reserve-z1-household-net-worth/consumer_facts.jsonl`
79+
- `/tmp/arch-suite-cms-medicare-trustees-report-2025-part-b-premium-income/consumer_facts.jsonl`
7780

7881
Command:
7982

8083
```bash
81-
uv run microplex-us-arch-target-refresh \
84+
uv run --extra policyengine microplex-us-arch-target-refresh \
8285
--arch-targets-db /Users/maxghenis/CosilicoAI/arch/arch/fixtures/consumer_facts.jsonl \
8386
--arch-targets-db /Users/maxghenis/CosilicoAI/arch/macro/targets.db \
8487
--arch-targets-db /tmp/arch-suite-hhs-acf-tanf-caseload-2024/consumer_facts.jsonl \
@@ -87,38 +90,46 @@ uv run microplex-us-arch-target-refresh \
8790
--arch-targets-db /tmp/arch-suite-soi-historic-table-2-state-agi-2022/consumer_facts.jsonl \
8891
--arch-targets-db /tmp/arch-suite-soi-w2-statistics-2020/consumer_facts.jsonl \
8992
--arch-targets-db /tmp/arch-suite-soi-table-1-4-2023/consumer_facts.jsonl \
93+
--arch-targets-db /tmp/arch-suite-federal-reserve-z1-household-net-worth/consumer_facts.jsonl \
94+
--arch-targets-db /tmp/arch-suite-cms-medicare-trustees-report-2025-part-b-premium-income/consumer_facts.jsonl \
9095
--period 2024 \
9196
--profile pe_native_broad_source_backed \
92-
--output-dir artifacts/arch-target-coverage
97+
--output-dir artifacts/arch-target-coverage-source-backed
9398
```
9499

95100
Coverage:
96101

97-
- 172 target cells in `pe_native_broad_source_backed`
98-
- 172 covered
102+
- 174 target cells in `pe_native_broad_source_backed`
103+
- 174 covered
99104
- 0 uncovered
100105
- 100.0% coverage
101106

102-
The raw `pe_native_broad` profile remains at 172 of 189 covered with 17
103-
explicitly reviewed rows outside the source-backed boundary:
107+
The raw `pe_native_broad` profile is at 174 of 189 covered with 15 explicitly
108+
reviewed rows outside the source-backed boundary. Federal Reserve Z.1 household
109+
net worth and CMS Medicare Trustees Report Part B premium income are now
110+
source-backed. The 15 remaining excluded rows break down by category as follows:
104111

105112
| Category | Rows |
106113
| --- | ---: |
107-
| `survey_or_model_input_deprioritized` | 12 |
108114
| `adapter_or_constraint_review` | 3 |
109115
| `source_mapping_review` | 2 |
116+
| `survey_or_model_input_deprioritized` | 10 |
110117

111118
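Generated outputs (the command above writes the `arch-target-coverage-source-backed` files; the `arch-target-coverage-broad-plus` files are for the raw `pe_native_broad` profile):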
112119

113-
- `artifacts/arch-target-coverage/pe_native_broad_source_backed_2024_coverage.json`
114-
- `artifacts/arch-target-coverage/pe_native_broad_source_backed_2024_gaps.json`
115-
- `artifacts/arch-target-coverage/pe_native_broad_source_backed_2024_gaps.csv`
116-
- `artifacts/arch-target-coverage/pe_native_broad_source_backed_2024_summary.md`
120+
- `artifacts/arch-target-coverage-source-backed/pe_native_broad_source_backed_2024_coverage.json`
121+
- `artifacts/arch-target-coverage-source-backed/pe_native_broad_source_backed_2024_gaps.json`
122+
- `artifacts/arch-target-coverage-source-backed/pe_native_broad_source_backed_2024_gaps.csv`
123+
- `artifacts/arch-target-coverage-source-backed/pe_native_broad_source_backed_2024_summary.md`
124+
- `artifacts/arch-target-coverage-broad-plus/pe_native_broad_2024_coverage.json`
125+
- `artifacts/arch-target-coverage-broad-plus/pe_native_broad_2024_gaps.json`
126+
- `artifacts/arch-target-coverage-broad-plus/pe_native_broad_2024_gaps.csv`
127+
- `artifacts/arch-target-coverage-broad-plus/pe_native_broad_2024_summary.md`
117128

118129
Remaining work is concentrated in:
119130

120131
- the raw `pe_native_broad` cells excluded from the source-backed profile, if a
121132
future primary publisher source can support them without changing semantics
122-
- UK profile parity, which should follow the same pattern: keep the raw PE
123-
target surface intact and expose a source-backed profile with explicit
124-
exclusions where source equivalence is not defensible
133+
- keeping the UK source-backed/raw boundary aligned with the same rule: leave
134+
raw PE target rows visible, and exclude only rows where source equivalence is
135+
not defensible

src/microplex_us/pipelines/us.py

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
from microplex_us.policyengine.us import (
8484
subset_policyengine_tables_by_households as _subset_policyengine_tables_by_households,
8585
)
86+
from microplex_us.targets.arch import resolve_arch_sqlite_target_provider
8687
from microplex_us.variables import (
8788
PE_STYLE_PUF_IRS_DEMOGRAPHIC_PREDICTORS,
8889
DonorMatchStrategy,
@@ -1407,6 +1408,8 @@ class USMicroplexBuildConfig:
14071408
policyengine_prefer_existing_tax_unit_ids: bool = False
14081409
policyengine_quantity_targets: tuple[PolicyEngineUSQuantityTarget, ...] = ()
14091410
policyengine_targets_db: str | None = None
1411+
arch_targets_db: str | tuple[str, ...] | None = None
1412+
calibration_target_source: Literal["policyengine", "arch"] = "policyengine"
14101413
policyengine_target_period: int | None = None
14111414
policyengine_target_variables: tuple[str, ...] = ()
14121415
policyengine_target_domains: tuple[str, ...] = ()
@@ -1786,7 +1789,7 @@ def build_from_frames(
17861789
rows=int(len(synthetic_data)),
17871790
columns=int(len(synthetic_data.columns)),
17881791
)
1789-
if self.config.policyengine_targets_db is not None:
1792+
if self._has_policyengine_calibration_targets():
17901793
_emit_us_pipeline_progress(
17911794
"US microplex build: policyengine tables start",
17921795
rows=int(len(synthetic_data)),
@@ -2816,10 +2819,7 @@ def calibrate_policyengine_tables(
28162819
tables: PolicyEngineUSEntityTableBundle,
28172820
) -> tuple[PolicyEngineUSEntityTableBundle, pd.DataFrame, dict[str, Any]]:
28182821
"""Calibrate household weights using PolicyEngine US target DB constraints."""
2819-
if self.config.policyengine_targets_db is None:
2820-
raise ValueError("policyengine_targets_db is required for DB calibration")
2821-
2822-
provider = PolicyEngineUSDBTargetProvider(self.config.policyengine_targets_db)
2822+
provider, _source = self._resolve_calibration_target_provider()
28232823
target_period = (
28242824
self.config.policyengine_target_period
28252825
or self.config.policyengine_dataset_year
@@ -3629,9 +3629,33 @@ def _resolve_policyengine_calibration_targets(
36293629
materialization_failures,
36303630
)
36313631

3632+
def _has_policyengine_calibration_targets(self) -> bool:
3633+
if self.config.calibration_target_source == "arch":
3634+
return self.config.arch_targets_db is not None
3635+
return self.config.policyengine_targets_db is not None
3636+
3637+
def _resolve_calibration_target_provider(self):
3638+
if self.config.calibration_target_source == "arch":
3639+
if self.config.arch_targets_db is None:
3640+
raise ValueError(
3641+
"arch_targets_db is required when calibration_target_source='arch'"
3642+
)
3643+
return (
3644+
resolve_arch_sqlite_target_provider(self.config.arch_targets_db),
3645+
"arch",
3646+
)
3647+
if self.config.policyengine_targets_db is None:
3648+
raise ValueError(
3649+
"policyengine_targets_db is required for PolicyEngine DB calibration"
3650+
)
3651+
return (
3652+
PolicyEngineUSDBTargetProvider(self.config.policyengine_targets_db),
3653+
"policyengine",
3654+
)
3655+
36323656
def _load_policyengine_target_set(
36333657
self,
3634-
provider: PolicyEngineUSDBTargetProvider,
3658+
provider: Any,
36353659
*,
36363660
bindings: dict[str, PolicyEngineUSVariableBinding],
36373661
period: int,

src/microplex_us/policyengine/target_profiles.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -807,25 +807,6 @@ def _target_cell_key(cell: PolicyEngineUSTargetCell) -> PolicyEngineUSTargetCell
807807
"This premium component is a modeled/survey input; no accepted primary "
808808
"aggregate source mapping is encoded for Arch."
809809
),
810-
(
811-
"medicare_part_b_premiums",
812-
"national",
813-
None,
814-
None,
815-
): (
816-
"PolicyEngine Medicare Part B premiums depend on person-level "
817-
"enrollment and IRMAA status; no accepted aggregate source fact is "
818-
"encoded for this modeled input."
819-
),
820-
(
821-
"net_worth",
822-
"national",
823-
None,
824-
None,
825-
): (
826-
"Net worth is a wealth survey/model input; no accepted primary "
827-
"administrative aggregate source mapping is encoded for Arch."
828-
),
829810
(
830811
"other_medical_expenses",
831812
"national",

src/microplex_us/targets/arch.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@
4848
"census-stc": "CENSUS_STC",
4949
"usda-snap": "USDA_SNAP",
5050
"cms-aca": "CMS_ACA",
51+
"cms-medicare": "CMS_MEDICARE",
5152
"cms-medicaid": "CMS_MEDICAID",
53+
"federal-reserve": "FEDERAL_RESERVE",
5254
"hhs-acf-liheap": "HHS_ACF_LIHEAP",
5355
"hhs-acf-tanf": "HHS_ACF_TANF",
5456
}
@@ -134,6 +136,8 @@
134136
"ssi_payments": "ssi",
135137
"ssi_total_payments": "ssi",
136138
"tanf_cash_assistance": "tanf",
139+
"medicare_part_b_premiums": "medicare_part_b_premiums",
140+
"net_worth": "net_worth",
137141
}
138142

139143
ARCH_SELF_DOMAIN_AMOUNT_VARIABLES = frozenset(
@@ -408,6 +412,14 @@
408412
"self_employed_pension_contribution_ald",
409413
"AMOUNT",
410414
),
415+
"federal_reserve.z1.households_nonprofits_net_worth": (
416+
"net_worth",
417+
"AMOUNT",
418+
),
419+
"cms_medicare.part_b_premium_income": (
420+
"medicare_part_b_premiums",
421+
"AMOUNT",
422+
),
411423
"census_decennial.resident_population": ("population", "COUNT"),
412424
"census_decennial.occupied_housing_units": ("household_count", "COUNT"),
413425
"census_pep.resident_population": ("population", "COUNT"),
@@ -570,6 +582,7 @@
570582
ARCH_FACT_DOMAIN_CONSTRAINTS = {
571583
"all_individual_income_tax_returns": (("is_tax_filer", "==", "1"),),
572584
"form_w2_items": (),
585+
"household_balance_sheet": (),
573586
"individual_income_tax_returns": (("is_tax_filer", "==", "1"),),
574587
"individual_income_tax_returns_excluding_dependents": (
575588
("is_dependent", "==", "0"),
@@ -584,6 +597,7 @@
584597
"aca_marketplace_effectuated_enrollment": (),
585598
"aca_marketplace_qhp_selections": (),
586599
"medicaid_chip_enrollment": (),
600+
"medicare_financing": (),
587601
"national_health_expenditures": (),
588602
"personal_current_transfer_receipts": (),
589603
"personal_income": (),
@@ -607,10 +621,13 @@
607621
ARCH_IGNORED_FACT_CONSTRAINT_VARIABLES = frozenset(
608622
{
609623
"administering_entity",
624+
"amount_basis",
610625
"bea_nipa.series_code",
611626
"bea_regional.geo_name",
612627
"bea_regional.line_code",
613628
"bea_regional.table_name",
629+
"medicare.financing_component",
630+
"medicare.part",
614631
"program",
615632
}
616633
)
@@ -655,7 +672,9 @@
655672
"tip_income": EntityType.PERSON,
656673
"traditional_401k_contributions": EntityType.PERSON,
657674
"unemployment_compensation": EntityType.PERSON,
675+
"medicare_part_b_premiums": EntityType.PERSON,
658676
"medicaid": EntityType.PERSON,
677+
"net_worth": EntityType.HOUSEHOLD,
659678
"social_security": EntityType.PERSON,
660679
"social_security_dependents": EntityType.PERSON,
661680
"social_security_disability": EntityType.PERSON,
@@ -809,7 +828,9 @@
809828
"income_tax_positive": "income_tax_liability",
810829
"income_tax_before_credits": "income_tax_before_credits_amount",
811830
"interest_deduction": "interest_paid_deduction_amount",
831+
"medicare_part_b_premiums": "medicare_part_b_premiums",
812832
"net_capital_gains": "net_capital_gains_amount",
833+
"net_worth": "net_worth",
813834
"real_estate_taxes": "real_estate_taxes_amount",
814835
"roth_401k_contributions": "roth_401k_contributions",
815836
"self_employed_pension_contribution_ald": (
@@ -886,8 +907,6 @@
886907
"child_support_expense",
887908
"child_support_received",
888909
"health_insurance_premiums_without_medicare_part_b",
889-
"medicare_part_b_premiums",
890-
"net_worth",
891910
"other_medical_expenses",
892911
"over_the_counter_health_expenses",
893912
"rent",
@@ -5189,10 +5208,12 @@ def _arch_gap_queue_sort_key(row: ArchTargetGapQueueRow) -> tuple[Any, ...]:
51895208
"CENSUS_ACS": 2,
51905209
"CMS_ACA": 3,
51915210
"CMS_MEDICAID": 4,
5192-
"USDA_SNAP": 5,
5193-
"SSA": 6,
5194-
"HHS_ACF_TANF": 7,
5195-
"HHS_ACF_LIHEAP": 8,
5211+
"CMS_MEDICARE": 5,
5212+
"USDA_SNAP": 6,
5213+
"SSA": 7,
5214+
"HHS_ACF_TANF": 8,
5215+
"HHS_ACF_LIHEAP": 9,
5216+
"FEDERAL_RESERVE": 10,
51965217
}.get(str(row.expected_source), 99)
51975218
return (
51985219
row.covered,
@@ -5238,6 +5259,10 @@ def _arch_gap_expected_source(cell: dict[str, Any]) -> str | None:
52385259
return "SSA"
52395260
if variable == "state_income_tax":
52405261
return "CENSUS_STC"
5262+
if variable == "medicare_part_b_premiums":
5263+
return "CMS_MEDICARE"
5264+
if variable == "net_worth":
5265+
return "FEDERAL_RESERVE"
52415266
if variable == "person_count":
52425267
if _normalize_geo_level(cell.get("geo_level")) in {"sldu", "sldl"}:
52435268
return "CENSUS_DECENNIAL"
@@ -5483,6 +5508,10 @@ def _arch_gap_expected_source_table(
54835508
return "CMS Marketplace Open Enrollment public-use files"
54845509
if expected_source == "CMS_MEDICAID":
54855510
return "CMS Medicaid enrollment and expenditure reports"
5511+
if expected_source == "CMS_MEDICARE":
5512+
return "CMS Medicare Trustees Report Part B premium income"
5513+
if expected_source == "FEDERAL_RESERVE":
5514+
return "Federal Reserve Financial Accounts Z.1 household net worth"
54865515
if expected_source == "SSA":
54875516
return "SSA Annual Statistical Supplement"
54885517
if expected_source == "HHS_ACF_TANF":

tests/policyengine/test_target_profiles.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -370,10 +370,10 @@ def test_source_backed_profile_excludes_only_documented_non_source_cells() -> No
370370
}
371371

372372
assert len(broad_cells) == 189
373-
assert len(exclusion_reasons) == 17
373+
assert len(exclusion_reasons) == 15
374374
assert all(reason for reason in exclusion_reasons.values())
375375
assert set(exclusion_reasons) <= broad_cells
376-
assert len(source_backed_cells) == 172
376+
assert len(source_backed_cells) == 174
377377
assert source_backed_cells == broad_cells - set(exclusion_reasons)
378378
assert (
379379
"childcare_expenses",
@@ -393,3 +393,15 @@ def test_source_backed_profile_excludes_only_documented_non_source_cells() -> No
393393
None,
394394
None,
395395
) in source_backed_cells
396+
assert (
397+
"medicare_part_b_premiums",
398+
"national",
399+
None,
400+
None,
401+
) in source_backed_cells
402+
assert (
403+
"net_worth",
404+
"national",
405+
None,
406+
None,
407+
) in source_backed_cells

0 commit comments

Comments
 (0)