Skip to content

Commit 2e04457

Browse files
authored
Calibrate ESI premiums for CBO income
Adds employer-sponsored insurance premium imputation and calibration targets. Bumps policyengine-us to 1.682.1 so the target variable is available.
1 parent a0d69ea commit 2e04457

12 files changed

Lines changed: 293 additions & 14 deletions

File tree

changelog.d/885.added.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Calibrate employer-sponsored insurance premiums and seed CPS policyholder ESI contributions for CBO-style income concepts.

policyengine_us_data/calibration/target_config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ include:
118118
geo_level: national
119119
- variable: eitc
120120
geo_level: national
121+
- variable: employer_sponsored_insurance_premiums
122+
geo_level: national
121123
- variable: health_insurance_premiums_without_medicare_part_b
122124
geo_level: national
123125
- variable: long_term_capital_gains

policyengine_us_data/datasets/cps/census_cps.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
"NOW_PRIV",
2121
"NOW_PUB",
2222
"NOW_GRP",
23+
"NOW_OWNGRP",
24+
"NOW_HIPAID",
25+
"NOW_GRPFTYP",
2326
"NOW_CAID",
2427
"NOW_MCAID",
2528
"NOW_PCHIP",
@@ -36,7 +39,12 @@
3639
def _resolve_person_usecols(
3740
available_columns, spm_unit_columns: list[str]
3841
) -> list[str]:
39-
requested_columns = PERSON_COLUMNS + spm_unit_columns + TAX_UNIT_COLUMNS
42+
requested_columns = (
43+
PERSON_COLUMNS
44+
+ sorted(OPTIONAL_PERSON_COLUMNS.difference(PERSON_COLUMNS))
45+
+ spm_unit_columns
46+
+ TAX_UNIT_COLUMNS
47+
)
4048
available_columns = set(available_columns)
4149
missing_required = sorted(
4250
column

policyengine_us_data/datasets/cps/cps.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,82 @@
9898
),
9999
}
100100

101+
# Name of the dataset variable that stores the policyholder flag.
ESI_POLICYHOLDER_VARIABLE = (
    "reported_owns_employer_sponsored_health_insurance_at_interview"
)
# Raw CPS person columns read by the ESI premium imputation.
ESI_SOURCE_COLUMNS = {"NOW_OWNGRP", "NOW_HIPAID", "NOW_GRPFTYP"}


_ESI_PLAN_PRIORS_2024 = {
    # AHRQ MEPS-IC Table IV.A.1 (private sector, 2024). These plan-type
    # averages seed CPS policyholder records; national calibration later
    # aligns the aggregate to the BEA full-economy employer premium total.
    "family": {
        "total_premium": 21_207.52589669509,
        "employee_contribution": 6_490.205059544782,
    },
    "self_only": {
        "total_premium": 8_389.275834815255,
        "employee_contribution": 1_909.5781466113417,
    },
}
# Code compared against the NOW_OWNGRP column.
_HAS_CURRENT_OWN_ESI = 1
# Codes compared against the NOW_HIPAID column.
_EMPLOYER_PAYS_ALL = 1
_EMPLOYER_PAYS_SOME = 2
# Codes compared against the NOW_GRPFTYP column.
_ESI_FAMILY_PLAN = 1
_ESI_SELF_ONLY_PLAN = 2
125+
126+
127+
def _person_column(person: DataFrame, column: str, default=0) -> np.ndarray:
    """Return ``person[column]`` as an array, or a constant array if absent.

    Args:
        person: Person-level frame to read from.
        column: Column name to look up.
        default: Fill value used for every row when ``column`` is missing.

    Returns:
        A one-dimensional array with ``len(person)`` entries.
    """
    if column in person:
        return person[column].to_numpy()
    # Column is missing: return one default entry per row so callers can
    # treat the column as present.
    return np.full(len(person), default)
131+
132+
133+
def impute_employer_sponsored_insurance_premiums(person: DataFrame) -> np.ndarray:
    """Impute annual employer-paid ESI premiums for CPS policyholders.

    The total premium and the average employee contribution are taken from
    ``_ESI_PLAN_PRIORS_2024`` according to the plan type in ``NOW_GRPFTYP``.
    The employer-paid amount then depends on ``NOW_HIPAID``:

    * employer pays all: the full prior premium;
    * employer pays some: the prior premium minus the employee share, where
      the employee share is the reported ``PHIP_VAL`` when positive and the
      prior average contribution otherwise, floored at zero;
    * any other status: zero.

    Args:
        person: Person-level frame. ``PHIP_VAL`` must be present; the
            ``NOW_OWNGRP``, ``NOW_HIPAID`` and ``NOW_GRPFTYP`` columns are
            read through ``_person_column`` and default to 0 when absent.

    Returns:
        One value per row. Rows that are not ESI policyholders
        (``NOW_OWNGRP``) or whose plan type is neither family nor self-only
        get zero.
    """
    own_esi = _person_column(person, "NOW_OWNGRP").astype(int) == _HAS_CURRENT_OWN_ESI
    premium_status = _person_column(person, "NOW_HIPAID").astype(int)
    plan_type = _person_column(person, "NOW_GRPFTYP").astype(int)
    # Negative reported premiums are floored at zero.
    employee_paid = np.clip(person.PHIP_VAL.to_numpy(dtype=float), 0, None)

    # Any plan type other than self-only takes the family prior here; rows
    # with an unrecognised plan type are zeroed by the final mask below.
    total_premium = np.where(
        plan_type == _ESI_SELF_ONLY_PLAN,
        _ESI_PLAN_PRIORS_2024["self_only"]["total_premium"],
        _ESI_PLAN_PRIORS_2024["family"]["total_premium"],
    )
    average_employee_contribution = np.where(
        plan_type == _ESI_SELF_ONLY_PLAN,
        _ESI_PLAN_PRIORS_2024["self_only"]["employee_contribution"],
        _ESI_PLAN_PRIORS_2024["family"]["employee_contribution"],
    )
    # Prefer the reported amount; fall back to the plan-type average when
    # nothing positive was reported.
    employee_share = np.where(
        employee_paid > 0,
        employee_paid,
        average_employee_contribution,
    )
    # A reported employee payment above the prior premium yields zero
    # employer-paid premium rather than a negative amount.
    employer_paid_when_some = np.clip(
        total_premium - employee_share,
        0,
        total_premium,
    )

    employer_paid = np.where(
        premium_status == _EMPLOYER_PAYS_ALL,
        total_premium,
        np.where(
            premium_status == _EMPLOYER_PAYS_SOME,
            employer_paid_when_some,
            0,
        ),
    )
    valid_owner_with_plan = own_esi & np.isin(
        plan_type,
        [_ESI_FAMILY_PLAN, _ESI_SELF_ONLY_PLAN],
    )
    return np.where(valid_owner_with_plan, employer_paid, 0)
176+
101177

102178
@contextmanager
103179
def _open_dataset_read_only(dataset_source):
@@ -708,6 +784,7 @@ def _validate_raw_cps_schema(
708784
) -> None:
709785
required_person_columns = {
710786
"CENSUS_TAX_ID",
787+
*ESI_SOURCE_COLUMNS,
711788
}
712789
required_tax_unit_columns = set()
713790

@@ -1136,6 +1213,12 @@ def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int):
11361213
# "What is the annual amount of child support paid?"
11371214
cps["child_support_expense"] = person.CHSP_VAL
11381215
cps["health_insurance_premiums_without_medicare_part_b"] = person.PHIP_VAL
1216+
cps[ESI_POLICYHOLDER_VARIABLE] = (
1217+
_person_column(person, "NOW_OWNGRP").astype(int) == _HAS_CURRENT_OWN_ESI
1218+
)
1219+
cps["employer_sponsored_insurance_premiums"] = (
1220+
impute_employer_sponsored_insurance_premiums(person)
1221+
)
11391222
cps["over_the_counter_health_expenses"] = person.POTC_VAL
11401223
cps["other_medical_expenses"] = person.PMED_VAL
11411224
cps["medicare_enrolled"] = person.MCARE == 1

policyengine_us_data/datasets/cps/extended_cps.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,12 @@
66
import pandas as pd
77
from policyengine_core.data import Dataset
88

9-
from policyengine_us_data.datasets.cps.cps import CPS, CPS_2024, CPS_2024_Full
9+
from policyengine_us_data.datasets.cps.cps import (
10+
CPS,
11+
CPS_2024,
12+
CPS_2024_Full,
13+
ESI_POLICYHOLDER_VARIABLE,
14+
)
1015
from policyengine_us_data.datasets.org import (
1116
ORG_IMPUTED_VARIABLES,
1217
apply_org_domain_constraints,
@@ -147,6 +152,7 @@ def _supports_structural_mortgage_inputs() -> bool:
147152
"spm_unit_net_income_reported",
148153
"spm_unit_pre_subsidy_childcare_expenses",
149154
# Medical expenses
155+
"employer_sponsored_insurance_premiums",
150156
"health_insurance_premiums_without_medicare_part_b",
151157
"other_health_insurance_premiums",
152158
"over_the_counter_health_expenses",
@@ -172,6 +178,7 @@ def _supports_structural_mortgage_inputs() -> bool:
172178
CPS_STAGE2_DEMOGRAPHIC_PREDICTORS = [
173179
"age",
174180
"is_male",
181+
"has_esi",
175182
"tax_unit_is_joint",
176183
"tax_unit_count_dependents",
177184
]
@@ -738,6 +745,16 @@ def _apply_post_processing(predictions, X_test, time_period, data):
738745
for col in org_cols:
739746
predictions[col] = constrained[col]
740747

748+
if "employer_sponsored_insurance_premiums" in predictions.columns:
749+
policyholder = _clone_half_person_values(
750+
data, ESI_POLICYHOLDER_VARIABLE, time_period
751+
)
752+
if policyholder is not None:
753+
predictions.loc[
754+
~np.asarray(policyholder, dtype=bool),
755+
"employer_sponsored_insurance_premiums",
756+
] = 0
757+
741758
return predictions
742759

743760

policyengine_us_data/db/etl_national_targets.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,13 @@ def extract_national_targets(year: int = DEFAULT_YEAR):
141141
"notes": "Total household net worth",
142142
"year": 2024,
143143
},
144+
{
145+
"variable": "employer_sponsored_insurance_premiums",
146+
"value": 1_002.9e9,
147+
"source": "https://apps.bea.gov/scb/issues/2025/09-september/0925-nipa-methodologies.htm",
148+
"notes": "BEA group health insurance total in employer contributions for employee pension and insurance funds",
149+
"year": 2024,
150+
},
144151
{
145152
"variable": "health_insurance_premiums_without_medicare_part_b",
146153
"value": 385e9,

policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
"""
88

99
HARD_CODED_TOTALS = {
10+
# BEA NIPA 2024 employer contributions for employee pension and
11+
# insurance funds: group health insurance.
12+
"employer_sponsored_insurance_premiums": 1_002.9e9,
1013
"health_insurance_premiums_without_medicare_part_b": 385e9,
1114
"other_medical_expenses": 278e9,
1215
"medicare_part_b_premium": 112e9,

policyengine_us_data/utils/loss.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
# database so this dict can be deleted. See PR #488.
3131

3232
HARD_CODED_TOTALS = {
33+
"employer_sponsored_insurance_premiums": 1_002.9e9,
3334
"health_insurance_premiums_without_medicare_part_b": 385e9,
3435
"other_medical_expenses": 278e9,
3536
MEDICARE_PART_B_PREMIUM_VARIABLE: (

pyproject.toml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@ classifiers = [
2222
"Programming Language :: Python :: 3.14",
2323
]
2424
dependencies = [
25-
"policyengine-us>=1.680.0",
26-
# policyengine-core 3.25.4 fixes PolicyEngine/policyengine-core#482
27-
# (user-set ETERNITY inputs lost after _invalidate_all_caches).
28-
"policyengine-core>=3.25.4,<3.26",
25+
"policyengine-us>=1.682.1",
26+
# policyengine-core 3.26.0 includes the fix for
27+
# PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
28+
# after _invalidate_all_caches) and is required by policyengine-us 1.682.1.
29+
"policyengine-core>=3.26.0,<3.27",
2930
"pandas>=2.3.1",
3031
"requests>=2.25.0",
3132
"tqdm>=4.60.0",
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
from pathlib import Path
2+
3+
import numpy as np
4+
import pandas as pd
5+
6+
from policyengine_us_data.datasets.cps.census_cps import (
7+
PERSON_COLUMNS,
8+
TAX_UNIT_COLUMNS,
9+
_resolve_person_usecols,
10+
)
11+
from policyengine_us_data.datasets.cps.cps import (
12+
ESI_POLICYHOLDER_VARIABLE,
13+
ESI_SOURCE_COLUMNS,
14+
_EMPLOYER_PAYS_ALL,
15+
_EMPLOYER_PAYS_SOME,
16+
_ESI_PLAN_PRIORS_2024,
17+
_validate_raw_cps_schema,
18+
impute_employer_sponsored_insurance_premiums,
19+
)
20+
from policyengine_us_data.datasets.cps.extended_cps import (
21+
CPS_ONLY_IMPUTED_VARIABLES,
22+
)
23+
from policyengine_us_data.storage.calibration_targets.pull_hardcoded_targets import (
24+
HARD_CODED_TOTALS,
25+
)
26+
27+
28+
def test_resolve_person_usecols_requests_optional_esi_columns_when_available():
    """ESI columns offered by the raw file must appear in the resolved usecols."""
    esi_columns = ["NOW_OWNGRP", "NOW_HIPAID", "NOW_GRPFTYP"]
    available = PERSON_COLUMNS + TAX_UNIT_COLUMNS + esi_columns

    usecols = _resolve_person_usecols(available, spm_unit_columns=[])

    not_requested = [column for column in esi_columns if column not in usecols]
    assert not not_requested
42+
43+
44+
def test_impute_employer_sponsored_insurance_premiums():
    """Check the imputed employer-paid premium on five hand-built records."""
    self_only = _ESI_PLAN_PRIORS_2024["self_only"]
    family = _ESI_PLAN_PRIORS_2024["family"]
    records = {
        "NOW_OWNGRP": [1, 1, 1, 0, 1],
        "NOW_HIPAID": [1, 2, 2, 1, 2],
        "NOW_GRPFTYP": [2, 2, 1, 2, 1],
        "PHIP_VAL": [0, 1_200, 0, 0, 50_000],
    }

    result = impute_employer_sponsored_insurance_premiums(pd.DataFrame(records))

    # Row 0: self-only plan, employer pays all.
    np.testing.assert_allclose(result[0], self_only["total_premium"])
    # Row 1: self-only plan, employer pays some, reported employee payment.
    np.testing.assert_allclose(result[1], self_only["total_premium"] - 1_200)
    # Row 2: family plan, employer pays some, nothing reported.
    np.testing.assert_allclose(
        result[2],
        family["total_premium"] - family["employee_contribution"],
    )
    # Row 3: not a policyholder.
    assert result[3] == 0
    # Row 4: reported employee payment exceeds the family prior premium.
    assert result[4] == 0
71+
72+
73+
def test_impute_employer_sponsored_insurance_premiums_tolerates_missing_esi_columns():
    """A frame without any NOW_* ESI columns yields an all-zero imputation."""
    premiums_only = pd.DataFrame({"PHIP_VAL": [1_000, 2_000]})

    imputed = impute_employer_sponsored_insurance_premiums(premiums_only)

    np.testing.assert_array_equal(imputed, np.zeros(2))
79+
80+
81+
def test_imputation_status_codes_remain_stable():
    """Pin the premium-status codes used by the imputation."""
    assert (_EMPLOYER_PAYS_ALL, _EMPLOYER_PAYS_SOME) == (1, 2)
84+
85+
86+
def test_extended_cps_imputes_esi_premiums_for_clone_half():
    """The ESI premium variable is listed among the CPS-only imputed variables."""
    variable = "employer_sponsored_insurance_premiums"
    assert variable in CPS_ONLY_IMPUTED_VARIABLES
88+
89+
90+
def test_hardcoded_targets_include_total_esi_premiums():
    """Pin the hard-coded national ESI premium total."""
    expected_total = 1_002.9e9
    assert HARD_CODED_TOTALS["employer_sponsored_insurance_premiums"] == expected_total
92+
93+
94+
def test_target_config_includes_total_esi_premiums():
    """The calibration target config mentions the ESI premium variable."""
    target_config_path = Path(__file__).parents[2] / (
        "policyengine_us_data/calibration/target_config.yaml"
    )
    # Read with an explicit encoding so the result does not depend on the
    # platform's locale default.
    content = target_config_path.read_text(encoding="utf-8")

    assert "employer_sponsored_insurance_premiums" in content
101+
102+
103+
def test_policyholder_variable_name_remains_stable():
    """Pin the dataset variable name used for the policyholder flag."""
    expected_name = "reported_owns_employer_sponsored_health_insurance_at_interview"
    assert ESI_POLICYHOLDER_VARIABLE == expected_name
108+
109+
110+
def test_raw_cps_schema_requires_esi_source_columns():
    """Schema validation passes with all ESI columns and names a missing one."""
    columns = {"CENSUS_TAX_ID": [1]}
    columns.update({column: [1] for column in ESI_SOURCE_COLUMNS})
    person = pd.DataFrame(columns)
    tax_unit = pd.DataFrame()

    # A complete frame must validate without raising.
    _validate_raw_cps_schema(person, tax_unit, "raw")

    stale_person = person.drop(columns=["NOW_OWNGRP"])
    failure_message = None
    try:
        _validate_raw_cps_schema(stale_person, tax_unit, "raw")
    except ValueError as error:
        failure_message = str(error)

    if failure_message is None:
        raise AssertionError("Expected missing ESI source column to fail validation")
    assert "NOW_OWNGRP" in failure_message

0 commit comments

Comments
 (0)