Calibrate ESI premiums for CBO income

MaxGhenis · web-flow · commit 2e04457114c6 · 2026-05-04T21:52:15.000-04:00
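Adds employer-sponsored insurance (ESI) premium imputation for CPS policyholders and a national calibration target for the imputed total. Bumps policyengine-us to 1.682.1 so the target variable is available, and policyengine-core to 3.26.x, which that release requires.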
diff --git a/changelog.d/885.added.md b/changelog.d/885.added.md
@@ -0,0 +1 @@
+Calibrate employer-sponsored insurance premiums and seed CPS policyholder ESI contributions for CBO-style income concepts.
diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml
@@ -118,6 +118,8 @@ include:
     geo_level: national
   - variable: eitc
     geo_level: national
+  - variable: employer_sponsored_insurance_premiums
+    geo_level: national
   - variable: health_insurance_premiums_without_medicare_part_b
     geo_level: national
   - variable: long_term_capital_gains
diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py
@@ -20,6 +20,9 @@
     "NOW_PRIV",
     "NOW_PUB",
     "NOW_GRP",
+    "NOW_OWNGRP",
+    "NOW_HIPAID",
+    "NOW_GRPFTYP",
     "NOW_CAID",
     "NOW_MCAID",
     "NOW_PCHIP",
@@ -36,7 +39,12 @@
 def _resolve_person_usecols(
     available_columns, spm_unit_columns: list[str]
 ) -> list[str]:
-    requested_columns = PERSON_COLUMNS + spm_unit_columns + TAX_UNIT_COLUMNS
+    requested_columns = (
+        PERSON_COLUMNS
+        + sorted(OPTIONAL_PERSON_COLUMNS.difference(PERSON_COLUMNS))
+        + spm_unit_columns
+        + TAX_UNIT_COLUMNS
+    )
     available_columns = set(available_columns)
     missing_required = sorted(
         column
diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -98,6 +98,82 @@
     ),
 }
 
+# Dataset variable that stores, per person, whether NOW_OWNGRP equals
+# _HAS_CURRENT_OWN_ESI (written in add_personal_income_variables).
+ESI_POLICYHOLDER_VARIABLE = (
+    "reported_owns_employer_sponsored_health_insurance_at_interview"
+)
+# Raw CPS person columns read by the ESI premium imputation; they are also
+# added to the required person columns in _validate_raw_cps_schema.
+ESI_SOURCE_COLUMNS = {"NOW_OWNGRP", "NOW_HIPAID", "NOW_GRPFTYP"}
+
+
+# Plan-type averages used by impute_employer_sponsored_insurance_premiums:
+# "total_premium" is the amount assigned when the employer pays all, and
+# "employee_contribution" is the fallback employee share used when the
+# reported PHIP_VAL is not positive.
+# NOTE(review): presumably annual dollars -- confirm units against the
+# cited MEPS-IC table.
+_ESI_PLAN_PRIORS_2024 = {
+    # AHRQ MEPS-IC Table IV.A.1 (private sector, 2024). These plan-type
+    # averages seed CPS policyholder records; national calibration later
+    # aligns the aggregate to the BEA full-economy employer premium total.
+    "family": {
+        "total_premium": 21_207.52589669509,
+        "employee_contribution": 6_490.205059544782,
+    },
+    "self_only": {
+        "total_premium": 8_389.275834815255,
+        "employee_contribution": 1_909.5781466113417,
+    },
+}
+# Code compared against NOW_OWNGRP to flag a current own-ESI policyholder.
+_HAS_CURRENT_OWN_ESI = 1
+# Codes compared against NOW_HIPAID (employer pays all / some of the premium).
+_EMPLOYER_PAYS_ALL = 1
+_EMPLOYER_PAYS_SOME = 2
+# Codes compared against NOW_GRPFTYP (family vs. self-only plan).
+# NOTE(review): confirm every code value above against the CPS ASEC data
+# dictionary for the survey year in use.
+_ESI_FAMILY_PLAN = 1
+_ESI_SELF_ONLY_PLAN = 2
+
+
+def _person_column(person: DataFrame, column: str, default=0) -> np.ndarray:
+    """Return ``person[column]`` as a NumPy array, or a constant fallback.
+
+    Args:
+        person: Person-level table to read from.
+        column: Name of the column to extract.
+        default: Fill value used for every row when ``column`` is absent.
+
+    Returns:
+        An array with one entry per row of ``person``.
+    """
+    if column in person:
+        return person[column].to_numpy()
+    # Column missing: synthesize an array of the right length so callers can
+    # keep using vectorized comparisons without special-casing.
+    return np.full(len(person), default)
+
+
+def impute_employer_sponsored_insurance_premiums(person: DataFrame) -> np.ndarray:
+    """Impute annual employer-paid ESI premiums for CPS policyholders.
+
+    Args:
+        person: CPS person table. ``PHIP_VAL`` must be present; the
+            ``NOW_OWNGRP``, ``NOW_HIPAID`` and ``NOW_GRPFTYP`` columns are
+            treated as all zeros when absent.
+
+    Returns:
+        One value per row of ``person``: the plan-type total premium when
+        the employer pays all, the total less the employee share (floored
+        at 0) when the employer pays some, and 0 otherwise -- including
+        every row that is not a policyholder with a family or self-only
+        plan.
+    """
+
+    own_esi = _person_column(person, "NOW_OWNGRP").astype(int) == _HAS_CURRENT_OWN_ESI
+    premium_status = _person_column(person, "NOW_HIPAID").astype(int)
+    plan_type = _person_column(person, "NOW_GRPFTYP").astype(int)
+    # Negative reported premiums are floored at zero.
+    employee_paid = np.clip(person.PHIP_VAL.to_numpy(dtype=float), 0, None)
+
+    # Any plan type other than self-only picks up the family priors here;
+    # rows without a recognized plan type are zeroed by the final mask.
+    total_premium = np.where(
+        plan_type == _ESI_SELF_ONLY_PLAN,
+        _ESI_PLAN_PRIORS_2024["self_only"]["total_premium"],
+        _ESI_PLAN_PRIORS_2024["family"]["total_premium"],
+    )
+    average_employee_contribution = np.where(
+        plan_type == _ESI_SELF_ONLY_PLAN,
+        _ESI_PLAN_PRIORS_2024["self_only"]["employee_contribution"],
+        _ESI_PLAN_PRIORS_2024["family"]["employee_contribution"],
+    )
+    # Prefer the reported premium; fall back to the plan-type average when
+    # nothing positive was reported.
+    employee_share = np.where(
+        employee_paid > 0,
+        employee_paid,
+        average_employee_contribution,
+    )
+    # Employer share is the remainder, never below zero (a reported premium
+    # larger than the plan total yields 0) and never above the plan total.
+    employer_paid_when_some = np.clip(
+        total_premium - employee_share,
+        0,
+        total_premium,
+    )
+
+    # Pays-all -> full premium; pays-some -> remainder; any other status -> 0.
+    employer_paid = np.where(
+        premium_status == _EMPLOYER_PAYS_ALL,
+        total_premium,
+        np.where(
+            premium_status == _EMPLOYER_PAYS_SOME,
+            employer_paid_when_some,
+            0,
+        ),
+    )
+    # Only policyholders with a family or self-only plan keep a non-zero value.
+    valid_owner_with_plan = own_esi & np.isin(
+        plan_type,
+        [_ESI_FAMILY_PLAN, _ESI_SELF_ONLY_PLAN],
+    )
+    return np.where(valid_owner_with_plan, employer_paid, 0)
+
 
 @contextmanager
 def _open_dataset_read_only(dataset_source):
@@ -708,6 +784,7 @@ def _validate_raw_cps_schema(
 ) -> None:
     required_person_columns = {
         "CENSUS_TAX_ID",
+        *ESI_SOURCE_COLUMNS,
     }
     required_tax_unit_columns = set()
 
@@ -1136,6 +1213,12 @@ def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int):
     # "What is the annual amount of child support paid?"
     cps["child_support_expense"] = person.CHSP_VAL
     cps["health_insurance_premiums_without_medicare_part_b"] = person.PHIP_VAL
+    cps[ESI_POLICYHOLDER_VARIABLE] = (
+        _person_column(person, "NOW_OWNGRP").astype(int) == _HAS_CURRENT_OWN_ESI
+    )
+    cps["employer_sponsored_insurance_premiums"] = (
+        impute_employer_sponsored_insurance_premiums(person)
+    )
     cps["over_the_counter_health_expenses"] = person.POTC_VAL
     cps["other_medical_expenses"] = person.PMED_VAL
     cps["medicare_enrolled"] = person.MCARE == 1
diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -6,7 +6,12 @@
 import pandas as pd
 from policyengine_core.data import Dataset
 
-from policyengine_us_data.datasets.cps.cps import CPS, CPS_2024, CPS_2024_Full
+from policyengine_us_data.datasets.cps.cps import (
+    CPS,
+    CPS_2024,
+    CPS_2024_Full,
+    ESI_POLICYHOLDER_VARIABLE,
+)
 from policyengine_us_data.datasets.org import (
     ORG_IMPUTED_VARIABLES,
     apply_org_domain_constraints,
@@ -147,6 +152,7 @@ def _supports_structural_mortgage_inputs() -> bool:
     "spm_unit_net_income_reported",
     "spm_unit_pre_subsidy_childcare_expenses",
     # Medical expenses
+    "employer_sponsored_insurance_premiums",
     "health_insurance_premiums_without_medicare_part_b",
     "other_health_insurance_premiums",
     "over_the_counter_health_expenses",
@@ -172,6 +178,7 @@ def _supports_structural_mortgage_inputs() -> bool:
 CPS_STAGE2_DEMOGRAPHIC_PREDICTORS = [
     "age",
     "is_male",
+    "has_esi",
     "tax_unit_is_joint",
     "tax_unit_count_dependents",
 ]
@@ -738,6 +745,16 @@ def _apply_post_processing(predictions, X_test, time_period, data):
         for col in org_cols:
             predictions[col] = constrained[col]
 
+    if "employer_sponsored_insurance_premiums" in predictions.columns:
+        policyholder = _clone_half_person_values(
+            data, ESI_POLICYHOLDER_VARIABLE, time_period
+        )
+        if policyholder is not None:
+            predictions.loc[
+                ~np.asarray(policyholder, dtype=bool),
+                "employer_sponsored_insurance_premiums",
+            ] = 0
+
     return predictions
 
 
diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py
@@ -141,6 +141,13 @@ def extract_national_targets(year: int = DEFAULT_YEAR):
             "notes": "Total household net worth",
             "year": 2024,
         },
+        {
+            "variable": "employer_sponsored_insurance_premiums",
+            "value": 1_002.9e9,
+            "source": "https://apps.bea.gov/scb/issues/2025/09-september/0925-nipa-methodologies.htm",
+            "notes": "BEA group health insurance total in employer contributions for employee pension and insurance funds",
+            "year": 2024,
+        },
         {
             "variable": "health_insurance_premiums_without_medicare_part_b",
             "value": 385e9,
diff --git a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py
@@ -7,6 +7,9 @@
 """
 
 HARD_CODED_TOTALS = {
+    # BEA NIPA 2024 employer contributions for employee pension and
+    # insurance funds: group health insurance.
+    "employer_sponsored_insurance_premiums": 1_002.9e9,
     "health_insurance_premiums_without_medicare_part_b": 385e9,
     "other_medical_expenses": 278e9,
     "medicare_part_b_premium": 112e9,
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -30,6 +30,7 @@
 # database so this dict can be deleted.  See PR #488.
 
 HARD_CODED_TOTALS = {
+    "employer_sponsored_insurance_premiums": 1_002.9e9,
     "health_insurance_premiums_without_medicare_part_b": 385e9,
     "other_medical_expenses": 278e9,
     MEDICARE_PART_B_PREMIUM_VARIABLE: (
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,10 +22,11 @@ classifiers = [
     "Programming Language :: Python :: 3.14",
 ]
 dependencies = [
-    "policyengine-us>=1.680.0",
-    # policyengine-core 3.25.4 fixes PolicyEngine/policyengine-core#482
-    # (user-set ETERNITY inputs lost after _invalidate_all_caches).
-    "policyengine-core>=3.25.4,<3.26",
+    "policyengine-us>=1.682.1",
+    # policyengine-core 3.26.0 includes the fix for
+    # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
+    # after _invalidate_all_caches) and is required by policyengine-us 1.682.1.
+    "policyengine-core>=3.26.0,<3.27",
     "pandas>=2.3.1",
     "requests>=2.25.0",
     "tqdm>=4.60.0",
diff --git a/tests/unit/test_employer_sponsored_insurance_premiums.py b/tests/unit/test_employer_sponsored_insurance_premiums.py
@@ -0,0 +1,127 @@
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from policyengine_us_data.datasets.cps.census_cps import (
+    PERSON_COLUMNS,
+    TAX_UNIT_COLUMNS,
+    _resolve_person_usecols,
+)
+from policyengine_us_data.datasets.cps.cps import (
+    ESI_POLICYHOLDER_VARIABLE,
+    ESI_SOURCE_COLUMNS,
+    _EMPLOYER_PAYS_ALL,
+    _EMPLOYER_PAYS_SOME,
+    _ESI_PLAN_PRIORS_2024,
+    _validate_raw_cps_schema,
+    impute_employer_sponsored_insurance_premiums,
+)
+from policyengine_us_data.datasets.cps.extended_cps import (
+    CPS_ONLY_IMPUTED_VARIABLES,
+)
+from policyengine_us_data.storage.calibration_targets.pull_hardcoded_targets import (
+    HARD_CODED_TOTALS,
+)
+
+
+def test_resolve_person_usecols_requests_optional_esi_columns_when_available():
+    """The three ESI source columns are selected when the file offers them."""
+    # Advertise every person and tax-unit column plus the ESI columns.
+    available = (
+        PERSON_COLUMNS
+        + TAX_UNIT_COLUMNS
+        + [
+            "NOW_OWNGRP",
+            "NOW_HIPAID",
+            "NOW_GRPFTYP",
+        ]
+    )
+    usecols = _resolve_person_usecols(available, spm_unit_columns=[])
+
+    for column in ["NOW_OWNGRP", "NOW_HIPAID", "NOW_GRPFTYP"]:
+        assert column in usecols
+
+
+def test_impute_employer_sponsored_insurance_premiums():
+    """Check each branch of the employer-paid premium imputation."""
+    # Row 0: policyholder, employer pays all, self-only plan.
+    # Row 1: policyholder, employer pays some, self-only, reports 1,200.
+    # Row 2: policyholder, employer pays some, family plan, nothing reported.
+    # Row 3: not a policyholder (NOW_OWNGRP == 0).
+    # Row 4: policyholder, employer pays some, family plan, reported premium
+    #        larger than the family plan total.
+    person = pd.DataFrame(
+        {
+            "NOW_OWNGRP": [1, 1, 1, 0, 1],
+            "NOW_HIPAID": [1, 2, 2, 1, 2],
+            "NOW_GRPFTYP": [2, 2, 1, 2, 1],
+            "PHIP_VAL": [0, 1_200, 0, 0, 50_000],
+        }
+    )
+
+    result = impute_employer_sponsored_insurance_premiums(person)
+
+    # Employer pays all: the full self-only premium.
+    np.testing.assert_allclose(
+        result[0],
+        _ESI_PLAN_PRIORS_2024["self_only"]["total_premium"],
+    )
+    # Employer pays some: total minus the reported employee premium.
+    np.testing.assert_allclose(
+        result[1],
+        _ESI_PLAN_PRIORS_2024["self_only"]["total_premium"] - 1_200,
+    )
+    # No reported premium: total minus the average employee contribution.
+    np.testing.assert_allclose(
+        result[2],
+        _ESI_PLAN_PRIORS_2024["family"]["total_premium"]
+        - _ESI_PLAN_PRIORS_2024["family"]["employee_contribution"],
+    )
+    # Non-policyholders get nothing.
+    assert result[3] == 0
+    # A reported premium above the plan total is floored at zero.
+    assert result[4] == 0
+
+
+def test_impute_employer_sponsored_insurance_premiums_tolerates_missing_esi_columns():
+    """Without the NOW_* columns the imputation returns all zeros."""
+    # Only PHIP_VAL is supplied; the absent ESI columns default to 0, so no
+    # row qualifies as a policyholder.
+    person = pd.DataFrame({"PHIP_VAL": [1_000, 2_000]})
+
+    result = impute_employer_sponsored_insurance_premiums(person)
+
+    np.testing.assert_array_equal(result, np.zeros(2))
+
+
+def test_imputation_status_codes_remain_stable():
+    """Pin the NOW_HIPAID code values the imputation compares against."""
+    assert _EMPLOYER_PAYS_ALL == 1
+    assert _EMPLOYER_PAYS_SOME == 2
+
+
+def test_extended_cps_imputes_esi_premiums_for_clone_half():
+    """The ESI premium variable is listed in CPS_ONLY_IMPUTED_VARIABLES."""
+    assert "employer_sponsored_insurance_premiums" in CPS_ONLY_IMPUTED_VARIABLES
+
+
+def test_hardcoded_targets_include_total_esi_premiums():
+    """The hard-coded national ESI premium total is pinned at 1,002.9e9."""
+    assert HARD_CODED_TOTALS["employer_sponsored_insurance_premiums"] == 1_002.9e9
+
+
+def test_target_config_includes_total_esi_premiums():
+    """The calibration target config mentions the ESI premium variable."""
+    # parents[2] climbs from tests/unit/<this file> to the repository root.
+    target_config_path = Path(__file__).parents[2] / (
+        "policyengine_us_data/calibration/target_config.yaml"
+    )
+    content = target_config_path.read_text()
+
+    # Plain substring check; the YAML is not parsed.
+    assert "employer_sponsored_insurance_premiums" in content
+
+
+def test_policyholder_variable_name_remains_stable():
+    """Pin the dataset variable name used for the policyholder flag."""
+    assert (
+        ESI_POLICYHOLDER_VARIABLE
+        == "reported_owns_employer_sponsored_health_insurance_at_interview"
+    )
+
+
+def test_raw_cps_schema_requires_esi_source_columns():
+    """Schema validation passes with the ESI columns and fails without one."""
+    # Minimal person table: the tax ID plus every ESI source column.
+    person = pd.DataFrame(
+        {
+            "CENSUS_TAX_ID": [1],
+            **{column: [1] for column in ESI_SOURCE_COLUMNS},
+        }
+    )
+    tax_unit = pd.DataFrame()
+
+    # Complete schema: must not raise.
+    _validate_raw_cps_schema(person, tax_unit, "raw")
+
+    # Dropping one ESI column must raise a ValueError that names it.
+    stale_person = person.drop(columns=["NOW_OWNGRP"])
+    try:
+        _validate_raw_cps_schema(stale_person, tax_unit, "raw")
+    except ValueError as error:
+        assert "NOW_OWNGRP" in str(error)
+    else:
+        # Reached only when validation silently accepted the stale table.
+        raise AssertionError("Expected missing ESI source column to fail validation")
diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Calibrate employer-sponsored insurance premiums and seed CPS policyholder ESI contributions for CBO-style income concepts.`