diff --git a/changelog.d/1149.fixed.md b/changelog.d/1149.fixed.md new file mode 100644 index 000000000..333d947de --- /dev/null +++ b/changelog.d/1149.fixed.md @@ -0,0 +1 @@ +Constrain ECPS calibration to the source household count so PUF clone reweighting cannot inflate total household weight. diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index c097ea0bd..de0d0b58e 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -6,6 +6,7 @@ import pandas as pd from policyengine_us_data.utils import ( ABSOLUTE_ERROR_SCALE_TARGETS, + HOUSEHOLD_COUNT_TARGET, build_loss_matrix, get_target_error_normalisation, get_target_loss_weights, @@ -669,6 +670,11 @@ def generate(self): del loss_matrix, targets_array gc.collect() assert loss_matrix_clean.shape[1] == targets_array_clean.size + if HOUSEHOLD_COUNT_TARGET not in loss_matrix_clean.columns: + raise ValueError( + f"{HOUSEHOLD_COUNT_TARGET} missing from EnhancedCPS " + "calibration targets" + ) loss_matrix_clean = loss_matrix_clean.astype(np.float32) diff --git a/policyengine_us_data/utils/__init__.py b/policyengine_us_data/utils/__init__.py index ffe7639a5..a0a0f2507 100644 --- a/policyengine_us_data/utils/__init__.py +++ b/policyengine_us_data/utils/__init__.py @@ -11,6 +11,7 @@ __all__ = [ "ABSOLUTE_ERROR_SCALE_TARGETS", "HardConcrete", + "HOUSEHOLD_COUNT_TARGET", "build_loss_matrix", "get_target_error_normalisation", "get_target_loss_weights", diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 54fb4bc9e..373dc4731 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -82,6 +82,8 @@ BEA_NIPA_DIRECT_SUM_LOSS_WEIGHT = 1_000.0 BEA_WAGES_AND_SALARIES_LOSS_WEIGHT = 1_000.0 +HOUSEHOLD_COUNT_TARGET = "nation/source/household_count" +HOUSEHOLD_COUNT_LOSS_WEIGHT = 1_000.0 CBO_INCOME_BY_SOURCE_TARGETS = [ 
def _add_household_count_target(loss_matrix, targets_list, sim):
    """Constrain total household weight to the source survey total.

    Adds a ``HOUSEHOLD_COUNT_TARGET`` column of float32 ones to
    ``loss_matrix`` and appends the sum of the simulation's
    ``household_weight`` values to ``targets_list`` as the matching target.

    Args:
        loss_matrix: DataFrame with one row per household; modified in place.
        targets_list: List of target values; the new target is appended
            in place.
        sim: Object whose ``calculate("household_weight")`` result exposes
            the weights through a ``.values`` attribute.

    Returns:
        The ``(targets_list, loss_matrix)`` pair after the update.

    Raises:
        ValueError: If the number of loss-matrix rows differs from the number
            of weights, if the target column is already present, or if the
            weight total is not positive and finite.
    """

    household_weights = sim.calculate("household_weight").values
    if len(loss_matrix) != len(household_weights):
        raise ValueError(
            "Household count target length mismatch: "
            f"loss matrix has {len(loss_matrix)} rows but household_weight has "
            f"{len(household_weights)} values"
        )

    # Re-adding the column would overwrite it while appending a second target,
    # leaving targets_list out of step with the loss-matrix columns.
    if HOUSEHOLD_COUNT_TARGET in loss_matrix.columns:
        raise ValueError(
            f"{HOUSEHOLD_COUNT_TARGET} is already present in the loss matrix"
        )

    # Accumulate in float64: summing single-precision weights in their own
    # dtype can round a large total and shift the calibration target.
    target = float(np.sum(household_weights, dtype=np.float64))
    if not np.isfinite(target) or target <= 0:
        raise ValueError(
            "Household count target must have positive finite source weight total"
        )

    loss_matrix[HOUSEHOLD_COUNT_TARGET] = np.ones(
        len(household_weights),
        dtype=np.float32,
    )
    targets_list.append(target)
    return targets_list, loss_matrix
b/policyengine_us_data/utils/national_target_parity.py index 00f0655d8..9656bdd1b 100644 --- a/policyengine_us_data/utils/national_target_parity.py +++ b/policyengine_us_data/utils/national_target_parity.py @@ -675,6 +675,8 @@ def _legacy_reason(target_name: str) -> str: return "legacy_cms_aca_spending_target_not_in_target_db" if target_name.startswith("nation/accounting/"): return "legacy_accounting_balance_target_not_in_target_db" + if target_name == "nation/source/household_count": + return "legacy_source_household_count_target_not_in_target_db" if target_name.startswith("nation/irs/negative_household_market_income_"): return "legacy_negative_market_income_target_not_in_target_db" if target_name == "nation/census/infants": diff --git a/pyproject.toml b/pyproject.toml index 0ff9663b1..975e75829 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.709.1", + "policyengine-us==1.711.0", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. 
diff --git a/tests/unit/calibration/test_loss_targets.py b/tests/unit/calibration/test_loss_targets.py index 8c7920773..2cacbbc9a 100644 --- a/tests/unit/calibration/test_loss_targets.py +++ b/tests/unit/calibration/test_loss_targets.py @@ -16,12 +16,15 @@ BEA_WAGES_AND_SALARIES_LOSS_WEIGHT, BLS_CE_TOTALS, HARD_CODED_TOTALS, + HOUSEHOLD_COUNT_LOSS_WEIGHT, + HOUSEHOLD_COUNT_TARGET, LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES, SOI_NEGATIVE_AGI_TARGETED_VARIABLES, TRANSFER_BALANCE_TARGETS, _add_bea_state_wage_targets, _add_agi_metric_columns, _add_acs_housing_cost_targets, + _add_household_count_target, _add_aotc_targets, _add_bls_ce_targets, _add_ctc_targets, @@ -167,6 +170,22 @@ def test_bea_nipa_direct_sum_targets_get_higher_loss_weight(): ] +def test_household_count_target_gets_higher_loss_weight(): + target_names = np.array( + [ + HOUSEHOLD_COUNT_TARGET, + "nation/census/population_by_age/0", + ] + ) + + weights = get_target_loss_weights(target_names) + + assert weights.tolist() == [ + HOUSEHOLD_COUNT_LOSS_WEIGHT, + 1.0, + ] + + def test_aca_targets_roll_forward_to_2025(): targets, data_year = _load_aca_spending_and_enrollment_targets(2025) @@ -243,6 +262,17 @@ def __init__(self, values): self.values = np.asarray(values) +class _FakeHouseholdWeightSimulation: + def __init__(self, weights): + self.weights = weights + + def calculate(self, variable, map_to=None, period=None): + assert variable == "household_weight" + assert map_to is None + assert period is None + return _FakeArrayResult(self.weights) + + class _FakeSimulation: def __init__(self): self.calculate_calls = [] @@ -427,6 +457,28 @@ def test_state_agi_targets_are_limited_to_filers(tmp_path, monkeypatch): ) +def test_add_household_count_target_uses_source_weight_total(): + loss_matrix = pd.DataFrame(index=[101, 102, 103, 104]) + + targets, loss_matrix = _add_household_count_target( + loss_matrix, + [], + _FakeHouseholdWeightSimulation([80.0, 20.0, 0.0, 0.0]), + ) + + assert targets == [100.0] + 
np.testing.assert_array_equal( + loss_matrix[HOUSEHOLD_COUNT_TARGET].to_numpy(), + np.ones(4, dtype=np.float32), + ) + + +def test_build_loss_matrix_adds_household_count_target_before_reweighting(): + source = inspect.getsource(build_loss_matrix) + + assert "_add_household_count_target" in source + + def test_add_ssi_recipient_targets_adds_total_and_age_counts(): targets, loss_matrix = _add_ssi_recipient_targets( pd.DataFrame(), diff --git a/tests/unit/datasets/test_enhanced_cps_seeding.py b/tests/unit/datasets/test_enhanced_cps_seeding.py index 60bba54d6..11e7e457d 100644 --- a/tests/unit/datasets/test_enhanced_cps_seeding.py +++ b/tests/unit/datasets/test_enhanced_cps_seeding.py @@ -42,3 +42,17 @@ def test_enhanced_cps_sources_use_deterministic_weight_priors(): assert "np.random.normal" not in source assert source.count("initialize_weight_priors(original_weights.values)") == 2 + + +def test_initialize_weight_priors_preserves_source_weight_total(): + from policyengine_us_data.datasets.cps.enhanced_cps import ( + initialize_weight_priors, + ) + + priors = initialize_weight_priors( + np.array([80.0, 20.0, 0.0, 0.0]), + zero_weight_total_share=0.5, + ) + + np.testing.assert_allclose(priors.sum(), 100.0) + np.testing.assert_allclose(priors, np.array([40.0, 10.0, 25.0, 25.0])) diff --git a/uv.lock b/uv.lock index e0e4ab23d..71f4a0242 100644 --- a/uv.lock +++ b/uv.lock @@ -2164,7 +2164,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.709.1" +version = "1.711.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2174,9 +2174,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/20/dc/068115100ff17bb7245e7237777c7efee926f8788420e49b6dcd8091c483/policyengine_us-1.709.1.tar.gz", hash = "sha256:66e4f09629fcca1fd9094e6211d0ee1e514c4f00ce55addca5d7063794963384", size = 9951226, upload-time = "2026-05-26T22:23:52.351Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/48/ed/8825980a62e009610d6fa36f55f6c8a32deb0fb770d1f3513e2df9c7f7fe/policyengine_us-1.711.0.tar.gz", hash = "sha256:c52c8e68f3a01ee5935320175e841459503e67f84c41899f9768f4a5b300b4a3", size = 9956103, upload-time = "2026-05-27T21:31:17.868Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3a/fc/4925a9e5d2a6f738d4c3c8252e6e52ece83c7da8e5cb8a66e98e8e80faa3/policyengine_us-1.709.1-py3-none-any.whl", hash = "sha256:9d26ff9a84a4f0c99cf9529fbfb6f07572c697010628fcd1f65fdb350cec40e6", size = 10870006, upload-time = "2026-05-26T22:23:49.08Z" }, + { url = "https://files.pythonhosted.org/packages/ff/aa/3e8471c852c75ecc7c2cbbdaedf79b70a8d207df7f689abfd2b3b570bd7a/policyengine_us-1.711.0-py3-none-any.whl", hash = "sha256:e37d7ee5926954ecf9e03d91ccd190a1609e6322426c12fd6cdd867a913ee2d9", size = 10887738, upload-time = "2026-05-27T21:31:14.859Z" }, ] [[package]] @@ -2246,7 +2246,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.709.1" }, + { name = "policyengine-us", specifier = "==1.711.0" }, { name = "requests", specifier = ">=2.25.0" }, { name = "scipy", specifier = ">=1.15.3" }, { name = "setuptools", specifier = ">=60" },