diff --git a/changelog.d/disability-benefit-categories.changed.md b/changelog.d/disability-benefit-categories.changed.md new file mode 100644 index 000000000..6098696ea --- /dev/null +++ b/changelog.d/disability-benefit-categories.changed.md @@ -0,0 +1 @@ +Map reported disability benefit amounts to category inputs in the data pipeline. diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index d49566781..8a8cec5f8 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -30,6 +30,9 @@ def main(): assert_local_build_environment() from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.datasets.disability_benefits import ( + strip_internal_disability_reported_amounts, + ) from policyengine_uk_data.datasets.frs import create_frs from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk_data.utils.progress import ( @@ -79,8 +82,11 @@ def main(): frs = create_frs( raw_frs_folder=STORAGE_FOLDER / "frs_2023_24", year=2023, + include_internal_disability_reported_amounts=True, + ) + strip_internal_disability_reported_amounts(frs).save( + STORAGE_FOLDER / "frs_2023_24.h5" ) - frs.save(STORAGE_FOLDER / "frs_2023_24.h5") update_dataset("Create base FRS dataset", "completed") # Import imputation functions @@ -212,7 +218,9 @@ def main(): update_dataset("Downrate to 2023", "completed") update_dataset("Save final dataset", "processing") - frs_calibrated.save(STORAGE_FOLDER / "enhanced_frs_2023_24.h5") + strip_internal_disability_reported_amounts(frs_calibrated).save( + STORAGE_FOLDER / "enhanced_frs_2023_24.h5" + ) update_dataset("Save final dataset", "completed") # Create tiny (n=1000 households) versions for testing @@ -225,7 +233,10 @@ def main(): tiny_frs = subsample_dataset(frs_base, TINY_SIZE) tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5") - tiny_enhanced = subsample_dataset(frs_calibrated, TINY_SIZE) + tiny_enhanced = subsample_dataset( + strip_internal_disability_reported_amounts(frs_calibrated), + TINY_SIZE, + ) tiny_enhanced.save(STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5") update_dataset("Create tiny datasets", "completed") diff --git a/policyengine_uk_data/datasets/disability_benefits.py b/policyengine_uk_data/datasets/disability_benefits.py new file mode 100644 index 000000000..f223a94a9 --- /dev/null +++ b/policyengine_uk_data/datasets/disability_benefits.py @@ -0,0 +1,204 @@ +"""Dataset-side disability benefit category mapping. + +PolicyEngine UK models PIP, DLA, and Attendance Allowance from category +inputs. The FRS observes reported amounts, so the data pipeline keeps those +amounts as internal build intermediates and converts them to model inputs +before datasets are published. +""" + +from __future__ import annotations + +from functools import lru_cache + +import numpy as np +import pandas as pd +from policyengine_uk import CountryTaxBenefitSystem +from policyengine_uk.data import UKSingleYearDataset +from policyengine_uk.model_api import WEEKS_IN_YEAR as MODEL_WEEKS_IN_YEAR + + +DISABILITY_REPORTED_AMOUNT_COLUMNS = ( + "attendance_allowance_reported", + "dla_sc_reported", + "dla_m_reported", + "pip_m_reported", + "pip_dl_reported", +) + +DISABILITY_CATEGORY_COLUMNS = ( + "aa_category", + "dla_sc_category", + "dla_m_category", + "pip_m_category", + "pip_dl_category", +) + +SAFETY_MARGIN = 0.1 +SURVEY_REPORTED_AMOUNT_WEEKS_IN_YEAR = 365.25 / 7 + + +@lru_cache(maxsize=None) +def _dwp_category_threshold_parameters(year: int): + # Match the category formulas removed from policyengine-uk. Those formulas + # thresholded reported amounts against the baseline DWP rates. + return CountryTaxBenefitSystem().parameters(year).baseline.gov.dwp + + +@lru_cache(maxsize=None) +def _dwp_flag_parameters(year: int): + # Match the FRS disability flag derivation that already lived in uk-data. + return CountryTaxBenefitSystem().parameters(year).gov.dwp + + +def _reported_amount(person: pd.DataFrame, column: str) -> pd.Series: + if column not in person.columns: + return pd.Series(0.0, index=person.index) + return pd.to_numeric(person[column], errors="coerce").fillna(0.0) + + +def _category_from_reported_amount( + reported_amount: pd.Series, + thresholds: tuple[tuple[str, float], ...], +) -> np.ndarray: + weekly_amount = pd.to_numeric(reported_amount, errors="coerce").fillna(0) + weekly_amount = weekly_amount.to_numpy(dtype=float) / MODEL_WEEKS_IN_YEAR + category = np.full(len(weekly_amount), "NONE", dtype=object) + for category_name, weekly_rate in thresholds: + category[weekly_amount >= float(weekly_rate) * (1 - SAFETY_MARGIN)] = ( + category_name + ) + return category + + +def add_disability_benefit_categories_from_reported_amounts( + person: pd.DataFrame, + year: int, + *, + inplace: bool = False, +) -> pd.DataFrame: + """Convert reported disability benefit amounts into category inputs.""" + + if not inplace: + person = person.copy() + + dwp = _dwp_category_threshold_parameters(int(year)) + mappings = ( + ( + "attendance_allowance_reported", + "aa_category", + ( + ("LOWER", dwp.attendance_allowance.lower), + ("HIGHER", dwp.attendance_allowance.higher), + ), + ), + ( + "dla_sc_reported", + "dla_sc_category", + ( + ("LOWER", dwp.dla.self_care.lower), + ("MIDDLE", dwp.dla.self_care.middle), + ("HIGHER", dwp.dla.self_care.higher), + ), + ), + ( + "dla_m_reported", + "dla_m_category", + ( + ("LOWER", dwp.dla.mobility.lower), + ("HIGHER", dwp.dla.mobility.higher), + ), + ), + ( + "pip_m_reported", + "pip_m_category", + ( + ("STANDARD", dwp.pip.mobility.standard), + ("ENHANCED", dwp.pip.mobility.enhanced), + ), + ), + ( + "pip_dl_reported", + "pip_dl_category", + ( + ("STANDARD", dwp.pip.daily_living.standard), + ("ENHANCED", dwp.pip.daily_living.enhanced), + ), + ), + ) + + for reported_column, category_column, thresholds in mappings: + if reported_column in person.columns: + person[category_column] = _category_from_reported_amount( + person[reported_column], + thresholds, + ) + + return person + + +def add_disability_benefit_flags_from_reported_amounts( + person: pd.DataFrame, + year: int, + *, + inplace: bool = False, +) -> pd.DataFrame: + """Recompute disability flags derived from reported benefit amounts.""" + + if not inplace: + person = person.copy() + + dwp = _dwp_flag_parameters(int(year)) + dla_sc = _reported_amount(person, "dla_sc_reported") + dla_m = _reported_amount(person, "dla_m_reported") + pip_m = _reported_amount(person, "pip_m_reported") + pip_dl = _reported_amount(person, "pip_dl_reported") + afcs = _reported_amount(person, "afcs_reported") + + person["is_disabled_for_benefits"] = (dla_sc + dla_m + pip_m + pip_dl) > 0 + + threshold_safety_gap = 1 * SURVEY_REPORTED_AMOUNT_WEEKS_IN_YEAR + dla_sc_higher = ( + dwp.dla.self_care.higher * SURVEY_REPORTED_AMOUNT_WEEKS_IN_YEAR + - threshold_safety_gap + ) + pip_dl_enhanced = ( + dwp.pip.daily_living.enhanced * SURVEY_REPORTED_AMOUNT_WEEKS_IN_YEAR + - threshold_safety_gap + ) + + person["is_enhanced_disabled_for_benefits"] = dla_sc > dla_sc_higher + person["is_severely_disabled_for_benefits"] = ( + (dla_sc >= dla_sc_higher) | (pip_dl >= pip_dl_enhanced) | (afcs > 0) + ) + + return person + + +def drop_internal_disability_reported_amounts( + person: pd.DataFrame, + *, + inplace: bool = False, +) -> pd.DataFrame: + """Drop disability amount intermediates that are not PE-UK inputs.""" + + if inplace: + person.drop( + columns=list(DISABILITY_REPORTED_AMOUNT_COLUMNS), + errors="ignore", + inplace=True, + ) + return person + return person.drop( + columns=list(DISABILITY_REPORTED_AMOUNT_COLUMNS), + errors="ignore", + ) + + +def strip_internal_disability_reported_amounts( + dataset: UKSingleYearDataset, +) -> UKSingleYearDataset: + """Return ``dataset`` without internal disability amount intermediates.""" + + dataset = dataset.copy() + dataset.person = drop_internal_disability_reported_amounts(dataset.person) + return dataset diff --git a/policyengine_uk_data/datasets/enhanced_cps.py b/policyengine_uk_data/datasets/enhanced_cps.py index 5819573d4..611161462 100644 --- a/policyengine_uk_data/datasets/enhanced_cps.py +++ b/policyengine_uk_data/datasets/enhanced_cps.py @@ -35,26 +35,13 @@ "yearly-average-currency-exchange-rates" ) -# 2025/26 reported-benefit mapping assumptions used only to populate UK input -# leaves from U.S. source records. PolicyEngine UK applies its own parameters -# when calculating derived tax and benefit outputs. +# 2025/26 benefit mapping assumptions used only to populate UK input leaves from +# U.S. source records. PolicyEngine UK applies its own parameters when +# calculating derived tax and benefit outputs. NEW_STATE_PENSION_2025 = 224.96 * 52 DIVIDEND_YIELD_FOR_WEALTH_IMPUTATION = 0.03 RENTAL_YIELD_FOR_WEALTH_IMPUTATION = 0.04 -PIP_2025_WEEKLY_RATES = { - "daily_living": { - "NONE": 0.0, - "STANDARD": 73.89, - "ENHANCED": 110.40, - }, - "mobility": { - "NONE": 0.0, - "STANDARD": 29.19, - "ENHANCED": 77.04, - }, -} - REGION_SHARES = ( ("NORTH_EAST", 0.04), ("NORTH_WEST", 0.11), @@ -248,11 +235,6 @@ def _pip_category(person: dict) -> str: return "ENHANCED" if severe_signal or low_earnings else "STANDARD" -def _pip_reported_amount(category: str, component: str) -> float: - weekly = PIP_2025_WEEKLY_RATES[component][category] - return round(weekly * 52, 2) - - def _household_cash_income(people: list[dict], exchange_rate: float) -> float: total = 0.0 for person in people: @@ -688,14 +670,8 @@ def _build_base_dataset( if bool(inputs.get("is_blind", False)) else 0.0, "is_disabled_for_benefits": bool(inputs.get("is_disabled", False)), - "pip_dl_reported": _pip_reported_amount( - pip_category, - "daily_living", - ), - "pip_m_reported": _pip_reported_amount( - pip_category, - "mobility", - ), + "pip_dl_category": pip_category, + "pip_m_category": pip_category, "hours_worked": float( inputs.get( "weekly_hours_worked", diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index f20dd08e8..358d08a1b 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -17,6 +17,11 @@ from policyengine_uk.variables.household.income.employment_status import ( EmploymentStatus, ) +from policyengine_uk_data.datasets.disability_benefits import ( + add_disability_benefit_categories_from_reported_amounts, + add_disability_benefit_flags_from_reported_amounts, + drop_internal_disability_reported_amounts, +) from policyengine_uk_data.utils.datasets import ( sum_to_entity, categorical, @@ -475,6 +480,7 @@ def split_reported_education_grants( def create_frs( raw_frs_folder: str, year: int, + include_internal_disability_reported_amounts: bool = False, ) -> UKSingleYearDataset: """ Process raw FRS data into PolicyEngine UK dataset format. @@ -487,6 +493,9 @@ def create_frs( Args: raw_frs_folder: Path to folder containing raw FRS .tab files. year: Survey year for the dataset. + include_internal_disability_reported_amounts: Keep raw disability + benefit amount intermediates for downstream imputation. Public + saved datasets should leave this as ``False``. Returns: UKSingleYearDataset with processed FRS data ready for policy simulation. @@ -1010,6 +1019,12 @@ def determine_education_level(fted_val, typeed2_val, age_val): * WEEKS_IN_YEAR ) + pe_person = add_disability_benefit_categories_from_reported_amounts( + pe_person, + year, + inplace=True, + ) + pe_person["jsa_contrib_reported"] = ( sum_to_entity( benefits.benamt * (benefits.var2.isin((1, 3))) * (benefits.benefit == 14), @@ -1266,35 +1281,10 @@ def determine_education_level(fted_val, typeed2_val, age_val): pe_household["brma"] = brmas - parameters = sim.tax_benefit_system.parameters - benefit = parameters(year).gov.dwp - - pe_person["is_disabled_for_benefits"] = ( - pe_person.dla_sc_reported - + pe_person.dla_m_reported - + pe_person.pip_m_reported - + pe_person.pip_dl_reported - ) > 0 - - THRESHOLD_SAFETY_GAP = 1 * WEEKS_IN_YEAR - - pe_person["is_enhanced_disabled_for_benefits"] = ( - pe_person.dla_sc_reported - > benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP - ) - - # Child Tax Credit Regulations 2002 s. 8 - paragraph_3 = ( - pe_person.dla_sc_reported - >= benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP - ) - paragraph_4 = ( - pe_person.pip_dl_reported - >= benefit.pip.daily_living.enhanced * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP - ) - paragraph_5 = pe_person.afcs_reported > 0 - pe_person["is_severely_disabled_for_benefits"] = ( - paragraph_3 | paragraph_4 | paragraph_5 + pe_person = add_disability_benefit_flags_from_reported_amounts( + pe_person, + year, + inplace=True, ) # Dataset-side claimant-state approximations for future legacy ESA/JSA @@ -1460,6 +1450,9 @@ def _reported_benunit_mask(person_column: str) -> np.ndarray: np.random.random(len(pe_household)) < PROPERTY_PURCHASE_RATE ) + if not include_internal_disability_reported_amounts: + pe_person = drop_internal_disability_reported_amounts(pe_person) + dataset = UKSingleYearDataset( person=pe_person, benunit=pe_benunit, diff --git a/policyengine_uk_data/datasets/imputations/frs_only.py b/policyengine_uk_data/datasets/imputations/frs_only.py index 1730bc257..242fbf247 100644 --- a/policyengine_uk_data/datasets/imputations/frs_only.py +++ b/policyengine_uk_data/datasets/imputations/frs_only.py @@ -36,6 +36,10 @@ import numpy as np import pandas as pd from policyengine_uk.data import UKSingleYearDataset +from policyengine_uk_data.datasets.disability_benefits import ( + add_disability_benefit_categories_from_reported_amounts, + add_disability_benefit_flags_from_reported_amounts, +) logger = logging.getLogger(__name__) @@ -237,4 +241,13 @@ def impute_frs_only_variables( values = np.maximum(predictions[column].values, 0.0) target_dataset.person[column] = values + target_dataset.person = add_disability_benefit_categories_from_reported_amounts( + target_dataset.person, + int(str(target_dataset.time_period)[:4]), + ) + target_dataset.person = add_disability_benefit_flags_from_reported_amounts( + target_dataset.person, + int(str(target_dataset.time_period)[:4]), + ) + return target_dataset diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index d39392b74..a108f83ad 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -7,7 +7,6 @@ """ import pandas as pd -from pathlib import Path import numpy as np from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk.data import UKSingleYearDataset @@ -265,6 +264,9 @@ def impute_income(dataset: UKSingleYearDataset) -> UKSingleYearDataset: from policyengine_uk_data.datasets.imputations.frs_only import ( impute_frs_only_variables, ) + from policyengine_uk_data.datasets.disability_benefits import ( + strip_internal_disability_reported_amounts, + ) zero_weight_copy = impute_frs_only_variables( train_dataset=dataset, @@ -285,4 +287,4 @@ def impute_income(dataset: UKSingleYearDataset) -> UKSingleYearDataset: zero_weight_copy, ) - return data + return strip_internal_disability_reported_amounts(data) diff --git a/policyengine_uk_data/storage/enhanced_cps_2025.h5 b/policyengine_uk_data/storage/enhanced_cps_2025.h5 index 6184d619b..e3f20a981 100644 Binary files a/policyengine_uk_data/storage/enhanced_cps_2025.h5 and b/policyengine_uk_data/storage/enhanced_cps_2025.h5 differ diff --git a/policyengine_uk_data/storage/uprating_factors.csv b/policyengine_uk_data/storage/uprating_factors.csv index 5a7e54df4..c5f6f638e 100644 --- a/policyengine_uk_data/storage/uprating_factors.csv +++ b/policyengine_uk_data/storage/uprating_factors.csv @@ -1,7 +1,6 @@ Variable,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034 afcs_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 alcohol_and_tobacco_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 -attendance_allowance_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 benunit_rent,1.0,1.0,1.0,1.11,1.184,1.223,1.275,1.312,1.351,1.392,1.392,1.392,1.392,1.392,1.392 bsp_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 capital_gains,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 @@ -15,8 +14,6 @@ communication_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.3 corporate_wealth,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 diesel_spending,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 dividend_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 -dla_m_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 -dla_sc_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 domestic_energy_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 education_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 employee_pension_contributions,1.0,1.059,1.127,1.205,1.261,1.308,1.337,1.365,1.396,1.431,1.431,1.431,1.431,1.431,1.431 @@ -60,8 +57,6 @@ pension_credit_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38, pension_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 personal_pension_contributions,1.0,1.059,1.127,1.205,1.261,1.308,1.337,1.365,1.396,1.431,1.431,1.431,1.431,1.431,1.431 petrol_spending,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 -pip_dl_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 -pip_m_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 private_pension_income,1.0,1.003,1.053,1.106,1.161,1.216,1.261,1.288,1.315,1.346,1.346,1.346,1.346,1.346,1.346 private_transfer_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 property_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 diff --git a/policyengine_uk_data/storage/uprating_growth_factors.csv b/policyengine_uk_data/storage/uprating_growth_factors.csv index eb8b7fb6d..7d330d17a 100644 --- a/policyengine_uk_data/storage/uprating_growth_factors.csv +++ b/policyengine_uk_data/storage/uprating_growth_factors.csv @@ -1,7 +1,6 @@ Variable,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034 afcs_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 alcohol_and_tobacco_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 -attendance_allowance_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 benunit_rent,0,0.0,0.0,0.11,0.067,0.033,0.043,0.029,0.03,0.03,0.0,0.0,0.0,0.0,0.0 bsp_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 capital_gains,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 @@ -15,8 +14,6 @@ communication_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0, corporate_wealth,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 diesel_spending,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 dividend_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 -dla_m_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 -dla_sc_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 domestic_energy_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 education_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 employee_pension_contributions,0,0.059,0.064,0.069,0.046,0.037,0.022,0.021,0.023,0.025,0.0,0.0,0.0,0.0,0.0 @@ -60,8 +57,6 @@ pension_credit_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0. pension_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 personal_pension_contributions,0,0.059,0.064,0.069,0.046,0.037,0.022,0.021,0.023,0.025,0.0,0.0,0.0,0.0,0.0 petrol_spending,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 -pip_dl_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 -pip_m_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 private_pension_income,0,0.003,0.05,0.05,0.05,0.047,0.037,0.021,0.021,0.024,0.0,0.0,0.0,0.0,0.0 private_transfer_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 property_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 diff --git a/policyengine_uk_data/tests/test_disability_benefits.py b/policyengine_uk_data/tests/test_disability_benefits.py new file mode 100644 index 000000000..44bb016d7 --- /dev/null +++ b/policyengine_uk_data/tests/test_disability_benefits.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +import pandas as pd +from policyengine_uk import CountryTaxBenefitSystem +from policyengine_uk.data import UKSingleYearDataset +from policyengine_uk.model_api import WEEKS_IN_YEAR + +from policyengine_uk_data.datasets.disability_benefits import ( + add_disability_benefit_categories_from_reported_amounts, + add_disability_benefit_flags_from_reported_amounts, + drop_internal_disability_reported_amounts, + strip_internal_disability_reported_amounts, +) + + +def test_reported_amounts_map_to_disability_categories(): + year = 2025 + dwp = CountryTaxBenefitSystem().parameters(year).baseline.gov.dwp + person = pd.DataFrame( + { + "attendance_allowance_reported": [ + 0, + dwp.attendance_allowance.lower * WEEKS_IN_YEAR * 0.91, + dwp.attendance_allowance.higher * WEEKS_IN_YEAR * 0.91, + ], + "dla_sc_reported": [ + 0, + dwp.dla.self_care.lower * WEEKS_IN_YEAR * 0.91, + dwp.dla.self_care.middle * WEEKS_IN_YEAR * 0.91, + ], + "dla_m_reported": [ + 0, + dwp.dla.mobility.lower * WEEKS_IN_YEAR * 0.91, + dwp.dla.mobility.higher * WEEKS_IN_YEAR * 0.91, + ], + "pip_m_reported": [ + 0, + dwp.pip.mobility.standard * WEEKS_IN_YEAR * 0.91, + dwp.pip.mobility.enhanced * WEEKS_IN_YEAR * 0.91, + ], + "pip_dl_reported": [ + 0, + dwp.pip.daily_living.standard * WEEKS_IN_YEAR * 0.91, + dwp.pip.daily_living.enhanced * WEEKS_IN_YEAR * 0.91, + ], + } + ) + + result = add_disability_benefit_categories_from_reported_amounts(person, year) + + assert result["aa_category"].tolist() == ["NONE", "LOWER", "HIGHER"] + assert result["dla_sc_category"].tolist() == ["NONE", "LOWER", "MIDDLE"] + assert result["dla_m_category"].tolist() == ["NONE", "LOWER", "HIGHER"] + assert result["pip_m_category"].tolist() == ["NONE", "STANDARD", "ENHANCED"] + assert result["pip_dl_category"].tolist() == ["NONE", "STANDARD", "ENHANCED"] + + +def test_reported_amounts_recompute_disability_flags(): + year = 2025 + dwp = CountryTaxBenefitSystem().parameters(year).gov.dwp + person = pd.DataFrame( + { + "dla_sc_reported": [ + 0.0, + dwp.dla.self_care.higher * (365.25 / 7), + 0.0, + ], + "dla_m_reported": [0.0, 0.0, 0.0], + "pip_m_reported": [0.0, 0.0, 0.0], + "pip_dl_reported": [ + 0.0, + 0.0, + dwp.pip.daily_living.enhanced * (365.25 / 7), + ], + "afcs_reported": [0.0, 0.0, 0.0], + "is_disabled_for_benefits": [True, False, False], + "is_enhanced_disabled_for_benefits": [True, False, False], + "is_severely_disabled_for_benefits": [True, False, False], + } + ) + + result = add_disability_benefit_flags_from_reported_amounts(person, year) + + assert result["is_disabled_for_benefits"].tolist() == [False, True, True] + assert result["is_enhanced_disabled_for_benefits"].tolist() == [ + False, + True, + False, + ] + assert result["is_severely_disabled_for_benefits"].tolist() == [ + False, + True, + True, + ] + + +def test_drop_internal_disability_reported_amounts_keeps_categories(): + person = pd.DataFrame( + { + "person_id": [1], + "pip_dl_reported": [1_000.0], + "pip_dl_category": ["STANDARD"], + } + ) + + result = drop_internal_disability_reported_amounts(person) + + assert "pip_dl_reported" not in result.columns + assert result["pip_dl_category"].tolist() == ["STANDARD"] + assert "pip_dl_reported" in person.columns + + +def test_strip_internal_disability_reported_amounts_cleans_dataset_person_frame(): + dataset = UKSingleYearDataset( + person=pd.DataFrame( + { + "person_id": [0], + "person_benunit_id": [0], + "person_household_id": [0], + "pip_dl_reported": [1_000.0], + "pip_dl_category": ["STANDARD"], + } + ), + benunit=pd.DataFrame({"benunit_id": [0]}), + household=pd.DataFrame({"household_id": [0]}), + fiscal_year=2025, + ) + + result = strip_internal_disability_reported_amounts(dataset) + + assert "pip_dl_reported" not in result.person.columns + assert "pip_dl_reported" in dataset.person.columns diff --git a/policyengine_uk_data/tests/test_frs_only_imputation.py b/policyengine_uk_data/tests/test_frs_only_imputation.py index 9393c7436..1961274e4 100644 --- a/policyengine_uk_data/tests/test_frs_only_imputation.py +++ b/policyengine_uk_data/tests/test_frs_only_imputation.py @@ -221,3 +221,40 @@ def test_frs_only_reported_values_correlate_with_training_pattern(): "Stage-2 QRF should produce lower UC-receipt predictions for high-" f"income target rows (got high={high_mean:.2f} vs low={low_mean:.2f})." ) + + +def test_frs_only_recomputes_disability_flags_after_amount_imputation(monkeypatch): + """Disability flags should follow imputed amounts, not donor rows.""" + from policyengine_uk_data.datasets.imputations.frs_only import ( + impute_frs_only_variables, + ) + import policyengine_uk_data.utils.qrf as qrf_module + + class ZeroQRF: + def fit(self, _x, y): + self.output_columns = list(y.columns) + + def predict(self, x): + return pd.DataFrame( + {column: np.zeros(len(x)) for column in self.output_columns}, + index=x.index, + ) + + monkeypatch.setattr(qrf_module, "QRF", ZeroQRF) + + train = _fake_dataset(person_rows=20, seed=0) + target = _fake_dataset(person_rows=5, seed=1) + target.person["is_disabled_for_benefits"] = True + target.person["is_enhanced_disabled_for_benefits"] = True + target.person["is_severely_disabled_for_benefits"] = True + + result = impute_frs_only_variables( + train_dataset=train, + target_dataset=target, + ) + + assert not result.person["is_disabled_for_benefits"].any() + assert not result.person["is_enhanced_disabled_for_benefits"].any() + assert not result.person["is_severely_disabled_for_benefits"].any() + assert (result.person["pip_dl_category"] == "NONE").all() + assert (result.person["pip_m_category"] == "NONE").all() diff --git a/policyengine_uk_data/tests/test_policybench_transfer.py b/policyengine_uk_data/tests/test_policybench_transfer.py index 2d5c10f6d..fb922f24a 100644 --- a/policyengine_uk_data/tests/test_policybench_transfer.py +++ b/policyengine_uk_data/tests/test_policybench_transfer.py @@ -5,8 +5,10 @@ import pandas as pd from policyengine_uk import CountryTaxBenefitSystem from policyengine_uk import Microsimulation +from policyengine_uk.data import UKSingleYearDataset from policyengine_uk_data.datasets import ( + ENHANCED_CPS_FILE, ENHANCED_CPS_SOURCE_FILE, create_enhanced_cps, ) @@ -14,10 +16,15 @@ from policyengine_uk_data.utils import reweight as reweight_module from policyengine_uk_data.utils.loss import get_loss_results -ALLOWED_REPORTED_DATA_INPUTS = { +ALLOWED_COMPATIBILITY_INPUTS = { # PE-UK uses this reported base field to derive basic/additional/new # state pension; it carries a formula only for year-to-year uprating. "state_pension_reported", + # These become leaf inputs in policyengine-uk#1656. Current released + # PE-UK versions still expose them as formula variables, but dataset + # loading accepts category columns and uses them as inputs. + "pip_dl_category", + "pip_m_category", } @@ -62,13 +69,17 @@ def test_policybench_transfer_writes_only_valid_leaf_inputs(tmp_path: Path): or system.variables[column].entity.key != entity or ( not system.variables[column].is_input_variable() - and column not in ALLOWED_REPORTED_DATA_INPUTS + and column not in ALLOWED_COMPATIBILITY_INPUTS ) ] assert invalid_columns == [] assert "household_wealth" not in dataset.household.columns assert "total_wealth" not in dataset.household.columns + assert "pip_dl_reported" not in dataset.person.columns + assert "pip_m_reported" not in dataset.person.columns + assert "pip_dl_category" in dataset.person.columns + assert "pip_m_category" in dataset.person.columns for column in ( "savings", "main_residence_value", @@ -80,6 +91,15 @@ def test_policybench_transfer_writes_only_valid_leaf_inputs(tmp_path: Path): assert column in dataset.household.columns +def test_checked_in_enhanced_cps_h5_uses_pip_categories(): + dataset = UKSingleYearDataset(file_path=str(ENHANCED_CPS_FILE)) + + assert "pip_dl_reported" not in dataset.person.columns + assert "pip_m_reported" not in dataset.person.columns + assert "pip_dl_category" in dataset.person.columns + assert "pip_m_category" in dataset.person.columns + + def test_policybench_transfer_runs_uk_microsimulation(tmp_path: Path): dataset = create_enhanced_cps( source_file_path=_subset_source(tmp_path, 10),