Skip to content
This repository was archived by the owner on Jun 19, 2026. It is now read-only.

Commit bdd69a5

Browse files
authored
Map disability benefit categories in data (#383)
1 parent 604621c commit bdd69a5

13 files changed

Lines changed: 454 additions & 75 deletions
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Map reported disability benefit amounts to category inputs in the data pipeline.

policyengine_uk_data/datasets/create_datasets.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ def main():
3030
assert_local_build_environment()
3131

3232
from policyengine_uk.data import UKSingleYearDataset
33+
from policyengine_uk_data.datasets.disability_benefits import (
34+
strip_internal_disability_reported_amounts,
35+
)
3336
from policyengine_uk_data.datasets.frs import create_frs
3437
from policyengine_uk_data.storage import STORAGE_FOLDER
3538
from policyengine_uk_data.utils.progress import (
@@ -79,8 +82,11 @@ def main():
7982
frs = create_frs(
8083
raw_frs_folder=STORAGE_FOLDER / "frs_2023_24",
8184
year=2023,
85+
include_internal_disability_reported_amounts=True,
86+
)
87+
strip_internal_disability_reported_amounts(frs).save(
88+
STORAGE_FOLDER / "frs_2023_24.h5"
8289
)
83-
frs.save(STORAGE_FOLDER / "frs_2023_24.h5")
8490
update_dataset("Create base FRS dataset", "completed")
8591

8692
# Import imputation functions
@@ -212,7 +218,9 @@ def main():
212218
update_dataset("Downrate to 2023", "completed")
213219

214220
update_dataset("Save final dataset", "processing")
215-
frs_calibrated.save(STORAGE_FOLDER / "enhanced_frs_2023_24.h5")
221+
strip_internal_disability_reported_amounts(frs_calibrated).save(
222+
STORAGE_FOLDER / "enhanced_frs_2023_24.h5"
223+
)
216224
update_dataset("Save final dataset", "completed")
217225

218226
# Create tiny (n=1000 households) versions for testing
@@ -225,7 +233,10 @@ def main():
225233
tiny_frs = subsample_dataset(frs_base, TINY_SIZE)
226234
tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5")
227235

228-
tiny_enhanced = subsample_dataset(frs_calibrated, TINY_SIZE)
236+
tiny_enhanced = subsample_dataset(
237+
strip_internal_disability_reported_amounts(frs_calibrated),
238+
TINY_SIZE,
239+
)
229240
tiny_enhanced.save(STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5")
230241
update_dataset("Create tiny datasets", "completed")
231242

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""Dataset-side disability benefit category mapping.
2+
3+
PolicyEngine UK models PIP, DLA, and Attendance Allowance from category
4+
inputs. The FRS observes reported amounts, so the data pipeline keeps those
5+
amounts as internal build intermediates and converts them to model inputs
6+
before datasets are published.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
from functools import lru_cache
12+
13+
import numpy as np
14+
import pandas as pd
15+
from policyengine_uk import CountryTaxBenefitSystem
16+
from policyengine_uk.data import UKSingleYearDataset
17+
from policyengine_uk.model_api import WEEKS_IN_YEAR as MODEL_WEEKS_IN_YEAR
18+
19+
20+
# Reported-amount columns kept only as internal build intermediates; these
# are the columns removed by drop_internal_disability_reported_amounts.
DISABILITY_REPORTED_AMOUNT_COLUMNS = (
    "attendance_allowance_reported",
    "dla_sc_reported",
    "dla_m_reported",
    "pip_m_reported",
    "pip_dl_reported",
)

# Category input columns written by
# add_disability_benefit_categories_from_reported_amounts.
DISABILITY_CATEGORY_COLUMNS = (
    "aa_category",
    "dla_sc_category",
    "dla_m_category",
    "pip_m_category",
    "pip_dl_category",
)

# Fractional tolerance applied below each weekly rate when assigning a
# category: an amount qualifies from rate * (1 - SAFETY_MARGIN) upwards.
SAFETY_MARGIN = 0.1

# Weeks-per-year factor used when deriving the disability flag thresholds.
SURVEY_REPORTED_AMOUNT_WEEKS_IN_YEAR = 365.25 / 7
38+
39+
40+
@lru_cache(maxsize=None)
def _dwp_category_threshold_parameters(year: int):
    """Return the ``baseline.gov.dwp`` parameter node evaluated at ``year``.

    The result is memoised per ``year`` so a tax-benefit system is only
    constructed once for each distinct year requested.
    """
    # The category formulas removed from policyengine-uk thresholded reported
    # amounts against the baseline DWP rates; read the same node to match.
    system = CountryTaxBenefitSystem()
    parameters_at_year = system.parameters(year)
    return parameters_at_year.baseline.gov.dwp
45+
46+
47+
@lru_cache(maxsize=None)
def _dwp_flag_parameters(year: int):
    """Return the ``gov.dwp`` parameter node evaluated at ``year``.

    The result is memoised per ``year`` so a tax-benefit system is only
    constructed once for each distinct year requested.
    """
    # Mirrors the FRS disability flag derivation that already lived in
    # uk-data, which read gov.dwp directly rather than the baseline branch.
    system = CountryTaxBenefitSystem()
    parameters_at_year = system.parameters(year)
    return parameters_at_year.gov.dwp
51+
52+
53+
def _reported_amount(person: pd.DataFrame, column: str) -> pd.Series:
    """Return ``person[column]`` as numbers, with gaps replaced by zero.

    Values that cannot be parsed as numbers, and missing values, become 0.0.
    When ``column`` is absent from ``person`` an all-zero series aligned to
    ``person.index`` is returned instead.
    """
    if column in person.columns:
        numeric = pd.to_numeric(person[column], errors="coerce")
        return numeric.fillna(0.0)
    return pd.Series(0.0, index=person.index)
57+
58+
59+
def _category_from_reported_amount(
    reported_amount: pd.Series,
    thresholds: tuple[tuple[str, float], ...],
) -> np.ndarray:
    """Map reported amounts to category labels.

    Args:
        reported_amount: Reported amounts; unparseable or missing entries are
            treated as zero. The values are divided by
            ``MODEL_WEEKS_IN_YEAR`` before comparison.
        thresholds: ``(category_name, weekly_rate)`` pairs. They are applied
            in the order given and a later match overwrites an earlier one,
            so list them from the lowest rate to the highest.

    Returns:
        An object array of labels, one per entry, defaulting to ``"NONE"``.
    """
    numeric_amount = pd.to_numeric(reported_amount, errors="coerce").fillna(0)
    weekly = numeric_amount.to_numpy(dtype=float) / MODEL_WEEKS_IN_YEAR
    labels = np.full(len(weekly), "NONE", dtype=object)

    # Accept amounts slightly below the rate to tolerate under-reporting.
    tolerance = 1 - SAFETY_MARGIN
    for label, rate in thresholds:
        qualifies = weekly >= float(rate) * tolerance
        labels[qualifies] = label
    return labels
71+
72+
73+
def add_disability_benefit_categories_from_reported_amounts(
    person: pd.DataFrame,
    year: int,
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Derive disability benefit category columns from reported amounts.

    Args:
        person: Person-level table. Only the reported-amount columns that are
            actually present are converted; the others are skipped.
        year: Year whose DWP rates supply the category thresholds.
        inplace: When true, write the category columns onto ``person``
            itself; otherwise work on a copy.

    Returns:
        The table carrying the category columns (``person`` itself when
        ``inplace`` is true).
    """
    table = person if inplace else person.copy()

    dwp = _dwp_category_threshold_parameters(int(year))

    # reported column -> (category column, thresholds in ascending rate order)
    conversions = {
        "attendance_allowance_reported": (
            "aa_category",
            (
                ("LOWER", dwp.attendance_allowance.lower),
                ("HIGHER", dwp.attendance_allowance.higher),
            ),
        ),
        "dla_sc_reported": (
            "dla_sc_category",
            (
                ("LOWER", dwp.dla.self_care.lower),
                ("MIDDLE", dwp.dla.self_care.middle),
                ("HIGHER", dwp.dla.self_care.higher),
            ),
        ),
        "dla_m_reported": (
            "dla_m_category",
            (
                ("LOWER", dwp.dla.mobility.lower),
                ("HIGHER", dwp.dla.mobility.higher),
            ),
        ),
        "pip_m_reported": (
            "pip_m_category",
            (
                ("STANDARD", dwp.pip.mobility.standard),
                ("ENHANCED", dwp.pip.mobility.enhanced),
            ),
        ),
        "pip_dl_reported": (
            "pip_dl_category",
            (
                ("STANDARD", dwp.pip.daily_living.standard),
                ("ENHANCED", dwp.pip.daily_living.enhanced),
            ),
        ),
    }

    for source_column, (target_column, rate_bands) in conversions.items():
        if source_column not in table.columns:
            continue
        table[target_column] = _category_from_reported_amount(
            table[source_column],
            rate_bands,
        )

    return table
137+
138+
139+
def add_disability_benefit_flags_from_reported_amounts(
    person: pd.DataFrame,
    year: int,
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Set the three disability flag columns from reported benefit amounts.

    Writes ``is_disabled_for_benefits``, ``is_enhanced_disabled_for_benefits``
    and ``is_severely_disabled_for_benefits``. Reported-amount columns that
    are absent are treated as zero.

    Args:
        person: Person-level table.
        year: Year whose DWP rates supply the flag thresholds.
        inplace: When true, write the flags onto ``person`` itself; otherwise
            work on a copy.

    Returns:
        The table carrying the flag columns (``person`` itself when
        ``inplace`` is true).
    """
    table = person if inplace else person.copy()

    dwp = _dwp_flag_parameters(int(year))
    amounts = {
        name: _reported_amount(table, f"{name}_reported")
        for name in ("dla_sc", "dla_m", "pip_m", "pip_dl", "afcs")
    }

    # Any DLA or PIP component reported counts as disabled.
    any_dla_or_pip = (
        amounts["dla_sc"]
        + amounts["dla_m"]
        + amounts["pip_m"]
        + amounts["pip_dl"]
    )
    table["is_disabled_for_benefits"] = any_dla_or_pip > 0

    weeks = SURVEY_REPORTED_AMOUNT_WEEKS_IN_YEAR
    safety_gap = 1 * weeks

    def _cutoff(weekly_rate):
        # Weekly rate scaled by the weeks factor, lowered by the safety gap.
        return weekly_rate * weeks - safety_gap

    dla_sc_cutoff = _cutoff(dwp.dla.self_care.higher)
    pip_dl_cutoff = _cutoff(dwp.pip.daily_living.enhanced)

    # NOTE(review): strict ">" here but ">=" for the severe flag below; kept
    # exactly as found -- confirm the asymmetry is intended.
    table["is_enhanced_disabled_for_benefits"] = (
        amounts["dla_sc"] > dla_sc_cutoff
    )
    table["is_severely_disabled_for_benefits"] = (
        (amounts["dla_sc"] >= dla_sc_cutoff)
        | (amounts["pip_dl"] >= pip_dl_cutoff)
        | (amounts["afcs"] > 0)
    )

    return table
175+
176+
177+
def drop_internal_disability_reported_amounts(
    person: pd.DataFrame,
    *,
    inplace: bool = False,
) -> pd.DataFrame:
    """Remove the internal reported-amount columns from ``person``.

    Columns listed in ``DISABILITY_REPORTED_AMOUNT_COLUMNS`` are dropped;
    any that are not present are silently ignored.

    Args:
        person: Person-level table.
        inplace: When true, drop the columns from ``person`` itself and
            return it; otherwise return a new table and leave ``person``
            untouched.

    Returns:
        The table without the internal reported-amount columns.
    """
    internal_columns = list(DISABILITY_REPORTED_AMOUNT_COLUMNS)
    if not inplace:
        return person.drop(columns=internal_columns, errors="ignore")
    person.drop(columns=internal_columns, errors="ignore", inplace=True)
    return person
195+
196+
197+
def strip_internal_disability_reported_amounts(
    dataset: UKSingleYearDataset,
) -> UKSingleYearDataset:
    """Return a copy of ``dataset`` without the internal reported amounts.

    The copy's ``person`` table is replaced by one with the internal
    disability reported-amount columns dropped; the function does not
    reassign anything on the ``dataset`` that was passed in.
    """
    stripped = dataset.copy()
    stripped.person = drop_internal_disability_reported_amounts(
        stripped.person
    )
    return stripped

policyengine_uk_data/datasets/enhanced_cps.py

Lines changed: 5 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -35,26 +35,13 @@
3535
"yearly-average-currency-exchange-rates"
3636
)
3737

38-
# 2025/26 reported-benefit mapping assumptions used only to populate UK input
39-
# leaves from U.S. source records. PolicyEngine UK applies its own parameters
40-
# when calculating derived tax and benefit outputs.
38+
# 2025/26 benefit mapping assumptions used only to populate UK input leaves from
39+
# U.S. source records. PolicyEngine UK applies its own parameters when
40+
# calculating derived tax and benefit outputs.
4141
NEW_STATE_PENSION_2025 = 224.96 * 52
4242
DIVIDEND_YIELD_FOR_WEALTH_IMPUTATION = 0.03
4343
RENTAL_YIELD_FOR_WEALTH_IMPUTATION = 0.04
4444

45-
PIP_2025_WEEKLY_RATES = {
46-
"daily_living": {
47-
"NONE": 0.0,
48-
"STANDARD": 73.89,
49-
"ENHANCED": 110.40,
50-
},
51-
"mobility": {
52-
"NONE": 0.0,
53-
"STANDARD": 29.19,
54-
"ENHANCED": 77.04,
55-
},
56-
}
57-
5845
REGION_SHARES = (
5946
("NORTH_EAST", 0.04),
6047
("NORTH_WEST", 0.11),
@@ -248,11 +235,6 @@ def _pip_category(person: dict) -> str:
248235
return "ENHANCED" if severe_signal or low_earnings else "STANDARD"
249236

250237

251-
def _pip_reported_amount(category: str, component: str) -> float:
252-
weekly = PIP_2025_WEEKLY_RATES[component][category]
253-
return round(weekly * 52, 2)
254-
255-
256238
def _household_cash_income(people: list[dict], exchange_rate: float) -> float:
257239
total = 0.0
258240
for person in people:
@@ -688,14 +670,8 @@ def _build_base_dataset(
688670
if bool(inputs.get("is_blind", False))
689671
else 0.0,
690672
"is_disabled_for_benefits": bool(inputs.get("is_disabled", False)),
691-
"pip_dl_reported": _pip_reported_amount(
692-
pip_category,
693-
"daily_living",
694-
),
695-
"pip_m_reported": _pip_reported_amount(
696-
pip_category,
697-
"mobility",
698-
),
673+
"pip_dl_category": pip_category,
674+
"pip_m_category": pip_category,
699675
"hours_worked": float(
700676
inputs.get(
701677
"weekly_hours_worked",

0 commit comments

Comments
 (0)