Skip to content

Commit 3245377

Browse files
Merge pull request #220 from PolicyEngine/impute-salary-sacrifice
Add salary sacrifice imputation to dataset pipeline
2 parents dcf8ee7 + d2ac11b commit 3245377

6 files changed

Lines changed: 294 additions & 7 deletions

File tree

changelog_entry.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- bump: minor
2+
changes:
3+
added:
4+
- Salary sacrifice imputation using FRS SALSAC routing question to impute ~30% employee participation per HMRC survey data.

policyengine_uk_data/datasets/create_datasets.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def main():
2626
"Impute public service usage",
2727
"Impute income",
2828
"Impute capital gains",
29+
"Impute salary sacrifice",
2930
"Uprate to 2025",
3031
"Calibrate dataset",
3132
"Downrate to 2023",
@@ -54,6 +55,7 @@ def main():
5455
impute_income,
5556
impute_capital_gains,
5657
impute_services,
58+
impute_salary_sacrifice,
5759
)
5860

5961
# Apply imputations with progress tracking
@@ -81,6 +83,10 @@ def main():
8183
frs = impute_capital_gains(frs)
8284
update_dataset("Impute capital gains", "completed")
8385

86+
update_dataset("Impute salary sacrifice", "processing")
87+
frs = impute_salary_sacrifice(frs)
88+
update_dataset("Impute salary sacrifice", "completed")
89+
8490
# Uprate dataset
8591
update_dataset("Uprate to 2025", "processing")
8692
frs = uprate_dataset(frs, 2025)
@@ -137,7 +143,7 @@ def main():
137143
details={
138144
"base_dataset": "frs_2023_24.h5",
139145
"enhanced_dataset": "enhanced_frs_2023_24.h5",
140-
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains",
146+
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice",
141147
"calibration": "national and constituency targets",
142148
},
143149
)

policyengine_uk_data/datasets/frs.py

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,26 @@ def create_frs(
4545
raise FileNotFoundError(f"Raw folder {raw_folder} does not exist.")
4646

4747
frs = {}
48+
# Store SALSAC values before numeric conversion (for salary sacrifice
49+
# imputation)
50+
job_salsac_raw = None
4851
for file in raw_folder.glob("*.tab"):
4952
table_name = file.stem
50-
# Read and make numeric where possible
51-
df = pd.read_csv(file, sep="\t").apply(pd.to_numeric, errors="coerce")
53+
# Read raw data first
54+
df_raw = pd.read_csv(file, sep="\t")
55+
df_raw.columns = df_raw.columns.str.lower()
5256

53-
# Standardise column names to lower case
54-
df.columns = df.columns.str.lower()
57+
# Preserve SALSAC column from job table before numeric conversion
58+
# SALSAC indicates salary sacrifice participation:
59+
# '1' = Yes, '2' = No, ' ' or blank = skip/not asked
60+
if table_name == "job" and "salsac" in df_raw.columns:
61+
job_salsac_raw = df_raw["salsac"].copy()
62+
63+
# Make numeric where possible
64+
df = df_raw.apply(pd.to_numeric, errors="coerce")
65+
66+
# Standardise column names to lower case (already done above)
67+
# df.columns = df.columns.str.lower()
5568

5669
# Edit ID variables for simplicity
5770
if "sernum" in df.columns:
@@ -86,6 +99,10 @@ def create_frs(
8699
oddjob = frs["oddjob"]
87100
account = frs["accounts"]
88101
job = frs["job"]
102+
# Add raw SALSAC column to job table for salary sacrifice imputation
103+
# SALSAC values: '1' = Yes (participates), '2' = No, ' '/blank = not asked
104+
if job_salsac_raw is not None:
105+
job["salsac_raw"] = job_salsac_raw.values
89106
benefits = frs["benefits"]
90107
maintenance = frs["maint"]
91108
pen_prov = frs["penprov"]
@@ -646,6 +663,42 @@ def determine_education_level(fted_val, typeed2_val, age_val):
646663
* WEEKS_IN_YEAR,
647664
)
648665

666+
# Salary sacrifice participation indicator from SALSAC field
667+
# Used for imputation: 1 = Yes, 0 = No, -1 = not asked (skip)
668+
# This allows distinguishing between explicit No responses and
669+
# respondents who were not asked the question (imputation candidates)
670+
if "salsac_raw" in job.columns:
671+
salsac_numeric = (
672+
job["salsac_raw"]
673+
.map({"1": 1, "2": 0, " ": -1})
674+
.fillna(-1)
675+
.astype(int)
676+
)
677+
# Aggregate to person level: take max (any job with SS = person has SS)
678+
pe_person["salary_sacrifice_reported"] = np.clip(
679+
sum_to_entity(
680+
(salsac_numeric == 1).astype(int),
681+
job.person_id,
682+
person.person_id,
683+
),
684+
0,
685+
1,
686+
)
687+
# Track if person was asked about SS in any job (for imputation)
688+
pe_person["salary_sacrifice_asked"] = np.clip(
689+
sum_to_entity(
690+
(salsac_numeric >= 0).astype(int),
691+
job.person_id,
692+
person.person_id,
693+
),
694+
0,
695+
1,
696+
)
697+
else:
698+
# If SALSAC not available, mark all as not asked
699+
pe_person["salary_sacrifice_reported"] = 0
700+
pe_person["salary_sacrifice_asked"] = 0
701+
649702
pe_household["housing_service_charges"] = (
650703
pd.DataFrame(
651704
[

policyengine_uk_data/datasets/imputations/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@
44
from .income import *
55
from .capital_gains import *
66
from .services import impute_services
7+
from .salary_sacrifice import impute_salary_sacrifice

policyengine_uk_data/datasets/imputations/income.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,7 @@ def generate_spi_table(spi: pd.DataFrame):
8181

8282
spi = pd.concat(
8383
[
84-
spi.sample(20_000),
85-
spi[spi.TI > 1_000_000],
84+
spi.sample(100_000, weights=spi.person_weight),
8685
]
8786
)
8887

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
"""
2+
Salary sacrifice imputation for pension contributions.
3+
4+
This module imputes salary sacrifice pension amounts using QRF trained on
5+
FRS respondents who were asked the SALSAC question. The model predicts
6+
the continuous amount (pension_contributions_via_salary_sacrifice), with
7+
non-participants naturally having 0.
8+
9+
Training data (FRS 2023-24):
10+
- SALSAC='1' (Yes): ~224 jobs with reported SPNAMT amounts
11+
- SALSAC='2' (No): ~3,803 jobs with SPNAMT=0
12+
13+
Imputation candidates:
14+
- SALSAC=' ' (skip/not asked): ~13,265 jobs
15+
16+
Targeting to HMRC totals (~24bn SS contributions) happens via weight
17+
calibration, not in this imputation step.
18+
"""
19+
20+
import pandas as pd
21+
import numpy as np
22+
from policyengine_uk_data.storage import STORAGE_FOLDER
23+
from policyengine_uk.data import UKSingleYearDataset
24+
from policyengine_uk import Microsimulation
25+
26+
27+
PREDICTORS = [
28+
"age",
29+
"employment_income",
30+
]
31+
32+
IMPUTATIONS = [
33+
"pension_contributions_via_salary_sacrifice",
34+
]
35+
36+
37+
def save_salary_sacrifice_model():
    """
    Fit the salary sacrifice QRF on the base FRS dataset and persist it.

    Only people whose ``salary_sacrifice_asked`` flag equals 1 are used for
    training. The target is the stored
    ``pension_contributions_via_salary_sacrifice`` value, so the training
    set contains both zero and positive amounts. The predictors are the
    columns listed in ``PREDICTORS``.

    Returns:
        The fitted QRF model, which is also written to
        ``salary_sacrifice.pkl`` inside ``STORAGE_FOLDER``.

    Raises:
        FileNotFoundError: If ``frs_2023_24.h5`` is not in
            ``STORAGE_FOLDER``.
        ValueError: If the person table has no ``salary_sacrifice_asked``
            column, or if nobody in it has the flag set.
    """
    from policyengine_uk_data.utils import QRF

    target = "pension_contributions_via_salary_sacrifice"

    # The base FRS file must already have been generated.
    frs_path = STORAGE_FOLDER / "frs_2023_24.h5"
    if not frs_path.exists():
        raise FileNotFoundError(
            f"FRS dataset not found at {frs_path}. "
            "Run create_frs() first to generate the base dataset."
        )

    dataset = UKSingleYearDataset(frs_path)
    sim = Microsimulation(dataset=dataset)

    # Predictors come from the simulation; the target and the "asked" flag
    # are read straight from the stored person table.
    ages = sim.calculate("age").values
    earnings = sim.calculate("employment_income").values
    reported_amounts = (
        dataset.person.pension_contributions_via_salary_sacrifice.values
    )

    if "salary_sacrifice_asked" not in dataset.person.columns:
        raise ValueError(
            "Dataset missing salary_sacrifice_asked field. "
            "Ensure frs.py extracts SALSAC before numeric conversion."
        )

    # Restrict training to people who were actually asked the question.
    was_asked = dataset.person.salary_sacrifice_asked.values == 1
    if not was_asked.any():
        raise ValueError(
            "No training data found - no respondents were asked SALSAC."
        )

    train_df = pd.DataFrame(
        {
            "age": ages[was_asked],
            "employment_income": earnings[was_asked],
            "pension_contributions_via_salary_sacrifice": reported_amounts[
                was_asked
            ],
        }
    )

    # Descriptive summary of the training sample.
    has_contribution = train_df[target] > 0
    n_participants = has_contribution.sum()
    participant_share = n_participants / len(train_df)
    print(f"Training salary sacrifice model on {len(train_df)} observations")
    print(
        f" With SS contributions: {n_participants} "
        f"({participant_share:.1%})"
    )
    mean_amount = train_df.loc[has_contribution, target].mean()
    print(f" Mean SS amount (participants): £{mean_amount:,.0f}")

    # Fit and persist the model.
    model = QRF()
    model.fit(train_df[PREDICTORS], train_df[IMPUTATIONS])
    model.save(STORAGE_FOLDER / "salary_sacrifice.pkl")

    return model
116+
117+
118+
def create_salary_sacrifice_model(overwrite_existing: bool = False):
    """
    Return the salary sacrifice QRF, training it only when necessary.

    A model previously saved as ``salary_sacrifice.pkl`` in
    ``STORAGE_FOLDER`` is loaded from disk unless a retrain is requested.

    Args:
        overwrite_existing: When true, retrain even if a saved model file
            is already present.

    Returns:
        A QRF model: either loaded from the saved file or freshly trained
        by ``save_salary_sacrifice_model``.
    """
    from policyengine_uk_data.utils.qrf import QRF

    saved_model = STORAGE_FOLDER / "salary_sacrifice.pkl"

    # Retrain when explicitly requested or when nothing has been saved yet.
    if overwrite_existing or not saved_model.exists():
        return save_salary_sacrifice_model()

    return QRF(file_path=saved_model)
134+
135+
136+
def impute_salary_sacrifice(
    dataset: UKSingleYearDataset,
) -> UKSingleYearDataset:
    """
    Impute salary sacrifice pension amounts for people not asked SALSAC.

    Works on a copy of ``dataset``. Predictions are generated for every
    person from ``age`` and ``employment_income`` and floored at zero.
    People whose ``salary_sacrifice_asked`` flag equals 1 keep their stored
    ``pension_contributions_via_salary_sacrifice`` value; everyone else
    receives the prediction.

    Note: This imputation does NOT target any specific total. Targeting
    to HMRC figures happens via weight calibration in a subsequent step.

    Args:
        dataset: PolicyEngine UK dataset whose person table carries the
            ``salary_sacrifice_asked`` field from FRS processing.

    Returns:
        A copy of the dataset with imputed salary sacrifice amounts. If
        ``salary_sacrifice_asked`` is absent, a warning is printed and the
        copy is returned unchanged.
    """
    dataset = dataset.copy()

    # Check the cheap precondition before building a Microsimulation, so
    # the skip path does no expensive work.
    if "salary_sacrifice_asked" not in dataset.person.columns:
        print(
            "Warning: salary_sacrifice_asked not in dataset, "
            "skipping imputation"
        )
        return dataset

    sim = Microsimulation(dataset=dataset)

    # Get variables needed for imputation
    age = sim.calculate("age").values
    employment_income = sim.calculate("employment_income").values
    current_ss = (
        dataset.person.pension_contributions_via_salary_sacrifice.values
    )
    ss_asked = dataset.person.salary_sacrifice_asked.values

    # Predict for all records; the selection below decides who actually
    # receives an imputed value.
    pred_df = pd.DataFrame(
        {
            "age": age,
            "employment_income": employment_income,
        }
    )

    # Get or train model and predict
    model = create_salary_sacrifice_model()
    predictions = model.predict(pred_df)

    # QRF predicts continuous values; floor them at zero.
    imputed_ss = np.maximum(
        0,
        predictions["pension_contributions_via_salary_sacrifice"].values,
    )

    # Keep reported values exactly for those who were asked; use the
    # imputed values for everyone else.
    # NOTE(review): people with zero employment_income who were not asked
    # also receive the model's prediction here - confirm this is intended.
    final_ss = np.where(ss_asked == 1, current_ss, imputed_ss)

    # Update dataset
    dataset.person["pension_contributions_via_salary_sacrifice"] = final_ss

    # Report results (no targeting - just descriptive)
    weights = sim.calculate("person_weight").values
    is_employee = employment_income > 0
    total_ss = (final_ss * weights).sum()
    employee_weight = (weights * is_employee).sum()
    if employee_weight > 0:
        participation_rate = (
            (final_ss > 0) * weights * is_employee
        ).sum() / employee_weight
    else:
        # No weighted employees: the rate is undefined. Report nan
        # explicitly rather than dividing by zero.
        participation_rate = float("nan")

    print("Salary sacrifice imputation results (pre-calibration):")
    print(f" Total SS contributions: £{total_ss / 1e9:.1f}bn")
    print(f" Employee participation rate: {participation_rate:.1%}")
    print(" (Final totals depend on subsequent weight calibration)")

    return dataset

0 commit comments

Comments
 (0)