Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
added:
- Salary sacrifice imputation using FRS SALSAC routing question to impute ~30% employee participation per HMRC survey data.
8 changes: 7 additions & 1 deletion policyengine_uk_data/datasets/create_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def main():
"Impute public service usage",
"Impute income",
"Impute capital gains",
"Impute salary sacrifice",
"Uprate to 2025",
"Calibrate dataset",
"Downrate to 2023",
Expand Down Expand Up @@ -54,6 +55,7 @@ def main():
impute_income,
impute_capital_gains,
impute_services,
impute_salary_sacrifice,
)

# Apply imputations with progress tracking
Expand Down Expand Up @@ -81,6 +83,10 @@ def main():
frs = impute_capital_gains(frs)
update_dataset("Impute capital gains", "completed")

update_dataset("Impute salary sacrifice", "processing")
frs = impute_salary_sacrifice(frs)
update_dataset("Impute salary sacrifice", "completed")

# Uprate dataset
update_dataset("Uprate to 2025", "processing")
frs = uprate_dataset(frs, 2025)
Expand Down Expand Up @@ -137,7 +143,7 @@ def main():
details={
"base_dataset": "frs_2023_24.h5",
"enhanced_dataset": "enhanced_frs_2023_24.h5",
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains",
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice",
"calibration": "national and constituency targets",
},
)
Expand Down
61 changes: 57 additions & 4 deletions policyengine_uk_data/datasets/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,26 @@ def create_frs(
raise FileNotFoundError(f"Raw folder {raw_folder} does not exist.")

frs = {}
# Store SALSAC values before numeric conversion (for salary sacrifice
# imputation)
job_salsac_raw = None
for file in raw_folder.glob("*.tab"):
table_name = file.stem
# Read and make numeric where possible
df = pd.read_csv(file, sep="\t").apply(pd.to_numeric, errors="coerce")
# Read raw data first
df_raw = pd.read_csv(file, sep="\t")
df_raw.columns = df_raw.columns.str.lower()

# Standardise column names to lower case
df.columns = df.columns.str.lower()
# Preserve SALSAC column from job table before numeric conversion
# SALSAC indicates salary sacrifice participation:
# '1' = Yes, '2' = No, ' ' or blank = skip/not asked
if table_name == "job" and "salsac" in df_raw.columns:
job_salsac_raw = df_raw["salsac"].copy()

# Make numeric where possible
df = df_raw.apply(pd.to_numeric, errors="coerce")

# Standardise column names to lower case (already done above)
# df.columns = df.columns.str.lower()

# Edit ID variables for simplicity
if "sernum" in df.columns:
Expand Down Expand Up @@ -86,6 +99,10 @@ def create_frs(
oddjob = frs["oddjob"]
account = frs["accounts"]
job = frs["job"]
# Add raw SALSAC column to job table for salary sacrifice imputation
# SALSAC values: '1' = Yes (participates), '2' = No, ' '/blank = not asked
if job_salsac_raw is not None:
job["salsac_raw"] = job_salsac_raw.values
benefits = frs["benefits"]
maintenance = frs["maint"]
pen_prov = frs["penprov"]
Expand Down Expand Up @@ -646,6 +663,42 @@ def determine_education_level(fted_val, typeed2_val, age_val):
* WEEKS_IN_YEAR,
)

# Salary sacrifice participation indicator from SALSAC field
# Used for imputation: 1 = Yes, 0 = No, -1 = not asked (skip)
# This allows distinguishing between explicit No responses and
# respondents who were not asked the question (imputation candidates)
if "salsac_raw" in job.columns:
salsac_numeric = (
job["salsac_raw"]
.map({"1": 1, "2": 0, " ": -1})
.fillna(-1)
.astype(int)
)
# Aggregate to person level: take max (any job with SS = person has SS)
pe_person["salary_sacrifice_reported"] = np.clip(
sum_to_entity(
(salsac_numeric == 1).astype(int),
job.person_id,
person.person_id,
),
0,
1,
)
# Track if person was asked about SS in any job (for imputation)
pe_person["salary_sacrifice_asked"] = np.clip(
sum_to_entity(
(salsac_numeric >= 0).astype(int),
job.person_id,
person.person_id,
),
0,
1,
)
else:
# If SALSAC not available, mark all as not asked
pe_person["salary_sacrifice_reported"] = 0
pe_person["salary_sacrifice_asked"] = 0

pe_household["housing_service_charges"] = (
pd.DataFrame(
[
Expand Down
1 change: 1 addition & 0 deletions policyengine_uk_data/datasets/imputations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
from .income import *
from .capital_gains import *
from .services import impute_services
from .salary_sacrifice import impute_salary_sacrifice
3 changes: 1 addition & 2 deletions policyengine_uk_data/datasets/imputations/income.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,7 @@ def generate_spi_table(spi: pd.DataFrame):

spi = pd.concat(
[
spi.sample(20_000),
spi[spi.TI > 1_000_000],
spi.sample(100_000, weights=spi.person_weight),
]
)

Expand Down
224 changes: 224 additions & 0 deletions policyengine_uk_data/datasets/imputations/salary_sacrifice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
"""
Salary sacrifice imputation for pension contributions.

This module imputes salary sacrifice pension amounts using QRF trained on
FRS respondents who were asked the SALSAC question. The model predicts
the continuous amount (pension_contributions_via_salary_sacrifice), with
non-participants naturally having 0.

Training data (FRS 2023-24):
- SALSAC='1' (Yes): ~224 jobs with reported SPNAMT amounts
- SALSAC='2' (No): ~3,803 jobs with SPNAMT=0

Imputation candidates:
- SALSAC=' ' (skip/not asked): ~13,265 jobs

Targeting to HMRC totals (~£24bn of SS contributions) happens via weight
calibration, not in this imputation step.
"""

import pandas as pd
import numpy as np
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk import Microsimulation


# Columns fed to the model as inputs (the X side of the fit / predict calls).
PREDICTORS = ["age", "employment_income"]

# Columns the model is trained to output (the y side of the fit call).
IMPUTATIONS = ["pension_contributions_via_salary_sacrifice"]


def save_salary_sacrifice_model():
    """
    Fit the salary sacrifice QRF on the base FRS dataset and persist it.

    The training sample is every person whose ``salary_sacrifice_asked``
    flag equals 1. Their stored
    ``pension_contributions_via_salary_sacrifice`` value is the target and
    ``age`` / ``employment_income`` (as calculated by a Microsimulation
    over the same dataset) are the predictors. The fitted model is saved
    to ``salary_sacrifice.pkl`` inside ``STORAGE_FOLDER``.

    Returns:
        The fitted QRF model.

    Raises:
        FileNotFoundError: If ``frs_2023_24.h5`` is absent from
            ``STORAGE_FOLDER``.
        ValueError: If the dataset has no ``salary_sacrifice_asked``
            column, or if nobody in it has that flag set to 1.
    """
    from policyengine_uk_data.utils import QRF

    target = "pension_contributions_via_salary_sacrifice"

    # Locate the base FRS file; it must already have been generated.
    base_path = STORAGE_FOLDER / "frs_2023_24.h5"
    if not base_path.exists():
        raise FileNotFoundError(
            f"FRS dataset not found at {base_path}. "
            "Run create_frs() first to generate the base dataset."
        )

    frs = UKSingleYearDataset(base_path)
    simulation = Microsimulation(dataset=frs)

    # Predictors come from the simulation rather than the raw table.
    ages = simulation.calculate("age").values
    earnings = simulation.calculate("employment_income").values

    # Target amounts and the "was asked" flag come straight from the
    # person table.
    person = frs.person
    reported_amounts = person.pension_contributions_via_salary_sacrifice.values
    if "salary_sacrifice_asked" not in person.columns:
        raise ValueError(
            "Dataset missing salary_sacrifice_asked field. "
            "Ensure frs.py extracts SALSAC before numeric conversion."
        )
    was_asked = person.salary_sacrifice_asked.values == 1

    # Only people who were asked can be used for training; this keeps
    # both contributors (positive amounts) and non-contributors (zero).
    if was_asked.sum() == 0:
        raise ValueError(
            "No training data found - no respondents were asked SALSAC."
        )

    train_df = pd.DataFrame(
        {
            "age": ages[was_asked],
            "employment_income": earnings[was_asked],
            target: reported_amounts[was_asked],
        }
    )

    # Descriptive summary of the training sample.
    participating = train_df[target] > 0
    n_participants = participating.sum()
    n_observations = len(train_df)
    print(f"Training salary sacrifice model on {n_observations} observations")
    print(
        f" With SS contributions: {n_participants} "
        f"({n_participants / n_observations:.1%})"
    )
    mean_amount = train_df.loc[participating, target].mean()
    print(f" Mean SS amount (participants): £{mean_amount:,.0f}")

    # Fit and persist the model.
    model = QRF()
    model.fit(train_df[PREDICTORS], train_df[IMPUTATIONS])
    model.save(STORAGE_FOLDER / "salary_sacrifice.pkl")

    return model


def create_salary_sacrifice_model(overwrite_existing: bool = False):
    """
    Return the salary sacrifice QRF, training it only when necessary.

    A model file at ``STORAGE_FOLDER / "salary_sacrifice.pkl"`` is reused
    unless ``overwrite_existing`` is true; otherwise the model is
    (re)trained via ``save_salary_sacrifice_model``.

    Args:
        overwrite_existing: Retrain even if a saved model file exists.

    Returns:
        A QRF model, either loaded from disk or freshly trained.
    """
    from policyengine_uk_data.utils.qrf import QRF

    saved_model = STORAGE_FOLDER / "salary_sacrifice.pkl"

    # Retrain when asked to, or when there is nothing on disk to load.
    if overwrite_existing or not saved_model.exists():
        return save_salary_sacrifice_model()

    return QRF(file_path=saved_model)


def impute_salary_sacrifice(
    dataset: UKSingleYearDataset,
) -> UKSingleYearDataset:
    """
    Impute salary sacrifice pension amounts for people not asked SALSAC.

    People whose ``salary_sacrifice_asked`` flag equals 1 keep their
    existing ``pension_contributions_via_salary_sacrifice`` value. Everyone
    else receives the QRF prediction (floored at zero) from the model
    returned by ``create_salary_sacrifice_model``, using ``age`` and
    ``employment_income`` as predictors.

    Note: This imputation does NOT target any specific total. Targeting
    to HMRC figures happens via weight calibration in a subsequent step.

    Args:
        dataset: PolicyEngine UK dataset, expected to carry the
            ``salary_sacrifice_asked`` field from FRS processing.

    Returns:
        A copy of ``dataset`` with imputed salary sacrifice amounts. If
        ``salary_sacrifice_asked`` is missing, a warning is printed and
        the unmodified copy is returned.
    """
    dataset = dataset.copy()

    # Check for the routing indicator first: without it there is nothing
    # to impute, so avoid building a Microsimulation (and reading columns
    # that may not exist) just to return the dataset untouched.
    if "salary_sacrifice_asked" not in dataset.person.columns:
        print(
            "Warning: salary_sacrifice_asked not in dataset, "
            "skipping imputation"
        )
        return dataset

    sim = Microsimulation(dataset=dataset)

    # Variables needed for imputation
    age = sim.calculate("age").values
    employment_income = sim.calculate("employment_income").values
    current_ss = (
        dataset.person.pension_contributions_via_salary_sacrifice.values
    )
    ss_asked = dataset.person.salary_sacrifice_asked.values

    # Predict for all records; predictions for people who were asked are
    # discarded below.
    # NOTE(review): this includes people with zero employment income —
    # confirm whether non-employees should be forced to zero.
    pred_df = pd.DataFrame(
        {
            "age": age,
            "employment_income": employment_income,
        }
    )

    # Get or train model and predict
    model = create_salary_sacrifice_model()
    predictions = model.predict(pred_df)

    # The model predicts continuous values; floor them at zero.
    imputed_ss = np.maximum(
        0,
        predictions["pension_contributions_via_salary_sacrifice"].values,
    )

    # Reported values are kept exactly for those who were asked; everyone
    # else takes the imputed amount.
    final_ss = np.where(ss_asked == 1, current_ss, imputed_ss)

    # Update dataset
    dataset.person["pension_contributions_via_salary_sacrifice"] = final_ss

    # Report results (no targeting - just descriptive)
    weights = sim.calculate("person_weight").values
    is_employee = employment_income > 0
    total_ss = (final_ss * weights).sum()
    participation_rate = ((final_ss > 0) * weights * is_employee).sum() / (
        weights * is_employee
    ).sum()

    print("Salary sacrifice imputation results (pre-calibration):")
    print(f" Total SS contributions: £{total_ss / 1e9:.1f}bn")
    print(f" Employee participation rate: {participation_rate:.1%}")
    print(" (Final totals depend on subsequent weight calibration)")

    return dataset