Merge pull request #220 from PolicyEngine/impute-salary-sacrifice

nikhilwoodruff · web-flow · commit 3245377c0160 · 2025-11-26T22:46:23.000Z
Add salary sacrifice imputation to dataset pipeline
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: minor
+  changes:
+    added:
+      - Salary sacrifice imputation using FRS SALSAC routing question to impute ~30% employee participation per HMRC survey data.
diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py
@@ -26,6 +26,7 @@ def main():
             "Impute public service usage",
             "Impute income",
             "Impute capital gains",
+            "Impute salary sacrifice",
             "Uprate to 2025",
             "Calibrate dataset",
             "Downrate to 2023",
@@ -54,6 +55,7 @@ def main():
                 impute_income,
                 impute_capital_gains,
                 impute_services,
+                impute_salary_sacrifice,
             )
 
             # Apply imputations with progress tracking
@@ -81,6 +83,10 @@ def main():
             frs = impute_capital_gains(frs)
             update_dataset("Impute capital gains", "completed")
 
+            update_dataset("Impute salary sacrifice", "processing")
+            frs = impute_salary_sacrifice(frs)
+            update_dataset("Impute salary sacrifice", "completed")
+
             # Uprate dataset
             update_dataset("Uprate to 2025", "processing")
             frs = uprate_dataset(frs, 2025)
@@ -137,7 +143,7 @@ def main():
             details={
                 "base_dataset": "frs_2023_24.h5",
                 "enhanced_dataset": "enhanced_frs_2023_24.h5",
-                "imputations_applied": "consumption, wealth, VAT, services, income, capital_gains",
+                "imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice",
                 "calibration": "national and constituency targets",
             },
         )
diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py
@@ -45,13 +45,26 @@ def create_frs(
         raise FileNotFoundError(f"Raw folder {raw_folder} does not exist.")
 
     frs = {}
+    # Store SALSAC values before numeric conversion (for salary sacrifice
+    # imputation)
+    job_salsac_raw = None
     for file in raw_folder.glob("*.tab"):
         table_name = file.stem
-        # Read and make numeric where possible
-        df = pd.read_csv(file, sep="\t").apply(pd.to_numeric, errors="coerce")
+        # Read raw data first
+        df_raw = pd.read_csv(file, sep="\t")
+        df_raw.columns = df_raw.columns.str.lower()
 
-        # Standardise column names to lower case
-        df.columns = df.columns.str.lower()
+        # Preserve SALSAC column from job table before numeric conversion
+        # SALSAC indicates salary sacrifice participation:
+        # '1' = Yes, '2' = No, ' ' or blank = skip/not asked
+        if table_name == "job" and "salsac" in df_raw.columns:
+            job_salsac_raw = df_raw["salsac"].copy()
+
+        # Make numeric where possible
+        df = df_raw.apply(pd.to_numeric, errors="coerce")
+
+        # Column names are already lower case here: they were standardised
+        # on df_raw before the numeric conversion.
 
         # Edit ID variables for simplicity
         if "sernum" in df.columns:
@@ -86,6 +99,10 @@ def create_frs(
     oddjob = frs["oddjob"]
     account = frs["accounts"]
     job = frs["job"]
+    # Add raw SALSAC column to job table for salary sacrifice imputation
+    # SALSAC values: '1' = Yes (participates), '2' = No, ' '/blank = not asked
+    if job_salsac_raw is not None:
+        job["salsac_raw"] = job_salsac_raw.values
     benefits = frs["benefits"]
     maintenance = frs["maint"]
     pen_prov = frs["penprov"]
@@ -646,6 +663,42 @@ def determine_education_level(fted_val, typeed2_val, age_val):
         * WEEKS_IN_YEAR,
     )
 
+    # Salary sacrifice participation indicator from SALSAC field
+    # Used for imputation: 1 = Yes, 0 = No, -1 = not asked (skip)
+    # This allows distinguishing between explicit No responses and
+    # respondents who were not asked the question (imputation candidates)
+    if "salsac_raw" in job.columns:
+        salsac_numeric = (
+            job["salsac_raw"]
+            .map({"1": 1, "2": 0, " ": -1})
+            .fillna(-1)
+            .astype(int)
+        )
+        # Aggregate to person level: sum over jobs, clipped to 0/1 (any SS job = 1)
+        pe_person["salary_sacrifice_reported"] = np.clip(
+            sum_to_entity(
+                (salsac_numeric == 1).astype(int),
+                job.person_id,
+                person.person_id,
+            ),
+            0,
+            1,
+        )
+        # Track if person was asked about SS in any job (for imputation)
+        pe_person["salary_sacrifice_asked"] = np.clip(
+            sum_to_entity(
+                (salsac_numeric >= 0).astype(int),
+                job.person_id,
+                person.person_id,
+            ),
+            0,
+            1,
+        )
+    else:
+        # If SALSAC not available, mark all as not asked
+        pe_person["salary_sacrifice_reported"] = 0
+        pe_person["salary_sacrifice_asked"] = 0
+
     pe_household["housing_service_charges"] = (
         pd.DataFrame(
             [
diff --git a/policyengine_uk_data/datasets/imputations/__init__.py b/policyengine_uk_data/datasets/imputations/__init__.py
@@ -4,3 +4,4 @@
 from .income import *
 from .capital_gains import *
 from .services import impute_services
+from .salary_sacrifice import impute_salary_sacrifice
diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py
@@ -81,8 +81,7 @@ def generate_spi_table(spi: pd.DataFrame):
 
     spi = pd.concat(
         [
-            spi.sample(20_000),
-            spi[spi.TI > 1_000_000],
+            spi.sample(100_000, weights=spi.person_weight),
         ]
     )
 
diff --git a/policyengine_uk_data/datasets/imputations/salary_sacrifice.py b/policyengine_uk_data/datasets/imputations/salary_sacrifice.py
@@ -0,0 +1,224 @@
+"""
+Salary sacrifice imputation for pension contributions.
+
+This module imputes salary sacrifice pension amounts using QRF trained on
+FRS respondents who were asked the SALSAC question. The model predicts
+the continuous amount (pension_contributions_via_salary_sacrifice), with
+non-participants naturally having 0.
+
+Training data (FRS 2023-24):
+- SALSAC='1' (Yes): ~224 jobs with reported SPNAMT amounts
+- SALSAC='2' (No): ~3,803 jobs with SPNAMT=0
+
+Imputation candidates:
+- SALSAC=' ' (skip/not asked): ~13,265 jobs
+
+Targeting to HMRC totals (~£24bn SS contributions) happens via weight
+calibration, not in this imputation step.
+"""
+
+import pandas as pd
+import numpy as np
+from policyengine_uk_data.storage import STORAGE_FOLDER
+from policyengine_uk.data import UKSingleYearDataset
+from policyengine_uk import Microsimulation
+
+
+PREDICTORS = [
+    "age",
+    "employment_income",
+]
+
+IMPUTATIONS = [
+    "pension_contributions_via_salary_sacrifice",
+]
+
+
+def save_salary_sacrifice_model():
+    """
+    Train and save salary sacrifice imputation model using FRS data.
+
+    Uses FRS respondents who were asked about salary sacrifice (SALSAC field)
+    as training data. The model learns to predict the SS pension amount
+    directly - non-participants have 0, participants have their reported
+    SPNAMT value.
+
+    Returns:
+        Trained QRF model for salary sacrifice imputation.
+    """
+    from policyengine_uk_data.utils import QRF
+
+    # Load the base FRS dataset
+    frs_path = STORAGE_FOLDER / "frs_2023_24.h5"
+    if not frs_path.exists():
+        raise FileNotFoundError(
+            f"FRS dataset not found at {frs_path}. "
+            "Run create_frs() first to generate the base dataset."
+        )
+
+    dataset = UKSingleYearDataset(frs_path)
+    sim = Microsimulation(dataset=dataset)
+
+    # Get predictor variables
+    age = sim.calculate("age").values
+    employment_income = sim.calculate("employment_income").values
+
+    # Get SS amounts and indicator for who was asked
+    ss_amount = (
+        dataset.person.pension_contributions_via_salary_sacrifice.values
+    )
+    if "salary_sacrifice_asked" not in dataset.person.columns:
+        raise ValueError(
+            "Dataset missing salary_sacrifice_asked field. "
+            "Ensure frs.py extracts SALSAC before numeric conversion."
+        )
+    ss_asked = dataset.person.salary_sacrifice_asked.values
+
+    # Build training DataFrame with only those who were asked
+    # This includes both participants (with amounts) and non-participants (0)
+    training_mask = ss_asked == 1
+
+    if training_mask.sum() == 0:
+        raise ValueError(
+            "No training data found - no respondents were asked SALSAC."
+        )
+
+    train_df = pd.DataFrame(
+        {
+            "age": age[training_mask],
+            "employment_income": employment_income[training_mask],
+            "pension_contributions_via_salary_sacrifice": ss_amount[
+                training_mask
+            ],
+        }
+    )
+
+    n_participants = (
+        train_df["pension_contributions_via_salary_sacrifice"] > 0
+    ).sum()
+    print(f"Training salary sacrifice model on {len(train_df)} observations")
+    print(
+        f"  With SS contributions: {n_participants} "
+        f"({n_participants / len(train_df):.1%})"
+    )
+    mean_amount = train_df.loc[
+        train_df["pension_contributions_via_salary_sacrifice"] > 0,
+        "pension_contributions_via_salary_sacrifice",
+    ].mean()
+    print(f"  Mean SS amount (participants): £{mean_amount:,.0f}")
+
+    # Train QRF model
+    model = QRF()
+    model.fit(train_df[PREDICTORS], train_df[IMPUTATIONS])
+    model.save(STORAGE_FOLDER / "salary_sacrifice.pkl")
+
+    return model
+
+
+def create_salary_sacrifice_model(overwrite_existing: bool = False):
+    """
+    Create or load salary sacrifice imputation model.
+
+    Args:
+        overwrite_existing: Whether to retrain model if it exists.
+
+    Returns:
+        Trained QRF model for salary sacrifice imputation.
+    """
+    from policyengine_uk_data.utils.qrf import QRF
+
+    model_path = STORAGE_FOLDER / "salary_sacrifice.pkl"
+    if model_path.exists() and not overwrite_existing:
+        return QRF(file_path=model_path)
+    return save_salary_sacrifice_model()
+
+
+def impute_salary_sacrifice(
+    dataset: UKSingleYearDataset,
+) -> UKSingleYearDataset:
+    """
+    Impute salary sacrifice pension amounts for people not asked SALSAC.
+
+    For respondents not asked about salary sacrifice (SALSAC=' '), uses
+    a QRF model trained on those who were asked to predict the SS pension
+    contribution amount directly. The model naturally predicts 0 for
+    non-participants and positive amounts for likely participants.
+
+    Note: This imputation does NOT target any specific total. Targeting
+    to HMRC figures happens via weight calibration in a subsequent step.
+
+    Args:
+        dataset: PolicyEngine UK dataset with salary_sacrifice_asked
+            field from FRS processing.
+
+    Returns:
+        Dataset with imputed salary sacrifice amounts.
+    """
+    dataset = dataset.copy()
+    sim = Microsimulation(dataset=dataset)
+
+    # Get variables needed for imputation
+    age = sim.calculate("age").values
+    employment_income = sim.calculate("employment_income").values
+    current_ss = (
+        dataset.person.pension_contributions_via_salary_sacrifice.values
+    )
+
+    # Get indicator for who was asked
+    if "salary_sacrifice_asked" not in dataset.person.columns:
+        print(
+            "Warning: salary_sacrifice_asked not in dataset, "
+            "skipping imputation"
+        )
+        return dataset
+
+    ss_asked = dataset.person.salary_sacrifice_asked.values
+
+    # Identify imputation candidates: those not asked about SS
+    not_asked = ss_asked == 0
+
+    # Create prediction DataFrame for all records
+    pred_df = pd.DataFrame(
+        {
+            "age": age,
+            "employment_income": employment_income,
+        }
+    )
+
+    # Get or train model and predict
+    model = create_salary_sacrifice_model()
+    predictions = model.predict(pred_df)
+
+    # Get imputed amounts (QRF predicts continuous values)
+    imputed_ss = predictions[
+        "pension_contributions_via_salary_sacrifice"
+    ].values
+
+    # Ensure non-negative
+    imputed_ss = np.maximum(0, imputed_ss)
+
+    # For those who were asked, keep their reported values
+    # For those not asked, use the imputed values
+    final_ss = np.where(
+        ss_asked == 1,
+        current_ss,  # Keep reported values exactly
+        imputed_ss,  # Use imputed for non-respondents
+    )
+
+    # Update dataset
+    dataset.person["pension_contributions_via_salary_sacrifice"] = final_ss
+
+    # Report results (no targeting - just descriptive)
+    weights = sim.calculate("person_weight").values
+    is_employee = employment_income > 0
+    total_ss = (final_ss * weights).sum()
+    participation_rate = ((final_ss > 0) * weights * is_employee).sum() / (
+        weights * is_employee
+    ).sum()
+
+    print("Salary sacrifice imputation results (pre-calibration):")
+    print(f"  Total SS contributions: £{total_ss / 1e9:.1f}bn")
+    print(f"  Employee participation rate: {participation_rate:.1%}")
+    print("  (Final totals depend on subsequent weight calibration)")
+
+    return dataset

Original file line number	Diff line number	Diff line change
`@@ -81,8 +81,7 @@ def generate_spi_table(spi: pd.DataFrame):`
`81`	`81`
`82`	`82`	`spi = pd.concat(`
`83`	`83`	`[`
`84`		`- spi.sample(20_000),`
`85`		`- spi[spi.TI > 1_000_000],`
	`84`	`+ spi.sample(100_000, weights=spi.person_weight),`
`86`	`85`	`]`
`87`	`86`	`)`
`88`	`87`