Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
added:
- Student loan plan imputation based on age and reported repayments
8 changes: 7 additions & 1 deletion policyengine_uk_data/datasets/create_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def main():
"Impute income",
"Impute capital gains",
"Impute salary sacrifice",
"Impute student loan plan",
"Uprate to 2025",
"Calibrate dataset",
"Downrate to 2023",
Expand Down Expand Up @@ -56,6 +57,7 @@ def main():
impute_capital_gains,
impute_services,
impute_salary_sacrifice,
impute_student_loan_plan,
)

# Apply imputations with progress tracking
Expand Down Expand Up @@ -87,6 +89,10 @@ def main():
frs = impute_salary_sacrifice(frs)
update_dataset("Impute salary sacrifice", "completed")

update_dataset("Impute student loan plan", "processing")
frs = impute_student_loan_plan(frs, year=2023)
update_dataset("Impute student loan plan", "completed")

# Uprate dataset
update_dataset("Uprate to 2025", "processing")
frs = uprate_dataset(frs, 2025)
Expand Down Expand Up @@ -143,7 +149,7 @@ def main():
details={
"base_dataset": "frs_2023_24.h5",
"enhanced_dataset": "enhanced_frs_2023_24.h5",
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice",
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan",
"calibration": "national and constituency targets",
},
)
Expand Down
1 change: 1 addition & 0 deletions policyengine_uk_data/datasets/imputations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from .capital_gains import *
from .services import impute_services
from .salary_sacrifice import impute_salary_sacrifice
from .student_loans import impute_student_loan_plan
91 changes: 91 additions & 0 deletions policyengine_uk_data/datasets/imputations/student_loans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""
Student loan plan imputation.

This module imputes the student_loan_plan variable based on:
- Whether the person has reported student loan repayments
- Their estimated university start year (inferred from age, assuming entry at 18)

The imputation assigns plan types according to when the loan system changed:
- NONE: No reported repayments
- PLAN_1: Started university before September 2012
- PLAN_2: Started September 2012 - August 2023
- PLAN_5: Started September 2023 onwards

This enables policyengine-uk's student_loan_repayment variable to calculate
repayments using official threshold parameters.
"""

import numpy as np
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk import Microsimulation


def impute_student_loan_plan(
    dataset: UKSingleYearDataset,
    year: int = 2025,
) -> UKSingleYearDataset:
    """
    Assign a student loan plan to every person in the dataset.

    A person is treated as a borrower when their reported
    ``student_loan_repayments`` is strictly positive. Borrowers are then
    bucketed by an estimated university start year, computed as
    ``year - age + 18`` (i.e. assuming entry at age 18):

    - PLAN_1: estimated start before 2012
      (£26,065 threshold in 2025, pre-Sept 2012 England/Wales)
    - PLAN_2: estimated start from 2012 up to, but excluding, 2023
      (£29,385 threshold, frozen 2026-2029)
    - PLAN_5: estimated start in 2023 or later (£25,000 threshold in 2025)
    - NONE: everyone else, including all non-borrowers

    PLAN_4 (Scottish loans) is never assigned here because it would need an
    explicit flag rather than an age-based rule.

    A weighted summary of the resulting plan counts is printed to stdout.

    Args:
        dataset: PolicyEngine UK dataset with student_loan_repayments.
        year: Simulation year used to back out the university start year.

    Returns:
        A copy of ``dataset`` whose person table carries a
        ``student_loan_plan`` column of plan-name strings.
    """
    # Work on a copy so the caller's dataset is left untouched.
    dataset = dataset.copy()
    sim = Microsimulation(dataset=dataset)

    ages = sim.calculate("age").values
    repayments = sim.calculate("student_loan_repayments").values

    # Any positive reported repayment marks the person as holding a loan.
    is_borrower = repayments > 0

    # Someone aged A in `year` is assumed to have started university
    # in year - A + 18.
    start_year = year - ages + 18

    # One mask per plan, keyed by the string enum value it maps to.
    # The three start-year windows are mutually exclusive.
    plan_masks = {
        "PLAN_1": is_borrower & (start_year < 2012),
        "PLAN_2": is_borrower
        & np.logical_and(start_year >= 2012, start_year < 2023),
        "PLAN_5": is_borrower & (start_year >= 2023),
    }

    # Default everyone to "NONE", then overwrite the borrowers plan by plan.
    # dtype=object keeps the labels as Python strings of any length.
    plan_labels = np.full(len(ages), "NONE", dtype=object)
    for plan_name, mask in plan_masks.items():
        plan_labels[mask] = plan_name

    dataset.person["student_loan_plan"] = plan_labels

    # Weighted head-counts for the summary printed below.
    weights = sim.calculate("person_weight").values

    def weighted_total(mask):
        return np.sum(mask * weights)

    total_with_loan = weighted_total(is_borrower)
    plan_1_count = weighted_total(plan_masks["PLAN_1"])
    plan_2_count = weighted_total(plan_masks["PLAN_2"])
    plan_5_count = weighted_total(plan_masks["PLAN_5"])

    print("Student loan plan imputation results:")
    print(f" Total with student loan: {total_with_loan / 1e6:.2f}m")
    print(f" Plan 1 (pre-2012): {plan_1_count / 1e6:.2f}m")
    print(f" Plan 2 (2012-2023): {plan_2_count / 1e6:.2f}m")
    print(f" Plan 5 (2023+): {plan_5_count / 1e6:.2f}m")

    return dataset
46 changes: 46 additions & 0 deletions policyengine_uk_data/tests/test_student_loan_plan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Tests for student loan plan imputation."""

import numpy as np
import pytest


def test_student_loan_plan_imputation_logic():
    """Check which plan window each sample age falls into for year 2025."""
    year = 2025

    def uni_start_year(age):
        # Same rule as the imputation: assume university entry at age 18.
        return year - age + 18

    # Plan windows by estimated start year:
    # Plan 1: pre-2012, Plan 2: 2012-2022, Plan 5: 2023+
    def in_plan_1(start):
        return start < 2012

    def in_plan_2(start):
        return 2012 <= start < 2023

    def in_plan_5(start):
        return start >= 2023

    # (age, window predicate, failure message)
    cases = [
        (40, in_plan_1, "Age 40 should be Plan 1"),  # started ~2003
        (30, in_plan_2, "Age 30 should be Plan 2"),  # started ~2013
        (25, in_plan_2, "Age 25 should be Plan 2"),  # started ~2018
        (20, in_plan_5, "Age 20 should be Plan 5"),  # started ~2023
        (18, in_plan_5, "Age 18 should be Plan 5"),  # started ~2025
    ]

    for age, in_window, message in cases:
        assert in_window(uni_start_year(age)), message


def test_student_loan_plan_enum_values():
    """Check that each StudentLoanPlan member's value equals its own name."""
    from policyengine_uk.variables.gov.hmrc.student_loans.student_loan_plan import (
        StudentLoanPlan,
    )

    # The imputation writes these plain strings into the dataset, so each
    # must be the value of the identically named enum member.
    expected_members = ("NONE", "PLAN_1", "PLAN_2", "PLAN_4", "PLAN_5")

    for member_name in expected_members:
        assert getattr(StudentLoanPlan, member_name).value == member_name
8 changes: 4 additions & 4 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.