diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29bb..f66110c12 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Student loan plan imputation based on age and reported repayments diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index 04e4b0ac8..f0a58148a 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -27,6 +27,7 @@ def main(): "Impute income", "Impute capital gains", "Impute salary sacrifice", + "Impute student loan plan", "Uprate to 2025", "Calibrate dataset", "Downrate to 2023", @@ -56,6 +57,7 @@ def main(): impute_capital_gains, impute_services, impute_salary_sacrifice, + impute_student_loan_plan, ) # Apply imputations with progress tracking @@ -87,6 +89,10 @@ def main(): frs = impute_salary_sacrifice(frs) update_dataset("Impute salary sacrifice", "completed") + update_dataset("Impute student loan plan", "processing") + frs = impute_student_loan_plan(frs, year=2023) + update_dataset("Impute student loan plan", "completed") + # Uprate dataset update_dataset("Uprate to 2025", "processing") frs = uprate_dataset(frs, 2025) @@ -143,7 +149,7 @@ def main(): details={ "base_dataset": "frs_2023_24.h5", "enhanced_dataset": "enhanced_frs_2023_24.h5", - "imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice", + "imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan", "calibration": "national and constituency targets", }, ) diff --git a/policyengine_uk_data/datasets/imputations/__init__.py b/policyengine_uk_data/datasets/imputations/__init__.py index ffff235ec..fe2573206 100644 --- a/policyengine_uk_data/datasets/imputations/__init__.py +++ b/policyengine_uk_data/datasets/imputations/__init__.py @@ -5,3 +5,4 @@ from .capital_gains import * from 
"""
Student loan plan imputation.

This module imputes the student_loan_plan variable based on:
- Whether the person has reported student loan repayments
- Their estimated university attendance year (inferred from age)

The imputation assigns plan types according to when the loan system changed:
- NONE: No reported repayments
- PLAN_1: Started university before September 2012
- PLAN_2: Started September 2012 - August 2023
- PLAN_5: Started September 2023 onwards

This enables policyengine-uk's student_loan_repayment variable to calculate
repayments using official threshold parameters.
"""

import numpy as np
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk import Microsimulation

# Age at which a borrower is assumed to have started university.
_UNIVERSITY_START_AGE = 18
# Earliest estimated start year assigned to Plan 2; earlier starts are Plan 1.
_PLAN_2_FIRST_START_YEAR = 2012
# Earliest estimated start year assigned to Plan 5; earlier starts are Plan 2.
_PLAN_5_FIRST_START_YEAR = 2023


def _assign_student_loan_plans(
    age: np.ndarray,
    student_loan_repayments: np.ndarray,
    year: int,
) -> np.ndarray:
    """
    Classify each person into a student loan plan from plain arrays.

    Kept free of any Microsimulation dependency so the cohort logic can be
    exercised directly with small arrays.

    Args:
        age: Age of each person in ``year``.
        student_loan_repayments: Reported repayments for each person; only
            strictly positive values count as having a loan.
        year: The year the ages refer to.

    Returns:
        Object-dtype array of plan names ("NONE", "PLAN_1", "PLAN_2",
        "PLAN_5"), one per person.
    """
    age = np.asarray(age)
    has_student_loan = np.asarray(student_loan_repayments) > 0

    # For simulation year Y and age A, university start year = Y - A + 18.
    # This is a calendar-year approximation of the September intake.
    estimated_uni_start_year = year - age + _UNIVERSITY_START_AGE

    # Object dtype keeps the values as ordinary Python strings.
    plan = np.full(age.shape, "NONE", dtype=object)

    # The comparisons are written out explicitly (rather than as "everyone
    # else") so that a NaN age matches no plan and stays "NONE".
    started_before_plan_2 = estimated_uni_start_year < _PLAN_2_FIRST_START_YEAR
    started_in_plan_2_era = (
        estimated_uni_start_year >= _PLAN_2_FIRST_START_YEAR
    ) & (estimated_uni_start_year < _PLAN_5_FIRST_START_YEAR)
    started_in_plan_5_era = estimated_uni_start_year >= _PLAN_5_FIRST_START_YEAR

    plan[has_student_loan & started_before_plan_2] = "PLAN_1"
    plan[has_student_loan & started_in_plan_2_era] = "PLAN_2"
    # NOTE(review): with year=2023 this branch only matches people aged 18 or
    # under who report repayments - confirm such records are really Plan 5
    # borrowers rather than survey noise.
    plan[has_student_loan & started_in_plan_5_era] = "PLAN_5"

    return plan


def impute_student_loan_plan(
    dataset: UKSingleYearDataset,
    year: int = 2025,
) -> UKSingleYearDataset:
    """
    Impute student loan plan type based on age and reported repayments.

    People with no positive ``student_loan_repayments`` are assigned "NONE".
    Everyone else is assigned by estimated university start year
    (``year - age + 18``):
    - PLAN_1: estimated start before 2012
    - PLAN_2: estimated start from 2012 up to (not including) 2023
    - PLAN_5: estimated start in 2023 or later
    PLAN_4 (Scottish loans) is never assigned here.

    A weighted summary of the assignment is printed to stdout.

    Args:
        dataset: PolicyEngine UK dataset with student_loan_repayments.
        year: The simulation year, used to estimate university attendance.

    Returns:
        A copy of the dataset with ``student_loan_plan`` set on the person
        table; the input dataset is not modified.
    """
    dataset = dataset.copy()
    sim = Microsimulation(dataset=dataset)

    # Get required variables
    age = sim.calculate("age").values
    student_loan_repayments = sim.calculate("student_loan_repayments").values

    plan = _assign_student_loan_plans(age, student_loan_repayments, year)

    # Store as the plan type
    dataset.person["student_loan_plan"] = plan

    # Report imputation results as weighted population totals
    weights = sim.calculate("person_weight").values
    has_student_loan = student_loan_repayments > 0
    total_with_loan = (has_student_loan * weights).sum()
    plan_1_count = ((plan == "PLAN_1") * weights).sum()
    plan_2_count = ((plan == "PLAN_2") * weights).sum()
    plan_5_count = ((plan == "PLAN_5") * weights).sum()

    print("Student loan plan imputation results:")
    print(f" Total with student loan: {total_with_loan / 1e6:.2f}m")
    print(f" Plan 1 (pre-2012): {plan_1_count / 1e6:.2f}m")
    print(f" Plan 2 (2012-2023): {plan_2_count / 1e6:.2f}m")
    print(f" Plan 5 (2023+): {plan_5_count / 1e6:.2f}m")

    return dataset
"""Tests for student loan plan imputation."""

import numpy as np
import pytest


def test_student_loan_plan_imputation_logic():
    """Test the plan assignment logic based on university start year.

    Each case is (age, expected_uni_start, expected_plan) for a fixed
    simulation year. Both the estimated start year and the resulting plan are
    asserted, and the cohorts either side of each cut-off (2011/2012 and
    2022/2023) are included so an off-by-one in the thresholds is caught.
    """
    year = 2025

    def plan_for_start_year(uni_start_year):
        # Mirrors the cut-offs used by the imputation:
        # Plan 1: pre-2012, Plan 2: 2012-2022, Plan 5: 2023+
        if uni_start_year < 2012:
            return "PLAN_1"
        if uni_start_year < 2023:
            return "PLAN_2"
        return "PLAN_5"

    cases = [
        (40, 2003, "PLAN_1"),
        (32, 2011, "PLAN_1"),  # last Plan 1 cohort
        (31, 2012, "PLAN_2"),  # first Plan 2 cohort
        (30, 2013, "PLAN_2"),
        (25, 2018, "PLAN_2"),
        (21, 2022, "PLAN_2"),  # last Plan 2 cohort
        (20, 2023, "PLAN_5"),  # first Plan 5 cohort
        (18, 2025, "PLAN_5"),
    ]

    for age, expected_uni_start, expected_plan in cases:
        uni_start_year = year - age + 18
        assert (
            uni_start_year == expected_uni_start
        ), f"Age {age} should start university in {expected_uni_start}"
        assert (
            plan_for_start_year(uni_start_year) == expected_plan
        ), f"Age {age} should be {expected_plan}"


def test_student_loan_plan_enum_values():
    """Test that plan enum values match policyengine-uk's string enum."""
    from policyengine_uk.variables.gov.hmrc.student_loans.student_loan_plan import (
        StudentLoanPlan,
    )

    # The imputation writes these names as plain strings, so each enum
    # member's value must equal its own name (string-based enum).
    for name in ("NONE", "PLAN_1", "PLAN_2", "PLAN_4", "PLAN_5"):
        assert StudentLoanPlan[name].value == name
{ registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -1402,14 +1402,14 @@ dependencies = [ { name = "pydantic" }, { name = "tables" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d9/cd/afaceecde0bcdabe945225bf2f479ecdb8bfff259a0f73c125ca9f8107f8/policyengine_uk-2.57.0.tar.gz", hash = "sha256:d7adefffd979765e51f07d8b5718471560bb5242d39ba24c870a7af26fdbab22", size = 1054410, upload-time = "2025-11-21T15:00:27.822Z" } +sdist = { url = "https://files.pythonhosted.org/packages/71/94/4f6c1bba2085cd2c17a57c6c1da5bda9a71f7f5ec0bce6e5057ebd5b90a1/policyengine_uk-2.61.2.tar.gz", hash = "sha256:05263fef974e885ded0ce6e06010a67c1c09d665d6b3acf061b86e5d680d4766", size = 1062165, upload-time = "2025-11-28T14:59:18.219Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/04/37/51bbe77eb8ce163b54b21a8c465e418ead15c2e02d14300f9a9b5d3eee8d/policyengine_uk-2.57.0-py3-none-any.whl", hash = "sha256:080bbc33c6a78d28552b0e5806a74cf89af8b445fd4e6971d45ceaf7ff9c5df0", size = 1619134, upload-time = "2025-11-21T15:00:26.117Z" }, + { url = "https://files.pythonhosted.org/packages/c7/88/ef6888197f7a62cb9e71998bf692358be7a40359e0662d765f6b4c67a414/policyengine_uk-2.61.2-py3-none-any.whl", hash = "sha256:66b7853e791e7dda1f43dfa1e056bd9ad0d381cb7d381803400a6c2de62616ab", size = 1635936, upload-time = "2025-11-28T14:59:15.883Z" }, ] [[package]] name = "policyengine-uk-data" -version = "1.24.0" +version = "1.24.2" source = { editable = "." } dependencies = [ { name = "black" },