From fc92942bf0e7b3a61b1a376204acefd5006b476f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 13 Apr 2026 07:26:22 -0400 Subject: [PATCH 1/2] Impute below-threshold student loan holders --- changelog.d/281.md | 1 + .../datasets/imputations/student_loans.py | 177 ++++++++++++----- .../targets/build_loss_matrix.py | 5 +- .../targets/compute/__init__.py | 2 + policyengine_uk_data/targets/compute/other.py | 38 ++-- policyengine_uk_data/targets/sources/slc.py | 184 ++++++++++-------- .../tests/test_student_loan_plan.py | 90 ++++++--- .../tests/test_student_loan_targets.py | 132 +++++++++++-- 8 files changed, 446 insertions(+), 183 deletions(-) create mode 100644 changelog.d/281.md diff --git a/changelog.d/281.md b/changelog.d/281.md new file mode 100644 index 000000000..4dce3b60a --- /dev/null +++ b/changelog.d/281.md @@ -0,0 +1 @@ +Impute below-threshold England student loan holders into the FRS base dataset and add SLC liable-to-repay calibration targets for Plans 2 and 5. diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py index 383918996..52e1cac2a 100644 --- a/policyengine_uk_data/datasets/imputations/student_loans.py +++ b/policyengine_uk_data/datasets/imputations/student_loans.py @@ -1,28 +1,128 @@ -""" -Student loan plan imputation. - -This module imputes the student_loan_plan variable based on: -- Whether the person has reported student loan repayments -- Their estimated university attendance year (inferred from age) +"""Student loan plan imputation. -The imputation assigns plan types according to when the loan system changed: -- NONE: No reported repayments -- PLAN_1: Started university before September 2012 -- PLAN_2: Started September 2012 - August 2023 -- PLAN_5: Started September 2023 onwards +This module imputes `student_loan_plan` in two steps: +- assign plans to people with reported PAYE student loan repayments +- assign missing below-threshold holders to match SLC liable-to-repay totals -This enables policyengine-uk's student_loan_repayment variable to calculate -repayments using official threshold parameters. +The FRS only observes active repayment through PAYE, so many England borrowers +who hold a loan but earn below the repayment threshold are missing from the +base dataset. We fill that stock using the checked-in SLC snapshot, restricting +the new assignments to plausible England tertiary-education cohorts. """ import numpy as np -from policyengine_uk.data import UKSingleYearDataset from policyengine_uk import Microsimulation +from policyengine_uk.data import UKSingleYearDataset + +from policyengine_uk_data.targets.sources.slc import get_snapshot_data + +_ENGLAND = "ENGLAND" +_PLAN_2_MIN_AGE = 21 +_PLAN_2_MAX_AGE = 55 +_PLAN_5_MAX_AGE = 25 + + +def _weighted_count(mask: np.ndarray, weights: np.ndarray) -> float: + return float(np.sum(weights[mask])) + + +def _assign_probabilistically( + plan: np.ndarray, + eligible: np.ndarray, + weights: np.ndarray, + target_count: float, + plan_name: str, + rng: np.random.Generator, +) -> None: + """Assign a plan to a weighted eligible pool up to a target count.""" + eligible_weight = _weighted_count(eligible, weights) + if target_count <= 0 or eligible_weight <= 0: + return + assignment_probability = min(1.0, target_count / eligible_weight) + draws = rng.random(len(plan)) + plan[eligible & (draws < assignment_probability)] = plan_name + + +def _impute_student_loan_plan_values( + age: np.ndarray, + student_loan_repayments: np.ndarray, + country: np.ndarray, + highest_education: np.ndarray, + person_weight: np.ndarray, + *, + year: int, + seed: int = 42, + slc_data: dict | None = None, +) -> np.ndarray: + """Impute plan values from person-level arrays.""" + age = np.asarray(age) + repayments = np.asarray(student_loan_repayments) + country = np.asarray(country) + highest_education = np.asarray(highest_education) + person_weight = np.asarray(person_weight, dtype=float) + slc_data = get_snapshot_data() if slc_data is None else slc_data + + rng = np.random.default_rng(seed) + plan = np.full(len(age), "NONE", dtype=object) + + has_repayments = repayments > 0 + is_england = country == _ENGLAND + is_tertiary = highest_education == "TERTIARY" + estimated_uni_start_year = year - age + 18 + + plan_1_cohort = estimated_uni_start_year < 2012 + plan_5_cohort = estimated_uni_start_year >= 2023 + plan_2_age_band = (age >= _PLAN_2_MIN_AGE) & (age <= _PLAN_2_MAX_AGE) + plan_5_age_band = (age >= 18) & (age <= _PLAN_5_MAX_AGE) + + # Reported PAYE repayers identify the active stock directly. + plan[has_repayments & plan_1_cohort] = "PLAN_1" + plan[has_repayments & plan_5_cohort] = "PLAN_5" + plan[has_repayments & (plan == "NONE")] = "PLAN_2" + + # Impute missing below-threshold holders so the total England stock matches + # the SLC liable-to-repay series, using the observed repayer stock as the + # starting point rather than the official above-threshold count. + plan_5_target = slc_data["plan_5"]["liable"].get(year, 0) + plan_5_shortfall = max( + 0.0, + plan_5_target - _weighted_count((plan == "PLAN_5") & is_england, person_weight), + ) + plan_5_eligible = ( + (plan == "NONE") & is_england & is_tertiary & plan_5_age_band & plan_5_cohort + ) + _assign_probabilistically( + plan, + plan_5_eligible, + person_weight, + plan_5_shortfall, + "PLAN_5", + rng, + ) + + plan_2_target = slc_data["plan_2"]["liable"].get(year, 0) + plan_2_shortfall = max( + 0.0, + plan_2_target - _weighted_count((plan == "PLAN_2") & is_england, person_weight), + ) + plan_2_eligible = (plan == "NONE") & is_england & is_tertiary & plan_2_age_band + _assign_probabilistically( + plan, + plan_2_eligible, + person_weight, + plan_2_shortfall, + "PLAN_2", + rng, + ) + + return plan def impute_student_loan_plan( dataset: UKSingleYearDataset, year: int = 2025, + seed: int = 42, + slc_data: dict | None = None, ) -> UKSingleYearDataset: """ Impute student loan plan type based on age and reported repayments. @@ -34,45 +134,22 @@ def impute_student_loan_plan( - PLAN_5: £25,000 (2025), Sept 2023 onwards Args: - dataset: PolicyEngine UK dataset with student_loan_repayments. - year: The simulation year, used to estimate university attendance. - - Returns: - Dataset with imputed student_loan_plan values. + dataset: PolicyEngine UK dataset with student loan inputs. + year: Simulation year, used to estimate university start cohorts. + seed: Random seed for reproducible below-threshold assignment. + slc_data: Optional override for the SLC borrower snapshot. """ dataset = dataset.copy() sim = Microsimulation(dataset=dataset) - - # Get required variables - age = sim.calculate("age").values - student_loan_repayments = sim.calculate("student_loan_repayments").values - - # Determine if person has a student loan based on reported repayments - has_student_loan = student_loan_repayments > 0 - - # Estimate when they started university (assume age 18) - # For simulation year Y and age A, university start year = Y - A + 18 - estimated_uni_start_year = year - age + 18 - - # Assign plan types based on when loan system changed - # StudentLoanPlan is a string enum: "NONE", "PLAN_1", "PLAN_2", "PLAN_4", "PLAN_5" - plan = np.full(len(age), "NONE", dtype=object) - - # Plan 1: Started before September 2012 - plan_1_mask = has_student_loan & (estimated_uni_start_year < 2012) - plan[plan_1_mask] = "PLAN_1" - - # Plan 2: Started September 2012 - August 2023 - plan_2_mask = has_student_loan & ( - (estimated_uni_start_year >= 2012) & (estimated_uni_start_year < 2023) + dataset.person["student_loan_plan"] = _impute_student_loan_plan_values( + age=sim.calculate("age").values, + student_loan_repayments=sim.calculate("student_loan_repayments").values, + country=sim.calculate("country", map_to="person").values, + highest_education=sim.calculate("highest_education").values, + person_weight=sim.calculate("person_weight").values, + year=year, + seed=seed, + slc_data=slc_data, ) - plan[plan_2_mask] = "PLAN_2" - - # Plan 5: Started September 2023 onwards - plan_5_mask = has_student_loan & (estimated_uni_start_year >= 2023) - plan[plan_5_mask] = "PLAN_5" - - # Store as the plan type - dataset.person["student_loan_plan"] = plan return dataset diff --git a/policyengine_uk_data/targets/build_loss_matrix.py b/policyengine_uk_data/targets/build_loss_matrix.py index e92ecbc1b..35827bc29 100644 --- a/policyengine_uk_data/targets/build_loss_matrix.py +++ b/policyengine_uk_data/targets/build_loss_matrix.py @@ -40,6 +40,7 @@ compute_scotland_uc_child, compute_scottish_child_payment, compute_student_loan_plan, + compute_student_loan_plan_liable, compute_ss_contributions, compute_ss_headcount, compute_ss_it_relief, @@ -316,8 +317,10 @@ def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray | return compute_scottish_child_payment(target, ctx) # Student loan plan borrower counts (SLC) - if name.startswith("slc/plan_"): + if name.startswith("slc/plan_") and "above_threshold" in name: return compute_student_loan_plan(target, ctx) + if name.startswith("slc/plan_") and "liable" in name: + return compute_student_loan_plan_liable(target, ctx) # PIP claimants if name in ( diff --git a/policyengine_uk_data/targets/compute/__init__.py b/policyengine_uk_data/targets/compute/__init__.py index b9fe37643..a87cf8143 100644 --- a/policyengine_uk_data/targets/compute/__init__.py +++ b/policyengine_uk_data/targets/compute/__init__.py @@ -40,6 +40,7 @@ compute_savings_interest, compute_scottish_child_payment, compute_student_loan_plan, + compute_student_loan_plan_liable, compute_vehicles, ) @@ -61,6 +62,7 @@ "compute_scotland_uc_child", "compute_scottish_child_payment", "compute_student_loan_plan", + "compute_student_loan_plan_liable", "compute_ss_contributions", "compute_ss_headcount", "compute_ss_it_relief", diff --git a/policyengine_uk_data/targets/compute/other.py b/policyengine_uk_data/targets/compute/other.py index 0bd02cb7f..10314fe03 100644 --- a/policyengine_uk_data/targets/compute/other.py +++ b/policyengine_uk_data/targets/compute/other.py @@ -1,20 +1,7 @@ -"""Miscellaneous compute functions (vehicles, housing, savings, SCP, -student loans).""" +"""Miscellaneous compute functions (vehicles, housing, savings, SCP, student loans).""" import numpy as np -_ENGLAND_REGIONS = { - "NORTH_EAST", - "NORTH_WEST", - "YORKSHIRE", - "EAST_MIDLANDS", - "WEST_MIDLANDS", - "EAST_OF_ENGLAND", - "LONDON", - "SOUTH_EAST", - "SOUTH_WEST", -} - def compute_vehicles(target, ctx) -> np.ndarray: """Compute vehicle ownership targets.""" @@ -78,9 +65,24 @@ def compute_student_loan_plan(target, ctx) -> np.ndarray: else: return None - plan = ctx.sim.calculate("student_loan_plan").values - region = ctx.sim.calculate("region", map_to="person").values - is_england = np.isin(region, list(_ENGLAND_REGIONS)) - on_plan = (plan == plan_value) & is_england + plan = ctx.pe_person("student_loan_plan") + repayments = ctx.pe_person("student_loan_repayments") + on_plan = (plan == plan_value) & (ctx.country == "ENGLAND") & (repayments > 0) + + return ctx.household_from_person(on_plan.astype(float)) + + +def compute_student_loan_plan_liable(target, ctx) -> np.ndarray: + """Count all England borrowers on a given plan, including non-repayers.""" + plan_name = target.name # e.g. "slc/plan_2_borrowers_liable" + if "plan_2" in plan_name: + plan_value = "PLAN_2" + elif "plan_5" in plan_name: + plan_value = "PLAN_5" + else: + return None + + plan = ctx.pe_person("student_loan_plan") + on_plan = (plan == plan_value) & (ctx.country == "ENGLAND") return ctx.household_from_person(on_plan.astype(float)) diff --git a/policyengine_uk_data/targets/sources/slc.py b/policyengine_uk_data/targets/sources/slc.py index d49e35bdb..08689c41e 100644 --- a/policyengine_uk_data/targets/sources/slc.py +++ b/policyengine_uk_data/targets/sources/slc.py @@ -1,8 +1,10 @@ """Student Loans Company (SLC) calibration targets. -Borrower counts for England only: Plan 2 and Plan 5, restricted to -borrowers liable to repay and earning above the repayment threshold. -This matches the FRS coverage (PAYE deductions only). +Borrower counts for England only: Plan 2 and Plan 5. + +Two target types are exposed: +- `above_threshold`: borrowers liable to repay and earning above threshold +- `liable`: all borrowers liable to repay, including below-threshold holders Source: Explore Education Statistics — Student loan forecasts for England, Table 6a: Forecast number of student borrowers liable to repay and number @@ -17,9 +19,10 @@ import json import os import re -import requests from functools import lru_cache +import requests + from policyengine_uk_data.targets.schema import Target, Unit _PERMALINK_ID = "6ff75517-7124-487c-cb4e-08de6eccf22d" @@ -29,33 +32,62 @@ ) _TESTING_DATA = { "plan_2": { - 2025: 3_670_000, - 2026: 4_130_000, - 2027: 4_480_000, - 2028: 4_700_000, - 2029: 4_820_000, - 2030: 4_870_000, + "above_threshold": { + 2025: 3_985_000, + 2026: 4_460_000, + 2027: 4_825_000, + 2028: 5_045_000, + 2029: 5_160_000, + 2030: 5_205_000, + }, + "liable": { + 2025: 8_940_000, + 2026: 9_710_000, + 2027: 10_360_000, + 2028: 10_615_000, + 2029: 10_600_000, + 2030: 10_525_000, + }, }, "plan_5": { - 2026: 25_000, - 2027: 115_000, - 2028: 340_000, - 2029: 700_000, - 2030: 1_140_000, + "above_threshold": { + 2026: 35_000, + 2027: 145_000, + 2028: 390_000, + 2029: 770_000, + 2030: 1_235_000, + }, + "liable": { + 2025: 10_000, + 2026: 230_000, + 2027: 630_000, + 2028: 1_380_000, + 2029: 2_360_000, + 2030: 3_400_000, + }, }, } +def get_snapshot_data() -> dict: + """Return the checked-in SLC snapshot used for tests and deterministic builds.""" + return { + plan: { + target_type: values.copy() for target_type, values in target_data.items() + } + for plan, target_data in _TESTING_DATA.items() + } + + @lru_cache(maxsize=1) def _fetch_slc_data() -> dict: """Fetch and parse SLC Table 6a data from Explore Education Statistics. Returns: - Dict with keys 'plan_2' and 'plan_5', each containing a dict - mapping calendar year (int) to borrower count above threshold (int). + Nested dict of plan -> target type -> year -> count. """ if os.environ.get("TESTING", "0") == "1": - return _TESTING_DATA + return get_snapshot_data() response = requests.get(_PERMALINK_URL, timeout=30) response.raise_for_status() @@ -75,59 +107,62 @@ def _fetch_slc_data() -> dict: # Structure: Plan 2 (6 years), Plan 5 (6 years), Plan 3 (5 years) header_row = table_json["thead"][1] - # Get Plan 2 years (first 6 columns) plan_2_years = [] for i in range(6): year_text = header_row[i]["text"] # e.g., "2029-30" start_year = int(year_text.split("-")[0]) - calendar_year = start_year + 1 # 2029-30 → 2030 - plan_2_years.append(calendar_year) + plan_2_years.append(start_year + 1) # 2029-30 → 2030 - # Get Plan 5 years (next 6 columns) plan_5_years = [] for i in range(6, 12): year_text = header_row[i]["text"] start_year = int(year_text.split("-")[0]) - calendar_year = start_year + 1 - plan_5_years.append(calendar_year) + plan_5_years.append(start_year + 1) - # Find the "Higher education total" / "earning above threshold" row - # This is the row following "Higher education total" with "liable to repay" tbody = table_json["tbody"] - - # Row 11 contains: header + 6 Plan 2 values + 6 Plan 5 values + 5 Plan 3 - target_row = None - for row in tbody: + liable_row = None + above_threshold_row = None + for index, row in enumerate(tbody): header_text = row[0].get("text", "") - if "earning above repayment threshold" in header_text: - # Check if previous context was "Higher education total" - # Actually, row 11 is after HE total row 10, and starts with - # the "earning above" header (no group header due to rowSpan) - target_row = row + if header_text == "Higher education total": + liable_row = row + if index + 1 < len(tbody): + next_row = tbody[index + 1] + next_header = next_row[0].get("text", "") + if "earning above repayment threshold" in next_header: + above_threshold_row = next_row break - if target_row is None: + if liable_row is None: + raise ValueError("Could not find 'Higher education total' row") + if above_threshold_row is None: raise ValueError("Could not find 'earning above threshold' row") - # Parse Plan 2 data (cells 1-6, mapping to plan_2_years) - plan_2_data = {} - for i, year in enumerate(plan_2_years): - cell_idx = 1 + i # Skip header cell - value_text = target_row[cell_idx].get("text", "") - if value_text and value_text not in ("no data", "0"): - value = int(value_text.replace(",", "")) - plan_2_data[year] = value - - # Parse Plan 5 data (cells 7-12, mapping to plan_5_years) - plan_5_data = {} - for i, year in enumerate(plan_5_years): - cell_idx = 7 + i # Skip header + Plan 2 cells - value_text = target_row[cell_idx].get("text", "") - if value_text and value_text not in ("no data", "0"): - value = int(value_text.replace(",", "")) - plan_5_data[year] = value - - return {"plan_2": plan_2_data, "plan_5": plan_5_data} + def parse_values(row, start_index, years): + data = {} + for offset, year in enumerate(years): + cell_idx = start_index + offset + if cell_idx >= len(row): + continue + value_text = row[cell_idx].get("text", "") + if value_text and value_text not in ("no data", "0"): + data[year] = int(value_text.replace(",", "")) + return data + + return { + "plan_2": { + "above_threshold": parse_values( + above_threshold_row, start_index=1, years=plan_2_years + ), + "liable": parse_values(liable_row, start_index=2, years=plan_2_years), + }, + "plan_5": { + "above_threshold": parse_values( + above_threshold_row, start_index=7, years=plan_5_years + ), + "liable": parse_values(liable_row, start_index=8, years=plan_5_years), + }, + } def get_targets() -> list[Target]: @@ -136,28 +171,21 @@ def get_targets() -> list[Target]: targets = [] - targets.append( - Target( - name="slc/plan_2_borrowers_above_threshold", - variable="student_loan_plan", - source="slc", - unit=Unit.COUNT, - is_count=True, - values=slc_data["plan_2"], - reference_url=_PERMALINK_URL, - ) - ) - - targets.append( - Target( - name="slc/plan_5_borrowers_above_threshold", - variable="student_loan_plan", - source="slc", - unit=Unit.COUNT, - is_count=True, - values=slc_data["plan_5"], - reference_url=_PERMALINK_URL, - ) - ) + for plan, plan_label in (("plan_2", "2"), ("plan_5", "5")): + for target_type, suffix in ( + ("above_threshold", "above_threshold"), + ("liable", "liable"), + ): + targets.append( + Target( + name=f"slc/plan_{plan_label}_borrowers_{suffix}", + variable="student_loan_plan", + source="slc", + unit=Unit.COUNT, + is_count=True, + values=slc_data[plan][target_type], + reference_url=_PERMALINK_URL, + ) + ) return targets diff --git a/policyengine_uk_data/tests/test_student_loan_plan.py b/policyengine_uk_data/tests/test_student_loan_plan.py index ddbfd419b..b8d73d903 100644 --- a/policyengine_uk_data/tests/test_student_loan_plan.py +++ b/policyengine_uk_data/tests/test_student_loan_plan.py @@ -1,44 +1,88 @@ """Tests for student loan plan imputation.""" import numpy as np -import pytest -def test_student_loan_plan_imputation_logic(): - """Test the plan assignment logic based on university start year.""" - # Test data: (age, year, expected_uni_start, expected_plan) - # Plan 1: pre-2012, Plan 2: 2012-2022, Plan 5: 2023+ +def test_repaying_borrowers_are_assigned_expected_plans(): + """Repayers should map to the expected plan cohorts.""" + from policyengine_uk_data.datasets.imputations.student_loans import ( + _impute_student_loan_plan_values, + ) + + plans = _impute_student_loan_plan_values( + age=np.array([40, 30, 20]), + student_loan_repayments=np.array([100.0, 100.0, 100.0]), + country=np.array(["ENGLAND", "ENGLAND", "ENGLAND"]), + highest_education=np.array(["TERTIARY", "TERTIARY", "TERTIARY"]), + person_weight=np.ones(3), + year=2025, + slc_data={"plan_2": {"liable": {2025: 1}}, "plan_5": {"liable": {2025: 1}}}, + ) + + assert plans.tolist() == ["PLAN_1", "PLAN_2", "PLAN_5"] + - year = 2025 +def test_below_threshold_imputation_uses_liable_shortfall(): + """Missing holders should be imputed from the liable target shortfall.""" + from policyengine_uk_data.datasets.imputations.student_loans import ( + _impute_student_loan_plan_values, + ) - # Age 40 in 2025 -> started uni ~2003 -> Plan 1 - age_40_uni_year = year - 40 + 18 # = 2003 - assert age_40_uni_year < 2012, "Age 40 should be Plan 1" + plans = _impute_student_loan_plan_values( + age=np.array([40, 30, 20, 30, 30, 30]), + student_loan_repayments=np.array([100.0, 100.0, 0.0, 0.0, 0.0, 0.0]), + country=np.array( + ["ENGLAND", "ENGLAND", "ENGLAND", "ENGLAND", "WALES", "ENGLAND"] + ), + highest_education=np.array( + ["POST_SECONDARY", "TERTIARY", "TERTIARY", "TERTIARY", "TERTIARY", "GCSE"] + ), + person_weight=np.ones(6), + year=2025, + slc_data={ + "plan_2": {"liable": {2025: 2}}, + "plan_5": {"liable": {2025: 1}}, + }, + ) - # Age 30 in 2025 -> started uni ~2013 -> Plan 2 - age_30_uni_year = year - 30 + 18 # = 2013 - assert 2012 <= age_30_uni_year < 2023, "Age 30 should be Plan 2" + assert plans.tolist() == [ + "PLAN_1", + "PLAN_2", + "PLAN_5", + "PLAN_2", + "NONE", + "NONE", + ] - # Age 25 in 2025 -> started uni ~2018 -> Plan 2 - age_25_uni_year = year - 25 + 18 # = 2018 - assert 2012 <= age_25_uni_year < 2023, "Age 25 should be Plan 2" - # Age 20 in 2025 -> started uni ~2023 -> Plan 5 - age_20_uni_year = year - 20 + 18 # = 2023 - assert age_20_uni_year >= 2023, "Age 20 should be Plan 5" +def test_plan5_assignment_has_priority_over_plan2_for_recent_cohort(): + """Recent cohorts should stay on Plan 5 rather than being swallowed by Plan 2.""" + from policyengine_uk_data.datasets.imputations.student_loans import ( + _impute_student_loan_plan_values, + ) + + plans = _impute_student_loan_plan_values( + age=np.array([21]), + student_loan_repayments=np.array([0.0]), + country=np.array(["ENGLAND"]), + highest_education=np.array(["TERTIARY"]), + person_weight=np.ones(1), + year=2026, + slc_data={ + "plan_2": {"liable": {2026: 1}}, + "plan_5": {"liable": {2026: 1}}, + }, + ) - # Age 18 in 2025 -> started uni ~2025 -> Plan 5 - age_18_uni_year = year - 18 + 18 # = 2025 - assert age_18_uni_year >= 2023, "Age 18 should be Plan 5" + assert plans.tolist() == ["PLAN_5"] def test_student_loan_plan_enum_values(): - """Test that plan enum values match policyengine-uk's string enum.""" + """Student-loan plan strings should still match policyengine-uk's enum.""" from policyengine_uk.variables.gov.hmrc.student_loans.student_loan_plan import ( StudentLoanPlan, ) - # Verify our assumptions about enum values (string-based enum) assert StudentLoanPlan.NONE.value == "NONE" assert StudentLoanPlan.PLAN_1.value == "PLAN_1" assert StudentLoanPlan.PLAN_2.value == "PLAN_2" diff --git a/policyengine_uk_data/tests/test_student_loan_targets.py b/policyengine_uk_data/tests/test_student_loan_targets.py index 3e9da19c5..b10e3aab6 100644 --- a/policyengine_uk_data/tests/test_student_loan_targets.py +++ b/policyengine_uk_data/tests/test_student_loan_targets.py @@ -1,5 +1,10 @@ """Tests for SLC student loan calibration targets.""" +import json +from types import SimpleNamespace + +import numpy as np + def test_slc_targets_registered(): """SLC targets appear in the target registry.""" @@ -8,28 +13,37 @@ def test_slc_targets_registered(): targets = {t.name: t for t in get_all_targets()} assert "slc/plan_2_borrowers_above_threshold" in targets assert "slc/plan_5_borrowers_above_threshold" in targets + assert "slc/plan_2_borrowers_liable" in targets + assert "slc/plan_5_borrowers_liable" in targets -def test_slc_plan2_values(): - """Plan 2 target values match SLC Table 6a.""" +def test_slc_snapshot_values_match_higher_education_total_rows(): + """Snapshot values should match the HE-total borrower rows.""" from policyengine_uk_data.targets.registry import get_all_targets targets = {t.name: t for t in get_all_targets()} - p2 = targets["slc/plan_2_borrowers_above_threshold"] - assert p2.values[2025] == 3_670_000 - assert p2.values[2026] == 4_130_000 - assert p2.values[2029] == 4_820_000 + + assert targets["slc/plan_2_borrowers_above_threshold"].values[2025] == 3_985_000 + assert targets["slc/plan_2_borrowers_above_threshold"].values[2030] == 5_205_000 + assert targets["slc/plan_2_borrowers_liable"].values[2025] == 8_940_000 + assert targets["slc/plan_2_borrowers_liable"].values[2030] == 10_525_000 + + assert 2025 not in targets["slc/plan_5_borrowers_above_threshold"].values + assert targets["slc/plan_5_borrowers_above_threshold"].values[2026] == 35_000 + assert targets["slc/plan_5_borrowers_above_threshold"].values[2030] == 1_235_000 + assert targets["slc/plan_5_borrowers_liable"].values[2025] == 10_000 + assert targets["slc/plan_5_borrowers_liable"].values[2030] == 3_400_000 -def test_slc_plan5_values(): - """Plan 5 target values match SLC Table 6a.""" +def test_liable_targets_exceed_above_threshold_targets(): + """Liable counts should exceed above-threshold counts in the same year.""" from policyengine_uk_data.targets.registry import get_all_targets targets = {t.name: t for t in get_all_targets()} - p5 = targets["slc/plan_5_borrowers_above_threshold"] - assert 2025 not in p5.values # no Plan 5 borrowers yet in 2024-25 - assert p5.values[2026] == 25_000 - assert p5.values[2029] == 700_000 + for year, count in targets["slc/plan_2_borrowers_above_threshold"].values.items(): + assert targets["slc/plan_2_borrowers_liable"].values[year] > count + for year, count in targets["slc/plan_5_borrowers_above_threshold"].values.items(): + assert targets["slc/plan_5_borrowers_liable"].values[year] > count def test_slc_testing_mode_uses_snapshot_without_network(monkeypatch): @@ -44,6 +58,98 @@ def fail_network(*args, **kwargs): monkeypatch.setattr(slc.requests, "get", fail_network) - assert slc._fetch_slc_data() == slc._TESTING_DATA + assert slc._fetch_slc_data() == slc.get_snapshot_data() + slc._fetch_slc_data.cache_clear() + + +def test_slc_parser_uses_higher_education_total_rows(monkeypatch): + """Parser should read HE-total rows, not the first matching above-threshold row.""" + from policyengine_uk_data.targets.sources import slc + + table_json = { + "thead": [ + [], + [{"text": "2024-25"}] * 6 + [{"text": "2024-25"}] * 6, + ], + "tbody": [ + [{"text": "Higher education full-time"}, {"text": "liable"}] + + [{"text": "1,000"}] * 12, + [ + { + "text": "Number of borrowers liable to repay and earning above repayment threshold" + } + ] + + [{"text": "100"}] * 12, + [{"text": "Higher education total"}, {"text": "liable"}] + + [{"text": "8,940,000"}] * 6 + + [{"text": "10,000"}] * 6, + [ + { + "text": "Number of borrowers liable to repay and earning above repayment threshold" + } + ] + + [{"text": "3,985,000"}] * 6 + + [{"text": "35,000"}] * 6, + ], + } + html = ( + '" + ) + + class DummyResponse: + text = html + + @staticmethod + def raise_for_status(): + return None slc._fetch_slc_data.cache_clear() + monkeypatch.delenv("TESTING", raising=False) + monkeypatch.setattr(slc.requests, "get", lambda *args, **kwargs: DummyResponse()) + + data = slc._fetch_slc_data() + assert data["plan_2"]["liable"][2025] == 8_940_000 + assert data["plan_2"]["above_threshold"][2025] == 3_985_000 + assert data["plan_5"]["liable"][2025] == 10_000 + assert data["plan_5"]["above_threshold"][2025] == 35_000 + + slc._fetch_slc_data.cache_clear() + + +def test_student_loan_target_compute_distinguishes_liable_from_repaying(): + """Above-threshold counts should require repayments, while liable counts should not.""" + from policyengine_uk_data.targets.compute.other import ( + compute_student_loan_plan, + compute_student_loan_plan_liable, + ) + + class DummyCtx: + country = np.array(["ENGLAND", "ENGLAND", "WALES", "ENGLAND"]) + + @staticmethod + def pe_person(variable): + values = { + "student_loan_plan": np.array(["PLAN_2", "PLAN_2", "PLAN_2", "PLAN_5"]), + "student_loan_repayments": np.array([10.0, 0.0, 15.0, 0.0]), + } + return values[variable] + + @staticmethod + def household_from_person(values): + return values + + above_threshold = compute_student_loan_plan( + SimpleNamespace(name="slc/plan_2_borrowers_above_threshold"), + DummyCtx(), + ) + liable = compute_student_loan_plan_liable( + SimpleNamespace(name="slc/plan_2_borrowers_liable"), + DummyCtx(), + ) + + assert above_threshold.tolist() == [1.0, 0.0, 0.0, 0.0] + assert liable.tolist() == [1.0, 1.0, 0.0, 0.0] From 21b224320cbd4a5f093736c87105be70fdef427f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 13 Apr 2026 09:37:52 -0400 Subject: [PATCH 2/2] Fix student loan target entity mapping --- .../datasets/imputations/student_loans.py | 7 +++++- policyengine_uk_data/targets/compute/other.py | 6 +++-- .../tests/test_student_loan_plan.py | 22 +++++++++++++++++++ .../tests/test_student_loan_targets.py | 11 +++++++++- 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py index 52e1cac2a..7a9f7f73c 100644 --- a/policyengine_uk_data/datasets/imputations/student_loans.py +++ b/policyengine_uk_data/datasets/imputations/student_loans.py @@ -71,6 +71,9 @@ def _impute_student_loan_plan_values( estimated_uni_start_year = year - age + 18 plan_1_cohort = estimated_uni_start_year < 2012 + plan_2_cohort = (estimated_uni_start_year >= 2012) & ( + estimated_uni_start_year < 2023 + ) plan_5_cohort = estimated_uni_start_year >= 2023 plan_2_age_band = (age >= _PLAN_2_MIN_AGE) & (age <= _PLAN_2_MAX_AGE) plan_5_age_band = (age >= 18) & (age <= _PLAN_5_MAX_AGE) @@ -105,7 +108,9 @@ def _impute_student_loan_plan_values( 0.0, plan_2_target - _weighted_count((plan == "PLAN_2") & is_england, person_weight), ) - plan_2_eligible = (plan == "NONE") & is_england & is_tertiary & plan_2_age_band + plan_2_eligible = ( + (plan == "NONE") & is_england & is_tertiary & plan_2_age_band & plan_2_cohort + ) _assign_probabilistically( plan, plan_2_eligible, diff --git a/policyengine_uk_data/targets/compute/other.py b/policyengine_uk_data/targets/compute/other.py index 10314fe03..65151f2e4 100644 --- a/policyengine_uk_data/targets/compute/other.py +++ b/policyengine_uk_data/targets/compute/other.py @@ -67,7 +67,8 @@ def compute_student_loan_plan(target, ctx) -> np.ndarray: plan = ctx.pe_person("student_loan_plan") repayments = ctx.pe_person("student_loan_repayments") - on_plan = (plan == plan_value) & (ctx.country == "ENGLAND") & (repayments > 0) + person_country = ctx.sim.calculate("country", map_to="person").values + on_plan = (plan == plan_value) & (person_country == "ENGLAND") & (repayments > 0) return ctx.household_from_person(on_plan.astype(float)) @@ -83,6 +84,7 @@ def compute_student_loan_plan_liable(target, ctx) -> np.ndarray: return None plan = ctx.pe_person("student_loan_plan") - on_plan = (plan == plan_value) & (ctx.country == "ENGLAND") + person_country = ctx.sim.calculate("country", map_to="person").values + on_plan = (plan == plan_value) & (person_country == "ENGLAND") return ctx.household_from_person(on_plan.astype(float)) diff --git a/policyengine_uk_data/tests/test_student_loan_plan.py b/policyengine_uk_data/tests/test_student_loan_plan.py index b8d73d903..d782f25f8 100644 --- a/policyengine_uk_data/tests/test_student_loan_plan.py +++ b/policyengine_uk_data/tests/test_student_loan_plan.py @@ -77,6 +77,28 @@ def test_plan5_assignment_has_priority_over_plan2_for_recent_cohort(): assert plans.tolist() == ["PLAN_5"] +def test_plan2_below_threshold_imputation_respects_estimated_cohort(): + """Pre-2012 cohorts should not be assigned Plan 2 just because they fit the age band.""" + from policyengine_uk_data.datasets.imputations.student_loans import ( + _impute_student_loan_plan_values, + ) + + plans = _impute_student_loan_plan_values( + age=np.array([40]), + student_loan_repayments=np.array([0.0]), + country=np.array(["ENGLAND"]), + highest_education=np.array(["TERTIARY"]), + person_weight=np.ones(1), + year=2025, + slc_data={ + "plan_2": {"liable": {2025: 1}}, + "plan_5": {"liable": {2025: 0}}, + }, + ) + + assert plans.tolist() == ["NONE"] + + def test_student_loan_plan_enum_values(): """Student-loan plan strings should still match policyengine-uk's enum.""" from policyengine_uk.variables.gov.hmrc.student_loans.student_loan_plan import ( diff --git a/policyengine_uk_data/tests/test_student_loan_targets.py b/policyengine_uk_data/tests/test_student_loan_targets.py index b10e3aab6..fc9858f33 100644 --- a/policyengine_uk_data/tests/test_student_loan_targets.py +++ b/policyengine_uk_data/tests/test_student_loan_targets.py @@ -128,7 +128,16 @@ def test_student_loan_target_compute_distinguishes_liable_from_repaying(): ) class DummyCtx: - country = np.array(["ENGLAND", "ENGLAND", "WALES", "ENGLAND"]) + country = np.array(["ENGLAND", "WALES"]) + + class sim: + @staticmethod + def calculate(variable, map_to=None): + if variable == "country" and map_to == "person": + return SimpleNamespace( + values=np.array(["ENGLAND", "ENGLAND", "WALES", "ENGLAND"]) + ) + raise AssertionError(f"Unexpected calculate call: {variable}, {map_to}") @staticmethod def pe_person(variable):