From fc92942bf0e7b3a61b1a376204acefd5006b476f Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Mon, 13 Apr 2026 07:26:22 -0400
Subject: [PATCH 1/2] Impute below-threshold student loan holders

---
 changelog.d/281.md                            |   1 +
 .../datasets/imputations/student_loans.py     | 177 ++++++++++++-----
 .../targets/build_loss_matrix.py              |   5 +-
 .../targets/compute/__init__.py               |   2 +
 policyengine_uk_data/targets/compute/other.py |  38 ++--
 policyengine_uk_data/targets/sources/slc.py   | 184 ++++++++++--------
 .../tests/test_student_loan_plan.py           |  90 ++++++---
 .../tests/test_student_loan_targets.py        | 132 +++++++++++--
 8 files changed, 446 insertions(+), 183 deletions(-)
 create mode 100644 changelog.d/281.md

diff --git a/changelog.d/281.md b/changelog.d/281.md
new file mode 100644
index 000000000..4dce3b60a
--- /dev/null
+++ b/changelog.d/281.md
@@ -0,0 +1 @@
+Impute below-threshold England student loan holders into the FRS base dataset and add SLC liable-to-repay calibration targets for Plans 2 and 5.
diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py
index 383918996..52e1cac2a 100644
--- a/policyengine_uk_data/datasets/imputations/student_loans.py
+++ b/policyengine_uk_data/datasets/imputations/student_loans.py
@@ -1,28 +1,128 @@
-"""
-Student loan plan imputation.
-
-This module imputes the student_loan_plan variable based on:
-- Whether the person has reported student loan repayments
-- Their estimated university attendance year (inferred from age)
+"""Student loan plan imputation.
 
-The imputation assigns plan types according to when the loan system changed:
-- NONE: No reported repayments
-- PLAN_1: Started university before September 2012
-- PLAN_2: Started September 2012 - August 2023
-- PLAN_5: Started September 2023 onwards
+This module imputes `student_loan_plan` in two steps:
+- assign plans to people with reported PAYE student loan repayments
+- assign missing below-threshold holders to match SLC liable-to-repay totals
 
-This enables policyengine-uk's student_loan_repayment variable to calculate
-repayments using official threshold parameters.
+The FRS only observes active repayment through PAYE, so many England borrowers
+who hold a loan but earn below the repayment threshold are missing from the
+base dataset. We fill that stock using the checked-in SLC snapshot, restricting
+the new assignments to plausible England tertiary-education cohorts.
 """
 
 import numpy as np
-from policyengine_uk.data import UKSingleYearDataset
 from policyengine_uk import Microsimulation
+from policyengine_uk.data import UKSingleYearDataset
+
+from policyengine_uk_data.targets.sources.slc import get_snapshot_data
+
+_ENGLAND = "ENGLAND"
+_PLAN_2_MIN_AGE = 21
+_PLAN_2_MAX_AGE = 55
+_PLAN_5_MAX_AGE = 25
+
+
+def _weighted_count(mask: np.ndarray, weights: np.ndarray) -> float:
+    return float(np.sum(weights[mask]))
+
+
+def _assign_probabilistically(
+    plan: np.ndarray,
+    eligible: np.ndarray,
+    weights: np.ndarray,
+    target_count: float,
+    plan_name: str,
+    rng: np.random.Generator,
+) -> None:
+    """Randomly assign a plan so the expected weighted count meets the target."""
+    eligible_weight = _weighted_count(eligible, weights)
+    if target_count <= 0 or eligible_weight <= 0:
+        return
+    assignment_probability = min(1.0, target_count / eligible_weight)
+    draws = rng.random(len(plan))
+    plan[eligible & (draws < assignment_probability)] = plan_name
+
+
+def _impute_student_loan_plan_values(
+    age: np.ndarray,
+    student_loan_repayments: np.ndarray,
+    country: np.ndarray,
+    highest_education: np.ndarray,
+    person_weight: np.ndarray,
+    *,
+    year: int,
+    seed: int = 42,
+    slc_data: dict | None = None,
+) -> np.ndarray:
+    """Impute plan values from person-level arrays."""
+    age = np.asarray(age)
+    repayments = np.asarray(student_loan_repayments)
+    country = np.asarray(country)
+    highest_education = np.asarray(highest_education)
+    person_weight = np.asarray(person_weight, dtype=float)
+    slc_data = get_snapshot_data() if slc_data is None else slc_data
+
+    rng = np.random.default_rng(seed)
+    plan = np.full(len(age), "NONE", dtype=object)
+
+    has_repayments = repayments > 0
+    is_england = country == _ENGLAND
+    is_tertiary = highest_education == "TERTIARY"
+    estimated_uni_start_year = year - age + 18
+
+    plan_1_cohort = estimated_uni_start_year < 2012
+    plan_5_cohort = estimated_uni_start_year >= 2023
+    plan_2_age_band = (age >= _PLAN_2_MIN_AGE) & (age <= _PLAN_2_MAX_AGE)
+    plan_5_age_band = (age >= 18) & (age <= _PLAN_5_MAX_AGE)
+
+    # Reported PAYE repayers identify the active stock directly.
+    plan[has_repayments & plan_1_cohort] = "PLAN_1"
+    plan[has_repayments & plan_5_cohort] = "PLAN_5"
+    plan[has_repayments & (plan == "NONE")] = "PLAN_2"
+
+    # Impute missing below-threshold holders so the total England stock matches
+    # the SLC liable-to-repay series, using the observed repayer stock as the
+    # starting point rather than the official above-threshold count.
+    plan_5_target = slc_data["plan_5"]["liable"].get(year, 0)
+    plan_5_shortfall = max(
+        0.0,
+        plan_5_target - _weighted_count((plan == "PLAN_5") & is_england, person_weight),
+    )
+    plan_5_eligible = (
+        (plan == "NONE") & is_england & is_tertiary & plan_5_age_band & plan_5_cohort
+    )
+    _assign_probabilistically(
+        plan,
+        plan_5_eligible,
+        person_weight,
+        plan_5_shortfall,
+        "PLAN_5",
+        rng,
+    )
+
+    plan_2_target = slc_data["plan_2"]["liable"].get(year, 0)
+    plan_2_shortfall = max(
+        0.0,
+        plan_2_target - _weighted_count((plan == "PLAN_2") & is_england, person_weight),
+    )
+    plan_2_eligible = (plan == "NONE") & is_england & is_tertiary & plan_2_age_band
+    _assign_probabilistically(
+        plan,
+        plan_2_eligible,
+        person_weight,
+        plan_2_shortfall,
+        "PLAN_2",
+        rng,
+    )
+
+    return plan
 
 
 def impute_student_loan_plan(
     dataset: UKSingleYearDataset,
     year: int = 2025,
+    seed: int = 42,
+    slc_data: dict | None = None,
 ) -> UKSingleYearDataset:
     """
     Impute student loan plan type based on age and reported repayments.
@@ -34,45 +134,22 @@ def impute_student_loan_plan(
     - PLAN_5: £25,000 (2025), Sept 2023 onwards
 
     Args:
-        dataset: PolicyEngine UK dataset with student_loan_repayments.
-        year: The simulation year, used to estimate university attendance.
-
-    Returns:
-        Dataset with imputed student_loan_plan values.
+        dataset: PolicyEngine UK dataset with student loan inputs.
+        year: Simulation year, used to estimate university start cohorts.
+        seed: Random seed for reproducible below-threshold assignment.
+        slc_data: Optional override for the SLC borrower snapshot.
     """
     dataset = dataset.copy()
     sim = Microsimulation(dataset=dataset)
-
-    # Get required variables
-    age = sim.calculate("age").values
-    student_loan_repayments = sim.calculate("student_loan_repayments").values
-
-    # Determine if person has a student loan based on reported repayments
-    has_student_loan = student_loan_repayments > 0
-
-    # Estimate when they started university (assume age 18)
-    # For simulation year Y and age A, university start year = Y - A + 18
-    estimated_uni_start_year = year - age + 18
-
-    # Assign plan types based on when loan system changed
-    # StudentLoanPlan is a string enum: "NONE", "PLAN_1", "PLAN_2", "PLAN_4", "PLAN_5"
-    plan = np.full(len(age), "NONE", dtype=object)
-
-    # Plan 1: Started before September 2012
-    plan_1_mask = has_student_loan & (estimated_uni_start_year < 2012)
-    plan[plan_1_mask] = "PLAN_1"
-
-    # Plan 2: Started September 2012 - August 2023
-    plan_2_mask = has_student_loan & (
-        (estimated_uni_start_year >= 2012) & (estimated_uni_start_year < 2023)
+    dataset.person["student_loan_plan"] = _impute_student_loan_plan_values(
+        age=sim.calculate("age").values,
+        student_loan_repayments=sim.calculate("student_loan_repayments").values,
+        country=sim.calculate("country", map_to="person").values,
+        highest_education=sim.calculate("highest_education").values,
+        person_weight=sim.calculate("person_weight").values,
+        year=year,
+        seed=seed,
+        slc_data=slc_data,
     )
-    plan[plan_2_mask] = "PLAN_2"
-
-    # Plan 5: Started September 2023 onwards
-    plan_5_mask = has_student_loan & (estimated_uni_start_year >= 2023)
-    plan[plan_5_mask] = "PLAN_5"
-
-    # Store as the plan type
-    dataset.person["student_loan_plan"] = plan
 
     return dataset
diff --git a/policyengine_uk_data/targets/build_loss_matrix.py b/policyengine_uk_data/targets/build_loss_matrix.py
index e92ecbc1b..35827bc29 100644
--- a/policyengine_uk_data/targets/build_loss_matrix.py
+++ b/policyengine_uk_data/targets/build_loss_matrix.py
@@ -40,6 +40,7 @@
     compute_scotland_uc_child,
     compute_scottish_child_payment,
     compute_student_loan_plan,
+    compute_student_loan_plan_liable,
     compute_ss_contributions,
     compute_ss_headcount,
     compute_ss_it_relief,
@@ -316,8 +317,10 @@ def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray |
         return compute_scottish_child_payment(target, ctx)
 
     # Student loan plan borrower counts (SLC)
-    if name.startswith("slc/plan_"):
+    if name.startswith("slc/plan_") and "above_threshold" in name:
         return compute_student_loan_plan(target, ctx)
+    if name.startswith("slc/plan_") and "liable" in name:
+        return compute_student_loan_plan_liable(target, ctx)
 
     # PIP claimants
     if name in (
diff --git a/policyengine_uk_data/targets/compute/__init__.py b/policyengine_uk_data/targets/compute/__init__.py
index b9fe37643..a87cf8143 100644
--- a/policyengine_uk_data/targets/compute/__init__.py
+++ b/policyengine_uk_data/targets/compute/__init__.py
@@ -40,6 +40,7 @@
     compute_savings_interest,
     compute_scottish_child_payment,
     compute_student_loan_plan,
+    compute_student_loan_plan_liable,
     compute_vehicles,
 )
 
@@ -61,6 +62,7 @@
     "compute_scotland_uc_child",
     "compute_scottish_child_payment",
     "compute_student_loan_plan",
+    "compute_student_loan_plan_liable",
     "compute_ss_contributions",
     "compute_ss_headcount",
     "compute_ss_it_relief",
diff --git a/policyengine_uk_data/targets/compute/other.py b/policyengine_uk_data/targets/compute/other.py
index 0bd02cb7f..10314fe03 100644
--- a/policyengine_uk_data/targets/compute/other.py
+++ b/policyengine_uk_data/targets/compute/other.py
@@ -1,20 +1,7 @@
-"""Miscellaneous compute functions (vehicles, housing, savings, SCP,
-student loans)."""
+"""Miscellaneous compute functions (vehicles, housing, savings, SCP, student loans)."""
 
 import numpy as np
 
-_ENGLAND_REGIONS = {
-    "NORTH_EAST",
-    "NORTH_WEST",
-    "YORKSHIRE",
-    "EAST_MIDLANDS",
-    "WEST_MIDLANDS",
-    "EAST_OF_ENGLAND",
-    "LONDON",
-    "SOUTH_EAST",
-    "SOUTH_WEST",
-}
-
 
 def compute_vehicles(target, ctx) -> np.ndarray:
     """Compute vehicle ownership targets."""
@@ -78,9 +65,24 @@ def compute_student_loan_plan(target, ctx) -> np.ndarray:
     else:
         return None
 
-    plan = ctx.sim.calculate("student_loan_plan").values
-    region = ctx.sim.calculate("region", map_to="person").values
-    is_england = np.isin(region, list(_ENGLAND_REGIONS))
-    on_plan = (plan == plan_value) & is_england
+    plan = ctx.pe_person("student_loan_plan")
+    repayments = ctx.pe_person("student_loan_repayments")
+    on_plan = (plan == plan_value) & (ctx.country == "ENGLAND") & (repayments > 0)
+
+    return ctx.household_from_person(on_plan.astype(float))
+
+
+def compute_student_loan_plan_liable(target, ctx) -> np.ndarray:
+    """Count all England borrowers on a given plan, including non-repayers."""
+    plan_name = target.name  # e.g. "slc/plan_2_borrowers_liable"
+    if "plan_2" in plan_name:
+        plan_value = "PLAN_2"
+    elif "plan_5" in plan_name:
+        plan_value = "PLAN_5"
+    else:
+        return None
+
+    plan = ctx.pe_person("student_loan_plan")
+    on_plan = (plan == plan_value) & (ctx.country == "ENGLAND")
 
     return ctx.household_from_person(on_plan.astype(float))
diff --git a/policyengine_uk_data/targets/sources/slc.py b/policyengine_uk_data/targets/sources/slc.py
index d49e35bdb..08689c41e 100644
--- a/policyengine_uk_data/targets/sources/slc.py
+++ b/policyengine_uk_data/targets/sources/slc.py
@@ -1,8 +1,10 @@
 """Student Loans Company (SLC) calibration targets.
 
-Borrower counts for England only: Plan 2 and Plan 5, restricted to
-borrowers liable to repay and earning above the repayment threshold.
-This matches the FRS coverage (PAYE deductions only).
+Borrower counts for England only: Plan 2 and Plan 5.
+
+Two target types are exposed:
+- `above_threshold`: borrowers liable to repay and earning above threshold
+- `liable`: all borrowers liable to repay, including below-threshold holders
 
 Source: Explore Education Statistics — Student loan forecasts for England,
 Table 6a: Forecast number of student borrowers liable to repay and number
@@ -17,9 +19,10 @@
 import json
 import os
 import re
-import requests
 from functools import lru_cache
 
+import requests
+
 from policyengine_uk_data.targets.schema import Target, Unit
 
 _PERMALINK_ID = "6ff75517-7124-487c-cb4e-08de6eccf22d"
@@ -29,33 +32,62 @@
 )
 _TESTING_DATA = {
     "plan_2": {
-        2025: 3_670_000,
-        2026: 4_130_000,
-        2027: 4_480_000,
-        2028: 4_700_000,
-        2029: 4_820_000,
-        2030: 4_870_000,
+        "above_threshold": {
+            2025: 3_985_000,
+            2026: 4_460_000,
+            2027: 4_825_000,
+            2028: 5_045_000,
+            2029: 5_160_000,
+            2030: 5_205_000,
+        },
+        "liable": {
+            2025: 8_940_000,
+            2026: 9_710_000,
+            2027: 10_360_000,
+            2028: 10_615_000,
+            2029: 10_600_000,
+            2030: 10_525_000,
+        },
     },
     "plan_5": {
-        2026: 25_000,
-        2027: 115_000,
-        2028: 340_000,
-        2029: 700_000,
-        2030: 1_140_000,
+        "above_threshold": {
+            2026: 35_000,
+            2027: 145_000,
+            2028: 390_000,
+            2029: 770_000,
+            2030: 1_235_000,
+        },
+        "liable": {
+            2025: 10_000,
+            2026: 230_000,
+            2027: 630_000,
+            2028: 1_380_000,
+            2029: 2_360_000,
+            2030: 3_400_000,
+        },
     },
 }
 
 
+def get_snapshot_data() -> dict:
+    """Return a fresh copy of the SLC snapshot for tests and deterministic builds."""
+    return {
+        plan: {
+            target_type: values.copy() for target_type, values in target_data.items()
+        }
+        for plan, target_data in _TESTING_DATA.items()
+    }
+
+
 @lru_cache(maxsize=1)
 def _fetch_slc_data() -> dict:
     """Fetch and parse SLC Table 6a data from Explore Education Statistics.
 
     Returns:
-        Dict with keys 'plan_2' and 'plan_5', each containing a dict
-        mapping calendar year (int) to borrower count above threshold (int).
+        Nested dict of plan -> target type -> year -> count.
     """
     if os.environ.get("TESTING", "0") == "1":
-        return _TESTING_DATA
+        return get_snapshot_data()
 
     response = requests.get(_PERMALINK_URL, timeout=30)
     response.raise_for_status()
@@ -75,59 +107,62 @@ def _fetch_slc_data() -> dict:
     # Structure: Plan 2 (6 years), Plan 5 (6 years), Plan 3 (5 years)
     header_row = table_json["thead"][1]
 
-    # Get Plan 2 years (first 6 columns)
     plan_2_years = []
     for i in range(6):
         year_text = header_row[i]["text"]  # e.g., "2029-30"
         start_year = int(year_text.split("-")[0])
-        calendar_year = start_year + 1  # 2029-30 → 2030
-        plan_2_years.append(calendar_year)
+        plan_2_years.append(start_year + 1)  # 2029-30 → 2030
 
-    # Get Plan 5 years (next 6 columns)
     plan_5_years = []
     for i in range(6, 12):
         year_text = header_row[i]["text"]
         start_year = int(year_text.split("-")[0])
-        calendar_year = start_year + 1
-        plan_5_years.append(calendar_year)
+        plan_5_years.append(start_year + 1)
 
-    # Find the "Higher education total" / "earning above threshold" row
-    # This is the row following "Higher education total" with "liable to repay"
     tbody = table_json["tbody"]
-
-    # Row 11 contains: header + 6 Plan 2 values + 6 Plan 5 values + 5 Plan 3
-    target_row = None
-    for row in tbody:
+    liable_row = None
+    above_threshold_row = None
+    for index, row in enumerate(tbody):
         header_text = row[0].get("text", "")
-        if "earning above repayment threshold" in header_text:
-            # Check if previous context was "Higher education total"
-            # Actually, row 11 is after HE total row 10, and starts with
-            # the "earning above" header (no group header due to rowSpan)
-            target_row = row
+        if header_text == "Higher education total":
+            liable_row = row
+            if index + 1 < len(tbody):
+                next_row = tbody[index + 1]
+                next_header = next_row[0].get("text", "")
+                if "earning above repayment threshold" in next_header:
+                    above_threshold_row = next_row
             break
 
-    if target_row is None:
+    if liable_row is None:
+        raise ValueError("Could not find 'Higher education total' row")
+    if above_threshold_row is None:
         raise ValueError("Could not find 'earning above threshold' row")
 
-    # Parse Plan 2 data (cells 1-6, mapping to plan_2_years)
-    plan_2_data = {}
-    for i, year in enumerate(plan_2_years):
-        cell_idx = 1 + i  # Skip header cell
-        value_text = target_row[cell_idx].get("text", "")
-        if value_text and value_text not in ("no data", "0"):
-            value = int(value_text.replace(",", ""))
-            plan_2_data[year] = value
-
-    # Parse Plan 5 data (cells 7-12, mapping to plan_5_years)
-    plan_5_data = {}
-    for i, year in enumerate(plan_5_years):
-        cell_idx = 7 + i  # Skip header + Plan 2 cells
-        value_text = target_row[cell_idx].get("text", "")
-        if value_text and value_text not in ("no data", "0"):
-            value = int(value_text.replace(",", ""))
-            plan_5_data[year] = value
-
-    return {"plan_2": plan_2_data, "plan_5": plan_5_data}
+    def parse_values(row, start_index, years):
+        data = {}
+        for offset, year in enumerate(years):
+            cell_idx = start_index + offset
+            if cell_idx >= len(row):
+                continue
+            value_text = row[cell_idx].get("text", "")
+            if value_text and value_text not in ("no data", "0"):
+                data[year] = int(value_text.replace(",", ""))
+        return data
+
+    return {
+        "plan_2": {
+            "above_threshold": parse_values(
+                above_threshold_row, start_index=1, years=plan_2_years
+            ),
+            "liable": parse_values(liable_row, start_index=2, years=plan_2_years),
+        },
+        "plan_5": {
+            "above_threshold": parse_values(
+                above_threshold_row, start_index=7, years=plan_5_years
+            ),
+            "liable": parse_values(liable_row, start_index=8, years=plan_5_years),
+        },
+    }
 
 
 def get_targets() -> list[Target]:
@@ -136,28 +171,21 @@ def get_targets() -> list[Target]:
 
     targets = []
 
-    targets.append(
-        Target(
-            name="slc/plan_2_borrowers_above_threshold",
-            variable="student_loan_plan",
-            source="slc",
-            unit=Unit.COUNT,
-            is_count=True,
-            values=slc_data["plan_2"],
-            reference_url=_PERMALINK_URL,
-        )
-    )
-
-    targets.append(
-        Target(
-            name="slc/plan_5_borrowers_above_threshold",
-            variable="student_loan_plan",
-            source="slc",
-            unit=Unit.COUNT,
-            is_count=True,
-            values=slc_data["plan_5"],
-            reference_url=_PERMALINK_URL,
-        )
-    )
+    for plan, plan_label in (("plan_2", "2"), ("plan_5", "5")):
+        for target_type, suffix in (
+            ("above_threshold", "above_threshold"),
+            ("liable", "liable"),
+        ):
+            targets.append(
+                Target(
+                    name=f"slc/plan_{plan_label}_borrowers_{suffix}",
+                    variable="student_loan_plan",
+                    source="slc",
+                    unit=Unit.COUNT,
+                    is_count=True,
+                    values=slc_data[plan][target_type],
+                    reference_url=_PERMALINK_URL,
+                )
+            )
 
     return targets
diff --git a/policyengine_uk_data/tests/test_student_loan_plan.py b/policyengine_uk_data/tests/test_student_loan_plan.py
index ddbfd419b..b8d73d903 100644
--- a/policyengine_uk_data/tests/test_student_loan_plan.py
+++ b/policyengine_uk_data/tests/test_student_loan_plan.py
@@ -1,44 +1,88 @@
 """Tests for student loan plan imputation."""
 
 import numpy as np
-import pytest
 
 
-def test_student_loan_plan_imputation_logic():
-    """Test the plan assignment logic based on university start year."""
-    # Test data: (age, year, expected_uni_start, expected_plan)
-    # Plan 1: pre-2012, Plan 2: 2012-2022, Plan 5: 2023+
+def test_repaying_borrowers_are_assigned_expected_plans():
+    """Repayers should map to the expected plan cohorts."""
+    from policyengine_uk_data.datasets.imputations.student_loans import (
+        _impute_student_loan_plan_values,
+    )
+
+    plans = _impute_student_loan_plan_values(
+        age=np.array([40, 30, 20]),
+        student_loan_repayments=np.array([100.0, 100.0, 100.0]),
+        country=np.array(["ENGLAND", "ENGLAND", "ENGLAND"]),
+        highest_education=np.array(["TERTIARY", "TERTIARY", "TERTIARY"]),
+        person_weight=np.ones(3),
+        year=2025,
+        slc_data={"plan_2": {"liable": {2025: 1}}, "plan_5": {"liable": {2025: 1}}},
+    )
+
+    assert plans.tolist() == ["PLAN_1", "PLAN_2", "PLAN_5"]
+
 
-    year = 2025
+def test_below_threshold_imputation_uses_liable_shortfall():
+    """Missing holders should be imputed from the liable target shortfall."""
+    from policyengine_uk_data.datasets.imputations.student_loans import (
+        _impute_student_loan_plan_values,
+    )
 
-    # Age 40 in 2025 -> started uni ~2003 -> Plan 1
-    age_40_uni_year = year - 40 + 18  # = 2003
-    assert age_40_uni_year < 2012, "Age 40 should be Plan 1"
+    plans = _impute_student_loan_plan_values(
+        age=np.array([40, 30, 20, 30, 30, 30]),
+        student_loan_repayments=np.array([100.0, 100.0, 0.0, 0.0, 0.0, 0.0]),
+        country=np.array(
+            ["ENGLAND", "ENGLAND", "ENGLAND", "ENGLAND", "WALES", "ENGLAND"]
+        ),
+        highest_education=np.array(
+            ["POST_SECONDARY", "TERTIARY", "TERTIARY", "TERTIARY", "TERTIARY", "GCSE"]
+        ),
+        person_weight=np.ones(6),
+        year=2025,
+        slc_data={
+            "plan_2": {"liable": {2025: 2}},
+            "plan_5": {"liable": {2025: 1}},
+        },
+    )
 
-    # Age 30 in 2025 -> started uni ~2013 -> Plan 2
-    age_30_uni_year = year - 30 + 18  # = 2013
-    assert 2012 <= age_30_uni_year < 2023, "Age 30 should be Plan 2"
+    assert plans.tolist() == [
+        "PLAN_1",
+        "PLAN_2",
+        "PLAN_5",
+        "PLAN_2",
+        "NONE",
+        "NONE",
+    ]
 
-    # Age 25 in 2025 -> started uni ~2018 -> Plan 2
-    age_25_uni_year = year - 25 + 18  # = 2018
-    assert 2012 <= age_25_uni_year < 2023, "Age 25 should be Plan 2"
 
-    # Age 20 in 2025 -> started uni ~2023 -> Plan 5
-    age_20_uni_year = year - 20 + 18  # = 2023
-    assert age_20_uni_year >= 2023, "Age 20 should be Plan 5"
+def test_plan5_assignment_has_priority_over_plan2_for_recent_cohort():
+    """Recent cohorts should stay on Plan 5 rather than being swallowed by Plan 2."""
+    from policyengine_uk_data.datasets.imputations.student_loans import (
+        _impute_student_loan_plan_values,
+    )
+
+    plans = _impute_student_loan_plan_values(
+        age=np.array([21]),
+        student_loan_repayments=np.array([0.0]),
+        country=np.array(["ENGLAND"]),
+        highest_education=np.array(["TERTIARY"]),
+        person_weight=np.ones(1),
+        year=2026,
+        slc_data={
+            "plan_2": {"liable": {2026: 1}},
+            "plan_5": {"liable": {2026: 1}},
+        },
+    )
 
-    # Age 18 in 2025 -> started uni ~2025 -> Plan 5
-    age_18_uni_year = year - 18 + 18  # = 2025
-    assert age_18_uni_year >= 2023, "Age 18 should be Plan 5"
+    assert plans.tolist() == ["PLAN_5"]
 
 
 def test_student_loan_plan_enum_values():
-    """Test that plan enum values match policyengine-uk's string enum."""
+    """Student-loan plan strings should still match policyengine-uk's enum."""
     from policyengine_uk.variables.gov.hmrc.student_loans.student_loan_plan import (
         StudentLoanPlan,
     )
 
-    # Verify our assumptions about enum values (string-based enum)
     assert StudentLoanPlan.NONE.value == "NONE"
     assert StudentLoanPlan.PLAN_1.value == "PLAN_1"
     assert StudentLoanPlan.PLAN_2.value == "PLAN_2"
diff --git a/policyengine_uk_data/tests/test_student_loan_targets.py b/policyengine_uk_data/tests/test_student_loan_targets.py
index 3e9da19c5..b10e3aab6 100644
--- a/policyengine_uk_data/tests/test_student_loan_targets.py
+++ b/policyengine_uk_data/tests/test_student_loan_targets.py
@@ -1,5 +1,10 @@
 """Tests for SLC student loan calibration targets."""
 
+import json
+from types import SimpleNamespace
+
+import numpy as np
+
 
 def test_slc_targets_registered():
     """SLC targets appear in the target registry."""
@@ -8,28 +13,37 @@ def test_slc_targets_registered():
     targets = {t.name: t for t in get_all_targets()}
     assert "slc/plan_2_borrowers_above_threshold" in targets
     assert "slc/plan_5_borrowers_above_threshold" in targets
+    assert "slc/plan_2_borrowers_liable" in targets
+    assert "slc/plan_5_borrowers_liable" in targets
 
 
-def test_slc_plan2_values():
-    """Plan 2 target values match SLC Table 6a."""
+def test_slc_snapshot_values_match_higher_education_total_rows():
+    """Snapshot values should match the HE-total borrower rows."""
     from policyengine_uk_data.targets.registry import get_all_targets
 
     targets = {t.name: t for t in get_all_targets()}
-    p2 = targets["slc/plan_2_borrowers_above_threshold"]
-    assert p2.values[2025] == 3_670_000
-    assert p2.values[2026] == 4_130_000
-    assert p2.values[2029] == 4_820_000
+
+    assert targets["slc/plan_2_borrowers_above_threshold"].values[2025] == 3_985_000
+    assert targets["slc/plan_2_borrowers_above_threshold"].values[2030] == 5_205_000
+    assert targets["slc/plan_2_borrowers_liable"].values[2025] == 8_940_000
+    assert targets["slc/plan_2_borrowers_liable"].values[2030] == 10_525_000
+
+    assert 2025 not in targets["slc/plan_5_borrowers_above_threshold"].values
+    assert targets["slc/plan_5_borrowers_above_threshold"].values[2026] == 35_000
+    assert targets["slc/plan_5_borrowers_above_threshold"].values[2030] == 1_235_000
+    assert targets["slc/plan_5_borrowers_liable"].values[2025] == 10_000
+    assert targets["slc/plan_5_borrowers_liable"].values[2030] == 3_400_000
 
 
-def test_slc_plan5_values():
-    """Plan 5 target values match SLC Table 6a."""
+def test_liable_targets_exceed_above_threshold_targets():
+    """Liable counts should exceed above-threshold counts in the same year."""
     from policyengine_uk_data.targets.registry import get_all_targets
 
     targets = {t.name: t for t in get_all_targets()}
-    p5 = targets["slc/plan_5_borrowers_above_threshold"]
-    assert 2025 not in p5.values  # no Plan 5 borrowers yet in 2024-25
-    assert p5.values[2026] == 25_000
-    assert p5.values[2029] == 700_000
+    for year, count in targets["slc/plan_2_borrowers_above_threshold"].values.items():
+        assert targets["slc/plan_2_borrowers_liable"].values[year] > count
+    for year, count in targets["slc/plan_5_borrowers_above_threshold"].values.items():
+        assert targets["slc/plan_5_borrowers_liable"].values[year] > count
 
 
 def test_slc_testing_mode_uses_snapshot_without_network(monkeypatch):
@@ -44,6 +58,98 @@ def fail_network(*args, **kwargs):
 
     monkeypatch.setattr(slc.requests, "get", fail_network)
 
-    assert slc._fetch_slc_data() == slc._TESTING_DATA
+    assert slc._fetch_slc_data() == slc.get_snapshot_data()
+    slc._fetch_slc_data.cache_clear()
+
+
+def test_slc_parser_uses_higher_education_total_rows(monkeypatch):
+    """Parser should read HE-total rows, not the first matching above-threshold row."""
+    from policyengine_uk_data.targets.sources import slc
+
+    table_json = {
+        "thead": [
+            [],
+            [{"text": "2024-25"}] * 6 + [{"text": "2024-25"}] * 6,
+        ],
+        "tbody": [
+            [{"text": "Higher education full-time"}, {"text": "liable"}]
+            + [{"text": "1,000"}] * 12,
+            [
+                {
+                    "text": "Number of borrowers liable to repay and earning above repayment threshold"
+                }
+            ]
+            + [{"text": "100"}] * 12,
+            [{"text": "Higher education total"}, {"text": "liable"}]
+            + [{"text": "8,940,000"}] * 6
+            + [{"text": "10,000"}] * 6,
+            [
+                {
+                    "text": "Number of borrowers liable to repay and earning above repayment threshold"
+                }
+            ]
+            + [{"text": "3,985,000"}] * 6
+            + [{"text": "35,000"}] * 6,
+        ],
+    }
+    html = (
+        '<script id="__NEXT_DATA__" type="application/json">'
+        + json.dumps(
+            {"props": {"pageProps": {"data": {"table": {"json": table_json}}}}}
+        )
+        + "</script>"
+    )
+
+    class DummyResponse:
+        text = html
+
+        @staticmethod
+        def raise_for_status():
+            return None
 
     slc._fetch_slc_data.cache_clear()
+    monkeypatch.delenv("TESTING", raising=False)
+    monkeypatch.setattr(slc.requests, "get", lambda *args, **kwargs: DummyResponse())
+
+    data = slc._fetch_slc_data()
+    assert data["plan_2"]["liable"][2025] == 8_940_000
+    assert data["plan_2"]["above_threshold"][2025] == 3_985_000
+    assert data["plan_5"]["liable"][2025] == 10_000
+    assert data["plan_5"]["above_threshold"][2025] == 35_000
+
+    slc._fetch_slc_data.cache_clear()
+
+
+def test_student_loan_target_compute_distinguishes_liable_from_repaying():
+    """Above-threshold counts should require repayments, while liable counts should not."""
+    from policyengine_uk_data.targets.compute.other import (
+        compute_student_loan_plan,
+        compute_student_loan_plan_liable,
+    )
+
+    class DummyCtx:
+        country = np.array(["ENGLAND", "ENGLAND", "WALES", "ENGLAND"])
+
+        @staticmethod
+        def pe_person(variable):
+            values = {
+                "student_loan_plan": np.array(["PLAN_2", "PLAN_2", "PLAN_2", "PLAN_5"]),
+                "student_loan_repayments": np.array([10.0, 0.0, 15.0, 0.0]),
+            }
+            return values[variable]
+
+        @staticmethod
+        def household_from_person(values):
+            return values
+
+    above_threshold = compute_student_loan_plan(
+        SimpleNamespace(name="slc/plan_2_borrowers_above_threshold"),
+        DummyCtx(),
+    )
+    liable = compute_student_loan_plan_liable(
+        SimpleNamespace(name="slc/plan_2_borrowers_liable"),
+        DummyCtx(),
+    )
+
+    assert above_threshold.tolist() == [1.0, 0.0, 0.0, 0.0]
+    assert liable.tolist() == [1.0, 1.0, 0.0, 0.0]

From 21b224320cbd4a5f093736c87105be70fdef427f Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Mon, 13 Apr 2026 09:37:52 -0400
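Subject: [PATCH 2/2] Fix student loan target entity mapping

Filter SLC borrower counts on person-level country rather than
ctx.country, and restrict below-threshold Plan 2 imputation to the
2012-2022 university start cohort.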

---
 .../datasets/imputations/student_loans.py     |  7 +++++-
 policyengine_uk_data/targets/compute/other.py |  6 +++--
 .../tests/test_student_loan_plan.py           | 22 +++++++++++++++++++
 .../tests/test_student_loan_targets.py        | 11 +++++++++-
 4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py
index 52e1cac2a..7a9f7f73c 100644
--- a/policyengine_uk_data/datasets/imputations/student_loans.py
+++ b/policyengine_uk_data/datasets/imputations/student_loans.py
@@ -71,6 +71,9 @@ def _impute_student_loan_plan_values(
     estimated_uni_start_year = year - age + 18
 
     plan_1_cohort = estimated_uni_start_year < 2012
+    plan_2_cohort = (estimated_uni_start_year >= 2012) & (
+        estimated_uni_start_year < 2023
+    )
     plan_5_cohort = estimated_uni_start_year >= 2023
     plan_2_age_band = (age >= _PLAN_2_MIN_AGE) & (age <= _PLAN_2_MAX_AGE)
     plan_5_age_band = (age >= 18) & (age <= _PLAN_5_MAX_AGE)
@@ -105,7 +108,9 @@ def _impute_student_loan_plan_values(
         0.0,
         plan_2_target - _weighted_count((plan == "PLAN_2") & is_england, person_weight),
     )
-    plan_2_eligible = (plan == "NONE") & is_england & is_tertiary & plan_2_age_band
+    plan_2_eligible = (
+        (plan == "NONE") & is_england & is_tertiary & plan_2_age_band & plan_2_cohort
+    )
     _assign_probabilistically(
         plan,
         plan_2_eligible,
diff --git a/policyengine_uk_data/targets/compute/other.py b/policyengine_uk_data/targets/compute/other.py
index 10314fe03..65151f2e4 100644
--- a/policyengine_uk_data/targets/compute/other.py
+++ b/policyengine_uk_data/targets/compute/other.py
@@ -67,7 +67,8 @@ def compute_student_loan_plan(target, ctx) -> np.ndarray:
 
     plan = ctx.pe_person("student_loan_plan")
     repayments = ctx.pe_person("student_loan_repayments")
-    on_plan = (plan == plan_value) & (ctx.country == "ENGLAND") & (repayments > 0)
+    person_country = ctx.sim.calculate("country", map_to="person").values
+    on_plan = (plan == plan_value) & (person_country == "ENGLAND") & (repayments > 0)
 
     return ctx.household_from_person(on_plan.astype(float))
 
@@ -83,6 +84,7 @@ def compute_student_loan_plan_liable(target, ctx) -> np.ndarray:
         return None
 
     plan = ctx.pe_person("student_loan_plan")
-    on_plan = (plan == plan_value) & (ctx.country == "ENGLAND")
+    person_country = ctx.sim.calculate("country", map_to="person").values
+    on_plan = (plan == plan_value) & (person_country == "ENGLAND")
 
     return ctx.household_from_person(on_plan.astype(float))
diff --git a/policyengine_uk_data/tests/test_student_loan_plan.py b/policyengine_uk_data/tests/test_student_loan_plan.py
index b8d73d903..d782f25f8 100644
--- a/policyengine_uk_data/tests/test_student_loan_plan.py
+++ b/policyengine_uk_data/tests/test_student_loan_plan.py
@@ -77,6 +77,28 @@ def test_plan5_assignment_has_priority_over_plan2_for_recent_cohort():
     assert plans.tolist() == ["PLAN_5"]
 
 
+def test_plan2_below_threshold_imputation_respects_estimated_cohort():
+    """Pre-2012 cohorts should not be assigned Plan 2 just because they fit the age band."""
+    from policyengine_uk_data.datasets.imputations.student_loans import (
+        _impute_student_loan_plan_values,
+    )
+
+    plans = _impute_student_loan_plan_values(
+        age=np.array([40]),  # est. start year 2025 - 40 + 18 = 2003 (pre-2012)
+        student_loan_repayments=np.array([0.0]),
+        country=np.array(["ENGLAND"]),
+        highest_education=np.array(["TERTIARY"]),
+        person_weight=np.ones(1),
+        year=2025,
+        slc_data={
+            "plan_2": {"liable": {2025: 1}},  # non-zero Plan 2 target
+            "plan_5": {"liable": {2025: 0}},
+        },
+    )
+
+    assert plans.tolist() == ["NONE"]
+
+
 def test_student_loan_plan_enum_values():
     """Student-loan plan strings should still match policyengine-uk's enum."""
     from policyengine_uk.variables.gov.hmrc.student_loans.student_loan_plan import (
diff --git a/policyengine_uk_data/tests/test_student_loan_targets.py b/policyengine_uk_data/tests/test_student_loan_targets.py
index b10e3aab6..fc9858f33 100644
--- a/policyengine_uk_data/tests/test_student_loan_targets.py
+++ b/policyengine_uk_data/tests/test_student_loan_targets.py
@@ -128,7 +128,16 @@ def test_student_loan_target_compute_distinguishes_liable_from_repaying():
     )
 
     class DummyCtx:
-        country = np.array(["ENGLAND", "ENGLAND", "WALES", "ENGLAND"])
+        country = np.array(["ENGLAND", "WALES"])
+
+        class sim:
+            @staticmethod
+            def calculate(variable, map_to=None):
+                if variable == "country" and map_to == "person":
+                    return SimpleNamespace(
+                        values=np.array(["ENGLAND", "ENGLAND", "WALES", "ENGLAND"])
+                    )
+                raise AssertionError(f"Unexpected calculate call: {variable}, {map_to}")
 
         @staticmethod
         def pe_person(variable):