Skip to content

Commit fd72fca

Browse files
authored
Impute below-threshold student loan holders (#332)
* Impute below-threshold student loan holders * Fix student loan target entity mapping
1 parent e743602 commit fd72fca

8 files changed

Lines changed: 484 additions & 183 deletions

File tree

changelog.d/281.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Impute below-threshold England student loan holders into the FRS base dataset and add SLC liable-to-repay calibration targets for Plans 2 and 5.
Lines changed: 132 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,133 @@
1-
"""
2-
Student loan plan imputation.
3-
4-
This module imputes the student_loan_plan variable based on:
5-
- Whether the person has reported student loan repayments
6-
- Their estimated university attendance year (inferred from age)
1+
"""Student loan plan imputation.
72
8-
The imputation assigns plan types according to when the loan system changed:
9-
- NONE: No reported repayments
10-
- PLAN_1: Started university before September 2012
11-
- PLAN_2: Started September 2012 - August 2023
12-
- PLAN_5: Started September 2023 onwards
3+
This module imputes `student_loan_plan` in two steps:
4+
- assign plans to people with reported PAYE student loan repayments
5+
- assign missing below-threshold holders to match SLC liable-to-repay totals
136
14-
This enables policyengine-uk's student_loan_repayment variable to calculate
15-
repayments using official threshold parameters.
7+
The FRS only observes active repayment through PAYE, so many England borrowers
8+
who hold a loan but earn below the repayment threshold are missing from the
9+
base dataset. We fill that stock using the checked-in SLC snapshot, restricting
10+
the new assignments to plausible England tertiary-education cohorts.
1611
"""
1712

1813
import numpy as np
19-
from policyengine_uk.data import UKSingleYearDataset
2014
from policyengine_uk import Microsimulation
15+
from policyengine_uk.data import UKSingleYearDataset
16+
17+
from policyengine_uk_data.targets.sources.slc import get_snapshot_data
18+
19+
_ENGLAND = "ENGLAND"
20+
_PLAN_2_MIN_AGE = 21
21+
_PLAN_2_MAX_AGE = 55
22+
_PLAN_5_MAX_AGE = 25
23+
24+
25+
def _weighted_count(mask: np.ndarray, weights: np.ndarray) -> float:
    """Return the total weight of the records selected by ``mask``.

    Args:
        mask: Boolean selector used to index ``weights``.
        weights: Per-record weights.

    Returns:
        The sum of the selected weights, converted to a built-in ``float``.
    """
    selected_weights = weights[mask]
    total = np.sum(selected_weights)
    return float(total)
27+
28+
29+
def _assign_probabilistically(
    plan: np.ndarray,
    eligible: np.ndarray,
    weights: np.ndarray,
    target_count: float,
    plan_name: str,
    rng: np.random.Generator,
) -> None:
    """Randomly label part of an eligible pool with ``plan_name``.

    Every eligible record is selected independently with the same
    probability, ``min(1, target_count / eligible weight)``, so the expected
    weighted number of new assignments equals ``target_count`` whenever the
    pool is large enough to supply it.

    Args:
        plan: Array of plan labels; modified in place.
        eligible: Boolean mask of records that may receive the plan.
        weights: Per-record weights used to size the eligible pool.
        target_count: Weighted number of records to assign.
        plan_name: Label written into ``plan`` for selected records.
        rng: Random generator that supplies the selection draws.

    Returns:
        None. ``plan`` is updated in place. Nothing is drawn from ``rng``
        when the target or the eligible weight is not positive.
    """
    pool_weight = _weighted_count(eligible, weights)
    if pool_weight <= 0 or target_count <= 0:
        return

    # Cap at 1.0: when the pool is smaller than the target, everyone eligible
    # is assigned.
    selection_share = min(1.0, target_count / pool_weight)

    # One uniform draw per record (eligible or not), so the draw array lines
    # up with the mask.
    uniform_draws = rng.random(len(plan))
    chosen = eligible & (uniform_draws < selection_share)
    plan[chosen] = plan_name
44+
45+
46+
def _impute_student_loan_plan_values(
    age: np.ndarray,
    student_loan_repayments: np.ndarray,
    country: np.ndarray,
    highest_education: np.ndarray,
    person_weight: np.ndarray,
    *,
    year: int,
    seed: int = 42,
    slc_data: dict | None = None,
) -> np.ndarray:
    """Build per-person ``student_loan_plan`` labels from raw arrays.

    The imputation runs in two passes:

    1. People with positive reported repayments get a plan from their
       estimated university start year (``year - age + 18``): before 2012 is
       ``PLAN_1``, 2023 onwards is ``PLAN_5``, and every other repayer is
       ``PLAN_2``.
    2. People still labelled ``NONE`` who are in England, hold tertiary
       education and fall in the matching cohort and age band are randomly
       assigned ``PLAN_5`` and then ``PLAN_2``, sized to close the gap between
       the SLC "liable" figure for ``year`` and the weighted England count
       already on that plan.

    Args:
        age: Person ages.
        student_loan_repayments: Reported repayments; values above zero mark
            an observed repayer.
        country: Person-level country labels, compared against ``"ENGLAND"``.
        highest_education: Education labels, compared against ``"TERTIARY"``.
        person_weight: Person weights, converted to float.
        year: Year used both for cohort estimation and for the SLC lookup.
        seed: Seed for the random generator used in the second pass.
        slc_data: Optional SLC snapshot; ``get_snapshot_data()`` is used when
            omitted. It is read as ``slc_data[plan]["liable"].get(year, 0)``.

    Returns:
        Object array of plan labels, one per person.
    """
    age = np.asarray(age)
    repayments = np.asarray(student_loan_repayments)
    country = np.asarray(country)
    highest_education = np.asarray(highest_education)
    person_weight = np.asarray(person_weight, dtype=float)
    if slc_data is None:
        slc_data = get_snapshot_data()

    rng = np.random.default_rng(seed)
    plan = np.full(len(age), "NONE", dtype=object)

    repays = repayments > 0
    in_england = country == _ENGLAND
    holds_degree = highest_education == "TERTIARY"

    # Cohorts assume university entry at age 18.
    start_year = year - age + 18
    started_before_2012 = start_year < 2012
    started_2012_to_2022 = (start_year >= 2012) & (start_year < 2023)
    started_2023_onwards = start_year >= 2023

    # Pass 1: observed PAYE repayers. Any repayer not caught by the Plan 1 or
    # Plan 5 cohort falls through to Plan 2.
    plan[repays & started_before_2012] = "PLAN_1"
    plan[repays & started_2023_onwards] = "PLAN_5"
    plan[repays & (plan == "NONE")] = "PLAN_2"

    # Pass 2: top up each plan towards the SLC liable-to-repay total, starting
    # from the observed repayer stock. Plan 5 must stay ahead of Plan 2 so the
    # random draws are consumed in the same order.
    england_graduates = in_england & holds_degree
    top_ups = (
        (
            "plan_5",
            "PLAN_5",
            england_graduates
            & (age >= 18)
            & (age <= _PLAN_5_MAX_AGE)
            & started_2023_onwards,
        ),
        (
            "plan_2",
            "PLAN_2",
            england_graduates
            & (age >= _PLAN_2_MIN_AGE)
            & (age <= _PLAN_2_MAX_AGE)
            & started_2012_to_2022,
        ),
    )
    for snapshot_key, label, candidate_pool in top_ups:
        official_total = slc_data[snapshot_key]["liable"].get(year, 0)
        already_counted = _weighted_count(
            (plan == label) & in_england, person_weight
        )
        shortfall = max(0.0, official_total - already_counted)
        # Only people without a plan so far can be newly assigned.
        unassigned_candidates = (plan == "NONE") & candidate_pool
        _assign_probabilistically(
            plan,
            unassigned_candidates,
            person_weight,
            shortfall,
            label,
            rng,
        )

    return plan
21124

22125

23126
def impute_student_loan_plan(
24127
dataset: UKSingleYearDataset,
25128
year: int = 2025,
129+
seed: int = 42,
130+
slc_data: dict | None = None,
26131
) -> UKSingleYearDataset:
27132
"""
28133
Impute student loan plan type based on age and reported repayments.
@@ -34,45 +139,22 @@ def impute_student_loan_plan(
34139
- PLAN_5: £25,000 (2025), Sept 2023 onwards
35140
36141
Args:
37-
dataset: PolicyEngine UK dataset with student_loan_repayments.
38-
year: The simulation year, used to estimate university attendance.
39-
40-
Returns:
41-
Dataset with imputed student_loan_plan values.
142+
dataset: PolicyEngine UK dataset with student loan inputs.
143+
year: Simulation year, used to estimate university start cohorts.
144+
seed: Random seed for reproducible below-threshold assignment.
145+
slc_data: Optional override for the SLC borrower snapshot.
42146
"""
43147
dataset = dataset.copy()
44148
sim = Microsimulation(dataset=dataset)
45-
46-
# Get required variables
47-
age = sim.calculate("age").values
48-
student_loan_repayments = sim.calculate("student_loan_repayments").values
49-
50-
# Determine if person has a student loan based on reported repayments
51-
has_student_loan = student_loan_repayments > 0
52-
53-
# Estimate when they started university (assume age 18)
54-
# For simulation year Y and age A, university start year = Y - A + 18
55-
estimated_uni_start_year = year - age + 18
56-
57-
# Assign plan types based on when loan system changed
58-
# StudentLoanPlan is a string enum: "NONE", "PLAN_1", "PLAN_2", "PLAN_4", "PLAN_5"
59-
plan = np.full(len(age), "NONE", dtype=object)
60-
61-
# Plan 1: Started before September 2012
62-
plan_1_mask = has_student_loan & (estimated_uni_start_year < 2012)
63-
plan[plan_1_mask] = "PLAN_1"
64-
65-
# Plan 2: Started September 2012 - August 2023
66-
plan_2_mask = has_student_loan & (
67-
(estimated_uni_start_year >= 2012) & (estimated_uni_start_year < 2023)
149+
dataset.person["student_loan_plan"] = _impute_student_loan_plan_values(
150+
age=sim.calculate("age").values,
151+
student_loan_repayments=sim.calculate("student_loan_repayments").values,
152+
country=sim.calculate("country", map_to="person").values,
153+
highest_education=sim.calculate("highest_education").values,
154+
person_weight=sim.calculate("person_weight").values,
155+
year=year,
156+
seed=seed,
157+
slc_data=slc_data,
68158
)
69-
plan[plan_2_mask] = "PLAN_2"
70-
71-
# Plan 5: Started September 2023 onwards
72-
plan_5_mask = has_student_loan & (estimated_uni_start_year >= 2023)
73-
plan[plan_5_mask] = "PLAN_5"
74-
75-
# Store as the plan type
76-
dataset.person["student_loan_plan"] = plan
77159

78160
return dataset

policyengine_uk_data/targets/build_loss_matrix.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
compute_scotland_uc_child,
4141
compute_scottish_child_payment,
4242
compute_student_loan_plan,
43+
compute_student_loan_plan_liable,
4344
compute_ss_contributions,
4445
compute_ss_headcount,
4546
compute_ss_it_relief,
@@ -316,8 +317,10 @@ def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray |
316317
return compute_scottish_child_payment(target, ctx)
317318

318319
# Student loan plan borrower counts (SLC)
319-
if name.startswith("slc/plan_"):
320+
if name.startswith("slc/plan_") and "above_threshold" in name:
320321
return compute_student_loan_plan(target, ctx)
322+
if name.startswith("slc/plan_") and "liable" in name:
323+
return compute_student_loan_plan_liable(target, ctx)
321324

322325
# PIP claimants
323326
if name in (

policyengine_uk_data/targets/compute/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
compute_savings_interest,
4141
compute_scottish_child_payment,
4242
compute_student_loan_plan,
43+
compute_student_loan_plan_liable,
4344
compute_vehicles,
4445
)
4546

@@ -61,6 +62,7 @@
6162
"compute_scotland_uc_child",
6263
"compute_scottish_child_payment",
6364
"compute_student_loan_plan",
65+
"compute_student_loan_plan_liable",
6466
"compute_ss_contributions",
6567
"compute_ss_headcount",
6668
"compute_ss_it_relief",

policyengine_uk_data/targets/compute/other.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,7 @@
1-
"""Miscellaneous compute functions (vehicles, housing, savings, SCP,
2-
student loans)."""
1+
"""Miscellaneous compute functions (vehicles, housing, savings, SCP, student loans)."""
32

43
import numpy as np
54

6-
_ENGLAND_REGIONS = {
7-
"NORTH_EAST",
8-
"NORTH_WEST",
9-
"YORKSHIRE",
10-
"EAST_MIDLANDS",
11-
"WEST_MIDLANDS",
12-
"EAST_OF_ENGLAND",
13-
"LONDON",
14-
"SOUTH_EAST",
15-
"SOUTH_WEST",
16-
}
17-
185

196
def compute_vehicles(target, ctx) -> np.ndarray:
207
"""Compute vehicle ownership targets."""
@@ -78,9 +65,26 @@ def compute_student_loan_plan(target, ctx) -> np.ndarray:
7865
else:
7966
return None
8067

81-
plan = ctx.sim.calculate("student_loan_plan").values
82-
region = ctx.sim.calculate("region", map_to="person").values
83-
is_england = np.isin(region, list(_ENGLAND_REGIONS))
84-
on_plan = (plan == plan_value) & is_england
68+
plan = ctx.pe_person("student_loan_plan")
69+
repayments = ctx.pe_person("student_loan_repayments")
70+
person_country = ctx.sim.calculate("country", map_to="person").values
71+
on_plan = (plan == plan_value) & (person_country == "ENGLAND") & (repayments > 0)
72+
73+
return ctx.household_from_person(on_plan.astype(float))
74+
75+
76+
def compute_student_loan_plan_liable(target, ctx) -> np.ndarray:
    """Count England borrowers on the target's plan, repaying or not.

    The plan is read from the target name (for example
    ``"slc/plan_2_borrowers_liable"``): names containing ``plan_2`` map to
    ``PLAN_2`` and names containing ``plan_5`` map to ``PLAN_5``.

    Args:
        target: Calibration target; only its ``name`` is read.
        ctx: Simulation context supplying person-level variables and the
            person-to-household aggregation.

    Returns:
        The result of ``ctx.household_from_person`` applied to a 0/1 person
        indicator, or ``None`` when the target name matches neither plan.
    """
    # Checked in order, so a name containing both tokens resolves to PLAN_2.
    known_plans = (("plan_2", "PLAN_2"), ("plan_5", "PLAN_5"))
    plan_value = next(
        (label for token, label in known_plans if token in target.name),
        None,
    )
    if plan_value is None:
        return None

    holds_plan = ctx.pe_person("student_loan_plan") == plan_value
    lives_in_england = (
        ctx.sim.calculate("country", map_to="person").values == "ENGLAND"
    )
    indicator = (holds_plan & lives_in_england).astype(float)
    return ctx.household_from_person(indicator)

0 commit comments

Comments
 (0)