Skip to content

Commit 59cd885

Browse files
authored
Merge pull request #339 from PolicyEngine/fix-maintenance-loan-targets
Add maintenance loan calibration targets
2 parents 57f8330 + 4c90dad commit 59cd885

10 files changed

Lines changed: 493 additions & 7 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add Student Loans Company maintenance-loan recipient-count and spend targets for England full-time undergraduates.

policyengine_uk_data/datasets/imputations/wealth.py

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
Survey (WAS) data.
77
"""
88

9+
import numpy as np
910
import pandas as pd
1011
from policyengine_uk_data.storage import STORAGE_FOLDER
1112
from policyengine_uk.data import UKSingleYearDataset
@@ -160,6 +161,93 @@ def _wealth_model_outputs_are_current(model: QRF) -> bool:
160161
return list(trained_outputs) == IMPUTE_VARIABLES
161162

162163

164+
def _person_column(person: pd.DataFrame, name: str, default) -> pd.Series:
165+
if name in person:
166+
return person[name]
167+
return pd.Series(default, index=person.index)
168+
169+
170+
def _allocate_student_loan_balance_to_people(
    household_balances: pd.Series,
    person: pd.DataFrame,
) -> np.ndarray:
    """
    Split household-level student loan balances across household members.

    `household_balances` is iterated as (household id, balance) pairs and each
    id is looked up among the `person_household_id` groups of `person`.
    Non-positive balances and ids with no matching person rows are skipped.

    If any member has positive `student_loan_repayments`, the balance is
    shared among those members in proportion to their repayments. Otherwise
    it is divided equally among the first non-empty group, in priority order:
    members with positive `student_loans`, members whose `highest_education`
    is "TERTIARY", members whose `current_education` is "TERTIARY", members
    aged 18 to 55 inclusive, and finally every member of the household.

    Optional person columns that are absent are replaced by defaults (0 for
    numeric columns, "NOT_IN_EDUCATION" / "UPPER_SECONDARY" for the education
    columns); unparseable or missing numeric values are treated as 0.

    Returns:
        Float array of per-person balances, positionally aligned with the
        rows of `person`.
    """
    balances = np.zeros(len(person), dtype=float)
    if not len(person):
        return balances

    def numeric(name: str) -> np.ndarray:
        # Coerce to numbers, mapping anything unparseable or missing to 0.
        column = _person_column(person, name, 0)
        return pd.to_numeric(column, errors="coerce").fillna(0).to_numpy()

    def label(name: str, default: str) -> np.ndarray:
        # Categorical columns are compared as plain strings below.
        column = _person_column(person, name, default)
        return column.fillna(default).astype(str).to_numpy()

    age = numeric("age")
    repayments = numeric("student_loan_repayments")
    reported_loans = numeric("student_loans")
    current_education = label("current_education", "NOT_IN_EDUCATION")
    highest_education = label("highest_education", "UPPER_SECONDARY")

    # Maps each household id to the positional row numbers of its members.
    positions_by_household = person.groupby("person_household_id").indices

    for household_id, household_balance in household_balances.items():
        if household_balance <= 0:
            continue
        if household_id not in positions_by_household:
            continue

        members = np.asarray(positions_by_household[household_id], dtype=int)
        member_repayments = repayments[members]
        is_repayer = member_repayments > 0

        # Current repayers take the balance pro rata to what they repay.
        if is_repayer.any() and np.sum(member_repayments[is_repayer]) > 0:
            shares = member_repayments[is_repayer]
            balances[members[is_repayer]] += household_balance * (
                shares / shares.sum()
            )
            continue

        member_age = age[members]
        candidate_masks = (
            is_repayer,
            reported_loans[members] > 0,
            highest_education[members] == "TERTIARY",
            current_education[members] == "TERTIARY",
            (member_age >= 18) & (member_age <= 55),
            # Last resort: everyone in the household.
            np.ones(len(members), dtype=bool),
        )
        holders = next(members[mask] for mask in candidate_masks if mask.any())
        balances[holders] += household_balance / len(holders)

    return balances
249+
250+
163251
def save_imputation_models():
164252
"""
165253
Train and save wealth imputation model.
@@ -213,7 +301,8 @@ def impute_wealth(dataset: UKSingleYearDataset) -> UKSingleYearDataset:
213301
dataset: PolicyEngine UK dataset to augment with wealth data.
214302
215303
Returns:
216-
Dataset with imputed wealth variables added to household table.
304+
Dataset with household wealth variables added to the household table and
305+
`student_loan_balance` allocated to people.
217306
"""
218307
dataset = dataset.copy()
219308

@@ -229,6 +318,12 @@ def impute_wealth(dataset: UKSingleYearDataset) -> UKSingleYearDataset:
229318
output_df = model.predict(input_df)
230319

231320
for column in output_df.columns:
321+
if column == "student_loan_balance":
322+
dataset.person[column] = _allocate_student_loan_balance_to_people(
323+
household_balances=output_df[column].clip(lower=0),
324+
person=dataset.person,
325+
)
326+
continue
232327
dataset.household[column] = output_df[column].values
233328

234329
dataset.validate()

policyengine_uk_data/targets/build_loss_matrix.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
compute_housing,
3232
compute_income_band,
3333
compute_land_value,
34+
compute_maintenance_loan,
3435
compute_regional_land_value,
3536
compute_obr_council_tax,
3637
compute_pip_claimants,
@@ -320,6 +321,8 @@ def _compute_column(target: Target, ctx: _SimContext, year: int) -> np.ndarray |
320321
# Student loan plan borrower counts (SLC)
321322
if name.startswith("slc/student_loan_repayment/"):
322323
return compute_student_loan_repayment(target, ctx)
324+
if name in ("slc/maintenance_loan_recipients", "slc/maintenance_loan_spend"):
325+
return compute_maintenance_loan(target, ctx)
323326
if name.startswith("slc/plan_") and "above_threshold" in name:
324327
return compute_student_loan_plan(target, ctx)
325328
if name.startswith("slc/plan_") and "liable" in name:

policyengine_uk_data/targets/compute/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
compute_ss_ni_relief,
3535
)
3636
from policyengine_uk_data.targets.compute.other import (
37+
compute_maintenance_loan,
3738
compute_housing,
3839
compute_land_value,
3940
compute_regional_land_value,
@@ -53,6 +54,7 @@
5354
"compute_household_type",
5455
"compute_housing",
5556
"compute_land_value",
57+
"compute_maintenance_loan",
5658
"compute_regional_land_value",
5759
"compute_income_band",
5860
"compute_obr_council_tax",

policyengine_uk_data/targets/compute/other.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,3 +126,13 @@ def compute_student_loan_repayment(target, ctx) -> np.ndarray:
126126
mask &= plan == plan_value
127127

128128
return ctx.household_from_person(repayments * mask)
129+
130+
131+
def compute_maintenance_loan(target, ctx) -> np.ndarray:
    """Build the household-level column for a maintenance-loan target.

    The person-level `maintenance_loan` values are read from `ctx` and
    aggregated to households with `ctx.household_from_person`:

    - "slc/maintenance_loan_spend" aggregates the loan amounts themselves.
    - "slc/maintenance_loan_recipients" aggregates a 0/1 indicator of a
      positive loan.

    Any other target name yields `None`.
    """
    loan_amounts = ctx.pe_person("maintenance_loan")
    metric = target.name

    if metric == "slc/maintenance_loan_spend":
        return ctx.household_from_person(loan_amounts)

    if metric == "slc/maintenance_loan_recipients":
        receives_loan = (loan_amounts > 0).astype(float)
        return ctx.household_from_person(receives_loan)

    return None

policyengine_uk_data/targets/sources/slc.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,19 @@
55
Three target types are exposed:
66
- `above_threshold`: borrowers liable to repay and earning above threshold
77
- `liable`: all borrowers liable to repay, including below-threshold holders
8+
- `maintenance_loan`: full-time undergraduate England maintenance-loan
9+
recipient counts and total amount paid
810
911
Source: Explore Education Statistics — Student loan forecasts for England,
1012
Table 6a: Forecast number of student borrowers liable to repay and number
1113
earning above repayment threshold, by product. We use the "Higher education
1214
total" row which sums HE full-time, HE part-time, and Advanced Learner loans.
1315
Academic year 20XX-YY maps to calendar year 20XX+1 (e.g., 2024-25 → 2025).
1416
17+
Maintenance-loan targets come from Student support for higher education in
18+
England 2025, Table 3A: Maintenance Loans paid to full-time undergraduate
19+
students. Academic year 20XX/YY maps to calendar year 20XX+1.
20+
1521
Data permalink:
1622
https://explore-education-statistics.service.gov.uk/data-tables/permalink/6ff75517-7124-487c-cb4e-08de6eccf22d
1723
"""
@@ -21,6 +27,7 @@
2127
import re
2228
from functools import lru_cache
2329

30+
import pandas as pd
2431
import requests
2532

2633
from policyengine_uk_data.targets.schema import Target, Unit
@@ -30,6 +37,10 @@
3037
f"https://explore-education-statistics.service.gov.uk"
3138
f"/data-tables/permalink/{_PERMALINK_ID}"
3239
)
40+
_MAINTENANCE_LOAN_URL = (
41+
"https://assets.publishing.service.gov.uk/media/"
42+
"691d9e662c6b98ecdbc5003f/slcsp052025.xlsx"
43+
)
3344
_TESTING_DATA = {
3445
"plan_2": {
3546
"above_threshold": {
@@ -68,6 +79,36 @@
6879
},
6980
},
7081
}
82+
_MAINTENANCE_LOAN_TESTING_DATA = {
83+
"recipients": {
84+
2014: 972_830,
85+
2015: 963_084,
86+
2016: 986_323,
87+
2017: 1_013_354,
88+
2018: 1_028_438,
89+
2019: 1_044_973,
90+
2020: 1_055_702,
91+
2021: 1_117_591,
92+
2022: 1_145_289,
93+
2023: 1_151_607,
94+
2024: 1_154_427,
95+
2025: 1_159_761,
96+
},
97+
"amount_paid": {
98+
2014: 3_783_626_551,
99+
2015: 3_784_628_482,
100+
2016: 3_996_708_360,
101+
2017: 4_870_158_274,
102+
2018: 5_746_431_691,
103+
2019: 6_555_506_426,
104+
2020: 7_113_141_652,
105+
2021: 7_914_340_039,
106+
2022: 8_332_837_845,
107+
2023: 8_594_103_415,
108+
2024: 8_881_701_387,
109+
2025: 8_591_659_718,
110+
},
111+
}
71112

72113

73114
def get_snapshot_data() -> dict:
@@ -80,6 +121,13 @@ def get_snapshot_data() -> dict:
80121
}
81122

82123

124+
def get_maintenance_loan_snapshot_data() -> dict:
    """Return a copy of the checked-in maintenance-loan snapshot.

    Each per-series year mapping is copied, so callers may modify the result
    without altering the module-level snapshot.
    """
    snapshot = {}
    for key, values in _MAINTENANCE_LOAN_TESTING_DATA.items():
        snapshot[key] = values.copy()
    return snapshot
129+
130+
83131
@lru_cache(maxsize=1)
84132
def _fetch_slc_data() -> dict:
85133
"""Fetch and parse SLC Table 6a data from Explore Education Statistics.
@@ -166,9 +214,62 @@ def parse_values(row, start_index, years):
166214
}
167215

168216

217+
def _row_contains_text(df: pd.DataFrame, row_index: int, text: str) -> bool:
218+
row = df.iloc[row_index].dropna()
219+
return any(str(value).strip() == text for value in row)
220+
221+
222+
def _find_row(df: pd.DataFrame, text: str, start: int = 0) -> int:
    """Return the position of the first row at or after `start` containing `text`.

    Raises:
        ValueError: If no row from `start` to the end of `df` contains `text`.
    """
    matching_rows = (
        row_index
        for row_index in range(start, len(df))
        if _row_contains_text(df, row_index, text)
    )
    first_match = next(matching_rows, None)
    if first_match is None:
        raise ValueError(f"Could not find row containing {text!r}")
    return first_match
227+
228+
229+
@lru_cache(maxsize=1)
def _fetch_maintenance_loan_data() -> dict:
    """Return England full-time maintenance-loan recipient counts and spend.

    When the `TESTING` environment variable is "1", the checked-in snapshot
    is returned and nothing is downloaded. Otherwise the "Table 3A" sheet of
    the workbook at `_MAINTENANCE_LOAN_URL` is read without a header row, and
    the "Grand total" rows under the recipient-count and amount-paid headers
    are extracted.

    Returns:
        Dict with "recipients" and "amount_paid" keys. Each maps a calendar
        year (start of the academic year plus one) to an integer: counts are
        scaled up from thousands and amounts from millions of pounds.

    Raises:
        ValueError: If a header or "Grand total" row cannot be found, or if
            no academic-year column labels are present.
    """
    if os.environ.get("TESTING", "0") == "1":
        return get_maintenance_loan_snapshot_data()

    sheet = pd.read_excel(_MAINTENANCE_LOAN_URL, sheet_name="Table 3A", header=None)

    # The code assumes each section is laid out as: header row, year-label
    # row directly below it, then data rows ending in a "Grand total" row.
    count_years_at = _find_row(sheet, "Number of students paid (000s) [27]") + 1
    count_total_at = _find_row(sheet, "Grand total", start=count_years_at + 1)

    amount_years_at = _find_row(sheet, "Amount paid (£m)") + 1
    amount_total_at = _find_row(sheet, "Grand total", start=amount_years_at + 1)

    # Year columns are taken from the count section only and reused for the
    # amount section — presumably both share the same column layout.
    academic_year = re.compile(r"\d{4}/\d{2}")
    year_by_column = {
        column: int(label[:4]) + 1
        for column, label in sheet.iloc[count_years_at].items()
        if isinstance(label, str) and academic_year.fullmatch(label)
    }
    if not year_by_column:
        raise ValueError("Could not find maintenance-loan year columns")

    recipients = {}
    amount_paid = {}
    for column, year in year_by_column.items():
        headcount = sheet.iloc[count_total_at, column]
        spend = sheet.iloc[amount_total_at, column]
        if not pd.isna(headcount):
            recipients[year] = int(round(float(headcount) * 1_000))
        if not pd.isna(spend):
            amount_paid[year] = int(round(float(spend) * 1_000_000))

    return {"recipients": recipients, "amount_paid": amount_paid}
267+
268+
169269
def get_targets() -> list[Target]:
170270
"""Generate SLC calibration targets by fetching live data."""
171271
slc_data = _fetch_slc_data()
272+
maintenance_loan_data = _fetch_maintenance_loan_data()
172273

173274
targets = []
174275

@@ -189,4 +290,26 @@ def get_targets() -> list[Target]:
189290
)
190291
)
191292

293+
targets.extend(
294+
[
295+
Target(
296+
name="slc/maintenance_loan_recipients",
297+
variable="maintenance_loan",
298+
source="slc",
299+
unit=Unit.COUNT,
300+
is_count=True,
301+
values=maintenance_loan_data["recipients"],
302+
reference_url=_MAINTENANCE_LOAN_URL,
303+
),
304+
Target(
305+
name="slc/maintenance_loan_spend",
306+
variable="maintenance_loan",
307+
source="slc",
308+
unit=Unit.GBP,
309+
values=maintenance_loan_data["amount_paid"],
310+
reference_url=_MAINTENANCE_LOAN_URL,
311+
),
312+
]
313+
)
314+
192315
return targets

0 commit comments

Comments
 (0)