"""Student loan plan imputation.

This module imputes `student_loan_plan` in two steps:
- assign plans to people with reported PAYE student loan repayments
- assign missing below-threshold holders to match SLC liable-to-repay totals

People with reported repayments are assigned a plan from their estimated
university start year (inferred from age, assuming a start at age 18):
- PLAN_1: Started university before September 2012
- PLAN_2: Started September 2012 - August 2023
- PLAN_5: Started September 2023 onwards
Everyone else stays NONE unless selected by the below-threshold step.

The FRS only observes active repayment through PAYE, so many England borrowers
who hold a loan but earn below the repayment threshold are missing from the
base dataset. We fill that stock using the checked-in SLC snapshot, restricting
the new assignments to plausible England tertiary-education cohorts.

This enables policyengine-uk's student_loan_repayment variable to calculate
repayments using official threshold parameters.
"""
1712
1813import numpy as np
19- from policyengine_uk .data import UKSingleYearDataset
2014from policyengine_uk import Microsimulation
15+ from policyengine_uk .data import UKSingleYearDataset
16+
17+ from policyengine_uk_data .targets .sources .slc import get_snapshot_data
18+
# Country label that marks a person as living in England; only these people
# are counted against, and topped up towards, the SLC liable-to-repay totals.
_ENGLAND = "ENGLAND"

# Inclusive age band in which a non-repayer may be imputed a Plan 2 loan.
_PLAN_2_MIN_AGE = 21
_PLAN_2_MAX_AGE = 55

# Inclusive upper age for imputing a Plan 5 loan to a non-repayer.
_PLAN_5_MAX_AGE = 25
23+
24+
def _weighted_count(mask: np.ndarray, weights: np.ndarray) -> float:
    """Return the summed weight of the records selected by ``mask``.

    Args:
        mask: Index (typically a boolean array) applied to ``weights``.
        weights: Per-record weights.

    Returns:
        The sum of the selected weights as a built-in ``float``.
    """
    selected_weights = weights[mask]
    total = np.sum(selected_weights)
    return float(total)
27+
28+
def _assign_probabilistically(
    plan: np.ndarray,
    eligible: np.ndarray,
    weights: np.ndarray,
    target_count: float,
    plan_name: str,
    rng: np.random.Generator,
) -> None:
    """Randomly label part of an eligible pool so its weight approaches a target.

    Every eligible record is selected independently with the same probability,
    ``target_count / (weighted size of the pool)`` capped at 1. The weighted
    number of assignments therefore equals ``target_count`` in expectation,
    not exactly.

    Args:
        plan: Array of plan labels; modified in place.
        eligible: Boolean mask over ``plan`` marking records that may be
            assigned.
        weights: Per-record weights used to size the eligible pool.
        target_count: Weighted number of records to assign. Nothing happens
            when it is zero or negative.
        plan_name: Label written into ``plan`` for the selected records.
        rng: Random generator supplying the uniform draws.
    """
    pool_weight = _weighted_count(eligible, weights)
    if target_count <= 0:
        return
    if pool_weight <= 0:
        return

    share = target_count / pool_weight
    probability = share if share < 1.0 else 1.0

    # One draw per record (eligible or not) keeps the random stream aligned
    # with the record order regardless of who is in the pool.
    selected = rng.random(len(plan)) < probability
    plan[eligible & selected] = plan_name
44+
45+
def _impute_student_loan_plan_values(
    age: np.ndarray,
    student_loan_repayments: np.ndarray,
    country: np.ndarray,
    highest_education: np.ndarray,
    person_weight: np.ndarray,
    *,
    year: int,
    seed: int = 42,
    slc_data: dict | None = None,
) -> np.ndarray:
    """Build the per-person student loan plan labels from raw arrays.

    The labels are produced in two stages:

    1. Anyone with positive repayments gets a plan from their estimated
       university start year (``year - age + 18``): before 2012 is PLAN_1,
       2023 or later is PLAN_5, anything else is PLAN_2.
    2. For PLAN_5 and then PLAN_2, the weighted number of England holders is
       compared with ``slc_data[<plan>]["liable"]`` for ``year``. Any shortfall
       is filled by randomly assigning the plan to people still labelled NONE
       who live in England, have tertiary education and fall in the plan's
       start-year cohort and age band.

    Args:
        age: Age of each person.
        student_loan_repayments: Reported repayments; values above zero mark
            an active repayer.
        country: Country label of each person.
        highest_education: Education label of each person.
        person_weight: Survey weight of each person.
        year: Year used to estimate start cohorts and to look up SLC targets.
        seed: Seed for the random generator used in stage 2.
        slc_data: Mapping with ``"plan_2"`` and ``"plan_5"`` entries, each
            holding a ``"liable"`` mapping from year to count. Loaded with
            ``get_snapshot_data()`` when omitted. A year missing from the
            mapping is treated as a target of zero.

    Returns:
        Object array with one of "NONE", "PLAN_1", "PLAN_2" or "PLAN_5" per
        person.
    """
    ages = np.asarray(age)
    repayment_values = np.asarray(student_loan_repayments)
    countries = np.asarray(country)
    education = np.asarray(highest_education)
    weights = np.asarray(person_weight, dtype=float)
    snapshot = slc_data if slc_data is not None else get_snapshot_data()

    rng = np.random.default_rng(seed)
    plan = np.full(len(ages), "NONE", dtype=object)

    repaying = repayment_values > 0
    in_england = countries == _ENGLAND
    tertiary = education == "TERTIARY"

    # Assume everyone started university at 18.
    start_year = year - ages + 18
    started_before_2012 = start_year < 2012
    started_from_2023 = start_year >= 2023
    started_2012_to_2022 = (start_year >= 2012) & (start_year < 2023)

    # Reported PAYE repayers identify the active stock directly. Repayers in
    # neither the pre-2012 nor the 2023+ cohort fall through to PLAN_2.
    repayer_rules = (
        ("PLAN_1", started_before_2012),
        ("PLAN_5", started_from_2023),
        ("PLAN_2", ~(started_before_2012 | started_from_2023)),
    )
    for label, cohort in repayer_rules:
        plan[repaying & cohort] = label

    # Impute missing below-threshold holders so the total England stock matches
    # the SLC liable-to-repay series, starting from the observed repayer stock.
    # PLAN_5 is handled before PLAN_2 so the random draws keep a fixed order.
    plan_5_pool = (ages >= 18) & (ages <= _PLAN_5_MAX_AGE) & started_from_2023
    plan_2_pool = (
        (ages >= _PLAN_2_MIN_AGE)
        & (ages <= _PLAN_2_MAX_AGE)
        & started_2012_to_2022
    )
    top_ups = (
        ("plan_5", "PLAN_5", plan_5_pool),
        ("plan_2", "PLAN_2", plan_2_pool),
    )
    for key, label, cohort_pool in top_ups:
        target = snapshot[key]["liable"].get(year, 0)
        already_held = _weighted_count((plan == label) & in_england, weights)
        shortfall = max(0.0, target - already_held)
        candidates = (plan == "NONE") & in_england & tertiary & cohort_pool
        _assign_probabilistically(
            plan,
            candidates,
            weights,
            shortfall,
            label,
            rng,
        )

    return plan
21124
22125
def impute_student_loan_plan(
    dataset: UKSingleYearDataset,
    year: int = 2025,
    seed: int = 42,
    slc_data: dict | None = None,
) -> UKSingleYearDataset:
    """
    Impute student loan plan type based on age and reported repayments.

    The work is done on ``dataset.copy()``. A ``Microsimulation`` built on
    that copy supplies the person-level inputs (age, student loan repayments,
    country, highest education and person weight), and the resulting labels
    are stored in the copy's ``person["student_loan_plan"]`` column.

    Plan thresholds are applied downstream and differ by plan, for example:
    - PLAN_5: £25,000 (2025), Sept 2023 onwards

    Args:
        dataset: PolicyEngine UK dataset with student loan inputs.
        year: Simulation year, used to estimate university start cohorts.
        seed: Random seed for reproducible below-threshold assignment.
        slc_data: Optional override for the SLC borrower snapshot.

    Returns:
        The copied dataset with imputed student_loan_plan values.
    """
    imputed = dataset.copy()
    simulation = Microsimulation(dataset=imputed)

    def person_values(variable: str, **options):
        """Calculate ``variable`` and return its raw values."""
        return simulation.calculate(variable, **options).values

    imputed.person["student_loan_plan"] = _impute_student_loan_plan_values(
        age=person_values("age"),
        student_loan_repayments=person_values("student_loan_repayments"),
        # Country is requested with map_to="person" so it lines up with the
        # other person-level arrays.
        country=person_values("country", map_to="person"),
        highest_education=person_values("highest_education"),
        person_weight=person_values("person_weight"),
        year=year,
        seed=seed,
        slc_data=slc_data,
    )

    return imputed
0 commit comments