66
77Source: Explore Education Statistics — Student loan forecasts for England,
88Table 6a: Forecast number of student borrowers liable to repay and number
9- earning above repayment threshold, by product. Figures are the sum of
10- higher education full-time, higher education part-time, and advanced
11- learner loan borrowers (Master's and Doctoral loans use Plan 3 and are
12- excluded). Academic year 20XX-YY maps to calendar year 20XX.
9+ earning above repayment threshold, by product. We use the "Higher education
10+ total" row which sums HE full-time, HE part-time, and Advanced Learner loans.
11+ Academic year 20XX-YY maps to calendar year 20XX+1 (e.g., 2024-25 → 2025).
1312
1413Data permalink:
1514https://explore-education-statistics.service.gov.uk/data-tables/permalink/6ff75517-7124-487c-cb4e-08de6eccf22d
1615"""
1716
17+ import json
18+ import re
19+ import requests
20+ from functools import lru_cache
21+
1822from policyengine_uk_data .targets .schema import Target , Unit
1923
20- _REFERENCE = (
21- "https://explore-education-statistics.service.gov.uk/data-tables"
22- "/permalink/6ff75517-7124-487c-cb4e-08de6eccf22d"
# Identifier of the saved Explore Education Statistics table (SLC Table 6a).
_PERMALINK_ID = "6ff75517-7124-487c-cb4e-08de6eccf22d"

# Full permalink URL: requested by _fetch_slc_data() and reported as the
# reference_url of every target. Only the fragment that interpolates the ID
# is an f-string, and no whitespace may leak into the URL.
_PERMALINK_URL = (
    "https://explore-education-statistics.service.gov.uk"
    f"/data-tables/permalink/{_PERMALINK_ID}"
)
2429
25- # Plan 2, earning above threshold — sum of HE full-time + part-time + AL
26- # 2024-25: 3,670k + 225k + 90k = 3,985k
27- # 2025-26: 4,130k + 245k + 85k = 4,460k
28- # 2026-27: 4,480k + 260k + 85k = 4,825k
29- # 2027-28: 4,700k + 265k + 80k = 5,045k
30- # 2028-29: 4,820k + 265k + 70k = 5,155k
31- # 2029-30: 4,870k + 270k + 65k = 5,205k
32- _PLAN2_ABOVE_THRESHOLD = {
33- 2025 : 3_985_000 ,
34- 2026 : 4_460_000 ,
35- 2027 : 4_825_000 ,
36- 2028 : 5_045_000 ,
37- 2029 : 5_155_000 ,
38- 2030 : 5_205_000 ,
39- }
40-
41- # Plan 5, earning above threshold — sum of HE full-time + part-time + AL
42- # 2024-25: 0 + 0 + 0 = 0
43- # 2025-26: 25k + 5k + 5k = 35k
44- # 2026-27: 115k + 20k + 10k = 145k
45- # 2027-28: 340k + 35k + 15k = 390k
46- # 2028-29: 700k + 50k + 15k = 765k
47- # 2029-30: 1,140k + 75k + 20k = 1,235k
48- _PLAN5_ABOVE_THRESHOLD = {
49- 2026 : 35_000 ,
50- 2027 : 145_000 ,
51- 2028 : 390_000 ,
52- 2029 : 765_000 ,
53- 2030 : 1_235_000 ,
54- }
30+
@lru_cache(maxsize=1)
def _fetch_slc_data() -> dict:
    """Fetch and parse SLC Table 6a data from Explore Education Statistics.

    The permalink page embeds the table as JSON inside its ``__NEXT_DATA__``
    script tag; that JSON is extracted and handed to ``_parse_slc_table``.
    The result is memoised, so a successful fetch happens at most once per
    process and every caller receives the same dict object (treat it as
    read-only).

    Returns:
        Dict with keys 'plan_2' and 'plan_5', each containing a dict
        mapping calendar year (int) to borrower count above threshold (int).

    Raises:
        requests.HTTPError: If the permalink page returns an error status.
        ValueError: If the embedded JSON or the expected table rows/columns
            cannot be found.
    """
    response = requests.get(_PERMALINK_URL, timeout=30)
    response.raise_for_status()

    # Extract JSON data from the __NEXT_DATA__ script tag. DOTALL keeps the
    # match working even if the embedded JSON is ever emitted across lines.
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
        response.text,
        re.DOTALL,
    )
    if not match:
        raise ValueError("Could not find __NEXT_DATA__ in SLC permalink page")

    next_data = json.loads(match.group(1))
    table_json = next_data["props"]["pageProps"]["data"]["table"]["json"]
    return _parse_slc_table(table_json)


def _parse_slc_table(table_json: dict, years_per_plan: int = 6) -> dict:
    """Turn the permalink's table JSON into per-plan ``{year: count}`` dicts.

    Column layout relied on here (taken from the page as originally scraped;
    verify if the permalink changes): the second header row lists
    ``years_per_plan`` Plan 2 years, then ``years_per_plan`` Plan 5 years,
    then Plan 3 years, which are ignored.
    """
    header_row = table_json["thead"][1]
    if len(header_row) < 2 * years_per_plan:
        raise ValueError(
            f"Expected at least {2 * years_per_plan} year columns in SLC "
            f"table header, found {len(header_row)}"
        )
    plan_2_years = _calendar_years(header_row[:years_per_plan])
    plan_5_years = _calendar_years(header_row[years_per_plan : 2 * years_per_plan])

    target_row = _find_he_total_above_threshold_row(table_json["tbody"])
    # Cell 0 is the row header; the value cells follow in column order.
    if len(target_row) < 1 + 2 * years_per_plan:
        raise ValueError(
            f"Expected at least {1 + 2 * years_per_plan} cells in 'earning "
            f"above threshold' row, found {len(target_row)}"
        )

    return {
        "plan_2": _parse_counts(target_row, 1, plan_2_years),
        "plan_5": _parse_counts(target_row, 1 + years_per_plan, plan_5_years),
    }


def _calendar_years(header_cells: list) -> list[int]:
    """Map academic-year header cells ("2029-30") to calendar years (2030)."""
    return [int(cell["text"].split("-")[0]) + 1 for cell in header_cells]


def _find_he_total_above_threshold_row(tbody: list) -> list:
    """Return the "earning above repayment threshold" row of the HE total group.

    Each product group is assumed to start with a row whose first cell is the
    group header (spanning the group's rows), followed by rows whose first
    cell is the indicator header. Taking the first "earning above" row in the
    table would therefore pick up an earlier product group, so the group
    header seen most recently is tracked and only the row under
    "Higher education total" is accepted.

    If no group header is recognised but the table holds exactly one
    "earning above" row, that row is unambiguous and is returned.
    """
    above_label = "earning above repayment threshold"
    total_label = "higher education total"

    ungrouped_matches = []
    in_total_group = False
    for row in tbody:
        if not row:
            continue
        label = row[0].get("text", "").lower()
        if above_label in label:
            if in_total_group:
                return row
            ungrouped_matches.append(row)
        else:
            # Any other leading cell starts (or continues) a product group.
            in_total_group = total_label in label

    if len(ungrouped_matches) == 1:
        return ungrouped_matches[0]
    if not ungrouped_matches:
        raise ValueError("Could not find 'earning above threshold' row")
    raise ValueError(
        "Found several 'earning above threshold' rows but none under "
        "'Higher education total'"
    )


def _parse_counts(row: list, first_idx: int, years: list[int]) -> dict[int, int]:
    """Read one plan's value cells, starting at ``first_idx``, keyed by year.

    Empty cells, "no data" and "0" are left out so that years with no
    borrowers above the threshold produce no target value.
    """
    counts = {}
    for offset, year in enumerate(years):
        value_text = row[first_idx + offset].get("text", "").strip()
        if value_text and value_text not in ("no data", "0"):
            counts[year] = int(value_text.replace(",", ""))
    return counts
55110
56111
57112def get_targets () -> list [Target ]:
113+ """Generate SLC calibration targets by fetching live data."""
114+ slc_data = _fetch_slc_data ()
115+
58116 targets = []
59117
60118 targets .append (
@@ -64,8 +122,8 @@ def get_targets() -> list[Target]:
64122 source = "slc" ,
65123 unit = Unit .COUNT ,
66124 is_count = True ,
67- values = _PLAN2_ABOVE_THRESHOLD ,
68- reference_url = _REFERENCE ,
125+ values = slc_data [ "plan_2" ] ,
126+ reference_url = _PERMALINK_URL ,
69127 )
70128 )
71129
@@ -76,8 +134,8 @@ def get_targets() -> list[Target]:
76134 source = "slc" ,
77135 unit = Unit .COUNT ,
78136 is_count = True ,
79- values = _PLAN5_ABOVE_THRESHOLD ,
80- reference_url = _REFERENCE ,
137+ values = slc_data [ "plan_5" ] ,
138+ reference_url = _PERMALINK_URL ,
81139 )
82140 )
83141
0 commit comments