Skip to content

Commit 536f750

Browse files
refactor: fetch SLC targets from Explore Education Statistics API
Replace hardcoded SLC borrower counts with live data fetched from the Explore Education Statistics permalink. This ensures targets stay current as SLC updates their forecasts. The parser extracts Plan 2 and Plan 5 "earning above threshold" counts from the "Higher education total" row (HE full-time + part-time + AL).
1 parent 3ac204a commit 536f750

1 file changed

Lines changed: 99 additions & 41 deletions

File tree

  • policyengine_uk_data/targets/sources

policyengine_uk_data/targets/sources/slc.py

Lines changed: 99 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -6,55 +6,113 @@
66
77
Source: Explore Education Statistics — Student loan forecasts for England,
88
Table 6a: Forecast number of student borrowers liable to repay and number
9-
earning above repayment threshold, by product. Figures are the sum of
10-
higher education full-time, higher education part-time, and advanced
11-
learner loan borrowers (Master's and Doctoral loans use Plan 3 and are
12-
excluded). Academic year 20XX-YY maps to calendar year 20XX.
9+
earning above repayment threshold, by product. We use the "Higher education
10+
total" row which sums HE full-time, HE part-time, and Advanced Learner loans.
11+
Academic year 20XX-YY maps to calendar year 20XX+1 (e.g., 2024-25 → 2025).
1312
1413
Data permalink:
1514
https://explore-education-statistics.service.gov.uk/data-tables/permalink/6ff75517-7124-487c-cb4e-08de6eccf22d
1615
"""
1716

17+
import json
18+
import re
19+
import requests
20+
from functools import lru_cache
21+
1822
from policyengine_uk_data.targets.schema import Target, Unit
1923

20-
_REFERENCE = (
21-
"https://explore-education-statistics.service.gov.uk/data-tables"
22-
"/permalink/6ff75517-7124-487c-cb4e-08de6eccf22d"
24+
_PERMALINK_ID = "6ff75517-7124-487c-cb4e-08de6eccf22d"
25+
_PERMALINK_URL = (
26+
f"https://explore-education-statistics.service.gov.uk"
27+
f"/data-tables/permalink/{_PERMALINK_ID}"
2328
)
2429

25-
# Plan 2, earning above threshold — sum of HE full-time + part-time + AL
26-
# 2024-25: 3,670k + 225k + 90k = 3,985k
27-
# 2025-26: 4,130k + 245k + 85k = 4,460k
28-
# 2026-27: 4,480k + 260k + 85k = 4,825k
29-
# 2027-28: 4,700k + 265k + 80k = 5,045k
30-
# 2028-29: 4,820k + 265k + 70k = 5,155k
31-
# 2029-30: 4,870k + 270k + 65k = 5,205k
32-
_PLAN2_ABOVE_THRESHOLD = {
33-
2025: 3_985_000,
34-
2026: 4_460_000,
35-
2027: 4_825_000,
36-
2028: 5_045_000,
37-
2029: 5_155_000,
38-
2030: 5_205_000,
39-
}
40-
41-
# Plan 5, earning above threshold — sum of HE full-time + part-time + AL
42-
# 2024-25: 0 + 0 + 0 = 0
43-
# 2025-26: 25k + 5k + 5k = 35k
44-
# 2026-27: 115k + 20k + 10k = 145k
45-
# 2027-28: 340k + 35k + 15k = 390k
46-
# 2028-29: 700k + 50k + 15k = 765k
47-
# 2029-30: 1,140k + 75k + 20k = 1,235k
48-
_PLAN5_ABOVE_THRESHOLD = {
49-
2026: 35_000,
50-
2027: 145_000,
51-
2028: 390_000,
52-
2029: 765_000,
53-
2030: 1_235_000,
54-
}
30+
31+
# Number of forecast-year columns read per plan (Plan 2 first, then Plan 5).
# NOTE(review): the column order is taken from the original inline comments
# ("Plan 2 (6 years), Plan 5 (6 years), Plan 3 (5 years)") — confirm against
# the live table if the permalink is ever regenerated.
_SLC_YEARS_PER_PLAN = 6
_SLC_GROUP_LABEL = "higher education total"
_SLC_ROW_LABEL = "earning above repayment threshold"


def _slc_calendar_year(label: str) -> int:
    """Convert an academic-year label such as "2029-30" to calendar year 2030."""
    match = re.match(r"\s*(\d{4})(?!\d)", label)
    if match is None:
        raise ValueError(
            f"Unexpected academic year label in SLC table: {label!r}"
        )
    return int(match.group(1)) + 1


def _slc_find_target_row(tbody: list) -> list:
    """Return the "earning above threshold" row of the HE-total group.

    Because the group header cell spans several rows, only the first row of
    a group starts with the group name; the "earning above" row that follows
    starts directly with its own row header. The group a row belongs to is
    therefore the most recent group header seen while walking the table.
    """
    in_target_group = False
    for row in tbody:
        if not row:
            continue
        first_cell = row[0].get("text", "").lower()
        if _SLC_ROW_LABEL in first_cell:
            if in_target_group:
                return row
            # Same row label, but for a different borrower group: skip it.
            continue
        # Any other first cell opens (or continues) a group; remember
        # whether that group is the one we want.
        in_target_group = _SLC_GROUP_LABEL in first_cell
    raise ValueError(
        "Could not find 'earning above threshold' row for "
        "'Higher education total'"
    )


def _slc_parse_counts(row: list, years: list, first_cell: int) -> dict:
    """Map each year to the integer in the matching cell of ``row``.

    Cells that are empty, "no data" or "0" are left out of the result.
    """
    counts = {}
    for offset, year in enumerate(years):
        text = row[first_cell + offset].get("text", "").strip()
        if not text or text in ("no data", "0"):
            continue
        counts[year] = int(text.replace(",", ""))
    return counts


@lru_cache(maxsize=1)
def _fetch_slc_data() -> dict:
    """Fetch and parse SLC Table 6a data from Explore Education Statistics.

    Downloads the permalink page, extracts the table JSON embedded in the
    ``__NEXT_DATA__`` script tag, and reads the "earning above repayment
    threshold" row that belongs to the "Higher education total" group.

    The result is memoised with ``lru_cache``, so the page is requested at
    most once per process and every caller receives the same dict object;
    callers should treat it as read-only.

    Returns:
        Dict with keys 'plan_2' and 'plan_5', each containing a dict
        mapping calendar year (int) to borrower count above threshold (int).
        Academic year 20XX-YY maps to calendar year 20XX+1.

    Raises:
        requests.HTTPError: If the permalink page returns an error status.
        ValueError: If the page or the table does not have the expected
            layout.
    """
    response = requests.get(_PERMALINK_URL, timeout=30)
    response.raise_for_status()

    # Extract JSON data from __NEXT_DATA__ script tag. DOTALL keeps the
    # match working if the embedded JSON ever contains line breaks.
    match = re.search(
        r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
        response.text,
        re.DOTALL,
    )
    if not match:
        raise ValueError("Could not find __NEXT_DATA__ in SLC permalink page")

    next_data = json.loads(match.group(1))
    table_json = next_data["props"]["pageProps"]["data"]["table"]["json"]

    # The second header row holds one academic-year label per data column.
    header_row = table_json["thead"][1]
    data_columns = 2 * _SLC_YEARS_PER_PLAN
    if len(header_row) < data_columns:
        raise ValueError(
            f"Expected at least {data_columns} year columns in SLC table "
            f"header, found {len(header_row)}"
        )
    years = [
        _slc_calendar_year(cell.get("text", ""))
        for cell in header_row[:data_columns]
    ]
    plan_2_years = years[:_SLC_YEARS_PER_PLAN]
    plan_5_years = years[_SLC_YEARS_PER_PLAN:]

    target_row = _slc_find_target_row(table_json["tbody"])
    # The row carries one leading header cell before the data cells.
    if len(target_row) < 1 + data_columns:
        raise ValueError(
            f"Expected at least {1 + data_columns} cells in SLC "
            f"'earning above threshold' row, found {len(target_row)}"
        )

    return {
        "plan_2": _slc_parse_counts(target_row, plan_2_years, 1),
        "plan_5": _slc_parse_counts(
            target_row, plan_5_years, 1 + _SLC_YEARS_PER_PLAN
        ),
    }
55110

56111

57112
def get_targets() -> list[Target]:
113+
"""Generate SLC calibration targets by fetching live data."""
114+
slc_data = _fetch_slc_data()
115+
58116
targets = []
59117

60118
targets.append(
@@ -64,8 +122,8 @@ def get_targets() -> list[Target]:
64122
source="slc",
65123
unit=Unit.COUNT,
66124
is_count=True,
67-
values=_PLAN2_ABOVE_THRESHOLD,
68-
reference_url=_REFERENCE,
125+
values=slc_data["plan_2"],
126+
reference_url=_PERMALINK_URL,
69127
)
70128
)
71129

@@ -76,8 +134,8 @@ def get_targets() -> list[Target]:
76134
source="slc",
77135
unit=Unit.COUNT,
78136
is_count=True,
79-
values=_PLAN5_ABOVE_THRESHOLD,
80-
reference_url=_REFERENCE,
137+
values=slc_data["plan_5"],
138+
reference_url=_PERMALINK_URL,
81139
)
82140
)
83141

0 commit comments

Comments
 (0)