refactor: fetch SLC targets from Explore Education Statistics API

nikhilwoodruff · nikhilwoodruff · commit 536f75087ee5 · 2026-02-19T17:25:16.000Z
Replace hardcoded SLC borrower counts with live data fetched from the
Explore Education Statistics permalink. This ensures targets stay
current as SLC updates their forecasts.

The parser extracts Plan 2 and Plan 5 "earning above threshold" counts
from the "Higher education total" row (HE full-time + part-time + AL).
diff --git a/policyengine_uk_data/targets/sources/slc.py b/policyengine_uk_data/targets/sources/slc.py
@@ -6,55 +6,113 @@
 
 Source: Explore Education Statistics — Student loan forecasts for England,
 Table 6a: Forecast number of student borrowers liable to repay and number
-earning above repayment threshold, by product. Figures are the sum of
-higher education full-time, higher education part-time, and advanced
-learner loan borrowers (Master's and Doctoral loans use Plan 3 and are
-excluded). Academic year 20XX-YY maps to calendar year 20XX.
+earning above repayment threshold, by product. We use the "Higher education
+total" row which sums HE full-time, HE part-time, and Advanced Learner loans.
+Academic year 20XX-YY maps to calendar year 20XX+1 (e.g., 2024-25 → 2025).
 
 Data permalink:
 https://explore-education-statistics.service.gov.uk/data-tables/permalink/6ff75517-7124-487c-cb4e-08de6eccf22d
 """
 
+import json
+import re
+import requests
+from functools import lru_cache
+
 from policyengine_uk_data.targets.schema import Target, Unit
 
-_REFERENCE = (
-    "https://explore-education-statistics.service.gov.uk/data-tables"
-    "/permalink/6ff75517-7124-487c-cb4e-08de6eccf22d"
+_PERMALINK_ID = "6ff75517-7124-487c-cb4e-08de6eccf22d"
+_PERMALINK_URL = (
+    f"https://explore-education-statistics.service.gov.uk"
+    f"/data-tables/permalink/{_PERMALINK_ID}"
 )
 
-# Plan 2, earning above threshold — sum of HE full-time + part-time + AL
-# 2024-25: 3,670k + 225k + 90k = 3,985k
-# 2025-26: 4,130k + 245k + 85k = 4,460k
-# 2026-27: 4,480k + 260k + 85k = 4,825k
-# 2027-28: 4,700k + 265k + 80k = 5,045k
-# 2028-29: 4,820k + 265k + 70k = 5,155k
-# 2029-30: 4,870k + 270k + 65k = 5,205k
-_PLAN2_ABOVE_THRESHOLD = {
-    2025: 3_985_000,
-    2026: 4_460_000,
-    2027: 4_825_000,
-    2028: 5_045_000,
-    2029: 5_155_000,
-    2030: 5_205_000,
-}
-
-# Plan 5, earning above threshold — sum of HE full-time + part-time + AL
-# 2024-25: 0 + 0 + 0 = 0
-# 2025-26: 25k + 5k + 5k = 35k
-# 2026-27: 115k + 20k + 10k = 145k
-# 2027-28: 340k + 35k + 15k = 390k
-# 2028-29: 700k + 50k + 15k = 765k
-# 2029-30: 1,140k + 75k + 20k = 1,235k
-_PLAN5_ABOVE_THRESHOLD = {
-    2026: 35_000,
-    2027: 145_000,
-    2028: 390_000,
-    2029: 765_000,
-    2030: 1_235_000,
-}
+
+@lru_cache(maxsize=1)
+def _fetch_slc_data() -> dict:
+    """Fetch and parse SLC Table 6a data from Explore Education Statistics.
+
+    Returns:
+        Dict with keys 'plan_2' and 'plan_5', each containing a dict
+        mapping calendar year (int) to borrower count above threshold (int).
+    """
+    response = requests.get(_PERMALINK_URL, timeout=30)
+    response.raise_for_status()
+
+    # Extract JSON data from __NEXT_DATA__ script tag
+    match = re.search(
+        r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
+        response.text,
+    )
+    if not match:
+        raise ValueError("Could not find __NEXT_DATA__ in SLC permalink page")
+
+    next_data = json.loads(match.group(1))
+    table_json = next_data["props"]["pageProps"]["data"]["table"]["json"]
+
+    # Parse header row to get years - columns go newest to oldest
+    # Structure: Plan 2 (6 years), Plan 5 (6 years), Plan 3 (5 years)
+    header_row = table_json["thead"][1]
+
+    # Get Plan 2 years (first 6 columns)
+    plan_2_years = []
+    for i in range(6):
+        year_text = header_row[i]["text"]  # e.g., "2029-30"
+        start_year = int(year_text.split("-")[0])
+        calendar_year = start_year + 1  # 2029-30 → 2030
+        plan_2_years.append(calendar_year)
+
+    # Get Plan 5 years (next 6 columns)
+    plan_5_years = []
+    for i in range(6, 12):
+        year_text = header_row[i]["text"]
+        start_year = int(year_text.split("-")[0])
+        calendar_year = start_year + 1
+        plan_5_years.append(calendar_year)
+
+    # Find the "Higher education total" / "earning above threshold" row
+    # This is the row following "Higher education total" with "liable to repay"
+    tbody = table_json["tbody"]
+
+    # Row 11 contains: header + 6 Plan 2 values + 6 Plan 5 values + 5 Plan 3
+    target_row = None
+    for row in tbody:
+        header_text = row[0].get("text", "")
+        if "earning above repayment threshold" in header_text:
+            # Check if previous context was "Higher education total"
+            # Actually, row 11 is after HE total row 10, and starts with
+            # the "earning above" header (no group header due to rowSpan)
+            target_row = row
+            break
+
+    if target_row is None:
+        raise ValueError("Could not find 'earning above threshold' row")
+
+    # Parse Plan 2 data (cells 1-6, mapping to plan_2_years)
+    plan_2_data = {}
+    for i, year in enumerate(plan_2_years):
+        cell_idx = 1 + i  # Skip header cell
+        value_text = target_row[cell_idx].get("text", "")
+        if value_text and value_text not in ("no data", "0"):
+            value = int(value_text.replace(",", ""))
+            plan_2_data[year] = value
+
+    # Parse Plan 5 data (cells 7-12, mapping to plan_5_years)
+    plan_5_data = {}
+    for i, year in enumerate(plan_5_years):
+        cell_idx = 7 + i  # Skip header + Plan 2 cells
+        value_text = target_row[cell_idx].get("text", "")
+        if value_text and value_text not in ("no data", "0"):
+            value = int(value_text.replace(",", ""))
+            plan_5_data[year] = value
+
+    return {"plan_2": plan_2_data, "plan_5": plan_5_data}
 
 
 def get_targets() -> list[Target]:
+    """Generate SLC calibration targets by fetching live data."""
+    slc_data = _fetch_slc_data()
+
     targets = []
 
     targets.append(
@@ -64,8 +122,8 @@ def get_targets() -> list[Target]:
             source="slc",
             unit=Unit.COUNT,
             is_count=True,
-            values=_PLAN2_ABOVE_THRESHOLD,
-            reference_url=_REFERENCE,
+            values=slc_data["plan_2"],
+            reference_url=_PERMALINK_URL,
         )
     )
 
@@ -76,8 +134,8 @@ def get_targets() -> list[Target]:
             source="slc",
             unit=Unit.COUNT,
             is_count=True,
-            values=_PLAN5_ABOVE_THRESHOLD,
-            reference_url=_REFERENCE,
+            values=slc_data["plan_5"],
+            reference_url=_PERMALINK_URL,
         )
     )