Skip to content

Commit 53adf33

Browse files
MaxGhenis and claude authored
Add latest-data microsimulation smoke test (#1617)
Catches silent model/data skew at the point the enhanced FRS dataset is republished on HuggingFace, not after a release. Exercises whatever is currently on HF `main` (unlike `conftest.py` which pins to an older version) and asserts plausibility bounds on:
- UK weighted population and household/benunit counts
- `is_parent` weighted population (>10M) — catches the defaulting-to-zero failure introduced by removing the inferred formula in #1595
- Universal credit aggregate in £55-£95bn range around the OBR target
- state pension / child benefit / pension credit floors
- `extended_childcare_entitlement_eligible` reaching >500k benefit units

Verified against the 1.45.8 stale dataset: correctly fails 3 of 5 tests (is_parent=0, UC=£51.7bn, childcare=0), passes the other two. Marked `microsimulation` so it only runs in CI when HUGGING_FACE_TOKEN is set.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent df0f286 commit 53adf33

2 files changed

Lines changed: 130 additions & 0 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Add a microsimulation smoke test suite that runs against the unpinned latest enhanced FRS dataset and asserts plausibility bounds for UK population, UC aggregate, `is_parent` population, core benefit totals, and extended childcare eligibility. Catches silent model/data skew at the point the dataset is republished, not after a release.
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Smoke tests against the *latest* published enhanced FRS dataset.
2+
3+
These complement the pinned microsimulation tests in
4+
``policyengine_uk/tests/microsimulation/`` by exercising the model against
5+
whatever is currently on HuggingFace `main`, so that a silent break at the
6+
model/data boundary (e.g. the model expecting an input column the rebuilt
7+
dataset hasn't populated) shows up in CI rather than after a release.
8+
9+
Bounds are deliberately wide — they catch catastrophic failures (e.g.
10+
``is_parent`` defaulting to zero, UC aggregate collapsing by ~£25 bn) without
11+
tripping on normal calibration noise.
12+
13+
Skipped unless ``HUGGING_FACE_TOKEN`` or ``POLICYENGINE_UK_DEFAULT_DATASET`` is
14+
set, via the ``microsimulation`` marker configured in ``conftest.py``.
15+
"""
16+
17+
from __future__ import annotations
18+
19+
import os
20+
21+
import numpy as np
22+
import pytest
23+
24+
from policyengine_uk import Microsimulation
25+
26+
27+
# Location of the enhanced FRS file that the ``sim`` fixture points the model
# at. No revision is embedded in the URL; presumably this resolves to the
# current HuggingFace ``main`` — confirm against the dataset loader.
LATEST_DATASET_URL = "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5"

# Simulation period used for every calculation in this module.
YEAR = 2025
31+
32+
33+
@pytest.fixture(scope="module")
def sim(request) -> Microsimulation:
    """Build one simulation per module against the unpinned latest dataset.

    Points ``POLICYENGINE_UK_DEFAULT_DATASET`` at ``LATEST_DATASET_URL``
    before constructing ``Microsimulation()`` with no arguments, so any
    pinned-version dataset configured elsewhere (e.g. in conftest.py) is
    overridden for these tests.

    The previous value of the environment variable is restored when the
    module's tests finish, so the override does not leak into test modules
    that run later in the same session.

    Args:
        request: pytest's built-in fixture request, used only to register
            the environment-restoring finalizer.

    Returns:
        The ``Microsimulation`` instance shared by all tests in this module.
    """
    env_key = "POLICYENGINE_UK_DEFAULT_DATASET"
    previous = os.environ.get(env_key)

    def _restore_env() -> None:
        # Put the process environment back exactly as it was found.
        if previous is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous

    # Register the finalizer before mutating the environment so the value is
    # restored even if constructing the simulation raises.
    request.addfinalizer(_restore_env)

    os.environ[env_key] = LATEST_DATASET_URL
    return Microsimulation()
42+
43+
44+
def _weighted(sim: Microsimulation, variable: str, period: int = YEAR) -> float:
    """Return the weighted total of ``variable`` for ``period``.

    The variable's values are multiplied element-wise by the first of
    ``person_weight``, ``benunit_weight`` or ``household_weight`` (tried in
    that order) whose array has the same length as the values, then summed.

    NOTE(review): the entity is inferred purely from array length, so this
    assumes the three entity counts differ in the dataset; if two coincide
    the earlier weight in the list wins.

    Args:
        sim: Simulation to calculate from.
        variable: Name of the variable to aggregate.
        period: Period passed to ``sim.calculate``.

    Returns:
        The weighted sum as a plain ``float``.

    Raises:
        AssertionError: If none of the three weight arrays matches the
            length of the variable's values.
    """
    data = np.asarray(sim.calculate(variable, period).values, dtype=float)
    n = len(data)
    candidate_weights = ("person_weight", "benunit_weight", "household_weight")
    for weight_name in candidate_weights:
        entity_weight = np.asarray(
            sim.calculate(weight_name, period).values, dtype=float
        )
        if len(entity_weight) != n:
            continue
        return float((data * entity_weight).sum())
    raise AssertionError(
        f"No entity weight matches length {n} for variable {variable!r}"
    )
54+
55+
56+
@pytest.mark.microsimulation
def test_population_totals_are_plausible(sim):
    """Check person, benefit-unit and household weight totals are sensible.

    Each total is the plain sum of the corresponding weight variable for
    ``YEAR``; the bounds are wide enough to tolerate calibration noise.
    """

    def _weight_sum(weight_variable):
        # Sum of one entity's weights, i.e. that entity's population count.
        return float(np.asarray(sim.calculate(weight_variable, YEAR).values).sum())

    people = _weight_sum("person_weight")
    benunits = _weight_sum("benunit_weight")
    households = _weight_sum("household_weight")

    # ONS mid-2024 estimate ~68.9M; OBR forecasts 2025 ≈ 69.5M.
    assert 65e6 < people < 75e6, f"People total {people:.3g} outside 65-75M"
    # FRS implies ~33-35M benefit units; ONS ~28M households.
    assert 30e6 < benunits < 38e6, f"Benefit units total {benunits:.3g} outside 30-38M"
    assert 26e6 < households < 34e6, f"Household total {households:.3g} outside 26-34M"
68+
69+
70+
@pytest.mark.microsimulation
def test_is_parent_is_populated(sim):
    """Check the weighted ``is_parent`` total is above 10 million.

    Guards against the PolicyEngine/policyengine-uk#1595 failure mode: the
    inferred formula was removed while a rebuilt dataset had not yet
    populated the input column, leaving the variable at its zero default.
    """
    # UK has ~15M parents of dependent children — anything under a few
    # million indicates the column defaulted rather than loaded.
    minimum_parents = 10e6
    parents = _weighted(sim, "is_parent")
    assert parents > minimum_parents, (
        f"is_parent weighted total {parents:.3g} is too low — the variable "
        "is likely defaulting to zero because the input column is missing."
    )
85+
86+
87+
@pytest.mark.microsimulation
def test_universal_credit_aggregate_in_range(sim):
    """Check the weighted UC total lies strictly between £55bn and £95bn.

    Intended to catch model logic (e.g. capital limits) interacting badly
    with the data, such as stale savings imputations dragging the aggregate
    far below the ~£74bn target.
    """
    # OBR Nov 2025 EFO calibration target is ~£74bn; the bounds leave roughly
    # a quarter of drift on either side before the test fails.
    lower_bound = 55e9
    upper_bound = 95e9
    uc = _weighted(sim, "universal_credit")
    assert lower_bound < uc < upper_bound, (
        f"Universal credit aggregate £{uc / 1e9:.1f}bn outside "
        "£55-£95bn plausibility range"
    )
102+
103+
104+
@pytest.mark.microsimulation
def test_core_benefits_are_nonzero(sim):
    """Check each core benefit's weighted total exceeds its floor.

    Variables are checked in order and the test stops at the first one that
    falls at or below its floor.
    """
    # Minimum acceptable weighted aggregate (in £) per benefit variable.
    floors = {
        "state_pension": 100e9,
        "child_benefit": 10e9,
        "pension_credit": 2e9,
    }
    for variable, lower in floors.items():
        total = _weighted(sim, variable)
        assert total > lower, (
            f"{variable} aggregate £{total / 1e9:.2g}bn below £{lower / 1e9:.0f}bn floor"
        )
116+
117+
118+
@pytest.mark.microsimulation
def test_childcare_entitlement_populated(sim):
    """Check extended childcare eligibility exceeds a 500,000 weighted total.

    Acts as a downstream check on ``is_parent``: when that input is
    defaulted, every childcare-eligibility chain collapses to zero.
    """
    minimum_eligible = 500_000
    eligible = _weighted(sim, "extended_childcare_entitlement_eligible")
    assert eligible > minimum_eligible, (
        "extended_childcare_entitlement_eligible weighted total "
        f"{eligible:.3g} implies the childcare chain is broken"
    )

0 commit comments

Comments
 (0)