Skip to content

Commit f06bc50

Browse files
authored
Parse NIC Classes 2, 3 and 4 from OBR EFO Table 3.4 (#355)
1 parent 7a437f1 commit f06bc50

3 files changed

Lines changed: 199 additions & 21 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add OBR calibration targets for NIC Classes 2, 3 and 4 (self-employed flat-rate, voluntary, and self-employed profit-based, respectively) alongside the existing Class 1 employee/employer rows, and accept common label-wording variants in OBR EFO Table 3.4.

policyengine_uk_data/targets/sources/obr.py

Lines changed: 73 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -219,43 +219,95 @@ def read_41(row_num: int) -> dict[int, float]:
219219

220220

221221
def _parse_nics(wb: openpyxl.Workbook) -> list[Target]:
    """Parse Table 3.4 (income tax and NICs detail) for all NIC classes.

    Covers Class 1 employee and employer (PAYE), plus Classes 2, 3 and 4
    (self-employed flat-rate, voluntary, and self-employed profit-based).
    Omitting Classes 2/3/4 biased calibration by ~£2-5B/yr of NIC receipts
    — the omission pushed self-employment income downward to compensate,
    distorting reforms that touch SE income.

    Where a row label is not found, or a matched row yields no values, the
    parser logs a warning and skips that variable, so partial matches still
    produce useful targets.

    Args:
        wb: Workbook that contains a sheet named "3.4".

    Returns:
        One ``Target`` per NIC variable whose row was located and produced
        values, in the order the variables are declared in ``nic_rows``.
    """
    config = load_config()
    vintage = config["obr"]["vintage"]
    ref = config["obr"]["efo_receipts"]
    ws = wb["3.4"]
    cols = list(_FY_COL_TO_YEAR.keys())

    # Map of target name → (list of candidate labels to search, variable
    # name). Candidates let us accept minor wording variation in OBR EFOs
    # (e.g. "Class 2 NICs" vs "Class 2 Self-Employed NICs") without
    # failing the whole parse.
    nic_rows: dict[str, tuple[list[str], str]] = {
        "ni_employee": (
            ["Class 1 Employee NICs"],
            "ni_employee",
        ),
        "ni_employer": (
            ["Class 1 Employer NICs"],
            "ni_employer",
        ),
        "ni_class_2": (
            [
                "Class 2 NICs",
                "Class 2 Self-Employed NICs",
                "Class 2 self-employed NICs",
            ],
            "ni_class_2",
        ),
        "ni_class_3": (
            [
                "Class 3 NICs",
                "Class 3 Voluntary NICs",
                "Class 3 voluntary NICs",
            ],
            "ni_class_3",
        ),
        "ni_class_4": (
            [
                "Class 4 NICs",
                "Class 4 Self-Employed NICs",
                "Class 4 self-employed NICs",
            ],
            "ni_class_4",
        ),
    }

    targets: list[Target] = []
    for name, (labels, variable) in nic_rows.items():
        row_num = None
        last_error = None
        # Try each candidate wording in order; the first one found wins.
        for label in labels:
            try:
                row_num = _find_row(ws, label, col="B", max_row=40)
            except ValueError as e:
                # Remember the most recent failure so the warning below can
                # show why the final candidate was rejected.
                last_error = e
            else:
                break
        if row_num is None:
            logger.warning(
                "OBR NICs: no row matched labels %s for %s: %s",
                labels,
                variable,
                last_error,
            )
            continue

        values = _read_row_values(ws, row_num, cols)
        if not values:
            # A matched row without usable values would otherwise vanish
            # from the targets with no trace in the logs.
            logger.warning(
                "OBR NICs: row %s ('%s') has no values for %s",
                row_num,
                label,
                variable,
            )
            continue

        targets.append(
            Target(
                name=f"obr/{name}",
                variable=variable,
                source="obr",
                unit=Unit.GBP,
                values=values,
                reference_url=ref,
                forecast_vintage=vintage,
            )
        )

    return targets
261313

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
"""Regression test for OBR NIC target parsing.
2+
3+
Bug-hunt finding U8: `_parse_nics` only covered Class 1 employee and
4+
employer NICs, omitting Classes 2 (self-employed flat-rate), 3 (voluntary)
5+
and 4 (self-employed profit-based). Calibration had no target for those
6+
receipts and pushed self-employment income downward to compensate.
7+
8+
This test drives the parser with a minimal in-memory openpyxl workbook
9+
that mimics the OBR EFO Table 3.4 layout, and asserts that targets for
10+
all five NIC variables are produced.
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import importlib.util
16+
from unittest.mock import patch
17+
18+
import pytest
19+
20+
if importlib.util.find_spec("openpyxl") is None:
21+
pytest.skip("openpyxl not installed", allow_module_level=True)
22+
23+
import openpyxl # noqa: E402
24+
25+
26+
def _build_fake_obr_workbook(
    rows: list[tuple[str, list[float]]] | None = None,
) -> openpyxl.Workbook:
    """Create a stand-in for OBR EFO receipts with a minimal Table 3.4.

    Args:
        rows: Optional ``(label, values)`` pairs to write, one worksheet row
            each, starting at row 2. The label goes in column B and the
            values in the columns from C onwards. When omitted, the five
            NIC rows with their shortest label wording are written, each
            with seven values in £bn.

    Returns:
        A workbook whose active sheet is titled "3.4" and holds the rows.
    """
    if rows is None:
        rows = [
            ("Class 1 Employee NICs", [120.0] * 7),
            ("Class 1 Employer NICs", [140.0] * 7),
            ("Class 2 NICs", [0.3] * 7),
            ("Class 3 NICs", [0.05] * 7),
            ("Class 4 NICs", [4.5] * 7),
        ]

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "3.4"

    # Column B holds row labels; columns C-I hold FY values in £bn.
    for row_idx, (label, values) in enumerate(rows, start=2):
        ws.cell(row=row_idx, column=2, value=label)  # col B
        for col_idx, value in enumerate(values, start=3):  # cols C onwards
            ws.cell(row=row_idx, column=col_idx, value=value)

    return wb
46+
47+
48+
def test_parse_nics_covers_all_five_classes():
    """All five NIC variables get exactly one target with the right value."""
    from policyengine_uk_data.targets.sources import obr as obr_module

    wb = _build_fake_obr_workbook()

    fake_config = {
        "obr": {
            "vintage": "test",
            "efo_receipts": "https://example.invalid/receipts",
            "efo_expenditure": "https://example.invalid/expenditure",
        }
    }
    with patch.object(obr_module, "load_config", return_value=fake_config):
        targets = obr_module._parse_nics(wb)

    # The set comparisons below would hide a duplicated target, so pin the
    # count explicitly.
    assert len(targets) == 5

    names = {t.name for t in targets}
    assert names == {
        "obr/ni_employee",
        "obr/ni_employer",
        "obr/ni_class_2",
        "obr/ni_class_3",
        "obr/ni_class_4",
    }

    by_variable = {t.variable: t for t in targets}
    # Variable names must match the policyengine-uk variable identifiers so
    # calibration can map them to simulated totals.
    assert set(by_variable) == {
        "ni_employee",
        "ni_employer",
        "ni_class_2",
        "ni_class_3",
        "ni_class_4",
    }

    # Values are scaled by 1e9 (£bn → £) inside _read_row_values. Checking
    # every variable (not just Class 4) catches a label that resolves to
    # the wrong row, e.g. Employee picking up the Employer figures.
    expected_bn = {
        "ni_employee": 120.0,
        "ni_employer": 140.0,
        "ni_class_2": 0.3,
        "ni_class_3": 0.05,
        "ni_class_4": 4.5,
    }
    for variable, bn in expected_bn.items():
        assert by_variable[variable].values[2024] == pytest.approx(
            bn * 1e9
        ), variable
86+
87+
88+
def test_parse_nics_tolerates_alt_label_wording():
    """The parser should accept common wording variants for self-employed rows.

    Both the title-case and the lower-case alternatives listed in the
    parser's candidate labels are exercised, and each target's value is
    checked so that a label matching the wrong row cannot pass unnoticed.
    """
    from policyengine_uk_data.targets.sources import obr as obr_module

    fake_config = {
        "obr": {
            "vintage": "test",
            "efo_receipts": "https://example.invalid/receipts",
            "efo_expenditure": "https://example.invalid/expenditure",
        }
    }

    # Distinct £bn figure per variable so rows cannot be confused.
    expected_bn = {
        "ni_employee": 100.0,
        "ni_employer": 110.0,
        "ni_class_2": 0.2,
        "ni_class_3": 0.04,
        "ni_class_4": 4.0,
    }

    # Use alternative labels that the parser should still find.
    label_variants = [
        {
            "ni_employee": "Class 1 Employee NICs",
            "ni_employer": "Class 1 Employer NICs",
            "ni_class_2": "Class 2 Self-Employed NICs",
            "ni_class_3": "Class 3 Voluntary NICs",
            "ni_class_4": "Class 4 Self-Employed NICs",
        },
        {
            "ni_employee": "Class 1 Employee NICs",
            "ni_employer": "Class 1 Employer NICs",
            "ni_class_2": "Class 2 self-employed NICs",
            "ni_class_3": "Class 3 voluntary NICs",
            "ni_class_4": "Class 4 self-employed NICs",
        },
    ]

    for labels in label_variants:
        wb = openpyxl.Workbook()
        ws = wb.active
        ws.title = "3.4"

        # Column B holds the label; columns C-I hold seven FY values.
        for row_idx, (variable, label) in enumerate(labels.items(), start=2):
            ws.cell(row=row_idx, column=2, value=label)
            for col_idx in range(3, 10):
                ws.cell(
                    row=row_idx, column=col_idx, value=expected_bn[variable]
                )

        with patch.object(
            obr_module, "load_config", return_value=fake_config
        ):
            targets = obr_module._parse_nics(wb)

        by_variable = {t.variable: t for t in targets}
        assert set(by_variable) == set(expected_bn), labels
        assert len(targets) == len(expected_bn), labels
        for variable, bn in expected_bn.items():
            assert by_variable[variable].values[2024] == pytest.approx(
                bn * 1e9
            ), (variable, labels[variable])

0 commit comments

Comments
 (0)