Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/fix-obr-nic-classes.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add OBR calibration targets for NIC Classes 2, 3 and 4 (self-employed flat-rate, voluntary and profit-based) alongside the existing Class 1 employee/employer rows, and accept common label-wording variants in OBR EFO Table 3.4.
94 changes: 73 additions & 21 deletions policyengine_uk_data/targets/sources/obr.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,43 +219,95 @@ def read_41(row_num: int) -> dict[int, float]:


def _parse_nics(wb: openpyxl.Workbook) -> list[Target]:
    """Parse Table 3.4 (income tax and NICs detail) for all NIC classes.

    Covers Class 1 employee and employer (PAYE), plus Classes 2, 3 and 4
    (self-employed flat-rate, voluntary, and self-employed profit-based).
    Omitting Classes 2/3/4 biased calibration by ~£2-5B/yr of NIC receipts
    — the omission pushed self-employment income downward to compensate,
    distorting reforms that touch SE income.

    Where a row label is not found the parser logs a warning and skips that
    variable, so partial matches still produce useful targets.

    Args:
        wb: Workbook containing a sheet named "3.4".

    Returns:
        One ``Target`` per NIC variable whose row was located in column B
        and for which ``_read_row_values`` returned a non-empty mapping.
    """
    config = load_config()
    vintage = config["obr"]["vintage"]
    ref = config["obr"]["efo_receipts"]
    ws = wb["3.4"]
    cols = list(_FY_COL_TO_YEAR.keys())

    # Map of target name → (list of candidate labels to search, variable
    # name). Candidates let us accept minor wording variation in OBR EFOs
    # (e.g. "Class 2 NICs" vs "Class 2 Self-Employed NICs") without
    # failing the whole parse.
    nic_rows: dict[str, tuple[list[str], str]] = {
        "ni_employee": (
            ["Class 1 Employee NICs"],
            "ni_employee",
        ),
        "ni_employer": (
            ["Class 1 Employer NICs"],
            "ni_employer",
        ),
        "ni_class_2": (
            [
                "Class 2 NICs",
                "Class 2 Self-Employed NICs",
                "Class 2 self-employed NICs",
            ],
            "ni_class_2",
        ),
        "ni_class_3": (
            [
                "Class 3 NICs",
                "Class 3 Voluntary NICs",
                "Class 3 voluntary NICs",
            ],
            "ni_class_3",
        ),
        "ni_class_4": (
            [
                "Class 4 NICs",
                "Class 4 Self-Employed NICs",
                "Class 4 self-employed NICs",
            ],
            "ni_class_4",
        ),
    }

    targets = []
    for name, (labels, variable) in nic_rows.items():
        # Try each candidate label in order; the first one that resolves to
        # a row wins. The most recent lookup failure is kept so the warning
        # below can report why nothing matched.
        row_num = None
        last_error = None
        for label in labels:
            try:
                row_num = _find_row(ws, label, col="B", max_row=40)
                break
            except ValueError as e:
                last_error = e
        if row_num is None:
            logger.warning(
                "OBR NICs: no row matched labels %s for %s: %s",
                labels,
                variable,
                last_error,
            )
            continue

        values = _read_row_values(ws, row_num, cols)
        # An empty result produces no target for this variable.
        if values:
            targets.append(
                Target(
                    name=f"obr/{name}",
                    variable=variable,
                    source="obr",
                    unit=Unit.GBP,
                    values=values,
                    reference_url=ref,
                    forecast_vintage=vintage,
                )
            )

    return targets

Expand Down
125 changes: 125 additions & 0 deletions policyengine_uk_data/tests/test_obr_nics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""Regression test for OBR NIC target parsing.

Bug-hunt finding U8: `_parse_nics` only covered Class 1 employee and
employer NICs, omitting Classes 2 (self-employed flat-rate), 3 (voluntary)
and 4 (self-employed profit-based). Calibration had no target for those
receipts and pushed self-employment income downward to compensate.

This test drives the parser with a minimal in-memory openpyxl workbook
that mimics the OBR EFO Table 3.4 layout, and asserts that targets for
all five NIC variables are produced.
"""

from __future__ import annotations

import importlib.util
from unittest.mock import patch

import pytest

if importlib.util.find_spec("openpyxl") is None:
pytest.skip("openpyxl not installed", allow_module_level=True)

import openpyxl # noqa: E402


def _build_fake_obr_workbook() -> openpyxl.Workbook:
    """Return an in-memory workbook with a cut-down OBR EFO Table 3.4.

    The active sheet is titled "3.4". From row 2 downwards, each row holds a
    NIC row label in column B followed by seven identical FY values (in £bn)
    in columns C-I.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "3.4"

    # (row label, figure repeated across the seven value columns)
    label_and_amount = (
        ("Class 1 Employee NICs", 120.0),
        ("Class 1 Employer NICs", 140.0),
        ("Class 2 NICs", 0.3),
        ("Class 3 NICs", 0.05),
        ("Class 4 NICs", 4.5),
    )
    for row_number, (label, amount) in enumerate(label_and_amount, start=2):
        # Column 2 is B (the label); columns 3-9 are C-I (the values).
        row_cells = [label] + [amount] * 7
        for column_number, cell_value in enumerate(row_cells, start=2):
            sheet.cell(row=row_number, column=column_number, value=cell_value)

    return workbook


def test_parse_nics_covers_all_five_classes():
    """`_parse_nics` yields one target for each of the five NIC variables."""
    from policyengine_uk_data.targets.sources import obr as obr_module

    workbook = _build_fake_obr_workbook()
    obr_settings = {
        "vintage": "test",
        "efo_receipts": "https://example.invalid/receipts",
        "efo_expenditure": "https://example.invalid/expenditure",
    }
    stub_config = {"obr": obr_settings}

    # Swap out config loading so the parser runs against the stub above.
    with patch.object(obr_module, "load_config", return_value=stub_config):
        parsed = obr_module._parse_nics(workbook)

    expected_names = {
        "obr/ni_employee",
        "obr/ni_employer",
        "obr/ni_class_2",
        "obr/ni_class_3",
        "obr/ni_class_4",
    }
    assert {target.name for target in parsed} == expected_names

    # Variable names must match the policyengine-uk variable identifiers so
    # calibration can map them to simulated totals.
    expected_variables = {
        "ni_employee",
        "ni_employer",
        "ni_class_2",
        "ni_class_3",
        "ni_class_4",
    }
    assert {target.variable for target in parsed} == expected_variables

    # Values are scaled by 1e9 (£bn → £) inside _read_row_values.
    class_4 = next(
        target for target in parsed if target.variable == "ni_class_4"
    )
    assert class_4.values[2024] == pytest.approx(4.5e9)


def test_parse_nics_tolerates_alt_label_wording():
    """Alternative wordings of the Class 2/3/4 row labels are still matched."""
    from policyengine_uk_data.targets.sources import obr as obr_module

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "3.4"

    # Classes 2-4 use alternative labels that the parser should still find.
    # Each pair is (row label, figure repeated across seven value columns).
    label_and_amount = (
        ("Class 1 Employee NICs", 100.0),
        ("Class 1 Employer NICs", 110.0),
        ("Class 2 Self-Employed NICs", 0.2),
        ("Class 3 Voluntary NICs", 0.04),
        ("Class 4 Self-Employed NICs", 4.0),
    )
    for row_number, (label, amount) in enumerate(label_and_amount, start=2):
        # Label goes in column 2 (B); the values follow from column 3.
        row_cells = [label] + [amount] * 7
        for column_number, cell_value in enumerate(row_cells, start=2):
            sheet.cell(row=row_number, column=column_number, value=cell_value)

    obr_settings = {
        "vintage": "test",
        "efo_receipts": "https://example.invalid/receipts",
        "efo_expenditure": "https://example.invalid/expenditure",
    }
    stub_config = {"obr": obr_settings}
    with patch.object(obr_module, "load_config", return_value=stub_config):
        parsed = obr_module._parse_nics(workbook)

    found_variables = {target.variable for target in parsed}
    assert found_variables == {
        "ni_employee",
        "ni_employer",
        "ni_class_2",
        "ni_class_3",
        "ni_class_4",
    }
Loading