Parse NIC Classes 2, 3 and 4 from OBR EFO Table 3.4 (#355)

MaxGhenis · web-flow · commit f06bc50bceff · 2026-04-17T12:05:40.000-04:00
diff --git a/changelog.d/fix-obr-nic-classes.fixed.md b/changelog.d/fix-obr-nic-classes.fixed.md
@@ -0,0 +1 @@
+Add OBR calibration targets for NIC Classes 2, 3 and 4 (self-employed flat-rate, voluntary and profit-based) alongside the existing Class 1 employee/employer rows, and accept common label-wording variants in OBR EFO Table 3.4.
diff --git a/policyengine_uk_data/targets/sources/obr.py b/policyengine_uk_data/targets/sources/obr.py
@@ -219,43 +219,95 @@ def read_41(row_num: int) -> dict[int, float]:
 
 
 def _parse_nics(wb: openpyxl.Workbook) -> list[Target]:
-    """Parse Table 3.4 (income tax and NICs detail) for employee/employer."""
+    """Parse Table 3.4 (income tax and NICs detail) for all NIC classes.
+
+    Covers Class 1 employee and employer (PAYE), plus Classes 2, 3 and 4
+    (self-employed flat-rate, voluntary, and self-employed profit-based).
+    Omitting Classes 2/3/4 biased calibration by ~£2-5B/yr of NIC receipts
+    — the omission pushed self-employment income downward to compensate,
+    distorting reforms that touch SE income.
+
+    Where none of a variable's candidate labels is found the parser logs a
+    warning and skips that variable, so a partial table still yields targets.
+    """
     config = load_config()
     vintage = config["obr"]["vintage"]
     ref = config["obr"]["efo_receipts"]
     ws = wb["3.4"]
     cols = list(_FY_COL_TO_YEAR.keys())
 
-    nic_rows = {
+    # Map of target name → (list of candidate labels to search, variable
+    # name). Candidates let us accept minor wording variation in OBR EFOs
+    # (e.g. "Class 2 NICs" vs "Class 2 Self-Employed NICs") without
+    # failing the whole parse.
+    nic_rows: dict[str, tuple[list[str], str]] = {
         "ni_employee": (
-            "Class 1 Employee NICs",
+            ["Class 1 Employee NICs"],
             "ni_employee",
         ),
         "ni_employer": (
-            "Class 1 Employer NICs",
+            ["Class 1 Employer NICs"],
             "ni_employer",
         ),
+        "ni_class_2": (
+            [
+                "Class 2 NICs",
+                "Class 2 Self-Employed NICs",
+                "Class 2 self-employed NICs",
+            ],
+            "ni_class_2",
+        ),
+        "ni_class_3": (
+            [
+                "Class 3 NICs",
+                "Class 3 Voluntary NICs",
+                "Class 3 voluntary NICs",
+            ],
+            "ni_class_3",
+        ),
+        "ni_class_4": (
+            [
+                "Class 4 NICs",
+                "Class 4 Self-Employed NICs",
+                "Class 4 self-employed NICs",
+            ],
+            "ni_class_4",
+        ),
     }
 
     targets = []
-    for name, (label, variable) in nic_rows.items():
-        try:
-            row_num = _find_row(ws, label, col="B", max_row=30)
-            values = _read_row_values(ws, row_num, cols)
-            if values:
-                targets.append(
-                    Target(
-                        name=f"obr/{name}",
-                        variable=variable,
-                        source="obr",
-                        unit=Unit.GBP,
-                        values=values,
-                        reference_url=ref,
-                        forecast_vintage=vintage,
-                    )
+    for name, (labels, variable) in nic_rows.items():
+        row_num = None
+        last_error = None
+        for label in labels:
+            try:
+                row_num = _find_row(ws, label, col="B", max_row=40)
+                break
+            except ValueError as e:
+                last_error = e
+                continue
+        if row_num is None:
+            logger.warning(
+                "OBR NICs: no row matched labels %s for %s: %s",
+                labels,
+                variable,
+                last_error,
+            )
+            continue
+
+        values = _read_row_values(ws, row_num, cols)
+        if values:
+            targets.append(
+                Target(
+                    name=f"obr/{name}",
+                    variable=variable,
+                    source="obr",
+                    unit=Unit.GBP,
+                    values=values,
+                    reference_url=ref,
+                    forecast_vintage=vintage,
                 )
-        except ValueError:
-            logger.warning("OBR NICs: row '%s' not found", label)
+            )
 
     return targets
 
diff --git a/policyengine_uk_data/tests/test_obr_nics.py b/policyengine_uk_data/tests/test_obr_nics.py
@@ -0,0 +1,125 @@
+"""Regression test for OBR NIC target parsing.
+
+Bug-hunt finding U8: `_parse_nics` only covered Class 1 employee and
+employer NICs, omitting Classes 2 (self-employed flat-rate), 3 (voluntary)
+and 4 (self-employed profit-based). Calibration had no target for those
+receipts and pushed self-employment income downward to compensate.
+
+This test drives the parser with a minimal in-memory openpyxl workbook
+that mimics the OBR EFO Table 3.4 layout, and asserts that targets for
+all five NIC variables are produced.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+from unittest.mock import patch
+
+import pytest
+
+if importlib.util.find_spec("openpyxl") is None:
+    pytest.skip("openpyxl not installed", allow_module_level=True)
+
+import openpyxl  # noqa: E402
+
+
+def _build_fake_obr_workbook() -> openpyxl.Workbook:
+    """Create a stand-in for OBR EFO receipts with a minimal Table 3.4."""
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "3.4"
+
+    # Column B holds row labels; columns C-I hold FY values in £bn.
+    rows = [
+        ("Class 1 Employee NICs", [120.0] * 7),
+        ("Class 1 Employer NICs", [140.0] * 7),
+        ("Class 2 NICs", [0.3] * 7),
+        ("Class 3 NICs", [0.05] * 7),
+        ("Class 4 NICs", [4.5] * 7),
+    ]
+    for row_idx, (label, values) in enumerate(rows, start=2):
+        ws.cell(row=row_idx, column=2, value=label)  # col B
+        for col_idx, value in enumerate(values, start=3):  # cols C-I
+            ws.cell(row=row_idx, column=col_idx, value=value)
+
+    return wb
+
+
+def test_parse_nics_covers_all_five_classes():
+    from policyengine_uk_data.targets.sources import obr as obr_module
+
+    wb = _build_fake_obr_workbook()
+
+    fake_config = {
+        "obr": {
+            "vintage": "test",
+            "efo_receipts": "https://example.invalid/receipts",
+            "efo_expenditure": "https://example.invalid/expenditure",
+        }
+    }
+    with patch.object(obr_module, "load_config", return_value=fake_config):
+        targets = obr_module._parse_nics(wb)
+
+    names = {t.name for t in targets}
+    assert names == {
+        "obr/ni_employee",
+        "obr/ni_employer",
+        "obr/ni_class_2",
+        "obr/ni_class_3",
+        "obr/ni_class_4",
+    }
+
+    variables = {t.variable for t in targets}
+    # Variable names must match the policyengine-uk variable identifiers so
+    # calibration can map them to simulated totals.
+    assert variables == {
+        "ni_employee",
+        "ni_employer",
+        "ni_class_2",
+        "ni_class_3",
+        "ni_class_4",
+    }
+
+    # Values are scaled by 1e9 (£bn → £) inside _read_row_values.
+    class_4_target = next(t for t in targets if t.variable == "ni_class_4")
+    assert class_4_target.values[2024] == pytest.approx(4.5e9)
+
+
+def test_parse_nics_tolerates_alt_label_wording():
+    """The parser should accept common wording variants for self-employed rows."""
+    from policyengine_uk_data.targets.sources import obr as obr_module
+
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "3.4"
+
+    # Use alternative labels that the parser should still find.
+    rows = [
+        ("Class 1 Employee NICs", [100.0] * 7),
+        ("Class 1 Employer NICs", [110.0] * 7),
+        ("Class 2 Self-Employed NICs", [0.2] * 7),
+        ("Class 3 Voluntary NICs", [0.04] * 7),
+        ("Class 4 Self-Employed NICs", [4.0] * 7),
+    ]
+    for row_idx, (label, values) in enumerate(rows, start=2):
+        ws.cell(row=row_idx, column=2, value=label)
+        for col_idx, value in enumerate(values, start=3):
+            ws.cell(row=row_idx, column=col_idx, value=value)
+
+    fake_config = {
+        "obr": {
+            "vintage": "test",
+            "efo_receipts": "https://example.invalid/receipts",
+            "efo_expenditure": "https://example.invalid/expenditure",
+        }
+    }
+    with patch.object(obr_module, "load_config", return_value=fake_config):
+        targets = obr_module._parse_nics(wb)
+
+    assert {t.variable for t in targets} == {
+        "ni_employee",
+        "ni_employer",
+        "ni_class_2",
+        "ni_class_3",
+        "ni_class_4",
+    }

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -0,0 +1 @@` |
|  | `1` | `+Add OBR calibration targets for NIC Classes 2, 3 and 4 (self-employed flat-rate, voluntary and profit-based) alongside the existing Class 1 employee/employer rows, and accept common label-wording variants in OBR EFO Table 3.4.` |