Skip to content

Commit 7a437f1

Browse files
authored
Parameterise VAT rate and ETB year in VAT imputation (#354)
* Parameterise VAT rate and ETB year in VAT imputation * Apply ruff format
1 parent 959d47f commit 7a437f1

3 files changed

Lines changed: 177 additions & 20 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
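Parameterise the VAT standard rate and reduced-rate share in ETB-based VAT imputation by reading `gov.hmrc.vat.standard_rate` and `gov.hmrc.vat.reduced_rate_share` from the `policyengine_uk` parameter tree for the training year, falling back to `VAT_RATE_BY_YEAR` (and then to the 2020-21 values) when that lookup fails, e.g. for offline use. Promote the hardcoded `etb.year == 2020` filter to a `year` argument on `generate_etb_table` and `save_imputation_models`, defaulting to `DEFAULT_ETB_YEAR`.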

policyengine_uk_data/datasets/imputations/vat.py

Lines changed: 70 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@
33
44
This module imputes household VAT expenditure rates based on demographic
55
characteristics using machine learning models trained on ETB survey data.
6+
7+
The ETB VAT columns report the standard-rate VAT actually paid plus a
8+
reduced-rate share of expenditure. To back out the underlying
9+
full-rate-taxable expenditure we divide by the statutory VAT standard
10+
rate and scale by one minus an OBR-published reduced-rate share of consumption.
11+
Both are parameterised per-year so later years (or forthcoming rate
12+
changes) don't need a code edit.
613
"""
714

815
import pandas as pd
@@ -14,39 +21,82 @@
1421

1522
ETB_TAB_FOLDER = STORAGE_FOLDER / "etb_1977_21"
1623

17-
CONSUMPTION_PCT_REDUCED_RATE = 0.03 # From OBR's VAT page
18-
CURRENT_VAT_RATE = 0.2
24+
# Default ETB vintage used when training the imputation model. Kept at 2020
25+
# for backward compatibility with the checked-in vat.pkl fingerprint, but
26+
# exposed as a module constant rather than an inline magic number so later
27+
# updates require only a one-line change (not scattered `etb.year == 2020`
28+
# checks).
29+
DEFAULT_ETB_YEAR = 2020
30+
31+
# Fallback VAT parameters used when `policyengine_uk` is unavailable (e.g.
32+
unit-test environments) and the year is not in `VAT_RATE_BY_YEAR`. Values match the 2020-21 UK statutory position.
33+
_FALLBACK_VAT_STANDARD_RATE = 0.2
34+
_FALLBACK_REDUCED_RATE_SHARE = 0.03
35+
36+
# Manual year → (standard rate, reduced rate share) fallback table used when
37+
# `policyengine_uk` parameters are not available. Kept intentionally short:
38+
# extend only if the team agrees that a VAT code change warrants a hardcoded
39+
# value until the parameter file is updated upstream.
40+
VAT_RATE_BY_YEAR: dict[int, tuple[float, float]] = {
41+
2020: (0.2, 0.03),
42+
2021: (0.2, 0.03),
43+
}
1944

2045
PREDICTORS = ["is_adult", "is_child", "is_SP_age", "household_net_income"]
2146
IMPUTATIONS = ["full_rate_vat_expenditure_rate"]
2247

2348

24-
def generate_etb_table(etb: pd.DataFrame):
49+
def _get_vat_parameters(year: int) -> tuple[float, float]:
50+
"""Return ``(standard_rate, reduced_rate_share)`` for the given calendar year.
51+
52+
Prefers live `policyengine_uk` parameters (``gov.hmrc.vat.standard_rate``
53+
and ``gov.hmrc.vat.reduced_rate_share``). Falls back to the module-level
54+
``VAT_RATE_BY_YEAR`` dict, and finally to the 2020-21 statutory values so
55+
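callers always receive a value. Note that any exception raised by the lookup triggers the fallback, and a year missing from ``VAT_RATE_BY_YEAR`` silently gets the 2020-21 values.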
56+
"""
57+
try:
58+
from policyengine_uk.system import system
59+
60+
standard_rate = float(system.parameters.gov.hmrc.vat.standard_rate(str(year)))
61+
reduced_rate_share = float(
62+
system.parameters.gov.hmrc.vat.reduced_rate_share(str(year))
63+
)
64+
return standard_rate, reduced_rate_share
65+
except Exception:
66+
if year in VAT_RATE_BY_YEAR:
67+
return VAT_RATE_BY_YEAR[year]
68+
return _FALLBACK_VAT_STANDARD_RATE, _FALLBACK_REDUCED_RATE_SHARE
69+
70+
71+
def generate_etb_table(etb: pd.DataFrame, year: int = DEFAULT_ETB_YEAR) -> pd.DataFrame:
2572
"""
2673
Clean and transform ETB data for VAT imputation model training.
2774
2875
Args:
2976
etb: Raw ETB survey data DataFrame.
77+
year: ETB survey year to filter to. Defaults to ``DEFAULT_ETB_YEAR``.
3078
3179
Returns:
3280
Cleaned DataFrame with VAT expenditure rates calculated.
3381
"""
34-
etb_2020 = etb[etb.year == 2020].dropna()
35-
for col in etb_2020:
36-
etb_2020[col] = pd.to_numeric(etb_2020[col], errors="coerce")
37-
38-
etb_2020_df = pd.DataFrame()
39-
etb_2020_df["is_adult"] = etb_2020.adults
40-
etb_2020_df["is_child"] = etb_2020.childs
41-
etb_2020_df["is_SP_age"] = etb_2020.noretd
42-
etb_2020_df["household_net_income"] = etb_2020.disinc * 52
43-
etb_2020_df["full_rate_vat_expenditure_rate"] = (
44-
etb_2020.totvat * (1 - CONSUMPTION_PCT_REDUCED_RATE) / CURRENT_VAT_RATE
45-
) / (etb_2020.expdis - etb_2020.totvat)
46-
return etb_2020_df[~etb_2020_df.full_rate_vat_expenditure_rate.isna()]
47-
48-
49-
def save_imputation_models():
82+
standard_rate, reduced_rate_share = _get_vat_parameters(year)
83+
84+
etb_year = etb[etb.year == year].dropna()
85+
for col in etb_year:
86+
etb_year[col] = pd.to_numeric(etb_year[col], errors="coerce")
87+
88+
etb_year_df = pd.DataFrame()
89+
etb_year_df["is_adult"] = etb_year.adults
90+
etb_year_df["is_child"] = etb_year.childs
91+
etb_year_df["is_SP_age"] = etb_year.noretd
92+
etb_year_df["household_net_income"] = etb_year.disinc * 52
93+
etb_year_df["full_rate_vat_expenditure_rate"] = (
94+
etb_year.totvat * (1 - reduced_rate_share) / standard_rate
95+
) / (etb_year.expdis - etb_year.totvat)
96+
return etb_year_df[~etb_year_df.full_rate_vat_expenditure_rate.isna()]
97+
98+
99+
def save_imputation_models(year: int = DEFAULT_ETB_YEAR):
50100
"""
51101
Train and save VAT imputation model.
52102
@@ -61,7 +111,7 @@ def save_imputation_models():
61111
delimiter="\t",
62112
low_memory=False,
63113
)
64-
etb = generate_etb_table(etb)
114+
etb = generate_etb_table(etb, year=year)
65115
etb = etb[PREDICTORS + IMPUTATIONS]
66116
vat.fit(etb[PREDICTORS], etb[IMPUTATIONS])
67117
vat.save(STORAGE_FOLDER / "vat.pkl")
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
"""Tests for parameterised VAT constants in `datasets/imputations/vat.py`.
2+
3+
Covers bug-hunt finding U7: the original code hardcoded
4+
``CURRENT_VAT_RATE = 0.2``, ``CONSUMPTION_PCT_REDUCED_RATE = 0.03`` and
5+
the ``etb.year == 2020`` filter inline, so any change to VAT rates,
6+
reduced-rate share, or training vintage required a code edit across
7+
multiple scattered lines.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import pandas as pd
13+
import pytest
14+
15+
16+
def test_get_vat_parameters_reads_from_policyengine_uk():
17+
"""Standard rate should come from `policyengine_uk` parameters."""
18+
try:
19+
from policyengine_uk.system import system
20+
except Exception:
21+
pytest.skip("policyengine_uk not available")
22+
23+
from policyengine_uk_data.datasets.imputations.vat import (
24+
_get_vat_parameters,
25+
)
26+
27+
expected_standard = float(system.parameters.gov.hmrc.vat.standard_rate("2020"))
28+
expected_reduced = float(system.parameters.gov.hmrc.vat.reduced_rate_share("2020"))
29+
standard, reduced = _get_vat_parameters(2020)
30+
assert standard == pytest.approx(expected_standard)
31+
assert reduced == pytest.approx(expected_reduced)
32+
33+
34+
def test_vat_rate_by_year_fallback_matches_2020_statute():
35+
"""Offline fallback must stay aligned with the statutory 2020-21 rates."""
36+
from policyengine_uk_data.datasets.imputations.vat import (
37+
VAT_RATE_BY_YEAR,
38+
)
39+
40+
assert VAT_RATE_BY_YEAR[2020] == (0.2, 0.03)
41+
42+
43+
def test_generate_etb_table_uses_year_param():
44+
"""Changing the `year` arg filters ETB rows by that year.
45+
46+
The original implementation hardcoded ``etb.year == 2020``. After the
47+
fix the year is a parameter with a sensible default.
48+
"""
49+
from policyengine_uk_data.datasets.imputations.vat import (
50+
generate_etb_table,
51+
)
52+
53+
etb = pd.DataFrame(
54+
{
55+
"year": [2020, 2020, 2021, 2021],
56+
"adults": [1, 2, 1, 2],
57+
"childs": [0, 1, 0, 1],
58+
"noretd": [0, 0, 1, 1],
59+
"disinc": [500.0, 800.0, 600.0, 900.0],
60+
"totvat": [50.0, 80.0, 60.0, 90.0],
61+
"expdis": [500.0, 800.0, 600.0, 900.0],
62+
}
63+
)
64+
65+
out_2020 = generate_etb_table(etb, year=2020)
66+
out_2021 = generate_etb_table(etb, year=2021)
67+
68+
# Filtering is by year column — each year contributes exactly two rows, and
69+
# the income sets below confirm that different rows were selected.
70+
assert len(out_2020) == 2
71+
assert len(out_2021) == 2
72+
# Trained features use household_net_income = disinc * 52.
73+
assert set(out_2020["household_net_income"].to_numpy()) == {500 * 52, 800 * 52}
74+
assert set(out_2021["household_net_income"].to_numpy()) == {600 * 52, 900 * 52}
75+
76+
77+
def test_generate_etb_table_uses_year_specific_vat_rate(monkeypatch):
78+
"""The ``full_rate_vat_expenditure_rate`` column scales with VAT rate."""
79+
from policyengine_uk_data.datasets.imputations import vat as vat_module
80+
81+
etb = pd.DataFrame(
82+
{
83+
"year": [2020, 2030],
84+
"adults": [1, 1],
85+
"childs": [0, 0],
86+
"noretd": [0, 0],
87+
"disinc": [1000.0, 1000.0],
88+
"totvat": [100.0, 100.0],
89+
"expdis": [1000.0, 1000.0],
90+
}
91+
)
92+
93+
def _fake_params(year: int):
94+
return (0.2, 0.0) if year == 2020 else (0.25, 0.0)
95+
96+
monkeypatch.setattr(vat_module, "_get_vat_parameters", _fake_params)
97+
98+
out_2020 = vat_module.generate_etb_table(etb, year=2020)
99+
out_hypothetical = vat_module.generate_etb_table(etb, year=2030)
100+
101+
# Higher standard rate → lower implied full-rate expenditure (divide
102+
# totvat by a bigger denominator), so the computed rate must drop.
103+
assert (
104+
out_hypothetical["full_rate_vat_expenditure_rate"].iloc[0]
105+
< (out_2020["full_rate_vat_expenditure_rate"].iloc[0])
106+
)

0 commit comments

Comments
 (0)