# The ETB `totvat` column is converted into a full-rate expenditure rate by
# scaling it with ``(1 - reduced_rate_share)`` and dividing by the statutory
# VAT standard rate. Both figures are looked up per training year so a rate
# change does not need a code edit.

# Default ETB vintage used when training the imputation model. Kept at 2020
# for backward compatibility with the checked-in vat.pkl fingerprint, and
# exposed as a module constant so a vintage change is a one-line edit.
DEFAULT_ETB_YEAR = 2020

# Last-resort VAT parameters used when neither `policyengine_uk` nor
# `VAT_RATE_BY_YEAR` can supply a value (e.g. unit-test environments).
# Values match the 2020-21 UK statutory position.
_FALLBACK_VAT_STANDARD_RATE = 0.2
_FALLBACK_REDUCED_RATE_SHARE = 0.03

# Manual year -> (standard rate, reduced-rate share) override used when
# `policyengine_uk` parameters are not available. Kept intentionally short:
# extend only if the team agrees that a VAT code change warrants a hardcoded
# value until the parameter file is updated upstream.
VAT_RATE_BY_YEAR: dict[int, tuple[float, float]] = {
    2020: (0.2, 0.03),
    2021: (0.2, 0.03),
}

PREDICTORS = ["is_adult", "is_child", "is_SP_age", "household_net_income"]
IMPUTATIONS = ["full_rate_vat_expenditure_rate"]


def _get_vat_parameters(year: int) -> tuple[float, float]:
    """Return ``(standard_rate, reduced_rate_share)`` for the given calendar year.

    Lookup order:

    1. Live `policyengine_uk` parameters (``gov.hmrc.vat.standard_rate`` and
       ``gov.hmrc.vat.reduced_rate_share``), evaluated at ``str(year)``.
    2. The module-level ``VAT_RATE_BY_YEAR`` table.
    3. The 2020-21 statutory values in ``_FALLBACK_VAT_STANDARD_RATE`` and
       ``_FALLBACK_REDUCED_RATE_SHARE``.

    Steps 2 and 3 are only reached when step 1 raises. Whenever that happens
    a warning is logged naming the cause and the source actually used, so a
    fallback is never silent.

    Args:
        year: Calendar year the parameters should apply to.

    Returns:
        Tuple of ``(standard_rate, reduced_rate_share)`` as floats.
    """
    # Local import: keeps the module's top-level import block untouched.
    import logging

    try:
        from policyengine_uk.system import system

        vat_parameters = system.parameters.gov.hmrc.vat
        instant = str(year)
        return (
            float(vat_parameters.standard_rate(instant)),
            float(vat_parameters.reduced_rate_share(instant)),
        )
    except Exception as exc:
        # Deliberately broad: a missing package, a failing system build and a
        # missing parameter must all degrade to the offline values rather
        # than abort the imputation. The failure is reported, not hidden.
        rates = VAT_RATE_BY_YEAR.get(year)
        if rates is None:
            origin = "2020-21 statutory defaults"
            rates = (_FALLBACK_VAT_STANDARD_RATE, _FALLBACK_REDUCED_RATE_SHARE)
        else:
            origin = "VAT_RATE_BY_YEAR"
        logging.getLogger(__name__).warning(
            "VAT parameters for %s unavailable from policyengine_uk (%s: %s); "
            "using %s: standard_rate=%s, reduced_rate_share=%s",
            year,
            type(exc).__name__,
            exc,
            origin,
            rates[0],
            rates[1],
        )
        return rates


def generate_etb_table(etb: pd.DataFrame, year: int = DEFAULT_ETB_YEAR) -> pd.DataFrame:
    """
    Clean and transform ETB data for VAT imputation model training.

    Rows are filtered to ``etb.year == year`` and rows with any missing value
    are dropped. The columns used below are then coerced to numeric, with
    unparseable entries becoming NaN. The target column is::

        (totvat * (1 - reduced_rate_share) / standard_rate) / (expdis - totvat)

    Rows whose target is NaN are removed. This includes rows where
    ``expdis == totvat``, whose zero denominator would otherwise leak
    ``inf`` into the training data.

    Args:
        etb: Raw ETB survey data DataFrame. Must contain the columns
            ``year``, ``adults``, ``childs``, ``noretd``, ``disinc``,
            ``totvat`` and ``expdis``.
        year: ETB survey year to filter to. Defaults to ``DEFAULT_ETB_YEAR``.

    Returns:
        DataFrame with the ``PREDICTORS`` columns plus
        ``full_rate_vat_expenditure_rate``, indexed like the retained input
        rows. Empty if no row matches ``year``.

    Raises:
        ValueError: If the VAT standard rate resolved for ``year`` is not a
            positive number, since the target cannot be computed from it.
    """
    standard_rate, reduced_rate_share = _get_vat_parameters(year)
    # `not x > 0` also rejects NaN, which a plain `x <= 0` would let through.
    if not standard_rate > 0:
        raise ValueError(
            f"VAT standard rate for {year} must be positive, got {standard_rate!r}"
        )

    # dropna() still looks at every column (unchanged row selection), but only
    # the columns that feed the output need numeric coercion.
    rows = etb[etb["year"] == year].dropna()
    numeric = {
        name: pd.to_numeric(rows[name], errors="coerce")
        for name in ("adults", "childs", "noretd", "disinc", "totvat", "expdis")
    }

    denominator = numeric["expdis"] - numeric["totvat"]
    full_rate_amount = numeric["totvat"] * (1 - reduced_rate_share) / standard_rate
    # A zero denominator yields +/-inf, which isna()/notna() do not catch;
    # mask those rows to NaN so the final filter removes them.
    rate = (full_rate_amount / denominator).where(denominator != 0)

    table = pd.DataFrame(
        {
            "is_adult": numeric["adults"],
            "is_child": numeric["childs"],
            "is_SP_age": numeric["noretd"],
            "household_net_income": numeric["disinc"] * 52,
            "full_rate_vat_expenditure_rate": rate,
        }
    )
    return table[table["full_rate_vat_expenditure_rate"].notna()]
"""Tests for parameterised VAT constants in `datasets/imputations/vat.py`.

Covers bug-hunt finding U7: the original code hardcoded
``CURRENT_VAT_RATE = 0.2``, ``CONSUMPTION_PCT_REDUCED_RATE = 0.03`` and
the ``etb.year == 2020`` filter inline, so any change to VAT rates,
reduced-rate share, or training vintage required a code edit across
multiple scattered lines.
"""

from __future__ import annotations

import sys

import pandas as pd
import pytest


def test_get_vat_parameters_reads_from_policyengine_uk():
    """Standard rate and reduced-rate share come from `policyengine_uk`."""
    try:
        from policyengine_uk.system import system
    except Exception:
        pytest.skip("policyengine_uk not available")

    from policyengine_uk_data.datasets.imputations.vat import (
        _get_vat_parameters,
    )

    vat_parameters = system.parameters.gov.hmrc.vat
    expected_standard = float(vat_parameters.standard_rate("2020"))
    expected_reduced = float(vat_parameters.reduced_rate_share("2020"))

    standard, reduced = _get_vat_parameters(2020)
    assert standard == pytest.approx(expected_standard)
    assert reduced == pytest.approx(expected_reduced)


def test_get_vat_parameters_falls_back_when_policyengine_uk_missing(monkeypatch):
    """Offline path: table entry for known years, 2020-21 defaults otherwise."""
    from policyengine_uk_data.datasets.imputations import vat as vat_module

    # A ``None`` entry in ``sys.modules`` makes the import statement raise
    # ImportError, which simulates an environment without `policyengine_uk`.
    monkeypatch.setitem(sys.modules, "policyengine_uk.system", None)

    assert vat_module._get_vat_parameters(2021) == vat_module.VAT_RATE_BY_YEAR[2021]
    assert vat_module._get_vat_parameters(1999) == (
        vat_module._FALLBACK_VAT_STANDARD_RATE,
        vat_module._FALLBACK_REDUCED_RATE_SHARE,
    )


def test_vat_rate_by_year_fallback_matches_2020_statute():
    """Offline fallback must stay aligned with the statutory 2020-21 rates."""
    from policyengine_uk_data.datasets.imputations.vat import (
        VAT_RATE_BY_YEAR,
    )

    assert VAT_RATE_BY_YEAR[2020] == (0.2, 0.03)


def test_generate_etb_table_uses_year_param(monkeypatch):
    """Changing the `year` arg filters ETB rows by that year.

    The original implementation hardcoded ``etb.year == 2020``. After the
    fix the year is a parameter with a sensible default.
    """
    from policyengine_uk_data.datasets.imputations import vat as vat_module

    # Pin the VAT parameters so this test checks only the row filter and does
    # not depend on whether `policyengine_uk` is installed.
    monkeypatch.setattr(vat_module, "_get_vat_parameters", lambda year: (0.2, 0.03))

    etb = pd.DataFrame(
        {
            "year": [2020, 2020, 2021, 2021],
            "adults": [1, 2, 1, 2],
            "childs": [0, 1, 0, 1],
            "noretd": [0, 0, 1, 1],
            "disinc": [500.0, 800.0, 600.0, 900.0],
            "totvat": [50.0, 80.0, 60.0, 90.0],
            "expdis": [500.0, 800.0, 600.0, 900.0],
        }
    )

    out_2020 = vat_module.generate_etb_table(etb, year=2020)
    out_2021 = vat_module.generate_etb_table(etb, year=2021)

    assert len(out_2020) == 2
    assert len(out_2021) == 2
    # Both years have two rows, so the counts alone cannot tell them apart;
    # the distinct income values (household_net_income = disinc * 52) are
    # what confirm that the filter followed the `year` argument.
    assert set(out_2020["household_net_income"].to_numpy()) == {500 * 52, 800 * 52}
    assert set(out_2021["household_net_income"].to_numpy()) == {600 * 52, 900 * 52}


def test_generate_etb_table_uses_year_specific_vat_rate(monkeypatch):
    """The ``full_rate_vat_expenditure_rate`` column scales with VAT rate."""
    from policyengine_uk_data.datasets.imputations import vat as vat_module

    etb = pd.DataFrame(
        {
            "year": [2020, 2030],
            "adults": [1, 1],
            "childs": [0, 0],
            "noretd": [0, 0],
            "disinc": [1000.0, 1000.0],
            "totvat": [100.0, 100.0],
            "expdis": [1000.0, 1000.0],
        }
    )

    def _fake_params(year: int):
        return (0.2, 0.0) if year == 2020 else (0.25, 0.0)

    monkeypatch.setattr(vat_module, "_get_vat_parameters", _fake_params)

    rate_2020 = vat_module.generate_etb_table(etb, year=2020)[
        "full_rate_vat_expenditure_rate"
    ].iloc[0]
    rate_hypothetical = vat_module.generate_etb_table(etb, year=2030)[
        "full_rate_vat_expenditure_rate"
    ].iloc[0]

    # rate = (totvat * (1 - share) / standard_rate) / (expdis - totvat)
    assert rate_2020 == pytest.approx((100.0 / 0.2) / 900.0)
    assert rate_hypothetical == pytest.approx((100.0 / 0.25) / 900.0)
    # Higher standard rate -> lower implied full-rate expenditure (divide
    # totvat by a bigger denominator), so the computed rate must drop.
    assert rate_hypothetical < rate_2020