33
44This module imputes household VAT expenditure rates based on demographic
55characteristics using machine learning models trained on ETB survey data.
6+
7+ The ETB VAT columns report the standard-rate VAT actually paid plus a
8+ reduced-rate share of expenditure. To back out the underlying
9+ full-rate-taxable expenditure we scale total VAT by one minus an
10+ OBR-published reduced-rate share of consumption and divide by the
11+ statutory VAT standard rate. Both inputs are parameterised per year so
12+ later years (or forthcoming rate changes) don't need a code edit.
613"""
714
815import pandas as pd
1421
1522ETB_TAB_FOLDER = STORAGE_FOLDER / "etb_1977_21"
1623
17- CONSUMPTION_PCT_REDUCED_RATE = 0.03 # From OBR's VAT page
18- CURRENT_VAT_RATE = 0.2
24+ # Default ETB vintage used when training the imputation model. Kept at 2020
25+ # for backward compatibility with the checked-in vat.pkl fingerprint, but
26+ # exposed as a module constant rather than an inline magic number so later
27+ # updates require only a one-line change (not scattered `etb.year == 2020`
28+ # checks).
29+ DEFAULT_ETB_YEAR = 2020
30+
31+ # Fallback VAT parameters used when `policyengine_uk` is unavailable (e.g.
32+ # unit-test environments). Values match the 2020-21 UK statutory position.
33+ _FALLBACK_VAT_STANDARD_RATE = 0.2
34+ _FALLBACK_REDUCED_RATE_SHARE = 0.03
35+
36+ # Manual year → (standard rate, reduced-rate share) fallback table, consulted
37+ # only when the `policyengine_uk` parameter lookup fails. Kept intentionally
38+ # short: extend only if the team agrees that a VAT rate change warrants a
39+ # hardcoded value until the parameter file is updated upstream.
40+ VAT_RATE_BY_YEAR : dict [int , tuple [float , float ]] = {
41+ 2020 : (0.2 , 0.03 ),
42+ 2021 : (0.2 , 0.03 ),
43+ }
1944
2045PREDICTORS = ["is_adult" , "is_child" , "is_SP_age" , "household_net_income" ]
2146IMPUTATIONS = ["full_rate_vat_expenditure_rate" ]
2247
2348
24- def generate_etb_table (etb : pd .DataFrame ):
def _get_vat_parameters(year: int) -> tuple[float, float]:
    """Return ``(standard_rate, reduced_rate_share)`` for the given calendar year.

    Resolution order:

    1. Live `policyengine_uk` parameters (``gov.hmrc.vat.standard_rate`` and
       ``gov.hmrc.vat.reduced_rate_share``), evaluated at ``str(year)``.
    2. The module-level ``VAT_RATE_BY_YEAR`` table, if it has an entry for
       ``year``.
    3. ``_FALLBACK_VAT_STANDARD_RATE`` and ``_FALLBACK_REDUCED_RATE_SHARE``.

    Steps 2 and 3 are reached only when step 1 raises. Each fallback emits a
    warning through the ``logging`` module so that a substituted value is
    visible to whoever runs the training job instead of being applied
    silently.

    Args:
        year: Calendar year to look the VAT parameters up for.

    Returns:
        A ``(standard_rate, reduced_rate_share)`` tuple of floats.
    """
    # Imported locally so this helper does not depend on a module-level logger.
    import logging

    logger = logging.getLogger(__name__)

    try:
        from policyengine_uk.system import system

        vat = system.parameters.gov.hmrc.vat
        instant = str(year)
        live = (
            float(vat.standard_rate(instant)),
            float(vat.reduced_rate_share(instant)),
        )
    except Exception as err:
        # Deliberately broad: the lookup is best-effort and may fail because
        # the package is missing or because the parameter cannot be resolved.
        # The failure is reported rather than swallowed.
        logger.warning(
            "Could not read VAT parameters for %s from policyengine_uk "
            "(%s: %s); using hardcoded values instead.",
            year,
            type(err).__name__,
            err,
        )
    else:
        return live

    tabulated = VAT_RATE_BY_YEAR.get(year)
    if tabulated is not None:
        return tabulated

    # Last resort: the year is not in the manual table either, so the values
    # returned here are not specific to the requested year.
    logger.warning(
        "No hardcoded VAT parameters for %s; using default standard rate %s "
        "and reduced-rate share %s.",
        year,
        _FALLBACK_VAT_STANDARD_RATE,
        _FALLBACK_REDUCED_RATE_SHARE,
    )
    return _FALLBACK_VAT_STANDARD_RATE, _FALLBACK_REDUCED_RATE_SHARE
69+
70+
def generate_etb_table(etb: pd.DataFrame, year: int = DEFAULT_ETB_YEAR) -> pd.DataFrame:
    """
    Clean and transform ETB data for VAT imputation model training.

    The imputation target is computed per row as
    ``(totvat * (1 - reduced_rate_share) / standard_rate) / (expdis - totvat)``,
    where the two VAT parameters come from ``_get_vat_parameters(year)``.

    Args:
        etb: Raw ETB survey data DataFrame. Must provide the columns
            ``year``, ``adults``, ``childs``, ``noretd``, ``disinc``,
            ``totvat`` and ``expdis``.
        year: ETB survey year to filter to. Defaults to ``DEFAULT_ETB_YEAR``.

    Returns:
        Cleaned DataFrame with VAT expenditure rates calculated. It has the
        columns ``is_adult``, ``is_child``, ``is_SP_age``,
        ``household_net_income`` and ``full_rate_vat_expenditure_rate``, and
        contains only rows whose rate is a finite number.
    """
    standard_rate, reduced_rate_share = _get_vat_parameters(year)

    # Drop incomplete rows first, then coerce every column to numeric.
    # NOTE(review): values that only become NaN during coercion are not
    # removed here; such rows are dropped later only if the NaN reaches the
    # rate column.
    survey = etb[etb.year == year].dropna()
    survey = survey.apply(pd.to_numeric, errors="coerce")

    total_vat = survey["totvat"]
    full_rate_spend = total_vat * (1 - reduced_rate_share) / standard_rate
    net_of_vat_spend = survey["expdis"] - total_vat

    table = pd.DataFrame(
        {
            "is_adult": survey["adults"],
            "is_child": survey["childs"],
            "is_SP_age": survey["noretd"],
            # presumably disinc is weekly and * 52 annualises it — TODO confirm
            "household_net_income": survey["disinc"] * 52,
            "full_rate_vat_expenditure_rate": full_rate_spend / net_of_vat_spend,
        }
    )

    # A zero denominator (expdis == totvat) yields +/-inf, which is not NaN
    # and would otherwise pass into model training. Keep finite rates only.
    rate = table["full_rate_vat_expenditure_rate"]
    is_finite = rate.notna() & (rate.abs() != float("inf"))
    return table[is_finite]
97+
98+
99+ def save_imputation_models (year : int = DEFAULT_ETB_YEAR ):
50100 """
51101 Train and save VAT imputation model.
52102
@@ -61,7 +111,7 @@ def save_imputation_models():
61111 delimiter = "\t " ,
62112 low_memory = False ,
63113 )
64- etb = generate_etb_table (etb )
114+ etb = generate_etb_table (etb , year = year )
65115 etb = etb [PREDICTORS + IMPUTATIONS ]
66116 vat .fit (etb [PREDICTORS ], etb [IMPUTATIONS ])
67117 vat .save (STORAGE_FOLDER / "vat.pkl" )
0 commit comments