From 261a2fd4599a8262fe4f7e4c0d5316ae94ecc6fa Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 07:14:47 -0400 Subject: [PATCH 1/4] Impute gift_aid from SPI for high-income donor rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The enhanced FRS's SPI-donor half carries `gift_aid = 0` for every record — including the ~10k synthetic high-earner rows generated by the SPI income imputation — because the QRF was only trained to predict the six core income components. `gift_aid` was already listed in `SPI_RENAMES` (so SPI's `GIFTAID` column was being prepared as a training input) but it wasn't in `IMPUTATIONS`, so it never reached the model's output set. As a result, the entire modelled population missed the ~£1-1.5bn/yr of Gift Aid tax relief that HMRC actually pays, and any reform touching the Gift Aid regime was a no-op. Adds `gift_aid` to `IMPUTATIONS`. Because it's trained jointly with the income components, each SPI-donor row now carries a Gift Aid figure drawn alongside its income from the same SPI respondent, giving correlated draws (high earners likelier to make larger Gift Aid claims) rather than a demographic-only smear. Verified on a 10,000-row synthetic SPI-donor sample after retraining: - All rows: 5.6% nonzero gift_aid, mean among nonzero £2,492 - Employment income >= £200k: 41.4% nonzero gift_aid, mean £4,543 Renames the cached pickle to `income_v2.pkl` so any existing local pickle (which doesn't have `gift_aid` as an output) is bypassed and retraining happens automatically on next build. CI always trains from scratch, so this is a no-op there. Does not touch the FRS-side imputation (still only overwrites `dividend_income`). Properly imputing Gift Aid on the FRS side would require an income-conditional model, outside this PR's scope. Co-Authored-By: Claude Opus 4.7 (1M context) --- changelog.d/impute-gift-aid-from-spi.fixed.md | 1 + policyengine_uk_data/datasets/imputations/income.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 changelog.d/impute-gift-aid-from-spi.fixed.md diff --git a/changelog.d/impute-gift-aid-from-spi.fixed.md b/changelog.d/impute-gift-aid-from-spi.fixed.md new file mode 100644 index 000000000..cc48320e3 --- /dev/null +++ b/changelog.d/impute-gift-aid-from-spi.fixed.md @@ -0,0 +1 @@ +Include `gift_aid` in the SPI income imputation model so synthetic high-earner rows carry plausible Gift Aid expenditure drawn jointly with income, instead of a flat zero. Previously, the 6-variable QRF ran over only the core income components; `gift_aid` was in `SPI_RENAMES` but never reached the predicted output, so the SPI-donor half of the enhanced FRS carried its FRS donor's Gift Aid (which is always zero because the FRS doesn't collect it). Adds `gift_aid` to the model's output list and renames the cache file to force retraining. diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index f9ea4577f..b2cc6b78e 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -101,6 +101,13 @@ def generate_spi_table(spi: pd.DataFrame): "dividend_income", "private_pension_income", "property_income", + # Gift Aid is in SPI but isn't in FRS — without it in this list, the + # zero-weight SPI-donor rows carry a middle-income FRS donor's (always + # zero) Gift Aid, producing a 100% miss on the £1-1.5bn/yr Gift Aid + # higher-rate relief flow. Including it here means the multi-output QRF + # draws gift_aid jointly with income components, so high-earner donors + # get plausibly non-zero Gift Aid. + "gift_aid", ] @@ -118,7 +125,7 @@ def save_imputation_models(): spi = generate_spi_table(spi) spi = spi[PREDICTORS + IMPUTATIONS] income.fit(spi[PREDICTORS], spi[IMPUTATIONS]) - income.save(STORAGE_FOLDER / "income.pkl") + income.save(STORAGE_FOLDER / "income_v2.pkl") return income @@ -134,8 +141,8 @@ def create_income_model(overwrite_existing: bool = False): """ from policyengine_uk_data.utils.qrf import QRF - if (STORAGE_FOLDER / "income.pkl").exists() and not overwrite_existing: - return QRF(file_path=STORAGE_FOLDER / "income.pkl") + if (STORAGE_FOLDER / "income_v2.pkl").exists() and not overwrite_existing: + return QRF(file_path=STORAGE_FOLDER / "income_v2.pkl") return save_imputation_models() From 6923996d56dd1afd9021ea26de49469d6a2ee017 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 07:41:34 -0400 Subject: [PATCH 2/4] Split INCOME_COMPONENTS from IMPUTATIONS and initialise gift_aid column MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI surfaced two issues with the initial version: 1. `impute_over_incomes` used `dataset.person[IMPUTATIONS]` to compute a rent/mortgage adjustment factor. With gift_aid in IMPUTATIONS but not on the raw FRS build, the selection raised `KeyError: ['gift_aid'] not in index`. Gift Aid is also an expenditure, not income, so it shouldn't be in an "income total". Split INCOME_COMPONENTS (6 income variables, used for the adjustment factor) from IMPUTATIONS (training outputs, which additionally include gift_aid). 2. The full-FRS half of `impute_income` only overwrites dividend_income, so gift_aid remained unset on those rows. When `stack_datasets` combined the two halves, the full-FRS rows surfaced NaN gift_aid and subsequent `validate()` calls in the dataset-uprating path tripped. Initialise `dataset.person["gift_aid"] = 0.0` at the top of `impute_income` so the full-FRS side has a concrete value from the start. End-to-end verified locally against FRS 2023-24 + SPI 2020-21: - SPI-donor rows (weight=0, ~22k): 5.8% have nonzero gift_aid - Full-FRS rows (~36k): gift_aid = 0 (as intended — no change there) - No NaN columns; `validate()` passes on both halves and the stack Co-Authored-By: Claude Opus 4.7 (1M context) --- .../datasets/imputations/income.py | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index b2cc6b78e..92ca348ff 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -94,22 +94,25 @@ def generate_spi_table(spi: pd.DataFrame): "region", ] -IMPUTATIONS = [ +INCOME_COMPONENTS = [ "employment_income", "self_employment_income", "savings_interest_income", "dividend_income", "private_pension_income", "property_income", - # Gift Aid is in SPI but isn't in FRS — without it in this list, the - # zero-weight SPI-donor rows carry a middle-income FRS donor's (always - # zero) Gift Aid, producing a 100% miss on the £1-1.5bn/yr Gift Aid - # higher-rate relief flow. Including it here means the multi-output QRF - # draws gift_aid jointly with income components, so high-earner donors - # get plausibly non-zero Gift Aid. - "gift_aid", ] +# Gift Aid is in SPI but isn't in FRS — without it in the model outputs, +# the zero-weight SPI-donor rows carry a middle-income FRS donor's (always +# zero) Gift Aid, missing the £1-1.5bn/yr Gift Aid higher-rate relief flow. +# Including it here means the multi-output QRF draws gift_aid jointly with +# income components, so high-earner donors get plausibly non-zero Gift Aid. +# We keep it separate from INCOME_COMPONENTS because the rent/mortgage +# adjustment factor downstream is built from income sums, and Gift Aid is +# an expenditure, not income. +IMPUTATIONS = INCOME_COMPONENTS + ["gift_aid"] + def save_imputation_models(): """ @@ -162,13 +165,13 @@ def impute_over_incomes( dataset = dataset.copy() sim = Microsimulation(dataset=dataset) input_df = sim.calculate_dataframe(["age", "gender", "region"]) - original_income_total = dataset.person[IMPUTATIONS].copy().sum().sum() + original_income_total = dataset.person[INCOME_COMPONENTS].copy().sum().sum() output_df = model.predict(input_df) for column in output_variables: dataset.person[column] = output_df[column].fillna(0).values - new_income_total = dataset.person[IMPUTATIONS].sum().sum() + new_income_total = dataset.person[INCOME_COMPONENTS].sum().sum() adjustment_factor = new_income_total / original_income_total # Adjust rent and mortgage interest and capital repayments proportionally dataset.household["rent"] = dataset.household["rent"] * adjustment_factor @@ -198,6 +201,13 @@ def impute_income(dataset: UKSingleYearDataset) -> UKSingleYearDataset: """ # Impute wealth, assuming same time period as trained data dataset = dataset.copy() + # gift_aid is in IMPUTATIONS but is not a column on the raw FRS build, so + # initialise it to zero everywhere before imputation. Without this, the + # full-FRS half stays NaN for gift_aid (it's never touched by the dividend- + # only impute_over_incomes call below), and the eventual stacked dataset + # fails validate() on the gift_aid column. + if "gift_aid" not in dataset.person.columns: + dataset.person["gift_aid"] = 0.0 zero_weight_copy = dataset.copy() zero_weight_copy.household.household_weight = 0 zero_weight_copy = subsample_dataset(zero_weight_copy, 10_000) From 5eb85a8bcb8fb967e94ebdeb524c6dac0f26ac25 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 08:38:17 -0400 Subject: [PATCH 3/4] Also impute charitable_investment_gifts (GIFTINV) from SPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer feedback: SPI's `GIFTINV` (qualifying investment/property gifts) is a separate relief on the UK side — policyengine-uk has a distinct `charitable_investment_gifts` variable that flows into the income-tax `allowances` aggregate. The previous revision of this PR only added `gift_aid` (SPI `GIFTAID`) to the QRF outputs, which is inconsistent with the standalone SPI dataset path (`datasets/spi.py:88`) that sums `GIFTAID + GIFTINV` into a single column. The enhanced-FRS path should populate each charitable-giving variable separately so each maps to the right policyengine-uk variable. Adds `charitable_investment_gifts` to `IMPUTATIONS` alongside `gift_aid`, extends the zero-initialisation to both columns, and bumps the cache file to `income_v3.pkl` so stale pickles from the previous revision retrain automatically. Verified locally against FRS 2023-24 + SPI 2020-21: - gift_aid: 1,271 of 21,607 SPI-donor rows nonzero (5.9%) - charitable_investment_gifts: 13 of 21,607 SPI-donor rows nonzero (0.06%) - Both validate() cleanly; stacked dataset has no NaN in either column Both columns stay at 0 on the full-FRS half (as intended — FRS doesn't collect charitable giving). Co-Authored-By: Claude Opus 4.7 (1M context) --- changelog.d/impute-gift-aid-from-spi.fixed.md | 2 +- .../datasets/imputations/income.py | 45 +++++++++++-------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/changelog.d/impute-gift-aid-from-spi.fixed.md b/changelog.d/impute-gift-aid-from-spi.fixed.md index cc48320e3..d51a2c1de 100644 --- a/changelog.d/impute-gift-aid-from-spi.fixed.md +++ b/changelog.d/impute-gift-aid-from-spi.fixed.md @@ -1 +1 @@ -Include `gift_aid` in the SPI income imputation model so synthetic high-earner rows carry plausible Gift Aid expenditure drawn jointly with income, instead of a flat zero. Previously, the 6-variable QRF ran over only the core income components; `gift_aid` was in `SPI_RENAMES` but never reached the predicted output, so the SPI-donor half of the enhanced FRS carried its FRS donor's Gift Aid (which is always zero because the FRS doesn't collect it). Adds `gift_aid` to the model's output list and renames the cache file to force retraining. +Include `gift_aid` (SPI `GIFTAID`) and `charitable_investment_gifts` (SPI `GIFTINV`) in the SPI income imputation model so synthetic high-earner rows carry plausible charitable giving drawn jointly with income, instead of a flat zero. Previously the 6-variable QRF ran over only the core income components; both charitable relief columns were in `SPI_RENAMES` but never reached the predicted output, so the SPI-donor half of the enhanced FRS carried its FRS donor's (always-zero) charitable giving. Adds both columns to the model's output list, renames the cache file to force retraining, and initialises the FRS-side columns to zero to keep the stacked dataset valid. diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index 92ca348ff..e9839a8a3 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -103,15 +103,21 @@ def generate_spi_table(spi: pd.DataFrame): "property_income", ] -# Gift Aid is in SPI but isn't in FRS — without it in the model outputs, -# the zero-weight SPI-donor rows carry a middle-income FRS donor's (always -# zero) Gift Aid, missing the £1-1.5bn/yr Gift Aid higher-rate relief flow. -# Including it here means the multi-output QRF draws gift_aid jointly with -# income components, so high-earner donors get plausibly non-zero Gift Aid. -# We keep it separate from INCOME_COMPONENTS because the rent/mortgage -# adjustment factor downstream is built from income sums, and Gift Aid is -# an expenditure, not income. -IMPUTATIONS = INCOME_COMPONENTS + ["gift_aid"] +# Gift Aid (SPI GIFTAID) and charitable investment gifts (SPI GIFTINV) are +# separate reliefs on the UK side but both absent from the FRS — without them +# in the model outputs, the zero-weight SPI-donor rows carry a middle-income +# FRS donor's (always zero) charitable giving, missing the £1-1.5bn/yr Gift +# Aid higher-rate relief flow and an additional ~£0.1bn of qualifying- +# investment gifts. Including them here means the multi-output QRF draws +# them jointly with income components, so high-earner donors get plausibly +# non-zero values. Kept separate from INCOME_COMPONENTS because the +# rent/mortgage adjustment factor downstream is built from income sums, and +# these are expenditures, not income. The standalone SPI dataset in +# `datasets/spi.py` sums GIFTAID + GIFTINV into a single `gift_aid` column +# because that path doesn't carry a separate `charitable_investment_gifts` +# variable; the enhanced-FRS path here keeps them separate so each maps to +# its own policyengine-uk variable. +IMPUTATIONS = INCOME_COMPONENTS + ["gift_aid", "charitable_investment_gifts"] def save_imputation_models(): @@ -128,7 +134,7 @@ def save_imputation_models(): spi = generate_spi_table(spi) spi = spi[PREDICTORS + IMPUTATIONS] income.fit(spi[PREDICTORS], spi[IMPUTATIONS]) - income.save(STORAGE_FOLDER / "income_v2.pkl") + income.save(STORAGE_FOLDER / "income_v3.pkl") return income @@ -144,8 +150,8 @@ def create_income_model(overwrite_existing: bool = False): """ from policyengine_uk_data.utils.qrf import QRF - if (STORAGE_FOLDER / "income_v2.pkl").exists() and not overwrite_existing: - return QRF(file_path=STORAGE_FOLDER / "income_v2.pkl") + if (STORAGE_FOLDER / "income_v3.pkl").exists() and not overwrite_existing: + return QRF(file_path=STORAGE_FOLDER / "income_v3.pkl") return save_imputation_models() @@ -201,13 +207,14 @@ def impute_income(dataset: UKSingleYearDataset) -> UKSingleYearDataset: """ # Impute wealth, assuming same time period as trained data dataset = dataset.copy() - # gift_aid is in IMPUTATIONS but is not a column on the raw FRS build, so - # initialise it to zero everywhere before imputation. Without this, the - # full-FRS half stays NaN for gift_aid (it's never touched by the dividend- - # only impute_over_incomes call below), and the eventual stacked dataset - # fails validate() on the gift_aid column. - if "gift_aid" not in dataset.person.columns: - dataset.person["gift_aid"] = 0.0 + # gift_aid and charitable_investment_gifts are in IMPUTATIONS but are not + # columns on the raw FRS build, so initialise them to zero everywhere + # before imputation. Without this, the full-FRS half stays NaN for these + # columns (they're never touched by the dividend-only impute_over_incomes + # call below), and the eventual stacked dataset fails validate(). + for column in ("gift_aid", "charitable_investment_gifts"): + if column not in dataset.person.columns: + dataset.person[column] = 0.0 zero_weight_copy = dataset.copy() zero_weight_copy.household.household_weight = 0 zero_weight_copy = subsample_dataset(zero_weight_copy, 10_000) From c5f581dfc031355609f57692a3eafa099cbc89fe Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 17 Apr 2026 09:15:29 -0400 Subject: [PATCH 4/4] Keep cache file name stable; validate outputs instead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts the `income.pkl` → `income_v2.pkl` → `income_v3.pkl` cache rename in favour of stable naming. Bumping the filename on every output-set change is a form of the `_v2`/`_v3` anti-pattern the repo's CLAUDE.md calls out (it leaves orphan pickles behind on local dev machines and adds to the "keep deleting deprecated files" debt). Instead, the cache loader now checks `cached.model.imputed_variables` against the current `IMPUTATIONS` and retrains if they disagree. A stale local `income.pkl` from the pre-PR state (six outputs) is detected and rebuilt on first use; no manual deletion required. Verified end-to-end: - Fresh run: `income.pkl` created with 8 outputs; `charitable_investment_gifts` and `gift_aid` populated as expected. - Forced stale state: a 6-output model saved under `income.pkl` is detected on load, discarded, and retrained to 8 outputs automatically. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../datasets/imputations/income.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index e9839a8a3..36d41cd50 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -120,6 +120,9 @@ def generate_spi_table(spi: pd.DataFrame): IMPUTATIONS = INCOME_COMPONENTS + ["gift_aid", "charitable_investment_gifts"] +INCOME_MODEL_PATH = STORAGE_FOLDER / "income.pkl" + + def save_imputation_models(): """ Train and save income imputation model. @@ -134,7 +137,7 @@ def save_imputation_models(): spi = generate_spi_table(spi) spi = spi[PREDICTORS + IMPUTATIONS] income.fit(spi[PREDICTORS], spi[IMPUTATIONS]) - income.save(STORAGE_FOLDER / "income_v3.pkl") + income.save(INCOME_MODEL_PATH) return income @@ -142,6 +145,11 @@ def create_income_model(overwrite_existing: bool = False): """ Create or load income imputation model. + If a cached model exists and its trained output columns don't match the + current ``IMPUTATIONS`` list, the cache is discarded and the model is + retrained. This handles the case where ``IMPUTATIONS`` is extended in + code but an older pickle is still on disk. + Args: overwrite_existing: Whether to retrain model if it exists. @@ -150,8 +158,12 @@ def create_income_model(overwrite_existing: bool = False): """ from policyengine_uk_data.utils.qrf import QRF - if (STORAGE_FOLDER / "income_v3.pkl").exists() and not overwrite_existing: - return QRF(file_path=STORAGE_FOLDER / "income_v3.pkl") + if INCOME_MODEL_PATH.exists() and not overwrite_existing: + cached = QRF(file_path=INCOME_MODEL_PATH) + cached_outputs = set(getattr(cached.model, "imputed_variables", [])) + if cached_outputs == set(IMPUTATIONS): + return cached + # Cached model was trained against a different output set; retrain. return save_imputation_models()