From 261a2fd4599a8262fe4f7e4c0d5316ae94ecc6fa Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Fri, 17 Apr 2026 07:14:47 -0400
Subject: [PATCH 1/4] Impute gift_aid from SPI for high-income donor rows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The enhanced FRS's SPI-donor half carries `gift_aid = 0` for every
record — including the ~10k synthetic high-earner rows generated by
the SPI income imputation — because the QRF was only trained to
predict the six core income components. `gift_aid` was already listed
in `SPI_RENAMES` (so SPI's `GIFTAID` column was being prepared as a
training input) but it wasn't in `IMPUTATIONS`, so it never reached
the model's output set. As a result, the entire modelled population
missed the ~£1-1.5bn/yr of Gift Aid tax relief that HMRC actually
pays, and any reform touching the Gift Aid regime was a no-op.

Adds `gift_aid` to `IMPUTATIONS`. Because it's trained jointly with
the income components, each SPI-donor row now carries a Gift Aid
figure drawn alongside its income from the same SPI respondent,
giving correlated draws (high earners likelier to make larger Gift
Aid claims) rather than a demographic-only smear.

Verified on a 10,000-row synthetic SPI-donor sample after retraining:

- All rows: 5.6% nonzero gift_aid, mean among nonzero £2,492
- Employment income >= £200k: 41.4% nonzero gift_aid, mean £4,543

Renames the cached pickle to `income_v2.pkl` so any existing local
pickle (which doesn't have `gift_aid` as an output) is bypassed and
retraining happens automatically on next build. CI always trains
from scratch, so this is a no-op there.

Does not touch the FRS-side imputation (still only overwrites
`dividend_income`). Properly imputing Gift Aid on the FRS side
would require an income-conditional model, outside this PR's scope.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 changelog.d/impute-gift-aid-from-spi.fixed.md       |  1 +
 policyengine_uk_data/datasets/imputations/income.py | 13 ++++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)
 create mode 100644 changelog.d/impute-gift-aid-from-spi.fixed.md

diff --git a/changelog.d/impute-gift-aid-from-spi.fixed.md b/changelog.d/impute-gift-aid-from-spi.fixed.md
new file mode 100644
index 000000000..cc48320e3
--- /dev/null
+++ b/changelog.d/impute-gift-aid-from-spi.fixed.md
@@ -0,0 +1 @@
+Include `gift_aid` in the SPI income imputation model so synthetic high-earner rows carry plausible Gift Aid expenditure drawn jointly with income, instead of a flat zero. Previously, the 6-variable QRF ran over only the core income components; `gift_aid` was in `SPI_RENAMES` but never reached the predicted output, so the SPI-donor half of the enhanced FRS carried its FRS donor's Gift Aid (which is always zero because the FRS doesn't collect it). Adds `gift_aid` to the model's output list and renames the cache file to force retraining.
diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py
index f9ea4577f..b2cc6b78e 100644
--- a/policyengine_uk_data/datasets/imputations/income.py
+++ b/policyengine_uk_data/datasets/imputations/income.py
@@ -101,6 +101,13 @@ def generate_spi_table(spi: pd.DataFrame):
     "dividend_income",
     "private_pension_income",
     "property_income",
+    # Gift Aid is in SPI but isn't in FRS — without it in this list, the
+    # zero-weight SPI-donor rows carry a middle-income FRS donor's (always
+    # zero) Gift Aid, producing a 100% miss on the £1-1.5bn/yr Gift Aid
+    # higher-rate relief flow. Including it here means the multi-output QRF
+    # draws gift_aid jointly with income components, so high-earner donors
+    # get plausibly non-zero Gift Aid.
+    "gift_aid",
 ]
 
 
@@ -118,7 +125,7 @@ def save_imputation_models():
     spi = generate_spi_table(spi)
     spi = spi[PREDICTORS + IMPUTATIONS]
     income.fit(spi[PREDICTORS], spi[IMPUTATIONS])
-    income.save(STORAGE_FOLDER / "income.pkl")
+    income.save(STORAGE_FOLDER / "income_v2.pkl")
     return income
 
 
@@ -134,8 +141,8 @@ def create_income_model(overwrite_existing: bool = False):
     """
     from policyengine_uk_data.utils.qrf import QRF
 
-    if (STORAGE_FOLDER / "income.pkl").exists() and not overwrite_existing:
-        return QRF(file_path=STORAGE_FOLDER / "income.pkl")
+    if (STORAGE_FOLDER / "income_v2.pkl").exists() and not overwrite_existing:
+        return QRF(file_path=STORAGE_FOLDER / "income_v2.pkl")
     return save_imputation_models()
 
 

From 6923996d56dd1afd9021ea26de49469d6a2ee017 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Fri, 17 Apr 2026 07:41:34 -0400
Subject: [PATCH 2/4] Split INCOME_COMPONENTS from IMPUTATIONS and initialise
 gift_aid column
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI surfaced two issues with the initial version:

1. `impute_over_incomes` used `dataset.person[IMPUTATIONS]` to compute a
   rent/mortgage adjustment factor. With gift_aid in IMPUTATIONS but not
   on the raw FRS build, the selection raised `KeyError: ['gift_aid']
   not in index`. Gift Aid is also an expenditure, not income, so it
   shouldn't be in an "income total". Split INCOME_COMPONENTS (6 income
   variables, used for the adjustment factor) from IMPUTATIONS (training
   outputs, which additionally include gift_aid).

2. The full-FRS half of `impute_income` only overwrites dividend_income,
   so gift_aid remained unset on those rows. When `stack_datasets`
   combined the two halves, the full-FRS rows surfaced NaN gift_aid and
   subsequent `validate()` calls in the dataset-uprating path tripped.
   Initialise `dataset.person["gift_aid"] = 0.0` at the top of
   `impute_income` so the full-FRS side has a concrete value from the
   start.

End-to-end verified locally against FRS 2023-24 + SPI 2020-21:

- SPI-donor rows (weight=0, ~22k): 5.8% have nonzero gift_aid
- Full-FRS rows (~36k): gift_aid = 0 (as intended — no change there)
- No NaN columns; `validate()` passes on both halves and the stack

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../datasets/imputations/income.py            | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py
index b2cc6b78e..92ca348ff 100644
--- a/policyengine_uk_data/datasets/imputations/income.py
+++ b/policyengine_uk_data/datasets/imputations/income.py
@@ -94,22 +94,25 @@ def generate_spi_table(spi: pd.DataFrame):
     "region",
 ]
 
-IMPUTATIONS = [
+INCOME_COMPONENTS = [
     "employment_income",
     "self_employment_income",
     "savings_interest_income",
     "dividend_income",
     "private_pension_income",
     "property_income",
-    # Gift Aid is in SPI but isn't in FRS — without it in this list, the
-    # zero-weight SPI-donor rows carry a middle-income FRS donor's (always
-    # zero) Gift Aid, producing a 100% miss on the £1-1.5bn/yr Gift Aid
-    # higher-rate relief flow. Including it here means the multi-output QRF
-    # draws gift_aid jointly with income components, so high-earner donors
-    # get plausibly non-zero Gift Aid.
-    "gift_aid",
 ]
 
+# Gift Aid is in SPI but isn't in FRS — without it in the model outputs,
+# the zero-weight SPI-donor rows carry a middle-income FRS donor's (always
+# zero) Gift Aid, missing the £1-1.5bn/yr Gift Aid higher-rate relief flow.
+# Including it here means the multi-output QRF draws gift_aid jointly with
+# income components, so high-earner donors get plausibly non-zero Gift Aid.
+# We keep it separate from INCOME_COMPONENTS because the rent/mortgage
+# adjustment factor downstream is built from income sums, and Gift Aid is
+# an expenditure, not income.
+IMPUTATIONS = INCOME_COMPONENTS + ["gift_aid"]
+
 
 def save_imputation_models():
     """
@@ -162,13 +165,13 @@ def impute_over_incomes(
     dataset = dataset.copy()
     sim = Microsimulation(dataset=dataset)
     input_df = sim.calculate_dataframe(["age", "gender", "region"])
-    original_income_total = dataset.person[IMPUTATIONS].copy().sum().sum()
+    original_income_total = dataset.person[INCOME_COMPONENTS].copy().sum().sum()
     output_df = model.predict(input_df)
 
     for column in output_variables:
         dataset.person[column] = output_df[column].fillna(0).values
 
-    new_income_total = dataset.person[IMPUTATIONS].sum().sum()
+    new_income_total = dataset.person[INCOME_COMPONENTS].sum().sum()
     adjustment_factor = new_income_total / original_income_total
     # Adjust rent and mortgage interest and capital repayments proportionally
     dataset.household["rent"] = dataset.household["rent"] * adjustment_factor
@@ -198,6 +201,13 @@ def impute_income(dataset: UKSingleYearDataset) -> UKSingleYearDataset:
     """
     # Impute wealth, assuming same time period as trained data
     dataset = dataset.copy()
+    # gift_aid is in IMPUTATIONS but is not a column on the raw FRS build, so
+    # initialise it to zero everywhere before imputation. Without this, the
+    # full-FRS half stays NaN for gift_aid (it's never touched by the dividend-
+    # only impute_over_incomes call below), and the eventual stacked dataset
+    # fails validate() on the gift_aid column.
+    if "gift_aid" not in dataset.person.columns:
+        dataset.person["gift_aid"] = 0.0
     zero_weight_copy = dataset.copy()
     zero_weight_copy.household.household_weight = 0
     zero_weight_copy = subsample_dataset(zero_weight_copy, 10_000)

From 5eb85a8bcb8fb967e94ebdeb524c6dac0f26ac25 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Fri, 17 Apr 2026 08:38:17 -0400
Subject: [PATCH 3/4] Also impute charitable_investment_gifts (GIFTINV) from
 SPI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewer feedback: SPI's `GIFTINV` (qualifying investment/property
gifts) is a separate relief on the UK side — policyengine-uk has a
distinct `charitable_investment_gifts` variable that flows into the
income-tax `allowances` aggregate. The previous revision of this PR
only added `gift_aid` (SPI `GIFTAID`) to the QRF outputs, which is
inconsistent with the standalone SPI dataset path
(`datasets/spi.py:88`) that sums `GIFTAID + GIFTINV` into a single
column. The enhanced-FRS path should populate each charitable-giving
variable separately so each maps to the right policyengine-uk variable.

Adds `charitable_investment_gifts` to `IMPUTATIONS` alongside
`gift_aid`, extends the zero-initialisation to both columns, and bumps
the cache file to `income_v3.pkl` so stale pickles from the previous
revision retrain automatically.

Verified locally against FRS 2023-24 + SPI 2020-21:

- gift_aid: 1,271 of 21,607 SPI-donor rows nonzero (5.9%)
- charitable_investment_gifts: 13 of 21,607 SPI-donor rows nonzero (0.06%)
- Both validate() cleanly; stacked dataset has no NaN in either column

Both columns stay at 0 on the full-FRS half (as intended — FRS doesn't
collect charitable giving).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 changelog.d/impute-gift-aid-from-spi.fixed.md |  2 +-
 .../datasets/imputations/income.py            | 45 +++++++++++--------
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/changelog.d/impute-gift-aid-from-spi.fixed.md b/changelog.d/impute-gift-aid-from-spi.fixed.md
index cc48320e3..d51a2c1de 100644
--- a/changelog.d/impute-gift-aid-from-spi.fixed.md
+++ b/changelog.d/impute-gift-aid-from-spi.fixed.md
@@ -1 +1 @@
-Include `gift_aid` in the SPI income imputation model so synthetic high-earner rows carry plausible Gift Aid expenditure drawn jointly with income, instead of a flat zero. Previously, the 6-variable QRF ran over only the core income components; `gift_aid` was in `SPI_RENAMES` but never reached the predicted output, so the SPI-donor half of the enhanced FRS carried its FRS donor's Gift Aid (which is always zero because the FRS doesn't collect it). Adds `gift_aid` to the model's output list and renames the cache file to force retraining.
+Include `gift_aid` (SPI `GIFTAID`) and `charitable_investment_gifts` (SPI `GIFTINV`) in the SPI income imputation model so synthetic high-earner rows carry plausible charitable giving drawn jointly with income, instead of a flat zero. Previously the 6-variable QRF ran over only the core income components; both charitable relief columns were in `SPI_RENAMES` but never reached the predicted output, so the SPI-donor half of the enhanced FRS carried its FRS donor's (always-zero) charitable giving. Adds both columns to the model's output list, renames the cache file to force retraining, and initialises the FRS-side columns to zero to keep the stacked dataset valid.
diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py
index 92ca348ff..e9839a8a3 100644
--- a/policyengine_uk_data/datasets/imputations/income.py
+++ b/policyengine_uk_data/datasets/imputations/income.py
@@ -103,15 +103,21 @@ def generate_spi_table(spi: pd.DataFrame):
     "property_income",
 ]
 
-# Gift Aid is in SPI but isn't in FRS — without it in the model outputs,
-# the zero-weight SPI-donor rows carry a middle-income FRS donor's (always
-# zero) Gift Aid, missing the £1-1.5bn/yr Gift Aid higher-rate relief flow.
-# Including it here means the multi-output QRF draws gift_aid jointly with
-# income components, so high-earner donors get plausibly non-zero Gift Aid.
-# We keep it separate from INCOME_COMPONENTS because the rent/mortgage
-# adjustment factor downstream is built from income sums, and Gift Aid is
-# an expenditure, not income.
-IMPUTATIONS = INCOME_COMPONENTS + ["gift_aid"]
+# Gift Aid (SPI GIFTAID) and charitable investment gifts (SPI GIFTINV) are
+# separate reliefs on the UK side but both absent from the FRS — without them
+# in the model outputs, the zero-weight SPI-donor rows carry a middle-income
+# FRS donor's (always zero) charitable giving, missing the £1-1.5bn/yr Gift
+# Aid higher-rate relief flow and an additional ~£0.1bn of qualifying-
+# investment gifts. Including them here means the multi-output QRF draws
+# them jointly with income components, so high-earner donors get plausibly
+# non-zero values. Kept separate from INCOME_COMPONENTS because the
+# rent/mortgage adjustment factor downstream is built from income sums, and
+# these are expenditures, not income. The standalone SPI dataset in
+# `datasets/spi.py` sums GIFTAID + GIFTINV into a single `gift_aid` column
+# because that path doesn't carry a separate `charitable_investment_gifts`
+# variable; the enhanced-FRS path here keeps them separate so each maps to
+# its own policyengine-uk variable.
+IMPUTATIONS = INCOME_COMPONENTS + ["gift_aid", "charitable_investment_gifts"]
 
 
 def save_imputation_models():
@@ -128,7 +134,7 @@ def save_imputation_models():
     spi = generate_spi_table(spi)
     spi = spi[PREDICTORS + IMPUTATIONS]
     income.fit(spi[PREDICTORS], spi[IMPUTATIONS])
-    income.save(STORAGE_FOLDER / "income_v2.pkl")
+    income.save(STORAGE_FOLDER / "income_v3.pkl")
     return income
 
 
@@ -144,8 +150,8 @@ def create_income_model(overwrite_existing: bool = False):
     """
     from policyengine_uk_data.utils.qrf import QRF
 
-    if (STORAGE_FOLDER / "income_v2.pkl").exists() and not overwrite_existing:
-        return QRF(file_path=STORAGE_FOLDER / "income_v2.pkl")
+    if (STORAGE_FOLDER / "income_v3.pkl").exists() and not overwrite_existing:
+        return QRF(file_path=STORAGE_FOLDER / "income_v3.pkl")
     return save_imputation_models()
 
 
@@ -201,13 +207,14 @@ def impute_income(dataset: UKSingleYearDataset) -> UKSingleYearDataset:
     """
     # Impute wealth, assuming same time period as trained data
     dataset = dataset.copy()
-    # gift_aid is in IMPUTATIONS but is not a column on the raw FRS build, so
-    # initialise it to zero everywhere before imputation. Without this, the
-    # full-FRS half stays NaN for gift_aid (it's never touched by the dividend-
-    # only impute_over_incomes call below), and the eventual stacked dataset
-    # fails validate() on the gift_aid column.
-    if "gift_aid" not in dataset.person.columns:
-        dataset.person["gift_aid"] = 0.0
+    # gift_aid and charitable_investment_gifts are in IMPUTATIONS but are not
+    # columns on the raw FRS build, so initialise them to zero everywhere
+    # before imputation. Without this, the full-FRS half stays NaN for these
+    # columns (they're never touched by the dividend-only impute_over_incomes
+    # call below), and the eventual stacked dataset fails validate().
+    for column in ("gift_aid", "charitable_investment_gifts"):
+        if column not in dataset.person.columns:
+            dataset.person[column] = 0.0
     zero_weight_copy = dataset.copy()
     zero_weight_copy.household.household_weight = 0
     zero_weight_copy = subsample_dataset(zero_weight_copy, 10_000)

From c5f581dfc031355609f57692a3eafa099cbc89fe Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Fri, 17 Apr 2026 09:15:29 -0400
Subject: [PATCH 4/4] Keep cache file name stable; validate outputs instead
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reverts the `income.pkl` → `income_v2.pkl` → `income_v3.pkl` cache
rename in favour of stable naming. Bumping the filename on every
output-set change is a form of the `_v2`/`_v3` anti-pattern the repo's
CLAUDE.md calls out (it leaves orphan pickles behind on local dev
machines and adds to the "keep deleting deprecated files" debt).

Instead, the cache loader now checks `cached.model.imputed_variables`
against the current `IMPUTATIONS` and retrains if they disagree. A
stale local `income.pkl` from the pre-PR state (six outputs) is
detected and rebuilt on first use; no manual deletion required.

Verified end-to-end:

- Fresh run: `income.pkl` created with 8 outputs; `charitable_investment_gifts` and `gift_aid` populated as expected.
- Forced stale state: a 6-output model saved under `income.pkl` is detected on load, discarded, and retrained to 8 outputs automatically.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../datasets/imputations/income.py             | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py
index e9839a8a3..36d41cd50 100644
--- a/policyengine_uk_data/datasets/imputations/income.py
+++ b/policyengine_uk_data/datasets/imputations/income.py
@@ -120,6 +120,9 @@ def generate_spi_table(spi: pd.DataFrame):
 IMPUTATIONS = INCOME_COMPONENTS + ["gift_aid", "charitable_investment_gifts"]
 
 
+INCOME_MODEL_PATH = STORAGE_FOLDER / "income.pkl"
+
+
 def save_imputation_models():
     """
     Train and save income imputation model.
@@ -134,7 +137,7 @@ def save_imputation_models():
     spi = generate_spi_table(spi)
     spi = spi[PREDICTORS + IMPUTATIONS]
     income.fit(spi[PREDICTORS], spi[IMPUTATIONS])
-    income.save(STORAGE_FOLDER / "income_v3.pkl")
+    income.save(INCOME_MODEL_PATH)
     return income
 
 
@@ -142,6 +145,11 @@ def create_income_model(overwrite_existing: bool = False):
     """
     Create or load income imputation model.
 
+    If a cached model exists and its trained output columns don't match the
+    current ``IMPUTATIONS`` list, the cache is discarded and the model is
+    retrained. This handles the case where ``IMPUTATIONS`` is extended in
+    code but an older pickle is still on disk.
+
     Args:
         overwrite_existing: Whether to retrain model if it exists.
 
@@ -150,8 +158,12 @@ def create_income_model(overwrite_existing: bool = False):
     """
     from policyengine_uk_data.utils.qrf import QRF
 
-    if (STORAGE_FOLDER / "income_v3.pkl").exists() and not overwrite_existing:
-        return QRF(file_path=STORAGE_FOLDER / "income_v3.pkl")
+    if INCOME_MODEL_PATH.exists() and not overwrite_existing:
+        cached = QRF(file_path=INCOME_MODEL_PATH)
+        cached_outputs = set(getattr(cached.model, "imputed_variables", []))
+        if cached_outputs == set(IMPUTATIONS):
+            return cached
+        # Cached model was trained against a different output set; retrain.
     return save_imputation_models()