|
| 1 | +"""Second-stage QRF imputation of FRS-only variables on SPI-donor rows. |
| 2 | +
|
| 3 | +The enhanced-FRS pipeline in :mod:`income` creates a zero-weight subsample |
| 4 | +of the FRS that will be upweighted during calibration to fit SPI-derived |
| 5 | +high-income targets. The first-stage QRF (trained on SPI) replaces only |
| 6 | +the six core income components (plus ``gift_aid`` and |
| 7 | +``charitable_investment_gifts``) on those rows. Every other FRS column — |
| 8 | +benefit ``_reported`` values, pension contributions, savings, rent, |
| 9 | +mortgage, council tax — stays at whatever the middle-income FRS donor |
| 10 | +whose row was sampled happened to report. |
| 11 | +
|
| 12 | +That produces implausible joint distributions on the synthetic |
| 13 | +high-income side. A row with imputed £2 M self-employment income carries |
| 14 | +its donor's £120 UC ``_reported`` value, its donor's tiny pension |
| 15 | +contribution, and its donor's typical rent. Under calibration upweight |
| 16 | +these cascade into false benefit aggregates, depressed allowances, and |
| 17 | +distorted housing-cost totals. |
| 18 | +
|
| 19 | +This second-stage QRF trains on the original FRS with predictors = |
| 20 | +[demographics + first-stage income outputs] and outputs = a curated list |
| 21 | +of FRS-only variables. For each SPI-donor row, it substitutes the |
| 22 | +predicted value drawn from FRS respondents with similar demographics and |
| 23 | +post-stage-1 incomes. Benefit ``_reported`` amounts for high earners |
| 24 | +naturally collapse to zero (no high-earner FRS respondent reports UC), |
| 25 | +and pension contributions and tax-free savings income track income |
| 26 | +instead of the donor's draw. Household costs such as rent are untouched. |
| 27 | +
|
| 28 | +Mirrors the US ``_impute_cps_only_variables`` approach introduced in |
| 29 | +``policyengine-us-data#589`` but targets UK-specific FRS variables. |
| 30 | +""" |
| 31 | + |
| 32 | +from __future__ import annotations |
| 33 | + |
| 34 | +import logging |
| 35 | + |
| 36 | +import numpy as np |
| 37 | +import pandas as pd |
| 38 | +from policyengine_uk.data import UKSingleYearDataset |
| 39 | + |
| 40 | +logger = logging.getLogger(__name__) |
| 41 | + |
| 42 | + |
# Demographic predictor columns fed to the second-stage QRF. ``gender`` and
# ``region`` are one-hot encoded before fitting; ``age`` is used as-is.
STAGE2_DEMOGRAPHIC_PREDICTORS = ["age", "gender", "region"]
| 48 | + |
# Income predictor columns for the second stage. These are the six income
# components that the first-stage QRF imputes from SPI, so on target rows
# they already hold first-stage output when the second stage runs.
STAGE2_INCOME_PREDICTORS = [
    "employment_income",
    "self_employment_income",
    "savings_interest_income",
    "dividend_income",
    "private_pension_income",
    "property_income",
]
| 59 | + |
# Person-level FRS-only columns that the second stage overwrites on
# SPI-donor rows. The list is deliberately conservative: benefit
# ``_reported`` columns and pension contributions are the leading sources
# of cross-income inconsistency, and they are well-populated in the base
# FRS build, which keeps training stable.
FRS_ONLY_PERSON_VARIABLES = [
    # --- Pension contributions ---
    "employee_pension_contributions",
    "employer_pension_contributions",
    "personal_pension_contributions",
    "pension_contributions_via_salary_sacrifice",
    # --- Savings-related ---
    "tax_free_savings_income",
    # --- Benefit ``_reported`` columns ---
    "universal_credit_reported",
    "pension_credit_reported",
    "child_benefit_reported",
    "housing_benefit_reported",
    "income_support_reported",
    "working_tax_credit_reported",
    "child_tax_credit_reported",
    "attendance_allowance_reported",
    "state_pension_reported",
    "dla_sc_reported",
    "dla_m_reported",
    "pip_m_reported",
    "pip_dl_reported",
    "sda_reported",
    "carers_allowance_reported",
    "iidb_reported",
    "afcs_reported",
    "bsp_reported",
    "incapacity_benefit_reported",
    "maternity_allowance_reported",
    "winter_fuel_allowance_reported",
    "council_tax_benefit_reported",
    "jsa_contrib_reported",
    "jsa_income_reported",
    "esa_contrib_reported",
    "esa_income_reported",
]
| 100 | + |
| 101 | + |
def _one_hot_encode(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
    """Expand the named ``columns`` of ``df`` into float indicator columns.

    The QRF needs numeric predictors, so each listed column is replaced
    by one ``<column>_<category>`` indicator per observed category (no
    category is dropped). All other columns pass through unchanged.
    ``pandas.get_dummies`` does the work, so the same input data always
    yields the same set of indicator columns.
    """
    encoded = pd.get_dummies(
        data=df,
        columns=columns,
        dtype=float,
        drop_first=False,
    )
    return encoded
| 109 | + |
| 110 | + |
def _align_columns(
    train_df: pd.DataFrame, test_df: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Give ``train_df`` and ``test_df`` an identical, ordered column set.

    One-hot encoding the two frames independently can yield different
    indicator columns when a category occurs in only one of them. Both
    frames are reindexed to the sorted union of their column names; any
    column a frame did not previously have is filled with ``0.0``.

    Returns the aligned ``(train, test)`` pair as new frames.
    """
    shared_columns = sorted(set(train_df.columns).union(test_df.columns))
    aligned_train = train_df.reindex(columns=shared_columns, fill_value=0.0)
    aligned_test = test_df.reindex(columns=shared_columns, fill_value=0.0)
    return aligned_train, aligned_test
| 126 | + |
| 127 | + |
def impute_frs_only_variables(
    train_dataset: UKSingleYearDataset,
    target_dataset: UKSingleYearDataset,
) -> UKSingleYearDataset:
    """Impute FRS-only person variables onto a copy of ``target_dataset``.

    ``train_dataset`` must be a full FRS build (before income
    imputation) so the training rows preserve the original co-occurrence
    of income and every FRS-only variable. ``target_dataset`` is the
    SPI-donor subsample after the first-stage QRF has overwritten its
    income columns.

    A single multi-output QRF is fitted on the training person table and
    used to predict values for every person row of the target;
    predictions replace the existing (donor-leaked) values in
    ``FRS_ONLY_PERSON_VARIABLES`` only. Variables absent from either
    person frame are skipped, and a warning listing them is logged.

    Args:
        train_dataset: Dataset whose ``person`` table supplies the
            predictors and the FRS-only output columns for training.
        target_dataset: Dataset whose ``person`` table receives the
            imputed values. It is copied first; the caller's object is
            not modified.

    Returns:
        The copy of ``target_dataset`` with the imputed columns written
        to its ``person`` table. The copy is returned unchanged when no
        output variable is available in both frames or when the target
        has no person rows.

    Raises:
        KeyError: If a predictor column is missing from either person
            table (raised by the pandas column selection).
    """
    # Imported at call time rather than at module import time.
    from policyengine_uk_data.utils.qrf import QRF

    target_dataset = target_dataset.copy()

    train_person = train_dataset.person
    target_person = target_dataset.person

    # Use only variables present in both frames.
    outputs = [
        v
        for v in FRS_ONLY_PERSON_VARIABLES
        if v in train_person.columns and v in target_person.columns
    ]
    missing = set(FRS_ONLY_PERSON_VARIABLES) - set(outputs)
    if missing:
        logger.warning(
            "Stage-2 FRS-only imputation: %d variables absent from "
            "train/target frames, skipped: %s",
            len(missing),
            sorted(missing),
        )
    if not outputs:
        logger.warning(
            "Stage-2 FRS-only imputation: no output variables available; "
            "returning target_dataset unchanged."
        )
        return target_dataset

    # Nothing to predict for: skip fitting rather than handing the model
    # a zero-row prediction frame.
    if len(target_person) == 0:
        logger.warning(
            "Stage-2 FRS-only imputation: target_dataset has no person "
            "rows; returning target_dataset unchanged."
        )
        return target_dataset

    predictors = STAGE2_DEMOGRAPHIC_PREDICTORS + STAGE2_INCOME_PREDICTORS

    train_inputs_raw = train_person[predictors].copy()
    target_inputs_raw = target_person[predictors].copy()

    # Encode the categorical predictors separately per frame, then align
    # so both frames expose the same indicator columns in the same order.
    categorical = ["gender", "region"]
    train_inputs = _one_hot_encode(train_inputs_raw, columns=categorical)
    target_inputs = _one_hot_encode(target_inputs_raw, columns=categorical)
    train_inputs, target_inputs = _align_columns(train_inputs, target_inputs)

    # Replace NaNs in outputs with 0 so the QRF trains on clean targets;
    # FRS-only variables are almost all zero-heavy "amount if eligible"
    # columns that default to zero when unreported.
    train_outputs = train_person[outputs].fillna(0).astype(float)

    logger.info(
        "Stage-2 FRS-only imputation: %d outputs, training on %d FRS "
        "persons, predicting for %d SPI-donor persons",
        len(outputs),
        len(train_inputs),
        len(target_inputs),
    )

    model = QRF()
    model.fit(train_inputs, train_outputs)
    predictions = model.predict(target_inputs)

    # The QRF occasionally returns NaN for extreme predictor combos;
    # clamp to zero (the population-typical value for these variables).
    predictions = predictions.fillna(0.0)

    for column in outputs:
        # Clamp negative predictions — these columns represent receipted
        # amounts or contributions and are non-negative by construction.
        # Assigning the raw array (not the Series) writes positionally,
        # so the prediction frame's index cannot misalign the values.
        values = np.maximum(predictions[column].values, 0.0)
        target_dataset.person[column] = values

    return target_dataset
0 commit comments