|
| 1 | +"""Second-stage QRF imputation of FRS-only variables on SPI-donor rows. |
| 2 | +
|
| 3 | +The enhanced-FRS pipeline in :mod:`income` creates a zero-weight subsample |
| 4 | +of the FRS that will be upweighted during calibration to fit SPI-derived |
| 5 | +high-income targets. The first-stage QRF (trained on SPI) replaces only |
| 6 | +the six core income components (plus ``gift_aid`` and |
| 7 | +``charitable_investment_gifts``) on those rows. Every other FRS column — |
| 8 | +benefit ``_reported`` values, pension contributions, savings, rent, |
| 9 | +mortgage, council tax — stays at whatever the middle-income FRS donor |
| 10 | +whose row was sampled happened to report. |
| 11 | +
|
| 12 | +That produces implausible joint distributions on the synthetic |
| 13 | +high-income side. A row with imputed £2 M self-employment income carries |
| 14 | +its donor's £120 UC ``_reported`` value, its donor's tiny pension |
| 15 | +contribution, and its donor's typical rent. Under calibration upweight |
| 16 | +these cascade into false benefit aggregates, depressed allowances, and |
| 17 | +distorted housing-cost totals. |
| 18 | +
|
| 19 | +This second-stage QRF trains on the original FRS with predictors = |
| 20 | +[demographics + first-stage income outputs] and outputs = a curated list |
| 21 | +of FRS-only variables. For each SPI-donor row, it substitutes the |
| 22 | +predicted value drawn from FRS respondents with similar demographics and |
| 23 | +post-stage-1 incomes. Benefit ``_reported`` flags for high earners |
| 24 | +naturally collapse to zero (no high-earner FRS respondent reports UC), |
| 25 | +pension contributions rescale, and savings interest / rent correlate |
| 26 | +with income instead of with the random FRS donor's draw. |
| 27 | +
|
| 28 | +Mirrors the US ``_impute_cps_only_variables`` approach introduced in |
| 29 | +``policyengine-us-data#589`` but targets UK-specific FRS variables. |
| 30 | +""" |
| 31 | + |
| 32 | +from __future__ import annotations |
| 33 | + |
| 34 | +import logging |
| 35 | + |
| 36 | +import numpy as np |
| 37 | +import pandas as pd |
| 38 | +from policyengine_uk.data import UKSingleYearDataset |
| 39 | + |
| 40 | +logger = logging.getLogger(__name__) |
| 41 | + |
| 42 | + |
| 43 | +STAGE2_DEMOGRAPHIC_PREDICTORS = [ |
| 44 | + "age", |
| 45 | + "gender", |
| 46 | + "region", |
| 47 | +] |
| 48 | + |
| 49 | +# Predictors drawn from the first-stage QRF output columns. They are the |
| 50 | +# same six income components that the first stage imputes from SPI. |
| 51 | +STAGE2_INCOME_PREDICTORS = [ |
| 52 | + "employment_income", |
| 53 | + "self_employment_income", |
| 54 | + "savings_interest_income", |
| 55 | + "dividend_income", |
| 56 | + "private_pension_income", |
| 57 | + "property_income", |
| 58 | +] |
| 59 | + |
| 60 | +# FRS-only variables the second stage replaces on SPI-donor rows. Kept |
| 61 | +# conservative: benefit ``_reported`` columns and pension contributions |
| 62 | +# are the leading sources of cross-income inconsistency, and are |
| 63 | +# well-populated in the base FRS build so training is stable. |
| 64 | +FRS_ONLY_PERSON_VARIABLES = [ |
| 65 | + # Pension contributions |
| 66 | + "employee_pension_contributions", |
| 67 | + "employer_pension_contributions", |
| 68 | + "personal_pension_contributions", |
| 69 | + "pension_contributions_via_salary_sacrifice", |
| 70 | + # Savings-related |
| 71 | + "tax_free_savings_income", |
| 72 | + # Benefit `_reported` columns |
| 73 | + "universal_credit_reported", |
| 74 | + "pension_credit_reported", |
| 75 | + "child_benefit_reported", |
| 76 | + "housing_benefit_reported", |
| 77 | + "income_support_reported", |
| 78 | + "working_tax_credit_reported", |
| 79 | + "child_tax_credit_reported", |
| 80 | + "attendance_allowance_reported", |
| 81 | + "state_pension_reported", |
| 82 | + "dla_sc_reported", |
| 83 | + "dla_m_reported", |
| 84 | + "pip_m_reported", |
| 85 | + "pip_dl_reported", |
| 86 | + "sda_reported", |
| 87 | + "carers_allowance_reported", |
| 88 | + "iidb_reported", |
| 89 | + "afcs_reported", |
| 90 | + "bsp_reported", |
| 91 | + "incapacity_benefit_reported", |
| 92 | + "maternity_allowance_reported", |
| 93 | + "winter_fuel_allowance_reported", |
| 94 | + "council_tax_benefit_reported", |
| 95 | + "jsa_contrib_reported", |
| 96 | + "jsa_income_reported", |
| 97 | + "esa_contrib_reported", |
| 98 | + "esa_income_reported", |
| 99 | +] |
| 100 | + |
| 101 | + |
| 102 | +def _one_hot_encode(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame: |
| 103 | + """Return ``df`` with object-typed ``columns`` one-hot encoded. |
| 104 | +
|
| 105 | + QRF predictors must be numeric. Uses ``pandas.get_dummies`` so |
| 106 | + identical category sets are produced from the same input data. |
| 107 | + """ |
| 108 | + return pd.get_dummies(df, columns=columns, drop_first=False, dtype=float) |
| 109 | + |
| 110 | + |
| 111 | +def _align_columns( |
| 112 | + train_df: pd.DataFrame, test_df: pd.DataFrame |
| 113 | +) -> tuple[pd.DataFrame, pd.DataFrame]: |
| 114 | + """Ensure train/test share the same columns in the same order. |
| 115 | +
|
| 116 | + After independent ``get_dummies`` calls on train and test one-hot |
| 117 | + expansions can diverge if a category appears in one set and not the |
| 118 | + other. Reindex both to the union of columns, filling missing cells |
| 119 | + with zero. |
| 120 | + """ |
| 121 | + columns = sorted(set(train_df.columns) | set(test_df.columns)) |
| 122 | + return ( |
| 123 | + train_df.reindex(columns=columns, fill_value=0.0), |
| 124 | + test_df.reindex(columns=columns, fill_value=0.0), |
| 125 | + ) |
| 126 | + |
| 127 | + |
| 128 | +def _build_predictor_frame(dataset: UKSingleYearDataset) -> pd.DataFrame: |
| 129 | + """Return a person-indexed DataFrame of stage-2 predictor columns. |
| 130 | +
|
| 131 | + ``region`` lives on the household frame in the enhanced-FRS build, |
| 132 | + so it is joined onto each person row via ``person_household_id``. |
| 133 | + Remaining predictors (age, gender, the six income components) are |
| 134 | + read directly from the person frame. If the person frame already |
| 135 | + carries ``region`` (as in some test fixtures and the standalone SPI |
| 136 | + build) that value wins and no join is performed. |
| 137 | + """ |
| 138 | + person = dataset.person |
| 139 | + predictors = STAGE2_DEMOGRAPHIC_PREDICTORS + STAGE2_INCOME_PREDICTORS |
| 140 | + |
| 141 | + if "region" in person.columns: |
| 142 | + frame = person[predictors].copy() |
| 143 | + elif ( |
| 144 | + "region" in dataset.household.columns |
| 145 | + and "person_household_id" in person.columns |
| 146 | + ): |
| 147 | + hh_region = dataset.household.set_index("household_id")["region"] |
| 148 | + person_region = person["person_household_id"].map(hh_region) |
| 149 | + frame = person[[c for c in predictors if c != "region"]].copy() |
| 150 | + frame["region"] = person_region.values |
| 151 | + frame = frame[predictors] |
| 152 | + else: |
| 153 | + raise KeyError( |
| 154 | + "Stage-2 imputation needs 'region' either on the person frame " |
| 155 | + "or on the household frame with a 'person_household_id' join key." |
| 156 | + ) |
| 157 | + return frame |
| 158 | + |
| 159 | + |
| 160 | +def impute_frs_only_variables( |
| 161 | + train_dataset: UKSingleYearDataset, |
| 162 | + target_dataset: UKSingleYearDataset, |
| 163 | +) -> UKSingleYearDataset: |
| 164 | + """Impute FRS-only person variables onto ``target_dataset``. |
| 165 | +
|
| 166 | + ``train_dataset`` must be a full FRS build (before income |
| 167 | + imputation) so the training rows preserve the original co-occurrence |
| 168 | + of income and every FRS-only variable. ``target_dataset`` is the |
| 169 | + SPI-donor subsample after the first-stage QRF has overwritten its |
| 170 | + income columns. |
| 171 | +
|
| 172 | + A single multi-output QRF is fitted on the training data and used |
| 173 | + to predict values for every row of ``target_dataset``; predictions |
| 174 | + replace the existing (donor-leaked) values in |
| 175 | + ``FRS_ONLY_PERSON_VARIABLES`` only. Variables absent from either |
| 176 | + frame are skipped silently. |
| 177 | + """ |
| 178 | + from policyengine_uk_data.utils.qrf import QRF |
| 179 | + |
| 180 | + target_dataset = target_dataset.copy() |
| 181 | + |
| 182 | + train_person = train_dataset.person |
| 183 | + target_person = target_dataset.person |
| 184 | + |
| 185 | + # Use only variables present in both frames. |
| 186 | + outputs = [ |
| 187 | + v |
| 188 | + for v in FRS_ONLY_PERSON_VARIABLES |
| 189 | + if v in train_person.columns and v in target_person.columns |
| 190 | + ] |
| 191 | + missing = set(FRS_ONLY_PERSON_VARIABLES) - set(outputs) |
| 192 | + if missing: |
| 193 | + logger.warning( |
| 194 | + "Stage-2 FRS-only imputation: %d variables absent from " |
| 195 | + "train/target frames, skipped: %s", |
| 196 | + len(missing), |
| 197 | + sorted(missing), |
| 198 | + ) |
| 199 | + if not outputs: |
| 200 | + logger.warning( |
| 201 | + "Stage-2 FRS-only imputation: no output variables available; " |
| 202 | + "returning target_dataset unchanged." |
| 203 | + ) |
| 204 | + return target_dataset |
| 205 | + |
| 206 | + train_inputs_raw = _build_predictor_frame(train_dataset) |
| 207 | + target_inputs_raw = _build_predictor_frame(target_dataset) |
| 208 | + |
| 209 | + train_inputs = _one_hot_encode(train_inputs_raw, columns=["gender", "region"]) |
| 210 | + target_inputs = _one_hot_encode(target_inputs_raw, columns=["gender", "region"]) |
| 211 | + train_inputs, target_inputs = _align_columns(train_inputs, target_inputs) |
| 212 | + |
| 213 | + # Replace NaNs in outputs with 0 so the QRF trains on clean targets; |
| 214 | + # FRS-only variables are almost all zero-heavy "amount if eligible" |
| 215 | + # columns that default to zero when unreported. |
| 216 | + train_outputs = train_person[outputs].fillna(0).astype(float) |
| 217 | + |
| 218 | + logger.info( |
| 219 | + "Stage-2 FRS-only imputation: %d outputs, training on %d FRS " |
| 220 | + "persons, predicting for %d SPI-donor persons", |
| 221 | + len(outputs), |
| 222 | + len(train_inputs), |
| 223 | + len(target_inputs), |
| 224 | + ) |
| 225 | + |
| 226 | + model = QRF() |
| 227 | + model.fit(train_inputs, train_outputs) |
| 228 | + predictions = model.predict(target_inputs) |
| 229 | + |
| 230 | + # The QRF occasionally returns NaN for extreme predictor combos; |
| 231 | + # clamp to zero (the population-typical value for these variables). |
| 232 | + predictions = predictions.fillna(0.0) |
| 233 | + |
| 234 | + for column in outputs: |
| 235 | + # Clamp negative predictions — these columns represent receipted |
| 236 | + # amounts or contributions and are non-negative by construction. |
| 237 | + values = np.maximum(predictions[column].values, 0.0) |
| 238 | + target_dataset.person[column] = values |
| 239 | + |
| 240 | + return target_dataset |
0 commit comments