PolicyEngine
diff --git a/‎CHANGELOG.md‎
Lines changed: 24 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎changelog.d/368.md‎
Lines changed: 1 addition & 0 deletions b/‎changelog.d/368.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎policyengine_uk_data/datasets/childcare/takeup_rate.py‎
Lines changed: 14 additions & 4 deletions b/‎policyengine_uk_data/datasets/childcare/takeup_rate.py‎
Lines changed: 14 additions & 4 deletions
diff --git a/‎policyengine_uk_data/datasets/frs.py‎
Lines changed: 29 additions & 8 deletions b/‎policyengine_uk_data/datasets/frs.py‎
Lines changed: 29 additions & 8 deletions
diff --git a/‎policyengine_uk_data/datasets/imputations/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎policyengine_uk_data/datasets/imputations/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎policyengine_uk_data/datasets/imputations/frs_only.py‎
Lines changed: 240 additions & 0 deletions b/‎policyengine_uk_data/datasets/imputations/frs_only.py‎
Lines changed: 240 additions & 0 deletions
diff --git a/‎policyengine_uk_data/datasets/imputations/income.py‎
Lines changed: 15 additions & 0 deletions b/‎policyengine_uk_data/datasets/imputations/income.py‎
Lines changed: 15 additions & 0 deletions
@@ -1,3 +1,27 @@
+## [1.53.1] - 2026-04-20
+
+No significant changes.
+
+
+## [1.53.0] - 2026-04-19
+
+### Added
+
+- Tightened `test_population` tolerance from 7% to 3% now that the stage-2 QRF (#362), TFC target refresh (#363), and reported-anchor takeup (#359) pulled the weighted UK population overshoot from ~6.5% down to ~1.6%. Added four regression tests in `test_population_fidelity.py` (weighted-total match, household-count range, non-inflation guard, country-sum consistency) extracted from the earlier #310 draft so any future calibration drift back toward the pre-April-2026 overshoot trips CI.
+
+
+## [1.52.2] - 2026-04-18
+
+### Changed
+
+- Add second-stage QRF imputation of FRS-only variables on SPI-donor rows. After the first-stage SPI-trained QRF overwrites income components on the zero-weight subsample, a new second-stage QRF trained on the full FRS rewrites benefit `_reported` columns, pension contributions, and savings-income so they correlate with the freshly-imputed incomes instead of staying as whatever middle-income FRS donor was sampled. Mirrors the `policyengine-us-data#589` pattern. Prevents synthetic £2 M earners from carrying a middle-income donor's UC / housing-benefit receipt into calibration, which was blowing up benefit aggregates under upweight.
+- Anchor stochastic takeup assignment for Universal Credit, Pension Credit, and Child Benefit to the FRS-reported receipt columns, matching the `policyengine-us-data` pattern. Respondents who report positive receipt in the FRS benefits table now receive `would_claim_* = True` with certainty, and non-reporters are filled probabilistically to hit the aggregate target rate. Removes a source of calibration noise where respondents who clearly took up a benefit could be randomly assigned `would_claim = False`.
+
+### Fixed
+
+- Refresh Tax-Free Childcare calibration targets and take-up rate using HMRC's June 2025 release (covering 2024-25 outturn: £632 m spending, 985 k children reached). The prior target set was calibrated against the September 2024 release and undershot current TFC spending by roughly a third. Bumps the default TFC take-up rate from 0.586 to 0.88 on 2024-04-06 to close most of the gap pending a full recalibration run.
+
+
 ## [1.52.1] - 2026-04-18
 
 ### Fixed
 
@@ -0,0 +1 @@
+- Set Marriage Allowance take-up rate to 0.5 (HMRC outturn ~2.1m claimants of ~4.2m eligible couples) instead of the placeholder 1.0, so microsimulation no longer overstates Marriage Allowance cost by ~£500m/year.
@@ -3,23 +3,33 @@
 from policyengine_uk import Microsimulation
 
 # 🎯 Calibration targets
+#
+# TFC targets refreshed from HMRC "Tax-Free Childcare statistics: June 2025"
+# (published 27 Aug 2025, covering 2024-25 outturn):
+#   - spending: £632.2 m (Table 1, annual government top-up)
+#   - caseload: 985 thousand children received TFC in 2024-25 (annual unique)
+# The prior 0.6 / 660 targets were calibrated against the Sep 2024 release
+# (2023-24 outturn); TFC account expansion has since overtaken them. The Sep
+# 2025 "30 free hours for under-5s" boost postdates 2024-25 (TODO confirm).
+#
+# Other programme targets kept at their prior DfE values.
 targets = {
     "spending": {
-        "tfc": 0.6,
+        "tfc": 0.63,
         "extended": 2.5,
         "targeted": 0.6,
         "universal": 1.7,
     },
     "caseload": {
-        "tfc": 660,
+        "tfc": 985,
         "extended": 740,
         "targeted": 130,
         "universal": 490,
     },
 }
 
-# Here is the link to the UK government’s aggregate data for Tax-Free Childcare:
-# https://www.gov.uk/government/statistics/tax-free-childcare-statistics-september-2024
+# UK government aggregate Tax-Free Childcare statistics:
+# https://www.gov.uk/government/statistics/tax-free-childcare-statistics-june-2025
 
 # This is the Department for Education (DfE) data for the other childcare programmes:
 # https://skillsfunding.service.gov.uk/view-latest-funding/national-funding-allocations/DSG/2024-to-2025
 
@@ -1217,24 +1217,45 @@ def determine_education_level(fted_val, typeed2_val, age_val):
     scp_under_6_rate = load_take_up_rate("scp_under_6", year)
     scp_6_plus_rate = load_take_up_rate("scp_6_plus", year)
 
-    # Generate take-up decisions by comparing random draws to take-up rates
+    # Generate take-up decisions by comparing random draws to take-up rates,
+    # anchored to reported receipts where the FRS captures them. Respondents
+    # who report positive receipt of a benefit are assigned takeup=True with
+    # certainty; the remaining non-reporters are filled probabilistically to
+    # hit the aggregate target rate. See policyengine_uk_data/utils/takeup.py.
+    from policyengine_uk_data.utils.takeup import (
+        assign_takeup_with_reported_anchors,
+    )
+
+    def _reported_benunit_mask(person_column: str) -> np.ndarray:
+        reporter_benunits = set(
+            pe_person.loc[pe_person[person_column] > 0, "person_benunit_id"].values
+        )
+        return pe_benunit["benunit_id"].isin(reporter_benunits).values
+
     # Person-level
     pe_person["would_claim_marriage_allowance"] = (
         generator.random(len(pe_person)) < marriage_allowance_rate
     )
 
-    # Benefit unit-level
-    pe_benunit["would_claim_child_benefit"] = (
-        generator.random(len(pe_benunit)) < child_benefit_rate
+    # Benefit unit-level — anchor on any person in the benefit unit having
+    # reported positive receipt in the FRS benefits table.
+    pe_benunit["would_claim_child_benefit"] = assign_takeup_with_reported_anchors(
+        generator.random(len(pe_benunit)),
+        child_benefit_rate,
+        reported_mask=_reported_benunit_mask("child_benefit_reported"),
     )
     pe_benunit["child_benefit_opts_out"] = (
         generator.random(len(pe_benunit)) < child_benefit_opts_out_rate
     )
-    pe_benunit["would_claim_pc"] = (
-        generator.random(len(pe_benunit)) < pension_credit_rate
+    pe_benunit["would_claim_pc"] = assign_takeup_with_reported_anchors(
+        generator.random(len(pe_benunit)),
+        pension_credit_rate,
+        reported_mask=_reported_benunit_mask("pension_credit_reported"),
     )
-    pe_benunit["would_claim_uc"] = (
-        generator.random(len(pe_benunit)) < universal_credit_rate
+    pe_benunit["would_claim_uc"] = assign_takeup_with_reported_anchors(
+        generator.random(len(pe_benunit)),
+        universal_credit_rate,
+        reported_mask=_reported_benunit_mask("universal_credit_reported"),
     )
     pe_benunit["would_claim_tfc"] = generator.random(len(pe_benunit)) < tfc_rate
     pe_benunit["would_claim_extended_childcare"] = (
 
@@ -2,6 +2,7 @@
 from .vat import *
 from .wealth import *
 from .income import *
+from .frs_only import impute_frs_only_variables
 from .capital_gains import *
 from .services import impute_services
 from .salary_sacrifice import impute_salary_sacrifice
 
@@ -0,0 +1,240 @@
+"""Second-stage QRF imputation of FRS-only variables on SPI-donor rows.
+
+The enhanced-FRS pipeline in :mod:`income` creates a zero-weight subsample
+of the FRS that will be upweighted during calibration to fit SPI-derived
+high-income targets. The first-stage QRF (trained on SPI) replaces only
+the six core income components (plus ``gift_aid`` and
+``charitable_investment_gifts``) on those rows. Every other FRS column —
+benefit ``_reported`` values, pension contributions, savings, rent,
+mortgage, council tax — stays at whatever the middle-income FRS donor
+whose row was sampled happened to report.
+
+That produces implausible joint distributions on the synthetic
+high-income side. A row with imputed £2 M self-employment income carries
+its donor's £120 UC ``_reported`` value, its donor's tiny pension
+contribution, and its donor's typical rent. Under calibration upweight
+these cascade into false benefit aggregates, depressed allowances, and
+distorted housing-cost totals.
+
+This second-stage QRF trains on the original FRS with predictors =
+[demographics + first-stage income outputs] and outputs = a curated list
+of FRS-only variables. For each SPI-donor row, it substitutes the
+predicted value drawn from FRS respondents with similar demographics and
+post-stage-1 incomes. Benefit ``_reported`` amounts for high earners
+collapse to zero (no high-earner FRS respondent reports UC) and pension
+contributions and tax-free savings income track the imputed incomes.
+Household-level rent, mortgage and council tax are not rewritten here.
+
+Mirrors the US ``_impute_cps_only_variables`` approach introduced in
+``policyengine-us-data#589`` but targets UK-specific FRS variables.
+"""
+
+from __future__ import annotations
+
+import logging
+
+import numpy as np
+import pandas as pd
+from policyengine_uk.data import UKSingleYearDataset
+
+logger = logging.getLogger(__name__)
+
+
+STAGE2_DEMOGRAPHIC_PREDICTORS = [
+    "age",
+    "gender",
+    "region",
+]
+
+# Predictors drawn from the first-stage QRF output columns. They are the
+# same six income components that the first stage imputes from SPI.
+STAGE2_INCOME_PREDICTORS = [
+    "employment_income",
+    "self_employment_income",
+    "savings_interest_income",
+    "dividend_income",
+    "private_pension_income",
+    "property_income",
+]
+
+# FRS-only variables the second stage replaces on SPI-donor rows. Kept
+# conservative: benefit ``_reported`` columns and pension contributions
+# are the leading sources of cross-income inconsistency, and are
+# well-populated in the base FRS build so training is stable.
+FRS_ONLY_PERSON_VARIABLES = [
+    # Pension contributions
+    "employee_pension_contributions",
+    "employer_pension_contributions",
+    "personal_pension_contributions",
+    "pension_contributions_via_salary_sacrifice",
+    # Savings-related
+    "tax_free_savings_income",
+    # Benefit `_reported` columns
+    "universal_credit_reported",
+    "pension_credit_reported",
+    "child_benefit_reported",
+    "housing_benefit_reported",
+    "income_support_reported",
+    "working_tax_credit_reported",
+    "child_tax_credit_reported",
+    "attendance_allowance_reported",
+    "state_pension_reported",
+    "dla_sc_reported",
+    "dla_m_reported",
+    "pip_m_reported",
+    "pip_dl_reported",
+    "sda_reported",
+    "carers_allowance_reported",
+    "iidb_reported",
+    "afcs_reported",
+    "bsp_reported",
+    "incapacity_benefit_reported",
+    "maternity_allowance_reported",
+    "winter_fuel_allowance_reported",
+    "council_tax_benefit_reported",
+    "jsa_contrib_reported",
+    "jsa_income_reported",
+    "esa_contrib_reported",
+    "esa_income_reported",
+]
+
+
+def _one_hot_encode(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
+    """Return a new frame with each of ``columns`` one-hot encoded.
+
+    QRF predictors must be numeric, so every listed column is expanded
+    into float 0/1 indicators, one per category present in ``df``.
+    """
+    return pd.get_dummies(df, columns=columns, drop_first=False, dtype=float)
+
+
+def _align_columns(
+    train_df: pd.DataFrame, test_df: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Return train/test frames sharing one sorted set of columns.
+
+    After independent ``get_dummies`` calls on train and test, one-hot
+    expansions can diverge if a category appears in one set and not the
+    other. Both frames are reindexed to the sorted union of their column
+    names; columns a frame lacked are added and filled with 0.0.
+    """
+    columns = sorted(set(train_df.columns) | set(test_df.columns))
+    return (
+        train_df.reindex(columns=columns, fill_value=0.0),
+        test_df.reindex(columns=columns, fill_value=0.0),
+    )
+
+
+def _build_predictor_frame(dataset: UKSingleYearDataset) -> pd.DataFrame:
+    """Return a person-indexed DataFrame of stage-2 predictor columns.
+
+    Age, gender and the six income components are read from the person
+    frame. ``region`` is taken from the person frame when present there;
+    otherwise it is looked up on the household frame by mapping
+    ``person_household_id`` to ``household_id``. Columns are returned in
+    predictor-list order. Raises ``KeyError`` when ``region`` cannot be
+    found on the person frame or joined from the household frame.
+    """
+    person = dataset.person
+    predictors = STAGE2_DEMOGRAPHIC_PREDICTORS + STAGE2_INCOME_PREDICTORS
+
+    if "region" in person.columns:
+        frame = person[predictors].copy()
+    elif (
+        "region" in dataset.household.columns
+        and "person_household_id" in person.columns
+    ):
+        hh_region = dataset.household.set_index("household_id")["region"]
+        person_region = person["person_household_id"].map(hh_region)
+        frame = person[[c for c in predictors if c != "region"]].copy()
+        frame["region"] = person_region.values
+        frame = frame[predictors]
+    else:
+        raise KeyError(
+            "Stage-2 imputation needs 'region' either on the person frame "
+            "or on the household frame with a 'person_household_id' join key."
+        )
+    return frame
+
+
+def impute_frs_only_variables(
+    train_dataset: UKSingleYearDataset,
+    target_dataset: UKSingleYearDataset,
+) -> UKSingleYearDataset:
+    """Impute FRS-only person variables onto ``target_dataset``.
+
+    ``train_dataset`` must be a full FRS build (before income
+    imputation) so the training rows preserve the original co-occurrence
+    of income and every FRS-only variable. ``target_dataset`` is the
+    SPI-donor subsample after the first-stage QRF has overwritten its
+    income columns.
+
+    A single multi-output QRF is fitted on the training data and used
+    to predict values for every row of ``target_dataset``; predictions
+    replace the existing (donor-leaked) values in
+    ``FRS_ONLY_PERSON_VARIABLES`` on a copy, which is returned. Variables
+    absent from either frame are skipped and reported in a warning.
+    """
+    from policyengine_uk_data.utils.qrf import QRF
+
+    target_dataset = target_dataset.copy()
+
+    train_person = train_dataset.person
+    target_person = target_dataset.person
+
+    # Use only variables present in both frames.
+    outputs = [
+        v
+        for v in FRS_ONLY_PERSON_VARIABLES
+        if v in train_person.columns and v in target_person.columns
+    ]
+    missing = set(FRS_ONLY_PERSON_VARIABLES) - set(outputs)
+    if missing:
+        logger.warning(
+            "Stage-2 FRS-only imputation: %d variables absent from "
+            "train/target frames, skipped: %s",
+            len(missing),
+            sorted(missing),
+        )
+    if not outputs:
+        logger.warning(
+            "Stage-2 FRS-only imputation: no output variables available; "
+            "returning target_dataset unchanged."
+        )
+        return target_dataset
+
+    train_inputs_raw = _build_predictor_frame(train_dataset)
+    target_inputs_raw = _build_predictor_frame(target_dataset)
+
+    train_inputs = _one_hot_encode(train_inputs_raw, columns=["gender", "region"])
+    target_inputs = _one_hot_encode(target_inputs_raw, columns=["gender", "region"])
+    train_inputs, target_inputs = _align_columns(train_inputs, target_inputs)
+
+    # Replace NaNs in outputs with 0 so the QRF trains on clean targets;
+    # FRS-only variables are almost all zero-heavy "amount if eligible"
+    # columns that default to zero when unreported.
+    train_outputs = train_person[outputs].fillna(0).astype(float)
+
+    logger.info(
+        "Stage-2 FRS-only imputation: %d outputs, training on %d FRS "
+        "persons, predicting for %d SPI-donor persons",
+        len(outputs),
+        len(train_inputs),
+        len(target_inputs),
+    )
+
+    model = QRF()
+    model.fit(train_inputs, train_outputs)
+    predictions = model.predict(target_inputs)
+
+    # The QRF occasionally returns NaN for extreme predictor combos;
+    # fill with zero (the population-typical value for these variables).
+    predictions = predictions.fillna(0.0)
+
+    for column in outputs:
+        # Clamp negative predictions — these columns represent amounts
+        # received or contributed and are non-negative by construction.
+        values = np.maximum(predictions[column].values, 0.0)
+        target_dataset.person[column] = values
+
+    return target_dataset
@@ -256,6 +256,21 @@ def impute_income(dataset: UKSingleYearDataset) -> UKSingleYearDataset:
         IMPUTATIONS,
     )
 
+    # Second-stage QRF: rewrite FRS-only variables (benefit `_reported`
+    # columns, pension contributions, savings, etc.) on the SPI-donor rows
+    # so they correlate with the freshly-imputed incomes instead of staying
+    # as whatever middle-income FRS donor was sampled. Without this the
+    # £2M imputed earners keep their donor's £120 UC receipt, blowing up
+    # benefit aggregates under calibration upweight.
+    from policyengine_uk_data.datasets.imputations.frs_only import (
+        impute_frs_only_variables,
+    )
+
+    zero_weight_copy = impute_frs_only_variables(
+        train_dataset=dataset,
+        target_dataset=zero_weight_copy,
+    )
+
     dataset = impute_over_incomes(
         dataset,
         model,
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+- Set Marriage Allowance take-up rate to 0.5 (HMRC outturn ~2.1m claimants of ~4.2m eligible couples) instead of the placeholder 1.0, so microsimulation no longer overstates Marriage Allowance cost by ~£500m/year.`