Merge main and resolve changelog conflict

nikhilwoodruff · nikhilwoodruff · commit 2488d71dc048 · 2026-02-19T17:34:51.000Z
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,24 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.39.3] - 2026-02-19 16:15:46
+
+### Fixed
+
+- Widened UC taper rate reform test tolerance to 15bn to account for calibration variance.
+
+## [1.39.2] - 2026-02-19 13:58:30
+
+### Added
+
+- UC households by children count (0, 1, 2, 3+) as constituency calibration targets.
+
+## [1.39.1] - 2026-02-19 11:54:03
+
+### Added
+
+- Test for highest_education in enhanced FRS dataset.
+
 ## [1.39.0] - 2026-02-19 08:39:08
 
 ### Added
@@ -617,6 +635,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
+[1.39.3]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.39.2...1.39.3
+[1.39.2]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.39.1...1.39.2
+[1.39.1]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.39.0...1.39.1
 [1.39.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.38.0...1.39.0
 [1.38.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.37.1...1.38.0
 [1.37.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.37.0...1.37.1
diff --git a/changelog.yaml b/changelog.yaml
@@ -547,3 +547,19 @@
     added:
     - highest_education variable derived from FRS EDUCQUAL field.
   date: 2026-02-19 08:39:08
+- bump: patch
+  changes:
+    added:
+    - Test for highest_education in enhanced FRS dataset.
+  date: 2026-02-19 11:54:03
+- bump: patch
+  changes:
+    added:
+    - UC households by children count (0, 1, 2, 3+) as constituency calibration targets.
+  date: 2026-02-19 13:58:30
+- bump: patch
+  changes:
+    fixed:
+    - Widened UC taper rate reform test tolerance to 15bn to account for calibration
+      variance.
+  date: 2026-02-19 16:15:46
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -1,4 +1,4 @@
 - bump: minor
   changes:
     added:
-      - SLC student loan calibration targets for Plan 2 and Plan 5 England borrowers earning above repayment threshold (2025-2030), wired into the target registry and loss matrix.
+      - SLC student loan calibration targets for Plan 2 and Plan 5 England borrowers earning above repayment threshold (2025-2030), fetched live from Explore Education Statistics.
diff --git a/policyengine_uk_data/datasets/local_areas/constituencies/loss.py b/policyengine_uk_data/datasets/local_areas/constituencies/loss.py
@@ -30,6 +30,7 @@
 )
 from policyengine_uk_data.targets.sources.local_uc import (
     get_constituency_uc_targets,
+    get_constituency_uc_by_children_targets,
 )
 
 
@@ -96,11 +97,30 @@ def create_constituency_target_matrix(
 
     # ── UC targets ─────────────────────────────────────────────────
     y["uc_households"] = get_constituency_uc_targets().values
-    matrix["uc_households"] = sim.map_result(
-        (sim.calculate("universal_credit").values > 0).astype(int),
-        "benunit",
-        "household",
+    on_uc = (sim.calculate("universal_credit").values > 0).astype(int)
+    matrix["uc_households"] = sim.map_result(on_uc, "benunit", "household")
+
+    # UC households split by number of children — forces the reweighting
+    # to match the family-size distribution within each constituency,
+    # preventing under-representation of larger families (see #274).
+    is_child = sim.calculate("is_child").values
+    children_per_hh = sim.map_result(is_child, "person", "household")
+    on_uc_hh = sim.map_result(on_uc, "benunit", "household") > 0
+
+    matrix["uc_hh_0_children"] = (on_uc_hh & (children_per_hh == 0)).astype(
+        float
     )
+    matrix["uc_hh_1_child"] = (on_uc_hh & (children_per_hh == 1)).astype(float)
+    matrix["uc_hh_2_children"] = (on_uc_hh & (children_per_hh == 2)).astype(
+        float
+    )
+    matrix["uc_hh_3plus_children"] = (
+        on_uc_hh & (children_per_hh >= 3)
+    ).astype(float)
+
+    uc_by_children = get_constituency_uc_by_children_targets()
+    for col in uc_by_children.columns:
+        y[col] = uc_by_children[col].values
 
     # ── Boundary mapping (2010 → 2024) ────────────────────────────
     const_2024 = pd.read_csv(STORAGE_FOLDER / "constituencies_2024.csv")
diff --git a/policyengine_uk_data/targets/sources/local_uc.py b/policyengine_uk_data/targets/sources/local_uc.py
@@ -4,18 +4,43 @@
 loaded from pre-downloaded Stat-Xplore exports and scaled to match
 national UC payment distribution totals.
 
+Also provides UC household counts split by number of children, using
+country-level proportions from Stat-Xplore (November 2023) applied to
+each constituency's total.  This ensures the reweighting algorithm
+places adequate weight on larger families in every constituency.
+
 Source: DWP Stat-Xplore
 https://stat-xplore.dwp.gov.uk
 """
 
 import logging
 
+import numpy as np
 import pandas as pd
 
 logger = logging.getLogger(__name__)
 
 _REF = "https://stat-xplore.dwp.gov.uk"
 
+# Country-level UC households by number of children (Nov 2023, Stat-Xplore).
+# Keyed by the first letter of the constituency code (E, W, S, N); each array
+# holds counts in bucket order: 0 children, 1 child, 2 children, 3+ children.
+_UC_CHILDREN_BY_COUNTRY = {
+    "E": np.array([2_411_993, 948_304, 802_992, 495_279], dtype=float),
+    "W": np.array([141_054, 52_953, 44_348, 26_372], dtype=float),
+    "S": np.array([253_609, 86_321, 66_829, 35_036], dtype=float),
+    # Northern Ireland (and any unknown prefix): GB-wide E + W + S sums
+    "N": np.array(
+        [
+            2_411_993 + 141_054 + 253_609,
+            948_304 + 52_953 + 86_321,
+            802_992 + 44_348 + 66_829,
+            495_279 + 26_372 + 35_036,
+        ],
+        dtype=float,
+    ),
+}
+
 
 def get_constituency_uc_targets() -> pd.Series:
     """UC household counts for 650 constituencies (positional order).
@@ -28,6 +53,44 @@ def get_constituency_uc_targets() -> pd.Series:
     return uc_pc_households.household_count
 
 
+def get_constituency_uc_by_children_targets() -> pd.DataFrame:
+    """UC households split by 0, 1, 2, 3+ children, one row per constituency.
+
+    Multiplies each constituency's total UC count by its country's bucket
+    shares from ``_UC_CHILDREN_BY_COUNTRY`` and rounds the result.  The
+    country is the first letter of the constituency code; unknown letters
+    use the ``"N"`` entry.  Returns a float DataFrame with columns
+    ``uc_hh_0_children``, ``uc_hh_1_child``, ``uc_hh_2_children``,
+    ``uc_hh_3plus_children``, in the same positional order as
+    :func:`get_constituency_uc_targets`.  Raises ``ValueError`` if the code
+    list and the UC totals differ in length, as rows are paired by position.
+    """
+    from policyengine_uk_data.utils.uc_data import uc_pc_households
+    from policyengine_uk_data.storage import STORAGE_FOLDER
+
+    codes = pd.read_csv(STORAGE_FOLDER / "constituencies_2024.csv")["code"]
+    totals = uc_pc_households.household_count.values.astype(float)
+    if len(codes) != len(totals):
+        raise ValueError(
+            f"{len(codes)} constituency codes but {len(totals)} UC totals"
+        )
+
+    cols = [
+        "uc_hh_0_children",
+        "uc_hh_1_child",
+        "uc_hh_2_children",
+        "uc_hh_3plus_children",
+    ]
+    # Normalise each country's counts to shares once, then pick one row of
+    # shares per constituency; reshape keeps (0, 4) for an empty code list.
+    shares = {k: v / v.sum() for k, v in _UC_CHILDREN_BY_COUNTRY.items()}
+    rows = np.array([shares.get(c[0], shares["N"]) for c in codes])
+    rows = rows.reshape(-1, len(cols))
+
+    # (n, 1) totals broadcast across the (n, 4) shares.
+    return pd.DataFrame(np.round(totals[:, None] * rows), columns=cols)
+
+
 def get_la_uc_targets() -> pd.Series:
     """UC household counts for 360 local authorities (positional order).
 
diff --git a/policyengine_uk_data/tests/microsimulation/reforms_config.yaml b/policyengine_uk_data/tests/microsimulation/reforms_config.yaml
@@ -16,7 +16,8 @@ reforms:
   parameters:
     gov.hmrc.child_benefit.amount.additional: 25
 - name: Reduce Universal Credit taper rate to 20%
-  expected_impact: -35.0
+  expected_impact: -39.0
+  tolerance: 15.0
   parameters:
     gov.dwp.universal_credit.means_test.reduction_rate: 0.2
 - name: Raise Class 1 main employee NICs rate to 10%
diff --git a/policyengine_uk_data/tests/test_uc_by_children.py b/policyengine_uk_data/tests/test_uc_by_children.py
@@ -0,0 +1,60 @@
+"""Test UC households by number of children calibration targets.
+
+Validates that the weighted count of UC households split by number of
+children (0, 1, 2, 3+) is within tolerance of the DWP Stat-Xplore totals
+for Great Britain (England + Wales + Scotland, November 2023).
+
+Source: DWP Stat-Xplore, UC Households dataset
+https://stat-xplore.dwp.gov.uk/
+"""
+
+import pytest
+
+# DWP Stat-Xplore November 2023 UC household counts for Great Britain,
+# summed as England + Wales + Scotland (Northern Ireland is not included).
+_TARGETS = {
+    "0_children": 2_411_993 + 141_054 + 253_609,  # 2,806,656
+    "1_child": 948_304 + 52_953 + 86_321,  # 1,087,578
+    "2_children": 802_992 + 44_348 + 66_829,  # 914,169
+    "3plus_children": 495_279 + 26_372 + 35_036,  # 556,687
+}
+
+TOLERANCE = 0.30  # maximum allowed |actual / target - 1| (30% relative)
+
+
+@pytest.mark.xfail(
+    reason="Will pass after recalibration with UC-by-children constituency targets"
+)
+@pytest.mark.parametrize(
+    "bucket,target",
+    list(_TARGETS.items()),
+    ids=list(_TARGETS),
+)
+def test_uc_households_by_children(baseline, bucket, target):
+    """Check the weighted UC household count for one children bucket.
+
+    Sums household weights over households flagged as on UC (benefit-unit
+    ``universal_credit > 0`` mapped to household level) whose child count
+    falls in ``bucket``, then requires the relative gap to ``target`` to be
+    below ``TOLERANCE``.
+    """
+    uc_amount = baseline.calculate("universal_credit", period=2025).values
+    hh_on_uc = baseline.map_result(uc_amount > 0, "benunit", "household") > 0
+
+    child_flags = baseline.calculate(
+        "is_child", map_to="person", period=2025
+    ).values
+    n_children = baseline.map_result(child_flags, "person", "household")
+
+    # Buckets 0-2 match an exact child count; any other bucket means 3+.
+    exact = {"0_children": 0, "1_child": 1, "2_children": 2}.get(bucket)
+    size_ok = n_children >= 3 if exact is None else n_children == exact
+    in_bucket = hh_on_uc & size_ok
+
+    weights = baseline.calculate("household_weight", period=2025).values
+    actual = (weights * in_bucket).sum()
+    relative_gap = abs(actual / target - 1)
+    assert relative_gap < TOLERANCE, (
+        f"UC households with {bucket}: expected {target/1e3:.0f}k, "
+        f"got {actual/1e3:.0f}k ({actual/target*100:.0f}% of target)"
+    )
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "policyengine_uk_data"
-version = "1.39.0"
+version = "1.39.3"
 description = "A package to create representative microdata for the UK."
 readme = "README.md"
 authors = [