Merge pull request #212 from PolicyEngine/re-add-uc

nikhilwoodruff · web-flow · commit ebbce1229d0c · 2025-10-21T13:08:56.000+01:00
Re-add UC calibration
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: minor
+  changes:
+    added:
+    - Universal Credit calibration at the national level by award amount band and family type, and at the constituency level by total household count.
diff --git a/policyengine_uk_data/datasets/local_areas/constituencies/loss.py b/policyengine_uk_data/datasets/local_areas/constituencies/loss.py
@@ -16,6 +16,7 @@
     mapping_matrix,
 )
 from policyengine_uk.data import UKSingleYearDataset
+from policyengine_uk_data.utils.uc_data import uc_pc_households
 
 FOLDER = Path(__file__).parent
 
@@ -125,6 +126,14 @@ def create_constituency_target_matrix(
         employment_incomes.employment_income_lower_bound.sort_values().unique()
     ) + [np.inf]
 
+    # UC household count by constituency
+    y["uc_households"] = uc_pc_households.household_count.values
+    matrix["uc_households"] = sim.map_result(
+        (sim.calculate("universal_credit").values > 0).astype(int),
+        "benunit",
+        "household",
+    )
+
     for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
         continue
         if (
diff --git a/policyengine_uk_data/storage/UC_DATA_SOURCES.md b/policyengine_uk_data/storage/UC_DATA_SOURCES.md
@@ -0,0 +1,31 @@
+# Universal Credit data sources
+
+## National payment distribution
+
+Source: Stat-Xplore (DWP)
+- Rows: Monthly award amount bands + Households on Universal Credit
+- Columns: Family type
+- File: `uc_national_payment_dist.xlsx`
+
+## Parliamentary constituency households
+
+### Great Britain data
+
+Source: Stat-Xplore (DWP)
+- Rows: Westminster Parliamentary Constituency 2024 + Households on Universal Credit
+- File: `uc_pc_households.xlsx`
+
+### Northern Ireland data
+
+Source: Department for Communities Northern Ireland
+- URL: https://www.communities-ni.gov.uk/publications/universal-credit-statistics-may-2025
+- File: `dfc-ni-uc-stats-supp-tables-may-2025.ods`
+- Sheet: 5b
+- Data: Household counts by Westminster Parliamentary Constituency 2024
+
+The NI data is combined with the GB data to produce a complete UK-wide parliamentary constituency table.
+
+## Data processing notes
+
+- The "Unknown" constituency category is excluded from the constituency data
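+- Constituency household counts are scaled to match the national total from the payment distribution data (the sum of the award-band cells retained after parsing, which excludes the "No payment" row and any suppressed or zero cells), as the two sources have different totals due to timing and methodology differences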
diff --git a/policyengine_uk_data/storage/dfc-ni-uc-stats-supp-tables-may-2025.ods b/policyengine_uk_data/storage/dfc-ni-uc-stats-supp-tables-may-2025.ods
diff --git a/policyengine_uk_data/storage/uc_national_payment_dist.xlsx b/policyengine_uk_data/storage/uc_national_payment_dist.xlsx
diff --git a/policyengine_uk_data/storage/uc_pc_households.xlsx b/policyengine_uk_data/storage/uc_pc_households.xlsx
diff --git a/policyengine_uk_data/tests/microsimulation/reforms_config.yaml b/policyengine_uk_data/tests/microsimulation/reforms_config.yaml
@@ -4,7 +4,7 @@ reforms:
   parameters:
     gov.hmrc.income_tax.rates.uk[0].rate: 0.21
 - name: Raise higher rate by 1pp
-  expected_impact: 5.5
+  expected_impact: 5.4
   parameters:
     gov.hmrc.income_tax.rates.uk[1].rate: 0.42
 - name: Raise personal allowance by ~800GBP/year
@@ -16,15 +16,15 @@ reforms:
   parameters:
     gov.hmrc.child_benefit.amount.additional: 25
 - name: Reduce Universal Credit taper rate to 20%
-  expected_impact: -34.4
+  expected_impact: -30.7
   parameters:
     gov.dwp.universal_credit.means_test.reduction_rate: 0.2
 - name: Raise Class 1 main employee NICs rate to 10%
   expected_impact: 12.4
   parameters:
     gov.hmrc.national_insurance.class_1.rates.employee.main: 0.1
 - name: Raise VAT standard rate by 2pp
-  expected_impact: 18.7
+  expected_impact: 19.3
   parameters:
     gov.hmrc.vat.standard_rate: 0.22
 - name: Raise additional rate by 3pp
diff --git a/policyengine_uk_data/utils/loss.py b/policyengine_uk_data/utils/loss.py
@@ -11,6 +11,7 @@
 from policyengine_uk_data.storage import STORAGE_FOLDER
 from policyengine_uk_data.utils import uprate_values
 from policyengine_uk.data import UKSingleYearDataset
+from policyengine_uk_data.utils.uc_data import uc_national_payment_dist
 
 tax_benefit = pd.read_csv(STORAGE_FOLDER / "tax_benefit.csv")
 tax_benefit["name"] = tax_benefit["name"].apply(lambda x: f"obr/{x}")
@@ -382,6 +383,26 @@ def pe_count(*variables):
         60 * 52 * 115_000
     )  # same source as above, multiply avg cap amount by total capped population
 
+    # UC national payment distribution
+
+    uc_payment_dist = uc_national_payment_dist
+    uc_payments = sim.calculate("universal_credit", map_to="benunit").values
+    uc_family_type = sim.calculate("family_type", map_to="benunit").values
+
+    for i, row in uc_payment_dist.iterrows():
+        lower = row.uc_annual_payment_min
+        upper = row.uc_annual_payment_max
+        family_type = row.family_type
+        in_band = (
+            (uc_payments >= lower)
+            & (uc_payments < upper)
+            & (uc_family_type == family_type)
+        )
+        name = f"dwp/uc_payment_dist/{family_type}_annual_payment_{lower:_.0f}_to_{upper:_.0f}"
+        df[name] = household_from_family(in_band)
+        target_names.append(name)
+        target_values.append(row.household_count)
+
     combined_targets = pd.concat(
         [
             targets,
diff --git a/policyengine_uk_data/utils/uc_data.py b/policyengine_uk_data/utils/uc_data.py
@@ -0,0 +1,153 @@
+import pandas as pd
+from pathlib import Path
+
+
+def _parse_uc_national_payment_dist():
+    """Parse UC national payment distribution into long format."""
+    storage_path = Path(__file__).parent.parent / "storage"
+    file_path = storage_path / "uc_national_payment_dist.xlsx"
+
+    # Read the Excel file, skipping header rows
+    df = pd.read_excel(file_path, header=None)
+
+    # Extract family types from row 7 (index 7)
+    family_types = df.iloc[7, 3:7].tolist()  # Columns 3-6: the 4 family types
+
+    # Extract data rows (starting from row 9, index 9)
+    data_rows = []
+
+    for idx in range(9, len(df)):
+        award_band = df.iloc[idx, 1]  # Monthly award amount band
+
+        # Skip if not a valid award band
+        if pd.isna(award_band) or award_band in ["No payment", "Total"]:
+            continue
+
+        for col_idx, family_type in enumerate(family_types, start=3):
+            household_count = df.iloc[idx, col_idx]
+
+            # Skip missing, ".." (suppressed), or zero values
+            if (
+                pd.isna(household_count)
+                or household_count == ".."
+                or household_count == 0
+            ):
+                continue
+
+            data_rows.append(
+                {
+                    "monthly_award_band": award_band,
+                    "family_type": family_type,
+                    "household_count": int(household_count),
+                }
+            )
+
+    result_df = pd.DataFrame(data_rows)
+
+    # Parse monthly band into min and max, then convert to annual
+    def parse_band(band):
+        """Parse band like '£100.01 to £200.00' into (min, max)."""
+        parts = band.replace("£", "").replace(",", "").split(" to ")
+        if len(parts) == 2:
+            return float(parts[0]) * 12, float(parts[1]) * 12
+        return None, None
+
+    result_df[["uc_annual_payment_min", "uc_annual_payment_max"]] = result_df[
+        "monthly_award_band"
+    ].apply(lambda x: pd.Series(parse_band(x)))
+
+    # Map family types to constant names
+    family_type_mapping = {
+        "Single, no children": "SINGLE",
+        "Single, with children": "LONE_PARENT",
+        "Couple, no children": "COUPLE_NO_CHILDREN",
+        "Couple, with children": "COUPLE_WITH_CHILDREN",
+    }
+    result_df["family_type"] = result_df["family_type"].map(
+        family_type_mapping
+    )
+
+    # Reorder columns and drop monthly band
+    result_df = result_df[
+        [
+            "uc_annual_payment_min",
+            "uc_annual_payment_max",
+            "family_type",
+            "household_count",
+        ]
+    ]
+
+    return result_df
+
+
+def _parse_uc_pc_households():
+    """Parse UC parliamentary constituency households (GB + NI)."""
+    storage_path = Path(__file__).parent.parent / "storage"
+
+    # Parse GB data
+    gb_file_path = storage_path / "uc_pc_households.xlsx"
+    df_gb = pd.read_excel(gb_file_path, header=None)
+
+    gb_data_rows = []
+
+    for idx in range(8, len(df_gb)):
+        constituency = df_gb.iloc[idx, 1]  # Column 1: constituency name
+        household_count = df_gb.iloc[idx, 3]  # Column 3: household count
+
+        # Skip if empty, invalid, Total row, or Unknown
+        if (
+            pd.isna(constituency)
+            or pd.isna(household_count)
+            or constituency in ["Total", "Unknown"]
+        ):
+            continue
+
+        gb_data_rows.append(
+            {
+                "constituency_name": constituency,
+                "household_count": int(household_count),
+            }
+        )
+
+    # Parse NI data
+    ni_file_path = storage_path / "dfc-ni-uc-stats-supp-tables-may-2025.ods"
+    df_ni = pd.read_excel(
+        ni_file_path, sheet_name="5b", engine="odf", header=None
+    )
+
+    # Get constituency names from row 2, columns 1-18
+    ni_constituencies = df_ni.iloc[2, 1:19].tolist()
+
+    # Find May 2025 row
+    may_2025_row = df_ni[df_ni[0] == "May 2025"].iloc[0]
+
+    ni_data_rows = []
+    for col_idx, constituency_name in enumerate(ni_constituencies, start=1):
+        household_count = may_2025_row[col_idx]
+
+        if pd.notna(household_count) and household_count != 0:
+            ni_data_rows.append(
+                {
+                    "constituency_name": constituency_name,
+                    "household_count": int(household_count),
+                }
+            )
+
+    # Combine GB and NI data
+    result_df = pd.DataFrame(gb_data_rows + ni_data_rows)
+
+    # Scale constituency counts to match national total
+    national_total = _parse_uc_national_payment_dist()["household_count"].sum()
+    constituency_total = result_df["household_count"].sum()
+    scaling_factor = national_total / constituency_total
+
+    result_df["household_count"] = (
+        (result_df["household_count"] * scaling_factor).round().astype(int)
+    )
+
+    return result_df
+
+
+# Module-level dataframes for easy import; both parsers run (and read the storage spreadsheets) when this module is first imported
+uc_national_payment_dist = _parse_uc_national_payment_dist()
+uc_pc_households = _parse_uc_pc_households()
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,9 @@ dependencies = [
     "microimpute>=1.0.1",
     "black>=25.1.0",
     "rich>=13.0.0",
+    "odfpy",
+    "pandas",
+    "openpyxl",
 ]
 
 [project.optional-dependencies]

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,9 @@ dependencies = [`
`26`	`26`	`"microimpute>=1.0.1",`
`27`	`27`	`"black>=25.1.0",`
`28`	`28`	`"rich>=13.0.0",`
	`29`	`+ "odfpy",`
	`30`	`+ "pandas",`
	`31`	`+ "openpyxl",`
`29`	`32`	`]`
`30`	`33`
`31`	`34`	`[project.optional-dependencies]`