diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29bb..9c0140194 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Universal Credit calibration at national level by award amount and family type, and at constituency level in total. diff --git a/policyengine_uk_data/datasets/local_areas/constituencies/loss.py b/policyengine_uk_data/datasets/local_areas/constituencies/loss.py index 18bdeef40..76969bf8a 100644 --- a/policyengine_uk_data/datasets/local_areas/constituencies/loss.py +++ b/policyengine_uk_data/datasets/local_areas/constituencies/loss.py @@ -16,6 +16,7 @@ mapping_matrix, ) from policyengine_uk.data import UKSingleYearDataset +from policyengine_uk_data.utils.uc_data import uc_pc_households FOLDER = Path(__file__).parent @@ -125,6 +126,14 @@ def create_constituency_target_matrix( employment_incomes.employment_income_lower_bound.sort_values().unique() ) + [np.inf] + # UC household count by constituency + y["uc_households"] = uc_pc_households.household_count.values + matrix["uc_households"] = sim.map_result( + (sim.calculate("universal_credit").values > 0).astype(int), + "benunit", + "household", + ) + for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]): continue if ( diff --git a/policyengine_uk_data/datasets/spi.py b/policyengine_uk_data/datasets/spi.py index 5c930fa85..253b14554 100644 --- a/policyengine_uk_data/datasets/spi.py +++ b/policyengine_uk_data/datasets/spi.py @@ -22,25 +22,22 @@ def create_spi( household["household_weight"] = df.FACT person["dividend_income"] = df.DIVIDENDS person["gift_aid"] = df.GIFTAID - household["region"] = ( - df.GORCODE.map( - { - 1: "NORTH_EAST", - 2: "NORTH_WEST", - 3: "YORKSHIRE", - 4: "EAST_MIDLANDS", - 5: "WEST_MIDLANDS", - 6: "EAST_OF_ENGLAND", - 7: "LONDON", - 8: "SOUTH_EAST", - 9: "SOUTH_WEST", - 10: "WALES", - 11: "SCOTLAND", - 12: "NORTHERN_IRELAND", - } - ) - .fillna("SOUTH_EAST") - ) + household["region"] = 
df.GORCODE.map( + { + 1: "NORTH_EAST", + 2: "NORTH_WEST", + 3: "YORKSHIRE", + 4: "EAST_MIDLANDS", + 5: "WEST_MIDLANDS", + 6: "EAST_OF_ENGLAND", + 7: "LONDON", + 8: "SOUTH_EAST", + 9: "SOUTH_WEST", + 10: "WALES", + 11: "SCOTLAND", + 12: "NORTHERN_IRELAND", + } + ).fillna("SOUTH_EAST") household["rent"] = 0 household["tenure_type"] = "OWNED_OUTRIGHT" household["council_tax"] = 0 diff --git a/policyengine_uk_data/storage/UC_DATA_SOURCES.md b/policyengine_uk_data/storage/UC_DATA_SOURCES.md new file mode 100644 index 000000000..3c95e7f78 --- /dev/null +++ b/policyengine_uk_data/storage/UC_DATA_SOURCES.md @@ -0,0 +1,31 @@ +# Universal Credit data sources + +## National payment distribution + +Source: Stat-Xplore (DWP) +- Rows: Monthly award amount bands + Households on Universal Credit +- Columns: Family type +- File: `uc_national_payment_dist.xlsx` + +## Parliamentary constituency households + +### Great Britain data + +Source: Stat-Xplore (DWP) +- Rows: Westminster Parliamentary Constituency 2024 + Households on Universal Credit +- File: `uc_pc_households.xlsx` + +### Northern Ireland data + +Source: Department for Communities Northern Ireland +- URL: https://www.communities-ni.gov.uk/publications/universal-credit-statistics-may-2025 +- File: `dfc-ni-uc-stats-supp-tables-may-2025.ods` +- Sheet: 5b +- Data: Household counts by Westminster Parliamentary Constituency 2024 + +The NI data is combined with the GB data to produce a complete UK-wide parliamentary constituency table. 
+ +## Data processing notes + +- The "Unknown" constituency category is excluded from the constituency data +- Constituency household counts are scaled to match the national total from the payment distribution data, as the two sources have different totals due to timing and methodology differences diff --git a/policyengine_uk_data/storage/dfc-ni-uc-stats-supp-tables-may-2025.ods b/policyengine_uk_data/storage/dfc-ni-uc-stats-supp-tables-may-2025.ods new file mode 100644 index 000000000..aa8fd524c Binary files /dev/null and b/policyengine_uk_data/storage/dfc-ni-uc-stats-supp-tables-may-2025.ods differ diff --git a/policyengine_uk_data/storage/uc_national_payment_dist.xlsx b/policyengine_uk_data/storage/uc_national_payment_dist.xlsx new file mode 100644 index 000000000..32d34afb0 Binary files /dev/null and b/policyengine_uk_data/storage/uc_national_payment_dist.xlsx differ diff --git a/policyengine_uk_data/storage/uc_pc_households.xlsx b/policyengine_uk_data/storage/uc_pc_households.xlsx new file mode 100644 index 000000000..b1bde6ae9 Binary files /dev/null and b/policyengine_uk_data/storage/uc_pc_households.xlsx differ diff --git a/policyengine_uk_data/utils/loss.py b/policyengine_uk_data/utils/loss.py index 39edfd4bd..ae24cfc1a 100644 --- a/policyengine_uk_data/utils/loss.py +++ b/policyengine_uk_data/utils/loss.py @@ -11,6 +11,7 @@ from policyengine_uk_data.storage import STORAGE_FOLDER from policyengine_uk_data.utils import uprate_values from policyengine_uk.data import UKSingleYearDataset +from policyengine_uk_data.utils.uc_data import uc_national_payment_dist tax_benefit = pd.read_csv(STORAGE_FOLDER / "tax_benefit.csv") tax_benefit["name"] = tax_benefit["name"].apply(lambda x: f"obr/{x}") @@ -382,6 +383,26 @@ def pe_count(*variables): 60 * 52 * 115_000 ) # same source as above, multiply avg cap amount by total capped population + # UC national payment distribution + + uc_payment_dist = uc_national_payment_dist + uc_payments = sim.calculate("universal_credit", 
"""Universal Credit (UC) calibration tables parsed from stored spreadsheets.

Two tables are exposed as module attributes:

- ``uc_national_payment_dist``: household counts by annual award band and
  family type, read from ``uc_national_payment_dist.xlsx``.
- ``uc_pc_households``: household counts by parliamentary constituency,
  read from ``uc_pc_households.xlsx`` (Great Britain) and sheet ``5b`` of
  ``dfc-ni-uc-stats-supp-tables-may-2025.ods`` (Northern Ireland), rescaled
  so that they sum to the national table's total.

Both tables are built on first access and then cached in the module
namespace, so ``from ...uc_data import uc_pc_households`` keeps working
while each spreadsheet is parsed at most once.
"""

import re
from pathlib import Path

import pandas as pd

_STORAGE_PATH = Path(__file__).parent.parent / "storage"

_MONTHS_PER_YEAR = 12

# Spreadsheet family-type labels -> constant names used for calibration.
_FAMILY_TYPE_MAPPING = {
    "Single, no children": "SINGLE",
    "Single, with children": "LONE_PARENT",
    "Couple, no children": "COUPLE_NO_CHILDREN",
    "Couple, with children": "COUPLE_WITH_CHILDREN",
}

_NATIONAL_DIST_COLUMNS = [
    "uc_annual_payment_min",
    "uc_annual_payment_max",
    "family_type",
    "household_count",
]

_PC_HOUSEHOLD_COLUMNS = ["constituency_name", "household_count"]

# An open-ended top band once "£" and "," are stripped, e.g.
# "2500.01 or over", "2500.01 and above" or "2500.01+".
_OPEN_ENDED_BAND = re.compile(
    r"\s*(\d+(?:\.\d+)?)\s*(?:\+|(?:or|and)\s+(?:over|above|more))\s*",
    flags=re.IGNORECASE,
)

# Names served lazily by the module-level ``__getattr__`` below.
_TABLE_NAMES = ("uc_national_payment_dist", "uc_pc_households")


def _annual_band_bounds(band):
    """Convert a monthly award band label into annual ``(min, max)`` bounds.

    ``"£100.01 to £200.00"`` becomes ``(1200.12, 2400.0)``. An open-ended
    label such as ``"£2,500.01 or over"`` gets ``float("inf")`` as its upper
    bound. Anything else (non-string cells, unrecognised text) returns
    ``None`` so the caller can skip the row instead of emitting NaN bounds,
    which no simulated payment could ever fall between.

    NOTE: the monthly labels are multiplied by 12 as-is, so consecutive
    annual bands built from "x.01 to y.00" labels are 0.12 apart.
    """
    if not isinstance(band, str):
        return None

    cleaned = band.replace("£", "").replace(",", "")
    parts = cleaned.split(" to ")
    if len(parts) == 2:
        try:
            lower, upper = (float(part) for part in parts)
        except ValueError:
            return None
        return lower * _MONTHS_PER_YEAR, upper * _MONTHS_PER_YEAR

    open_ended = _OPEN_ENDED_BAND.fullmatch(cleaned)
    if open_ended:
        return float(open_ended.group(1)) * _MONTHS_PER_YEAR, float("inf")
    return None


def _national_payment_dist_from_sheet(sheet):
    """Turn the raw national payment sheet into a long-format table.

    Args:
        sheet: The workbook read with ``header=None``. Row 7, columns 3-6
            hold the four family-type labels; data rows start at row 9 with
            the monthly award band label in column 1.

    Returns:
        DataFrame with columns ``uc_annual_payment_min``,
        ``uc_annual_payment_max``, ``family_type`` and ``household_count``.
        "No payment" and "Total" rows, rows whose band label cannot be
        parsed, and cells that are missing, suppressed ("..") or zero are
        left out.
    """
    family_types = sheet.iloc[7, 3:7].tolist()

    records = []
    for idx in range(9, len(sheet)):
        award_band = sheet.iloc[idx, 1]
        if pd.isna(award_band) or award_band in ("No payment", "Total"):
            continue

        bounds = _annual_band_bounds(award_band)
        if bounds is None:
            continue
        annual_min, annual_max = bounds

        for col_idx, family_type in enumerate(family_types, start=3):
            household_count = sheet.iloc[idx, col_idx]
            if (
                pd.isna(household_count)
                or household_count == ".."
                or household_count == 0
            ):
                continue
            records.append(
                {
                    "uc_annual_payment_min": annual_min,
                    "uc_annual_payment_max": annual_max,
                    "family_type": family_type,
                    "household_count": int(household_count),
                }
            )

    result_df = pd.DataFrame(records, columns=_NATIONAL_DIST_COLUMNS)
    result_df["family_type"] = result_df["family_type"].map(
        _FAMILY_TYPE_MAPPING
    )
    return result_df


def _pc_households_from_sheets(gb_sheet, ni_sheet, national_total):
    """Combine GB and NI constituency counts and scale them to a total.

    Args:
        gb_sheet: GB workbook read with ``header=None``; data rows start at
            row 8 with the constituency name in column 1 and the household
            count in column 3.
        ni_sheet: NI sheet read with ``header=None``; row 2, columns 1-18
            hold the constituency names and the row whose first cell is
            "May 2025" holds the counts.
        national_total: Household total the combined counts are scaled to.

    Returns:
        DataFrame with columns ``constituency_name`` and
        ``household_count`` (GB rows first, then NI), counts rounded to
        integers after scaling.

    Raises:
        ValueError: If the NI sheet has no "May 2025" row, or if no
            non-zero constituency counts are found (scaling is undefined).
    """
    records = []

    for idx in range(8, len(gb_sheet)):
        constituency = gb_sheet.iloc[idx, 1]
        household_count = gb_sheet.iloc[idx, 3]
        if (
            pd.isna(constituency)
            or pd.isna(household_count)
            or constituency in ("Total", "Unknown")
        ):
            continue
        records.append(
            {
                "constituency_name": constituency,
                "household_count": int(household_count),
            }
        )

    ni_constituencies = ni_sheet.iloc[2, 1:19].tolist()
    may_2025_rows = ni_sheet[ni_sheet[0] == "May 2025"]
    if may_2025_rows.empty:
        raise ValueError(
            'No "May 2025" row found in the Northern Ireland UC sheet.'
        )
    may_2025_row = may_2025_rows.iloc[0]

    for col_idx, constituency_name in enumerate(ni_constituencies, start=1):
        household_count = may_2025_row[col_idx]
        if pd.notna(household_count) and household_count != 0:
            records.append(
                {
                    "constituency_name": constituency_name,
                    "household_count": int(household_count),
                }
            )

    result_df = pd.DataFrame(records, columns=_PC_HOUSEHOLD_COLUMNS)

    constituency_total = result_df["household_count"].sum()
    if constituency_total == 0:
        raise ValueError(
            "No constituency household counts found; cannot scale to the "
            "national total."
        )

    # The sources disagree on the overall total, so rescale the
    # constituency counts to sum (up to rounding) to the national figure.
    scaling_factor = national_total / constituency_total
    result_df["household_count"] = (
        (result_df["household_count"] * scaling_factor).round().astype(int)
    )
    return result_df


def _parse_uc_national_payment_dist():
    """Parse UC national payment distribution into long format."""
    sheet = pd.read_excel(
        _STORAGE_PATH / "uc_national_payment_dist.xlsx", header=None
    )
    return _national_payment_dist_from_sheet(sheet)


def _parse_uc_pc_households(national_total=None):
    """Parse UC parliamentary constituency households (GB + NI).

    Args:
        national_total: Household total to scale the constituency counts
            to. When omitted, it is taken from a fresh parse of the
            national payment distribution.
    """
    gb_sheet = pd.read_excel(
        _STORAGE_PATH / "uc_pc_households.xlsx", header=None
    )
    ni_sheet = pd.read_excel(
        _STORAGE_PATH / "dfc-ni-uc-stats-supp-tables-may-2025.ods",
        sheet_name="5b",
        engine="odf",
        header=None,
    )
    if national_total is None:
        national_total = _parse_uc_national_payment_dist()[
            "household_count"
        ].sum()
    return _pc_households_from_sheets(gb_sheet, ni_sheet, national_total)


def __getattr__(name):
    """Build ``uc_national_payment_dist`` / ``uc_pc_households`` on demand.

    Python only calls this when ``name`` is not yet a module global, so each
    table is parsed once and then served from the module namespace. The
    national table is reused for the constituency scaling total rather than
    being parsed a second time.
    """
    if name not in _TABLE_NAMES:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_globals = globals()
    if "uc_national_payment_dist" not in module_globals:
        module_globals["uc_national_payment_dist"] = (
            _parse_uc_national_payment_dist()
        )
    if name == "uc_pc_households":
        national_total = module_globals["uc_national_payment_dist"][
            "household_count"
        ].sum()
        module_globals[name] = _parse_uc_pc_households(
            national_total=national_total
        )
    return module_globals[name]