|
| 1 | +import pandas as pd |
| 2 | +from pathlib import Path |
| 3 | + |
| 4 | + |
def _parse_uc_national_payment_dist():
    """Parse the UC national payment distribution workbook into long format.

    Reads ``uc_national_payment_dist.xlsx`` from the ``storage`` directory
    located next to this module's parent directory and reshapes the raw
    grid with :func:`_reshape_uc_payment_dist`.

    Returns:
        pandas.DataFrame: one row per (award band, family type) pair with
        the columns ``uc_annual_payment_min``, ``uc_annual_payment_max``,
        ``family_type`` and ``household_count``.
    """
    storage_path = Path(__file__).parent.parent / "storage"
    file_path = storage_path / "uc_national_payment_dist.xlsx"

    # header=None keeps the sheet as a positional grid; the title rows are
    # skipped by position inside the reshaping helper.
    raw = pd.read_excel(file_path, header=None)
    return _reshape_uc_payment_dist(raw)


# Workbook family-type labels mapped to the constant names used downstream.
_UC_FAMILY_TYPE_CODES = {
    "Single, no children": "SINGLE",
    "Single, with children": "LONE_PARENT",
    "Couple, no children": "COUPLE_NO_CHILDREN",
    "Couple, with children": "COUPLE_WITH_CHILDREN",
}


def _uc_monthly_band_to_annual(band):
    """Convert a band like '£100.01 to £200.00' into annual (min, max).

    Returns ``(nan, nan)`` for anything that is not a string of the form
    ``"<number> to <number>"`` (for example an open-ended band), so one odd
    label cannot abort the whole parse.
    """
    missing = (float("nan"), float("nan"))
    if not isinstance(band, str):
        return missing
    parts = band.replace("£", "").replace(",", "").split(" to ")
    if len(parts) != 2:
        return missing
    try:
        # Monthly amounts are annualised by multiplying by 12.
        return float(parts[0]) * 12, float(parts[1]) * 12
    except ValueError:
        return missing


def _reshape_uc_payment_dist(raw):
    """Reshape the raw payment-distribution grid into a tidy long table.

    Args:
        raw: the sheet as a DataFrame read with ``header=None``. Row 7 holds
            the four family-type labels in columns 3-6; from row 9 down,
            column 1 holds the monthly award band and columns 3-6 hold the
            household counts.

    Returns:
        pandas.DataFrame with columns ``uc_annual_payment_min``,
        ``uc_annual_payment_max`` (float64), ``family_type`` and
        ``household_count`` (int64). The columns are present even when no
        data row survives filtering.
    """
    columns = [
        "uc_annual_payment_min",
        "uc_annual_payment_max",
        "family_type",
        "household_count",
    ]

    # Strip stray whitespace so the labels match the mapping keys exactly.
    family_labels = [
        label.strip() if isinstance(label, str) else label
        for label in raw.iloc[7, 3:7].tolist()
    ]

    records = []
    for row in raw.iloc[9:].itertuples(index=False, name=None):
        band = row[1].strip() if isinstance(row[1], str) else row[1]

        # Blank rows and the summary rows are not award bands; counting the
        # "Total" row would double the national total.
        if pd.isna(band) or band in ("No payment", "Total"):
            continue

        annual_min, annual_max = _uc_monthly_band_to_annual(band)

        for col_idx, label in enumerate(family_labels, start=3):
            count = row[col_idx]
            if isinstance(count, str):
                count = count.strip()

            # Skip missing, ".." (suppressed), or zero values.
            if pd.isna(count) or count == ".." or count == 0:
                continue

            records.append((annual_min, annual_max, label, int(count)))

    # Explicit columns and dtypes keep the result well-formed when `records`
    # is empty (a bare DataFrame([]) would have no columns at all).
    result = pd.DataFrame(records, columns=columns).astype(
        {
            "uc_annual_payment_min": "float64",
            "uc_annual_payment_max": "float64",
            "household_count": "int64",
        }
    )
    # Labels without a mapping entry become NaN.
    result["family_type"] = result["family_type"].map(_UC_FAMILY_TYPE_CODES)
    return result
| 81 | + |
| 82 | + |
def _parse_uc_pc_households():
    """Parse UC households per parliamentary constituency (GB + NI).

    Great Britain counts come from ``uc_pc_households.xlsx`` and Northern
    Ireland counts from sheet ``5b`` of
    ``dfc-ni-uc-stats-supp-tables-may-2025.ods``, both in the ``storage``
    directory. The combined counts are then rescaled so that they sum
    (up to per-row rounding) to the national total returned by
    ``_parse_uc_national_payment_dist()``.

    Returns:
        pandas.DataFrame: columns ``constituency_name`` and
        ``household_count`` (scaled, rounded to whole households).

    Raises:
        IndexError: if the NI sheet has no "May 2025" row.
        ValueError: if no constituency households were found, so there is
            nothing to scale.
    """
    storage_path = Path(__file__).parent.parent / "storage"

    def _stripped(value):
        # Tolerate stray whitespace around text cells.
        return value.strip() if isinstance(value, str) else value

    def _unusable(count):
        # Missing cells and the ".." suppression marker carry no count;
        # int("..") would otherwise raise.
        return pd.isna(count) or count == ".."

    data_rows = []

    # --- Great Britain ---------------------------------------------------
    gb_file_path = storage_path / "uc_pc_households.xlsx"
    df_gb = pd.read_excel(gb_file_path, header=None)

    # Data starts at row 8; column 1 is the constituency name and column 3
    # the household count.
    for row in df_gb.iloc[8:].itertuples(index=False, name=None):
        constituency = row[1]
        household_count = _stripped(row[3])

        # Skip empty rows, the Total row, and the Unknown bucket.
        if (
            pd.isna(constituency)
            or _stripped(constituency) in ("Total", "Unknown")
            or _unusable(household_count)
        ):
            continue

        data_rows.append(
            {
                "constituency_name": constituency,
                "household_count": int(household_count),
            }
        )

    # --- Northern Ireland ------------------------------------------------
    ni_file_path = storage_path / "dfc-ni-uc-stats-supp-tables-may-2025.ods"
    df_ni = pd.read_excel(
        ni_file_path, sheet_name="5b", engine="odf", header=None
    )

    # Constituency names sit in row 2, columns 1-18.
    ni_constituencies = df_ni.iloc[2, 1:19].tolist()

    may_2025_rows = df_ni[df_ni[0] == "May 2025"]
    if may_2025_rows.empty:
        # Same exception type as the bare .iloc[0] lookup, but with a
        # message that says what is missing.
        raise IndexError(
            f'no "May 2025" row found in sheet "5b" of {ni_file_path}'
        )
    may_2025_row = may_2025_rows.iloc[0]

    for col_idx, constituency_name in enumerate(ni_constituencies, start=1):
        household_count = _stripped(may_2025_row[col_idx])

        if _unusable(household_count) or household_count == 0:
            continue

        data_rows.append(
            {
                "constituency_name": constituency_name,
                "household_count": int(household_count),
            }
        )

    # Explicit columns keep the frame well-formed even with no rows.
    result_df = pd.DataFrame(
        data_rows, columns=["constituency_name", "household_count"]
    )

    # Scale constituency counts to match the national total.
    national_total = _parse_uc_national_payment_dist()["household_count"].sum()
    constituency_total = result_df["household_count"].sum()
    if constituency_total == 0:
        raise ValueError(
            "no constituency household counts were parsed; cannot scale "
            "to the national total"
        )
    scaling_factor = national_total / constituency_total

    # Each row is rounded independently, so the scaled sum can differ from
    # the national total by a few households.
    result_df["household_count"] = (
        (result_df["household_count"] * scaling_factor).round().astype(int)
    )

    return result_df
| 149 | + |
| 150 | + |
# Module-level dataframes for easy import.
# Both parsers run when this module is first imported, so importing it reads
# the source workbooks from the storage directory.
# NOTE: _parse_uc_pc_households() calls _parse_uc_national_payment_dist()
# itself to obtain the national total, so the national workbook is parsed
# twice during import.
uc_national_payment_dist = _parse_uc_national_payment_dist()
uc_pc_households = _parse_uc_pc_households()
0 commit comments