PolicyEngine · nikhilwoodruff · Dec 1, 2025 · Nov 6, 2025 · Nov 6, 2025 · Nov 6, 2025
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,11 @@
+# Claude notes
+
+The purpose of this repo is to build the .h5 files that feed as input into the policyengine-uk tax-benefit microsimulation model.
+
+## General principles
+
+Claude, please follow these always. These principles are aimed at preventing you from producing AI slop.
+
+1. British English, sentence case
+2. No excessive duplication: keep code files as concise as possible while still delivering the same meaningful value. No excessive printing.
+3. Don't create multiple files for successive versions. Keep checking: have I added lots of intermediate files that are now deprecated? Delete them if so, but ideally don't create them in the first place.
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: minor
+  changes:
+    fixed:
+    - Local authority calibration is now consistent with constituency calibration.
diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py
@@ -29,7 +29,8 @@ def main():
             "Impute salary sacrifice",
             "Impute student loan plan",
             "Uprate to 2025",
-            "Calibrate dataset",
+            "Calibrate constituency weights",
+            "Calibrate local authority weights",
             "Downrate to 2023",
             "Save final dataset",
         ]
@@ -98,12 +99,12 @@ def main():
             frs = uprate_dataset(frs, 2025)
             update_dataset("Uprate to 2025", "completed")
 
-            # Calibrate dataset with nested progress
+            # Calibrate constituency weights with nested progress
             from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import (
                 calibrate,
             )
 
-            update_dataset("Calibrate dataset", "processing")
+            update_dataset("Calibrate constituency weights", "processing")
 
             # Use a separate progress tracker for calibration with nested display
             from policyengine_uk_data.utils.calibrate import (
@@ -132,7 +133,30 @@ def main():
                 nested_progress=nested_progress,  # Pass the nested progress manager
             )
 
-            update_dataset("Calibrate dataset", "completed")
+            update_dataset("Calibrate constituency weights", "completed")
+
+            # Calibrate local authority weights
+            from policyengine_uk_data.datasets.local_areas.local_authorities.loss import (
+                create_local_authority_target_matrix,
+                create_national_target_matrix as create_national_target_matrix_la,
+            )
+
+            update_dataset("Calibrate local authority weights", "processing")
+
+            frs_calibrated = calibrate_local_areas(
+                dataset=frs_calibrated,
+                matrix_fn=create_local_authority_target_matrix,
+                national_matrix_fn=create_national_target_matrix_la,
+                area_count=360,
+                weight_file="local_authority_weights.h5",
+                excluded_training_targets=[],
+                log_csv=None,
+                verbose=True,
+                area_name="Local Authority",
+                nested_progress=nested_progress,
+            )
+
+            update_dataset("Calibrate local authority weights", "completed")
 
             # Downrate and save
             update_dataset("Downrate to 2023", "processing")
@@ -150,7 +174,7 @@ def main():
                 "base_dataset": "frs_2023_24.h5",
                 "enhanced_dataset": "enhanced_frs_2023_24.h5",
                 "imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan",
-                "calibration": "national and constituency targets",
+                "calibration": "national, LA and  constituency targets",
             },
         )
 

diff --git a/policyengine_uk_data/datasets/local_areas/constituencies/loss.py b/policyengine_uk_data/datasets/local_areas/constituencies/loss.py
@@ -1,11 +1,6 @@
-import torch
 from policyengine_uk import Microsimulation
 import pandas as pd
 import numpy as np
-
-# Fill in missing constituencies with average column values
-import pandas as pd
-import numpy as np
 from pathlib import Path
 
 from policyengine_uk_data.utils.loss import (
@@ -25,16 +20,12 @@ def create_constituency_target_matrix(
     dataset: UKSingleYearDataset,
     time_period: int = None,
     reform=None,
-    uprate: bool = True,
 ):
     if time_period is None:
         time_period = dataset.time_period
     ages = pd.read_csv(FOLDER / "targets" / "age.csv")
     national_demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv")
     incomes = pd.read_csv(FOLDER / "targets" / "spi_by_constituency.csv")
-    employment_incomes = pd.read_csv(
-        FOLDER / "targets" / "employment_income.csv"
-    )
 
     sim = Microsimulation(dataset=dataset, reform=reform)
     sim.default_calculation_period = dataset.time_period
@@ -121,11 +112,6 @@ def create_constituency_target_matrix(
         age_str = f"{lower_age}_{upper_age}"
         y[f"age/{age_str}"] *= uk_total_population / targets_total_pop * 0.9
 
-    employment_income = sim.calculate("employment_income").values
-    bounds = list(
-        employment_incomes.employment_income_lower_bound.sort_values().unique()
-    ) + [np.inf]
-
     # UC household count by constituency
     y["uc_households"] = uc_pc_households.household_count.values
     matrix["uc_households"] = sim.map_result(
@@ -134,59 +120,6 @@ def create_constituency_target_matrix(
         "household",
     )
 
-    for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
-        continue
-        if (
-            lower_bound <= 20_000
-        ):  # Skip some targets with very small sample sizes
-            continue
-        if upper_bound >= 100_000:
-            continue
-
-        national_data_row = national_incomes[
-            national_incomes.total_income_lower_bound == lower_bound
-        ]["employment_income_amount"].iloc[0]
-
-        count_target = employment_incomes[
-            (employment_incomes.employment_income_lower_bound == lower_bound)
-            & (employment_incomes.employment_income_upper_bound == upper_bound)
-        ].employment_income_count.values
-
-        amount_target = employment_incomes[
-            (employment_incomes.employment_income_lower_bound == lower_bound)
-            & (employment_incomes.employment_income_upper_bound == upper_bound)
-        ].employment_income_amount.values
-
-        sum_of_local_area_values = amount_target.sum()
-
-        adjustment = national_data_row / sum_of_local_area_values
-
-        if count_target.mean() < 200:
-            print(
-                f"Skipping employment income band {lower_bound} to {upper_bound} due to low count target mean: {count_target.mean()}"
-            )
-            continue
-
-        if amount_target.mean() < 200 * 30e3:
-            print(
-                f"Skipping employment income band {lower_bound} to {upper_bound} due to low amount target mean: {amount_target.mean()}"
-            )
-            continue
-
-        in_bound = (
-            (employment_income >= lower_bound)
-            & (employment_income < upper_bound)
-            & (employment_income != 0)
-            & (age >= 16)
-        )
-        band_str = f"{lower_bound}_{upper_bound}"
-        matrix[f"hmrc/employment_income/amount/{band_str}"] = sim.map_result(
-            employment_income * in_bound, "person", "household"
-        )
-        y[f"hmrc/employment_income/amount/{band_str}"] = (
-            amount_target * adjustment
-        )
-
     const_2024 = pd.read_csv(STORAGE_FOLDER / "constituencies_2024.csv")
     const_2010 = pd.read_csv(STORAGE_FOLDER / "constituencies_2010.csv")
 

diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py
@@ -1,13 +1,17 @@
+import pandas as pd
 from policyengine_uk_data.utils.calibrate import calibrate_local_areas
 from policyengine_uk_data.datasets.local_areas.local_authorities.loss import (
     create_local_authority_target_matrix,
     create_national_target_matrix,
 )
+from policyengine_uk_data.storage import STORAGE_FOLDER
 from policyengine_uk.data import UKSingleYearDataset
 
 
 def calibrate(
     dataset: UKSingleYearDataset,
+    excluded_training_targets=[],
+    log_csv="calibration_log.csv",
     verbose: bool = False,
 ):
     return calibrate_local_areas(
@@ -20,12 +24,95 @@ def calibrate(
         ),
         area_count=360,
         weight_file="local_authority_weights.h5",
-        excluded_training_targets=[],
-        log_csv=None,
+        excluded_training_targets=excluded_training_targets,
+        log_csv=log_csv,
         verbose=verbose,
         area_name="Local Authority",
+        get_performance=get_performance,
     )
 
 
+def get_performance(weights, m_c, y_c, m_n, y_n, excluded_targets):
+    """Compare calibrated estimates with local authority and national targets.
+
+    Reads local authority codes and names from `local_authorities_2021.csv`
+    in `STORAGE_FOLDER`; its rows are assumed to be ordered like the rows of
+    `weights` and `y_c` (labels are attached by position).
+
+    Args:
+        weights: Weight matrix used as `weights @ m_c`; one row per local
+            authority and presumably one column per household (confirm).
+        m_c: Local authority target matrix, one column per metric.
+        y_c: Local authority target values (DataFrame); not modified here.
+        m_n: National target matrix, one column per metric.
+        y_n: National target values, assumed ordered like `m_n` columns.
+        excluded_targets: Metric names to flag in the `validation` column.
+
+    Returns:
+        Long DataFrame with columns `index`, `name`, `metric`, `estimate`,
+        `target`, `error`, `abs_error`, `rel_abs_error` and `validation`;
+        national rows carry name "UK" and index 0.
+    """
+    local_authorities = pd.read_csv(
+        STORAGE_FOLDER / "local_authorities_2021.csv"
+    )
+    codes = local_authorities.code.values
+    names = local_authorities.name.values
+
+    def to_long(wide, value_name):
+        # Copy first so the caller's frame (notably y_c) is never mutated,
+        # then label rows by area and melt to one row per area and metric.
+        wide = wide.copy()
+        wide.index = codes
+        wide["name"] = names
+        return pd.melt(
+            wide.reset_index(),
+            id_vars=["index", "name"],
+            var_name="metric",
+            value_name=value_name,
+        )
+
+    def add_errors(frame):
+        # rel_abs_error is inf (or NaN) wherever the target is zero.
+        frame["error"] = frame["estimate"] - frame["target"]
+        frame["abs_error"] = frame["error"].abs()
+        frame["rel_abs_error"] = frame["abs_error"] / frame["target"]
+        return frame
+
+    # Pair each local authority estimate with its target.
+    la_target_validation = add_errors(
+        pd.merge(
+            to_long(weights @ m_c, "estimate"),
+            to_long(y_c, "target").drop(columns="name"),
+            on=["index", "metric"],
+        )
+    )
+
+    # Collapse the per-area weights into one national weight vector before
+    # applying the national target matrix.
+    national_performance = weights.sum(axis=0) @ m_n
+    national_target_validation = pd.DataFrame(
+        {
+            "metric": national_performance.index,
+            "estimate": national_performance.values,
+        }
+    )
+    national_target_validation["target"] = y_n.values
+    national_target_validation = add_errors(national_target_validation)
+
+    # Stack local and national rows into a single table.
+    df = pd.concat(
+        [
+            la_target_validation,
+            national_target_validation.assign(name="UK", index=0),
+        ]
+    ).reset_index(drop=True)
+
+    # Mark the metrics listed in excluded_targets.
+    df["validation"] = df.metric.isin(excluded_targets)
+
+    return df
+
+
 if __name__ == "__main__":
     calibrate()