Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Claude notes

The purpose of this repo is to build the .h5 files that feed as input into the policyengine-uk tax-benefit microsimulation model.

## General principles

Claude, please follow these always. These principles are aimed at preventing you from producing AI slop.

1. British English, sentence case
2. No excessive duplication, keep code files as concise as possible to produce the same meaningful value. No excessive printing
3. Don't create multiple files for successive versions. Keep checking: have I added lots of intermediate files which are deprecated? Delete them if so, but ideally don't create them in the first place
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: minor
changes:
fixed:
- LA calibration now consistent with constituency calibration.
34 changes: 29 additions & 5 deletions policyengine_uk_data/datasets/create_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def main():
"Impute salary sacrifice",
"Impute student loan plan",
"Uprate to 2025",
"Calibrate dataset",
"Calibrate constituency weights",
"Calibrate local authority weights",
"Downrate to 2023",
"Save final dataset",
]
Expand Down Expand Up @@ -98,12 +99,12 @@ def main():
frs = uprate_dataset(frs, 2025)
update_dataset("Uprate to 2025", "completed")

# Calibrate dataset with nested progress
# Calibrate constituency weights with nested progress
from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import (
calibrate,
)

update_dataset("Calibrate dataset", "processing")
update_dataset("Calibrate constituency weights", "processing")

# Use a separate progress tracker for calibration with nested display
from policyengine_uk_data.utils.calibrate import (
Expand Down Expand Up @@ -132,7 +133,30 @@ def main():
nested_progress=nested_progress, # Pass the nested progress manager
)

update_dataset("Calibrate dataset", "completed")
update_dataset("Calibrate constituency weights", "completed")

# Calibrate local authority weights
from policyengine_uk_data.datasets.local_areas.local_authorities.loss import (
create_local_authority_target_matrix,
create_national_target_matrix as create_national_target_matrix_la,
)

update_dataset("Calibrate local authority weights", "processing")

frs_calibrated = calibrate_local_areas(
dataset=frs_calibrated,
matrix_fn=create_local_authority_target_matrix,
national_matrix_fn=create_national_target_matrix_la,
area_count=360,
weight_file="local_authority_weights.h5",
excluded_training_targets=[],
log_csv=None,
verbose=True,
area_name="Local Authority",
nested_progress=nested_progress,
)

update_dataset("Calibrate local authority weights", "completed")

# Downrate and save
update_dataset("Downrate to 2023", "processing")
Expand All @@ -150,7 +174,7 @@ def main():
"base_dataset": "frs_2023_24.h5",
"enhanced_dataset": "enhanced_frs_2023_24.h5",
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan",
"calibration": "national and constituency targets",
"calibration": "national, LA and constituency targets",
},
)

Expand Down
67 changes: 0 additions & 67 deletions policyengine_uk_data/datasets/local_areas/constituencies/loss.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,6 @@
import torch
from policyengine_uk import Microsimulation
import pandas as pd
import numpy as np

# Fill in missing constituencies with average column values
import pandas as pd
import numpy as np
from pathlib import Path

from policyengine_uk_data.utils.loss import (
Expand All @@ -25,16 +20,12 @@ def create_constituency_target_matrix(
dataset: UKSingleYearDataset,
time_period: int = None,
reform=None,
uprate: bool = True,
):
if time_period is None:
time_period = dataset.time_period
ages = pd.read_csv(FOLDER / "targets" / "age.csv")
national_demographics = pd.read_csv(STORAGE_FOLDER / "demographics.csv")
incomes = pd.read_csv(FOLDER / "targets" / "spi_by_constituency.csv")
employment_incomes = pd.read_csv(
FOLDER / "targets" / "employment_income.csv"
)

sim = Microsimulation(dataset=dataset, reform=reform)
sim.default_calculation_period = dataset.time_period
Expand Down Expand Up @@ -121,11 +112,6 @@ def create_constituency_target_matrix(
age_str = f"{lower_age}_{upper_age}"
y[f"age/{age_str}"] *= uk_total_population / targets_total_pop * 0.9

employment_income = sim.calculate("employment_income").values
bounds = list(
employment_incomes.employment_income_lower_bound.sort_values().unique()
) + [np.inf]

# UC household count by constituency
y["uc_households"] = uc_pc_households.household_count.values
matrix["uc_households"] = sim.map_result(
Expand All @@ -134,59 +120,6 @@ def create_constituency_target_matrix(
"household",
)

for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
continue
if (
lower_bound <= 20_000
): # Skip some targets with very small sample sizes
continue
if upper_bound >= 100_000:
continue

national_data_row = national_incomes[
national_incomes.total_income_lower_bound == lower_bound
]["employment_income_amount"].iloc[0]

count_target = employment_incomes[
(employment_incomes.employment_income_lower_bound == lower_bound)
& (employment_incomes.employment_income_upper_bound == upper_bound)
].employment_income_count.values

amount_target = employment_incomes[
(employment_incomes.employment_income_lower_bound == lower_bound)
& (employment_incomes.employment_income_upper_bound == upper_bound)
].employment_income_amount.values

sum_of_local_area_values = amount_target.sum()

adjustment = national_data_row / sum_of_local_area_values

if count_target.mean() < 200:
print(
f"Skipping employment income band {lower_bound} to {upper_bound} due to low count target mean: {count_target.mean()}"
)
continue

if amount_target.mean() < 200 * 30e3:
print(
f"Skipping employment income band {lower_bound} to {upper_bound} due to low amount target mean: {amount_target.mean()}"
)
continue

in_bound = (
(employment_income >= lower_bound)
& (employment_income < upper_bound)
& (employment_income != 0)
& (age >= 16)
)
band_str = f"{lower_bound}_{upper_bound}"
matrix[f"hmrc/employment_income/amount/{band_str}"] = sim.map_result(
employment_income * in_bound, "person", "household"
)
y[f"hmrc/employment_income/amount/{band_str}"] = (
amount_target * adjustment
)

const_2024 = pd.read_csv(STORAGE_FOLDER / "constituencies_2024.csv")
const_2010 = pd.read_csv(STORAGE_FOLDER / "constituencies_2010.csv")

Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
import pandas as pd
from policyengine_uk_data.utils.calibrate import calibrate_local_areas
from policyengine_uk_data.datasets.local_areas.local_authorities.loss import (
create_local_authority_target_matrix,
create_national_target_matrix,
)
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk.data import UKSingleYearDataset


def calibrate(
dataset: UKSingleYearDataset,
excluded_training_targets=[],
log_csv="calibration_log.csv",
verbose: bool = False,
):
return calibrate_local_areas(
Expand All @@ -20,12 +24,95 @@ def calibrate(
),
area_count=360,
weight_file="local_authority_weights.h5",
excluded_training_targets=[],
log_csv=None,
excluded_training_targets=excluded_training_targets,
log_csv=log_csv,
verbose=verbose,
area_name="Local Authority",
get_performance=get_performance,
)


def get_performance(weights, m_c, y_c, m_n, y_n, excluded_targets):
la_target_matrix, la_actuals = m_c, y_c
national_target_matrix, national_actuals = m_n, y_n
local_authorities = pd.read_csv(
STORAGE_FOLDER / "local_authorities_2021.csv"
)
la_wide = weights @ la_target_matrix
la_wide.index = local_authorities.code.values
la_wide["name"] = local_authorities.name.values

la_results = pd.melt(
la_wide.reset_index(),
id_vars=["index", "name"],
var_name="variable",
value_name="value",
)

la_actuals.index = local_authorities.code.values
la_actuals["name"] = local_authorities.name.values
la_actuals_long = pd.melt(
la_actuals.reset_index(),
id_vars=["index", "name"],
var_name="variable",
value_name="value",
)

la_target_validation = pd.merge(
la_results,
la_actuals_long,
on=["index", "variable"],
suffixes=("_target", "_actual"),
)
la_target_validation.drop("name_actual", axis=1, inplace=True)
la_target_validation.columns = [
"index",
"name",
"metric",
"estimate",
"target",
]

la_target_validation["error"] = (
la_target_validation["estimate"] - la_target_validation["target"]
)
la_target_validation["abs_error"] = la_target_validation["error"].abs()
la_target_validation["rel_abs_error"] = (
la_target_validation["abs_error"] / la_target_validation["target"]
)

national_performance = weights.sum(axis=0) @ national_target_matrix
national_target_validation = pd.DataFrame(
{
"metric": national_performance.index,
"estimate": national_performance.values,
}
)
national_target_validation["target"] = national_actuals.values

national_target_validation["error"] = (
national_target_validation["estimate"]
- national_target_validation["target"]
)
national_target_validation["abs_error"] = national_target_validation[
"error"
].abs()
national_target_validation["rel_abs_error"] = (
national_target_validation["abs_error"]
/ national_target_validation["target"]
)

df = pd.concat(
[
la_target_validation,
national_target_validation.assign(name="UK", index=0),
]
).reset_index(drop=True)

df["validation"] = df.metric.isin(excluded_targets)

return df


if __name__ == "__main__":
calibrate()
Loading