Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/409.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- Update the UK data build pipeline to target FRS 2024-25 and flatten the UK Data Service TAB zip layout during prerequisite extraction.
6 changes: 5 additions & 1 deletion policyengine_uk_data/calibration/publish_local_h5s.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import pandas as pd

from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk_data.utils.calibrate import default_weight_dataset_key

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -206,7 +207,7 @@ def publish_local_h5s(
dataset,
weight_file: str,
area_type: str = "constituency",
dataset_key: str = "2025",
dataset_key: str | None = None,
output_dir: Optional[Path] = None,
min_weight: float = 0.0,
) -> pd.DataFrame:
Expand All @@ -228,6 +229,9 @@ def publish_local_h5s(
DataFrame with per-area statistics: code, n_households,
n_active, total_weight.
"""
if dataset_key is None:
dataset_key = default_weight_dataset_key()

if output_dir is None:
output_dir = LOCAL_H5_DIR / area_type

Expand Down
11 changes: 7 additions & 4 deletions policyengine_uk_data/datasets/childcare/takeup_rate.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import numpy as np
from scipy.optimize import minimize
from policyengine_uk import Microsimulation
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
from policyengine_uk_data.utils.hf_destinations import PRIVATE_REPO

# "hf://" dataset reference built from PRIVATE_REPO and the enhanced dataset
# file name of CURRENT_FRS_RELEASE, so the reference follows the release
# definition rather than a hard-coded file name.
ENHANCED_FRS_DATASET = (
    f"hf://{PRIVATE_REPO}/{CURRENT_FRS_RELEASE.enhanced_dataset_file}"
)

# 🎯 Calibration targets
#
Expand Down Expand Up @@ -57,13 +63,10 @@ def simulate_childcare_programs(
tfc, extended, targeted, universal, ext_hours_mean, ext_hours_sd = params

# Initialize sim
sim = Microsimulation(
dataset="hf://policyengine/policyengine-uk-data/enhanced_frs_2022_23.h5"
)
sim = Microsimulation(dataset=ENHANCED_FRS_DATASET)

# Get counts of people and benefit units
benunit_count = sim.calculate("benunit_id").values.shape[0]
person_count = sim.calculate("person_id").values.shape[0]

# Set seed
np.random.seed(seed)
Expand Down
114 changes: 92 additions & 22 deletions policyengine_uk_data/datasets/create_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,28 @@ def _get_positive_int_env(name: str, default: int) -> int:
return value


def _needs_base_year_materialization(frs_release) -> bool:
    """Report whether the calibration year differs from the base year.

    Args:
        frs_release: Object exposing ``calibration_year`` and ``base_year``.

    Returns:
        ``True`` when the two years are not equal, ``False`` otherwise.
    """
    calibration_year = frs_release.calibration_year
    base_year = frs_release.base_year
    return calibration_year != base_year


def _needs_calibration_year_materialization(frs_release) -> bool:
    """Report whether the calibration year is a different year from the base year.

    Args:
        frs_release: Object exposing ``calibration_year`` and ``base_year``.

    Returns:
        ``True`` when ``calibration_year`` and ``base_year`` are not equal,
        ``False`` when they are the same.
    """
    target_year, reference_year = (
        frs_release.calibration_year,
        frs_release.base_year,
    )
    return target_year != reference_year


def _materialize_calibration_year_dataset(dataset, frs_release, uprate_dataset):
    """Return ``dataset`` moved to the release's calibration year when required.

    Args:
        dataset: Dataset to pass through or uprate.
        frs_release: Release descriptor exposing ``calibration_year`` and
            ``base_year``.
        uprate_dataset: Callable invoked as ``uprate_dataset(dataset, year)``.
            It is injected by the caller rather than imported here.

    Returns:
        ``dataset`` itself when no calibration-year materialization is needed,
        otherwise the result of
        ``uprate_dataset(dataset, frs_release.calibration_year)``.
    """
    if _needs_calibration_year_materialization(frs_release):
        target_year = frs_release.calibration_year
        return uprate_dataset(dataset, target_year)

    # Calibration year equals the base year: nothing to do.
    return dataset


def _materialize_base_year_dataset(dataset, frs_release, uprate_dataset):
    """Return ``dataset`` moved to the release's base year when required.

    Args:
        dataset: Dataset to pass through or re-rate.
        frs_release: Release descriptor exposing ``calibration_year`` and
            ``base_year``.
        uprate_dataset: Callable invoked as ``uprate_dataset(dataset, year)``.
            It is injected by the caller rather than imported here.

    Returns:
        ``dataset`` itself when no base-year materialization is needed,
        otherwise the result of
        ``uprate_dataset(dataset, frs_release.base_year)``.
    """
    if _needs_base_year_materialization(frs_release):
        target_year = frs_release.base_year
        return uprate_dataset(dataset, target_year)

    # Base year equals the calibration year: nothing to do.
    return dataset


def main():
"""Create enhanced FRS dataset with rich progress tracking."""
try:
Expand All @@ -34,6 +56,7 @@ def main():
strip_internal_disability_reported_amounts,
)
from policyengine_uk_data.datasets.frs import create_frs
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk_data.utils.progress import (
ProcessingProgress,
Expand All @@ -50,6 +73,19 @@ def main():
"PE_UK_DATA_OA_CLONES",
2 if is_testing else 10,
)
frs_release = CURRENT_FRS_RELEASE
align_to_base_year = frs_release.base_year != frs_release.survey_year
align_step = f"Align to {frs_release.base_year} base year"
materialize_calibration_year = _needs_calibration_year_materialization(
frs_release
)
materialize_calibration_step = (
f"Materialize {frs_release.calibration_year} calibration-year dataset"
)
materialize_base_year = _needs_base_year_materialization(frs_release)
materialize_step = (
f"Materialize calibrated {frs_release.base_year} base-year dataset"
)

progress_tracker = ProcessingProgress()

Expand All @@ -65,14 +101,27 @@ def main():
"Impute salary sacrifice",
"Impute student loan plan",
"Clone and assign OA geography",
"Uprate to 2025",
"Calibrate constituency weights",
"Calibrate local authority weights",
"Downrate to 2023",
"Calibrate fuel litres",
"Save final dataset",
"Create tiny datasets",
]
if align_to_base_year:
steps.insert(
steps.index("Calibrate constituency weights"),
align_step,
)
if materialize_calibration_year:
steps.insert(
steps.index("Calibrate constituency weights"),
materialize_calibration_step,
)
if materialize_base_year:
steps.insert(
steps.index("Calibrate fuel litres"),
materialize_step,
)

with progress_tracker.track_dataset_creation(steps) as (
update_dataset,
Expand All @@ -81,12 +130,12 @@ def main():
# Create base FRS dataset
update_dataset("Create base FRS dataset", "processing")
frs = create_frs(
raw_frs_folder=STORAGE_FOLDER / "frs_2023_24",
year=2023,
raw_frs_folder=STORAGE_FOLDER / frs_release.name,
year=frs_release.survey_year,
include_internal_disability_reported_amounts=True,
)
strip_internal_disability_reported_amounts(frs).save(
STORAGE_FOLDER / "frs_2023_24.h5"
STORAGE_FOLDER / frs_release.base_dataset_file
)
update_dataset("Create base FRS dataset", "completed")

Expand Down Expand Up @@ -136,7 +185,10 @@ def main():
update_dataset("Impute salary sacrifice", "completed")

update_dataset("Impute student loan plan", "processing")
frs = impute_student_loan_plan(frs, year=2025)
frs = impute_student_loan_plan(
frs,
year=frs_release.calibration_year,
)
update_dataset("Impute student loan plan", "completed")

# Clone households and assign OA geography
Expand All @@ -148,10 +200,19 @@ def main():
frs = clone_and_assign(frs, n_clones=oa_clones)
update_dataset("Clone and assign OA geography", "completed")

# Uprate dataset
update_dataset("Uprate to 2025", "processing")
frs = uprate_dataset(frs, 2025)
update_dataset("Uprate to 2025", "completed")
if align_to_base_year:
update_dataset(align_step, "processing")
frs = uprate_dataset(frs, frs_release.base_year)
update_dataset(align_step, "completed")

if materialize_calibration_year:
update_dataset(materialize_calibration_step, "processing")
frs = _materialize_calibration_year_dataset(
frs,
frs_release,
uprate_dataset,
)
update_dataset(materialize_calibration_step, "completed")

# Calibrate constituency weights with nested progress

Expand Down Expand Up @@ -179,12 +240,14 @@ def main():
national_matrix_fn=create_national_target_matrix,
area_count=650,
weight_file="parliamentary_constituency_weights.h5",
dataset_key=str(frs_release.calibration_year),
excluded_training_targets=[],
log_csv="constituency_calibration_log.csv",
verbose=True, # Enable nested progress display
area_name="Constituency",
get_performance=get_performance,
nested_progress=nested_progress, # Pass the nested progress manager
time_period=frs_release.calibration_year,
)
update_dataset("Calibrate constituency weights", "completed")

Expand All @@ -204,19 +267,26 @@ def main():
national_matrix_fn=create_national_target_matrix,
area_count=360,
weight_file="local_authority_weights.h5",
dataset_key=str(frs_release.calibration_year),
excluded_training_targets=[],
log_csv="la_calibration_log.csv",
verbose=True, # Enable nested progress display
area_name="Local Authority",
get_performance=get_la_performance,
nested_progress=nested_progress, # Pass the nested progress manager
time_period=frs_release.calibration_year,
)
update_dataset("Calibrate local authority weights", "completed")

# Downrate and save
update_dataset("Downrate to 2023", "processing")
frs_calibrated = uprate_dataset(frs_calibrated_constituencies, 2023)
update_dataset("Downrate to 2023", "completed")
frs_calibrated = frs_calibrated_constituencies
if materialize_base_year:
update_dataset(materialize_step, "processing")
frs_calibrated = _materialize_base_year_dataset(
frs_calibrated,
frs_release,
uprate_dataset,
)
update_dataset(materialize_step, "completed")

update_dataset("Calibrate fuel litres", "processing")
from policyengine_uk_data.datasets.imputations.consumption import (
Expand All @@ -228,7 +298,7 @@ def main():

update_dataset("Save final dataset", "processing")
strip_internal_disability_reported_amounts(frs_calibrated).save(
STORAGE_FOLDER / "enhanced_frs_2023_24.h5"
STORAGE_FOLDER / frs_release.enhanced_dataset_file
)
update_dataset("Save final dataset", "completed")

Expand All @@ -237,26 +307,26 @@ def main():
TINY_SIZE = 1_000

frs_base = UKSingleYearDataset(
file_path=str(STORAGE_FOLDER / "frs_2023_24.h5")
file_path=str(STORAGE_FOLDER / frs_release.base_dataset_file)
)
tiny_frs = subsample_dataset(frs_base, TINY_SIZE)
tiny_frs.save(STORAGE_FOLDER / "frs_2023_24_tiny.h5")
tiny_frs.save(STORAGE_FOLDER / frs_release.tiny_base_dataset_file)

tiny_enhanced = subsample_dataset(
strip_internal_disability_reported_amounts(frs_calibrated),
TINY_SIZE,
)
tiny_enhanced.save(STORAGE_FOLDER / "enhanced_frs_2023_24_tiny.h5")
tiny_enhanced.save(STORAGE_FOLDER / frs_release.tiny_enhanced_dataset_file)
update_dataset("Create tiny datasets", "completed")

# Display success message
display_success_panel(
"Dataset creation completed successfully",
details={
"base_dataset": "frs_2023_24.h5",
"enhanced_dataset": "enhanced_frs_2023_24.h5",
"tiny_base_dataset": "frs_2023_24_tiny.h5",
"tiny_enhanced_dataset": "enhanced_frs_2023_24_tiny.h5",
"base_dataset": frs_release.base_dataset_file,
"enhanced_dataset": frs_release.enhanced_dataset_file,
"tiny_base_dataset": frs_release.tiny_base_dataset_file,
"tiny_enhanced_dataset": frs_release.tiny_enhanced_dataset_file,
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan",
"calibration": "national, LA and constituency targets",
},
Expand Down
8 changes: 5 additions & 3 deletions policyengine_uk_data/datasets/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1464,8 +1464,10 @@ def _reported_benunit_mask(person_column: str) -> np.ndarray:


if __name__ == "__main__":
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE

frs = create_frs(
raw_frs_folder=STORAGE_FOLDER / "frs_2022_23",
year=2022,
raw_frs_folder=STORAGE_FOLDER / CURRENT_FRS_RELEASE.name,
year=CURRENT_FRS_RELEASE.survey_year,
)
frs.save(STORAGE_FOLDER / "frs_2022.h5")
frs.save(STORAGE_FOLDER / CURRENT_FRS_RELEASE.base_dataset_file)
68 changes: 68 additions & 0 deletions policyengine_uk_data/datasets/frs_release.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from dataclasses import dataclass


@dataclass(frozen=True)
class FRSRelease:
    """Immutable descriptor of one FRS release and the file names derived from it.

    All ``*_name`` and ``*_file`` properties are computed from ``name``, so a
    release only has to state its identifier once.
    """

    # Release identifier; every derived dataset/archive name is built from it.
    name: str
    survey_year: int
    base_year: int
    calibration_year: int
    # Source-archive metadata for the release.
    ukds_study_number: int
    doi: str
    ukds_tab_zip_filename: str
    ukds_tab_zip_sha256: str
    ukds_tab_subdir: str

    # --- archive -----------------------------------------------------------

    @property
    def raw_zip_name(self) -> str:
        """Return ``"<name>.zip"``."""
        return f"{self.name}.zip"

    # --- dataset names (no extension) ----------------------------------------

    @property
    def base_dataset_name(self) -> str:
        """Return the base dataset name, which is ``name`` itself."""
        return self.name

    @property
    def enhanced_dataset_name(self) -> str:
        """Return the base dataset name prefixed with ``"enhanced_"``."""
        return f"enhanced_{self.base_dataset_name}"

    @property
    def tiny_base_dataset_name(self) -> str:
        """Return the base dataset name suffixed with ``"_tiny"``."""
        return f"{self.base_dataset_name}_tiny"

    @property
    def tiny_enhanced_dataset_name(self) -> str:
        """Return the enhanced dataset name suffixed with ``"_tiny"``."""
        return f"{self.enhanced_dataset_name}_tiny"

    # --- dataset files (".h5" appended to the matching name) -----------------

    @property
    def base_dataset_file(self) -> str:
        """Return ``base_dataset_name`` with the ``.h5`` extension."""
        stem = self.base_dataset_name
        return f"{stem}.h5"

    @property
    def enhanced_dataset_file(self) -> str:
        """Return ``enhanced_dataset_name`` with the ``.h5`` extension."""
        stem = self.enhanced_dataset_name
        return f"{stem}.h5"

    @property
    def tiny_base_dataset_file(self) -> str:
        """Return ``tiny_base_dataset_name`` with the ``.h5`` extension."""
        stem = self.tiny_base_dataset_name
        return f"{stem}.h5"

    @property
    def tiny_enhanced_dataset_file(self) -> str:
        """Return ``tiny_enhanced_dataset_name`` with the ``.h5`` extension."""
        stem = self.tiny_enhanced_dataset_name
        return f"{stem}.h5"


# The FRS release the data build currently targets. Derived dataset and
# archive names (e.g. "frs_2024_25.h5") come from ``name`` via the
# FRSRelease properties.
CURRENT_FRS_RELEASE = FRSRelease(
    name="frs_2024_25",
    survey_year=2024,
    base_year=2024,
    # Differs from base_year.
    calibration_year=2025,
    # The same study number (9563) appears in the DOI, the zip file name and
    # the extraction sub-directory below; keep them in step when updating.
    ukds_study_number=9563,
    doi="http://doi.org/10.5255/UKDA-SN-9563-1",
    # The archive file name embeds the upper-cased form of the SHA-256
    # digest given in ukds_tab_zip_sha256.
    ukds_tab_zip_filename=(
        "9563tab_05DD0069587DBD25E5719D355CE05FC0827D5EDD58C24ECE9"
        "AB85ACD954A9AEB_V1.zip"
    ),
    ukds_tab_zip_sha256=(
        "05dd0069587dbd25e5719d355ce05fc0827d5edd58c24ece9ab85acd954a9aeb"
    ),
    # NOTE(review): presumably the path inside the zip that holds the TAB
    # files — confirm against the extraction code.
    ukds_tab_subdir="UKDA-9563-tab/tab",
)
3 changes: 2 additions & 1 deletion policyengine_uk_data/datasets/imputations/consumption.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import pandas as pd
import numpy as np
from policyengine_uk_data.datasets.frs_release import CURRENT_FRS_RELEASE
from policyengine_uk_data.storage import STORAGE_FOLDER
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk import Microsimulation
Expand Down Expand Up @@ -696,7 +697,7 @@ def save_imputation_models():
LCFS_TAB_FOLDER / "lcfs_2021_dvper_ukanon202122.tab", delimiter="\t"
)
household = generate_lcfs_table(lcfs_person, lcfs_household)
household = uprate_lcfs_table(household, "2024")
household = uprate_lcfs_table(household, str(CURRENT_FRS_RELEASE.base_year))
consumption.fit(household[PREDICTOR_VARIABLES], household[IMPUTATIONS])
consumption.save(STORAGE_FOLDER / CONSUMPTION_MODEL_FILENAME)
return consumption
Expand Down
Loading