-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcreate_datasets.py
More file actions
164 lines (138 loc) · 6.03 KB
/
Copy pathcreate_datasets.py
File metadata and controls
164 lines (138 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from policyengine_uk_data.datasets.frs import create_frs
from policyengine_uk_data.storage import STORAGE_FOLDER
import logging
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk_data.utils.uprating import uprate_dataset
from policyengine_uk_data.utils.progress import (
ProcessingProgress,
display_success_panel,
display_error_panel,
)
logging.basicConfig(level=logging.INFO)
def main():
"""Create enhanced FRS dataset with rich progress tracking."""
try:
progress_tracker = ProcessingProgress()
# Define dataset creation steps
steps = [
"Create base FRS dataset",
"Impute consumption",
"Impute wealth",
"Impute VAT",
"Impute public service usage",
"Impute income",
"Impute capital gains",
"Impute salary sacrifice",
"Uprate to 2025",
"Calibrate dataset",
"Downrate to 2023",
"Save final dataset",
]
with progress_tracker.track_dataset_creation(steps) as (
update_dataset,
nested_progress,
):
# Create base FRS dataset
update_dataset("Create base FRS dataset", "processing")
frs = create_frs(
raw_frs_folder=STORAGE_FOLDER / "frs_2023_24",
year=2023,
)
frs.save(STORAGE_FOLDER / "frs_2023_24.h5")
update_dataset("Create base FRS dataset", "completed")
# Import imputation functions
from policyengine_uk_data.datasets.imputations import (
impute_consumption,
impute_wealth,
impute_vat,
impute_income,
impute_capital_gains,
impute_services,
impute_salary_sacrifice,
)
# Apply imputations with progress tracking
update_dataset("Impute consumption", "processing")
frs = impute_consumption(frs)
update_dataset("Impute consumption", "completed")
update_dataset("Impute wealth", "processing")
frs = impute_wealth(frs)
update_dataset("Impute wealth", "completed")
update_dataset("Impute VAT", "processing")
frs = impute_vat(frs)
update_dataset("Impute VAT", "completed")
update_dataset("Impute public service usage", "processing")
frs = impute_services(frs)
update_dataset("Impute public service usage", "completed")
update_dataset("Impute income", "processing")
frs = impute_income(frs)
update_dataset("Impute income", "completed")
update_dataset("Impute capital gains", "processing")
frs = impute_capital_gains(frs)
update_dataset("Impute capital gains", "completed")
update_dataset("Impute salary sacrifice", "processing")
frs = impute_salary_sacrifice(frs)
update_dataset("Impute salary sacrifice", "completed")
# Uprate dataset
update_dataset("Uprate to 2025", "processing")
frs = uprate_dataset(frs, 2025)
update_dataset("Uprate to 2025", "completed")
# Calibrate dataset with nested progress
from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import (
calibrate,
)
update_dataset("Calibrate dataset", "processing")
# Use a separate progress tracker for calibration with nested display
from policyengine_uk_data.utils.calibrate import (
calibrate_local_areas,
)
from policyengine_uk_data.datasets.local_areas.constituencies.loss import (
create_constituency_target_matrix,
create_national_target_matrix,
)
from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import (
get_performance,
)
# Run calibration with verbose progress
frs_calibrated = calibrate_local_areas(
dataset=frs,
matrix_fn=create_constituency_target_matrix,
national_matrix_fn=create_national_target_matrix,
area_count=650,
weight_file="parliamentary_constituency_weights.h5",
excluded_training_targets=[],
log_csv="calibration_log.csv",
verbose=True, # Enable nested progress display
area_name="Constituency",
get_performance=get_performance,
nested_progress=nested_progress, # Pass the nested progress manager
)
update_dataset("Calibrate dataset", "completed")
# Downrate and save
update_dataset("Downrate to 2023", "processing")
frs_calibrated = uprate_dataset(frs_calibrated, 2023)
update_dataset("Downrate to 2023", "completed")
update_dataset("Save final dataset", "processing")
frs_calibrated.save(STORAGE_FOLDER / "enhanced_frs_2023_24.h5")
update_dataset("Save final dataset", "completed")
# Display success message
display_success_panel(
"Dataset creation completed successfully",
details={
"base_dataset": "frs_2023_24.h5",
"enhanced_dataset": "enhanced_frs_2023_24.h5",
"imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice",
"calibration": "national and constituency targets",
},
)
except Exception as e:
display_error_panel(
f"Dataset creation failed: {str(e)}",
suggestions=[
"Check that all required data files are present in storage folder",
"Verify sufficient disk space for dataset creation",
"Review log files for detailed error information",
],
)
raise
if __name__ == "__main__":
main()