Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ test:
pytest policyengine_uk/tests/ --cov=policyengine_uk --cov-report=xml --maxfail=0 -v

update-tests:
python policyengine_uk/data/economic_assumptions.py
python policyengine_uk/tests/microsimulation/update_reform_impacts.py

documentation:
Expand Down
5 changes: 5 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- bump: minor
changes:
added:
- UKMultiYearDataset class to handle multiple fiscal years.
- Uprating of datasets using the `uprate` method.
4 changes: 0 additions & 4 deletions policyengine_uk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,5 @@
from pathlib import Path
import os
from policyengine_core.taxbenefitsystems import TaxBenefitSystem
from policyengine_uk.data.economic_assumptions import (
BASELINE_GROWFACTORS,
apply_growth_factors,
)

REPO = Path(__file__).parent
5 changes: 4 additions & 1 deletion policyengine_uk/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
from policyengine_uk.data.dataset_schema import UKDataset
from policyengine_uk.data.dataset_schema import (
UKMultiYearDataset,
UKSingleYearDataset,
)
119 changes: 116 additions & 3 deletions policyengine_uk/data/dataset_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import h5py


class UKDataset:
class UKSingleYearDataset:
person: pd.DataFrame
benunit: pd.DataFrame
household: pd.DataFrame
Expand Down Expand Up @@ -61,6 +61,7 @@ def __init__(

self.data_format = "arrays"
self.tables = (self.person, self.benunit, self.household)
self.table_names = ("person", "benunit", "household")

def save(self, file_path: str):
with pd.HDFStore(file_path) as f:
Expand All @@ -80,10 +81,11 @@ def load(self):
return data

def copy(self):
return UKDataset(
return UKSingleYearDataset(
person=self.person.copy(),
benunit=self.benunit.copy(),
household=self.household.copy(),
fiscal_year=self.time_period,
)

def validate(self):
Expand All @@ -110,9 +112,120 @@ def from_simulation(
input_variables, period=fiscal_year
)

return UKDataset(
return UKSingleYearDataset(
person=entity_dfs["person"],
benunit=entity_dfs["benunit"],
household=entity_dfs["household"],
fiscal_year=fiscal_year,
)


class UKMultiYearDataset:
def __init__(
self,
file_path: str = None,
datasets: list[UKSingleYearDataset] | None = None,
):
if datasets is not None:
self.datasets = {}
for dataset in datasets:
if not isinstance(dataset, UKSingleYearDataset):
raise TypeError(
"All items in datasets must be of type UKSingleYearDataset."
)
year = int(dataset.time_period[:4])
self.datasets[year] = dataset

if file_path is not None:
UKSingleYearDataset.validate_file_path(file_path)
with pd.HDFStore(file_path) as f:
self.datasets = {}
for year in f.keys():
if year.startswith("/person/"):
fiscal_year = int(year.split("/")[2])
person_df = f[year]
benunit_df = f[f"/benunit/{fiscal_year}"]
household_df = f[f"/household/{fiscal_year}"]
self.datasets[fiscal_year] = UKSingleYearDataset(
person=person_df,
benunit=benunit_df,
household=household_df,
fiscal_year=fiscal_year,
)

self.data_format = "time_period_arrays"
self.time_period = list(sorted(self.datasets.keys()))[0]

def get_year(self, fiscal_year: int) -> UKSingleYearDataset:
if fiscal_year in self.datasets:
return self.datasets[fiscal_year]
else:
raise ValueError(f"No dataset found for year {fiscal_year}.")

def __getitem__(self, fiscal_year: int):
return self.get_year(fiscal_year)

def save(self, file_path: str):
Path(file_path).unlink(
missing_ok=True
) # Remove existing file if it exists
with pd.HDFStore(file_path) as f:
for year, dataset in self.datasets.items():
f.put(
f"person/{year}",
dataset.person,
format="table",
data_columns=True,
)
f.put(
f"benunit/{year}",
dataset.benunit,
format="table",
data_columns=True,
)
f.put(
f"household/{year}",
dataset.household,
format="table",
data_columns=True,
)
f.put(
f"time_period/{year}",
pd.Series([year]),
format="table",
data_columns=True,
)

def copy(self):
new_datasets = {
year: dataset.copy() for year, dataset in self.datasets.items()
}
return UKMultiYearDataset(datasets=list(new_datasets.values()))

@staticmethod
def validate_file_path(file_path: str):
if not file_path.endswith(".h5"):
raise ValueError(
"File path must end with '.h5' for UKMultiYearDataset."
)
if not Path(file_path).exists():
raise FileNotFoundError(f"File not found: {file_path}")

# Check if the file contains datasets for multiple years
with h5py.File(file_path, "r") as f:
if not any(key.startswith("/person/") for key in f.keys()):
raise ValueError("No person dataset found in the file.")
if not any(key.startswith("/benunit/") for key in f.keys()):
raise ValueError("No benunit dataset found in the file.")
if not any(key.startswith("/household/") for key in f.keys()):
raise ValueError("No household dataset found in the file.")

def load(self):
data = {}
for year, dataset in self.datasets.items():
for df in (dataset.person, dataset.benunit, dataset.household):
for col in df.columns:
if col not in data:
data[col] = {}
data[col][year] = df[col].values
return data
Loading
Loading