PolicyEngine · nikhilwoodruff · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025
diff --git a/Makefile b/Makefile
@@ -16,7 +16,6 @@ test:
 	pytest policyengine_uk/tests/ --cov=policyengine_uk --cov-report=xml --maxfail=0 -v
 
 update-tests:
-	python policyengine_uk/data/economic_assumptions.py
 	python policyengine_uk/tests/microsimulation/update_reform_impacts.py
 
 documentation:

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,5 @@
+- bump: minor
+  changes:
+    added:
+    - UKMultiYearDataset class to handle multiple fiscal years.
+    - Uprating of datasets using the `uprate` method.
diff --git a/policyengine_uk/__init__.py b/policyengine_uk/__init__.py
@@ -13,9 +13,5 @@
 from pathlib import Path
 import os
 from policyengine_core.taxbenefitsystems import TaxBenefitSystem
-from policyengine_uk.data.economic_assumptions import (
-    BASELINE_GROWFACTORS,
-    apply_growth_factors,
-)
 
 REPO = Path(__file__).parent
diff --git a/policyengine_uk/data/__init__.py b/policyengine_uk/data/__init__.py
@@ -1 +1,4 @@
-from policyengine_uk.data.dataset_schema import UKDataset
+from policyengine_uk.data.dataset_schema import (
+    UKMultiYearDataset,
+    UKSingleYearDataset,
+)
diff --git a/policyengine_uk/data/dataset_schema.py b/policyengine_uk/data/dataset_schema.py
@@ -8,7 +8,7 @@
 import h5py
 
 
-class UKDataset:
+class UKSingleYearDataset:
     person: pd.DataFrame
     benunit: pd.DataFrame
     household: pd.DataFrame
@@ -61,6 +61,7 @@ def __init__(
 
         self.data_format = "arrays"
         self.tables = (self.person, self.benunit, self.household)
+        self.table_names = ("person", "benunit", "household")
 
     def save(self, file_path: str):
         with pd.HDFStore(file_path) as f:
@@ -80,10 +81,11 @@ def load(self):
         return data
 
     def copy(self):
-        return UKDataset(
+        return UKSingleYearDataset(
             person=self.person.copy(),
             benunit=self.benunit.copy(),
             household=self.household.copy(),
+            fiscal_year=self.time_period,  # keep the copy on the same period as the original
         )
 
     def validate(self):
@@ -110,9 +112,160 @@ def from_simulation(
                 input_variables, period=fiscal_year
             )
 
-        return UKDataset(
+        return UKSingleYearDataset(
             person=entity_dfs["person"],
             benunit=entity_dfs["benunit"],
             household=entity_dfs["household"],
             fiscal_year=fiscal_year,
         )
+
+
+class UKMultiYearDataset:
+    """Collection of single-year datasets keyed by fiscal year.
+
+    ``self.datasets`` maps each year to a ``UKSingleYearDataset``;
+    ``time_period`` is the earliest year held and ``data_format`` is
+    ``"time_period_arrays"``.
+    """
+
+    # Entity tables held for every year, in the order they are written.
+    TABLE_NAMES = ("person", "benunit", "household")
+
+    def __init__(
+        self,
+        file_path: str | None = None,
+        datasets: list[UKSingleYearDataset] | None = None,
+    ):
+        """Build the dataset from in-memory datasets or an HDF5 file.
+
+        Args:
+            file_path: Path to an ``.h5`` file laid out as written by
+                ``save``. If ``datasets`` is also given, the file contents
+                replace it.
+            datasets: Single-year datasets; each one's year is read from
+                the first four characters of its ``time_period``.
+
+        Raises:
+            TypeError: If an item of ``datasets`` is not a
+                ``UKSingleYearDataset``.
+            ValueError: If no source is given or no yearly data is found.
+        """
+        if file_path is None and datasets is None:
+            raise ValueError(
+                "Either file_path or datasets must be provided."
+            )
+
+        self.datasets = {}
+        if datasets is not None:
+            for dataset in datasets:
+                if not isinstance(dataset, UKSingleYearDataset):
+                    raise TypeError(
+                        "All items in datasets must be of type UKSingleYearDataset."
+                    )
+                # str() so an integer time_period is accepted as well.
+                year = int(str(dataset.time_period)[:4])
+                self.datasets[year] = dataset
+
+        if file_path is not None:
+            self.validate_file_path(file_path)
+            self.datasets = {}
+            with pd.HDFStore(file_path, mode="r") as f:
+                for key in f.keys():
+                    # HDFStore keys look like "/person/2025"; the person
+                    # tables decide which years get loaded.
+                    if not key.startswith("/person/"):
+                        continue
+                    fiscal_year = int(key.split("/")[2])
+                    self.datasets[fiscal_year] = UKSingleYearDataset(
+                        person=f[key],
+                        benunit=f[f"/benunit/{fiscal_year}"],
+                        household=f[f"/household/{fiscal_year}"],
+                        fiscal_year=fiscal_year,
+                    )
+
+        if not self.datasets:
+            raise ValueError("No yearly datasets were provided or found.")
+
+        self.data_format = "time_period_arrays"
+        self.time_period = min(self.datasets)
+
+    def get_year(self, fiscal_year: int) -> UKSingleYearDataset:
+        """Return the dataset held for ``fiscal_year``.
+
+        Raises:
+            ValueError: If no dataset is held for that year.
+        """
+        if fiscal_year not in self.datasets:
+            raise ValueError(f"No dataset found for year {fiscal_year}.")
+        return self.datasets[fiscal_year]
+
+    def __getitem__(self, fiscal_year: int) -> UKSingleYearDataset:
+        """Support ``dataset[year]`` as shorthand for ``get_year``."""
+        return self.get_year(fiscal_year)
+
+    def save(self, file_path: str):
+        """Write every year to ``file_path``, replacing any existing file.
+
+        Tables are stored under ``<entity>/<year>``, plus a
+        ``time_period/<year>`` series holding the year itself.
+        """
+        # Remove any existing file so the store is written from scratch.
+        Path(file_path).unlink(missing_ok=True)
+        with pd.HDFStore(file_path) as f:
+            for year, dataset in self.datasets.items():
+                for name in self.TABLE_NAMES:
+                    f.put(
+                        f"{name}/{year}",
+                        getattr(dataset, name),
+                        format="table",
+                        data_columns=True,
+                    )
+                f.put(
+                    f"time_period/{year}",
+                    pd.Series([year]),
+                    format="table",
+                    data_columns=True,
+                )
+
+    def copy(self):
+        """Return a new multi-year dataset built from a copy of each year."""
+        return UKMultiYearDataset(
+            datasets=[dataset.copy() for dataset in self.datasets.values()]
+        )
+
+    @staticmethod
+    def validate_file_path(file_path: str):
+        """Check that ``file_path`` is an existing multi-year ``.h5`` file.
+
+        Raises:
+            ValueError: If the path does not end in ``.h5`` or an entity
+                has no year-keyed table in the file.
+            FileNotFoundError: If the file does not exist.
+        """
+        if not file_path.endswith(".h5"):
+            raise ValueError(
+                "File path must end with '.h5' for UKMultiYearDataset."
+            )
+        if not Path(file_path).exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        # h5py lists root members without a leading slash ("person", not
+        # "/person/2025"), so look inside each entity group for year keys.
+        with h5py.File(file_path, "r") as f:
+            for name in UKMultiYearDataset.TABLE_NAMES:
+                group = f.get(name)
+                has_year = isinstance(group, h5py.Group) and any(
+                    key.isdigit() for key in group
+                )
+                if not has_year:
+                    raise ValueError(f"No {name} dataset found in the file.")
+
+    def load(self):
+        """Return ``{variable: {year: values}}`` over all entity tables."""
+        data = {}
+        for year, dataset in self.datasets.items():
+            for name in self.TABLE_NAMES:
+                df = getattr(dataset, name)
+                for col in df.columns:
+                    data.setdefault(col, {})[year] = df[col].values
+        return data