Add data_weights to calibration Input

peanutfun · peanutfun · commit 58bf84fc8b25 · 2025-06-13T11:41:24.000+02:00
* Use weights for sampling with replacement in AverageEnsembleOptimizer.
* Update tests
diff --git a/climada/util/calibrate/base.py b/climada/util/calibrate/base.py
@@ -47,7 +47,7 @@ class Input:
         Hazard object to compute impacts from
     exposure : climada.Exposures
         Exposures object to compute impacts from
-    data : pandas.Dataframe
+    data : pandas.DataFrame
         The data to compare computed impacts to. Index: Event IDs matching the IDs of
         :py:attr:`hazard`. Columns: Arbitrary columns. NaN values in the data frame have
         special meaning: Corresponding impact values computed by the model are ignored
@@ -64,8 +64,9 @@ class Input:
     cost_func : Callable
         Function that takes two ``pandas.Dataframe`` objects and returns the scalar
         "cost" between them. The optimization algorithm will try to minimize this
-        number. The first argument is the true/correct values (:py:attr:`data`), and the
-        second argument is the estimated/predicted values.
+        number. The first argument is the true/correct values (:py:attr:`data`), the
+        second argument is the estimated/predicted values, and the third argument is the
+        :py:attr:`data_weights`.
     bounds : Mapping (str, {Bounds, tuple(float, float)}), optional
         The bounds for the parameters. Keys: parameter names. Values:
         ``scipy.minimize.Bounds`` instance or tuple of minimum and maximum value.
@@ -85,6 +86,12 @@ class Input:
         :py:attr:`data`, insert this value. Defaults to NaN, in which case the impact
         from the model is ignored. Set this to zero to explicitly calibrate to zero
         impacts in these cases.
+    data_weights : pandas.DataFrame, optional
+        Weights for each entry in :py:attr:`data`. Must have the exact same index and
+        columns. If ``None``, the weights will be ignored (equivalent to the same weight
+        for each event).
+    missing_weights_value : float, optional
+        Same as :py:attr:`missing_data_value`, but for :py:attr:`data_weights`.
     assign_centroids : bool, optional
         If ``True`` (default), assign the hazard centroids to the exposure when this
         object is created.
@@ -95,14 +102,16 @@ class Input:
     data: pd.DataFrame
     impact_func_creator: Callable[..., ImpactFuncSet]
     impact_to_dataframe: Callable[[Impact], pd.DataFrame]
-    cost_func: Callable[[pd.DataFrame, pd.DataFrame], Number]
+    cost_func: Callable[[pd.DataFrame, pd.DataFrame, pd.DataFrame | None], Number]
     bounds: Optional[Mapping[str, Union[Bounds, Tuple[Number, Number]]]] = None
     constraints: Optional[Union[ConstraintType, list[ConstraintType]]] = None
     impact_calc_kwds: Mapping[str, Any] = field(
         default_factory=lambda: {"assign_centroids": False}
     )
     missing_data_value: float = np.nan
-    assign_centroids: InitVar[bool] = True
+    data_weights: pd.DataFrame | None = field(default=None, kw_only=True)
+    missing_weights_value: float = field(default=np.nan, kw_only=True)
+    assign_centroids: InitVar[bool] = field(default=True, kw_only=True)
 
     def __post_init__(self, assign_centroids):
         """Prepare input data"""
@@ -115,6 +124,17 @@ def __post_init__(self, assign_centroids):
                 )
             raise TypeError("'data' must be a pandas.DataFrame")
 
+        if self.data_weights is not None:
+            try:
+                pd.testing.assert_index_equal(self.data.index, self.data_weights.index)
+                pd.testing.assert_index_equal(
+                    self.data.columns, self.data_weights.columns
+                )
+            except AssertionError as err:
+                raise ValueError(
+                    "'data_weights' must have exact same index and columns as 'data'"
+                ) from err
+
         if assign_centroids:
             self.exposure.assign_centroids(self.hazard)
 
@@ -413,7 +433,9 @@ class Optimizer(ABC):
 
     input: Input
 
-    def _target_func(self, data: pd.DataFrame, predicted: pd.DataFrame) -> Number:
+    def _target_func(
+        self, data: pd.DataFrame, predicted: pd.DataFrame, weights: pd.DataFrame | None
+    ) -> Number:
         """Target function for the optimizer
 
         The default version of this function simply returns the value of the cost
@@ -427,12 +449,14 @@ def _target_func(self, data: pd.DataFrame, predicted: pd.DataFrame) -> Number:
         predicted : pandas.DataFrame
             The impact predicted by the data calibration after it has been transformed
             into a dataframe by :py:attr:`Input.impact_to_dataframe`.
+        weights : pandas.DataFrame or None
+            The relative weight for each entry in ``data``. ``None`` if no weights are
+            set.
 
         Returns
         -------
         The value of the target function for the optimizer.
         """
-        return self.input.cost_func(data, predicted)
+        return self.input.cost_func(data, predicted, weights)
 
     def _kwargs_to_impact_func_creator(self, *_, **kwargs) -> Dict[str, Any]:
         """Define how the parameters to :py:meth:`_opt_func` must be transformed
@@ -484,11 +508,24 @@ def _opt_func(self, *args, **kwargs) -> Number:
             hazard=self.input.hazard,
         ).impact(**self.input.impact_calc_kwds)
 
-        # Transform to DataFrame, align, and compute target function
+        # Transform to DataFrame and align
         data_aligned, impact_df_aligned = self.input.impact_to_aligned_df(
-            impact, fillna=0
+            impact, fillna=0.0
         )
-        return self._target_func(data_aligned, impact_df_aligned)
+
+        # Align weights
+        weights_aligned = None
+        if self.input.data_weights is not None:
+            weights_aligned, _ = self.input.data_weights.align(
+                data_aligned,
+                axis=None,
+                join="right",
+                copy=True,
+                fill_value=self.input.missing_weights_value,
+            )
+
+        # Compute target function
+        return self._target_func(data_aligned, impact_df_aligned, weights_aligned)
 
     @abstractmethod
     def run(self, **opt_kwargs) -> Output:
diff --git a/climada/util/calibrate/bayesian_optimizer.py b/climada/util/calibrate/bayesian_optimizer.py
@@ -616,9 +616,11 @@ def __post_init__(self, random_state, allow_duplicate_points, bayes_opt_kwds):
             **bayes_opt_kwds,
         )
 
-    def _target_func(self, data: pd.DataFrame, predicted: pd.DataFrame) -> Number:
+    def _target_func(
+        self, data: pd.DataFrame, predicted: pd.DataFrame, weights: pd.DataFrame | None
+    ) -> Number:
         """Invert the cost function because BayesianOptimization maximizes the target"""
-        return -self.input.cost_func(data, predicted)
+        return -self.input.cost_func(data, predicted, weights)
 
     def run(self, controller: BayesianOptimizerController) -> BayesianOptimizerOutput:
         """Execute the optimization
diff --git a/climada/util/calibrate/ensemble.py b/climada/util/calibrate/ensemble.py
@@ -75,6 +75,36 @@ def sample_data(data: pd.DataFrame, sample: list[tuple[int, int]]):
     return data_sampled
 
 
+def sample_weights(weights: pd.DataFrame, sample: list[tuple[int, int]]):
+    """
+    Return an updated DataFrame containing the appropriate weights for a sample.
+
+    Weights that are not in ``sample`` are set to zero, whereas weights that are sampled
+    multiple times will effectively be multiplied by their number of occurrences in
+    ``sample``.
+
+    Parameters
+    ----------
+    weights : pandas.DataFrame
+        The original weights for the data
+    sample : list of tuple of int
+        A list of (row, column) index pairs indicating which weights will be used, and
+        how often.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Updated ``weights`` for ``sample``.
+    """
+    # Create all-zero weights
+    weights_sampled = pd.DataFrame(0.0, columns=weights.columns, index=weights.index)
+
+    # Add weights for each sample
+    for row, col in sample:
+        weights_sampled.iloc[row, col] += weights.iloc[row, col]
+
+    return weights_sampled
+
+
 def event_info_from_input(inp: Input) -> dict[str, Any]:
     """Get information on the event(s) for which we calibrated
 
@@ -595,7 +625,7 @@ def __post_init__(self, sample_fraction, ensemble_size, random_state, replace):
         """Create the samples"""
         if sample_fraction <= 0:
             raise ValueError("Sample fraction must be larger than 0")
-        elif sample_fraction > 1 and not replace:
+        if sample_fraction > 1 and not replace:
             raise ValueError("Sample fraction must be <=1 or replace must be True")
         if ensemble_size < 1:
             raise ValueError("Ensemble size must be >=1")
@@ -615,7 +645,17 @@ def __post_init__(self, sample_fraction, ensemble_size, random_state, replace):
     def input_from_sample(self, sample: list[tuple[int, int]]):
         """Shallow-copy the input and update the data"""
         input = copy(self.input)  # NOTE: Shallow copy!
+
+        # Sampling
+        # NOTE: We always need weights to support `replace=True` (repeated samples add
+        #       up in the sampled weights)
         input.data = sample_data(input.data, sample)
+        weights = (
+            input.data_weights
+            if input.data_weights is not None
+            else pd.DataFrame(1.0, index=input.data.index, columns=input.data.columns)
+        )
+        input.data_weights = sample_weights(weights, sample)
+
         return input
 
 
@@ -666,6 +706,15 @@ def input_from_sample(self, sample: list[tuple[int, int]]):
         input.data = data.dropna(axis="columns", how="all").dropna(
             axis="index", how="all"
         )
+        if input.data_weights is not None:
+            input.data_weights, _ = input.data_weights.align(
+                input.data,
+                axis=None,
+                join="right",
+                copy=True,
+                fill_value=input.missing_weights_value,
+            )
+            input.data_weights = sample_weights(input.data_weights, sample)
 
         # Select single hazard event
         input.hazard = input.hazard.select(event_id=input.data.index)
diff --git a/climada/util/calibrate/test/test_base.py b/climada/util/calibrate/test/test_base.py
@@ -38,8 +38,8 @@
 class ConcreteOptimizer(Optimizer):
     """An instance for testing. Implements 'run' without doing anything"""
 
-    def run(self, **_):
-        pass
+    def run(self, *args, **kwargs):
+        return self._opt_func(*args, **kwargs)
 
 
 def hazard():
@@ -83,7 +83,7 @@ def setUp(self):
 
         # Create dummy funcs
         self.impact_to_dataframe = lambda _: pd.DataFrame()
-        self.cost_func = lambda impact, data: 1.0
+        self.cost_func = lambda impact, data, weights: 1.0
         self.impact_func_gen = lambda **kwargs: ImpactFuncSet()
 
     def test_post_init_calls(self):
@@ -138,7 +138,7 @@ def test_align_impact(self):
             data=pd.DataFrame(
                 data={"col1": [1, 2], "col2": [2, 3]}, index=[0, 1], dtype="float"
             ),
-            cost_func=lambda x, y: (x + y).sum(axis=None),
+            cost_func=lambda x, y, _: (x + y).sum(axis=None),
             impact_func_creator=lambda _: ImpactFuncSet([ImpactFunc()]),
             # Mock the dataframe creation by ignoring the argument
             impact_to_dataframe=lambda _: pd.DataFrame(
@@ -191,6 +191,7 @@ def test_align_impact(self):
             data_aligned, impact_df_aligned = input.impact_to_aligned_df(None)
 
 
+@patch("climada.util.calibrate.base.ImpactCalc", autospec=True)
 class TestOptimizer(unittest.TestCase):
     """Base class for testing optimizers. Creates an input mock"""
 
@@ -200,14 +201,52 @@ def setUp(self):
             hazard=hazard(),
             exposure=exposure(),
             data=pd.DataFrame(data={"col1": [1, 2], "col2": [2, 3]}, index=[0, 1]),
-            cost_func=lambda x, y: (x + y).sum(axis=None),
-            impact_func_creator=lambda _: ImpactFuncSet([ImpactFunc()]),
+            cost_func=lambda x, y: ((x - y) ** 2).sum(),
+            impact_func_creator=lambda: ImpactFuncSet([ImpactFunc()]),
             impact_to_dataframe=lambda x: x.impact_at_reg(),
         )
         self.optimizer = ConcreteOptimizer(self.input)
 
+    def test_align(self, _):
+        """Test aligning of data frames"""
+        self.input.impact_to_dataframe = lambda _: pd.DataFrame(
+            data={"col1": [2, 4], "col2": [4, 0]}, index=[0, 2]
+        )
+        self.input.cost_func = lambda x, y, w: (x, y, w)
+
+        # Apply
+        data, impact, weights = self.optimizer.run()
+
+        # Check alignment
+        self.assertIsNone(weights)
+        pd.testing.assert_frame_equal(
+            data,
+            pd.DataFrame(
+                {"col1": [1, 2, 0], "col2": [2, 3, 0]}, index=[0, 1, 2], dtype="float"
+            ),
+        )
+        pd.testing.assert_frame_equal(
+            impact,
+            pd.DataFrame(
+                {"col1": [2, 0, 0], "col2": [4, 0, 0]}, index=[0, 1, 2], dtype="float"
+            ),
+        )
+
+        # Apply with weights
+        self.input.data_weights = pd.DataFrame({"col1": [1], "col2": [1]}, index=[1])
+        self.input.missing_weights_value = 2.0
+        data, impact, weights = self.optimizer.run()
+
+        # Check alignment
+        pd.testing.assert_frame_equal(
+            weights,
+            pd.DataFrame(
+                {"col1": [2, 1, 2], "col2": [2, 1, 2]}, index=[0, 1, 2], dtype="float"
+            ),
+        )
+
 
-class TestOuput(unittest.TestCase):
+class TestOutput(unittest.TestCase):
     """Test the optimizer output"""
 
     def test_cycle(self):
@@ -261,5 +300,6 @@ def test_init(self, mock):
 if __name__ == "__main__":
     TESTS = unittest.TestLoader().loadTestsFromTestCase(TestInputPostInit)
     TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOptimizer))
+    TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOutput))
     TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOutputEvaluator))
     unittest.TextTestRunner(verbosity=2).run(TESTS)
diff --git a/climada/util/calibrate/test/test_ensemble.py b/climada/util/calibrate/test/test_ensemble.py
@@ -38,6 +38,7 @@
     TragedyEnsembleOptimizer,
     event_info_from_input,
     sample_data,
+    sample_weights,
 )
 
 from .test_base import ConcreteOptimizer, exposure, hazard
@@ -161,6 +162,25 @@ def test_sample_data(self):
         )
 
 
+class TestSampleWeights(unittest.TestCase):
+    """Test sample_weights function"""
+
+    def test_sample_weights(self):
+        """Test sampling of data weights"""
+        df = pd.DataFrame([[0, 1, 2], [3, 4, 5]], index=[1, 2], columns=["a", "b", "c"])
+        samples = [(0, 0), (0, 2), (1, 1), (0, 2)]
+
+        pdt.assert_frame_equal(
+            sample_weights(df, samples),
+            pd.DataFrame(
+                [[0, 0, 4], [0, 4, 0]],
+                index=df.index,
+                columns=df.columns,
+                dtype="float",
+            ),
+        )
+
+
 class TestEventInfoFromInput(unittest.TestCase):
     """Test retrieving event information from the input"""
 
@@ -319,6 +339,7 @@ def __init__(self, df):
         self.stub = "a"
         self.hazard = create_autospec(hazard())
         self.hazard.select.return_value = self.hazard
+        self.data_weights = None
 
 
 class TestAverageEnsembleOptimizer(unittest.TestCase):
@@ -419,9 +440,18 @@ def test_input_from_sample(self):
             input=self.input,
             optimizer_type=ConcreteOptimizer,
         )
-        inp = opt.input_from_sample([(0, 0)])
+        inp = opt.input_from_sample([(0, 0), (3, 1), (0, 0)])
+
         self.assertIsNot(inp, self.input)
         self.assertIs(inp.stub, self.input.stub)
+        pd.testing.assert_frame_equal(
+            inp.data,
+            pd.DataFrame({"a": [1.0, None, None, None], "b": [None, None, None, 4.0]}),
+        )
+        pd.testing.assert_frame_equal(
+            inp.data_weights,
+            pd.DataFrame({"a": [2.0, 0.0, 0.0, 0.0], "b": [0.0, 0.0, 0.0, 1.0]}),
+        )
 
 
 class TestTragedyEnsembleOptimizer(unittest.TestCase):