Skip to content

Commit 58bf84f

Browse files
committed
Add data_weights to calibration Input
* Use weights for sampling with replacement in AverageEnsembleOptimizer. * Update tests
1 parent 3f30b04 commit 58bf84f

5 files changed

Lines changed: 179 additions & 21 deletions

File tree

climada/util/calibrate/base.py

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ class Input:
4747
Hazard object to compute impacts from
4848
exposure : climada.Exposures
4949
Exposures object to compute impacts from
50-
data : pandas.Dataframe
50+
data : pandas.DataFrame
5151
The data to compare computed impacts to. Index: Event IDs matching the IDs of
5252
:py:attr:`hazard`. Columns: Arbitrary columns. NaN values in the data frame have
5353
special meaning: Corresponding impact values computed by the model are ignored
@@ -64,8 +64,9 @@ class Input:
6464
cost_func : Callable
6565
Function that takes ``pandas.DataFrame`` objects and returns the scalar
6666
"cost" between them. The optimization algorithm will try to minimize this
67-
number. The first argument is the true/correct values (:py:attr:`data`), and the
68-
second argument is the estimated/predicted values.
67+
number. The first argument is the true/correct values (:py:attr:`data`), the
68+
second argument is the estimated/predicted values, and the third argument is the
69+
:py:attr:`data_weights`.
6970
bounds : Mapping (str, {Bounds, tuple(float, float)}), optional
7071
The bounds for the parameters. Keys: parameter names. Values:
7172
``scipy.minimize.Bounds`` instance or tuple of minimum and maximum value.
@@ -85,6 +86,12 @@ class Input:
8586
:py:attr:`data`, insert this value. Defaults to NaN, in which case the impact
8687
from the model is ignored. Set this to zero to explicitly calibrate to zero
8788
impacts in these cases.
89+
data_weights : pandas.DataFrame, optional
90+
Weights for each entry in :py:attr:`data`. Must have the exact same index and
91+
columns. If ``None``, the weights will be ignored (equivalent to the same weight
92+
for each event).
93+
missing_weights_value : float, optional
94+
Same as :py:attr:`missing_data_value`, but for :py:attr:`data_weights`.
8895
assign_centroids : bool, optional
8996
If ``True`` (default), assign the hazard centroids to the exposure when this
9097
object is created.
@@ -95,14 +102,16 @@ class Input:
95102
data: pd.DataFrame
96103
impact_func_creator: Callable[..., ImpactFuncSet]
97104
impact_to_dataframe: Callable[[Impact], pd.DataFrame]
98-
cost_func: Callable[[pd.DataFrame, pd.DataFrame], Number]
105+
cost_func: Callable[[pd.DataFrame, pd.DataFrame, pd.DataFrame | None], Number]
99106
bounds: Optional[Mapping[str, Union[Bounds, Tuple[Number, Number]]]] = None
100107
constraints: Optional[Union[ConstraintType, list[ConstraintType]]] = None
101108
impact_calc_kwds: Mapping[str, Any] = field(
102109
default_factory=lambda: {"assign_centroids": False}
103110
)
104111
missing_data_value: float = np.nan
105-
assign_centroids: InitVar[bool] = True
112+
data_weights: pd.DataFrame | None = field(default=None, kw_only=True)
113+
missing_weights_value: float = field(default=np.nan, kw_only=True)
114+
assign_centroids: InitVar[bool] = field(default=True, kw_only=True)
106115

107116
def __post_init__(self, assign_centroids):
108117
"""Prepare input data"""
@@ -115,6 +124,17 @@ def __post_init__(self, assign_centroids):
115124
)
116125
raise TypeError("'data' must be a pandas.DataFrame")
117126

127+
if self.data_weights is not None:
128+
try:
129+
pd.testing.assert_index_equal(self.data.index, self.data_weights.index)
130+
pd.testing.assert_index_equal(
131+
self.data.columns, self.data_weights.columns
132+
)
133+
except AssertionError as err:
134+
raise ValueError(
135+
"'data_weights' must have exact same index and columns as 'data'"
136+
) from err
137+
118138
if assign_centroids:
119139
self.exposure.assign_centroids(self.hazard)
120140

@@ -413,7 +433,9 @@ class Optimizer(ABC):
413433

414434
input: Input
415435

416-
def _target_func(self, data: pd.DataFrame, predicted: pd.DataFrame) -> Number:
436+
def _target_func(
437+
self, data: pd.DataFrame, predicted: pd.DataFrame, weights: pd.DataFrame | None
438+
) -> Number:
417439
"""Target function for the optimizer
418440
419441
The default version of this function simply returns the value of the cost
@@ -427,12 +449,14 @@ def _target_func(self, data: pd.DataFrame, predicted: pd.DataFrame) -> Number:
427449
predicted : pandas.DataFrame
428450
The impact predicted by the data calibration after it has been transformed
429451
into a dataframe by :py:attr:`Input.impact_to_dataframe`.
452+
weights : pandas.DataFrame or None
453+
The relative weight for each data/prediction pair, or ``None`` for no weighting.
430454
431455
Returns
432456
-------
433457
The value of the target function for the optimizer.
434458
"""
435-
return self.input.cost_func(data, predicted)
459+
return self.input.cost_func(data, predicted, weights)
436460

437461
def _kwargs_to_impact_func_creator(self, *_, **kwargs) -> Dict[str, Any]:
438462
"""Define how the parameters to :py:meth:`_opt_func` must be transformed
@@ -484,11 +508,24 @@ def _opt_func(self, *args, **kwargs) -> Number:
484508
hazard=self.input.hazard,
485509
).impact(**self.input.impact_calc_kwds)
486510

487-
# Transform to DataFrame, align, and compute target function
511+
# Transform to DataFrame and align
488512
data_aligned, impact_df_aligned = self.input.impact_to_aligned_df(
489-
impact, fillna=0
513+
impact, fillna=0.0
490514
)
491-
return self._target_func(data_aligned, impact_df_aligned)
515+
516+
# Align weights
517+
weights_aligned = None
518+
if self.input.data_weights is not None:
519+
weights_aligned, _ = self.input.data_weights.align(
520+
data_aligned,
521+
axis=None,
522+
join="right",
523+
copy=True,
524+
fill_value=self.input.missing_weights_value,
525+
)
526+
527+
# Compute target function
528+
return self._target_func(data_aligned, impact_df_aligned, weights_aligned)
492529

493530
@abstractmethod
494531
def run(self, **opt_kwargs) -> Output:

climada/util/calibrate/bayesian_optimizer.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -616,9 +616,11 @@ def __post_init__(self, random_state, allow_duplicate_points, bayes_opt_kwds):
616616
**bayes_opt_kwds,
617617
)
618618

619-
def _target_func(self, data: pd.DataFrame, predicted: pd.DataFrame) -> Number:
619+
def _target_func(
620+
self, data: pd.DataFrame, predicted: pd.DataFrame, weights: pd.DataFrame | None
621+
) -> Number:
620622
"""Invert the cost function because BayesianOptimization maximizes the target"""
621-
return -self.input.cost_func(data, predicted)
623+
return -self.input.cost_func(data, predicted, weights)
622624

623625
def run(self, controller: BayesianOptimizerController) -> BayesianOptimizerOutput:
624626
"""Execute the optimization

climada/util/calibrate/ensemble.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,36 @@ def sample_data(data: pd.DataFrame, sample: list[tuple[int, int]]):
7575
return data_sampled
7676

7777

78+
def sample_weights(weights: pd.DataFrame, sample: list[tuple[int, int]]):
79+
"""
80+
Return an updated DataFrame containing the appropriate weights for a sample.
81+
82+
Weights that are not in ``sample`` are set to zero, whereas weights that are sampled
83+
multiple times will effectively be multiplied by their number of occurrences in ``sample``.
84+
85+
Parameters
86+
----------
87+
weights : pandas.DataFrame
88+
The original weights for the data
89+
sample : list of tuple of int
90+
A list of (row, column) index pairs indicating which weights will be used, and
91+
how often.
92+
93+
Returns
94+
-------
95+
pandas.DataFrame
96+
Updated ``weights`` for ``sample``.
97+
"""
98+
# Create all-zero weights
99+
weights_sampled = pd.DataFrame(0.0, columns=weights.columns, index=weights.index)
100+
101+
# Add weights for each sample
102+
for row, col in sample:
103+
weights_sampled.iloc[row, col] += weights.iloc[row, col]
104+
105+
return weights_sampled
106+
107+
78108
def event_info_from_input(inp: Input) -> dict[str, Any]:
79109
"""Get information on the event(s) for which we calibrated
80110
@@ -595,7 +625,7 @@ def __post_init__(self, sample_fraction, ensemble_size, random_state, replace):
595625
"""Create the samples"""
596626
if sample_fraction <= 0:
597627
raise ValueError("Sample fraction must be larger than 0")
598-
elif sample_fraction > 1 and not replace:
628+
if sample_fraction > 1 and not replace:
599629
raise ValueError("Sample fraction must be <=1 or replace must be True")
600630
if ensemble_size < 1:
601631
raise ValueError("Ensemble size must be >=1")
@@ -615,7 +645,17 @@ def __post_init__(self, sample_fraction, ensemble_size, random_state, replace):
615645
def input_from_sample(self, sample: list[tuple[int, int]]):
616646
"""Shallow-copy the input and update the data"""
617647
input = copy(self.input) # NOTE: Shallow copy!
648+
649+
# Sampling
650+
# NOTE: We always need sampled weights (even if none were given) to support `replace=True`
618651
input.data = sample_data(input.data, sample)
652+
weights = (
653+
input.data_weights
654+
if input.data_weights is not None
655+
else pd.DataFrame(1.0, index=input.data.index, columns=input.data.columns)
656+
)
657+
input.data_weights = sample_weights(weights, sample)
658+
619659
return input
620660

621661

@@ -666,6 +706,15 @@ def input_from_sample(self, sample: list[tuple[int, int]]):
666706
input.data = data.dropna(axis="columns", how="all").dropna(
667707
axis="index", how="all"
668708
)
709+
if input.data_weights is not None:
710+
input.data_weights, _ = input.data_weights.align(
711+
input.data,
712+
axis=None,
713+
join="right",
714+
copy=True,
715+
fill_value=input.missing_weights_value,
716+
)
717+
input.data_weights = sample_weights(input.data_weights, sample)
669718

670719
# Select single hazard event
671720
input.hazard = input.hazard.select(event_id=input.data.index)

climada/util/calibrate/test/test_base.py

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@
3838
class ConcreteOptimizer(Optimizer):
3939
"""An instance for testing. Implements 'run' without doing anything"""
4040

41-
def run(self, **_):
42-
pass
41+
def run(self, *args, **kwargs):
42+
return self._opt_func(*args, **kwargs)
4343

4444

4545
def hazard():
@@ -83,7 +83,7 @@ def setUp(self):
8383

8484
# Create dummy funcs
8585
self.impact_to_dataframe = lambda _: pd.DataFrame()
86-
self.cost_func = lambda impact, data: 1.0
86+
self.cost_func = lambda impact, data, weights: 1.0
8787
self.impact_func_gen = lambda **kwargs: ImpactFuncSet()
8888

8989
def test_post_init_calls(self):
@@ -138,7 +138,7 @@ def test_align_impact(self):
138138
data=pd.DataFrame(
139139
data={"col1": [1, 2], "col2": [2, 3]}, index=[0, 1], dtype="float"
140140
),
141-
cost_func=lambda x, y: (x + y).sum(axis=None),
141+
cost_func=lambda x, y, _: (x + y).sum(axis=None),
142142
impact_func_creator=lambda _: ImpactFuncSet([ImpactFunc()]),
143143
# Mock the dataframe creation by ignoring the argument
144144
impact_to_dataframe=lambda _: pd.DataFrame(
@@ -191,6 +191,7 @@ def test_align_impact(self):
191191
data_aligned, impact_df_aligned = input.impact_to_aligned_df(None)
192192

193193

194+
@patch("climada.util.calibrate.base.ImpactCalc", autospec=True)
194195
class TestOptimizer(unittest.TestCase):
195196
"""Base class for testing optimizers. Creates an input mock"""
196197

@@ -200,14 +201,52 @@ def setUp(self):
200201
hazard=hazard(),
201202
exposure=exposure(),
202203
data=pd.DataFrame(data={"col1": [1, 2], "col2": [2, 3]}, index=[0, 1]),
203-
cost_func=lambda x, y: (x + y).sum(axis=None),
204-
impact_func_creator=lambda _: ImpactFuncSet([ImpactFunc()]),
204+
cost_func=lambda x, y: ((x - y) ** 2).sum(),
205+
impact_func_creator=lambda: ImpactFuncSet([ImpactFunc()]),
205206
impact_to_dataframe=lambda x: x.impact_at_reg(),
206207
)
207208
self.optimizer = ConcreteOptimizer(self.input)
208209

210+
def test_align(self, _):
211+
"""Test aligning of data frames"""
212+
self.input.impact_to_dataframe = lambda _: pd.DataFrame(
213+
data={"col1": [2, 4], "col2": [4, 0]}, index=[0, 2]
214+
)
215+
self.input.cost_func = lambda x, y, w: (x, y, w)
216+
217+
# Apply
218+
data, impact, weights = self.optimizer.run()
219+
220+
# Check alignment
221+
self.assertIsNone(weights)
222+
pd.testing.assert_frame_equal(
223+
data,
224+
pd.DataFrame(
225+
{"col1": [1, 2, 0], "col2": [2, 3, 0]}, index=[0, 1, 2], dtype="float"
226+
),
227+
)
228+
pd.testing.assert_frame_equal(
229+
impact,
230+
pd.DataFrame(
231+
{"col1": [2, 0, 0], "col2": [4, 0, 0]}, index=[0, 1, 2], dtype="float"
232+
),
233+
)
234+
235+
# Apply with weights
236+
self.input.data_weights = pd.DataFrame({"col1": [1], "col2": [1]}, index=[1])
237+
self.input.missing_weights_value = 2.0
238+
data, impact, weights = self.optimizer.run()
239+
240+
# Check alignment
241+
pd.testing.assert_frame_equal(
242+
weights,
243+
pd.DataFrame(
244+
{"col1": [2, 1, 2], "col2": [2, 1, 2]}, index=[0, 1, 2], dtype="float"
245+
),
246+
)
247+
209248

210-
class TestOuput(unittest.TestCase):
249+
class TestOutput(unittest.TestCase):
211250
"""Test the optimizer output"""
212251

213252
def test_cycle(self):
@@ -261,5 +300,6 @@ def test_init(self, mock):
261300
if __name__ == "__main__":
262301
TESTS = unittest.TestLoader().loadTestsFromTestCase(TestInputPostInit)
263302
TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOptimizer))
303+
TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOutput))
264304
TESTS.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOutputEvaluator))
265305
unittest.TextTestRunner(verbosity=2).run(TESTS)

climada/util/calibrate/test/test_ensemble.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
TragedyEnsembleOptimizer,
3939
event_info_from_input,
4040
sample_data,
41+
sample_weights,
4142
)
4243

4344
from .test_base import ConcreteOptimizer, exposure, hazard
@@ -161,6 +162,25 @@ def test_sample_data(self):
161162
)
162163

163164

165+
class TestSampleWeights(unittest.TestCase):
166+
"""Test sample_weights function"""
167+
168+
def test_sample_weights(self):
169+
"""Test sampling of data weights"""
170+
df = pd.DataFrame([[0, 1, 2], [3, 4, 5]], index=[1, 2], columns=["a", "b", "c"])
171+
samples = [(0, 0), (0, 2), (1, 1), (0, 2)]
172+
173+
pdt.assert_frame_equal(
174+
sample_weights(df, samples),
175+
pd.DataFrame(
176+
[[0, 0, 4], [0, 4, 0]],
177+
index=df.index,
178+
columns=df.columns,
179+
dtype="float",
180+
),
181+
)
182+
183+
164184
class TestEventInfoFromInput(unittest.TestCase):
165185
"""Test retrieving event information from the input"""
166186

@@ -319,6 +339,7 @@ def __init__(self, df):
319339
self.stub = "a"
320340
self.hazard = create_autospec(hazard())
321341
self.hazard.select.return_value = self.hazard
342+
self.data_weights = None
322343

323344

324345
class TestAverageEnsembleOptimizer(unittest.TestCase):
@@ -419,9 +440,18 @@ def test_input_from_sample(self):
419440
input=self.input,
420441
optimizer_type=ConcreteOptimizer,
421442
)
422-
inp = opt.input_from_sample([(0, 0)])
443+
inp = opt.input_from_sample([(0, 0), (3, 1), (0, 0)])
444+
423445
self.assertIsNot(inp, self.input)
424446
self.assertIs(inp.stub, self.input.stub)
447+
pd.testing.assert_frame_equal(
448+
inp.data,
449+
pd.DataFrame({"a": [1.0, None, None, None], "b": [None, None, None, 4.0]}),
450+
)
451+
pd.testing.assert_frame_equal(
452+
inp.data_weights,
453+
pd.DataFrame({"a": [2.0, 0.0, 0.0, 0.0], "b": [0.0, 0.0, 0.0, 1.0]}),
454+
)
425455

426456

427457
class TestTragedyEnsembleOptimizer(unittest.TestCase):

0 commit comments

Comments
 (0)