From d55581a849b43bc0287edcfee8e474e7ebed8f4b Mon Sep 17 00:00:00 2001 From: Mahimn Date: Tue, 26 May 2026 10:33:45 -0400 Subject: [PATCH] fix: extend zero_division parameter to percentage and range-based metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Percentage and range-based metrics (`wmape`, `ope`, `arre`, `marre`, `coefficient_of_variation`) previously either raised a hard `ValueError` or silently returned `nan`/`inf` when their denominator was zero. This made batch evaluation pipelines brittle for constant or all-zero components. Mirrors the `zero_division` design introduced in #3059 for the scaled error family: `"warn"` (default) returns `np.nan` and emits a warning, `"raise"` preserves the legacy `ValueError`. A new `_safe_pct_divide` helper sits next to `_safe_scaled_divide`; the two differ only in fill semantics — percentage metrics multiply the ratio by 100 so a `1.0` fill for the 0/0 case (the scaled-metric "on par with naive") would surface as `100 %` error and be misleading, hence `np.nan` instead. Two adjacent bugs surface and are fixed in the same change: * `ope` previously checked `sum > 0` and rejected `actual_series` with a strictly negative sum (e.g. financial return series). The check is now `sum != 0` (within `np.isclose` tolerance) via the helper. * `wmape`'s docstring claimed `ValueError if actual_series contains some zeros`, but the implementation divides by `sum(|y_true|)` and only the all-zero case ever triggered the path. Docstring corrected. The CHANGELOG entry for the parameter addition carries the breaking-change marker per the convention discussed in #3080 (the post-mortem on #3059), since the default behavior flips from raising to warning. Adds a parametrized regression test covering all five metrics and an explicit OPE-with-negative-sum test. Existing `test_ope_zero` and the arre/marre legacy raise check are updated to opt into the legacy behavior with `zero_division="raise"`. 
--- CHANGELOG.md | 2 + darts/metrics/metrics.py | 90 ++++++++++++++++++++--------- darts/metrics/utils.py | 67 +++++++++++++++++++++ darts/tests/metrics/test_metrics.py | 80 ++++++++++++++++++++++++- 4 files changed, 208 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b921ad3de..83c51fdff9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,12 +15,14 @@ but cannot always guarantee backwards compatibility. Changes that may **break co - Added `use_longer_projection_head` to `TimesFM2p5Model` to enable longer non-autoregressive prediction horizons (up to 1024 steps for `output_chunk_length + output_chunk_shift`). [#3121](https://github.com/unit8co/darts/pull/3121) by [Zhihao Dai](https://github.com/daidahao). - `TimeSeries.from_dataframe()` now supports time columns of type `pl.Date` for `polars.DataFrame`. [#3124](https://github.com/unit8co/darts/pull/3124) by [Dennis Bader](https://github.com/dennisbader) - Custom encoders now support functions that return multiple components. Simply pass such a function via the `"custom"` encoder key in the `add_encoders` model input parameter. [#3069](https://github.com/unit8co/darts/pull/3069) by [Moritz Waldleben](https://github.com/mwaldleben). +- 🔴 Percentage and range-based metrics (`wmape`, `ope`, `arre`, `marre`, `coefficient_of_variation`) now expose a `zero_division` parameter (mirroring [#3059](https://github.com/unit8co/darts/pull/3059)) controlling the behavior when the denominator is zero: `"warn"` (default) returns `np.nan` and emits a warning, `"raise"` preserves the legacy `ValueError`. [#3122](https://github.com/unit8co/darts/pull/3122) by [Mahimn](https://github.com/mahimn01). **Fixed** - Fixed `_ScaledDotProductAttention` float16 overflow in `masked_fill` under mixed precision training. [#3087](https://github.com/unit8co/darts/pull/3087) by [Robert Ruidisch](https://github.com/robrui). 
- Fixed a bug in `TimeSeries.quantile()` where the output dtype did not match the input series dtype for dtypes `float32` or `float16`. Now the dtype is correctly propagated. [#3124](https://github.com/unit8co/darts/pull/3124) by [Dennis Bader](https://github.com/dennisbader) - Optuna integration's `PyTorchLightningPruningCallback` for hyperparameter optimization of torch models is now natively available in Darts via `darts.utils.callbacks`. [#3114](https://github.com/unit8co/darts/pull/3114) by [Jakub Chłapek](https://github.com/jakubchlapek). +- Fixed `ope` to accept `actual_series` with a strictly negative sum; the previous `sum > 0` check incorrectly rejected valid inputs such as financial return series. Also corrected the `wmape` docstring which inaccurately claimed it raised on zeros in `actual_series`. [#3122](https://github.com/unit8co/darts/pull/3122) by [Mahimn](https://github.com/mahimn01). **Dependencies** diff --git a/darts/metrics/metrics.py b/darts/metrics/metrics.py index 35688b1291..31471a92d7 100644 --- a/darts/metrics/metrics.py +++ b/darts/metrics/metrics.py @@ -24,6 +24,7 @@ _get_values_or_raise, _get_wrapped_metric, _LabelReduction, + _safe_pct_divide, _safe_scaled_divide, classification_support, interval_support, @@ -1725,6 +1726,7 @@ def wmape( intersect: bool = True, *, q: float | list[float] | tuple[np.ndarray, pd.Index] | None = None, + zero_division: str = "warn", component_reduction: Callable[[np.ndarray], float] | None = np.nanmean, series_reduction: Callable[[np.ndarray], float | np.ndarray] | None = None, n_jobs: int = 1, @@ -1753,6 +1755,12 @@ def wmape( will consider the values only over their common time interval (intersection in time). q Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on. + zero_division + Controls behavior when the denominator :math:`\\sum_{t=1}^T |y_t|` is zero (i.e. ``actual_series`` is + all zeros for a given component). 
+ + * ``"warn"`` (default) – returns ``np.nan`` and emits a warning. + * ``"raise"`` – raises a ``ValueError``. component_reduction Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray` of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a @@ -1776,7 +1784,7 @@ def wmape( Raises ------ ValueError - If `actual_series` contains some zeros. + If `zero_division="raise"` and the denominator :math:`\\sum_{t=1}^T |y_t|` is zero for some component. Returns ------- @@ -1812,10 +1820,10 @@ def wmape( q=q, ) - return ( - 100.0 - * np.nansum(np.abs(y_true - y_pred), axis=TIME_AX) - / np.nansum(np.abs(y_true), axis=TIME_AX) + return 100.0 * _safe_pct_divide( + np.nansum(np.abs(y_true - y_pred), axis=TIME_AX), + np.nansum(np.abs(y_true), axis=TIME_AX), + zero_division=zero_division, ) @@ -2029,6 +2037,7 @@ def ope( intersect: bool = True, *, q: float | list[float] | tuple[np.ndarray, pd.Index] | None = None, + zero_division: str = "warn", component_reduction: Callable[[np.ndarray], float] | None = np.nanmean, series_reduction: Callable[[np.ndarray], float | np.ndarray] | None = None, n_jobs: int = 1, @@ -2058,6 +2067,14 @@ def ope( will consider the values only over their common time interval (intersection in time). q Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on. + zero_division + Controls behavior when the denominator :math:`\\sum_{t=1}^{T}{y_t}` is zero. + + * ``"warn"`` (default) – returns ``np.nan`` and emits a warning. + * ``"raise"`` – raises a ``ValueError``. + + Note: a negative sum is a valid denominator (e.g. financial return series). Only an exact + zero sum triggers the zero-division handling. component_reduction Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray` of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. 
The function takes as input a ``np.ndarray`` and a @@ -2081,7 +2098,7 @@ def ope( Raises ------ ValueError - If :math:`\\sum_{t=1}^{T}{y_t} = 0`. + If `zero_division="raise"` and :math:`\\sum_{t=1}^{T}{y_t} = 0` for some component. Returns ------- @@ -2116,14 +2133,16 @@ def ope( np.nansum(y_true, axis=TIME_AX), np.nansum(y_pred, axis=TIME_AX), ) - if not (y_true_sum > 0).all(): - raise_log( - ValueError( - "The series of actual value cannot sum to zero when computing OPE." - ), - logger=logger, + return ( + np.abs( + _safe_pct_divide( + y_true_sum - y_pred_sum, + y_true_sum, + zero_division=zero_division, + ) ) - return np.abs((y_true_sum - y_pred_sum) / y_true_sum) * 100.0 + * 100.0 + ) @multi_ts_support @@ -2134,6 +2153,7 @@ def arre( intersect: bool = True, *, q: float | list[float] | tuple[np.ndarray, pd.Index] | None = None, + zero_division: str = "warn", time_reduction: Callable[..., np.ndarray] | None = None, component_reduction: Callable[[np.ndarray], float] | None = np.nanmean, series_reduction: Callable[[np.ndarray], float | np.ndarray] | None = None, @@ -2163,6 +2183,12 @@ def arre( will consider the values only over their common time interval (intersection in time). q Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on. + zero_division + Controls behavior when the denominator :math:`\\max_t{y_t} - \\min_t{y_t}` is zero (i.e. + ``actual_series`` is constant for a given component). + + * ``"warn"`` (default) – returns ``np.nan`` for affected components and emits a warning. + * ``"raise"`` – raises a ``ValueError``. time_reduction Optionally, a function to aggregate the metrics over the time axis. It must reduce a `np.ndarray` of shape `(t, c)` to a `np.ndarray` of shape `(c,)`. The function takes as input a ``np.ndarray`` and a @@ -2191,7 +2217,7 @@ def arre( Raises ------ ValueError - If :math:`\\max_t{y_t} = \\min_t{y_t}`. 
+ If `zero_division="raise"` and :math:`\\max_t{y_t} = \\min_t{y_t}` for some component. Returns ------- @@ -2226,16 +2252,10 @@ def arre( q=q, ) y_max, y_min = np.nanmax(y_true, axis=TIME_AX), np.nanmin(y_true, axis=TIME_AX) - if not (y_max > y_min).all(): - raise_log( - ValueError( - "The difference between the max and min values must " - "be strictly positive to compute the MARRE." - ), - logger=logger, - ) true_range = y_max - y_min - return 100.0 * np.abs((y_true - y_pred) / true_range) + return 100.0 * np.abs( + _safe_pct_divide(y_true - y_pred, true_range, zero_division=zero_division) + ) @multi_ts_support @@ -2246,6 +2266,7 @@ def marre( intersect: bool = True, *, q: float | list[float] | tuple[np.ndarray, pd.Index] | None = None, + zero_division: str = "warn", component_reduction: Callable[[np.ndarray], float] | None = np.nanmean, series_reduction: Callable[[np.ndarray], float | np.ndarray] | None = None, n_jobs: int = 1, @@ -2275,6 +2296,12 @@ def marre( will consider the values only over their common time interval (intersection in time). q Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on. + zero_division + Controls behavior when the denominator :math:`\\max_t{y_t} - \\min_t{y_t}` is zero (i.e. + ``actual_series`` is constant for a given component). + + * ``"warn"`` (default) – returns ``np.nan`` for affected components and emits a warning. + * ``"raise"`` – raises a ``ValueError``. component_reduction Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray` of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. The function takes as input a ``np.ndarray`` and a @@ -2298,7 +2325,7 @@ def marre( Raises ------ ValueError - If :math:`\\max_t{y_t} = \\min_t{y_t}`. + If `zero_division="raise"` and :math:`\\max_t{y_t} = \\min_t{y_t}` for some component. 
float A single metric score for: @@ -2322,6 +2349,7 @@ def marre( pred_series, intersect, q=q, + zero_division=zero_division, ), axis=TIME_AX, ) @@ -2433,6 +2461,7 @@ def coefficient_of_variation( intersect: bool = True, *, q: float | list[float] | tuple[np.ndarray, pd.Index] | None = None, + zero_division: str = "warn", component_reduction: Callable[[np.ndarray], float] | None = np.nanmean, series_reduction: Callable[[np.ndarray], float | np.ndarray] | None = None, n_jobs: int = 1, @@ -2464,6 +2493,11 @@ def coefficient_of_variation( will consider the values only over their common time interval (intersection in time). q Optionally, the quantile (float [0, 1]) or list of quantiles of interest to compute the metric on. + zero_division + Controls behavior when the denominator :math:`\\bar{y}` (the mean of ``actual_series``) is zero. + + * ``"warn"`` (default) – returns ``np.nan`` for affected components and emits a warning. + * ``"raise"`` – raises a ``ValueError``. component_reduction Optionally, a function to aggregate the metrics over the component/column axis. It must reduce a `np.ndarray` of shape `(t, c)` to a `np.ndarray` of shape `(t,)`. 
The function takes as input a ``np.ndarray`` and a @@ -2514,10 +2548,10 @@ def coefficient_of_variation( q=q, ) # not calling rmse as y_true and y_pred are np.ndarray - return ( - 100 - * np.sqrt(np.nanmean((y_true - y_pred) ** 2, axis=TIME_AX)) - / np.nanmean(y_true, axis=TIME_AX) + return 100 * _safe_pct_divide( + np.sqrt(np.nanmean((y_true - y_pred) ** 2, axis=TIME_AX)), + np.nanmean(y_true, axis=TIME_AX), + zero_division=zero_division, ) diff --git a/darts/metrics/utils.py b/darts/metrics/utils.py index 454f1ca8ea..20c8e01aa2 100644 --- a/darts/metrics/utils.py +++ b/darts/metrics/utils.py @@ -940,6 +940,73 @@ def _safe_scaled_divide( return result +def _safe_pct_divide( + errors: np.ndarray, + scale: np.ndarray, + zero_division: str = "warn", +) -> np.ndarray: + """Divides ``errors`` by ``scale`` for percentage-style metrics, returning + ``np.nan`` where ``scale`` is zero. + + Unlike :func:`_safe_scaled_divide` — which fills the ``0/0`` case with + ``1.0`` to express "on par with naive baseline" for scaled-error metrics + — this helper always fills zero-scale entries with ``np.nan`` because + percentage metrics multiply the ratio by ``100``; a fill of ``1.0`` would + surface as a ``100 %`` error and be misleading. + + Parameters + ---------- + errors + Numerator array. Broadcasts against ``scale``. + scale + Denominator array (e.g. the sum, mean, or range of ``actual_series``). + zero_division + Controls behavior when ``scale`` is (near) zero. + + * ``"warn"`` (default) – fill zero-scale entries with ``np.nan`` and + emit a warning. + * ``"raise"`` – raise a ``ValueError`` (the legacy behavior). + + Returns + ------- + np.ndarray + The result of ``errors / scale`` with zero-scale entries replaced by + ``np.nan``. + """ + if zero_division not in ["warn", "raise"]: + raise_log( + ValueError( + f"`zero_division` must be 'warn' or 'raise'. Received {zero_division}." 
+ ), + logger=logger, + ) + + zero_mask = np.isclose(scale, 0.0) + if not zero_mask.any(): + return errors / scale + + if zero_division == "raise": + raise_log( + ValueError( + "Cannot compute percentage metric: the denominator " + "(e.g. sum, mean, or range of `actual_series`) is zero " + "for some components." + ), + logger=logger, + ) + + # Avoid runtime warnings from the masked divide + safe_scale = np.where(zero_mask, 1.0, scale) + result = np.where(zero_mask, np.nan, errors / safe_scale) + + logger.warning( + "The denominator (e.g. sum, mean, or range of `actual_series`) is " + "zero for some components in the percentage metric. Those entries " + "are set to NaN." + ) + return result + + def _unique_labels(y_true: np.ndarray, y_pred: np.ndarray) -> list[np.ndarray]: """Returns unique labels for each component in the true and predicted labels.""" labels = [] diff --git a/darts/tests/metrics/test_metrics.py b/darts/tests/metrics/test_metrics.py index 39d85a7e5c..6fccb1cde5 100644 --- a/darts/tests/metrics/test_metrics.py +++ b/darts/tests/metrics/test_metrics.py @@ -232,10 +232,12 @@ def test_ape_zero(self, metric): metric(self.series1, self.series1) def test_ope_zero(self): + # Legacy raising behavior is now opt-in via `zero_division="raise"`. with pytest.raises(ValueError): metrics.ope( self.series1 - self.series1.to_series().mean(), self.series1 - self.series1.to_series().mean(), + zero_division="raise", ) @pytest.mark.parametrize( @@ -1105,14 +1107,14 @@ def test_arre(self, config): self.helper_test_nan(metric, **kwargs) self.helper_test_non_aggregate(metric, is_aggregate) + # Legacy raising behavior is now opt-in via `zero_division="raise"`. 
with pytest.raises(ValueError) as exc: _ = metric( TimeSeries.from_values(np.ones((3, 1, 1))), TimeSeries.from_values(np.ones((3, 1, 1))), + zero_division="raise", ) - assert str(exc.value).startswith( - "The difference between the max and min values must " - ) + assert "denominator" in str(exc.value) @pytest.mark.parametrize( "metric", @@ -1474,6 +1476,78 @@ def test_scaled_errors_zero_division( assert np.all(np.isnan(result[2:])) caplog.clear() + @pytest.mark.parametrize( + "metric", + [ + metrics.wmape, + metrics.ope, + metrics.arre, + metrics.marre, + metrics.coefficient_of_variation, + ], + ) + def test_pct_metrics_zero_division(self, metric, caplog): + """Percentage / range-based metrics return NaN on zero denominator + under the default ``zero_division="warn"`` and raise under + ``zero_division="raise"``. + + A constant all-zero ``actual_series`` triggers every denominator + these metrics use (sum of absolutes, sum, mean, max-min).""" + zero_actual = TimeSeries.from_values(np.zeros((10, 1))) + some_pred = TimeSeries.from_values(np.ones((10, 1))) + + # --- default "warn": NaN + warning --- + with caplog.at_level(logging.WARNING): + result = metric(zero_actual, some_pred, component_reduction=None) + assert "denominator" in caplog.text + assert np.all(np.isnan(np.atleast_1d(result))) + caplog.clear() + + # --- "raise": ValueError (legacy behavior) --- + with pytest.raises(ValueError, match="denominator"): + metric(zero_actual, some_pred, zero_division="raise") + + # --- invalid value rejected --- + with pytest.raises(ValueError, match="`zero_division` must be"): + metric(zero_actual, some_pred, zero_division="invalid") + + # --- non-zero denominator: no warning, finite result --- + caplog.clear() + with caplog.at_level(logging.WARNING): + result_normal = metric(self.series1, self.series2, component_reduction=None) + assert "denominator" not in caplog.text + assert not np.any(np.isnan(np.atleast_1d(result_normal))) + + def test_ope_accepts_negative_sum(self, 
caplog): + """OPE must accept ``actual_series`` whose sum is negative (e.g. + financial return series). Only an exact zero sum triggers the + zero-division handling. + + The previous ``y_true_sum > 0`` guard incorrectly rejected this + valid input. + """ + # mean ~ 0 but sum strictly negative + neg_sum_values = np.array([ + 1.0, + -2.0, + 1.0, + -3.0, + 1.0, + -2.0, + 1.0, + -3.0, + 1.0, + -2.0, + ]).reshape(-1, 1) + actual = TimeSeries.from_values(neg_sum_values) + pred = TimeSeries.from_values(neg_sum_values + 0.1) + + caplog.clear() + with caplog.at_level(logging.WARNING): + result = metrics.ope(actual, pred) + assert "denominator" not in caplog.text + assert np.isfinite(result) + def test_ope(self): self.helper_test_multivariate_duplication_equality(metrics.ope) self.helper_test_multiple_ts_duplication_equality(metrics.ope)