105 commits
6301adb
first attempt at implementing aggregate loss
mc4117 Apr 23, 2026
471f7f3
Merge branch 'main' into feat/aggregate_loss
mc4117 Apr 23, 2026
b83e921
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2026
6377c1a
tidy up doc string
mc4117 Apr 23, 2026
3b8769b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2026
eef75dd
fix tests
mc4117 Apr 23, 2026
97e5af7
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 Apr 23, 2026
febab85
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2026
ebd75f9
update schema
mc4117 Apr 23, 2026
a5168d5
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 Apr 23, 2026
63a0ff7
update name
mc4117 Apr 23, 2026
af56b79
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2026
d743272
Rename AggregateLossWrapper to TimeAggregateLossWrapper
mc4117 Apr 23, 2026
04ba026
Merge branch 'main' into feat/aggregate_loss
mc4117 Apr 23, 2026
aa68344
fix schema
mc4117 Apr 23, 2026
eb43110
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 Apr 23, 2026
3746fb3
fix schema and add restriction
mc4117 Apr 23, 2026
fd94837
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2026
e137d26
fix configs
mc4117 Apr 23, 2026
913bad3
rm unnecessary test
mc4117 Apr 23, 2026
399d142
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 Apr 23, 2026
7583dff
fix failing tests
mc4117 Apr 23, 2026
e795398
fix integration test
mc4117 Apr 23, 2026
3cb9aee
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2026
0110498
Remove extra Config class from training schema
mc4117 Apr 23, 2026
8a8a788
different approach
mc4117 Apr 23, 2026
d1cb040
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2026
459d117
fix tests
mc4117 Apr 24, 2026
516fd0c
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 Apr 24, 2026
e043045
update tests
mc4117 Apr 24, 2026
5756e88
fix schema
mc4117 Apr 24, 2026
d12f446
update schema
mc4117 Apr 24, 2026
cb276f4
making time aggregate losses work for multi datasets
mc4117 Apr 24, 2026
e9e8476
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 24, 2026
e89565c
update losses in config
mc4117 Apr 24, 2026
6e5d9f0
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 Apr 24, 2026
927d9c9
Merge branch 'main' into feat/aggregate_loss
mc4117 Apr 27, 2026
f445b21
different approach adding loss folders
mc4117 Apr 27, 2026
d7a0ff6
fix merge
mc4117 Apr 27, 2026
59c0e11
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 27, 2026
6cb6e87
rename losses
mc4117 Apr 27, 2026
bc392ea
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 Apr 27, 2026
fd08cc4
rm folder
mc4117 Apr 27, 2026
c98565e
update configs
mc4117 Apr 27, 2026
c9d7748
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 27, 2026
9d5cc64
Clean up single.yaml by removing loss functions comments
mc4117 Apr 27, 2026
6390053
Remove commented-out loss function section
mc4117 Apr 27, 2026
3ad6f9e
ensemle loss
mc4117 Apr 27, 2026
dadbc78
merge
mc4117 Apr 27, 2026
ba4aeee
fix integration
mc4117 Apr 28, 2026
3d23170
fix pre commit
mc4117 Apr 28, 2026
3b0d59e
fix integration tests
mc4117 Apr 28, 2026
426891e
revert change to base loss
mc4117 Apr 28, 2026
9384302
fix failing test
mc4117 Apr 28, 2026
8981866
update documentation
mc4117 Apr 29, 2026
6df1df5
Merge branch 'main' into feat/aggregate_loss
mc4117 Apr 30, 2026
009e07f
feat: add BaseLossWrapper as transparent loss wrapper base class (#1082)
VeraChristina May 1, 2026
d246cec
preserve inner loss squash mode when defaults are used ; use amin amax
ssmmnn11 May 1, 2026
8b67732
change loss calculation
mc4117 May 8, 2026
0aaf41c
revert change mod
mc4117 May 8, 2026
b84c7ca
fix merge conflict
mc4117 May 8, 2026
c0e88d5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 8, 2026
3f7dab5
rm configs
mc4117 May 8, 2026
c8d24a1
merge
mc4117 May 8, 2026
b0e7a7c
fix failing schema
mc4117 May 8, 2026
99497a0
fix failing tests
mc4117 May 8, 2026
e726b5c
fix pre commit
mc4117 May 8, 2026
b72f0f9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 8, 2026
b83c8aa
change scalings
mc4117 May 9, 2026
f2d26c9
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 May 9, 2026
79f690d
fix failing tests
mc4117 May 9, 2026
a4c5df3
fix schema
mc4117 May 11, 2026
043a14b
fix ensemble crps
mc4117 May 11, 2026
341c1c6
Merge branch 'main' into feat/aggregate_loss
mc4117 May 11, 2026
75e9373
fix failing tests
mc4117 May 11, 2026
08f70ac
fix tests
mc4117 May 11, 2026
570f638
rename to composite loss
mc4117 May 11, 2026
abb61f5
update docs
mc4117 May 11, 2026
061c01c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 11, 2026
9d4b4ef
rename
mc4117 May 11, 2026
f1ff222
fix failing test
mc4117 May 11, 2026
db62241
Merge branch 'main' into feat/aggregate_loss
mc4117 May 11, 2026
ff44365
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 11, 2026
421e6c0
fix scalers
mc4117 May 12, 2026
e95d0e8
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 May 12, 2026
503436d
update combined loss
mc4117 May 13, 2026
eb6a41c
update tests
mc4117 May 13, 2026
0d36005
fix merge conflict
mc4117 May 13, 2026
4a62954
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 13, 2026
aaff297
fix failing tests
mc4117 May 13, 2026
de95899
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 May 13, 2026
f0643ac
pre commit hook
mc4117 May 13, 2026
5c409a3
update behaviour
mc4117 May 14, 2026
4427846
fix pre commit
mc4117 May 14, 2026
df2098e
fix tests
mc4117 May 14, 2026
edf05b4
merge conflict
mc4117 May 14, 2026
056d047
add per timestep callback
mc4117 May 14, 2026
e550145
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 14, 2026
de82ddf
fix unit tests
mc4117 May 14, 2026
5ae450f
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 May 14, 2026
b04654c
tests
mc4117 May 14, 2026
210f5d1
Merge branch 'main' into feat/aggregate_loss
mc4117 May 14, 2026
e7db74f
tests
mc4117 May 14, 2026
b2b51b3
Merge branch 'feat/aggregate_loss' of https://github.com/ecmwf/anemoi…
mc4117 May 14, 2026
31ee3a0
update config
mc4117 May 14, 2026
Binary file added (binary file not shown).
54 changes: 54 additions & 0 deletions training/docs/modules/losses.rst
@@ -91,6 +91,60 @@ deterministic:

*****************************
Time Aggregate Loss Functions
*****************************

These loss functions encourage the model to produce **temporally
consistent** outputs, i.e. output sequences that are internally coherent
over time, not just accurate at each individual step.

:class:`~anemoi.training.losses.aggregate.TimeAggregateLossWrapper`
addresses this by applying a base loss function to *time-aggregated*
versions of the prediction and target, rather than step-by-step. The
following aggregations are supported:

.. list-table::
   :widths: 15 85
   :header-rows: 1

   - - Aggregation
     - Description

   - - ``mean``
     - Mean over the output time window — penalises bias in the
       temporal average.

   - - ``max``
     - Maximum over the output time window — penalises errors in peak
       values.

   - - ``min``
     - Minimum over the output time window — penalises errors in
       minimum values.

   - - ``diff``
     - Consecutive step-to-step differences
       (``pred[:, 1:] - pred[:, :-1]``) — penalises unrealistic
       temporal transitions and discontinuities.

The wrapper evaluates the specified loss function on each aggregation in
turn and returns the sum. Because temporal aggregation collapses the time
dimension, the ``time_steps`` scaler is intentionally excluded from the
inner ``loss_fn``; only spatial and variable scalers should be listed
there.
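
Concretely, with :math:`\mathcal{A}` the set of configured aggregations
and :math:`\ell` the inner ``loss_fn``, the wrapper returns

.. math::

   \mathcal{L}(\hat{y}, y) = \sum_{a \in \mathcal{A}}
   \ell\bigl(\operatorname{agg}_a(\hat{y}),\, \operatorname{agg}_a(y)\bigr),
   \qquad \mathcal{A} \subseteq \{\text{mean}, \text{max}, \text{min}, \text{diff}\},

where each :math:`\operatorname{agg}_a` acts along the output time
dimension.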

.. note::

   ``TimeAggregateLossWrapper`` requires an output time dimension greater
   than one; it is not meaningful for single-step tasks.

We strongly recommend using the time aggregate loss when training any
temporal downscaler. The pre-built config variants ``single_MSE_aggregation``
and ``ensemble_multiscale_aggregation`` combine it with the primary loss inside a
:class:`~anemoi.training.losses.combined.CombinedLoss`.
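
For illustration, a standalone wrapper configuration might look as
follows (a sketch mirroring the ``single_MSE_aggregation`` variant added
in this pull request; note that ``time_steps`` is deliberately absent
from the scaler list):

.. code-block:: yaml

   _target_: anemoi.training.losses.aggregate.TimeAggregateLossWrapper
   scalers: ['pressure_level', 'general_variable', 'node_weights']
   time_aggregation_types: [mean, max, min, diff]
   loss_fn:
     _target_: anemoi.training.losses.MSELoss
     ignore_nans: False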

.. _multiscale-loss-functions:

***************************
Multiscale Loss Functions
***************************
3 changes: 3 additions & 0 deletions training/docs/modules/tasks.rst
@@ -155,6 +155,9 @@ Example: ``input_timestep="6H"``, ``output_timestep="3H"``,
``output_left_boundary=True`` produces output offsets
``[0H, 3H]`` and input offsets ``[0H, 6H]``.

By default, temporal downscalers are trained with the time aggregate
loss.

.. automodule:: anemoi.training.tasks.temporal_downscaling
:members:
:no-undoc-members:
@@ -7,6 +7,7 @@ defaults:
- model: graphtransformer
- task: temporal_downscaler
- training: single
- override training/training_loss: single_MSE_aggregation
- _self_

config_validation: True
@@ -7,6 +7,7 @@ defaults:
- model: graphtransformer_ens
- task: temporal_downscaler
- training: ensemble
- override training/training_loss: ensemble_multiscale_aggregation
- _self_

config_validation: True
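Switching an experiment config between loss variants is thus a one-line
Hydra override in its defaults list. A minimal sketch, following the two
hunks above (the group names come from the new training_loss configs
later in this diff):

defaults:
  - training: single
  - override training/training_loss: single_MSE_aggregation
  - _self_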
@@ -1,6 +1,7 @@
---
defaults:
- scalers: global
- training_loss: single
- optimization: default
- weight_averaging: null

28 changes: 1 addition & 27 deletions training/src/anemoi/training/config/training/ensemble.yaml
@@ -1,6 +1,7 @@
---
defaults:
- scalers: global
- training_loss: ensemble
- optimization: default
- weight_averaging: null

@@ -51,33 +52,6 @@ strategy:
loss_gradient_scaling: False


# loss function for the model
# To train without multiscale loss, set it to the desired loss directly
training_loss:
  datasets:
    data: # user-defined key in data
      # loss class to initialise, can be anything subclassing torch.nn.Module
      _target_: anemoi.training.losses.MultiscaleLossWrapper
      # Disk mode: multiscale_config: {loss_matrices_path: /path, loss_matrices: ["file.npz", null]}
      # On-the-fly: multiscale_config: {num_scales: 4, base_num_nearest_neighbours: 16, base_sigma: 0.01570}
      multiscale_config: null # null = single scale, no smoothing
      weights: [1.0]

      per_scale_loss:
        _target_: anemoi.training.losses.CRPS
        scalers: ['pressure_level', 'general_variable', 'nan_mask_weights', 'node_weights', 'time_steps']

        # Scalers to include in loss calculation
        # A selection of available scalers are listed in training/scalers.
        # '*' is a valid entry to use all `scalers` given, if a scaler is to be excluded
        # add `!scaler_name`, i.e. ['*', '!scaler_1'], and `scaler_1` will not be added.
        # scalers: ['pressure_level', 'general_variable', 'nan_mask_weights', 'node_weights']
        ignore_nans: False
        no_autocast: True
        alpha: 0.95



# Validation metrics calculation,
# This may be a list, in which case all metrics will be calculated
# and logged according to their name.
14 changes: 1 addition & 13 deletions training/src/anemoi/training/config/training/lam.yaml
@@ -1,6 +1,7 @@
---
defaults:
- scalers: lam
- training_loss: single
- optimization: default
- weight_averaging: null

@@ -48,19 +49,6 @@ strategy:
# don't enable this by default until it's been tested and proven beneficial
loss_gradient_scaling: False

# loss function for the model
training_loss:
  datasets:
    data: # user-defined key in data
      # loss class to initialise
      _target_: anemoi.training.losses.MSELoss
      # Scalers to include in loss calculation
      # A selection of available scalers are listed in training/scalers/scalers.yaml
      # '*' is a valid entry to use all `scalers` given, if a scaler is to be excluded
      # add `!scaler_name`, i.e. ['*', '!scaler_1'], and `scaler_1` will not be added.
      scalers: ['pressure_level', 'general_variable', 'node_weights', 'time_steps']
      ignore_nans: False

# Validation metrics calculation,
# This may be a list, in which case all metrics will be calculated
# and logged according to their name.
2 changes: 1 addition & 1 deletion training/src/anemoi/training/config/training/multi.yaml
@@ -1,6 +1,7 @@
---
defaults:
- scalers: multi
- training_loss: single
- optimization: default
- weight_averaging: null

@@ -56,7 +57,6 @@ max_steps: 150000


submodules_to_freeze: []

# Dataset-specific loss and metrics configuration
training_loss:
  datasets:
16 changes: 1 addition & 15 deletions training/src/anemoi/training/config/training/single.yaml
@@ -1,6 +1,7 @@
---
defaults:
- scalers: global
- training_loss: single
- optimization: default
- weight_averaging: null

@@ -39,26 +40,11 @@ strategy:
num_gpus_per_model: ${system.hardware.num_gpus_per_model}
read_group_size: ${dataloader.read_group_size}

# loss functions

# dynamic rescaling of the loss gradient
# see https://arxiv.org/pdf/2306.06079.pdf, section 4.3.2
# don't enable this by default until it's been tested and proven beneficial
loss_gradient_scaling: False

# loss function for the model
training_loss:
  datasets:
    data: # user-defined key in data
      # loss class to initialise
      _target_: anemoi.training.losses.MSELoss
      # Scalers to include in loss calculation
      # A selection of available scalers are listed in training/scalers.
      # '*' is a valid entry to use all `scalers` given, if a scaler is to be excluded
      # add `!scaler_name`, i.e. ['*', '!scaler_1'], and `scaler_1` will not be added.
      scalers: ['pressure_level', 'general_variable', 'node_weights', 'time_steps']
      ignore_nans: False

# Validation metrics calculation,
# This may be a list, in which case all metrics will be calculated
# and logged according to their name.
14 changes: 1 addition & 13 deletions training/src/anemoi/training/config/training/stretched.yaml
@@ -1,6 +1,7 @@
---
defaults:
- scalers: stretched
- training_loss: single
- optimization: default
- weight_averaging: null

@@ -49,19 +50,6 @@ strategy:
# don't enable this by default until it's been tested and proven beneficial
loss_gradient_scaling: False

# loss function for the model
training_loss:
  datasets:
    data: # user-defined key in data
      # loss class to initialise
      _target_: anemoi.training.losses.MSELoss
      # Scalers to include in loss calculation
      # A selection of available scalers are listed in training/scalers/scalers.yaml
      # '*' is a valid entry to use all `scalers` given, if a scaler is to be excluded
      # add `!scaler_name`, i.e. ['*', '!scaler_1'], and `scaler_1` will not be added.
      scalers: ['pressure_level', 'general_variable', 'node_weights', 'time_steps']
      ignore_nans: False

# Validation metrics calculation,
# This may be a list, in which case all metrics will be calculated
# and logged according to their name.
@@ -0,0 +1,22 @@
# loss function for the model
# To train without multiscale loss, set it to the desired loss directly
datasets:
  data: # user-defined key in data
    # loss class to initialise, can be anything subclassing torch.nn.Module
    _target_: anemoi.training.losses.MultiscaleLossWrapper
    # Disk mode: multiscale_config: {loss_matrices_path: /path, loss_matrices: ["file.npz", null]}
    # On-the-fly: multiscale_config: {num_scales: 4, base_num_nearest_neighbours: 16, base_sigma: 0.01570}
    multiscale_config: null # null = single scale, no smoothing
    weights: [1.0]
    per_scale_loss:
      _target_: anemoi.training.losses.CRPS
      scalers: ['pressure_level', 'general_variable', 'nan_mask_weights', 'node_weights', 'time_steps']

      # Scalers to include in loss calculation
      # A selection of available scalers are listed in training/scalers.
      # '*' is a valid entry to use all `scalers` given, if a scaler is to be excluded
      # add `!scaler_name`, i.e. ['*', '!scaler_1'], and `scaler_1` will not be added.
      # scalers: ['pressure_level', 'general_variable', 'nan_mask_weights', 'node_weights']
      ignore_nans: False
      no_autocast: True
      alpha: 0.95
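As the header comment notes, training without the multiscale wrapper
means pointing ``_target_`` at the desired loss directly. A sketch
reusing this file's CRPS settings (nesting assumed; not part of this
diff):

datasets:
  data:
    _target_: anemoi.training.losses.CRPS
    scalers: ['pressure_level', 'general_variable', 'nan_mask_weights', 'node_weights', 'time_steps']
    ignore_nans: False
    no_autocast: True
    alpha: 0.95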
@@ -0,0 +1,27 @@
datasets:
  data:
    _target_: anemoi.training.losses.combined.CombinedLoss
    ignore_nans: False
    # loss_weights: [n_timesteps / (n_timesteps + n_agg_ops), n_agg_ops / (n_timesteps + n_agg_ops)]
    # Each sub-loss averages internally (raw over timesteps, aggregate over agg ops).
    # These weights re-scale so the total matches: sum_all / (n_timesteps + n_agg_ops).
    # Example for 6 timesteps and 4 agg ops: [0.6, 0.4]
    loss_weights: [0.6, 0.4]
    losses:
      - _target_: anemoi.training.losses.MultiscaleLossWrapper
        multiscale_config: null # null = single scale, no smoothing
        weights: [1.0]
        per_scale_loss:
          _target_: anemoi.training.losses.CRPS
          scalers: ['pressure_level', 'general_variable', 'nan_mask_weights', 'node_weights', 'time_steps']
          ignore_nans: False
          no_autocast: True
          alpha: 0.95
      - _target_: anemoi.training.losses.aggregate.TimeAggregateLossWrapper
        scalers: ['pressure_level', 'general_variable', 'nan_mask_weights', 'node_weights']
        time_aggregation_types: [mean, max, min, diff]
        loss_fn:
          _target_: anemoi.training.losses.CRPS
          ignore_nans: False
          no_autocast: True
          alpha: 0.95
@@ -0,0 +1,9 @@
datasets:
  data:
    _target_: anemoi.training.losses.MSELoss
    # Scalers to include in loss calculation
    # A selection of available scalers are listed in training/scalers.
    # '*' is a valid entry to use all `scalers` given, if a scaler is to be excluded
    # add `!scaler_name`, i.e. ['*', '!scaler_1'], and `scaler_1` will not be added.
    scalers: ['pressure_level', 'general_variable', 'node_weights', 'time_steps']
    ignore_nans: False
@@ -0,0 +1,19 @@
datasets:
  data:
    _target_: anemoi.training.losses.combined.CombinedLoss
    ignore_nans: False
    # loss_weights: [n_timesteps / (n_timesteps + n_agg_ops), n_agg_ops / (n_timesteps + n_agg_ops)]
    # Each sub-loss averages internally (raw over timesteps, aggregate over agg ops).
    # These weights re-scale so the total matches: sum_all / (n_timesteps + n_agg_ops).
    # Example for 6 timesteps and 4 agg ops: [0.6, 0.4]
    loss_weights: [0.6, 0.4]
    losses:
      - _target_: anemoi.training.losses.MSELoss
        scalers: ['pressure_level', 'general_variable', 'node_weights', 'time_steps']
        ignore_nans: False
      - _target_: anemoi.training.losses.aggregate.TimeAggregateLossWrapper
        scalers: ['pressure_level', 'general_variable', 'node_weights']
        time_aggregation_types: [mean, max, min, diff]
        loss_fn:
          _target_: anemoi.training.losses.MSELoss
          ignore_nans: False