Monitoring aggregations
=======================

The :doc:`Aggregator <../docs/aggregation/bases>` class is a subclass of :class:`torch.nn.Module`.
This allows registering hooks, which can be used to monitor some information about aggregations.
The full code example below demonstrates registering a hook that computes and prints the cosine
similarity between the aggregation performed by :doc:`UPGrad <../docs/aggregation/upgrad>` and the
average of the gradients, and another hook that computes and prints the weights extracted by the
weighting of :doc:`UPGrad <../docs/aggregation/upgrad>`.

Updating the parameters of the model with the average gradient is equivalent to performing
gradient descent on the average of the losses. A cosine similarity smaller than 1 therefore means
that Jacobian descent is doing something different from gradient descent. With
:doc:`UPGrad <../docs/aggregation/upgrad>`, this happens when the original gradients conflict (i.e.
when they have a negative inner product), as the standalone sketch below illustrates.
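
To see such a conflict in isolation, here is a minimal sketch that applies
:doc:`UPGrad <../docs/aggregation/upgrad>` directly to a small Jacobian whose two rows have a
negative inner product (the values are arbitrary and serve only as an illustration):

.. code-block:: python

    import torch
    from torch.nn.functional import cosine_similarity

    from torchjd.aggregation import UPGrad

    # Two conflicting gradients: their inner product is -4*6 + 1 + 1 = -22 < 0.
    jacobian = torch.tensor([[-4.0, 1.0, 1.0], [6.0, 1.0, 1.0]])

    aggregator = UPGrad()
    aggregation = aggregator(jacobian)  # UPGrad's combination of the two rows
    average = jacobian.mean(dim=0)  # What gradient descent on the mean loss would use

    # Because the rows conflict, this similarity is strictly smaller than 1.
    print(f"Cosine similarity: {cosine_similarity(aggregation, average, dim=0).item():.4f}")

The full example below combines both hooks in a complete multi-task training setting.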

.. code-block:: python
    :emphasize-lines: 9-11, 13-18, 33-34

    import torch
    from torch.nn import Linear, MSELoss, ReLU, Sequential
    from torch.optim import SGD
    from torch.nn.functional import cosine_similarity

    from torchjd import mtl_backward
    from torchjd.aggregation import UPGrad

    def print_weights(_, __, weights: torch.Tensor) -> None:
        """Prints the extracted weights."""
        print(f"Weights: {weights}")

    def print_similarity_with_gd(_, inputs: tuple[torch.Tensor, ...], aggregation: torch.Tensor) -> None:
        """Prints the cosine similarity between the aggregation and the average gradient."""
        matrix = inputs[0]  # The Jacobian matrix passed to the aggregator
        gd_output = matrix.mean(dim=0)  # The average of its rows, i.e. the average gradient
        similarity = cosine_similarity(aggregation, gd_output, dim=0)
        print(f"Cosine similarity: {similarity.item():.4f}")

    shared_module = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
    task1_module = Linear(3, 1)
    task2_module = Linear(3, 1)
    params = [
        *shared_module.parameters(),
        *task1_module.parameters(),
        *task2_module.parameters(),
    ]

    loss_fn = MSELoss()
    optimizer = SGD(params, lr=0.1)
    aggregator = UPGrad()

    aggregator.weighting.register_forward_hook(print_weights)
    aggregator.register_forward_hook(print_similarity_with_gd)

    inputs = torch.randn(8, 16, 10)  # 8 batches of 16 random input vectors of length 10
    task1_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the first task
    task2_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the second task

    for input, target1, target2 in zip(inputs, task1_targets, task2_targets):
        features = shared_module(input)
        output1 = task1_module(features)
        output2 = task2_module(features)
        loss1 = loss_fn(output1, target1)
        loss2 = loss_fn(output2, target2)

        optimizer.zero_grad()
        mtl_backward(losses=[loss1, loss2], features=features, aggregator=aggregator)
        optimizer.step()
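
If you only want to monitor a few steps, note that
:meth:`~torch.nn.Module.register_forward_hook` returns a
:class:`torch.utils.hooks.RemovableHandle` that can be used to detach the hook again. A minimal
sketch, reusing the ``aggregator`` and hook defined in the example above:

.. code-block:: python

    # register_forward_hook returns a torch.utils.hooks.RemovableHandle.
    handle = aggregator.register_forward_hook(print_similarity_with_gd)

    # ... run some training steps with monitoring enabled ...

    handle.remove()  # The hook no longer fires on subsequent aggregations.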