@@ -64,26 +64,26 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
      for x, y in zip(X, Y):
          y_hat = model(x).squeeze(dim=1)  # shape: [16]
          loss = loss_fn(y_hat, y)  # shape: [] (scalar)
-         optimizer.zero_grad()
          loss.backward()


          optimizer.step()
+         optimizer.zero_grad()

   In this baseline example, the update may negatively affect the loss of some elements of the
   batch.
.. tab-item:: autojac

   .. code-block:: python
-     :emphasize-lines: 5-6, 12, 16, 21, 23
+     :emphasize-lines: 5-6, 12, 16, 21-23

      import torch
      from torch.nn import Linear, MSELoss, ReLU, Sequential
      from torch.optim import SGD

      from torchjd.aggregation import UPGrad
-     from torchjd.autojac import backward
+     from torchjd.autojac import backward, jac_to_grad

      X = torch.randn(8, 16, 10)
      Y = torch.randn(8, 16)
@@ -99,19 +99,19 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
      for x, y in zip(X, Y):
          y_hat = model(x).squeeze(dim=1)  # shape: [16]
          losses = loss_fn(y_hat, y)  # shape: [16]
-         optimizer.zero_grad()
-         backward(losses, aggregator)
-
+         backward(losses)
+         jac_to_grad(model.parameters(), aggregator)

          optimizer.step()
+         optimizer.zero_grad()

   Here, we compute the Jacobian of the per-sample losses with respect to the model parameters
   and use it to update the model such that no loss from the batch is (locally) increased.
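As a side note, here is a minimal NumPy sketch of what conflict-avoiding aggregation of Jacobian rows means. This is not torchjd's UPGrad itself, but the simpler min-norm convex combination (in the style of MGDA) applied to two made-up gradients, which is enough to show the key property:

```python
import numpy as np

# Two hypothetical per-sample gradients (rows of a Jacobian),
# chosen so that their plain mean conflicts with g2.
g1 = np.array([4.0, 1.0])
g2 = np.array([-2.0, 1.0])

mean = (g1 + g2) / 2.0
# Moving along the mean would locally increase the second loss:
print(mean @ g2)  # -1.0 (negative: conflict)

# Min-norm convex combination w*g1 + (1-w)*g2
# (closed form for two vectors, clamped to [0, 1]).
w = (g2 - g1) @ g2 / ((g1 - g2) @ (g1 - g2))
w = min(max(w, 0.0), 1.0)
agg = w * g1 + (1.0 - w) * g2

# The aggregated direction conflicts with neither gradient:
print(agg @ g1, agg @ g2)  # both non-negative
```

The aggregators in `torchjd.aggregation` generalize this idea to arbitrary numbers of rows, with different guarantees per aggregator.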
.. tab-item:: autogram (recommended)

   .. code-block:: python
-     :emphasize-lines: 5-6, 12, 16-17, 21, 23-25
+     :emphasize-lines: 5-6, 12, 16-17, 21-24

      import torch
      from torch.nn import Linear, MSELoss, ReLU, Sequential
@@ -134,11 +134,11 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
      for x, y in zip(X, Y):
          y_hat = model(x).squeeze(dim=1)  # shape: [16]
          losses = loss_fn(y_hat, y)  # shape: [16]
-         optimizer.zero_grad()
          gramian = engine.compute_gramian(losses)  # shape: [16, 16]
          weights = weighting(gramian)  # shape: [16]
          losses.backward(weights)
          optimizer.step()
+         optimizer.zero_grad()

   Here, the per-sample gradients are never fully stored in memory, leading to large
   improvements in memory usage and speed compared to autojac, in most practical cases. The
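To see where the saving comes from: for m losses and p parameters, the full Jacobian J has m × p entries, while the Gramian G = JJᵀ has only m × m, and the final update direction Jᵀw can be accumulated by a single weighted backward pass. A small NumPy sketch of the shapes involved (illustrative sizes; here J is materialized only to demonstrate the relationship, which is precisely what autogram avoids doing):

```python
import numpy as np

m, p = 16, 100_000  # 16 per-sample losses, many parameters (made-up sizes)
rng = np.random.default_rng(0)
J = rng.standard_normal((m, p))  # full Jacobian: m * p entries

G = J @ J.T                 # Gramian: only m * m entries
weights = np.ones(m) / m    # stand-in for weighting(gramian)

# The update direction J.T @ weights is what losses.backward(weights)
# accumulates into .grad, without ever storing J in full.
update = J.T @ weights
print(G.shape, update.shape)  # (16, 16) (100000,)
```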