Commit b1aaee9

Move newly added functions
1 parent 4f24e39 commit b1aaee9

16 files changed

Lines changed: 42 additions & 63 deletions

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ changelog does not include internal changes that do not affect the user.
 
 - **BREAKING**: Removed from `backward` and `mtl_backward` the responsibility to aggregate the
   Jacobian. Now, these functions compute and populate the `.jac` fields of the parameters, and a new
-  function `torchjd.utils.jac_to_grad` should then be called to aggregate those `.jac` fields into
+  function `torchjd.autojac.jac_to_grad` should then be called to aggregate those `.jac` fields into
   `.grad` fields.
   This means that users now have more control on what they do with the Jacobians (they can easily
   aggregate them group by group or even param by param if they want), but it now requires an extra

docs/source/examples/amp.rst

Lines changed: 2 additions & 3 deletions
@@ -12,16 +12,15 @@ case, the losses) should preferably be scaled with a `GradScaler
 following example shows the resulting code for a multi-task learning use-case.
 
 .. code-block:: python
-    :emphasize-lines: 2, 18, 28, 35-36, 38-39
+    :emphasize-lines: 2, 17, 27, 34-35, 37-38
 
     import torch
     from torch.amp import GradScaler
     from torch.nn import Linear, MSELoss, ReLU, Sequential
     from torch.optim import SGD
 
     from torchjd.aggregation import UPGrad
-    from torchjd.autojac import mtl_backward
-    from torchjd.utils import jac_to_grad
+    from torchjd.autojac import mtl_backward, jac_to_grad
 
     shared_module = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
     task1_module = Linear(3, 1)
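The amp example relies on a `GradScaler`, which multiplies the losses by a scale factor before the backward pass and divides the resulting gradients by the same factor before the optimizer step, so that small gradients survive float16 rounding. A torch-free numeric sketch of that invariant (names are illustrative; the real `GradScaler` also handles inf/nan checks and dynamic scale updates):

```python
SCALE = 2.0 ** 16  # a typical initial scale for float16 training


def grad_of_loss(w, x, y):
    # d/dw of (w * x - y)^2
    return 2.0 * x * (w * x - y)


def grad_of_scaled_loss(w, x, y, scale):
    # Scaling the loss scales its gradient by the same factor.
    return scale * grad_of_loss(w, x, y)


def unscale(grad, scale):
    # GradScaler-style unscaling before the optimizer step.
    return grad / scale


w, x, y = 0.5, 3.0, 2.0
scaled = grad_of_scaled_loss(w, x, y, SCALE)
# With a power-of-two scale, unscaling recovers the gradient exactly.
assert unscale(scaled, SCALE) == grad_of_loss(w, x, y)
```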

docs/source/examples/basic_usage.rst

Lines changed: 1 addition & 1 deletion
@@ -19,8 +19,8 @@ Import several classes from ``torch`` and ``torchjd``:
     from torch.optim import SGD
 
     from torchjd import autojac
     from torchjd.aggregation import UPGrad
-    from torchjd.utils import jac_to_grad
+    from torchjd.autojac import jac_to_grad
 
 Define the model and the optimizer, as usual:
 

docs/source/examples/iwrm.rst

Lines changed: 3 additions & 6 deletions
@@ -50,7 +50,6 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
 
 
 
-
     X = torch.randn(8, 16, 10)
     Y = torch.randn(8, 16)
 
@@ -78,15 +77,14 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
     .. tab-item:: autojac
 
         .. code-block:: python
-            :emphasize-lines: 5-7, 13, 17, 22-24
+            :emphasize-lines: 5-6, 12, 16, 21-23
 
             import torch
             from torch.nn import Linear, MSELoss, ReLU, Sequential
             from torch.optim import SGD
 
             from torchjd.aggregation import UPGrad
-            from torchjd.autojac import backward
-            from torchjd.utils import jac_to_grad
+            from torchjd.autojac import backward, jac_to_grad
 
             X = torch.randn(8, 16, 10)
             Y = torch.randn(8, 16)
@@ -115,7 +113,7 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
     .. tab-item:: autogram (recommended)
 
         .. code-block:: python
-            :emphasize-lines: 5-6, 13, 17-18, 22-25
+            :emphasize-lines: 5-6, 12, 16-17, 21-24
 
             import torch
             from torch.nn import Linear, MSELoss, ReLU, Sequential
@@ -124,7 +122,6 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
             from torchjd.aggregation import UPGradWeighting
             from torchjd.autogram import Engine
 
-
            X = torch.randn(8, 16, 10)
            Y = torch.randn(8, 16)
 

docs/source/examples/lightning_integration.rst

Lines changed: 2 additions & 3 deletions
@@ -11,7 +11,7 @@ The following code example demonstrates a basic multi-task learning setup using
 <../docs/autojac/mtl_backward>` at each training iteration.
 
 .. code-block:: python
-    :emphasize-lines: 9-11, 19, 32-33
+    :emphasize-lines: 9-10, 18, 31-32
 
     import torch
     from lightning import LightningModule, Trainer
@@ -22,8 +22,7 @@ The following code example demonstrates a basic multi-task learning setup using
     from torch.utils.data import DataLoader, TensorDataset
 
     from torchjd.aggregation import UPGrad
-    from torchjd.autojac import mtl_backward
-    from torchjd.utils import jac_to_grad
+    from torchjd.autojac import mtl_backward, jac_to_grad
 
     class Model(LightningModule):
         def __init__(self):

docs/source/examples/monitoring.rst

Lines changed: 2 additions & 3 deletions
@@ -15,16 +15,15 @@ Jacobian descent is doing something different than gradient descent. With
 they have a negative inner product).
 
 .. code-block:: python
-    :emphasize-lines: 10-12, 14-19, 34-35
+    :emphasize-lines: 9-11, 13-18, 33-34
 
     import torch
     from torch.nn import Linear, MSELoss, ReLU, Sequential
     from torch.nn.functional import cosine_similarity
    from torch.optim import SGD
 
     from torchjd.aggregation import UPGrad
-    from torchjd.autojac import mtl_backward
-    from torchjd.utils import jac_to_grad
+    from torchjd.autojac import mtl_backward, jac_to_grad
 
     def print_weights(_, __, weights: torch.Tensor) -> None:
         """Prints the extracted weights."""

docs/source/examples/mtl.rst

Lines changed: 2 additions & 3 deletions
@@ -19,15 +19,14 @@ vectors of dimension 10, and their corresponding scalar labels for both tasks.
 
 
 .. code-block:: python
-    :emphasize-lines: 5-7, 20, 33-34
+    :emphasize-lines: 5-6, 19, 32-33
 
     import torch
     from torch.nn import Linear, MSELoss, ReLU, Sequential
     from torch.optim import SGD
 
     from torchjd.aggregation import UPGrad
-    from torchjd.autojac import mtl_backward
-    from torchjd.utils import jac_to_grad
+    from torchjd.autojac import mtl_backward, jac_to_grad
 
     shared_module = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
     task1_module = Linear(3, 1)

docs/source/examples/rnn.rst

Lines changed: 2 additions & 3 deletions
@@ -6,15 +6,14 @@ element of the output sequences. If the gradients of these losses are likely to
 descent can be leveraged to enhance optimization.
 
 .. code-block:: python
-    :emphasize-lines: 5-7, 11, 18, 20-21
+    :emphasize-lines: 5-6, 10, 17, 19-20
 
     import torch
     from torch.nn import RNN
     from torch.optim import SGD
 
     from torchjd.aggregation import UPGrad
-    from torchjd.autojac import backward
-    from torchjd.utils import jac_to_grad
+    from torchjd.autojac import backward, jac_to_grad
 
     rnn = RNN(input_size=10, hidden_size=20, num_layers=2)
     optimizer = SGD(rnn.parameters(), lr=0.1)

src/torchjd/autojac/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -6,6 +6,7 @@
 """
 
 from ._backward import backward
+from ._jac_to_grad import jac_to_grad
 from ._mtl_backward import mtl_backward
 
-__all__ = ["backward", "mtl_backward"]
+__all__ = ["backward", "jac_to_grad", "mtl_backward"]
Lines changed: 11 additions & 3 deletions
@@ -3,10 +3,18 @@
 
 from torch import Tensor
 
-from torchjd.utils._tensor_with_jac import TensorWithJac
+
+class TensorWithJac(Tensor):
+    """
+    Tensor known to have a populated jac field.
+
+    Should not be directly instantiated, but can be used as a type hint and can be cast to.
+    """
+
+    jac: Tensor
 
 
-def _accumulate_jacs(params: Iterable[Tensor], jacobians: Iterable[Tensor]) -> None:
+def accumulate_jacs(params: Iterable[Tensor], jacobians: Iterable[Tensor]) -> None:
     for param, jac in zip(params, jacobians, strict=True):
         _check_expects_grad(param)
         # We check that the shape is correct to be consistent with torch, which checks that the grad
@@ -34,7 +42,7 @@ def _accumulate_jacs(params: Iterable[Tensor], jacobians: Iterable[Tensor]) -> N
         param.__setattr__("jac", jac)
 
 
-def _accumulate_grads(params: Iterable[Tensor], gradients: Iterable[Tensor]) -> None:
+def accumulate_grads(params: Iterable[Tensor], gradients: Iterable[Tensor]) -> None:
     for param, grad in zip(params, gradients, strict=True):
         _check_expects_grad(param)
         if hasattr(param, "grad") and param.grad is not None:
