Skip to content

Commit 654dd88

Browse files
authored
Improve sequential differentiation (#222)
* Remove some tested parameters from test_value_is_correct to make it faster * Add parametrization of chunk_size in test_value_is_correct and in most test_jac tests * Add test_tensor_used_multiple_times in test_backward.py * Change tests with retains_grad to make them have m>1 * Change chunk_size from 3 to 2 in test_value_is_correct * Change the implementation of Jac to always make the chunks ourselves and use vmap only if necessary * Change the role of retain_graph to apply to the last differentiation * Add changelog entries --------- Co-authored-by: Pierre Quinton <pierre.quinton@epfl.ch>
1 parent 645629c commit 654dd88

File tree

8 files changed

+151
-142
lines changed

8 files changed

+151
-142
lines changed

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,18 @@ changes that do not affect the user.
88

99
## [Unreleased]
1010

11+
### Changed
12+
13+
- Changed how the Jacobians are computed when calling `backward` or `mtl_backward` with
14+
`parallel_chunk_size=1` to not rely on `torch.vmap` in this case. Whenever `vmap` does
15+
not support something (compiled functions, RNN on CUDA, etc.), users should now be able to avoid
16+
using `vmap` by calling `backward` or `mtl_backward` with `parallel_chunk_size=1`.
17+
18+
- Changed the effect of the parameter `retain_graph` of `backward` and `mtl_backward`. When set to
19+
`False`, it now frees the graph only after all gradients have been computed. In most cases, users
20+
should now leave the default value `retain_graph=False`, no matter what the value of
21+
`parallel_chunk_size` is. This will reduce the memory overhead.
22+
1123
## [0.3.1] - 2024-12-21
1224

1325
### Changed

src/torchjd/autojac/_transform/jac.py

Lines changed: 51 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
import math
2+
from functools import partial
13
from itertools import accumulate
2-
from typing import Iterable, Sequence
4+
from typing import Callable, Iterable, Sequence
35

46
import torch
57
from torch import Size, Tensor
@@ -56,30 +58,67 @@ def _differentiate(self, jac_outputs: Sequence[Tensor]) -> tuple[Tensor, ...]:
5658
]
5759
)
5860

59-
def get_vjp(grad_outputs: Sequence[Tensor]) -> Tensor:
61+
def _get_vjp(grad_outputs: Sequence[Tensor], retain_graph: bool) -> Tensor:
6062
optional_grads = torch.autograd.grad(
6163
outputs,
6264
inputs,
6365
grad_outputs=grad_outputs,
64-
retain_graph=self.retain_graph,
66+
retain_graph=retain_graph,
6567
create_graph=self.create_graph,
6668
allow_unused=True,
6769
)
6870
grads = _materialize(optional_grads, inputs=inputs)
6971
return torch.concatenate([grad.reshape([-1]) for grad in grads])
7072

71-
# Because of a limitation of vmap, this breaks when some tensors have `retains_grad=True`.
72-
# See https://pytorch.org/functorch/stable/ux_limitations.html for more information.
73-
# This also breaks when some tensors have been produced by compiled functions.
74-
grouped_jacobian_matrix = torch.vmap(get_vjp, chunk_size=self.chunk_size)(jac_outputs)
75-
73+
# By the Jacobians constraint, this value should be the same for all jac_outputs.
74+
m = jac_outputs[0].shape[0]
75+
max_chunk_size = self.chunk_size if self.chunk_size is not None else m
76+
n_chunks = math.ceil(m / max_chunk_size)
77+
78+
# List of tensors of shape [k_i, n] where the k_i's sum to m
79+
jac_matrix_chunks = []
80+
81+
# First differentiations: always retain graph
82+
get_vjp_retain = partial(_get_vjp, retain_graph=True)
83+
for i in range(n_chunks - 1):
84+
start = i * max_chunk_size
85+
end = (i + 1) * max_chunk_size
86+
jac_outputs_chunk = [jac_output[start:end] for jac_output in jac_outputs]
87+
jac_matrix_chunks.append(_get_jac_matrix_chunk(jac_outputs_chunk, get_vjp_retain))
88+
89+
# Last differentiation: retain the graph only if self.retain_graph==True
90+
get_vjp_last = partial(_get_vjp, retain_graph=self.retain_graph)
91+
start = (n_chunks - 1) * max_chunk_size
92+
jac_outputs_chunk = [jac_output[start:] for jac_output in jac_outputs]
93+
jac_matrix_chunks.append(_get_jac_matrix_chunk(jac_outputs_chunk, get_vjp_last))
94+
95+
jac_matrix = torch.vstack(jac_matrix_chunks)
7696
lengths = [input.numel() for input in inputs]
77-
jacobian_matrices = _extract_sub_matrices(grouped_jacobian_matrix, lengths)
97+
jac_matrices = _extract_sub_matrices(jac_matrix, lengths)
7898

7999
shapes = [input.shape for input in inputs]
80-
jacobians = _reshape_matrices(jacobian_matrices, shapes)
81-
82-
return tuple(jacobians)
100+
jacs = _reshape_matrices(jac_matrices, shapes)
101+
102+
return tuple(jacs)
103+
104+
105+
def _get_jac_matrix_chunk(
106+
jac_outputs_chunk: list[Tensor], get_vjp: Callable[[Sequence[Tensor]], Tensor]
107+
) -> Tensor:
108+
"""
109+
Computes the jacobian matrix chunk corresponding to the provided get_vjp function, either by
110+
calling get_vjp directly or by wrapping it into a call to ``torch.vmap``, depending on the shape
111+
of the provided ``jac_outputs_chunk``. Because of the numerous issues of vmap, we use it only if
112+
necessary (i.e. when the ``jac_outputs_chunk`` have more than 1 row).
113+
"""
114+
115+
chunk_size = jac_outputs_chunk[0].shape[0]
116+
if chunk_size == 1:
117+
grad_outputs = [tensor.squeeze() for tensor in jac_outputs_chunk]
118+
gradient_vector = get_vjp(grad_outputs)
119+
return gradient_vector.unsqueeze(0)
120+
else:
121+
return torch.vmap(get_vjp, chunk_size=chunk_size)(jac_outputs_chunk)
83122

84123

85124
def _extract_sub_matrices(matrix: Tensor, lengths: Sequence[int]) -> list[Tensor]:

src/torchjd/autojac/_utils.py

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,6 @@ def _as_tensor_list(tensors: Sequence[Tensor] | Tensor) -> list[Tensor]:
2121
return output
2222

2323

24-
def _check_retain_graph_compatible_with_chunk_size(
25-
tensors: list[Tensor],
26-
retain_graph: bool,
27-
parallel_chunk_size: int | None,
28-
) -> None:
29-
tensors_numel = sum([tensor.numel() for tensor in tensors])
30-
if parallel_chunk_size is not None and parallel_chunk_size < tensors_numel and not retain_graph:
31-
raise ValueError(
32-
"When using `retain_graph=False`, parameter `parallel_chunk_size` must be `None` or "
33-
"large enough to compute all gradients in parallel."
34-
)
35-
36-
3724
def _get_leaf_tensors(tensors: Iterable[Tensor], excluded: Iterable[Tensor]) -> set[Tensor]:
3825
"""
3926
Gets the leaves of the autograd graph of all specified ``tensors``.

src/torchjd/autojac/backward.py

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,7 @@
55
from torchjd.aggregation import Aggregator
66

77
from ._transform import Accumulate, Aggregate, Diagonalize, EmptyTensorDict, Init, Jac
8-
from ._utils import (
9-
_as_tensor_list,
10-
_check_optional_positive_chunk_size,
11-
_check_retain_graph_compatible_with_chunk_size,
12-
_get_leaf_tensors,
13-
)
8+
from ._utils import _as_tensor_list, _check_optional_positive_chunk_size, _get_leaf_tensors
149

1510

1611
def backward(
@@ -37,8 +32,7 @@ def backward(
3732
backward pass. If set to ``None``, all coordinates of ``tensors`` will be differentiated in
3833
parallel at once. If set to ``1``, all coordinates will be differentiated sequentially. A
3934
larger value results in faster differentiation, but also higher memory usage. Defaults to
40-
``None``. If ``parallel_chunk_size`` is not large enough to differentiate all tensors
41-
simultaneously, ``retain_graph`` has to be set to ``True``.
35+
``None``.
4236
4337
.. admonition::
4438
Example
@@ -64,13 +58,13 @@ def backward(
6458
:math:`\begin{bmatrix}y_1 \\ y_2\end{bmatrix}` with respect to ``param``.
6559
6660
.. warning::
67-
``backward`` relies on a usage of ``torch.vmap`` that is not compatible with compiled
68-
functions. The arguments of ``backward`` should thus not come from a compiled model. Check
69-
https://github.com/pytorch/pytorch/issues/138422 for the status of this issue.
70-
71-
.. warning::
72-
Because of a limitation of ``torch.vmap``, tensors in the computation graph of the
73-
``tensors`` parameter should not have their ``retains_grad`` parameter set to ``True``.
61+
To differentiate in parallel, ``backward`` relies on ``torch.vmap``, which has some
62+
limitations: `it does not work on the output of compiled functions
63+
<https://github.com/pytorch/pytorch/issues/138422>`_, `when some tensors have
64+
<https://github.com/TorchJD/torchjd/issues/184>`_ ``retains_grad=True`` or `when using an
65+
RNN on CUDA <https://github.com/TorchJD/torchjd/issues/220>`_, for instance. If you
66+
experience issues with ``backward`` try to use ``parallel_chunk_size=1`` to avoid relying on
67+
``torch.vmap``.
7468
"""
7569
_check_optional_positive_chunk_size(parallel_chunk_size)
7670

@@ -79,8 +73,6 @@ def backward(
7973
if len(tensors) == 0:
8074
raise ValueError("`tensors` cannot be empty")
8175

82-
_check_retain_graph_compatible_with_chunk_size(tensors, retain_graph, parallel_chunk_size)
83-
8476
if inputs is None:
8577
inputs = _get_leaf_tensors(tensors=tensors, excluded=set())
8678
else:

src/torchjd/autojac/mtl_backward.py

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,7 @@
1616
Stack,
1717
Transform,
1818
)
19-
from ._utils import (
20-
_as_tensor_list,
21-
_check_optional_positive_chunk_size,
22-
_check_retain_graph_compatible_with_chunk_size,
23-
_get_leaf_tensors,
24-
)
19+
from ._utils import _as_tensor_list, _check_optional_positive_chunk_size, _get_leaf_tensors
2520

2621

2722
def mtl_backward(
@@ -60,8 +55,7 @@ def mtl_backward(
6055
backward pass. If set to ``None``, all coordinates of ``tensors`` will be differentiated in
6156
parallel at once. If set to ``1``, all coordinates will be differentiated sequentially. A
6257
larger value results in faster differentiation, but also higher memory usage. Defaults to
63-
``None``. If ``parallel_chunk_size`` is not large enough to differentiate all tensors
64-
simultaneously, ``retain_graph`` has to be set to ``True``.
58+
``None``.
6559
6660
.. admonition::
6761
Example
@@ -75,13 +69,13 @@ def mtl_backward(
7569
respect to those parameters will be accumulated into their ``.grad`` fields.
7670
7771
.. warning::
78-
``mtl_backward`` relies on a usage of ``torch.vmap`` that is not compatible with compiled
79-
functions. The arguments of ``mtl_backward`` should thus not come from a compiled model.
80-
Check https://github.com/pytorch/pytorch/issues/138422 for the status of this issue.
81-
82-
.. warning::
83-
Because of a limitation of ``torch.vmap``, tensors in the computation graph of the
84-
``features`` parameter should not have their ``retains_grad`` parameter set to ``True``.
72+
To differentiate in parallel, ``mtl_backward`` relies on ``torch.vmap``, which has some
73+
limitations: `it does not work on the output of compiled functions
74+
<https://github.com/pytorch/pytorch/issues/138422>`_, `when some tensors have
75+
<https://github.com/TorchJD/torchjd/issues/184>`_ ``retains_grad=True`` or `when using an
76+
RNN on CUDA <https://github.com/TorchJD/torchjd/issues/220>`_, for instance. If you
77+
experience issues with ``mtl_backward``, try to use ``parallel_chunk_size=1`` to avoid relying on
78+
``torch.vmap``.
8579
"""
8680

8781
_check_optional_positive_chunk_size(parallel_chunk_size)
@@ -96,7 +90,6 @@ def mtl_backward(
9690
if len(features) == 0:
9791
raise ValueError("`features` cannot be empty.")
9892

99-
_check_retain_graph_compatible_with_chunk_size(features, retain_graph, parallel_chunk_size)
10093
_check_no_overlap(shared_params, tasks_params)
10194
_check_losses_are_scalar(losses)
10295

tests/unit/autojac/_transform/test_jac.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
import torch
2-
from pytest import raises
2+
from pytest import mark, raises
33
from unit.conftest import DEVICE
44

55
from torchjd.autojac._transform import Jac, Jacobians
66

77
from ._dict_assertions import assert_tensor_dicts_are_close
88

99

10-
def test_single_input():
10+
@mark.parametrize("chunk_size", [1, 3, None])
11+
def test_single_input(chunk_size: int | None):
1112
"""
1213
Tests that the Jac transform works correctly for an example of multiple differentiation. Here,
1314
the function considered is: `y = [a1 * x, a2 * x]`. We want to compute the jacobians of `y` with
@@ -20,7 +21,7 @@ def test_single_input():
2021
y = torch.stack([a1 * x, a2 * x])
2122
input = Jacobians({y: torch.eye(2, device=DEVICE)})
2223

23-
jac = Jac(outputs=[y], inputs=[a1, a2], chunk_size=None)
24+
jac = Jac(outputs=[y], inputs=[a1, a2], chunk_size=chunk_size)
2425

2526
jacobians = jac(input)
2627
expected_jacobians = {
@@ -31,7 +32,8 @@ def test_single_input():
3132
assert_tensor_dicts_are_close(jacobians, expected_jacobians)
3233

3334

34-
def test_empty_inputs_1():
35+
@mark.parametrize("chunk_size", [1, 3, None])
36+
def test_empty_inputs_1(chunk_size: int | None):
3537
"""
3638
Tests that the Jac transform works correctly when the `inputs` parameter is an empty `Iterable`.
3739
"""
@@ -41,15 +43,16 @@ def test_empty_inputs_1():
4143
y = torch.stack([y1, y2])
4244
input = Jacobians({y: torch.eye(2, device=DEVICE)})
4345

44-
jac = Jac(outputs=[y], inputs=[], chunk_size=None)
46+
jac = Jac(outputs=[y], inputs=[], chunk_size=chunk_size)
4547

4648
jacobians = jac(input)
4749
expected_jacobians = {}
4850

4951
assert_tensor_dicts_are_close(jacobians, expected_jacobians)
5052

5153

52-
def test_empty_inputs_2():
54+
@mark.parametrize("chunk_size", [1, 3, None])
55+
def test_empty_inputs_2(chunk_size: int | None):
5356
"""
5457
Tests that the Jac transform works correctly when the `inputs` parameter is an empty `Iterable`.
5558
"""
@@ -62,7 +65,7 @@ def test_empty_inputs_2():
6265
y = torch.stack([y1, y2])
6366
input = Jacobians({y: torch.eye(2, device=DEVICE)})
6467

65-
jac = Jac(outputs=[y], inputs=[], chunk_size=None)
68+
jac = Jac(outputs=[y], inputs=[], chunk_size=chunk_size)
6669

6770
jacobians = jac(input)
6871
expected_jacobians = {}
@@ -122,7 +125,8 @@ def test_two_levels():
122125
assert_tensor_dicts_are_close(jacobians, expected_jacobians)
123126

124127

125-
def test_multiple_outputs_1():
128+
@mark.parametrize("chunk_size", [1, 3, None])
129+
def test_multiple_outputs_1(chunk_size: int | None):
126130
"""
127131
Tests that the Jac transform works correctly when the `outputs` contains 3 vectors.
128132
The input (jac_outputs) is not the same for all outputs, so that this test also checks that the
@@ -143,7 +147,7 @@ def test_multiple_outputs_1():
143147
jac_output3 = torch.cat([zeros_2x2, zeros_2x2, identity_2x2])
144148
input = Jacobians({y1: jac_output1, y2: jac_output2, y3: jac_output3})
145149

146-
jac = Jac(outputs=[y1, y2, y3], inputs=[a1, a2], chunk_size=None)
150+
jac = Jac(outputs=[y1, y2, y3], inputs=[a1, a2], chunk_size=chunk_size)
147151

148152
jacobians = jac(input)
149153
zero_scalar = torch.tensor(0.0, device=DEVICE)
@@ -155,7 +159,8 @@ def test_multiple_outputs_1():
155159
assert_tensor_dicts_are_close(jacobians, expected_jacobians)
156160

157161

158-
def test_multiple_outputs_2():
162+
@mark.parametrize("chunk_size", [1, 3, None])
163+
def test_multiple_outputs_2(chunk_size: int | None):
159164
"""
160165
Same as test_multiple_outputs_1 but with different jac_outputs, so the returned jacobians are of
161166
different shapes.
@@ -175,7 +180,7 @@ def test_multiple_outputs_2():
175180
jac_output3 = torch.stack([zeros_2, zeros_2, ones_2])
176181
input = Jacobians({y1: jac_output1, y2: jac_output2, y3: jac_output3})
177182

178-
jac = Jac(outputs=[y1, y2, y3], inputs=[a1, a2], chunk_size=None)
183+
jac = Jac(outputs=[y1, y2, y3], inputs=[a1, a2], chunk_size=chunk_size)
179184

180185
jacobians = jac(input)
181186
expected_jacobians = {

0 commit comments

Comments
 (0)