Skip to content

Commit d5ed6f3

Browse files
authored
docs(autojac): Document Transforms (#326)
1 parent aa87f95 commit d5ed6f3

File tree

10 files changed

+170
-49
lines changed

10 files changed

+170
-49
lines changed

src/torchjd/autojac/_transform/_differentiate.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,28 @@
99

1010

1111
class Differentiate(Transform[_A, _A], ABC):
12+
"""
13+
Abstract base class for transforms responsible for differentiating some outputs with respect to
14+
some inputs.
15+
16+
:param outputs: Tensors to differentiate.
17+
:param inputs: Tensors with respect to which we differentiate.
18+
:param retain_graph: If False, the graph used to compute the grads will be freed.
19+
:param create_graph: If True, graph of the derivative will be constructed, allowing to compute
20+
higher order derivative products.
21+
22+
.. note:: The order of outputs and inputs only matters because we have no guarantee that
23+
torch.autograd.grad is *exactly* equivariant to input permutations and invariant to output
24+
(with their corresponding grad_output) permutations.
25+
"""
26+
1227
def __init__(
1328
self,
1429
outputs: OrderedSet[Tensor],
1530
inputs: OrderedSet[Tensor],
1631
retain_graph: bool,
1732
create_graph: bool,
1833
):
19-
# The order of outputs and inputs only matters because we have no guarantee that
20-
# torch.autograd.grad is *exactly* equivariant to input permutations and invariant to
21-
# output (with their corresponding grad_output) permutations.
22-
2334
self.outputs = list(outputs)
2435
self.inputs = list(inputs)
2536
self.retain_graph = retain_graph

src/torchjd/autojac/_transform/accumulate.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,9 @@
55

66

77
class Accumulate(Transform[Gradients, EmptyTensorDict]):
8-
def __call__(self, gradients: Gradients) -> EmptyTensorDict:
9-
"""
10-
Accumulates gradients with respect to keys in their ``.grad`` field.
11-
"""
8+
"""Transform that accumulates gradients with respect to keys into their ``grad`` field."""
129

10+
def __call__(self, gradients: Gradients) -> EmptyTensorDict:
1311
for key in gradients.keys():
1412
_check_expects_grad(key)
1513
if hasattr(key, "grad") and key.grad is not None:

src/torchjd/autojac/_transform/aggregate.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@
1515

1616

1717
class Aggregate(Transform[Jacobians, Gradients]):
18+
"""
19+
Transform aggregating Jacobians into Gradients.
20+
21+
It does so by reshaping these Jacobians into matrices, concatenating them into a single matrix,
22+
applying an aggregator to it, separating the result back into one gradient vector per key, and
23+
finally reshaping those into gradients of the same shape as their corresponding keys.
24+
25+
:param aggregator: The aggregator used to aggregate the concatenated jacobian matrix.
26+
:param key_order: Order in which the different jacobian matrices must be concatenated.
27+
"""
28+
1829
def __init__(self, aggregator: Aggregator, key_order: OrderedSet[Tensor]):
1930
matrixify = _Matrixify()
2031
aggregate_matrices = _AggregateMatrices(aggregator, key_order)
@@ -31,6 +42,16 @@ def check_keys(self, input_keys: set[Tensor]) -> set[Tensor]:
3142

3243

3344
class _AggregateMatrices(Transform[JacobianMatrices, GradientVectors]):
45+
"""
46+
Transform aggregating JacobianMatrices into GradientVectors.
47+
48+
It does so by concatenating the matrices into a single matrix, applying an aggregator to it and
49+
separating the result back into one gradient vector per key.
50+
51+
:param aggregator: The aggregator used to aggregate the concatenated jacobian matrix.
52+
:param key_order: Order in which the different jacobian matrices must be concatenated.
53+
"""
54+
3455
def __init__(self, aggregator: Aggregator, key_order: OrderedSet[Tensor]):
3556
self.key_order = key_order
3657
self.aggregator = aggregator
@@ -112,6 +133,8 @@ def _disunite(
112133

113134

114135
class _Matrixify(Transform[Jacobians, JacobianMatrices]):
136+
"""Transform reshaping Jacobians into JacobianMatrices."""
137+
115138
def __call__(self, jacobians: Jacobians) -> JacobianMatrices:
116139
jacobian_matrices = {
117140
key: jacobian.view(jacobian.shape[0], -1) for key, jacobian in jacobians.items()
@@ -123,6 +146,8 @@ def check_keys(self, input_keys: set[Tensor]) -> set[Tensor]:
123146

124147

125148
class _Reshape(Transform[GradientVectors, Gradients]):
149+
"""Transform reshaping GradientVectors into Gradients."""
150+
126151
def __call__(self, gradient_vectors: GradientVectors) -> Gradients:
127152
gradients = {
128153
key: gradient_vector.view(key.shape)

src/torchjd/autojac/_transform/base.py

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,25 +15,9 @@ class RequirementError(ValueError):
1515

1616

1717
class Transform(Generic[_B, _C], ABC):
18-
r"""
18+
"""
1919
Abstract base class for all transforms. Transforms are elementary building blocks of a jacobian
20-
descent backward phase. A transform maps a :class:`~torchjd.transform.tensor_dict.TensorDict` to
21-
another. The input :class:`~torchjd.transform.tensor_dict.TensorDict` has keys `required_keys`
22-
and the output :class:`~torchjd.transform.tensor_dict.TensorDict` has keys `output_keys`.
23-
24-
Formally a transform is a function:
25-
26-
.. math::
27-
f:\mathbb R^{n_1+\dots+n_p}\to \mathbb R^{m_1+\dots+m_q}
28-
29-
where we have ``p`` `required_keys`, ``q`` `output_keys`, ``n_i`` is the number of elements in
30-
the value associated to the ``i`` th `required_key` of the input
31-
:class:`~torchjd.transform.tensor_dict.TensorDict` and ``m_j`` is the number of elements in the
32-
value associated to the ``j`` th `output_key` of the output
33-
:class:`~torchjd.transform.tensor_dict.TensorDict`.
34-
35-
As they are mathematical functions, transforms can be composed together as long as their
36-
domains and range meaningfully match.
20+
descent backward phase. A transform maps a TensorDict to another.
3721
"""
3822

3923
def compose(self, other: Transform[_A, _B]) -> Transform[_A, _C]:
@@ -67,6 +51,13 @@ def check_keys(self, input_keys: set[Tensor]) -> set[Tensor]:
6751

6852

6953
class Composition(Transform[_A, _C]):
54+
"""
55+
Transform corresponding to the composition of two transforms inner and outer.
56+
57+
:param inner: The transform to apply first, to the input.
58+
:param outer: The transform to apply second, to the result of ``inner``.
59+
"""
60+
7061
def __init__(self, outer: Transform[_B, _C], inner: Transform[_A, _B]):
7162
self.outer = outer
7263
self.inner = inner
@@ -85,6 +76,13 @@ def check_keys(self, input_keys: set[Tensor]) -> set[Tensor]:
8576

8677

8778
class Conjunction(Transform[_A, _B]):
79+
"""
80+
Transform applying several transforms to the same input, and combining the results (by union)
81+
into a single TensorDict.
82+
83+
:param transforms: The transforms to apply. Their outputs should have disjoint sets of keys.
84+
"""
85+
8886
def __init__(self, transforms: Sequence[Transform[_A, _B]]):
8987
self.transforms = transforms
9088

src/torchjd/autojac/_transform/diagonalize.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,51 @@
77

88

99
class Diagonalize(Transform[Gradients, Jacobians]):
10+
"""
11+
Transform diagonalizing Gradients into Jacobians.
12+
13+
The first dimension of the returned Jacobians will be equal to the total number of elements in
14+
the tensors of the input tensor dict. The exact behavior of the diagonalization is best
15+
explained by some examples.
16+
17+
Example 1:
18+
The input is one tensor of shape [3] and of value [1 2 3].
19+
The output Jacobian will be:
20+
[[1 0 0]
21+
[0 2 0]
22+
[0 0 3]]
23+
24+
Example 2:
25+
The input is one tensor of shape [2, 2] and of value [[4 5] [6 7]].
26+
The output Jacobian will be:
27+
[[[4 0] [0 0]]
28+
[[0 5] [0 0]]
29+
[[0 0] [6 0]]
30+
[[0 0] [0 7]]]
31+
32+
Example 3:
33+
The input is two tensors, of shapes [3] and [2, 2] and of values [1 2 3] and [[4 5] [6 7]].
34+
If the key_order has the tensor of shape [3] appear first and the one of shape [2, 2] appear
35+
second, the output Jacobians will be:
36+
[[1 0 0]
37+
[0 2 0]
38+
[0 0 3]
39+
[0 0 0]
40+
[0 0 0]
41+
[0 0 0]
42+
[0 0 0]] and
43+
[[[0 0] [0 0]]
44+
[[0 0] [0 0]]
45+
[[0 0] [0 0]]
46+
[[4 0] [0 0]]
47+
[[0 5] [0 0]]
48+
[[0 0] [6 0]]
49+
[[0 0] [0 7]]]
50+
51+
:param key_order: The order in which the keys are represented in the rows of the output
52+
Jacobians.
53+
"""
54+
1055
def __init__(self, key_order: OrderedSet[Tensor]):
1156
self.key_order = key_order
1257
self.indices: list[tuple[int, int]] = []

src/torchjd/autojac/_transform/grad.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,22 @@
1010

1111

1212
class Grad(Differentiate[Gradients]):
13+
"""
14+
Transform computing the gradient of each output element with respect to each input tensor, and
15+
applying the linear transformations represented by the provided grad_outputs to the results.
16+
17+
:param outputs: Tensors to differentiate.
18+
:param inputs: Tensors with respect to which we differentiate.
19+
:param retain_graph: If False, the graph used to compute the grads will be freed. Defaults to
20+
False.
21+
:param create_graph: If True, graph of the derivative will be constructed, allowing to compute
22+
higher order derivative products. Defaults to False.
23+
24+
.. note:: The order of outputs and inputs only matters because we have no guarantee that
25+
torch.autograd.grad is *exactly* equivariant to input permutations and invariant to output
26+
(with their corresponding grad_output) permutations.
27+
"""
28+
1329
def __init__(
1430
self,
1531
outputs: OrderedSet[Tensor],
@@ -21,14 +37,15 @@ def __init__(
2137

2238
def _differentiate(self, grad_outputs: Sequence[Tensor]) -> tuple[Tensor, ...]:
2339
"""
24-
Computes the gradient of each output with respect to each input, and applies the linear
25-
transformations represented by the grad_outputs to the results.
40+
Computes the gradient of each output element with respect to each input tensor, and applies
41+
the linear transformations represented by the grad_outputs to the results.
2642
27-
Returns one gradient per input.
43+
Returns one gradient per input, corresponding to the sum of the scaled gradients with
44+
respect to this input.
2845
29-
:param grad_outputs: The sequence of scalar tensors to scale the obtained gradients with.
30-
Its length should be equal to the length of ``outputs``. Each grad_output should have
31-
the same shape as the corresponding output.
46+
:param grad_outputs: The sequence of tensors to scale the obtained gradients with. Its
47+
length should be equal to the length of ``outputs``. Each grad_output should have the
48+
same shape as the corresponding output.
3249
"""
3350

3451
if len(self.inputs) == 0:

src/torchjd/autojac/_transform/init.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,16 @@
88

99

1010
class Init(Transform[EmptyTensorDict, Gradients]):
11+
"""
12+
Transform returning Gradients filled with ones for each of the provided values.
13+
14+
:param values: Tensors for which Gradients must be returned.
15+
"""
16+
1117
def __init__(self, values: Set[Tensor]):
1218
self.values = values
1319

1420
def __call__(self, input: EmptyTensorDict) -> Gradients:
15-
r"""
16-
Computes the gradients of the ``value`` with respect to itself. Returns the result as a
17-
dictionary. The only key of the dictionary is ``value``. The corresponding gradient is a
18-
tensor of 1s of identical shape, because :math:`\frac{\partial v}{\partial v} = 1` for any
19-
:math:`v`.
20-
"""
21-
2221
return Gradients({value: torch.ones_like(value) for value in self.values})
2322

2423
def check_keys(self, input_keys: set[Tensor]) -> set[Tensor]:

src/torchjd/autojac/_transform/jac.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,26 @@
1313

1414

1515
class Jac(Differentiate[Jacobians]):
16+
"""
17+
Transform computing the jacobian of each output with respect to each input, and applying the
18+
linear transformations represented by the argument jac_outputs to the results.
19+
20+
:param outputs: Tensors to differentiate.
21+
:param inputs: Tensors with respect to which we differentiate.
22+
:param chunk_size: The number of scalars to differentiate simultaneously. If set to ``None``,
23+
all outputs will be differentiated in parallel at once. If set to ``1``, all will be
24+
differentiated sequentially. A larger value results in faster differentiation, but also
25+
higher memory usage. Defaults to ``None``.
26+
:param retain_graph: If False, the graph used to compute the grads will be freed. Defaults to
27+
False.
28+
:param create_graph: If True, graph of the derivative will be constructed, allowing to compute
29+
higher order derivative products. Defaults to False.
30+
31+
.. note:: The order of outputs and inputs only matters because we have no guarantee that
32+
torch.autograd.grad is *exactly* equivariant to input permutations and invariant to output
33+
(with their corresponding grad_output) permutations.
34+
"""
35+
1636
def __init__(
1737
self,
1838
outputs: OrderedSet[Tensor],

src/torchjd/autojac/_transform/select.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77

88

99
class Select(Transform[_A, _A]):
10+
"""
11+
Transform returning a subset of the provided TensorDict.
12+
13+
:param keys: The keys that should be included in the returned subset.
14+
"""
15+
1016
def __init__(self, keys: Set[Tensor]):
1117
self.keys = keys
1218

src/torchjd/autojac/_transform/stack.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,17 @@
99

1010

1111
class Stack(Transform[_A, Jacobians]):
12+
"""
13+
Transform applying several transforms to the same input, and combining the results (by stacking)
14+
into a single TensorDict.
15+
16+
The set of keys of the resulting dict is the union of the sets of keys of the input dicts.
17+
18+
:param transforms: The transforms to apply. Their outputs may have different sets of keys. If a
19+
key is absent in some output dicts, the corresponding stacked tensor is filled with zeroes
20+
at the positions corresponding to those dicts.
21+
"""
22+
1223
def __init__(self, transforms: Sequence[Transform[_A, Gradients]]):
1324
self.transforms = transforms
1425

@@ -22,13 +33,6 @@ def check_keys(self, input_keys: set[Tensor]) -> set[Tensor]:
2233

2334

2435
def _stack(gradient_dicts: list[Gradients]) -> Jacobians:
25-
"""
26-
Transforms a list of tensor dicts into a single dict of (stacked) tensors. The set of keys of
27-
the resulting dict is the union of the sets of keys of the input dicts.
28-
If a key is absent in some input dicts, the corresponding stacked tensor is filled with zeroes
29-
at the positions corresponding to those dicts.
30-
"""
31-
3236
# It is important to first remove duplicate keys before computing their associated
3337
# stacked tensor. Otherwise, some computations would be duplicated. Therefore, we first compute
3438
# unique_keys, and only then, we compute the stacked tensors.
@@ -41,9 +45,7 @@ def _stack(gradient_dicts: list[Gradients]) -> Jacobians:
4145

4246

4347
def _stack_one_key(gradient_dicts: list[Gradients], input: Tensor) -> Tensor:
44-
"""
45-
Makes the stacked tensor corresponding to a given key, from a list of tensor dicts.
46-
"""
48+
"""Makes the stacked tensor corresponding to a given key, from a list of tensor dicts."""
4749

4850
optional_gradients = [gradients.get(input, None) for gradients in gradient_dicts]
4951
gradients = materialize(optional_gradients, [input] * len(optional_gradients))

0 commit comments

Comments
 (0)