|
| 1 | +from collections.abc import Sequence |
| 2 | +from typing import Iterable |
| 3 | + |
| 4 | +from torch import Tensor |
| 5 | + |
| 6 | +from torchjd.autojac._transform._base import Transform |
| 7 | +from torchjd.autojac._transform._diagonalize import Diagonalize |
| 8 | +from torchjd.autojac._transform._init import Init |
| 9 | +from torchjd.autojac._transform._jac import Jac |
| 10 | +from torchjd.autojac._transform._ordered_set import OrderedSet |
| 11 | +from torchjd.autojac._utils import ( |
| 12 | + as_checked_ordered_set, |
| 13 | + check_optional_positive_chunk_size, |
| 14 | + get_leaf_tensors, |
| 15 | +) |
| 16 | + |
| 17 | + |
def jac(
    outputs: Sequence[Tensor] | Tensor,
    inputs: Iterable[Tensor] | None = None,
    retain_graph: bool = False,
    parallel_chunk_size: int | None = None,
) -> tuple[Tensor, ...]:
    r"""
    Computes the Jacobian of all values in ``outputs`` with respect to all ``inputs``. Returns the
    result as a tuple, with one element per input tensor.

    :param outputs: The tensor or tensors to differentiate. Should be non-empty. The Jacobian
        matrices will have one row for each value of each of these tensors.
    :param inputs: The tensors with respect to which the Jacobian must be computed. These must have
        their ``requires_grad`` flag set to ``True``. If not provided, defaults to the leaf tensors
        that were used to compute the ``outputs`` parameter.
    :param retain_graph: If ``False``, the graph used to compute the grad will be freed. Defaults to
        ``False``.
    :param parallel_chunk_size: The number of scalars to differentiate simultaneously in the
        backward pass. If set to ``None``, all coordinates of ``outputs`` will be differentiated in
        parallel at once. If set to ``1``, all coordinates will be differentiated sequentially. A
        larger value results in faster differentiation, but also higher memory usage. Defaults to
        ``None``.

    .. admonition::
        Example

        The following example shows how to use ``jac``.

        >>> import torch
        >>>
        >>> from torchjd.autojac import jac
        >>>
        >>> param = torch.tensor([1., 2.], requires_grad=True)
        >>> # Compute arbitrary quantities that are function of param
        >>> y1 = torch.tensor([-1., 1.]) @ param
        >>> y2 = (param ** 2).sum()
        >>>
        >>> jacobians = jac([y1, y2], [param])
        >>>
        >>> jacobians
        (tensor([[-1.,  1.],
                [ 2.,  4.]]),)

        The returned tuple contains a single tensor (because there is a single param), that is the
        Jacobian of :math:`\begin{bmatrix}y_1 \\ y_2\end{bmatrix}` with respect to ``param``.

    .. warning::
        To differentiate in parallel, ``jac`` relies on ``torch.vmap``, which has some
        limitations: `it does not work on the output of compiled functions
        <https://github.com/pytorch/pytorch/issues/138422>`_, `when some tensors have
        <https://github.com/TorchJD/torchjd/issues/184>`_ ``retains_grad=True`` or `when using an
        RNN on CUDA <https://github.com/TorchJD/torchjd/issues/220>`_, for instance. If you
        experience issues with ``jac`` try to use ``parallel_chunk_size=1`` to avoid relying on
        ``torch.vmap``.
    """

    check_optional_positive_chunk_size(parallel_chunk_size)
    outputs_ = as_checked_ordered_set(outputs, "outputs")

    # Validate `outputs` before inferring the default `inputs`: otherwise an empty `outputs` with
    # `inputs=None` would reach `get_leaf_tensors` and fail with a less helpful error (or fall
    # through to the `inputs` check instead of the `outputs` one).
    if len(outputs_) == 0:
        raise ValueError("`outputs` cannot be empty")

    if inputs is None:
        inputs_ = get_leaf_tensors(tensors=outputs_, excluded=set())
    else:
        inputs_ = OrderedSet(inputs)

    if len(inputs_) == 0:
        raise ValueError("`inputs` cannot be empty")

    jac_transform = _create_transform(
        outputs=outputs_,
        inputs=inputs_,
        retain_graph=retain_graph,
        parallel_chunk_size=parallel_chunk_size,
    )

    # The transform maps each input tensor to its Jacobian; expose the Jacobians as a plain tuple,
    # one element per input, in the inputs' iteration order.
    result = jac_transform({})
    return tuple(result.values())
| 97 | + |
| 98 | + |
def _create_transform(
    outputs: OrderedSet[Tensor],
    inputs: OrderedSet[Tensor],
    retain_graph: bool,
    parallel_chunk_size: int | None,
) -> Transform:
    """Compose the Init → Diagonalize → Jac pipeline used to compute the Jacobians."""

    # Seeds the differentiation with gradient outputs containing only ones.
    seed = Init(outputs)

    # Turns the seeded gradients into (diagonal) Jacobian form.
    diagonalize = Diagonalize(outputs)

    # Differentiates through the graph to obtain the required Jacobians.
    differentiate = Jac(outputs, inputs, parallel_chunk_size, retain_graph)

    # Transforms compose right-to-left: seed first, then diagonalize, then differentiate.
    return differentiate << diagonalize << seed
0 commit comments