import torch
from pytest import raises
from torch.testing import assert_close
from torchjd.autojac._transform import (
Accumulate,
Conjunction,
Diagonalize,
EmptyTensorDict,
Grad,
Gradients,
Init,
Jac,
Jacobians,
Select,
Stack,
TensorDict,
)
from ._dict_assertions import assert_tensor_dicts_are_close
def test_jac_is_stack_of_grads():
    """
    Tests that the Jac transform (composed with a Diagonalize) is equivalent to a Stack of Grad and
    Select transforms.
    """
    scale = torch.tensor(5.0)
    p1 = torch.tensor(2.0, requires_grad=True)
    p2 = torch.tensor(3.0, requires_grad=True)
    z1 = p1 * scale
    z2 = p2 * scale

    gradients = Gradients({z1: torch.ones_like(z1), z2: torch.ones_like(z2)})

    # Jac composed with a Diagonalize of its outputs.
    jac = Jac(outputs=[z1, z2], inputs=[p1, p2], chunk_size=None, retain_graph=True)
    composed = jac << Diagonalize([z1, z2])

    # Equivalent construction: one (Grad << Select) branch per output, stacked.
    branches = [
        Grad(outputs=[z1], inputs=[p1, p2]) << Select([z1], [z1, z2]),
        Grad(outputs=[z2], inputs=[p1, p2]) << Select([z2], [z1, z2]),
    ]
    stacked = Stack(branches)

    # Arguments are evaluated left to right, so `composed` runs first, as in the original layout.
    assert_tensor_dicts_are_close(composed(gradients), stacked(gradients))
def test_single_differentiation():
    """
    Tests that we can perform a single scalar differentiation with the composition of a Grad and an
    Init transform.
    """
    param = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    out = param * 2.0

    # Init seeds the differentiation, Grad propagates it to `param`.
    transform = Grad([out], [param]) << Init([out])
    result = transform(EmptyTensorDict())

    assert_tensor_dicts_are_close(result, {param: torch.tensor([2.0, 2.0, 2.0])})
def test_multiple_differentiations():
    """
    Tests that we can perform multiple scalar differentiations with the conjunction of multiple Grad
    transforms, composed with an Init transform.
    """
    p1 = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], requires_grad=True)
    p2 = torch.tensor([1.0, 3.0, 5.0], requires_grad=True)
    z1 = p1 * 2.0
    z2 = p2 * 3.0

    # One (Grad << Select) branch per output; their conjunction is fed by a single Init.
    branch1 = Grad([z1], [p1]) << Select([z1], [z1, z2])
    branch2 = Grad([z2], [p2]) << Select([z2], [z1, z2])
    transform = (branch1 | branch2) << Init([z1, z2])

    result = transform(EmptyTensorDict())

    expected = {
        p1: torch.tensor([[2.0, 2.0, 2.0], [2.0, 2.0, 2.0]]),
        p2: torch.tensor([3.0, 3.0, 3.0]),
    }
    assert_tensor_dicts_are_close(result, expected)
def test_str():
    """Tests that the __str__ method works correctly even for a complex transform."""
    composed = Jac([], [], chunk_size=None) << Diagonalize([]) << Init([])
    assert str(composed) == "Jac ∘ Diagonalize ∘ Init"
def test_simple_conjunction():
    """
    Tests that the Conjunction transform works correctly with a simple example involving several
    Select transforms, whose keys form a partition of the keys of the input tensor dict.

    Because of this, the output is expected to be the same as the input.
    """
    keys = [
        torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]),
        torch.tensor([1.0, 3.0, 5.0]),
        torch.tensor(4.0),
    ]
    tensor_dict = TensorDict({key: torch.ones_like(key) for key in keys})

    # One Select per key: together they partition the input's keys.
    conjunction = Conjunction([Select([key], keys) for key in keys])

    assert_tensor_dicts_are_close(conjunction(tensor_dict), tensor_dict)
def test_conjunction_is_commutative():
    """
    Tests that the Conjunction transform gives the same result no matter the order in which its
    transforms are given.
    """
    t1 = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    t2 = torch.tensor([1.0, 3.0, 5.0])
    tensor_dict = TensorDict({t1: torch.ones_like(t1), t2: torch.ones_like(t2)})

    first = Select([t1], [t1, t2])
    second = Select([t2], [t1, t2])

    # Apply the conjunction in both orders and compare the results.
    assert_tensor_dicts_are_close(
        Conjunction([second, first])(tensor_dict),
        Conjunction([first, second])(tensor_dict),
    )
def test_conjunction_is_associative():
    """
    Tests that the Conjunction transform gives the same result no matter how it is parenthesized.
    """
    keys = [
        torch.tensor([[3.0, 11.0], [2.0, 7.0]]),
        torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]),
        torch.tensor([1.0, 3.0, 5.0]),
        torch.tensor(4.0),
    ]
    tensor_dict = TensorDict({key: torch.ones_like(key) for key in keys})

    # One Select per key, in key order.
    a, b, c, d = (Select([key], keys) for key in keys)

    nested = Conjunction([a, Conjunction([Conjunction([b, c]), d])])
    flat = Conjunction([a, b, c, d])

    assert_tensor_dicts_are_close(nested(tensor_dict), flat(tensor_dict))
def test_conjunction_accumulate_select():
    """
    Tests that it is possible to conjunct an Accumulate and a Select in this order.

    It is not trivial since the type of the TensorDict returned by the first transform (Accumulate)
    is EmptyDict, which is not the type that the conjunction should return (Gradients), but a
    subclass of it.
    """
    param = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    gradients = Gradients({param: torch.ones_like(param)})

    # Accumulate consumes the gradient; the Select keeps nothing, so the result is empty.
    conjunction = Accumulate([param]) | Select([], [param])
    result = conjunction(gradients)

    assert_tensor_dicts_are_close(result, {})
def test_equivalence_jac_grads():
    """
    Tests that differentiation in parallel using `_jac` is equivalent to sequential differentiation
    using several calls to `_grad` and stacking the resulting gradients.
    """
    A = torch.tensor([[4.0, 5.0], [6.0, 7.0], [8.0, 9.0]], requires_grad=True)
    b = torch.tensor([0.0, 2.0], requires_grad=True)
    c = torch.tensor(1.0, requires_grad=True)
    X1 = torch.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]])
    x2 = torch.tensor([5.0, 4.0, 3.0])
    y1 = X1 @ A @ b
    y2 = x2 @ A @ b + c
    inputs = [A, b, c]
    outputs = [y1, y2]
    grad_outputs = [torch.ones_like(output) for output in outputs]

    # Sequential differentiation: one Grad per output, retaining the graph between calls.
    grad_dicts = []
    for output, grad_output in zip(outputs, grad_outputs):
        grad = Grad(outputs=[output], inputs=inputs, retain_graph=True)
        grad_dicts.append(grad(Gradients({output: grad_output})))

    # Batched "one-hot" grad_outputs: row i of the i-th batched tensor holds grad_outputs[i],
    # all other rows are zero.
    n_outputs = len(outputs)
    batched_grad_outputs = []
    for i, grad_output in enumerate(grad_outputs):
        one_hot = torch.zeros((n_outputs,) + grad_output.shape)
        one_hot[i] = grad_output
        batched_grad_outputs.append(one_hot)

    # Parallel differentiation with Jac (no retain_graph needed: it runs last).
    jac = Jac(outputs=outputs, inputs=inputs, chunk_size=None)
    jac_dict = jac(Jacobians(dict(zip(outputs, batched_grad_outputs))))

    # Each Jacobian must equal the stack of the sequentially-computed gradients.
    for param in inputs:
        assert_close(jac_dict[param], torch.stack([d[param] for d in grad_dicts]))
def test_stack_different_required_keys():
    """Tests that the Stack transform fails on transforms with different required keys."""
    param = torch.tensor(1.0, requires_grad=True)
    out1 = param * 2.0
    out2 = param * 3.0

    # The two Grads require different keys (out1 vs out2), so the check must fail.
    stack = Stack([Grad([out1], [param]), Grad([out2], [param])])
    with raises(ValueError):
        stack.check_keys()