Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@ dependencies = [
"numba",
"triton",
"pre-commit",
"torchjd",
"torchviz"
]
14 changes: 14 additions & 0 deletions src/recursion/dataset/repeat_after_k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import torch
from torch import Tensor


def make_sequence(length: int, k: int) -> tuple[Tensor, Tensor]:
    """Build a random binary "repeat after k" (input, target) pair.

    Draws a random 0/1 sequence of ``length + k`` elements and returns two
    aligned views of it, each of length ``length``, such that the target at
    step ``i`` is the input from ``k`` steps earlier:
    ``target[k:] == input[:-k]`` (and ``target == input`` when ``k == 0``).

    Args:
        length: number of timesteps in the returned tensors.
        k: delay (in steps) that a model must memorize; must be >= 0.

    Returns:
        Tuple ``(input, target)`` of int64 tensors of shape ``(length,)``.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    seq = torch.randint(low=0, high=2, size=[length + k])
    # input[i] = seq[i + k], target[i] = seq[i].  Using seq[:length] for the
    # target handles k == 0 and k > 0 uniformly (seq[:-k] would be empty for
    # k == 0), so no branch is needed.
    return seq[k:], seq[:length]
103 changes: 103 additions & 0 deletions src/recursion/models/trivial_memory_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
from collections import defaultdict

import torch
from torch import Tensor, nn
from torch.optim import SGD

from recursion.dataset.repeat_after_k import make_sequence


class TrivialMemoryModel(nn.Module):
    """Minimal recurrent cell mapping (input, memory) -> next memory.

    A two-layer MLP applied to the concatenation of the current input (last
    dimension 1) and the current memory vector (last dimension
    ``memory_dim``).  It returns only the updated memory; an external head
    (e.g. ``nn.Linear(memory_dim, 1)``) is expected to produce predictions
    from it.
    """

    def __init__(self, memory_dim: int):
        super().__init__()

        # Hidden layer is twice the input width (1 scalar + memory_dim).
        hidden_size = 2 * (1 + memory_dim)
        self.fc1 = nn.Linear(1 + memory_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, memory_dim)
        self.relu = nn.ReLU()

    def forward(self, input: Tensor, memory: Tensor) -> Tensor:
        """Return the updated memory, last dimension ``memory_dim``.

        Args:
            input: tensor whose last dimension is 1 (the current bit).
            memory: tensor whose last dimension is ``memory_dim``.
        """
        x = torch.cat([input, memory], dim=-1)
        x = self.relu(self.fc1(x))
        # No output activation: the memory may take any real value.
        return self.fc2(x)


# --- Experiment script: step a tiny recurrent memory model through a
# "repeat after k" sequence and, every `update_every` steps, manually
# backpropagate the last loss through time, collecting one gradient per
# unrolled step and aggregating them with UPGrad.
# (Indentation reconstructed: the diff paste stripped leading whitespace.)

input_sequence, target_sequence = make_sequence(7, 3)

memory_dim = 8

model = TrivialMemoryModel(memory_dim)
# Readout head: maps the memory vector to a single logit.
head = nn.Linear(memory_dim, 1)
memory = torch.randn(memory_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = SGD(model.parameters(), lr=1e-2)
memories = []  # memory produced at each step (still attached to its graph)
memories_wrt = []  # detached, requires-grad copies of each incoming memory

param_to_gradients = defaultdict(list)
torch.set_printoptions(linewidth=200)
update_every = 6

# NOTE(review): mid-file import — should move to the top with the others.
from torchjd.aggregation import UPGradWeighting

weighting = UPGradWeighting()

for i, (input, target) in enumerate(zip(input_sequence, target_sequence, strict=True)):
    # Detach the incoming memory so each step owns a separate graph segment;
    # gradients are chained across segments manually below.
    memories_wrt.append(memory.detach().requires_grad_(True))
    memory = model(input.unsqueeze(0).to(dtype=torch.float32), memories_wrt[-1])
    output = head(memory)
    loss = criterion(output, target.unsqueeze(0).to(dtype=torch.float32))
    memories.append(memory)

    print(f"{loss.item():.1e}")

    if (i + 1) % update_every == 0:
        # NOTE(review): zero_grad() has no visible effect here — no .grad
        # field is ever populated; gradients are collected manually below.
        optimizer.zero_grad()

        # Seed of the backward chain: d(loss)/d(memory at the last step).
        grad_output = torch.autograd.grad(loss, [memories[-1]])

        # Walk backwards through the last `update_every` steps, chaining
        # vector-Jacobian products through the detached memory copies.
        for j in range(update_every):
            print(j)
            grads = torch.autograd.grad(
                memories[-j - 1],
                list(model.parameters()) + [memories_wrt[-j - 1]],
                grad_outputs=grad_output,
            )
            # Last entry is the grad w.r.t. the incoming (detached) memory;
            # it seeds the previous step's vector-Jacobian product.
            grads_wrt_params = grads[:-1]
            grad_output = grads[-1]
Copy link
Copy Markdown

@PierreQuinton PierreQuinton Dec 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it could be possible to clone the parameters of the memory model at each call, it should not require more memory. But then if we do backward we obtain a grad for each of the copies, we can stack them. Of course this also works and later on we can also make this quite efficient with hooks.

Copy link
Copy Markdown

@PierreQuinton PierreQuinton Dec 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess there is no actual training in this code? (`.grad` is never assigned, so `optimizer.step()` has nothing to apply)

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it could be possible to clone the parameters of the memory model at each call, it should not require more memory. But then if we do backward we obtain a grad for each of the copies, we can stack them. Of course this also works and later on we can also make this quite efficient with hooks.

I think the current method is almost maximally efficient. But maybe it's not expressive enough (for now, it can't really select only paths of length 1, 2, 4, 8, etc., without also computing 3, 5, 6, 7, ...).

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could maybe do what you say with a detached view of the parameters (I think cloning duplicates memory + is differentiable so the gradients would flow back to the original params)

Copy link
Copy Markdown

@PierreQuinton PierreQuinton Dec 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Selecting only some paths is doable only with a residual RNN. But note that if you select only the path to the level-2 memory, then you don't train the interaction between level 1 and level 2, which is typically not what we want.


            # Record this step's per-parameter gradients; each unrolled step
            # later becomes one row of the Jacobian matrix.
            for param, grad in zip(model.parameters(), grads_wrt_params, strict=True):
                param_to_gradients[param].append(grad)

        # One (steps x numel) matrix per parameter, concatenated horizontally
        # into the full per-step Jacobian of the loss w.r.t. all parameters.
        param_to_jacobian_matrix = {
            param: torch.stack([g.flatten() for g in gradients], dim=0)
            for param, gradients in param_to_gradients.items()
        }
        jacobian_matrix = torch.cat([mat for mat in param_to_jacobian_matrix.values()], dim=1)

        # Gram matrix of the per-step gradients; the UPGrad weighting turns
        # it into aggregation weights over the unrolled steps.
        gramian = jacobian_matrix @ jacobian_matrix.T
        weights = weighting(gramian)
        # print(jacobian_matrix.shape)
        print(gramian)
        print(weights)

        # graph = make_dot(loss, params=dict(model.named_parameters()), show_attrs=True, show_saved=True)
        # graph.view()

        # graph = make_dot(attached_memories[-1], params=dict(model.named_parameters()), show_attrs=True,
        # show_saved=True)
        # graph.view()

        # loss.backward()

        # print("fc1 weights: ", model.fc1.weight.grad)
        # print("fc1 biases: ", model.fc1.bias.grad)
        #
        # print("fc2 weights: ", model.fc2.weight.grad)
        # print("fc2 biases: ", model.fc2.bias.grad)

        # NOTE(review): step() is a no-op here — param.grad is never set,
        # and the aggregated `weights` are computed but never applied.
        optimizer.step()
        memory = memory.detach()