Skip to content

Commit a6735c0

Browse files
committed
Make GradVac, GradDrop, PCGrad and Random Stochastic (broken).
1 parent 56ea02a commit a6735c0

File tree

5 files changed

+109
-32
lines changed

5 files changed

+109
-32
lines changed

src/torchjd/aggregation/_graddrop.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@
66
from torchjd._linalg import Matrix
77

88
from ._aggregator_bases import Aggregator
9+
from ._mixins import Stochastic
910
from ._utils.non_differentiable import raise_non_differentiable_error
1011

1112

1213
def _identity(P: Tensor) -> Tensor:
1314
return P
1415

1516

16-
class GradDrop(Aggregator):
17+
class GradDrop(Aggregator, Stochastic):
1718
"""
1819
:class:`~torchjd.aggregation._aggregator_bases.Aggregator` that applies the gradient combination
1920
steps from GradDrop, as defined in lines 10 to 15 of Algorithm 1 of `Just Pick a Sign:
@@ -24,16 +25,21 @@ class GradDrop(Aggregator):
2425
increasing. Defaults to identity.
2526
:param leak: The tensor of leak values, determining how much each row is allowed to leak
2627
through. Defaults to None, which means no leak.
28+
:param seed: Seed for the internal random number generator. If ``None``, a seed is drawn from
29+
the global PyTorch RNG to fork an independent stream.
2730
"""
2831

29-
def __init__(self, f: Callable = _identity, leak: Tensor | None = None) -> None:
32+
def __init__(
33+
self, f: Callable = _identity, leak: Tensor | None = None, seed: int | None = None
34+
) -> None:
3035
if leak is not None and leak.dim() != 1:
3136
raise ValueError(
3237
"Parameter `leak` should be a 1-dimensional tensor. Found `leak.shape = "
3338
f"{leak.shape}`.",
3439
)
3540

36-
super().__init__()
41+
Aggregator.__init__(self)
42+
Stochastic.__init__(self, seed=seed)
3743
self.f = f
3844
self.leak = leak
3945

@@ -50,7 +56,7 @@ def forward(self, matrix: Matrix, /) -> Tensor:
5056

5157
P = 0.5 * (torch.ones_like(matrix[0]) + matrix.sum(dim=0) / matrix.abs().sum(dim=0))
5258
fP = self.f(P)
53-
U = torch.rand(P.shape, dtype=matrix.dtype, device=matrix.device)
59+
U = torch.rand(P.shape, dtype=matrix.dtype, device=matrix.device, generator=self.generator)
5460

5561
vector = torch.zeros_like(matrix[0])
5662
for i in range(len(matrix)):

src/torchjd/aggregation/_gradvac.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,16 @@
66
from torch import Tensor
77

88
from torchjd._linalg import PSDMatrix
9-
from torchjd.aggregation._mixins import Stateful
9+
from torchjd.aggregation._mixins import Stochastic
1010

1111
from ._aggregator_bases import GramianWeightedAggregator
1212
from ._utils.non_differentiable import raise_non_differentiable_error
1313
from ._weighting_bases import Weighting
1414

1515

16-
class GradVac(GramianWeightedAggregator, Stateful):
16+
class GradVac(GramianWeightedAggregator, Stochastic):
1717
r"""
18-
:class:`~torchjd.aggregation._mixins.Stateful`
18+
:class:`~torchjd.aggregation._mixins.Stochastic`
1919
:class:`~torchjd.aggregation._aggregator_bases.Aggregator` implementing the aggregation step of
2020
Gradient Vaccine (GradVac) from `Gradient Vaccine: Investigating and Improving Multi-task
2121
Optimization in Massively Multilingual Models (ICLR 2021 Spotlight)
@@ -35,16 +35,14 @@ class GradVac(GramianWeightedAggregator, Stateful):
3535
3636
:param beta: EMA decay for :math:`\hat{\phi}`.
3737
:param eps: Small non-negative constant added to denominators.
38-
39-
.. note::
40-
For each task :math:`i`, the order of other tasks :math:`j` is shuffled independently
41-
using the global PyTorch RNG (``torch.randperm``). Seed it with ``torch.manual_seed`` if
42-
you need reproducibility.
38+
:param seed: Seed for the internal random number generator. If ``None``, a seed is drawn from
39+
the global PyTorch RNG to fork an independent stream.
4340
"""
4441

45-
def __init__(self, beta: float = 0.5, eps: float = 1e-8) -> None:
46-
weighting = GradVacWeighting(beta=beta, eps=eps)
47-
super().__init__(weighting)
42+
def __init__(self, beta: float = 0.5, eps: float = 1e-8, seed: int | None = None) -> None:
43+
weighting = GradVacWeighting(beta=beta, eps=eps, seed=seed)
44+
GramianWeightedAggregator.__init__(self, weighting)
45+
Stochastic.__init__(self, generator=weighting.generator)
4846
self._gradvac_weighting = weighting
4947
self.register_full_backward_pre_hook(raise_non_differentiable_error)
5048

@@ -65,17 +63,18 @@ def eps(self, value: float) -> None:
6563
self._gradvac_weighting.eps = value
6664

6765
def reset(self) -> None:
68-
"""Clears EMA state so the next forward starts from zero targets."""
66+
"""Resets the random number generator and clears the EMA state."""
6967

68+
Stochastic.reset(self)
7069
self._gradvac_weighting.reset()
7170

7271
def __repr__(self) -> str:
7372
return f"GradVac(beta={self.beta!r}, eps={self.eps!r})"
7473

7574

76-
class GradVacWeighting(Weighting[PSDMatrix], Stateful):
75+
class GradVacWeighting(Weighting[PSDMatrix], Stochastic):
7776
r"""
78-
:class:`~torchjd.aggregation._mixins.Stateful`
77+
:class:`~torchjd.aggregation._mixins.Stochastic`
7978
:class:`~torchjd.aggregation._weighting_bases.Weighting` giving the weights of
8079
:class:`~torchjd.aggregation.GradVac`.
8180
@@ -97,10 +96,13 @@ class GradVacWeighting(Weighting[PSDMatrix], Stateful):
9796
9897
:param beta: EMA decay for :math:`\hat{\phi}`.
9998
:param eps: Small non-negative constant added to denominators.
99+
:param seed: Seed for the internal random number generator. If ``None``, a seed is drawn from
100+
the global PyTorch RNG to fork an independent stream.
100101
"""
101102

102-
def __init__(self, beta: float = 0.5, eps: float = 1e-8) -> None:
103-
super().__init__()
103+
def __init__(self, beta: float = 0.5, eps: float = 1e-8, seed: int | None = None) -> None:
104+
Weighting.__init__(self)
105+
Stochastic.__init__(self, seed=seed)
104106
if not (0.0 <= beta <= 1.0):
105107
raise ValueError(f"Parameter `beta` must be in [0, 1]. Found beta={beta!r}.")
106108
if eps < 0.0:
@@ -132,8 +134,9 @@ def eps(self, value: float) -> None:
132134
self._eps = value
133135

134136
def reset(self) -> None:
135-
"""Clears EMA state so the next forward starts from zero targets."""
137+
"""Resets the random number generator and clears the EMA state."""
136138

139+
Stochastic.reset(self)
137140
self._phi_t = None
138141
self._state_key = None
139142

@@ -161,7 +164,7 @@ def forward(self, gramian: PSDMatrix, /) -> Tensor:
161164
cG = C[i] @ G
162165

163166
others = [j for j in range(m) if j != i]
164-
perm = torch.randperm(len(others))
167+
perm = torch.randperm(len(others), generator=self.generator)
165168
shuffled_js = [others[idx] for idx in perm.tolist()]
166169

167170
for j in shuffled_js:

src/torchjd/aggregation/_mixins.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from abc import ABC, abstractmethod
22

3+
import torch
4+
35

46
class Stateful(ABC):
57
r"""
@@ -18,3 +20,41 @@ class Stateful(ABC):
1820
@abstractmethod
1921
def reset(self) -> None:
2022
"""Resets the internal state :math:`s_0`."""
23+
24+
25+
class Stochastic(Stateful, ABC):
26+
r"""
27+
Stateful mixin that represents mappings that have inherent randomness.
28+
29+
Internally, a ``Stochastic`` mapping holds a :class:`torch.Generator` that serves as an
30+
independent random number stream. Implementing classes must pass this generator to all torch
31+
random functions via their ``generator`` argument, e.g.:
32+
33+
.. code-block:: python
34+
35+
torch.rand(n, generator=self.generator)
36+
torch.randn(n, generator=self.generator)
37+
torch.randperm(n, generator=self.generator)
38+
39+
:param seed: Seed for the internal :class:`torch.Generator`. If ``None``, a seed is drawn
40+
from the global PyTorch RNG to fork an independent stream.
41+
:param generator: An existing :class:`torch.Generator` to share, typically from a companion
42+
:class:`Stochastic` instance (e.g. a :class:`Weighting` sharing the generator of its
43+
:class:`Aggregator`). Mutually exclusive with ``seed``.
44+
"""
45+
46+
def __init__(self, seed: int | None = None, generator: torch.Generator | None = None) -> None:
47+
if generator is not None and seed is not None:
48+
raise ValueError("Parameters `seed` and `generator` are mutually exclusive.")
49+
if generator is not None:
50+
self.generator = generator
51+
else:
52+
self.generator = torch.Generator()
53+
if seed is None:
54+
seed = int(torch.randint(0, 2**62, size=(1,), dtype=torch.int64).item())
55+
self.generator.manual_seed(seed)
56+
self._initial_rng_state = self.generator.get_state()
57+
58+
def reset(self) -> None:
59+
"""Resets the random number generator to its initial state."""
60+
self.generator.set_state(self._initial_rng_state)

src/torchjd/aggregation/_pcgrad.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,29 +6,42 @@
66
from torchjd._linalg import PSDMatrix
77

88
from ._aggregator_bases import GramianWeightedAggregator
9+
from ._mixins import Stochastic
910
from ._utils.non_differentiable import raise_non_differentiable_error
1011
from ._weighting_bases import Weighting
1112

1213

13-
class PCGrad(GramianWeightedAggregator):
14+
class PCGrad(GramianWeightedAggregator, Stochastic):
1415
"""
1516
:class:`~torchjd.aggregation._aggregator_bases.Aggregator` as defined in algorithm 1 of
1617
`Gradient Surgery for Multi-Task Learning <https://arxiv.org/pdf/2001.06782.pdf>`_.
18+
19+
:param seed: Seed for the internal random number generator. If ``None``, a seed is drawn from
20+
the global PyTorch RNG to fork an independent stream.
1721
"""
1822

19-
def __init__(self) -> None:
20-
super().__init__(PCGradWeighting())
23+
def __init__(self, seed: int | None = None) -> None:
24+
weighting = PCGradWeighting(seed=seed)
25+
GramianWeightedAggregator.__init__(self, weighting)
26+
Stochastic.__init__(self, generator=weighting.generator)
2127

2228
# This prevents running into a RuntimeError due to modifying stored tensors in place.
2329
self.register_full_backward_pre_hook(raise_non_differentiable_error)
2430

2531

26-
class PCGradWeighting(Weighting[PSDMatrix]):
32+
class PCGradWeighting(Weighting[PSDMatrix], Stochastic):
2733
"""
2834
:class:`~torchjd.aggregation._weighting_bases.Weighting` giving the weights of
2935
:class:`~torchjd.aggregation.PCGrad`.
36+
37+
:param seed: Seed for the internal random number generator. If ``None``, a seed is drawn from
38+
the global PyTorch RNG to fork an independent stream.
3039
"""
3140

41+
def __init__(self, seed: int | None = None) -> None:
42+
Weighting.__init__(self)
43+
Stochastic.__init__(self, seed=seed)
44+
3245
def forward(self, gramian: PSDMatrix, /) -> Tensor:
3346
# Move all computations on cpu to avoid moving memory between cpu and gpu at each iteration
3447
device = gramian.device
@@ -40,7 +53,7 @@ def forward(self, gramian: PSDMatrix, /) -> Tensor:
4053
weights = torch.zeros(dimension, device=cpu, dtype=dtype)
4154

4255
for i in range(dimension):
43-
permutation = torch.randperm(dimension)
56+
permutation = torch.randperm(dimension, generator=self.generator)
4457
current_weights = torch.zeros(dimension, device=cpu, dtype=dtype)
4558
current_weights[i] = 1.0
4659

src/torchjd/aggregation/_random.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,28 +5,43 @@
55
from torchjd._linalg import Matrix
66

77
from ._aggregator_bases import WeightedAggregator
8+
from ._mixins import Stochastic
89
from ._weighting_bases import Weighting
910

1011

11-
class Random(WeightedAggregator):
12+
class Random(WeightedAggregator, Stochastic):
1213
"""
1314
:class:`~torchjd.aggregation._aggregator_bases.Aggregator` that computes a random combination of
1415
the rows of the provided matrices, as defined in algorithm 2 of `Reasonable Effectiveness of
1516
Random Weighting: A Litmus Test for Multi-Task Learning
1617
<https://arxiv.org/pdf/2111.10603.pdf>`_.
18+
19+
:param seed: Seed for the internal random number generator. If ``None``, a seed is drawn from
20+
the global PyTorch RNG to fork an independent stream.
1721
"""
1822

19-
def __init__(self) -> None:
20-
super().__init__(RandomWeighting())
23+
def __init__(self, seed: int | None = None) -> None:
24+
weighting = RandomWeighting(seed=seed)
25+
WeightedAggregator.__init__(self, weighting)
26+
Stochastic.__init__(self, generator=weighting.generator)
2127

2228

23-
class RandomWeighting(Weighting[Matrix]):
29+
class RandomWeighting(Weighting[Matrix], Stochastic):
2430
"""
2531
:class:`~torchjd.aggregation._weighting_bases.Weighting` that generates positive random weights
2632
at each call.
33+
34+
:param seed: Seed for the internal random number generator. If ``None``, a seed is drawn from
35+
the global PyTorch RNG to fork an independent stream.
2736
"""
2837

38+
def __init__(self, seed: int | None = None) -> None:
39+
Weighting.__init__(self)
40+
Stochastic.__init__(self, seed=seed)
41+
2942
def forward(self, matrix: Tensor, /) -> Tensor:
30-
random_vector = torch.randn(matrix.shape[0], device=matrix.device, dtype=matrix.dtype)
43+
random_vector = torch.randn(
44+
matrix.shape[0], device=matrix.device, dtype=matrix.dtype, generator=self.generator
45+
)
3146
weights = F.softmax(random_vector, dim=-1)
3247
return weights

0 commit comments

Comments
 (0)