Skip to content

Commit 387a672

Browse files
authored
fix(aggregation): Make non-differentiability explicit (#334)
* Add _non_differentiable.py with NonDifferentiableError and raise_non_differentiable_error
* Register raise_non_differentiable_error as a full backward pre-hook of CAGrad, ConFIG, DualProj, GradDrop, IMTLG, NashMTL, PCGrad and UPGrad
* Add NonDifferentiableProperty tester
* Give NonDifferentiableProperty to CAGrad, ConFIG, DualProj, GradDrop, IMTLG, NashMTL, PCGrad and UPGrad
* Add changelog entry
1 parent b6a0a2d commit 387a672

19 files changed

+100
-13
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ changes that do not affect the user.
1515
torchjd[nash_mtl]` or `pip install torchjd[full]` to install TorchJD alongside those dependencies.
1616
This should make TorchJD more lightweight.
1717

18+
### Fixed
19+
20+
- Made some aggregators (`CAGrad`, `ConFIG`, `DualProj`, `GradDrop`, `IMTLG`, `NashMTL`, `PCGrad`
21+
and `UPGrad`) raise a `NonDifferentiableError` whenever one tries to differentiate through them.
22+
Before this change, trying to differentiate through them led to wrong gradients or unclear
23+
errors.
24+
1825
## [0.6.0] - 2025-04-19
1926

2027
### Added
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from torch import Tensor, nn
2+
3+
4+
class NonDifferentiableError(RuntimeError):
5+
def __init__(self, module: nn.Module):
6+
super().__init__(f"Trying to differentiate through {module}, which is not differentiable.")
7+
8+
9+
def raise_non_differentiable_error(module: nn.Module, _: tuple[Tensor, ...]) -> None:
10+
raise NonDifferentiableError(module)

src/torchjd/aggregation/cagrad.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from torch import Tensor
99

1010
from ._gramian_utils import compute_gramian, normalize
11+
from ._non_differentiable import raise_non_differentiable_error
1112
from .bases import _WeightedAggregator, _Weighting
1213

1314

@@ -44,6 +45,9 @@ class CAGrad(_WeightedAggregator):
4445
def __init__(self, c: float, norm_eps: float = 0.0001):
4546
super().__init__(weighting=_CAGradWeighting(c=c, norm_eps=norm_eps))
4647

48+
# This prevents considering the computed weights as constant w.r.t. the matrix.
49+
self.register_full_backward_pre_hook(raise_non_differentiable_error)
50+
4751
def __repr__(self) -> str:
4852
return (
4953
f"{self.__class__.__name__}(c={self.weighting.c}, norm_eps={self.weighting.norm_eps})"

src/torchjd/aggregation/config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import torch
2929
from torch import Tensor
3030

31+
from ._non_differentiable import raise_non_differentiable_error
3132
from ._pref_vector_utils import pref_vector_to_str_suffix, pref_vector_to_weighting
3233
from .bases import Aggregator
3334
from .sum import _SumWeighting
@@ -65,6 +66,9 @@ def __init__(self, pref_vector: Tensor | None = None):
6566
self.weighting = pref_vector_to_weighting(pref_vector, default=_SumWeighting())
6667
self._pref_vector = pref_vector
6768

69+
# This prevents computing gradients that can be very wrong.
70+
self.register_full_backward_pre_hook(raise_non_differentiable_error)
71+
6872
def forward(self, matrix: Tensor) -> Tensor:
6973
weights = self.weighting(matrix)
7074
units = torch.nan_to_num((matrix / (matrix.norm(dim=1)).unsqueeze(1)), 0.0)

src/torchjd/aggregation/dualproj.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from ._dual_cone_utils import project_weights
66
from ._gramian_utils import compute_gramian, normalize, regularize
7+
from ._non_differentiable import raise_non_differentiable_error
78
from ._pref_vector_utils import pref_vector_to_str_suffix, pref_vector_to_weighting
89
from .bases import _WeightedAggregator, _Weighting
910
from .mean import _MeanWeighting
@@ -56,6 +57,9 @@ def __init__(
5657
)
5758
)
5859

60+
# This prevents considering the computed weights as constant w.r.t. the matrix.
61+
self.register_full_backward_pre_hook(raise_non_differentiable_error)
62+
5963
def __repr__(self) -> str:
6064
return (
6165
f"{self.__class__.__name__}(pref_vector={repr(self._pref_vector)}, norm_eps="

src/torchjd/aggregation/graddrop.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import torch
44
from torch import Tensor
55

6+
from ._non_differentiable import raise_non_differentiable_error
67
from .bases import Aggregator
78

89

@@ -47,6 +48,9 @@ def __init__(self, f: Callable = _identity, leak: Tensor | None = None):
4748
self.f = f
4849
self.leak = leak
4950

51+
# This prevents computing gradients that can be very wrong.
52+
self.register_full_backward_pre_hook(raise_non_differentiable_error)
53+
5054
def forward(self, matrix: Tensor) -> Tensor:
5155
self._check_is_matrix(matrix)
5256
self._check_matrix_has_enough_rows(matrix)

src/torchjd/aggregation/imtl_g.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from torch import Tensor
33

44
from ._gramian_utils import compute_gramian
5+
from ._non_differentiable import raise_non_differentiable_error
56
from .bases import _WeightedAggregator, _Weighting
67

78

@@ -30,6 +31,9 @@ class IMTLG(_WeightedAggregator):
3031
def __init__(self):
3132
super().__init__(weighting=_IMTLGWeighting())
3233

34+
# This prevents computing gradients that can be very wrong.
35+
self.register_full_backward_pre_hook(raise_non_differentiable_error)
36+
3337

3438
class _IMTLGWeighting(_Weighting):
3539
"""

src/torchjd/aggregation/nash_mtl.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from cvxpy import Expression
3434
from torch import Tensor
3535

36+
from ._non_differentiable import raise_non_differentiable_error
3637
from .bases import _WeightedAggregator, _Weighting
3738

3839

@@ -95,6 +96,9 @@ def __init__(
9596
)
9697
)
9798

99+
# This prevents considering the computed weights as constant w.r.t. the matrix.
100+
self.register_full_backward_pre_hook(raise_non_differentiable_error)
101+
98102
def reset(self) -> None:
99103
"""Resets the internal state of the algorithm."""
100104
self.weighting.reset()

src/torchjd/aggregation/pcgrad.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from torch import Tensor
33

44
from ._gramian_utils import compute_gramian
5+
from ._non_differentiable import raise_non_differentiable_error
56
from .bases import _WeightedAggregator, _Weighting
67

78

@@ -28,6 +29,9 @@ class PCGrad(_WeightedAggregator):
2829
def __init__(self):
2930
super().__init__(weighting=_PCGradWeighting())
3031

32+
# This prevents running into a RuntimeError due to modifying stored tensors in place.
33+
self.register_full_backward_pre_hook(raise_non_differentiable_error)
34+
3135

3236
class _PCGradWeighting(_Weighting):
3337
"""

src/torchjd/aggregation/upgrad.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from ._dual_cone_utils import project_weights
77
from ._gramian_utils import compute_gramian, normalize, regularize
8+
from ._non_differentiable import raise_non_differentiable_error
89
from ._pref_vector_utils import pref_vector_to_str_suffix, pref_vector_to_weighting
910
from .bases import _WeightedAggregator, _Weighting
1011
from .mean import _MeanWeighting
@@ -56,6 +57,9 @@ def __init__(
5657
)
5758
)
5859

60+
# This prevents considering the computed weights as constant w.r.t. the matrix.
61+
self.register_full_backward_pre_hook(raise_non_differentiable_error)
62+
5963
def __repr__(self) -> str:
6064
return (
6165
f"{self.__class__.__name__}(pref_vector={repr(self._pref_vector)}, norm_eps="

0 commit comments

Comments
 (0)