|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +from typing import cast |
| 4 | + |
| 5 | +import torch |
| 6 | +from torch import Tensor |
| 7 | + |
| 8 | +from torchjd._linalg import PSDMatrix |
| 9 | + |
| 10 | +from ._aggregator_bases import GramianWeightedAggregator |
| 11 | +from ._utils.non_differentiable import raise_non_differentiable_error |
| 12 | +from ._weighting_bases import Weighting |
| 13 | + |
| 14 | + |
class GradVac(GramianWeightedAggregator):
    r"""
    :class:`~torchjd.aggregation._aggregator_bases.Aggregator` performing the aggregation step
    of Gradient Vaccine (GradVac), introduced in `Gradient Vaccine: Investigating and Improving
    Multi-task Optimization in Massively Multilingual Models (ICLR 2021 Spotlight)
    <https://openreview.net/forum?id=F1vEjWK-lH_>`_.

    For every task :math:`i`, the remaining tasks :math:`j` are visited in a randomly drawn
    order. For each pair :math:`(i, j)`, the cosine similarity :math:`\phi_{ij}` between the
    (possibly already corrected) gradient of task :math:`i` and the original gradient of task
    :math:`j` is compared against an EMA target :math:`\hat{\phi}_{ij}`. Whenever
    :math:`\phi_{ij} < \hat{\phi}_{ij}`, a closed-form correction adds a scaled copy of
    :math:`g_j` to :math:`g_i^{(\mathrm{PC})}`. The EMA target is then refreshed via
    :math:`\hat{\phi}_{ij} \leftarrow (1-\beta)\hat{\phi}_{ij} + \beta \phi_{ij}`. The final
    aggregation is the sum of the corrected rows.

    This aggregator is stateful: :math:`\hat{\phi}` is kept between calls. Call :meth:`reset`
    whenever the number of tasks or the dtype changes.

    :param beta: EMA decay for :math:`\hat{\phi}`.
    :param eps: Small non-negative constant added to denominators.

    .. note::
        For each task :math:`i`, the visiting order of the other tasks :math:`j` is shuffled
        independently using the global PyTorch RNG (``torch.randperm``). Seed it with
        ``torch.manual_seed`` if you need reproducibility.
    """

    def __init__(self, beta: float = 0.5, eps: float = 1e-8) -> None:
        # All parameter validation and the actual algorithm live in the weighting; this class
        # only wires it into the gramian-based aggregator machinery.
        gradvac_weighting = GradVacWeighting(beta=beta, eps=eps)
        super().__init__(gradvac_weighting)
        self._gradvac_weighting = gradvac_weighting
        self.register_full_backward_pre_hook(raise_non_differentiable_error)

    @property
    def beta(self) -> float:
        """EMA decay for the similarity targets, delegated to the underlying weighting."""
        return self._gradvac_weighting.beta

    @beta.setter
    def beta(self, value: float) -> None:
        self._gradvac_weighting.beta = value

    @property
    def eps(self) -> float:
        """Denominator-stabilizing constant, delegated to the underlying weighting."""
        return self._gradvac_weighting.eps

    @eps.setter
    def eps(self, value: float) -> None:
        self._gradvac_weighting.eps = value

    def reset(self) -> None:
        """Discards the EMA state so that the next forward pass starts from zero targets."""
        self._gradvac_weighting.reset()

    def __repr__(self) -> str:
        return "GradVac(beta={!r}, eps={!r})".format(self.beta, self.eps)
| 72 | + |
| 73 | + |
class GradVacWeighting(Weighting[PSDMatrix]):
    r"""
    :class:`~torchjd.aggregation._weighting_bases.Weighting` giving the weights of
    :class:`~torchjd.aggregation.GradVac`.

    All required quantities (gradient norms, cosine similarities, and their updates after the
    vaccine correction) are derived purely from the Gramian, without needing the full Jacobian.
    If :math:`g_i^{(\mathrm{PC})} = \sum_k c_{ik} g_k`, then:

    .. math::

        \|g_i^{(\mathrm{PC})}\|^2 = \mathbf{c}_i G \mathbf{c}_i^\top,\qquad
        g_i^{(\mathrm{PC})} \cdot g_j = \mathbf{c}_i G_{:,j}

    where :math:`G` is the Gramian. The correction :math:`g_i^{(\mathrm{PC})} \mathrel{+}= w
    g_j` then becomes :math:`c_{ij} \mathrel{+}= w`, and the updated dot products follow
    immediately.

    This weighting is stateful: it keeps :math:`\hat{\phi}` across calls. Use :meth:`reset` when
    the number of tasks or dtype changes.

    :param beta: EMA decay for :math:`\hat{\phi}`.
    :param eps: Small non-negative constant added to denominators.
    """

    def __init__(self, beta: float = 0.5, eps: float = 1e-8) -> None:
        super().__init__()
        if not (0.0 <= beta <= 1.0):
            raise ValueError(f"Parameter `beta` must be in [0, 1]. Found beta={beta!r}.")
        if eps < 0.0:
            raise ValueError(f"Parameter `eps` must be non-negative. Found eps={eps!r}.")

        self._beta = beta
        self._eps = eps
        # EMA of the pairwise cosine-similarity targets \hat{\phi}, shape (m, m). Allocated
        # lazily in `_ensure_state` because m (the number of tasks) is only known at forward.
        self._phi_t: Tensor | None = None
        # (num_tasks, dtype) pair that `_phi_t` was allocated for; None before the first call.
        self._state_key: tuple[int, torch.dtype] | None = None

    @property
    def beta(self) -> float:
        """EMA decay for :math:`\\hat{\\phi}`; must stay in [0, 1]."""
        return self._beta

    @beta.setter
    def beta(self, value: float) -> None:
        if not (0.0 <= value <= 1.0):
            raise ValueError(f"Attribute `beta` must be in [0, 1]. Found beta={value!r}.")
        self._beta = value

    @property
    def eps(self) -> float:
        """Non-negative constant added to denominators for numerical stability."""
        return self._eps

    @eps.setter
    def eps(self, value: float) -> None:
        if value < 0.0:
            raise ValueError(f"Attribute `eps` must be non-negative. Found eps={value!r}.")
        self._eps = value

    def reset(self) -> None:
        """Clears EMA state so the next forward starts from zero targets."""

        self._phi_t = None
        self._state_key = None

    def forward(self, gramian: PSDMatrix, /) -> Tensor:
        """Computes GradVac weights from the Gramian of the task gradients.

        Note: this both consumes the global PyTorch RNG (per-task shuffling) and mutates the
        internal EMA buffer `_phi_t`, so successive calls with the same input may differ.
        """

        # Move all computations on cpu to avoid moving memory between cpu and gpu at each iteration
        device = gramian.device
        dtype = gramian.dtype
        cpu = torch.device("cpu")

        G = cast(PSDMatrix, gramian.to(device=cpu))
        m = G.shape[0]  # number of tasks (rows of the Jacobian the Gramian was built from)

        self._ensure_state(m, dtype)
        phi_t = cast(Tensor, self._phi_t)  # EMA targets \hat{\phi}, shape (m, m)

        beta = self._beta
        eps = self._eps

        # C[i, :] holds coefficients such that g_i^PC = sum_k C[i,k] * g_k (original gradients).
        # Initially each modified gradient equals the original, so C = I.
        C = torch.eye(m, device=cpu, dtype=dtype)

        for i in range(m):
            # Dot products of g_i^PC with every original g_j, shape (m,).
            cG = C[i] @ G

            # Visit the other tasks in a fresh random order (draws from the global CPU RNG).
            others = [j for j in range(m) if j != i]
            perm = torch.randperm(len(others))
            shuffled_js = [others[idx] for idx in perm.tolist()]

            for j in shuffled_js:
                dot_ij = cG[j]
                # ||g_i^PC||^2 = c_i G c_i^T; recomputed here because C[i] may have been
                # corrected while visiting earlier j's. Clamps guard against tiny negative
                # values caused by floating-point rounding before the sqrt.
                norm_i_sq = (cG * C[i]).sum()
                norm_i = norm_i_sq.clamp(min=0.0).sqrt()
                norm_j = G[j, j].clamp(min=0.0).sqrt()
                denom = norm_i * norm_j + eps
                phi_ijk = dot_ij / denom  # cosine similarity between g_i^PC and original g_j

                phi_hat = phi_t[i, j]
                if phi_ijk < phi_hat:
                    # Closed-form correction step size w from the GradVac paper: raises the
                    # similarity of g_i^PC with g_j towards the EMA target \hat{\phi}_{ij}.
                    sqrt_1_phi2 = (1.0 - phi_ijk * phi_ijk).clamp(min=0.0).sqrt()
                    sqrt_1_hat2 = (1.0 - phi_hat * phi_hat).clamp(min=0.0).sqrt()
                    denom_w = norm_j * sqrt_1_hat2 + eps
                    w = norm_i * (phi_hat * sqrt_1_phi2 - phi_ijk * sqrt_1_hat2) / denom_w
                    C[i, j] = C[i, j] + w  # g_i^PC += w * g_j, expressed on the coefficients
                    cG = cG + w * G[j]  # keep the cached dot products consistent with C[i]

                # EMA update uses the pre-correction similarity phi_ijk.
                phi_t[i, j] = (1.0 - beta) * phi_hat + beta * phi_ijk

        # Aggregating by summing rows: sum_i g_i^PC = sum_k (sum_i C[i,k]) * g_k, so the weight
        # of original gradient g_k is the k-th column sum of C.
        weights = C.sum(dim=0)
        return weights.to(device)

    def _ensure_state(self, m: int, dtype: torch.dtype) -> None:
        """(Re)allocates the EMA buffer whenever the task count or dtype changes."""

        key = (m, dtype)
        if self._state_key != key or self._phi_t is None:
            # Fresh state: all EMA targets start at zero (kept on CPU, like the computation).
            self._phi_t = torch.zeros(m, m, dtype=dtype)
            self._state_key = key
0 commit comments