feat(ocl): add lwf

tachyonicClock · tachyonicClock · commit f1c3cc999fcf · 2026-04-29T15:20:45.000+12:00
diff --git a/src/capymoa/ocl/strategy/__init__.py b/src/capymoa/ocl/strategy/__init__.py
@@ -5,5 +5,6 @@
 from ._rar import RAR
 from . import l2p
 from ._ewc import EWC
+from ._lwf import LWF
 
-__all__ = ["ExperienceReplay", "SLDA", "NCM", "GDumb", "RAR", "l2p", "EWC"]
+__all__ = ["ExperienceReplay", "SLDA", "NCM", "GDumb", "RAR", "l2p", "EWC", "LWF"]
diff --git a/src/capymoa/ocl/strategy/_lwf.py b/src/capymoa/ocl/strategy/_lwf.py
@@ -0,0 +1,105 @@
+from copy import deepcopy
+from typing import Optional
+
+import torch
+from torch import Tensor, nn
+
+from capymoa.base import BatchClassifier
+from capymoa.ocl.base import TrainTaskAware
+from capymoa.ocl.util.functional import hinton_distillation_loss
+from capymoa.stream._stream import Schema
+
+
+class LWF(BatchClassifier, nn.Module, TrainTaskAware):
+    """Learning Without Forgetting (LwF) [#f1]_ .
+
+    LwF is a regularisation-based continual learning strategy that distils predictions
+    from a frozen teacher snapshot of the previous task while learning the current task.
+
+    ..  [#f1] Li, Z., & Hoiem, D. (2016). Learning without forgetting. CoRR,
+        abs/1606.09282. http://arxiv.org/abs/1606.09282
+    """
+
+    def __init__(
+        self,
+        schema: Schema,
+        model: torch.nn.Module,
+        optimiser: torch.optim.Optimizer,
+        alpha: float = 1.0,
+        temperature: float = 2.0,
+        device: torch.device = torch.device("cpu"),
+    ) -> None:
+        """Construct an LWF learner.
+
+        :param schema: Stream schema used by the classifier interface.
+        :param model: Torch model that outputs class logits.
+        :param optimiser: Optimiser used to update ``model`` parameters.
+        :param alpha: Weight of the distillation loss term.
+        :param temperature: Distillation temperature.
+        :param device: Compute device.
+        """
+        super().__init__(schema, 0)  # NOTE(review): 0 is presumably the seed; confirm
+        nn.Module.__init__(self)
+        if alpha < 0:
+            raise ValueError("alpha must be non-negative.")
+        if temperature <= 0:
+            raise ValueError("temperature must be greater than zero.")
+
+        self.device = device
+
+        self._alpha = alpha
+        self._temperature = temperature
+
+        self._optimiser = optimiser
+        self._model = model  # NOTE(review): not moved to ``device``; teacher is
+        self._criterion = torch.nn.CrossEntropyLoss()
+
+        self._teacher: Optional[torch.nn.Module] = None  # set by on_train_task
+        self._train_task = 0  # id most recently passed to on_train_task
+
+    def batch_train(self, x: Tensor, y: Tensor) -> None:
+        self._model.train()
+        self._optimiser.zero_grad()
+
+        student_logits = self._model(x)
+        task_loss = self._criterion(student_logits, y)  # cross-entropy
+        total_loss = task_loss + self._alpha * self._distillation_loss(
+            x, student_logits
+        )  # distillation term is zero until a teacher exists
+
+        total_loss.backward()
+        self._optimiser.step()
+
+    @torch.no_grad()
+    def batch_predict_proba(self, x: Tensor) -> Tensor:
+        self._model.eval()
+        y_hat = self._model(x)
+        return torch.softmax(y_hat, dim=1)  # logits -> per-class probabilities
+
+    def on_train_task(self, task_id: int) -> None:
+        if task_id > 0:  # no snapshot is taken for task 0
+            self._teacher = (  # frozen copy of the model as it is right now
+                deepcopy(self._model).to(self.device).eval().requires_grad_(False)
+            )
+        self._train_task = task_id
+
+    @torch.no_grad()  # teacher outputs are fixed targets; no autograd graph
+    def _teacher_forward(self, x: Tensor) -> Tensor:
+        if self._teacher is None:
+            raise RuntimeError("Teacher model is not available before task 1.")
+        return self._teacher(x)
+
+    def _distillation_loss(self, x: Tensor, student_logits: Tensor) -> Tensor:
+        if self._teacher is None:
+            return torch.tensor(0.0, device=self.device)  # no teacher: zero penalty
+
+        teacher_logits = self._teacher_forward(x)
+
+        return hinton_distillation_loss(
+            teacher_logits=teacher_logits,
+            student_logits=student_logits,
+            temperature=self._temperature,
+        )
+
+    def __str__(self) -> str:
+        return f"LWF(alpha={self._alpha}, temperature={self._temperature})"
diff --git a/src/capymoa/ocl/util/functional.py b/src/capymoa/ocl/util/functional.py
@@ -0,0 +1,35 @@
+"""A collection of functional utilities for OCL."""
+
+from torch import Tensor
+from torch.nn.functional import kl_div, log_softmax
+
+
+def hinton_distillation_loss(
+    teacher_logits: Tensor, student_logits: Tensor, temperature: float = 1.0
+) -> Tensor:
+    """Hinton's distillation loss [#f1]_.
+
+    .. math::
+        L_{KD} = T^2 KL(softmax(z_t / T) || softmax(z_s / T))
+
+    where :math:`T` is the temperature, :math:`z_s` are the student logits, and
+    :math:`z_t` are the teacher logits.
+
+    ..  [#f1] Hinton, G., Vinyals, O., & Dean, J. (2015). Distilling the Knowledge in a
+        Neural Network. arXiv:1503.02531 [Cs, Stat]. http://arxiv.org/abs/1503.02531
+
+    :param teacher_logits: Teacher logits of shape ``(batch_size, num_classes)``.
+    :param student_logits: Student logits of shape ``(batch_size, num_classes)``.
+    :param temperature: Temperature for distillation. Higher values produce softer
+        probability distributions.
+    :return: The distillation loss as a scalar tensor.
+    """
+    return (
+        kl_div(
+            log_softmax(student_logits / temperature, dim=1),  # Student log-probs
+            log_softmax(teacher_logits / temperature, dim=1),  # Teacher log-probs
+            log_target=True,
+            reduction="batchmean",  # Mathematically correct unlike the default
+        )
+        * temperature**2
+    )
diff --git a/tests/ocl/test_strategy.py b/tests/ocl/test_strategy.py
@@ -11,7 +11,7 @@
 from capymoa.classifier import Finetune, HoeffdingTree
 from capymoa.ocl.datasets import TinySplitMNIST
 from capymoa.ocl.evaluation import ocl_train_eval_loop
-from capymoa.ocl.strategy import ExperienceReplay, SLDA, NCM, GDumb, RAR, EWC
+from capymoa.ocl.strategy import ExperienceReplay, SLDA, NCM, GDumb, RAR, EWC, LWF
 from capymoa.stream import Schema
 
 import torch
@@ -127,6 +127,11 @@ def _new_rar(schema):
         Result(71.99, 47.20, 25.3),
         task_mask=True,
     ),
+    Case(
+        "LWF",
+        new_constructor(LWF, lr=0.10, alpha=4.66, temperature=1.67),
+        Result(36.49, 25.09, 17.59),
+    ),
 ]