Skip to content

Commit e408054

Browse files
TimDettmers and claude committed
feat: Add pipeline-aware gradient checkpointing
Adds CheckpointedStage and PipelineCheckpointer that wrap pipeline stages with checkpoint_cpu_offload. Stage boundary activations stay on GPU for inter-stage communication; internal layer activations are offloaded to CPU during forward and reloaded during backward. Also fixes checkpoint_cpu_offload backward to use torch.autograd.backward instead of torch.autograd.grad, which properly accumulates gradients into nn.Module parameters (not just input tensors). Updates the memory test to use lightweight layers with large intermediate activations where savings are clearly measurable. 4 new tests: checkpointed gradient correctness (CPU offload and standard), memory reduction verification, eval mode bypass. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 70fef4f commit e408054

File tree

4 files changed

+287
-23
lines changed

4 files changed

+287
-23
lines changed

bitsandbytes/pipeline.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,67 @@ def split_model_layers(layers, num_stages):
273273
return stage_layers
274274

275275

276+
class CheckpointedStage(nn.Module):
    """Pipeline stage with gradient checkpointing and optional CPU offload.

    Wraps a stage module's forward with checkpoint_cpu_offload, so that
    intermediate activations within the stage are offloaded to CPU during
    forward and reloaded+recomputed during backward. Stage boundary
    activations (input/output tensors) stay on GPU — they're managed by
    the PipelineEngine for inter-stage communication.

    Args:
        stage_module: The stage module to wrap.
        cpu_offload: If True, use checkpoint_cpu_offload (offloads to CPU).
            If False, use torch.utils.checkpoint (GPU-only recomputation).
    """

    def __init__(self, stage_module, cpu_offload=True):
        super().__init__()
        self.stage_module = stage_module
        self.cpu_offload = cpu_offload

    def forward(self, x):
        # Checkpointing only pays off when a backward pass will follow.
        # Guarding on torch.is_grad_enabled() additionally skips the useless
        # recomputation machinery when the stage runs in training mode inside
        # a torch.no_grad() block, where torch.utils.checkpoint would
        # otherwise warn that nothing requires grad.
        if self.training and torch.is_grad_enabled():
            if self.cpu_offload:
                # Imported lazily; presumably to avoid a circular import
                # between bitsandbytes.pipeline and bitsandbytes.training
                # — NOTE(review): confirm, and hoist to module level if not.
                from bitsandbytes.training import checkpoint_cpu_offload

                return checkpoint_cpu_offload(self.stage_module, x)
            # use_reentrant=False handles inputs that do not require grad
            # and accumulates parameter gradients correctly.
            return torch.utils.checkpoint.checkpoint(
                self.stage_module, x, use_reentrant=False,
            )
        # Eval (or no-grad) path: plain forward, no checkpointing.
        return self.stage_module(x)
306+
307+
308+
class PipelineCheckpointer:
    """Wraps pipeline stages with gradient checkpointing.

    Exposes a single static helper that replaces each stage module with a
    CheckpointedStage. Activations passed between stages stay on the GPU
    for pipeline communication; only each stage's internal layer
    activations are checkpointed.

    Usage:
        stages = [SequentialStage(layers[:2]), SequentialStage(layers[2:])]
        stages = PipelineCheckpointer.wrap_stages(stages, cpu_offload=True)
        engine = PipelineEngine(stages, loss_fn=loss_fn, ...)
    """

    @staticmethod
    def wrap_stages(stage_modules, cpu_offload=True):
        """Wrap each stage with gradient checkpointing.

        Args:
            stage_modules: List of nn.Module stage modules.
            cpu_offload: If True, offload activations to CPU. If False,
                use standard gradient checkpointing (GPU recomputation only).

        Returns:
            List of CheckpointedStage modules, in the same order as the input.
        """
        wrapped = []
        for stage in stage_modules:
            wrapped.append(CheckpointedStage(stage, cpu_offload=cpu_offload))
        return wrapped
335+
336+
276337
class SequentialStage(nn.Module):
277338
"""A pipeline stage that sequentially runs a list of layers.
278339

bitsandbytes/training.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -88,19 +88,16 @@ def backward(ctx, *grad_outputs):
8888
if isinstance(outputs, torch.Tensor):
8989
outputs = (outputs,)
9090

91-
# Compute gradients
92-
input_grads = torch.autograd.grad(
93-
outputs,
94-
[inp for inp in inputs if isinstance(inp, torch.Tensor) and inp.requires_grad],
95-
grad_outputs=grad_outputs,
96-
)
97-
98-
# Map gradients back to original input positions
99-
grad_iter = iter(input_grads)
91+
# Use backward() to accumulate gradients into all leaf parameters
92+
# (not just inputs). This is needed when the checkpointed function
93+
# is an nn.Module with trainable parameters.
94+
torch.autograd.backward(outputs, grad_outputs)
95+
96+
# Collect input gradients
10097
result = [None, None] # for run_function and preserve_rng_state
101-
for cpu_input, req_grad in zip(ctx.cpu_inputs, ctx.input_requires_grad):
102-
if isinstance(cpu_input, torch.Tensor) and req_grad:
103-
result.append(next(grad_iter))
98+
for inp, req_grad in zip(inputs, ctx.input_requires_grad):
99+
if isinstance(inp, torch.Tensor) and req_grad:
100+
result.append(inp.grad if inp.grad is not None else torch.zeros_like(inp))
104101
else:
105102
result.append(None)
106103

tests/test_pipeline.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
import torch.nn as nn
1313

1414
from bitsandbytes.pipeline import (
15+
CheckpointedStage,
16+
PipelineCheckpointer,
1517
PipelineEngine,
1618
SequentialStage,
1719
generate_1f1b_schedule,
@@ -481,5 +483,189 @@ def test_multiple_steps(self):
481483
assert (layer.linear.weight.grad != 0).any()
482484

483485

486+
# ─── Pipeline Checkpointing Tests ─────────────────────────────────────────
487+
488+
class WideLayer(nn.Module):
    """Two-layer MLP block whose hidden activation is deliberately large.

    The (batch, intermediate) hidden tensor dominates activation memory,
    which makes checkpointing savings measurable in the memory tests.
    """

    def __init__(self, dim, intermediate):
        super().__init__()
        self.up = nn.Linear(dim, intermediate, bias=False)
        self.down = nn.Linear(intermediate, dim, bias=False)

    def forward(self, x):
        # Expand to the wide hidden size, apply ReLU, project back down.
        hidden = self.up(x)
        activated = torch.relu(hidden)
        return self.down(activated)
498+
499+
500+
class TestPipelineCheckpointer:
    """Integration tests for CheckpointedStage / PipelineCheckpointer.

    NOTE(review): every test here allocates CUDA tensors; presumably the
    module applies a CUDA skip marker elsewhere — confirm.
    """

    @staticmethod
    def _loss_fn(out, labels):
        """Mean-squared-error loss shared by every test in this class."""
        return (out - labels).pow(2).mean()

    @staticmethod
    def _reference_grads(dim, layers, micro_inputs, micro_labels, loss_fn):
        """Reference gradients via single-device gradient accumulation.

        Copies each layer's weight into a fresh SimpleLayer, runs all
        micro-batches sequentially with the loss scaled by 1/M (matching the
        pipeline's micro-batch averaging), and returns the accumulated
        weight gradients, one tensor per layer.
        """
        M = len(micro_inputs)
        ref_layers = [SimpleLayer(dim).cuda() for _ in layers]
        for ref, orig in zip(ref_layers, layers):
            ref.linear.weight.data.copy_(orig.linear.weight.data)
        for ref in ref_layers:
            ref.zero_grad()
        for m in range(M):
            x = micro_inputs[m]
            for ref in ref_layers:
                x = ref(x)
            loss = loss_fn(x, micro_labels[m]) / M
            loss.backward()
        return [ref.linear.weight.grad.clone() for ref in ref_layers]

    @staticmethod
    def _assert_grads_match(layers, ref_grads, context):
        """Assert each layer's accumulated gradient matches the reference."""
        for i, layer in enumerate(layers):
            assert layer.linear.weight.grad is not None, f"Layer {i}: no gradient"
            torch.testing.assert_close(
                ref_grads[i], layer.linear.weight.grad,
                atol=1e-5, rtol=1e-5,
                msg=f"Layer {i}: gradient mismatch {context}",
            )

    def _run_checkpointed_pipeline(self, layers, micro_inputs, micro_labels, cpu_offload):
        """Build a 2-stage checkpointed pipeline over `layers` and run one step."""
        for layer in layers:
            layer.zero_grad()
        stages = [SequentialStage(layers[:2]).cuda(), SequentialStage(layers[2:]).cuda()]
        stages = PipelineCheckpointer.wrap_stages(stages, cpu_offload=cpu_offload)
        engine = PipelineEngine(
            stages, loss_fn=self._loss_fn, num_micro_batches=len(micro_inputs)
        )
        # Checkpointing is only active in training mode.
        for s in stages:
            s.train()
        engine.step(micro_inputs, micro_labels)

    def test_checkpointed_gradient_correctness(self):
        """Checkpointed pipeline should produce identical gradients to reference."""
        dim = 32
        M = 4
        torch.manual_seed(42)

        layers = [SimpleLayer(dim).cuda() for _ in range(4)]
        micro_inputs = [torch.randn(4, dim, device="cuda") for _ in range(M)]
        micro_labels = [torch.randn(4, dim, device="cuda") for _ in range(M)]

        ref_grads = self._reference_grads(
            dim, layers, micro_inputs, micro_labels, self._loss_fn
        )
        self._run_checkpointed_pipeline(
            layers, micro_inputs, micro_labels, cpu_offload=True
        )
        self._assert_grads_match(layers, ref_grads, "with checkpointing")

    def test_checkpointed_no_cpu_offload(self):
        """Checkpointing without CPU offload should also produce correct gradients."""
        dim = 32
        M = 4
        torch.manual_seed(42)

        layers = [SimpleLayer(dim).cuda() for _ in range(4)]
        micro_inputs = [torch.randn(4, dim, device="cuda") for _ in range(M)]
        micro_labels = [torch.randn(4, dim, device="cuda") for _ in range(M)]

        ref_grads = self._reference_grads(
            dim, layers, micro_inputs, micro_labels, self._loss_fn
        )
        self._run_checkpointed_pipeline(
            layers, micro_inputs, micro_labels, cpu_offload=False
        )
        self._assert_grads_match(layers, ref_grads, "without CPU offload")

    def test_checkpointed_memory_reduction(self):
        """Checkpointing should reduce peak GPU memory for wide layers."""
        dim = 64
        intermediate = 4096  # Large intermediate to make memory difference visible
        M = 4
        batch = 32
        torch.manual_seed(42)

        def run_pipeline(use_checkpoint):
            """Run one pipeline step; return peak GPU memory allocated."""
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

            layers = [WideLayer(dim, intermediate).cuda() for _ in range(4)]
            micro_inputs = [torch.randn(batch, dim, device="cuda") for _ in range(M)]
            micro_labels = [torch.randn(batch, dim, device="cuda") for _ in range(M)]

            for layer in layers:
                layer.zero_grad()

            stages = [
                SequentialStage(layers[:2]).cuda(),
                SequentialStage(layers[2:]).cuda(),
            ]
            if use_checkpoint:
                stages = PipelineCheckpointer.wrap_stages(stages, cpu_offload=True)
                for s in stages:
                    s.train()

            engine = PipelineEngine(
                stages, loss_fn=self._loss_fn, num_micro_batches=M
            )
            engine.step(micro_inputs, micro_labels)

            peak_mem = torch.cuda.max_memory_allocated()

            # Verify gradients exist — the run must have done real work.
            for layer in layers:
                for p in layer.parameters():
                    assert p.grad is not None

            # Drop references before the next run so its empty_cache()
            # can actually release this run's allocations.
            del layers, micro_inputs, micro_labels, stages, engine
            torch.cuda.empty_cache()

            return peak_mem

        peak_no_ckpt = run_pipeline(use_checkpoint=False)
        peak_with_ckpt = run_pipeline(use_checkpoint=True)

        # Checkpointing should use less peak memory.
        assert peak_with_ckpt < peak_no_ckpt, (
            f"Checkpointing should reduce memory: "
            f"without={peak_no_ckpt / 1e6:.1f}MB, with={peak_with_ckpt / 1e6:.1f}MB"
        )

    def test_eval_mode_skips_checkpointing(self):
        """In eval mode, checkpointed stages should skip checkpointing."""
        dim = 32
        torch.manual_seed(42)

        layers = [SimpleLayer(dim).cuda() for _ in range(4)]
        stage = SequentialStage(layers[:2]).cuda()
        ckpt_stage = CheckpointedStage(stage, cpu_offload=True)

        x = torch.randn(4, dim, device="cuda")

        # Training mode: forward goes through the checkpointing path.
        ckpt_stage.train()
        out_train = ckpt_stage(x)

        # Eval mode: plain forward, no checkpointing.
        ckpt_stage.eval()
        out_eval = ckpt_stage(x)

        # Both paths must compute the same function.
        torch.testing.assert_close(out_train, out_eval, atol=1e-6, rtol=1e-6)
484670
if __name__ == "__main__":
    # pytest.main returns the exit status; propagate it so that running
    # this file directly reports failures to the shell/CI correctly.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))

tests/test_training.py

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -60,20 +60,41 @@ def test_with_nn_module(self):
6060
assert x.grad.shape == x.shape
6161

6262
def test_memory_reduction(self):
63-
"""CPU offload should use less GPU memory than standard checkpoint."""
64-
dim = 1024
65-
66-
# Standard forward (saves activations on GPU)
63+
"""CPU offload should reduce GPU memory by offloading activations.
64+
65+
Uses lightweight parameterized functions that produce large
66+
intermediate activations so the saved-activation memory
67+
dominates over parameter gradient memory.
68+
"""
69+
dim = 64
70+
expand = 2048
71+
n_layers = 8
72+
73+
class ExpandLayer(nn.Module):
74+
"""Lightweight params but large intermediate activations."""
75+
76+
def __init__(self):
77+
super().__init__()
78+
self.w = nn.Parameter(torch.randn(dim) * 0.01)
79+
80+
def forward(self, x):
81+
# x: [batch, dim]. Expand to [batch, dim, expand], sum back.
82+
# The expanded tensor is large and saved for backward.
83+
h = x * self.w # element-wise, saves x and w for backward
84+
h = h.unsqueeze(-1).expand(-1, -1, expand) # large activation
85+
h = h.mean(-1) # back to [batch, dim]
86+
return h
87+
88+
# Standard forward (saves all expanded activations on GPU)
6789
torch.cuda.empty_cache()
6890
torch.cuda.reset_peak_memory_stats()
6991

70-
layers = nn.ModuleList([nn.Linear(dim, dim).cuda() for _ in range(4)])
71-
x = torch.randn(32, dim, device="cuda", requires_grad=True)
92+
layers = nn.ModuleList([ExpandLayer().cuda() for _ in range(n_layers)])
93+
x = torch.randn(512, dim, device="cuda", requires_grad=True)
7294

73-
# Standard: all activations stay on GPU
7495
h = x
7596
for layer in layers:
76-
h = torch.nn.functional.gelu(layer(h))
97+
h = layer(h)
7798
h.sum().backward()
7899
peak_standard = torch.cuda.max_memory_allocated()
79100

@@ -85,15 +106,14 @@ def test_memory_reduction(self):
85106
torch.cuda.reset_peak_memory_stats()
86107

87108
# CPU offload: activations go to CPU
88-
x = torch.randn(32, dim, device="cuda", requires_grad=True)
109+
x = torch.randn(512, dim, device="cuda", requires_grad=True)
89110
h = x
90111
for layer in layers:
91-
h = checkpoint_cpu_offload(lambda inp, l=layer: torch.nn.functional.gelu(l(inp)), h)
112+
h = checkpoint_cpu_offload(layer, h)
92113
h.sum().backward()
93114
peak_offload = torch.cuda.max_memory_allocated()
94115

95116
# CPU offload should use less peak memory
96-
# Allow some margin since PyTorch internal allocations vary
97117
assert peak_offload < peak_standard, (
98118
f"CPU offload ({peak_offload / 1e6:.1f} MB) should use less peak memory "
99119
f"than standard ({peak_standard / 1e6:.1f} MB)"

0 commit comments

Comments
 (0)