Skip to content

Commit a60011f

Browse files
TimDettmers and claude
committed
feat: Add distributed pipeline engine with NCCL/gloo support
DistributedPipelineEngine runs one pipeline stage per process, communicating activations and gradients via torch.distributed send/recv. Supports both NCCL (for multi-GPU) and gloo (for single-GPU multi-process testing) backends. Verified with torchrun --nproc_per_node=2: all 4 layer gradients match single-device reference within 1e-5 tolerance. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent e408054 commit a60011f

File tree

2 files changed

+278
-0
lines changed

2 files changed

+278
-0
lines changed

bitsandbytes/pipeline.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,143 @@ def wrap_stages(stage_modules, cpu_offload=True):
334334
return [CheckpointedStage(s, cpu_offload=cpu_offload) for s in stage_modules]
335335

336336

337+
class DistributedPipelineEngine:
    """Distributed 1F1B pipeline engine using torch.distributed point-to-point ops.

    Each process runs one pipeline stage: rank 0 is the first stage and
    rank ``world_size - 1`` is the last. Activations are sent forward and
    gradients sent backward between adjacent ranks via
    ``torch.distributed.send``/``recv``. Works with the NCCL backend
    (multi-GPU) and the gloo backend (tensors are staged through CPU memory
    on the wire). Designed for use with torchrun or torch.distributed.launch.

    Args:
        stage_module: The nn.Module for this process's stage.
        rank: This process's rank (equals its stage index).
        world_size: Total number of stages/processes.
        loss_fn: Loss function (only used by the last stage).
        num_micro_batches: Number of micro-batches per step.
        hidden_shape: Full shape of the inter-stage activation tensor,
            *including* the micro-batch dimension (the receive buffer is
            allocated as ``torch.empty(*hidden_shape)``). Required on every
            rank except rank 0.
        dtype: Data type of inter-stage tensors (default: float32).
    """

    def __init__(
        self,
        stage_module: nn.Module,
        rank: int,
        world_size: int,
        loss_fn=None,
        num_micro_batches: int = 4,
        hidden_shape: tuple = None,
        dtype: torch.dtype = torch.float32,
    ):
        self.stage_module = stage_module
        self.rank = rank
        self.world_size = world_size
        self.loss_fn = loss_fn
        self.num_micro_batches = num_micro_batches
        self.hidden_shape = hidden_shape
        self.dtype = dtype
        # Fix: the original always computed `rank % torch.cuda.device_count()`,
        # which raises ZeroDivisionError on CPU-only hosts even though the
        # gloo path in step() supports them. Fall back to CPU when no GPU
        # is available.
        if torch.cuda.is_available() and torch.cuda.device_count() > 0:
            self.device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
        else:
            self.device = torch.device("cpu")

        # Per-rank list of ("F"/"B", micro_batch_index) ops in 1F1B order.
        schedule = generate_1f1b_schedule(world_size, num_micro_batches)
        self.my_schedule = schedule[rank]

    def step(self, micro_batch_inputs=None, micro_batch_labels=None):
        """Run one distributed training step over all micro-batches.

        Gradients accumulate into the stage module's parameters; each
        micro-batch loss is scaled by 1/M so the accumulated gradient
        matches full-batch training.

        Args:
            micro_batch_inputs: List of M input tensors (only used by rank 0).
            micro_batch_labels: List of M label tensors (only used by last rank).

        Returns:
            dict with "loss" (mean over micro-batches) and "losses"
            (per-micro-batch values). Only meaningful on the last rank;
            other ranks report 0.0 / [].
        """
        import torch.distributed as dist

        M = self.num_micro_batches
        s = self.rank
        S = self.world_size

        fwd_inputs = [None] * M
        fwd_outputs = [None] * M
        losses = [None] * M

        # gloo's send/recv only supports CPU tensors, so on non-NCCL
        # backends every transfer is staged through host memory.
        backend = dist.get_backend()
        use_cpu_comm = backend != "nccl"

        def _send(tensor, dst):
            # Blocking point-to-point send; pairs with _recv on the peer.
            if use_cpu_comm:
                dist.send(tensor.cpu(), dst=dst)
            else:
                dist.send(tensor, dst=dst)

        def _recv(shape, src, device, dtype):
            # Blocking receive into a freshly allocated buffer of `shape`.
            if use_cpu_comm:
                buf = torch.empty(*shape, dtype=dtype)
                dist.recv(buf, src=src)
                return buf.to(device)
            else:
                buf = torch.empty(*shape, device=device, dtype=dtype)
                dist.recv(buf, src=src)
                return buf

        for op, m in self.my_schedule:
            if op == "F":
                # --- Forward pass for micro-batch m ---
                if s == 0:
                    inp = micro_batch_inputs[m].to(self.device)
                else:
                    # Receive activation from previous stage.
                    inp = _recv(self.hidden_shape, src=s - 1,
                                device=self.device, dtype=self.dtype)

                # Mark the stage boundary as a leaf so inp.grad is populated
                # during backward and can be sent upstream.
                inp = inp.requires_grad_(True)
                fwd_inputs[m] = inp

                output = self.stage_module(inp)
                fwd_outputs[m] = output

                if s < S - 1:
                    # Send activation to next stage (detached: the graph
                    # does not cross process boundaries).
                    _send(output.detach(), dst=s + 1)

                # Last stage: compute the micro-batch loss.
                if s == S - 1 and self.loss_fn is not None and micro_batch_labels is not None:
                    losses[m] = self.loss_fn(output, micro_batch_labels[m].to(self.device))

            elif op == "B":
                # --- Backward pass for micro-batch m ---
                output = fwd_outputs[m]
                inp = fwd_inputs[m]

                if s == S - 1:
                    # Last stage: backward from the (1/M-scaled) loss.
                    if losses[m] is not None:
                        scaled_loss = losses[m] / M
                        scaled_loss.backward(retain_graph=False)
                else:
                    # Receive output gradient from the next stage.
                    grad = _recv(output.shape, src=s + 1,
                                 device=self.device, dtype=output.dtype)
                    output.backward(grad, retain_graph=False)

                if s > 0 and inp.grad is not None:
                    # Send input gradient to the previous stage.
                    # NOTE(review): if inp.grad were ever None here, the
                    # previous stage's recv would block forever — confirm
                    # upstream stages always produce a gradient.
                    _send(inp.grad.detach(), dst=s - 1)

        # Aggregate losses (non-empty only on the last rank).
        valid_losses = [l.item() for l in losses if l is not None]
        avg_loss = sum(valid_losses) / len(valid_losses) if valid_losses else 0.0

        return {
            "loss": avg_loss,
            "losses": valid_losses,
        }
337474
class SequentialStage(nn.Module):
338475
"""A pipeline stage that sequentially runs a list of layers.
339476

tests/test_distributed_pipeline.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
"""Distributed pipeline parallelism test.
2+
3+
Run with: torchrun --nproc_per_node=2 tests/test_distributed_pipeline.py
4+
5+
Verifies that the distributed pipeline engine produces the same
6+
gradients as single-process training with gradient accumulation.
7+
"""
8+
9+
import sys
10+
11+
import torch
12+
import torch.distributed as dist
13+
import torch.nn as nn
14+
15+
from bitsandbytes.pipeline import DistributedPipelineEngine, SequentialStage
16+
17+
18+
class SimpleLayer(nn.Module):
    """One pipeline layer: a single square, bias-free linear projection."""

    def __init__(self, dim):
        super().__init__()
        # dim -> dim projection; no bias so gradients live only in `weight`.
        self.linear = nn.Linear(dim, dim, bias=False)

    def forward(self, x):
        projected = self.linear(x)
        return projected
def run_test():
    """End-to-end check of DistributedPipelineEngine across 2 processes.

    Rank 0 runs layers 0-1, rank 1 runs layers 2-3. After one pipeline
    step, rank 1 ships its gradients and loss to rank 0, which recomputes
    the same 4-layer forward/backward locally (with 1/M loss scaling) and
    compares every layer's weight gradient within 1e-5 tolerance.
    Exits non-zero on rank 0 if any layer mismatches.
    """
    # Use gloo for point-to-point ops; NCCL send/recv can fail on single-GPU
    dist.init_process_group(backend="gloo")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    assert world_size == 2, f"Requires 2 processes, got {world_size}"

    # NOTE(review): requires at least one CUDA device — device_count() of 0
    # would make this modulo divide by zero. Confirm CI always has a GPU.
    device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
    torch.cuda.set_device(device)

    dim = 32    # hidden width of every layer
    M = 4       # micro-batches per step
    batch = 4   # rows per micro-batch

    # Create layers with shared seeds so all ranks have the same initial weights
    torch.manual_seed(42)
    all_layers = [SimpleLayer(dim) for _ in range(4)]

    # Each rank keeps only its own half of the stack.
    if rank == 0:
        my_layers = all_layers[:2]
    else:
        my_layers = all_layers[2:]

    my_stage = SequentialStage(my_layers).to(device)
    my_stage.zero_grad()

    # Create identical inputs/labels on all ranks
    torch.manual_seed(123)
    micro_inputs = [torch.randn(batch, dim) for _ in range(M)]
    micro_labels = [torch.randn(batch, dim) for _ in range(M)]

    # Plain MSE; the engine applies the 1/M scaling itself.
    loss_fn = lambda out, labels: (out - labels).pow(2).mean()

    # Run distributed pipeline
    engine = DistributedPipelineEngine(
        stage_module=my_stage,
        rank=rank,
        world_size=world_size,
        loss_fn=loss_fn,
        num_micro_batches=M,
        hidden_shape=(batch, dim),  # full activation shape, incl. batch dim
        dtype=torch.float32,
    )

    result = engine.step(
        micro_batch_inputs=micro_inputs if rank == 0 else None,
        micro_batch_labels=micro_labels if rank == world_size - 1 else None,
    )

    # Collect per-layer gradients, keyed by global layer index (0-3).
    pipe_grads = {}
    for i, layer in enumerate(my_layers):
        layer_idx = i + (2 if rank == 1 else 0)
        if layer.linear.weight.grad is not None:
            pipe_grads[layer_idx] = layer.linear.weight.grad.clone()

    # Exchange gradients and loss: rank 1 sends to rank 0 (CPU for gloo)
    # Send/recv order and tags must match exactly on both branches.
    if rank == 0:
        for layer_idx in [2, 3]:
            buf = torch.empty(dim, dim)  # CPU tensor for gloo
            dist.recv(buf, src=1, tag=layer_idx)
            pipe_grads[layer_idx] = buf.to(device)
        # Receive loss from last rank
        loss_buf = torch.empty(1)
        dist.recv(loss_buf, src=world_size - 1, tag=100)
        pipeline_loss = loss_buf.item()
    else:
        for layer_idx in [2, 3]:
            dist.send(pipe_grads[layer_idx].cpu(), dst=0, tag=layer_idx)
        # Send loss to rank 0
        dist.send(torch.tensor([result["loss"]]), dst=0, tag=100)
        pipeline_loss = result["loss"]

    # Rank 0 computes reference and checks
    if rank == 0:
        # Re-seed with 42 so the reference layers start from the exact
        # same weights as the pipeline layers above.
        torch.manual_seed(42)
        ref_layers = [SimpleLayer(dim).to(device) for _ in range(4)]
        for ref in ref_layers:
            ref.zero_grad()

        # Plain gradient accumulation over M micro-batches, loss scaled
        # by 1/M to mirror the engine's scaling.
        for m in range(M):
            x = micro_inputs[m].to(device)
            for ref in ref_layers:
                x = ref(x)
            loss = loss_fn(x, micro_labels[m].to(device)) / M
            loss.backward()

        ref_grads = [ref.linear.weight.grad.clone() for ref in ref_layers]

        all_pass = True
        for i in range(4):
            ref_g = ref_grads[i]
            pipe_g = pipe_grads.get(i)
            if pipe_g is None:
                print(f"FAIL: Layer {i} — no gradient")
                all_pass = False
            elif not torch.allclose(ref_g, pipe_g, atol=1e-5, rtol=1e-5):
                max_diff = (ref_g - pipe_g).abs().max().item()
                print(f"FAIL: Layer {i} — max diff: {max_diff:.2e}")
                all_pass = False
            else:
                print(f"PASS: Layer {i} — gradients match")

        print(f"\nPipeline loss: {pipeline_loss:.6f}")
        print(f"Result: {'ALL PASSED' if all_pass else 'SOME FAILED'}")

        if not all_pass:
            # NOTE(review): only rank 0 exits non-zero; rank 1 still
            # reaches the barrier and exits 0 — confirm the harness keys
            # off rank 0's exit status.
            sys.exit(1)

    # Keep both ranks alive until the comparison is done, then tear down.
    dist.barrier()
    dist.destroy_process_group()
# Entry point: each torchrun-spawned process executes the full test.
if __name__ == "__main__":
    run_test()

0 commit comments

Comments
 (0)