Skip to content

Commit 814e665

Browse files
TimDettmers and claude committed
feat: Per-rank model loading for pipeline parallelism
KbitLoraModel now supports partial layer loading via layer_range, include_embed, and include_lm_head parameters. Each pipeline rank only quantizes and stores the layers it needs, reducing per-GPU memory by roughly 1/num_stages compared to loading the full model.

- layer_range=(start, end): only load decoder layers [start, end)
- include_embed=False: skip embedding (non-first stages)
- include_lm_head=False: skip LM head + final norm (non-last stages)
- _layer_forward uses local 0-based indexing within loaded range
- Updated train_pipeline.py to use per-rank loading

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 20d2b7e commit 814e665

File tree

2 files changed

+118
-76
lines changed

2 files changed

+118
-76
lines changed

bitsandbytes/kbit_lora.py

Lines changed: 65 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ class KbitLoraModel(nn.Module):
4949
cpu_offload: If True, offload inter-layer activations to CPU during
5050
forward and reload during backward. Saves GPU memory at cost
5151
of CPU<->GPU bandwidth. Default False.
52+
layer_range: Optional tuple (start, end) to only load decoder layers
53+
[start, end). Used for pipeline parallelism so each rank only
54+
loads its assigned layers. Default None (all layers).
55+
include_embed: Whether to keep the embedding layer. Default True.
56+
Set False for non-first pipeline stages.
57+
include_lm_head: Whether to quantize and keep the LM head. Default True.
58+
Set False for non-last pipeline stages.
5259
"""
5360

5461
def __init__(
@@ -63,6 +70,9 @@ def __init__(
6370
ce_chunk_size: int = 8192,
6471
compute_dtype: torch.dtype = torch.bfloat16,
6572
cpu_offload: bool = False,
73+
layer_range: Optional[tuple[int, int]] = None,
74+
include_embed: bool = True,
75+
include_lm_head: bool = True,
6676
):
6777
super().__init__()
6878

@@ -87,6 +97,8 @@ def __init__(
8797
self.ce_chunk_size = ce_chunk_size
8898
self.compute_dtype = compute_dtype
8999
self.cpu_offload = cpu_offload
100+
self.include_embed = include_embed
101+
self.include_lm_head = include_lm_head
90102

91103
# Extract model dimensions from config
92104
self.hidden_size = config.hidden_size
@@ -102,9 +114,21 @@ def __init__(
102114
self.rope_theta = getattr(config, "rope_theta", 10000.0)
103115
self.has_qk_norm = self.model_type == "qwen3"
104116

117+
# Determine layer range
118+
total_layers = config.num_hidden_layers
119+
if layer_range is not None:
120+
self._layer_start, self._layer_end = layer_range
121+
assert 0 <= self._layer_start < self._layer_end <= total_layers
122+
else:
123+
self._layer_start, self._layer_end = 0, total_layers
124+
self._num_loaded_layers = self._layer_end - self._layer_start
125+
105126
# Keep reference to original model for embeddings
106127
self.model = model
107-
self.embed_tokens = model.model.embed_tokens
128+
if include_embed:
129+
self.embed_tokens = model.model.embed_tokens
130+
else:
131+
self.embed_tokens = None
108132
self.lm_head_tied = hasattr(model, "lm_head") and (
109133
model.lm_head.weight.data_ptr() == model.model.embed_tokens.weight.data_ptr()
110134
)
@@ -162,14 +186,19 @@ def _create_lora(self, name: str, N: int, K: int, device: torch.device):
162186
return A, B
163187

164188
def _quantize_and_create_lora(self, model: nn.Module):
165-
"""Walk model, quantize weights, create LoRA adapters."""
189+
"""Walk model, quantize weights, create LoRA adapters.
190+
191+
Only processes layers in [_layer_start, _layer_end) and optionally
192+
skips embedding and LM head for pipeline parallelism.
193+
"""
166194
device = next(model.parameters()).device
167195

168-
# Process each decoder layer
196+
# Process only the decoder layers in our range
169197
layers = model.model.layers
170198
self._layer_data = []
171199

172-
for i, layer in enumerate(layers):
200+
for i in range(self._layer_start, self._layer_end):
201+
layer = layers[i]
173202
attn = layer.self_attn
174203
mlp = layer.mlp
175204
prefix = f"layers_{i}"
@@ -225,23 +254,26 @@ def _quantize_and_create_lora(self, model: nn.Module):
225254

226255
self._layer_data.append(layer_info)
227256

228-
# Final norm
229-
final_norm = model.model.norm
230-
self._norm_weights["final_norm_weight"] = nn.Parameter(
231-
final_norm.weight.data.to(self.compute_dtype).clone()
232-
)
257+
# Final norm (only needed by last stage or full model)
258+
if self.include_lm_head:
259+
final_norm = model.model.norm
260+
self._norm_weights["final_norm_weight"] = nn.Parameter(
261+
final_norm.weight.data.to(self.compute_dtype).clone()
262+
)
233263

234-
# LM head (use k_lm_head)
235-
lm_weight = model.lm_head.weight.data.to(device)
236-
name = "lm_head"
237-
packed, absmax, codebook, N_padded, N, K = self._quantize_weight(
238-
lm_weight, name, k=self.k_lm_head,
239-
)
240-
self._lm_head_info = {
241-
"packed": packed, "absmax": absmax, "codebook": codebook,
242-
"N_padded": N_padded, "N": N, "K": K,
243-
"k": self.k_lm_head,
244-
}
264+
# LM head (only needed by last stage or full model)
265+
self._lm_head_info = None
266+
if self.include_lm_head:
267+
lm_weight = model.lm_head.weight.data.to(device)
268+
name = "lm_head"
269+
packed, absmax, codebook, N_padded, N, K = self._quantize_weight(
270+
lm_weight, name, k=self.k_lm_head,
271+
)
272+
self._lm_head_info = {
273+
"packed": packed, "absmax": absmax, "codebook": codebook,
274+
"N_padded": N_padded, "N": N, "K": K,
275+
"k": self.k_lm_head,
276+
}
245277

246278
# Precompute RoPE cos/sin cache
247279
self._build_rope_cache(device)
@@ -271,7 +303,7 @@ def _layer_forward(self, layer_idx: int, hidden: torch.Tensor, position_ids: tor
271303
"""Forward pass for one decoder layer.
272304
273305
Args:
274-
layer_idx: Index of the decoder layer.
306+
layer_idx: Local index (0-based within this model's loaded layers).
275307
hidden: Input hidden states [B, S, H].
276308
position_ids: Position IDs [B, S].
277309
@@ -417,11 +449,15 @@ def forward(
417449
# Extend RoPE cache if needed
418450
self._extend_rope_cache(S, device)
419451

420-
# Embedding
421-
hidden = self.embed_tokens(input_ids).to(self.compute_dtype)
452+
# Embedding (only if this model has the embedding layer)
453+
if self.embed_tokens is not None:
454+
hidden = self.embed_tokens(input_ids).to(self.compute_dtype)
455+
else:
456+
# input_ids is actually hidden states from previous pipeline stage
457+
hidden = input_ids
422458

423-
# Decoder layers
424-
for i in range(self.num_layers):
459+
# Decoder layers (local indices, 0-based)
460+
for i in range(self._num_loaded_layers):
425461
if self.cpu_offload and self.training:
426462
# Wrap each layer with CPU offload: saves inter-layer
427463
# activations to CPU during forward, reloads during backward
@@ -433,7 +469,10 @@ def _fn(h):
433469
else:
434470
hidden = self._layer_forward(i, hidden, position_ids)
435471

436-
# Final norm
472+
# Final norm + LM head (only if this model has the LM head)
473+
if not self.include_lm_head:
474+
return {"hidden": hidden}
475+
437476
hidden_2d = hidden.reshape(-1, self.hidden_size)
438477
hidden_2d = rmsnorm(
439478
hidden_2d, self._norm_weights["final_norm_weight"], eps=self.rms_norm_eps,

examples/train_pipeline.py

Lines changed: 53 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
"""Pipeline parallelism training example using bitsandbytes kbit quantization.
22
3-
Demonstrates distributed pipeline training across 2+ GPUs:
4-
- Loads a HuggingFace model and applies KbitLoraModel
5-
- Splits decoder layers across GPUs (first stage = embedding + first layers,
6-
last stage = remaining layers + norm + LM head)
7-
- Trains using DistributedPipelineEngine with NCCL
8-
- Reports per-GPU memory and throughput
3+
Demonstrates distributed pipeline training across 2+ GPUs with per-rank
4+
model loading — each GPU only loads the decoder layers it needs:
5+
- First stage: embedding + first half of layers
6+
- Last stage: remaining layers + final norm + LM head (loss)
7+
8+
This reduces per-GPU memory compared to loading the full model everywhere.
99
1010
Usage:
1111
# 2-GPU pipeline training on Qwen3-0.6B
@@ -50,21 +50,20 @@ class KbitFirstStage(nn.Module):
5050
"""First pipeline stage: embedding + first layers.
5151
5252
Takes input_ids [B, S], returns hidden states [B, S, H].
53+
The KbitLoraModel has already been created with only this stage's layers.
5354
"""
5455

55-
def __init__(self, kbit_model, layer_start, layer_end):
56+
def __init__(self, kbit_model):
5657
super().__init__()
5758
self.km = kbit_model
58-
self.layer_start = layer_start
59-
self.layer_end = layer_end
6059

6160
def forward(self, input_ids):
6261
B, S = input_ids.shape
6362
device = input_ids.device
6463
position_ids = torch.arange(S, device=device).unsqueeze(0).expand(B, -1)
6564
self.km._extend_rope_cache(S, device)
6665
hidden = self.km.embed_tokens(input_ids).to(self.km.compute_dtype)
67-
for i in range(self.layer_start, self.layer_end):
66+
for i in range(self.km._num_loaded_layers):
6867
hidden = self.km._layer_forward(i, hidden, position_ids)
6968
return hidden
7069

@@ -74,13 +73,13 @@ class KbitLastStage(nn.Module):
7473
7574
Takes hidden states [B, S, H], returns hidden states after norm [B*S, H].
7675
Loss is computed externally by the engine's loss_fn.
76+
The KbitLoraModel has already been created with only this stage's layers
77+
plus the final norm and LM head.
7778
"""
7879

79-
def __init__(self, kbit_model, layer_start, layer_end):
80+
def __init__(self, kbit_model):
8081
super().__init__()
8182
self.km = kbit_model
82-
self.layer_start = layer_start
83-
self.layer_end = layer_end
8483

8584
def forward(self, hidden):
8685
from bitsandbytes.autograd.training_kernels import rmsnorm
@@ -90,7 +89,7 @@ def forward(self, hidden):
9089
position_ids = torch.arange(S, device=device).unsqueeze(0).expand(B, -1)
9190
self.km._extend_rope_cache(S, device)
9291

93-
for i in range(self.layer_start, self.layer_end):
92+
for i in range(self.km._num_loaded_layers):
9493
hidden = self.km._layer_forward(i, hidden, position_ids)
9594

9695
# Final norm
@@ -110,12 +109,6 @@ def make_loss_fn(kbit_model):
110109
lm = km._lm_head_info
111110

112111
def loss_fn(hidden_2d, labels):
113-
"""Compute chunked cross-entropy loss.
114-
115-
Args:
116-
hidden_2d: [B*S, H] hidden states from last stage.
117-
labels: [B, S] target token IDs.
118-
"""
119112
shift_hidden = hidden_2d[:-1]
120113
shift_labels = labels.reshape(-1)[1:]
121114
loss = chunked_cross_entropy(
@@ -138,65 +131,76 @@ def main():
138131
device = torch.device(f"cuda:{rank}")
139132
torch.cuda.set_device(device)
140133

134+
is_first = (rank == 0)
135+
is_last = (rank == world_size - 1)
136+
141137
if rank == 0:
142138
print(f"{'=' * 60}")
143-
print(f"Pipeline QLoRA Training ({world_size} GPUs)")
139+
print(f"Pipeline QLoRA Training ({world_size} GPUs, per-rank loading)")
144140
print(f"{'=' * 60}")
145141
print(f"Model: {args.model}")
146142
print(f"LoRA rank: {args.lora_r}, k={args.k}")
147143
print(f"Seq len: {args.seq_len}, Micro-batches: {args.micro_batches}")
148144
print(f"Steps: {args.steps}")
149145
print()
150146

151-
# Load model
152-
from transformers import AutoModelForCausalLM
147+
# Load model — each rank loads the full HF model temporarily to extract
148+
# its layer weights. We immediately delete the original after quantization.
149+
from transformers import AutoModelForCausalLM, AutoConfig
150+
151+
config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
152+
num_layers = config.num_hidden_layers
153+
layers_per_stage = num_layers // world_size
154+
layer_start = rank * layers_per_stage
155+
layer_end = (rank + 1) * layers_per_stage if rank < world_size - 1 else num_layers
156+
157+
role = "first" if is_first else ("last" if is_last else "mid")
158+
print(f" GPU {rank}: layers {layer_start}-{layer_end-1} ({role} stage)")
153159

154160
if rank == 0:
155-
print("Loading base model...")
161+
print(f"\nLoading and quantizing (per-rank)...")
162+
torch.cuda.reset_peak_memory_stats()
163+
156164
model = AutoModelForCausalLM.from_pretrained(
157165
args.model,
158166
dtype=torch.float16,
159167
device_map={"": device},
160168
trust_remote_code=True,
161169
)
162170

163-
# Quantize
164-
if rank == 0:
165-
print("Quantizing and creating LoRA adapters...")
171+
mem_after_load = torch.cuda.memory_allocated() / 1024 / 1024
172+
print(f" GPU {rank}: {mem_after_load:.0f} MB after HF model load")
173+
174+
# Create KbitLoraModel with ONLY this rank's layers
166175
kbit_model = KbitLoraModel(
167176
model,
168177
lora_r=args.lora_r,
169178
lora_alpha=16.0,
170179
k=args.k,
171180
compute_dtype=torch.bfloat16,
181+
layer_range=(layer_start, layer_end),
182+
include_embed=is_first,
183+
include_lm_head=is_last,
172184
)
185+
186+
# Delete the original HF model to free memory
173187
del model
174188
torch.cuda.empty_cache()
175189

176-
num_layers = kbit_model.num_layers
177-
layers_per_stage = num_layers // world_size
178-
layer_start = rank * layers_per_stage
179-
layer_end = (rank + 1) * layers_per_stage if rank < world_size - 1 else num_layers
190+
mem_after_quant = torch.cuda.memory_allocated() / 1024 / 1024
191+
print(f" GPU {rank}: {mem_after_quant:.0f} MB after quantize + cleanup "
192+
f"({kbit_model._num_loaded_layers} layers, "
193+
f"embed={'yes' if is_first else 'no'}, "
194+
f"lm_head={'yes' if is_last else 'no'})")
180195

181-
is_first = (rank == 0)
182-
is_last = (rank == world_size - 1)
196+
if rank == 0:
197+
print(f" Trainable params (rank 0): {kbit_model.num_trainable_parameters():,}")
183198

199+
# Create pipeline stage wrappers
184200
if is_first:
185-
stage = KbitFirstStage(kbit_model, layer_start, layer_end)
201+
stage = KbitFirstStage(kbit_model)
186202
else:
187-
stage = KbitLastStage(kbit_model, layer_start, layer_end)
188-
189-
if rank == 0:
190-
print(f" Total layers: {num_layers}")
191-
print(f" Trainable params: {kbit_model.num_trainable_parameters():,}")
192-
193-
for r in range(world_size):
194-
if r == rank:
195-
ls = r * layers_per_stage
196-
le = (r + 1) * layers_per_stage if r < world_size - 1 else num_layers
197-
role = "first" if r == 0 else ("last" if r == world_size - 1 else "mid")
198-
print(f" GPU {r}: layers {ls}-{le-1} ({role} stage)")
199-
dist.barrier()
203+
stage = KbitLastStage(kbit_model)
200204

201205
# Loss function for the last stage
202206
loss_fn = make_loss_fn(kbit_model) if is_last else None
@@ -215,7 +219,7 @@ def main():
215219
dtype=torch.bfloat16,
216220
)
217221

218-
# Optimizer — each rank has its own view of the parameters
222+
# Optimizer — each rank optimizes only its own trainable parameters
219223
trainable_params = kbit_model.get_trainable_parameters()
220224
optimizer = torch.optim.AdamW(trainable_params, lr=args.lr, weight_decay=0.01)
221225

@@ -233,8 +237,7 @@ def main():
233237
t_step = time.time()
234238
optimizer.zero_grad()
235239

236-
# Generate micro-batches (all ranks generate same data for labels)
237-
# Use deterministic seed per step so last rank has correct labels
240+
# All ranks generate same data with same seed (for label consistency)
238241
torch.manual_seed(step * 1000 + 42)
239242
micro_batch_inputs = []
240243
micro_batch_labels = []

0 commit comments

Comments (0)