refactor: remove CPU offload from ROCm branch, bump ROCm to 7.2.1

gtherond · gtherond · commit 8ec8d214fa1a · 2026-03-26T14:39:54.000+01:00
Separate CPU offload work into its own branch (feat/cpu-offload)
to keep the ROCm support PR focused on infrastructure.

This branch now contains only:
- ROCm 7.2 Docker/compose infrastructure
- Device-agnostic code replacements
- VQ-GAN precision pass-through and VRAM_FRACTION cap
- MAX_SEQ_LEN configurable KV cache
- ROCm gfx arch auto-detection and VRAM guidance
- MIOpen exhaustive kernel tuning (MIOPEN_FIND_MODE=3)
- Persistent MIOpen cache volume

Tested on ROCm 7.2.1 with RX 9070 XT (16GB):
- Full GPU INT8 + COMPILE=1 + MIOpen tuning: 34.9s (was 85s on 7.2.0)
- 2.4x performance improvement from ROCm 7.2.1 HIP/MIOpen fixes
diff --git a/fish_speech/models/text2semantic/inference.py b/fish_speech/models/text2semantic/inference.py
@@ -768,13 +768,6 @@ def worker():
                 dtype=next(model.parameters()).dtype,
             )
 
-        # Offload weights to pinned CPU memory if requested.
-        # Runs after setup_caches so KV caches exist and can be
-        # preserved on GPU while layer weights move to CPU.
-        from fish_speech.utils.gpu import setup_cpu_offload
-
-        setup_cpu_offload(model, torch.device(device))
-
         init_event.set()
 
         while True:
diff --git a/fish_speech/models/text2semantic/llama.py b/fish_speech/models/text2semantic/llama.py
@@ -396,28 +396,22 @@ def forward_generate(
         return_all: bool = False,
     ) -> BaseTransformerForwardResult:
 
-        # When CPU offload is active, embeddings are on CPU — move input there
-        # Capture original device before any moves (for returning results to GPU)
-        _orig_device = inp.device
-        embed_device = self.embeddings.weight.device
-        inp_e = inp.to(embed_device) if inp.device != embed_device else inp
-
         # Embedding logic replicated from embed() for compilation compatibility
         embeds = []
         for i in range(self.config.num_codebooks):
             emb = self.codebook_embeddings(
-                inp_e[:, i + 1] + i * self.config.codebook_size
+                inp[:, i + 1] + i * self.config.codebook_size
             )
             embeds.append(emb)
 
         vq_embeds_sum = torch.stack(embeds, dim=1).sum(dim=1)
 
-        vq_masks = (inp_e[:, 0] >= self.config.semantic_begin_id) & (
-            inp_e[:, 0] <= self.config.semantic_end_id
+        vq_masks = (inp[:, 0] >= self.config.semantic_begin_id) & (
+            inp[:, 0] <= self.config.semantic_end_id
         )
 
         vq_embeds_sum[~vq_masks] = 0
-        x = self.embeddings(inp_e[:, 0]) + vq_embeds_sum
+        x = self.embeddings(inp[:, 0]) + vq_embeds_sum
 
         if self.config.scale_codebook_embeddings:
             vq_masks_expanded = vq_masks.unsqueeze(-1).expand_as(x)
@@ -427,12 +421,14 @@ def forward_generate(
 
         # Audio embeddings
         if audio_parts is not None:
+            # NOTE: audio_projector may not be defined in __init__ (not verified here).
+            # The hasattr guard below warns instead of raising when it is absent.
             if hasattr(self, "audio_projector"):
-                audio_embeds = self.audio_projector(audio_parts.to(embed_device))
+                audio_embeds = self.audio_projector(audio_parts)
                 if self.config.scale_codebook_embeddings:
-                    x[audio_masks.to(embed_device)] = audio_embeds / math.sqrt(2)
+                    x[audio_masks] = audio_embeds / math.sqrt(2)
                 else:
-                    x[audio_masks.to(embed_device)] = audio_embeds
+                    x[audio_masks] = audio_embeds
             else:
                 logger.warning("audio_parts provided but model has no audio_projector")
 
@@ -445,39 +441,6 @@ def forward_generate(
         mask = self.causal_mask[None, None, input_pos, :max_seq_len]  # (B, N, Q, K)
         freqs_cis = self.freqs_cis[input_pos]
 
-        if getattr(self, "_layer_streamer", None) is not None:
-            # CPU offload: run slow layers + norm + logits on CPU,
-            # then transfer only the small results to GPU
-            gpu_device = _orig_device
-            x_cpu = x.to("cpu")
-            freqs_cpu = freqs_cis.to("cpu")
-            mask_cpu = mask.to("cpu")
-            ipos_cpu = input_pos.to("cpu") if input_pos is not None else None
-
-            for layer in self.layers:
-                x_cpu = layer(x_cpu, freqs_cpu, mask_cpu, input_pos=ipos_cpu)
-
-            if x_cpu.size(1) > 1 and not return_all:
-                x_cpu = x_cpu[:, -1:]
-
-            slow_out_cpu = self.norm(x_cpu)
-
-            if self.config.is_reward_model:
-                token_logits_cpu = self.score_output(slow_out_cpu)
-            elif self.config.tie_word_embeddings:
-                token_logits_cpu = F.linear(slow_out_cpu, self.embeddings.weight)
-            else:
-                token_logits_cpu = self.output(slow_out_cpu)
-
-            hidden_cpu = (
-                slow_out_cpu if getattr(self.config, "norm_fastlayer_input", False) else x_cpu
-            )
-
-            return BaseTransformerForwardResult(
-                logits=token_logits_cpu.to(gpu_device),
-                hidden_states=hidden_cpu.to(gpu_device),
-            )
-
         for layer in self.layers:
             x = layer(x, freqs_cis, mask, input_pos=input_pos)
 
diff --git a/fish_speech/utils/gpu.py b/fish_speech/utils/gpu.py
@@ -1,9 +1,8 @@
-"""GPU detection, VRAM guidance, ROCm gfx arch auto-detection, and CPU weight offloading."""
+"""GPU detection, VRAM guidance, and ROCm gfx arch auto-detection."""
 
 import os
 
 import torch
-import torch.nn as nn
 from loguru import logger
 
 # Known ROCm gfx arch overrides for GPUs not yet in PyTorch's HIP target list.
@@ -94,7 +93,6 @@ def check_vram_and_advise(checkpoint_path: str):
             suggestions.append(
                 f"reduce MAX_SEQ_LEN (current: {max_seq_len}, try 4096 to save ~{(max_seq_len - 4096) / 8192 * 1.2:.1f}GB)"
             )
-        suggestions.append("set OFFLOAD_WEIGHTS_TO_CPU=true to run slow layers on CPU")
         suggestions.append("set VRAM_FRACTION=0.95 to prevent system freeze on OOM")
 
         logger.warning(
@@ -103,119 +101,3 @@ def check_vram_and_advise(checkpoint_path: str):
         )
         for i, s in enumerate(suggestions, 1):
             logger.warning(f"  {i}. {s}")
-
-
-class CPUOffloadExecutor:
-    """Runs slow transformer layers on CPU (using AVX-512/VNNI), keeps fast path on GPU.
-
-    Instead of streaming layers GPU↔CPU (72 PCIe round-trips per token),
-    this executes the slow transformer entirely on CPU and only transfers
-    the final hidden state (~10KB) to GPU for the fast transformer + decoder.
-
-    For batch=1 single-token inference, CPU execution with DDR5 bandwidth
-    (~80-100 GB/s) and AVX-512 is competitive with the PCIe streaming approach
-    while eliminating all allocation overhead.
-    """
-
-    def __init__(self, gpu_device: torch.device):
-        self.gpu_device = gpu_device
-
-    def run(self, layers: nn.ModuleList, x, *args, **kwargs):
-        """Execute layers on CPU, return result on pinned memory for fast GPU transfer."""
-        # Move hidden state and all positional args to CPU
-        x_cpu = x.to("cpu")
-        args_cpu = tuple(a.to("cpu") if isinstance(a, torch.Tensor) else a for a in args)
-        kwargs_cpu = {
-            k: v.to("cpu") if isinstance(v, torch.Tensor) else v
-            for k, v in kwargs.items()
-        }
-
-        # Run all layers on CPU — weights are already here, no PCIe needed
-        for layer in layers:
-            x_cpu = layer(x_cpu, *args_cpu, **kwargs_cpu)
-
-        # Pin the result for faster DMA to GPU, then transfer non-blocking
-        return x_cpu.pin_memory().to(self.gpu_device, non_blocking=True)
-
-
-def _has_int8_weights(module: nn.Module) -> bool:
-    """Check if any submodule uses INT8 quantized weights."""
-    for child in module.modules():
-        if hasattr(child, "weight") and hasattr(child, "scales") and child.weight.dtype == torch.int8:
-            return True
-    return False
-
-
-def setup_cpu_offload(model: nn.Module, device: torch.device):
-    """Offload slow transformer layers to CPU execution.
-
-    Moves slow layer weights + KV caches to CPU. The slow transformer runs
-    entirely on CPU using AVX-512, and only the final hidden state is
-    transferred to GPU for the fast transformer and decoder.
-
-    Fast layers stay on GPU (small footprint, called 10x per token).
-
-    Enable with OFFLOAD_WEIGHTS_TO_CPU=true.
-    Requires native bf16 weights — INT8 quantized models are not supported
-    because autoregressive decode (M=1) cannot use VNNI _int_mm (requires M>16),
-    and the dequant+bf16 fallback is ~30% slower than native bf16 matmuls.
-    """
-    if not os.environ.get("OFFLOAD_WEIGHTS_TO_CPU", "").lower() in ("true", "1"):
-        return False
-
-    if not hasattr(model, "layers"):
-        logger.warning("Model has no 'layers' attribute, cannot offload weights.")
-        return False
-
-    if _has_int8_weights(model):
-        logger.warning(
-            "CPU offload requires native bf16 weights. INT8 quantized models are not supported "
-            "because autoregressive decode (batch=1) cannot use VNNI INT8 matmuls (requires M>16), "
-            "and the dequant+bf16 fallback is ~30% slower than native bf16. "
-            "Please use the original (non-quantized) checkpoint with OFFLOAD_WEIGHTS_TO_CPU=true."
-        )
-        return False
-
-    # Use physical cores only — HyperThreading causes cache contention
-    # on Zen 4 and hurts bf16 matmul throughput (~37% slower with HT).
-    physical_cores = os.cpu_count() // 2 if os.cpu_count() else 8
-    torch.set_num_threads(physical_cores)
-    logger.info(f"CPU offload: set torch threads to {physical_cores} (physical cores only)")
-
-    layers = model.layers
-    n_layers = len(layers)
-
-    # Move slow layers entirely to CPU (including KV caches)
-    gpu_mem_before = torch.cuda.memory_allocated()
-    with torch.inference_mode(False):
-        for layer in layers:
-            layer.to("cpu")
-    gpu_mem_after = torch.cuda.memory_allocated()
-    saved_gb = (gpu_mem_before - gpu_mem_after) / 1e9
-
-    # Move shared slow-path modules to CPU.
-    # Keep causal_mask and fast_freqs_cis on GPU (shared with fast path).
-    for name in ("norm", "embeddings", "codebook_embeddings", "output"):
-        module = getattr(model, name, None)
-        if module is not None:
-            with torch.inference_mode(False):
-                if isinstance(module, nn.Module):
-                    module.to("cpu")
-
-    gpu_mem_final = torch.cuda.memory_allocated()
-    total_saved_gb = (gpu_mem_before - gpu_mem_final) / 1e9
-
-    logger.info(
-        f"CPU offload: moved {n_layers} slow layers + shared modules to CPU, "
-        f"freed {total_saved_gb:.1f}GB VRAM. Fast layers + decoder remain on GPU."
-    )
-
-    # Keep fast_layers on GPU — small footprint, called 10x per token
-    fast_layers = getattr(model, "fast_layers", None)
-    if fast_layers is not None:
-        logger.info(f"CPU offload: keeping {len(fast_layers)} fast layers on GPU.")
-
-    # Attach executor — forward_generate will use it
-    model._layer_streamer = CPUOffloadExecutor(device)
-
-    return True