
Commit 54007b2

fix little bugs
1 parent 4f64822 commit 54007b2

4 files changed

Lines changed: 134 additions & 32 deletions

lmdeploy/pytorch/backends/cuda/graph_runner.py

Lines changed: 0 additions & 1 deletion
@@ -162,7 +162,6 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, cache_conf
                  backend_config: BackendConfig, device: torch.device):
         super().__init__(model, model_config, cache_config, backend_config, device)
         self.max_batches = cache_config.max_batches
-        self.max_tokens = cache_config.max_prefill_token_num
         self.num_blocks = cache_config.num_gpu_blocks

         self.enable_graph = self.check_enable_graph()

lmdeploy/pytorch/configurations/deepseek_v4.py

Lines changed: 0 additions & 3 deletions
@@ -32,9 +32,6 @@ def _check_env_v4(device: str = 'cuda'):
     except ImportError as e:
         raise ImportError('DeepSeek-V4 requires <fast_hadamard_transform> to be installed.') from e

-    if not hasattr(torch, 'float4_e2m1fn_x2'):
-        raise RuntimeError('DeepSeek-V4 requires PyTorch with float4_e2m1fn_x2 support.')
-

 def _finalize_v4_cache_specs(model_config: ModelConfig, block_size: int):
     if block_size < 128:

lmdeploy/pytorch/nn/moe/v4_fp4.py

Lines changed: 0 additions & 28 deletions
@@ -10,32 +10,6 @@
 from .base import split_size as _split_size


-def _v4_swiglu(intermediate: torch.Tensor, swiglu_limit: float) -> torch.Tensor:
-    """Match DeepSeek-V4 routed-expert activation semantics.
-
-    Keep the activation hook in `nn/moe` so the V4 fused MoE wrapper does not depend on the legacy CUDA backend
-    implementation file.
-    """
-    hidden = intermediate.size(-1) // 2
-    gate = intermediate[..., :hidden].float()
-    up = intermediate[..., hidden:].float()
-    if swiglu_limit > 0:
-        up = torch.clamp(up, min=-swiglu_limit, max=swiglu_limit)
-        gate = torch.clamp(gate, max=swiglu_limit)
-    return (torch.nn.functional.silu(gate) * up).to(intermediate.dtype)
-
-
-def _get_v4_moe_runtime_kind(device: torch.device) -> str:
-    """Select the routed-expert runtime path for the current GPU.
-
-    CUDA uses lmdeploy's Triton FP8xFP4 MoE path, which keeps checkpoint-native packed FP4 expert weights resident and
-    unpacks them inside the GEMM kernel.
-    """
-    if device.type == 'cuda' and torch.cuda.is_available():
-        return 'triton_fp4'
-    raise RuntimeError('DeepSeek-V4 FP4 MoE requires CUDA because the expert weights stay in packed FP4 format.')
-
-
 class V4ExpertWeights(nn.Module):
     """Local expert-sharded V4 expert weights.

@@ -142,7 +116,6 @@ def __init__(self,
         self.ffn_dim = ffn_dim
         self.top_k = top_k
         self.block_size = 128
-        self.runtime_kind = _get_v4_moe_runtime_kind(device)

         self.gate_up = V4ExpertWeights(self.num_local_experts,
                                        hidden_dim,
@@ -283,7 +256,6 @@ def __init__(self,
                  device: torch.device | None = None):
         super().__init__()
         device = device or torch.device('cpu')
-        self.runtime_kind = _get_v4_moe_runtime_kind(device)
         dist_ctx = get_dist_manager().current_context()
         dist_config = dist_ctx.dist_config
         if dist_config.ep > 1:

tests/pytorch/kernel/dsv4_utils.py

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+"""Reference implementations for V4 FlashMLA sparse FP8 quantize/dequantize.
+
+Used by kernel tests for correctness comparison only — the production path fuses these operations into Triton kernels.
+"""
+
+import torch
+
+from lmdeploy.pytorch.consts import (
+    V4_FLASHMLA_D_NOPE,
+    V4_FLASHMLA_D_ROPE,
+    V4_FLASHMLA_NUM_TILES,
+    V4_FLASHMLA_TILE_SIZE,
+)
+
+D_NOPE = V4_FLASHMLA_D_NOPE  # 448
+D_ROPE = V4_FLASHMLA_D_ROPE  # 64
+TILE_SIZE = V4_FLASHMLA_TILE_SIZE  # 64
+NUM_TILES = V4_FLASHMLA_NUM_TILES  # 7
+NR_DIM = D_NOPE + 2 * D_ROPE  # 576 bytes per token (NoPE + RoPE in e4m3fn)
+FP8_MAX = 448.0
+
+
+def quantize_v4_flashmla_sparse(input_k_cache: torch.Tensor) -> torch.Tensor:
+    """Pack BF16 ``[num_blocks, block_size, 1, 512]`` K cache into V4 FlashMLA
+    sparse FP8 layout.
+
+    Returns ``[num_blocks, block_size, 1, 584]`` e4m3fn tensor.
+    """
+    assert input_k_cache.dim() == 4
+    num_blocks, block_size, _, head_dim = input_k_cache.shape
+    assert head_dim == 512
+
+    device = input_k_cache.device
+    packed_dim = NR_DIM + 8  # 576 + 8 = 584
+    output = torch.zeros(num_blocks, block_size, 1, packed_dim,
+                         dtype=torch.float8_e4m3fn, device=device)
+
+    # Flat view for layout construction (same pattern as v4_compressor.py / v4_flatten_kv.py)
+    flat_out = output.view(num_blocks, -1)
+
+    # NoPE+RoPE region: [num_blocks, block_size * NR_DIM] as e4m3fn
+    nope_rope = flat_out[:, :block_size * NR_DIM].view(
+        num_blocks, block_size, NR_DIM)
+    nope_view = nope_rope[:, :, :D_NOPE]  # [num_blocks, block_size, 448] e4m3fn
+
+    # RoPE region: view as bf16
+    rope_e4 = nope_rope[:, :, D_NOPE:]  # [num_blocks, block_size, 128] e4m3fn
+    rope_view = rope_e4.view(torch.bfloat16)  # [num_blocks, block_size, 64] bf16
+
+    # Scale region: uint8
+    scale_view = flat_out[:, block_size * NR_DIM:].view(
+        num_blocks, block_size, 8).view(torch.uint8)
+
+    # Per-block, per-token quantize
+    for b in range(num_blocks):
+        for t in range(block_size):
+            token = input_k_cache[b, t, 0]  # [512] bf16
+
+            # Quantize NoPE tiles
+            for tile_idx in range(NUM_TILES):
+                d_base = tile_idx * TILE_SIZE
+                tile = token[d_base:d_base + TILE_SIZE].float()
+
+                amax = tile.abs().max()
+                scale_inv = max(amax.item() / FP8_MAX, 1e-4)
+                ceil_log2 = torch.ceil(torch.log2(torch.tensor(scale_inv, dtype=torch.float32)))
+                scale_inv_pow2 = torch.exp2(ceil_log2)
+
+                quantized = (tile / scale_inv_pow2).to(torch.float8_e4m3fn)
+                nope_view[b, t, d_base:d_base + TILE_SIZE] = quantized
+
+                # e8m0fnu scale byte: raw byte = ceil_log2 + 127
+                scale_byte = int(ceil_log2.item() + 127)
+                scale_view[b, t, tile_idx] = scale_byte
+
+            # RoPE: direct bf16 copy (128 e4m3fn bytes = 64 bf16 elements)
+            rope_vals = token[D_NOPE:]  # [64] bf16
+            rope_view[b, t] = rope_vals
+
+    return output
+
+
+def dequantize_v4_flashmla_sparse(quant_k_cache: torch.Tensor) -> torch.Tensor:
+    """Dequantize V4 FlashMLA sparse FP8 K cache to BF16.
+
+    Re-exports from the production module for test convenience.
+
+    Args:
+        quant_k_cache: [num_blocks, block_size, 1, 584] e4m3fn FP8 cache.
+
+    Returns:
+        [num_blocks, block_size, 1, 512] BF16 cache.
+    """
+    assert quant_k_cache.dim() == 4
+    num_blocks, block_size, _, packed_dim = quant_k_cache.shape
+    assert packed_dim == NR_DIM + 8
+
+    device = quant_k_cache.device
+    output = torch.zeros(num_blocks, block_size, 1, 512,
+                         dtype=torch.bfloat16, device=device)
+
+    # Build views (same layout as quantize)
+    flat = quant_k_cache.view(num_blocks, -1)
+    nope_rope = flat[:, :block_size * NR_DIM].view(
+        num_blocks, block_size, NR_DIM)
+    nope_view = nope_rope[:, :, :D_NOPE]  # [num_blocks, block_size, 448] e4m3fn
+
+    rope_e4 = nope_rope[:, :, D_NOPE:]  # [num_blocks, block_size, 128] e4m3fn
+    rope_view = rope_e4.view(torch.bfloat16)  # [num_blocks, block_size, 64] bf16
+
+    scale_view = flat[:, block_size * NR_DIM:].view(
+        num_blocks, block_size, 8).view(torch.uint8)
+
+    # Per-block, per-token dequantize
+    for b in range(num_blocks):
+        for t in range(block_size):
+            # Dequantize NoPE tiles
+            for tile_idx in range(NUM_TILES):
+                d_base = tile_idx * TILE_SIZE
+                nope_fp8 = nope_view[b, t, d_base:d_base + TILE_SIZE].float()
+
+                # Read scale byte and reconstruct float scale
+                scale_byte = scale_view[b, t, tile_idx].item()
+                # e8m0fnu: bits = scale_byte, float = 2^(scale_byte - 127)
+                scale_bits = scale_byte << 23
+                scale_f32 = torch.tensor(scale_bits, dtype=torch.int32).view(torch.float32)
+
+                dequant = (nope_fp8 * scale_f32).to(torch.bfloat16)
+                output[b, t, 0, d_base:d_base + TILE_SIZE] = dequant
+
+            # RoPE: direct bf16 copy
+            output[b, t, 0, D_NOPE:] = rope_view[b, t]
+
+    return output
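
For context on the scale bytes written above: each 64-wide NoPE tile stores one e8m0-style uint8, which is simply the biased exponent of a power-of-two dequantization scale. The quantize side stores ceil(log2(amax / 448)) + 127, and the dequantize side shifts that byte into the float32 exponent field. A minimal standalone sketch of that round trip follows; the amax value is a made-up illustration, not taken from the commit.

import math

import torch

# Hypothetical per-tile absolute maximum; any positive value works.
amax = 12.0
FP8_MAX = 448.0  # e4m3fn max, as in dsv4_utils.py

# Quantize side: round the dequantization scale up to a power of two.
scale_inv = max(amax / FP8_MAX, 1e-4)        # ~0.0268
ceil_log2 = math.ceil(math.log2(scale_inv))  # -5
scale = 2.0 ** ceil_log2                     # 0.03125; the tile is divided by this
scale_byte = ceil_log2 + 127                 # 122, stored as one uint8 per tile

# Dequantize side: place the byte in the float32 exponent field, i.e. 2^(byte - 127).
scale_bits = scale_byte << 23
rebuilt = torch.tensor(scale_bits, dtype=torch.int32).view(torch.float32).item()
assert rebuilt == scale  # exact power-of-two round trip

Rounding the scale up to a power of two keeps every quantized tile value inside the e4m3fn range and makes the scale exactly recoverable from a single byte.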
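The new file only defines the reference pair, so a natural sanity check is a quantize/dequantize round trip: the RoPE tail should come back bit-exact, while the NoPE tiles should match only up to FP8 precision. The sketch below is not part of the commit; check_roundtrip is a hypothetical helper, and it assumes the repo root is on PYTHONPATH and a PyTorch build with float8_e4m3fn support.

import torch

# Assumes the repo root is on PYTHONPATH so the test utility is importable.
from tests.pytorch.kernel.dsv4_utils import (D_NOPE, dequantize_v4_flashmla_sparse,
                                              quantize_v4_flashmla_sparse)


def check_roundtrip(num_blocks: int = 2, block_size: int = 4) -> None:
    # Small random BF16 K cache in the [num_blocks, block_size, 1, 512] layout.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    k_cache = torch.randn(num_blocks, block_size, 1, 512, dtype=torch.bfloat16, device=device)

    packed = quantize_v4_flashmla_sparse(k_cache)     # [num_blocks, block_size, 1, 584] e4m3fn
    restored = dequantize_v4_flashmla_sparse(packed)  # [num_blocks, block_size, 1, 512] bf16

    # The RoPE tail (last 64 dims) is copied verbatim through the bf16 view, so it matches exactly.
    assert torch.equal(restored[..., D_NOPE:], k_cache[..., D_NOPE:])

    # The NoPE dims pass through e4m3fn with power-of-two scales, so only compare loosely.
    err = (restored[..., :D_NOPE].float() - k_cache[..., :D_NOPE].float()).abs().max()
    print(f'max NoPE round-trip error: {err.item():.4f}')


check_roundtrip()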
