Commit 4ed038f: refactor

1 parent 5d36638 commit 4ed038f
8 files changed: 118 additions & 164 deletions

lmdeploy/pytorch/backends/cuda/attention/flashmla_utils.py

Lines changed: 0 additions & 104 deletions
This file was deleted.

lmdeploy/pytorch/configurations/deepseek_v4.py

Lines changed: 5 additions & 3 deletions
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import torch

-from lmdeploy.pytorch.backends.cuda.attention.flashmla_utils import model1_fp8_sparse_token_dim
+from lmdeploy.pytorch.kernels.cuda.dsv4.layout import V4_FLASHMLA_D_NOPE, V4_FLASHMLA_D_ROPE, V4_FLASHMLA_NUM_TILES
 from lmdeploy.pytorch.config import BlockCacheSpec, ModelConfig, StateCacheSpec

 from .builder import AutoModelConfigBuilder
@@ -45,7 +45,8 @@ def _finalize_v4_cache_specs(model_config: ModelConfig, block_size: int):
                          'has an integral number of entries per block.')

     hf_config = model_config.hf_config
-    packed_token_dim = model1_fp8_sparse_token_dim(64)
+    # V4 FlashMLA sparse FP8: 448 fp8 NoPE + 128 bytes (64 bf16) RoPE + 7 e8m0 scales + 1 pad = 584
+    packed_token_dim = V4_FLASHMLA_D_NOPE + 2 * V4_FLASHMLA_D_ROPE + V4_FLASHMLA_NUM_TILES + 1
     num_layers = hf_config.num_hidden_layers
     compress_ratios = getattr(hf_config, 'compress_ratios', None) or [0] * num_layers
     ratio4_layers = [i for i, r in enumerate(compress_ratios) if r == 4]
@@ -97,7 +98,8 @@ def build(cls, hf_config, model_path: str | None = None, tp: int = 1, **kwargs):
         """
         bos_token_id = getattr(hf_config, 'bos_token_id', None)
         head_dim = getattr(hf_config, 'head_dim', 512)
-        packed_token_dim = model1_fp8_sparse_token_dim(64)
+        # V4 FlashMLA sparse FP8: 448 fp8 NoPE + 128 bytes (64 bf16) RoPE + 7 e8m0 scales + 1 pad = 584
+        packed_token_dim = V4_FLASHMLA_D_NOPE + 2 * V4_FLASHMLA_D_ROPE + V4_FLASHMLA_NUM_TILES + 1
         num_layers = hf_config.num_hidden_layers
         compress_ratios = getattr(hf_config, 'compress_ratios', None) or [0] * num_layers
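
For reference, the two hunks above replace the deleted model1_fp8_sparse_token_dim helper with inline arithmetic over the new layout constants. A minimal sanity check of the byte budget they encode (constants restated here rather than imported, so the snippet stands alone):

    # Byte budget of one packed token in the V4 FlashMLA sparse FP8 cache.
    D_NOPE, D_ROPE, NUM_TILES = 448, 64, 7           # values from dsv4/layout.py
    packed_token_dim = D_NOPE + 2 * D_ROPE + NUM_TILES + 1
    assert packed_token_dim == 584                   # 448 fp8 + 128 RoPE bytes + 7 scales + 1 pad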

lmdeploy/pytorch/kernels/cuda/dsv4/layout.py (new file)

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""DeepSeek-V4 FlashMLA sparse FP8 layout constants and helpers.
+
+The V4 FlashMLA sparse layout packs a 512-dim K cache head as:
+    [448 fp8 NoPE | 128 bytes (64 bf16) RoPE | 7 e8m0 scale bytes | 1 pad byte]
+    = 584 bytes per token.
+
+NoPE region: 7 tiles of 64 elements, each tile quantized to FP8 e4m3fn with
+    a per-tile e8m0fnu power-of-2 scale factor.
+RoPE region: 64 BF16 values stored as raw bytes (128 bytes).
+Scales: 7 e8m0fnu scale bytes + 1 padding byte = 8 bytes.
+"""
+import torch
+
+V4_FLASHMLA_HEAD_DIM = 512
+V4_FLASHMLA_D_NOPE = 448
+V4_FLASHMLA_D_ROPE = 64
+V4_FLASHMLA_TILE_SIZE = 64
+V4_FLASHMLA_NUM_TILES = 7
+
+
+def dequantize_v4_flashmla_sparse(quant_k_cache: torch.Tensor) -> torch.Tensor:
+    """Dequantize V4 FlashMLA sparse FP8 KV cache to BF16.
+
+    Args:
+        quant_k_cache: ``[num_blocks, block_size, 1, packed_dim]`` FP8 cache.
+
+    Returns:
+        ``[num_blocks, block_size, 1, 512]`` BF16 tensor.
+    """
+    assert quant_k_cache.dim() == 4
+    num_blocks, block_size, num_heads, _ = quant_k_cache.shape
+    assert num_heads == 1
+
+    result = torch.empty((num_blocks, block_size, V4_FLASHMLA_HEAD_DIM),
+                         dtype=torch.bfloat16,
+                         device=quant_k_cache.device)
+    quant_k_cache = quant_k_cache.view(num_blocks, -1)
+    input_nope_rope = quant_k_cache[:, :block_size * (V4_FLASHMLA_D_NOPE + 2 * V4_FLASHMLA_D_ROPE)].view(
+        num_blocks, block_size, V4_FLASHMLA_D_NOPE + 2 * V4_FLASHMLA_D_ROPE)
+    input_nope = input_nope_rope[:, :, :V4_FLASHMLA_D_NOPE]
+    input_rope = input_nope_rope[:, :, V4_FLASHMLA_D_NOPE:].view(torch.bfloat16)
+    input_scale = quant_k_cache[:, block_size * (V4_FLASHMLA_D_NOPE + 2 * V4_FLASHMLA_D_ROPE):].view(
+        num_blocks, block_size, 8)[:, :, :V4_FLASHMLA_NUM_TILES].view(torch.float8_e8m0fnu)
+
+    result[..., V4_FLASHMLA_D_NOPE:] = input_rope
+    for tile_idx in range(V4_FLASHMLA_NUM_TILES):
+        cur_nope = input_nope[..., tile_idx * V4_FLASHMLA_TILE_SIZE:(tile_idx + 1) * V4_FLASHMLA_TILE_SIZE].to(
+            torch.bfloat16)
+        cur_scales = input_scale[:, :, tile_idx].to(torch.bfloat16).unsqueeze(-1)
+        result[..., tile_idx * V4_FLASHMLA_TILE_SIZE:(tile_idx + 1) * V4_FLASHMLA_TILE_SIZE] = cur_nope * cur_scales
+
+    return result.view(num_blocks, block_size, 1, V4_FLASHMLA_HEAD_DIM)
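
A quick way to exercise the new helper is to hand-build one packed block from the documented byte layout and check the round trip. The snippet below is a sketch, not part of the commit; it assumes a PyTorch build that ships the float8_e4m3fn and float8_e8m0fnu dtypes and a CUDA device for the fp8-to-bf16 casts:

    import torch
    from lmdeploy.pytorch.kernels.cuda.dsv4.layout import (V4_FLASHMLA_D_NOPE, V4_FLASHMLA_HEAD_DIM,
                                                            dequantize_v4_flashmla_sparse)

    num_blocks, block_size, packed_dim = 1, 2, 584
    buf = torch.zeros(num_blocks, block_size * packed_dim, dtype=torch.uint8, device='cuda')

    # NoPE+RoPE region: 576 bytes per token.
    nope_rope = buf[:, :block_size * 576].view(num_blocks, block_size, 576)
    nope_rope[:, :, :448] = 0x38                            # e4m3fn bit pattern for 1.0
    nope_rope[:, :, 448:].view(torch.bfloat16)[:] = 0.5     # 64 bf16 RoPE values per token
    # Trailing scales region: 8 bytes per token (7 e8m0 scales + 1 pad).
    scales = buf[:, block_size * 576:].view(num_blocks, block_size, 8)
    scales[:, :, :7] = 128                                  # e8m0fnu byte 128 encodes 2^(128-127) = 2.0

    packed = buf.view(torch.float8_e4m3fn).view(num_blocks, block_size, 1, packed_dim)
    out = dequantize_v4_flashmla_sparse(packed)             # [1, 2, 1, 512] bf16
    assert out.shape == (num_blocks, block_size, 1, V4_FLASHMLA_HEAD_DIM)
    assert torch.all(out[..., :V4_FLASHMLA_D_NOPE] == 2.0)  # 1.0 * per-tile scale 2.0
    assert torch.all(out[..., V4_FLASHMLA_D_NOPE:] == 0.5)  # RoPE bytes pass through unchanged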

lmdeploy/pytorch/kernels/cuda/v4_compressor.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -625,9 +625,9 @@ def _fill_compressed_kv_kernel(
625625
cache_ptrs = kv_cache_ptr + phys_block * kvc_stride_b + block_off * kvc_stride_s + offs_d * kvc_stride_d
626626
tl.store(cache_ptrs, compressed.to(kv_cache_ptr.dtype.element_ty))
627627

628-
# ---- Write to FP8 paged block cache (MODEL1 sparse format) ----
628+
# ---- Write to FP8 paged block cache (V4 FlashMLA sparse format) ----
629629
if has_fp8:
630-
# FlashMLA MODEL1 sparse FP8 layout (matches C++ kernel addressing):
630+
# V4 FlashMLA sparse FP8 layout (matches C++ kernel addressing):
631631
# NoPE+RoPE region: [num_blocks, entries_per_block, 576] as e4m3fn
632632
# per-token: [NoPE 448 fp8 | RoPE 128 bytes (64 bf16)]
633633
# token stride = 576 bytes
@@ -639,6 +639,7 @@ def _fill_compressed_kv_kernel(
639639
# fp8_nope_rope_ptr — e4m3fn, stride_b/stride_s for NoPE write
640640
# fp8_rope_bf16_ptr — bfloat16 view of the RoPE region
641641
# fp8_scales_u8_ptr — uint8 view of the scales region
642+
# Must match V4_FLASHMLA_* in dsv4/layout.py
642643
D_NOPE: tl.constexpr = 448
643644
D_ROPE: tl.constexpr = 64
644645
TILE_SIZE: tl.constexpr = 64
@@ -732,7 +733,7 @@ def fill_compressed_kv(
732733
(abs_pos = n*ratio - 1), this kernel scatters those entries into the
733734
block-paged kv_cache used by the decode-phase sparse attention.
734735
735-
When fp8_cache is provided, also writes MODEL1 sparse FP8 packed entries
736+
When fp8_cache is provided, also writes V4 FlashMLA sparse FP8 packed entries
736737
directly into fp8_cache, eliminating the need for a separate Python-side
737738
packing step.
738739
@@ -755,7 +756,7 @@ def fill_compressed_kv(
755756
phys_block = block_offsets[batch_id, block_idx] (physical block in kv_cache)
756757
write target: kv_cache[phys_block, block_off]
757758
758-
== FP8 MODEL1 sparse format ==
759+
== FP8 V4 FlashMLA sparse format ==
759760
When fp8_cache is not None, the kernel also writes to:
760761
fp8_cache: [num_blocks, entries_per_block, packed_dim=584]
761762
Per-token layout: [NoPE 448 FP8 | RoPE 128 BF16-as-bytes | 7 E8M0 scales | 1 pad]
@@ -806,7 +807,7 @@ def fill_compressed_kv(
806807
kv_scale_cache = dummy
807808

808809
if has_fp8:
809-
# FlashMLA MODEL1 sparse FP8 layout: the fp8_cache tensor is
810+
# V4 FlashMLA sparse FP8 layout: the fp8_cache tensor is
810811
# [num_blocks, entries_per_block, 584] but the actual memory layout
811812
# has NoPE+RoPE at stride 576 bytes per token, with scales in a
812813
# separate region. We create three views matching FlashMLA's addressing:
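
For readers following the addressing comments above: one plausible construction of those three views, using the num_blocks / entries_per_block names from the docstring, is sketched below. The commit's actual view-building code sits outside this hunk and may differ in detail:

    # fp8_cache: [num_blocks, entries_per_block, 584] float8_e4m3fn, aliased three ways.
    flat = fp8_cache.view(num_blocks, -1)
    nope_rope = flat[:, :entries_per_block * 576].view(num_blocks, entries_per_block, 576)
    fp8_nope_rope = nope_rope[:, :, :448]                         # e4m3fn NoPE region
    fp8_rope_bf16 = nope_rope[:, :, 448:].view(torch.bfloat16)    # 64 bf16 RoPE values per token
    fp8_scales_u8 = flat[:, entries_per_block * 576:].view(
        num_blocks, entries_per_block, 8)[:, :, :7].view(torch.uint8)  # 7 e8m0 scale bytes per token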

lmdeploy/pytorch/kernels/cuda/v4_flatten_kv.py

Lines changed: 3 additions & 3 deletions
@@ -121,7 +121,7 @@ def flatten_v4_kv(
         cu_seqlens_k: optional [bsz+1] int32 cumulative KV sequence lengths.
             If None, computed from kv_seqlens.
         fp8_compressed_kv_cache: optional [num_blocks, entries_per_block, 584]
-            FP8 MODEL1 sparse paged cache. When provided and compressed_kv_cache
+            FP8 V4 FlashMLA sparse paged cache. When provided and compressed_kv_cache
             is None, the FP8 cache is dequantized to a temporary BF16 tensor
             and used instead.
         slot: optional [bsz] int64 slot indices into the global
@@ -136,9 +136,9 @@ def flatten_v4_kv(
     """
     # If FP8 cache is provided and no BF16 cache, dequantize first
    if fp8_compressed_kv_cache is not None and compressed_kv_cache is None:
-        from lmdeploy.pytorch.backends.cuda.attention.flashmla_utils import dequantize_model1_fp8_sparse
+        from lmdeploy.pytorch.kernels.cuda.dsv4.layout import dequantize_v4_flashmla_sparse
         # fp8_cache is [num_blocks, entries, 584]; dequantize expects [num_blocks, entries, 1, 584]
-        dequant = dequantize_model1_fp8_sparse(
+        dequant = dequantize_v4_flashmla_sparse(
             fp8_compressed_kv_cache.unsqueeze(2)).squeeze(2)  # [num_blocks, entries, 512]
         # Clone to decouple from FP8 cache views. No synchronize() needed —
         # same-stream kernel launches are ordered, so the Triton flatten kernel

lmdeploy/pytorch/kernels/cuda/v4_pack_window.py

Lines changed: 17 additions & 17 deletions
@@ -1,9 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-"""Triton kernel to pack BF16 tokens into FlashMLA MODEL1 sparse FP8 flat-
+"""Triton kernel to pack BF16 tokens into V4 FlashMLA sparse FP8 flat-
 layout window cache, replacing the per-token Python loop in
 _pack_window_state_tokens.

-FlashMLA MODEL1 flat layout per slot (viewed as flat bytes):
+V4 FlashMLA flat layout per slot (viewed as flat bytes):
     [token_0 NoPE+RoPE | token_1 NoPE+RoPE | ... | token_0 scales | token_1 scales | ...]
     NoPE+RoPE per token = 576 bytes (448 e4m3fn + 128 bf16)
     Scales per token = 8 bytes (7 e8m0fnu + 1 padding)
@@ -92,19 +92,19 @@ def pack_window_tokens_fp8(
     slot: torch.Tensor,
     positions: torch.Tensor,
 ):
-    """Pack BF16 tokens into FlashMLA MODEL1 sparse FP8 window cache.
+    """Pack BF16 tokens into V4 FlashMLA sparse FP8 window cache.

     Args:
         kv_tokens: [num_tokens, 512] BF16 tokens to pack.
         window_state_fp8_cache: [num_total_slots, window_size, packed_dim] FP8 cache.
         slot: [num_tokens] slot indices (which cache row to write to).
         positions: [num_tokens] ring-buffer positions within the window.
     """
-    from lmdeploy.pytorch.backends.cuda.attention.flashmla_utils import (
-        MODEL1_D_NOPE,
-        MODEL1_D_ROPE,
-        MODEL1_NUM_TILES,
-        MODEL1_TILE_SIZE,
+    from lmdeploy.pytorch.kernels.cuda.dsv4.layout import (
+        V4_FLASHMLA_D_NOPE,
+        V4_FLASHMLA_D_ROPE,
+        V4_FLASHMLA_NUM_TILES,
+        V4_FLASHMLA_TILE_SIZE,
     )

     assert kv_tokens.dim() == 2
@@ -113,7 +113,7 @@ def pack_window_tokens_fp8(
         return

     window_size = window_state_fp8_cache.size(1)
-    nope_rope_stride = MODEL1_D_NOPE + 2 * MODEL1_D_ROPE  # 576 bytes per token in NoPE+RoPE region
+    nope_rope_stride = V4_FLASHMLA_D_NOPE + 2 * V4_FLASHMLA_D_ROPE  # 576 bytes per token in NoPE+RoPE region
     num_slots = window_state_fp8_cache.size(0)

     # Create three views of the same FP8 cache buffer (same pattern as fill_compressed_kv)
@@ -122,16 +122,16 @@ def pack_window_tokens_fp8(
     # NoPE+RoPE region: [num_slots, window_size * 576] as e4m3fn
     nope_rope = flat[:, :window_size * nope_rope_stride].view(
         num_slots, window_size, nope_rope_stride)
-    nope_view = nope_rope[:, :, :MODEL1_D_NOPE]  # [num_slots, window_size, 448] e4m3fn
+    nope_view = nope_rope[:, :, :V4_FLASHMLA_D_NOPE]  # [num_slots, window_size, 448] e4m3fn

     # RoPE region: slice the RoPE part first (128 e4m3fn bytes = 64 bf16 elements),
-    # then view as bf16 — same pattern as quantize_model1_fp8_sparse
-    rope_e4 = nope_rope[:, :, MODEL1_D_NOPE:]  # [num_slots, window_size, 128] e4m3fn
+    # then view as bf16 — same pattern as quantize_v4_flashmla_sparse
+    rope_e4 = nope_rope[:, :, V4_FLASHMLA_D_NOPE:]  # [num_slots, window_size, 128] e4m3fn
     rope_view = rope_e4.view(torch.bfloat16)  # [num_slots, window_size, 64] bf16

     # Scale region: uint8 view
     scale_view = flat[:, window_size * nope_rope_stride:].view(
-        num_slots, window_size, 8)[:, :, :MODEL1_NUM_TILES].view(torch.uint8)
+        num_slots, window_size, 8)[:, :, :V4_FLASHMLA_NUM_TILES].view(torch.uint8)

     grid = (num_tokens,)
     _pack_window_tokens_fp8_kernel[grid](
@@ -151,8 +151,8 @@ def pack_window_tokens_fp8(
         stride_scale_pos=scale_view.stride(1),
         stride_slot=1,
         WINDOW_SIZE=window_size,
-        D_NOPE=MODEL1_D_NOPE,
-        D_ROPE=MODEL1_D_ROPE,
-        TILE_SIZE=MODEL1_TILE_SIZE,
-        NUM_TILES=MODEL1_NUM_TILES,
+        D_NOPE=V4_FLASHMLA_D_NOPE,
+        D_ROPE=V4_FLASHMLA_D_ROPE,
+        TILE_SIZE=V4_FLASHMLA_TILE_SIZE,
+        NUM_TILES=V4_FLASHMLA_NUM_TILES,
     )
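
As a reading aid for the flat per-slot layout described in the module docstring, the byte offsets of a single token inside one window slot work out as below (illustrative helper, not part of the commit):

    def v4_window_byte_offsets(token_idx: int, window_size: int) -> tuple[int, int]:
        """Byte offsets of one token inside a flat window slot."""
        nope_rope_off = token_idx * 576                  # 448 e4m3fn NoPE + 128 bf16-as-bytes RoPE
        scales_off = window_size * 576 + token_idx * 8   # 7 e8m0 scale bytes + 1 pad, after all NoPE+RoPE
        return nope_rope_off, scales_off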

tests/pytorch/kernel/test_v4_compressor.py

Lines changed: 7 additions & 7 deletions
@@ -749,7 +749,7 @@ def test_decode_r128(self, kvlen, compress_ratio, head_dim, block_size, device,
 class TestFillCompressedKVFP8:
     """Test FP8 direct write in fill_compressed_kv.

-    Verifies that the kernel's MODEL1 sparse FP8 output matches the Python reference (quantize_model1_fp8_sparse). Only
+    Verifies that the kernel's V4 FlashMLA sparse FP8 output matches the Python reference (quantize_v4_flashmla_sparse). Only
     ratio=4 is tested since r128 has no FP8 cache.
     """

@@ -772,13 +772,13 @@ def _packed_token_dim(self):
         return self.D_NOPE + 2 * self.D_ROPE + self.NUM_TILES + 1  # 584

     def _reference_pack_fp8(self, bf16_tokens):
-        """Pack BF16 tokens [N, 512] to MODEL1 FP8 using the Python
+        """Pack BF16 tokens [N, 512] to V4 FlashMLA FP8 using the Python
         reference."""
-        from lmdeploy.pytorch.backends.cuda.attention.flashmla_utils import quantize_model1_fp8_sparse
-        # quantize_model1_fp8_sparse expects [num_blocks, block_size, 1, 512]
+        from .dsv4_utils import quantize_v4_flashmla_sparse
+        # quantize_v4_flashmla_sparse expects [num_blocks, block_size, 1, 512]
         # For N tokens, treat as 1 block of N entries
         input_cache = bf16_tokens.unsqueeze(0).unsqueeze(2)  # [1, N, 1, 512]
-        packed = quantize_model1_fp8_sparse(input_cache)  # [1, N, 1, 584]
+        packed = quantize_v4_flashmla_sparse(input_cache)  # [1, N, 1, 584]
         return packed.squeeze(0).squeeze(1)  # [N, 584]

     def _run_test(self, compressed_kv, cu_q_seqlens, kv_seqlens, block_offsets, device):
@@ -805,9 +805,9 @@ def _run_test(self, compressed_kv, cu_q_seqlens, kv_seqlens, block_offsets, devi
             fp8_cache=fp8_cache)

         # Reference: dequantize FP8 cache and compare with BF16 cache
-        from lmdeploy.pytorch.backends.cuda.attention.flashmla_utils import dequantize_model1_fp8_sparse
+        from .dsv4_utils import dequantize_v4_flashmla_sparse
         # Dequantize all blocks
-        dequant = dequantize_model1_fp8_sparse(
+        dequant = dequantize_v4_flashmla_sparse(
             fp8_cache.unsqueeze(2))  # [num_blocks, entries_per_block, 1, 512]
         dequant = dequant.squeeze(2)  # [num_blocks, entries_per_block, 512]
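
The Python reference quantize_v4_flashmla_sparse now lives in the tests' dsv4_utils module, which is not shown in this commit view. A plausible sketch of such a reference, i.e. the inverse of dequantize_v4_flashmla_sparse with power-of-two per-tile scales, is given below; the function body, the ceil rounding, and the amax clamp are assumptions, not the commit's actual helper:

    import torch

    def quantize_v4_flashmla_sparse_sketch(k_cache: torch.Tensor) -> torch.Tensor:
        """[num_blocks, block_size, 1, 512] bf16 -> [num_blocks, block_size, 1, 584] e4m3fn (assumed layout)."""
        num_blocks, block_size = k_cache.shape[0], k_cache.shape[1]
        nope = k_cache[:, :, 0, :448].float()
        rope = k_cache[:, :, 0, 448:]                                # 64 bf16 values, stored verbatim
        tiles = nope.view(num_blocks, block_size, 7, 64)
        amax = tiles.abs().amax(dim=-1).clamp(min=2**-126)
        exp = torch.ceil(torch.log2(amax / 448.0)).clamp(-127, 127)  # power-of-two exponent; 448 = e4m3fn max
        q = (tiles / torch.exp2(exp).unsqueeze(-1)).to(torch.float8_e4m3fn)

        out = torch.zeros(num_blocks, block_size * 584, dtype=torch.uint8, device=k_cache.device)
        nope_rope = out[:, :block_size * 576].view(num_blocks, block_size, 576)
        nope_rope[:, :, :448] = q.reshape(num_blocks, block_size, 448).view(torch.uint8)
        nope_rope[:, :, 448:].view(torch.bfloat16)[:] = rope
        scales = out[:, block_size * 576:].view(num_blocks, block_size, 8)
        scales[:, :, :7] = (exp + 127).to(torch.uint8)               # e8m0fnu bit pattern, scale = 2^(byte - 127)
        return out.view(torch.float8_e4m3fn).view(num_blocks, block_size, 1, 584)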
