Support FP8 KV cache (cache_fp8 / "cfp8") in Blackwell MLA (#7876)

zhoutianzi666 · web-flow · commit 4f1484545f3d · 2026-05-21T15:41:56.000+08:00
diff --git a/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cu b/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cu
@@ -30,6 +30,7 @@ std::vector<paddle::Tensor> PrefillMLAWriteCache(
     const paddle::Tensor& slot_mapping,
     const paddle::optional<paddle::Tensor>& kv_signal_data,
     cudaStream_t& stream,
+    const std::string& cache_quant_type_str,
     paddle::Tensor* kv_cache) {
   typedef PDTraits<T> traits_;
   typedef typename traits_::DataType DataType_;
@@ -50,27 +51,51 @@ std::vector<paddle::Tensor> PrefillMLAWriteCache(
   int grid_size = 1;
   GetNumBlocks<128>(pack_num, &grid_size);
 
-  using CT = DataType_;
-
-  prefill_absorb_cache_kernel<DataType_, PackSize, CT>
-      <<<grid_size, blocksize, 0, stream>>>(
-          reinterpret_cast<DataType_*>(
-              const_cast<data_t*>(kv_nope.data<data_t>())),
-          reinterpret_cast<DataType_*>(
-              const_cast<data_t*>(kv_pe.data<data_t>())),
-          reinterpret_cast<DataType_*>(kv_cache->data<data_t>()),
-          block_tables.data<int>(),
-          slot_mapping.data<int64_t>(),
-          batch_id_per_token.data<int>(),
-          cu_seqlens_q.data<int>(),
-          seq_lens.data<int>(),
-          seq_lens_decoder.data<int>(),
-          max_blocks_per_seq,
-          kv_num_heads,
-          nope_size,
-          pe_size,
-          block_size,
-          elem_nums);
+  if (cache_quant_type_str == "cache_fp8") {
+    using CT = __nv_fp8_e4m3;
+    prefill_absorb_cache_kernel<DataType_, PackSize, CT>
+        <<<grid_size, blocksize, 0, stream>>>(
+            reinterpret_cast<DataType_*>(
+                const_cast<data_t*>(kv_nope.data<data_t>())),
+            reinterpret_cast<DataType_*>(
+                const_cast<data_t*>(kv_pe.data<data_t>())),
+            reinterpret_cast<CT*>(kv_cache->data<uint8_t>()),
+            block_tables.data<int>(),
+            slot_mapping.data<int64_t>(),
+            batch_id_per_token.data<int>(),
+            cu_seqlens_q.data<int>(),
+            seq_lens.data<int>(),
+            seq_lens_decoder.data<int>(),
+            max_blocks_per_seq,
+            kv_num_heads,
+            nope_size,
+            pe_size,
+            block_size,
+            elem_nums);
+  } else if (cache_quant_type_str == "none") {
+    prefill_absorb_cache_kernel<DataType_, PackSize, DataType_>
+        <<<grid_size, blocksize, 0, stream>>>(
+            reinterpret_cast<DataType_*>(
+                const_cast<data_t*>(kv_nope.data<data_t>())),
+            reinterpret_cast<DataType_*>(
+                const_cast<data_t*>(kv_pe.data<data_t>())),
+            reinterpret_cast<DataType_*>(kv_cache->data<data_t>()),
+            block_tables.data<int>(),
+            slot_mapping.data<int64_t>(),
+            batch_id_per_token.data<int>(),
+            cu_seqlens_q.data<int>(),
+            seq_lens.data<int>(),
+            seq_lens_decoder.data<int>(),
+            max_blocks_per_seq,
+            kv_num_heads,
+            nope_size,
+            pe_size,
+            block_size,
+            elem_nums);
+  } else {
+    PD_THROW("Unsupported cache_quant_type_str type: %s.",
+             cache_quant_type_str.c_str());
+  }
 
   const char* fmt_write_cache_completed_signal_str =
       std::getenv("FLAGS_fmt_write_cache_completed_signal");
@@ -142,6 +167,7 @@ std::vector<paddle::Tensor> PrefillMLAWriteCacheKernel(
           slot_mapping,
           kv_signal_data,
           stream,
+          cache_quant_type_str,
           const_cast<paddle::Tensor*>(&kv_cache));
     }
     case paddle::DataType::FLOAT16: {
@@ -157,6 +183,7 @@ std::vector<paddle::Tensor> PrefillMLAWriteCacheKernel(
           slot_mapping,
           kv_signal_data,
           stream,
+          cache_quant_type_str,
           const_cast<paddle::Tensor*>(&kv_cache));
     }
   }
diff --git a/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cuh b/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cuh
@@ -186,6 +186,8 @@ __global__ void prefill_absorb_cache_kernel(
     const uint32_t elem_cnt) {
   using LoadT = AlignedVector<T, VecSize>;
   LoadT src_vec;
+  using StoreT = AlignedVector<CT, VecSize>;
+  StoreT dst_vec;
 
   int64_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
   const uint32_t nope_hidden_size = kv_num_heads * nope_size;
@@ -227,7 +229,20 @@ __global__ void prefill_absorb_cache_kernel(
           hi * block_size * all_size + block_offset * all_size + h_bias;
       const uint32_t ori_idx = token_idx * nope_hidden_size + inner_bias;
       Load<T, VecSize>(&kv_nope[ori_idx], &src_vec);
-      Store<T, VecSize>(src_vec, &kv_cache[tgt_idx]);
+
+      if constexpr (std::is_same_v<CT, __nv_fp8_e4m3>) {
+        for (int i = 0; i < VecSize; i++) {
+          float quant_value = (float)(src_vec[i]);
+          quant_value = quant_value > 448.0f ? 448.0f : quant_value;
+          quant_value = quant_value < -448.0f ? -448.0f : quant_value;
+          dst_vec[i] = static_cast<__nv_fp8_e4m3>(quant_value);
+        }
+
+        Store<CT, VecSize>(dst_vec, &kv_cache[tgt_idx]);
+      } else {
+        Store<CT, VecSize>(src_vec, &kv_cache[tgt_idx]);
+      }
+
     } else {
       const uint32_t inner_bias = bias - nope_hidden_size;
       const uint32_t hi = inner_bias / pe_size;
@@ -238,7 +253,18 @@ __global__ void prefill_absorb_cache_kernel(
           h_bias;
       const uint32_t ori_idx = token_idx * pe_hidden_size + inner_bias;
       Load<T, VecSize>(&kv_pe[ori_idx], &src_vec);
-      Store<T, VecSize>(src_vec, &kv_cache[tgt_idx]);
+
+      if constexpr (std::is_same_v<CT, __nv_fp8_e4m3>) {
+        for (int i = 0; i < VecSize; i++) {
+          float quant_value = (float)(src_vec[i]);
+          quant_value = quant_value > 448.0f ? 448.0f : quant_value;
+          quant_value = quant_value < -448.0f ? -448.0f : quant_value;
+          dst_vec[i] = static_cast<__nv_fp8_e4m3>(quant_value);
+        }
+        Store<CT, VecSize>(dst_vec, &kv_cache[tgt_idx]);
+      } else {
+        Store<CT, VecSize>(src_vec, &kv_cache[tgt_idx]);
+      }
     }
   }
 }
diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
@@ -860,7 +860,7 @@ def forward_mixed(
             metadata.block_tables,
             forward_meta.slot_mapping,
             metadata.kv_signal_data_list[layer.layer_id],
-            "none",
+            getattr(layer, "cache_quant_type_str", "none"),
         )
 
         # Prefill branch: k is not None
@@ -998,11 +998,13 @@ def forward_mixed(
     @staticmethod
     def mla_blackwell(decoder_q, latent_cache, block_table, cache_seqlens, attn_softmax_scale):
 
-        # decoder_q = decoder_q.cast(paddle.float8_e4m3fn)
-        # latent_cache = latent_cache.cast(paddle.float8_e4m3fn)
+        assert latent_cache.dtype in [paddle.bfloat16, paddle.uint8], latent_cache.dtype
+        use_fp8_cache_kv = latent_cache.dtype == paddle.uint8
+        if use_fp8_cache_kv:
+            decoder_q = decoder_q.cast(paddle.float8_e4m3fn)
+            latent_cache = latent_cache.view(paddle.float8_e4m3fn)
 
         assert decoder_q.dtype == latent_cache.dtype
-        q_dtype = decoder_q.dtype
 
         page_size = latent_cache.shape[2]
         q_num_heads = decoder_q.shape[2]
@@ -1049,11 +1051,16 @@ def mla_blackwell(decoder_q, latent_cache, block_table, cache_seqlens, attn_soft
         softmax_scale = attn_softmax_scale
         output_scale = 1.0
 
-        from mla_decode_fp16 import BlackwellMultiHeadLatentAttentionForwardFP16
-
-        # from mla_decode_fp8 import BlackwellMultiHeadLatentAttentionForwardFP8
+        if use_fp8_cache_kv:
+            from mla_decode_fp8 import (
+                BlackwellMultiHeadLatentAttentionForwardFP8 as kernel,
+            )
+        else:
+            from mla_decode_fp16 import (
+                BlackwellMultiHeadLatentAttentionForwardFP16 as kernel,
+            )
 
-        mla = BlackwellMultiHeadLatentAttentionForwardFP16(
+        mla = kernel(
             cutlass.Float32,
             cutlass.Float32,
             mma_qk_tiler_mn=(128, 128),
@@ -1108,7 +1115,7 @@ def mla_blackwell(decoder_q, latent_cache, block_table, cache_seqlens, attn_soft
             stream,
         )
 
-        if q_dtype == paddle.float8_e4m3fn:
+        if use_fp8_cache_kv:
             paddle_output = paddle_output.cast("bfloat16")
         return paddle_output