Port the hybrid MTP-ngram cudagraph padding (pad_to_max) to the ngram_match kernel

NKNaN · NKNaN · commit 00c9cfea3e2c · 2026-05-25T18:57:25.000+08:00
diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -960,7 +960,8 @@ void NgramMatch(const paddle::Tensor& token_ids_all,
                 const paddle::Tensor& seq_lens_decoder,
                 const paddle::Tensor& max_dec_len,
                 const int max_ngram_size,
-                const int max_draft_tokens);
+                const int max_draft_tokens,
+                const bool pad_to_max);
 
 void HybridMtpNgram(const paddle::Tensor& token_ids_all,
                     const paddle::Tensor& prompt_lens,
diff --git a/custom_ops/gpu_ops/speculate_decoding/draft_model/ngram_match_mixed.cu b/custom_ops/gpu_ops/speculate_decoding/draft_model/ngram_match_mixed.cu
@@ -216,20 +216,21 @@ __global__ void ngram_match_mixed_gather_kernel(
       }
     }
 
-    // === Pad seq_lens_this_time to K+1 for cudagraph stability ===
-    // Hybrid MTP-ngram produces variable seq_lens_this_time depending on how
-    // many ngram positions hit (range: [num_model_steps+1, K+1]). cudagraph
-    // captures launch params (grid dim, kernel args) at capture time; if the
-    // captured slt differs from replay-time slt, downstream kernels read past
-    // valid ranges of cu_seqlens / slot_mapping etc., causing CUDA 700.
+    // === Pad seq_lens_this_time to num_speculative_tokens+1 (cudagraph) ===
+    // Hybrid MTP-ngram produces a variable seq_lens_this_time depending on
+    // how many ngram positions hit (range: [num_model_steps+1,
+    // num_speculative_tokens+1]). cudagraph captures launch params (grid dim,
+    // kernel args) at capture time; if the captured slt differs from
+    // replay-time slt, downstream kernels read past valid ranges of cu_seqlens
+    // / slot_mapping etc., causing CUDA 700.
     //
-    // When pad_to_max=true (cudagraph enabled), force slt = K+1 =
-    // max_draft_tokens + 1: positions beyond actual ngram hits get padded
-    // with a placeholder token. The target model will verify these
-    // placeholders and (almost always) reject them, but the verify cost is
-    // fixed per iteration => grid dim is now invariant. When pad_to_max=
-    // false (cudagraph disabled), keep the natural variable slt to avoid
-    // wasting verify compute on placeholders.
+    // When pad_to_max=true (cudagraph enabled), force slt =
+    // num_speculative_tokens+1 = max_draft_tokens + 1: positions beyond actual
+    // ngram hits get padded with a placeholder token. The target model will
+    // verify these placeholders and (almost always) reject them, but the verify
+    // cost is fixed per iteration => grid dim is now invariant. When
+    // pad_to_max=false (cudagraph disabled), keep the natural variable slt to
+    // avoid wasting verify compute on placeholders.
     if (pad_to_max) {
       int target_slt = max_draft_tokens_param + 1;
       if (actual < target_slt) {
diff --git a/custom_ops/gpu_ops/speculate_decoding/ngram_match.cu b/custom_ops/gpu_ops/speculate_decoding/ngram_match.cu
@@ -138,7 +138,9 @@ __global__ void ngram_match_gather_kernel(
     int32_t *seq_lens_this_time,
     int64_t draft_tokens_stride,
     int64_t max_batch_size,
-    int threshold) {
+    int threshold,
+    int max_draft_tokens_param,
+    bool pad_to_max) {
   typedef cub::BlockScan<int, NGRAM_GATHER_THREADS> BlockScanInt;
   __shared__ typename BlockScanInt::TempStorage temp_storage1;
   __shared__ typename BlockScanInt::TempStorage temp_storage2;
@@ -203,16 +205,39 @@ __global__ void ngram_match_gather_kernel(
       actual = min(tentative, budget);
     }
 
-    seq_lens_this_time[tid] = actual;
-
-    // Copy draft tokens (slots 1..actual-1) from scratch to output
+    // Copy draft tokens (slots 1..actual-1) from scratch to output FIRST
+    // (so subsequent padding doesn't overwrite real ngram hits)
     if (actual > 1) {
       int64_t *dst = draft_tokens + tid * draft_tokens_stride;
       const int64_t *src = draft_tokens_copy + tid * draft_tokens_stride;
       for (int k = 1; k < actual; k++) {
         dst[k] = src[k];
       }
     }
+
+    // === Pad seq_lens_this_time to max_draft_tokens+1 for cudagraph ===
+    // Variable seq_lens_this_time (range [1, max_draft_tokens+1]) clashes with
+    // cudagraph's fixed launch params captured at warm-up time; downstream
+    // kernels read past valid cu_seqlens / slot_mapping when replay sees a
+    // smaller slt, leading to OOB / CUDA 700. When pad_to_max=true (cudagraph
+    // enabled), pad missing positions with a placeholder so slt is fixed at
+    // max_draft_tokens+1; pad_to_max=false (cudagraph off) skips the padding.
+    if (pad_to_max) {
+      int target_slt = max_draft_tokens_param + 1;
+      if (actual < target_slt) {
+        int64_t *dst = draft_tokens + tid * draft_tokens_stride;
+        // Reuse the last valid draft token as placeholder. It is a token the
+        // model could plausibly have produced, so attention math stays
+        // well-defined; rejection happens at the sampler level.
+        int64_t pad_token = (actual > 0) ? dst[actual - 1] : 0;
+        for (int k = actual; k < target_slt; k++) {
+          dst[k] = pad_token;
+        }
+        actual = target_slt;
+      }
+    }
+
+    seq_lens_this_time[tid] = actual;
   }
 }
 
@@ -374,7 +399,8 @@ void NgramMatch(const paddle::Tensor &token_ids_all,
                 const paddle::Tensor &seq_lens_decoder,
                 const paddle::Tensor &max_dec_len,
                 const int max_ngram_size,
-                const int max_draft_tokens) {
+                const int max_draft_tokens,
+                const bool pad_to_max) {
   const int64_t max_model_len = token_ids_all.shape()[1];
 
   auto draft_tokens_shape = draft_tokens.shape();
@@ -448,7 +474,9 @@ void NgramMatch(const paddle::Tensor &token_ids_all,
         const_cast<int32_t *>(seq_lens_this_time.data<int32_t>()),
         draft_tokens_stride,
         max_batch_size,
-        threshold);
+        threshold,
+        max_draft_tokens,
+        pad_to_max);
   } else {
     find_candidate_pred_tokens(
         token_ids_all.data<int64_t>(),
@@ -478,7 +506,7 @@ PD_BUILD_STATIC_OP(ngram_match)
              "seq_lens_encoder",
              "seq_lens_decoder",
              "max_dec_len"})
-    .Attrs({"max_ngram_size: int", "max_draft_tokens: int"})
+    .Attrs({"max_ngram_size: int", "max_draft_tokens: int", "pad_to_max: bool"})
     .Outputs({"draft_tokens_out", "seq_lens_this_time_out"})
     .SetKernelFn(PD_KERNEL(NgramMatch))
     .SetInplaceMap({{"draft_tokens", "draft_tokens_out"},
diff --git a/fastdeploy/spec_decode/mtp_cuda.py b/fastdeploy/spec_decode/mtp_cuda.py
@@ -394,7 +394,7 @@ def _update_status(self):
 
     def _extend_draft_token_with_ngram_match(self):
         # pad_to_max forces hybrid kernel to write a fixed seq_lens_this_time
-        # = K + 1, padding unfilled ngram slots with a placeholder draft token.
+        # = num_speculative_tokens + 1, padding unfilled ngram slots with a placeholder draft token.
         # Required when target cudagraph is enabled (capture-time seq_lens_this_time
         # must match replay-time seq_lens_this_time).
         hybrid_mtp_ngram(
diff --git a/fastdeploy/spec_decode/ngram.py b/fastdeploy/spec_decode/ngram.py
@@ -39,6 +39,11 @@ def _run_impl(self, share_inputs):
         """
         run
         """
+        # pad_to_max forces the kernel to write a fixed seq_lens_this_time =
+        # num_speculative_tokens + 1, padding unfilled draft slots with a placeholder token.
+        # Required when target cudagraph is enabled (capture-time slt must
+        # match replay-time slt; see ngram_match.cu for details). Disabled
+        # when cudagraph is off to avoid wasted verify on placeholders.
         ngram_match(
             share_inputs["token_ids_all"],
             share_inputs["prompt_lens"],
@@ -51,4 +56,5 @@ def _run_impl(self, share_inputs):
             share_inputs["max_dec_len"],
             self.max_ngram_size,
             self.max_draft_token_num,
+            self.graph_opt_config.use_cudagraph,
         )
diff --git a/tests/operators/test_hybrid_mtp_ngram.py b/tests/operators/test_hybrid_mtp_ngram.py
@@ -75,6 +75,7 @@ def setUp(self):
         self.ref_draft_tokens = np.array([[8, 7, 6, 10, 9, 8], [8, 7, 6, 10, 9, 8]], dtype="int64")
 
     def test_ngram_match_mixed(self):
+        """pad_to_max=False: GPU output matches the CPU reference baseline."""
         hybrid_mtp_ngram(
             self.token_ids_all,
             self.prompt_lens,
@@ -94,6 +95,50 @@ def test_ngram_match_mixed(self):
         np.testing.assert_allclose(self.seq_lens_this_time.numpy(), self.ref_seq_lens_this_time)
         np.testing.assert_allclose(self.draft_tokens.numpy(), self.ref_draft_tokens)
 
+    def test_ngram_match_mixed_pad_to_max(self):
+        """pad_to_max=True: slt is forced to K+1 and unfilled draft slots are
+        padded with the last valid draft token (placeholder for cudagraph
+        stability).
+
+        To exercise the pad path we drive step_idx below min_ngram_size so
+        the search kernel finds no ngram match. Without pad, slt stays at
+        ori_seq_len_this_time=2; with pad, slt becomes max_draft_tokens+1=6
+        and draft_tokens[2:6] are filled with draft_tokens[1] (=7).
+        """
+        # No ngram match path: step_idx < min_ngram_size short-circuits search.
+        self.step_idx[:] = self.min_ngram_size - 1
+
+        hybrid_mtp_ngram(
+            self.token_ids_all,
+            self.prompt_lens,
+            self.pre_ids,
+            self.step_idx,
+            self.draft_token_num,
+            self.draft_tokens,
+            self.seq_lens_this_time,
+            self.seq_lens_decoder,
+            self.max_dec_len,
+            self.max_ngram_size,
+            self.min_ngram_size,
+            self.max_draft_tokens,
+            True,  # pad_to_max
+        )
+
+        target_slt = self.max_draft_tokens + 1  # K+1 = 6
+        slt = self.seq_lens_this_time.numpy()
+        assert (slt == target_slt).all(), f"expected all slt == {target_slt}, got {slt.flatten().tolist()}"
+
+        # ori_seq_len_this_time was 2; positions [2..6) should be padded with
+        # draft_tokens[1] (= 7, the last valid draft token before padding).
+        drafts = self.draft_tokens.numpy()
+        expected_placeholder = 7
+        for b in range(self.max_bsz):
+            np.testing.assert_array_equal(
+                drafts[b, 2:target_slt],
+                np.full(target_slt - 2, expected_placeholder, dtype="int64"),
+                err_msg=f"batch {b}: padded slots [2:{target_slt}) should equal placeholder {expected_placeholder}",
+            )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/spec_decode/test_ngram_gpu_kernel.py b/tests/spec_decode/test_ngram_gpu_kernel.py
@@ -365,6 +365,7 @@ def test_correctness_basic(self):
             gpu_data["max_dec_len"],
             max_ngram_size,
             max_draft_tokens,
+            False,
         )
         paddle.device.synchronize()
 
@@ -395,6 +396,7 @@ def test_correctness_varied_seeds(self):
                     data["max_dec_len"],
                     3,
                     10,
+                    False,
                 )
                 gpu_data = _to_gpu(data)
                 self.ngram_match(
@@ -409,6 +411,7 @@ def test_correctness_varied_seeds(self):
                     gpu_data["max_dec_len"],
                     3,
                     10,
+                    False,
                 )
                 paddle.device.synchronize()
                 np.testing.assert_array_equal(gpu_data["seq_lens_this_time"].numpy(), cpu_slt)
@@ -456,6 +459,7 @@ def test_large_batch_long_seq(self):
                 gpu_data["max_dec_len"],
                 3,
                 10,
+                False,
             )
             paddle.device.synchronize()
         finally:
@@ -485,6 +489,7 @@ def test_single_batch_long_seq(self):
             data["max_dec_len"],
             3,
             10,
+            False,
         )
         gpu_data = _to_gpu(data)
         self.ngram_match(
@@ -499,6 +504,7 @@ def test_single_batch_long_seq(self):
             gpu_data["max_dec_len"],
             3,
             10,
+            False,
         )
         paddle.device.synchronize()
         np.testing.assert_array_equal(gpu_data["seq_lens_this_time"].numpy(), cpu_slt)
@@ -542,6 +548,7 @@ def test_many_short_seqs(self):
                 gpu_data["max_dec_len"],
                 3,
                 10,
+                False,
             )
             paddle.device.synchronize()
         finally:
@@ -569,6 +576,7 @@ def test_latency(self):
                 d["max_dec_len"],
                 3,
                 10,
+                False,
             )
         paddle.device.synchronize()
 
@@ -591,6 +599,7 @@ def test_latency(self):
                 gpu_data["max_dec_len"],
                 3,
                 10,
+                False,
             )
             paddle.device.synchronize()
         t1 = time.perf_counter()
@@ -641,6 +650,7 @@ def test_latency_scaling(self):
                     gpu_data["max_dec_len"],
                     3,
                     10,
+                    False,
                 )
             paddle.device.synchronize()
 
@@ -660,6 +670,7 @@ def test_latency_scaling(self):
                     gpu_data["max_dec_len"],
                     3,
                     10,
+                    False,
                 )
                 paddle.device.synchronize()
             gpu_ms = (time.perf_counter() - t0) / n_runs * 1000
@@ -742,6 +753,7 @@ def test_latency_extreme(self):
                         gpu_data["max_dec_len"],
                         3,
                         10,
+                        False,
                     )
                 paddle.device.synchronize()
 
@@ -761,6 +773,7 @@ def test_latency_extreme(self):
                         gpu_data["max_dec_len"],
                         3,
                         10,
+                        False,
                     )
                     paddle.device.synchronize()
                 t1 = time.perf_counter()