PaddlePaddle
diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
Lines changed: 1 addition & 3 deletions
diff --git a/custom_ops/gpu_ops/speculate_decoding/ngram_match.cu b/custom_ops/gpu_ops/speculate_decoding/ngram_match.cu
Lines changed: 15 additions & 34 deletions
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
Lines changed: 1 addition & 0 deletions
diff --git a/fastdeploy/spec_decode/ngram.py b/fastdeploy/spec_decode/ngram.py
Lines changed: 0 additions & 13 deletions
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
Lines changed: 5 additions & 1 deletion
@@ -951,9 +951,7 @@ void SpeculateScheduleCache(const paddle::Tensor& draft_tokens,
                             const int block_size,
                             const int max_draft_tokens);
 
-void NgramMatch(const paddle::Tensor& input_ids,
-                const paddle::Tensor& input_ids_len,
-                const paddle::Tensor& token_ids_all,
+void NgramMatch(const paddle::Tensor& token_ids_all,
                 const paddle::Tensor& prompt_lens,
                 const paddle::Tensor& step_idx,
                 const paddle::Tensor& draft_token_num,
 
@@ -27,9 +27,7 @@
 // the tentative new seq_lens_this_time to a copy buffer.
 // Phase 2 will decide which ones to keep (threshold logic).
 // ============================================================
-__global__ void ngram_match_search_kernel(const int64_t *input_ids,
-                                          const int64_t *input_ids_len,
-                                          const int64_t *token_ids_all,
+__global__ void ngram_match_search_kernel(const int64_t *token_ids_all,
                                           const int64_t *prompt_lens,
                                           const int64_t *step_idx,
                                           const int *draft_token_num,
@@ -38,7 +36,6 @@ __global__ void ngram_match_search_kernel(const int64_t *input_ids,
                                           const int64_t *max_dec_len,
                                           int64_t *draft_tokens_copy,
                                           int32_t *seq_lens_this_time_copy,
-                                          int64_t input_ids_stride,
                                           int64_t max_model_len,
                                           int64_t draft_tokens_stride,
                                           int64_t max_batch_size,
@@ -63,9 +60,9 @@ __global__ void ngram_match_search_kernel(const int64_t *input_ids,
   // Active decoder item: at least the base token.
   if (threadIdx.x == 0) seq_lens_this_time_copy[batch_idx] = 1;
 
-  const int64_t *cur_input_ids = input_ids + batch_idx * input_ids_stride;
-  const int64_t cur_input_ids_len = input_ids_len[batch_idx];
   const int64_t prompt_len = prompt_lens[batch_idx];
+  const int64_t *cur_input_ids = token_ids_all + batch_idx * max_model_len;
+  const int64_t cur_input_ids_len = prompt_len;
   const int64_t *cur_pre_ids =
       token_ids_all + batch_idx * max_model_len + prompt_len;
   const int64_t cur_step_idx = step_idx[batch_idx];
@@ -79,7 +76,7 @@ __global__ void ngram_match_search_kernel(const int64_t *input_ids,
   for (int ngram_size = max_ngram_size; ngram_size >= 1; --ngram_size) {
     if (cur_step_idx < ngram_size) continue;
 
-    const int64_t *ngram = cur_pre_ids + (cur_step_idx + 1 - ngram_size);
+    const int64_t *ngram = cur_pre_ids + (cur_step_idx - ngram_size);
 
     int64_t pos = parallel_ngram_search(
         cur_input_ids, cur_input_ids_len, ngram, ngram_size, &s_min_pos);
@@ -235,9 +232,7 @@ static int sum_cpu(const int *value, int num) {
   return sum_value;
 }
 
-static void find_candidate_pred_tokens(const int64_t *input_ids,
-                                       const int64_t *input_ids_len,
-                                       const int64_t *token_ids_all,
+static void find_candidate_pred_tokens(const int64_t *token_ids_all,
                                        const int64_t *prompt_lens,
                                        const int64_t *step_idx,
                                        const int *draft_token_num,
@@ -246,7 +241,6 @@ static void find_candidate_pred_tokens(const int64_t *input_ids,
                                        int32_t *seq_lens_encoder,
                                        int32_t *seq_lens_decoder,
                                        int64_t *max_dec_len,
-                                       int64_t input_ids_stride,
                                        int64_t max_model_len,
                                        int64_t draft_tokens_stride,
                                        int64_t max_batch_size,
@@ -274,12 +268,12 @@ static void find_candidate_pred_tokens(const int64_t *input_ids,
       continue;
     }
 
-    const int64_t *cur_input_ids = input_ids + batch_idx * input_ids_stride;
+    const int64_t *cur_input_ids = token_ids_all + batch_idx * max_model_len;
+    const int64_t cur_input_ids_len = prompt_lens[batch_idx];
     int64_t *cur_draft_tokens = draft_tokens + batch_idx * draft_tokens_stride;
     const int64_t *cur_pre_ids =
-        token_ids_all + batch_idx * max_model_len + prompt_lens[batch_idx];
+        token_ids_all + batch_idx * max_model_len + cur_input_ids_len;
     const int64_t cur_step_idx = step_idx[batch_idx];
-    const int64_t cur_input_ids_len = input_ids_len[batch_idx];
     seq_lens_this_time[batch_idx] = 1;
     unprocessed_batch_size--;
 
@@ -301,7 +295,7 @@ static void find_candidate_pred_tokens(const int64_t *input_ids,
       if (cur_step_idx < ngram_size) {
         continue;
       }
-      const int64_t *ngram = cur_pre_ids + (cur_step_idx + 1 - ngram_size);
+      const int64_t *ngram = cur_pre_ids + (cur_step_idx - ngram_size);
 
       bool match_input = false;
       for (int64_t i = 0; i <= cur_input_ids_len - ngram_size; ++i) {
@@ -370,9 +364,7 @@ static void find_candidate_pred_tokens(const int64_t *input_ids,
 // bsz × NGRAM_BLOCK_THREADS threads.  Phase 2 is O(bsz) with scans.
 // ============================================================
 
-void NgramMatch(const paddle::Tensor &input_ids,
-                const paddle::Tensor &input_ids_len,
-                const paddle::Tensor &token_ids_all,
+void NgramMatch(const paddle::Tensor &token_ids_all,
                 const paddle::Tensor &prompt_lens,
                 const paddle::Tensor &step_idx,
                 const paddle::Tensor &draft_token_num,
@@ -383,9 +375,6 @@ void NgramMatch(const paddle::Tensor &input_ids,
                 const paddle::Tensor &max_dec_len,
                 const int max_ngram_size,
                 const int max_draft_tokens) {
-  auto input_ids_shape = input_ids.shape();
-  const int64_t input_ids_stride = input_ids_shape[1];
-
   const int64_t max_model_len = token_ids_all.shape()[1];
 
   auto draft_tokens_shape = draft_tokens.shape();
@@ -399,8 +388,8 @@ void NgramMatch(const paddle::Tensor &input_ids,
     threshold = std::stoi(env_var);
   }
 
-  if (input_ids.is_gpu()) {
-    auto stream = input_ids.stream();
+  if (token_ids_all.is_gpu()) {
+    auto stream = token_ids_all.stream();
 
     // Persistent scratch buffers for Phase 1 → Phase 2 communication.
     // Cached across calls to avoid per-invocation allocation overhead.
@@ -416,9 +405,9 @@ void NgramMatch(const paddle::Tensor &input_ids,
         draft_tokens_stride > s_scratch_stride) {
       s_draft_copy = paddle::empty({max_batch_size, draft_tokens_stride},
                                    paddle::DataType::INT64,
-                                   input_ids.place());
+                                   token_ids_all.place());
       s_seqlens_copy = paddle::empty(
-          {max_batch_size}, paddle::DataType::INT32, input_ids.place());
+          {max_batch_size}, paddle::DataType::INT32, token_ids_all.place());
       s_scratch_batch = max_batch_size;
       s_scratch_stride = draft_tokens_stride;
     }
@@ -435,8 +424,6 @@ void NgramMatch(const paddle::Tensor &input_ids,
                                 NGRAM_BLOCK_THREADS,
                                 0,
                                 stream>>>(
-        input_ids.data<int64_t>(),
-        input_ids_len.data<int64_t>(),
         token_ids_all.data<int64_t>(),
         prompt_lens.data<int64_t>(),
         step_idx.data<int64_t>(),
@@ -446,7 +433,6 @@ void NgramMatch(const paddle::Tensor &input_ids,
         max_dec_len.data<int64_t>(),
         draft_tokens_copy.data<int64_t>(),
         seq_lens_this_time_copy.data<int32_t>(),
-        input_ids_stride,
         max_model_len,
         draft_tokens_stride,
         max_batch_size,
@@ -465,8 +451,6 @@ void NgramMatch(const paddle::Tensor &input_ids,
         threshold);
   } else {
     find_candidate_pred_tokens(
-        input_ids.data<int64_t>(),
-        input_ids_len.data<int64_t>(),
         token_ids_all.data<int64_t>(),
         prompt_lens.data<int64_t>(),
         step_idx.data<int64_t>(),
@@ -476,7 +460,6 @@ void NgramMatch(const paddle::Tensor &input_ids,
         const_cast<int32_t *>(seq_lens_encoder.data<int32_t>()),
         const_cast<int32_t *>(seq_lens_decoder.data<int32_t>()),
         const_cast<int64_t *>(max_dec_len.data<int64_t>()),
-        input_ids_stride,
         max_model_len,
         draft_tokens_stride,
         max_batch_size,
@@ -486,9 +469,7 @@ void NgramMatch(const paddle::Tensor &input_ids,
 }
 
 PD_BUILD_STATIC_OP(ngram_match)
-    .Inputs({"input_ids",
-             "input_ids_len",
-             "token_ids_all",
+    .Inputs({"token_ids_all",
              "prompt_lens",
              "step_idx",
              "draft_token_num",
 
@@ -1964,6 +1964,7 @@ def __init__(
                     in [
                         SpecMethod.MTP,
                         SpecMethod.SUFFIX,
+                        SpecMethod.NGRAM,
                     ]
                 )
                 else 0
 
@@ -16,8 +16,6 @@
 
 from typing import TYPE_CHECKING
 
-import paddle
-
 from fastdeploy.model_executor.ops.gpu import ngram_match
 
 from .base import Proposer
@@ -36,23 +34,12 @@ class NgramProposer(Proposer):
     def __init__(self, fd_config: "FDConfig"):
         super().__init__(fd_config)
         self.max_ngram_size = self.speculative_config.max_ngram_size
-        self.input_ids_len = paddle.zeros(shape=[self.max_num_seqs, 1], dtype="int64").cpu()
-        self.input_ids_len_gpu = paddle.zeros(shape=[self.max_num_seqs, 1], dtype="int64").cuda()
-
-    def update(self, bid: int, seq_len: int):
-        """
-        update
-        """
-        self.input_ids_len[bid] = seq_len
-        self.input_ids_len_gpu[bid] = seq_len
 
     def _run_impl(self, share_inputs):
         """
         run
         """
         ngram_match(
-            share_inputs["input_ids_cpu"].cuda(),
-            self.input_ids_len_gpu,
             share_inputs["token_ids_all"],
             share_inputs["prompt_lens"],
             share_inputs["step_idx"],
 
@@ -2094,7 +2094,11 @@ def capture_model(self) -> None:
                     logger.info(
                         f"Warm up the model with the num_tokens:{num_tokens}, expected_decode_len:{expected_decode_len}"
                     )
-            elif self.speculative_decoding and self.spec_method in [SpecMethod.MTP, SpecMethod.SUFFIX]:
+            elif self.speculative_decoding and self.spec_method in [
+                SpecMethod.MTP,
+                SpecMethod.SUFFIX,
+                SpecMethod.NGRAM,
+            ]:
                 for capture_size in sorted(capture_sizes, reverse=True):
                     expected_decode_len = (self.speculative_config.num_speculative_tokens + 1) * 2
                     self._dummy_run(
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1964,6 +1964,7 @@ def __init__(` |
| `1964` | `1964` | `in [` |
| `1965` | `1965` | `SpecMethod.MTP,` |
| `1966` | `1966` | `SpecMethod.SUFFIX,` |
|  | `1967` | `+ SpecMethod.NGRAM,` |
| `1967` | `1968` | `]` |
| `1968` | `1969` | `)` |
| `1969` | `1970` | `else 0` |