Skip to content

Commit 0f4325c

Browse files
author
guanshihui
committed
[BugFix][Speculative Decoding] fix bug of speculate limit_thinking and stop_seqs
1 parent 98f3fc9 commit 0f4325c

File tree

3 files changed

+101
-71
lines changed

3 files changed

+101
-71
lines changed

custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length.cu

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -98,10 +98,7 @@ __global__ void speculate_limit_thinking_content_length_kernel(
9898
if (max_think_len > 0) {
9999
// A) 超长触发:到达 max_think_len 时开始注入(从本 token 起输出
100100
// inject_token_ids[0])
101-
if (status == 0 &&
102-
(current_step - 1) ==
103-
max_think_len) { // current_step - 1 是因为 speculate_verify 里
104-
// step_idx + 1 了
101+
if (status == 0 && current_step == max_think_len) {
105102
status = (inject_len > 0) ? 1 : done_status;
106103
}
107104
} else if (max_think_len == 0) {

custom_ops/gpu_ops/speculate_decoding/speculate_set_stop_value_multi_seqs.cu

Lines changed: 20 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ __global__ void spec_set_value_by_stop_seqs(bool *stop_flags,
2424
int *accept_nums,
2525
const int64_t *token_ids_all,
2626
const int64_t *prompt_lens,
27-
const int64_t *step_idx,
27+
int64_t *step_idx,
2828
const int64_t *stop_seqs,
2929
const int *stop_seqs_len,
3030
const int *seq_lens,
@@ -56,9 +56,10 @@ __global__ void spec_set_value_by_stop_seqs(bool *stop_flags,
5656
if (!stop_flags[bid]) {
5757
int accept_idx = 0;
5858
bool is_end = false;
59-
// 遍历起始位置
60-
for (; accept_idx <= accept_num - 1 && !is_end; accept_idx++) {
61-
if (step_idx_now + accept_idx + 1 < stop_seq_len) {
59+
// 遍历起始位置(不包含最后一个 accept token,由
60+
// unified_update_model_status 处理 EOS 检测)
61+
for (; accept_idx < accept_num - 1 && !is_end; accept_idx++) {
62+
if (step_idx_now - accept_num + accept_idx + 1 < stop_seq_len) {
6263
#ifdef DEBUG_SPEC_STOP_SEQS
6364
printf("num %d < stop_seq_len %d\n",
6465
step_idx_now - accept_num + accept_idx + 1,
@@ -71,7 +72,7 @@ __global__ void spec_set_value_by_stop_seqs(bool *stop_flags,
7172
int64_t cur_token_idx = -1;
7273

7374
// 通过当前值判断 token 是在 pre_ids 还是 accept_token 里
74-
if (stop_seq_len - 1 - i < accept_idx) {
75+
if (stop_seq_len - 1 - i <= accept_idx) {
7576
#ifdef DEBUG_SPEC_STOP_SEQS
7677
printf(
7778
"AcceptTokens bid:%d. tid:%d, accept_idx:%d, "
@@ -83,7 +84,7 @@ __global__ void spec_set_value_by_stop_seqs(bool *stop_flags,
8384
accept_idx - (stop_seq_len - 1 - i) - 1);
8485
#endif
8586
cur_token_idx =
86-
accept_tokens_now[accept_idx - (stop_seq_len - 1 - i) - 1];
87+
accept_tokens_now[accept_idx - (stop_seq_len - 1 - i)];
8788
} else {
8889
#ifdef DEBUG_SPEC_STOP_SEQS
8990
printf(
@@ -98,7 +99,7 @@ __global__ void spec_set_value_by_stop_seqs(bool *stop_flags,
9899
(stop_seq_len - 1 - i));
99100
#endif
100101
int pre_ids_idx =
101-
step_idx_now + accept_idx - (stop_seq_len - 1 - i);
102+
step_idx_now - accept_num + accept_idx - (stop_seq_len - 1 - i);
102103
// EC3
103104
// 特殊拼接会导致input_ids最后一位无特殊token,即pre_ids[0]可能为23,
104105
// 导致异常结束
@@ -129,8 +130,17 @@ __global__ void spec_set_value_by_stop_seqs(bool *stop_flags,
129130
printf("bid:%d end with accept_idx %d", bid, accept_idx);
130131
#endif
131132

132-
accept_nums[bid] = accept_idx;
133-
accept_tokens_now[accept_idx - 1] = end_ids[0];
133+
// accept_idx
134+
// 已自增1,stop_seq的最后一个token在accept_tokens[accept_idx-1]
135+
// 回退逻辑:丢弃stop token之后的多余token,保留stop token并追加eos
136+
// 对齐非MTP行为: ...<|im_end|> <eos>
137+
int keep_count = accept_idx + 1;
138+
int discarded = accept_num - keep_count;
139+
if (discarded > 0) {
140+
step_idx[bid] -= discarded;
141+
}
142+
accept_nums[bid] = keep_count;
143+
accept_tokens_now[accept_idx] = end_ids[0];
134144
// stop_flags[bid] = true;
135145
}
136146
}
@@ -167,7 +177,7 @@ void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens,
167177
const_cast<int *>(accept_num.data<int>()),
168178
token_ids_all.data<int64_t>(),
169179
prompt_lens.data<int64_t>(),
170-
step_idx.data<int64_t>(),
180+
const_cast<int64_t *>(step_idx.data<int64_t>()),
171181
stop_seqs.data<int64_t>(),
172182
stop_seqs_len.data<int>(),
173183
seq_lens.data<int>(),

tests/operators/test_speculate_set_stop_value_multi_seqs.py

Lines changed: 80 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ def run_kernel(paddle_inputs, inputs):
6161

6262
def get_outputs(paddle_inputs) -> Dict[str, np.ndarray]:
6363
"""Extract all in-place-modified tensors back to numpy."""
64-
keys = ["accept_tokens", "accept_num"]
64+
keys = ["accept_tokens", "accept_num", "step_idx"]
6565
return {k: paddle_inputs[k].numpy() for k in keys}
6666

6767

@@ -100,7 +100,9 @@ def gen_inputs(
100100
accept_tokens[i, : accept_num[i]] = rng.integers(1, vocab_size, size=accept_num[i])
101101

102102
stop_flags = np.zeros(real_bsz, dtype="bool")
103-
seq_lens = (step_idx + accept_num).astype("int32")
103+
# New semantics: step_idx already includes accept_num,
104+
# so seq_lens = step_idx (not step_idx + accept_num)
105+
seq_lens = step_idx.astype("int32")
104106

105107
# stop_seqs: [bsz, stop_seqs_bs, stop_seqs_max_len]
106108
stop_seqs = rng.integers(1, vocab_size, size=(real_bsz, stop_seqs_bs, stop_seqs_max_len)).astype("int64")
@@ -140,10 +142,10 @@ def reference_spec_set_stop_value_multi_seqs(inputs: Dict[str, Any]) -> Dict[str
140142
"""Python reference — must match CUDA kernel logic exactly."""
141143
accept_tokens = inputs["accept_tokens"].copy()
142144
accept_num = inputs["accept_num"].copy()
145+
step_idx = inputs["step_idx"].copy()
143146
stop_flags = inputs["stop_flags"].copy()
144147
token_ids_all = inputs["token_ids_all"]
145148
prompt_lens = inputs["prompt_lens"]
146-
step_idx = inputs["step_idx"]
147149
stop_seqs = inputs["stop_seqs"]
148150
stop_seqs_len = inputs["stop_seqs_len"]
149151
end_ids = inputs["end_ids"]
@@ -174,18 +176,22 @@ def reference_spec_set_stop_value_multi_seqs(inputs: Dict[str, Any]) -> Dict[str
174176

175177
accept_idx = 0
176178
is_end = False
177-
while accept_idx <= an - 1 and not is_end:
178-
if step_idx_now + accept_idx + 1 < stop_seq_len:
179+
# Loop excludes last accept token (accept_idx < an - 1)
180+
while accept_idx < an - 1 and not is_end:
181+
if step_idx_now - an + accept_idx + 1 < stop_seq_len:
179182
accept_idx += 1
180183
continue
181184

182185
# Check one stop_seq match
183186
for i in range(stop_seq_len - 1, -1, -1):
184187
cur_token_idx = -1
185-
if stop_seq_len - 1 - i < accept_idx:
186-
cur_token_idx = accept_tokens_now[accept_idx - (stop_seq_len - 1 - i) - 1]
188+
# Token boundary: <= (not <)
189+
if stop_seq_len - 1 - i <= accept_idx:
190+
# Accept token index: no -1 offset
191+
cur_token_idx = accept_tokens_now[accept_idx - (stop_seq_len - 1 - i)]
187192
else:
188-
pre_ids_idx = step_idx_now + accept_idx - (stop_seq_len - 1 - i)
193+
# Pre_ids index: step_idx_now - an + accept_idx
194+
pre_ids_idx = step_idx_now - an + accept_idx - (stop_seq_len - 1 - i)
189195
if pre_ids_idx <= 0:
190196
break
191197
cur_token_idx = pre_ids_now[pre_ids_idx]
@@ -199,13 +205,19 @@ def reference_spec_set_stop_value_multi_seqs(inputs: Dict[str, Any]) -> Dict[str
199205
accept_idx += 1
200206

201207
if is_end:
202-
accept_num[bid] = accept_idx
203-
accept_tokens[bid, accept_idx - 1] = end_ids[0]
204-
# stop_flags[bid] = True # kernel no longer sets stop_flags
208+
# accept_idx already incremented by 1
209+
# keep stop token + append end_id, rollback step_idx
210+
keep_count = accept_idx + 1
211+
discarded = an - keep_count
212+
if discarded > 0:
213+
step_idx[bid] -= discarded
214+
accept_num[bid] = keep_count
215+
accept_tokens[bid, accept_idx] = end_ids[0]
205216

206217
return {
207218
"accept_tokens": accept_tokens,
208219
"accept_num": accept_num,
220+
"step_idx": step_idx,
209221
}
210222

211223

@@ -245,7 +257,7 @@ def _run_and_get(self, inputs):
245257
def _check_all_outputs(self, inputs, outputs):
246258
"""Compare ALL output tensors against reference."""
247259
ref = reference_spec_set_stop_value_multi_seqs(inputs)
248-
for key in ["accept_tokens", "accept_num"]:
260+
for key in ["accept_tokens", "accept_num", "step_idx"]:
249261
np.testing.assert_array_equal(outputs[key], ref[key], err_msg=f"{key} mismatch")
250262

251263
def _run_full_test(self, config):
@@ -266,16 +278,20 @@ def test_configs(self):
266278
def test_match_in_accept_tokens_only(self):
267279
"""Stop seq found entirely within accept_tokens."""
268280
inputs = gen_inputs(real_bsz=1, accept_tokens_len=5, stop_seqs_bs=1, stop_seqs_max_len=3, seed=10)
269-
# Place stop seq [A, B, C] at accept_tokens positions [0,1,2]
281+
# Place stop seq [10, 20, 30] matching at accept_idx=2
282+
# New semantics: step_idx already includes accept_num
270283
inputs["accept_num"][:] = 4
271284
inputs["accept_tokens"][0, :4] = [10, 20, 30, 40]
272285
inputs["stop_seqs"][0, 0, :3] = [10, 20, 30]
273286
inputs["stop_seqs_len"][0, 0] = 3
274-
inputs["step_idx"][:] = 10
287+
inputs["step_idx"][:] = 14 # old_step=10 + accept_num=4
275288
inputs["stop_flags"][:] = False
276289
inputs["min_tokens"][:] = 0
277290
outputs = self._run_and_get(inputs)
278291
self._check_all_outputs(inputs, outputs)
292+
# Match at accept_idx=2: keep_count=2+1+1=4, accept_tokens[3]=end_id
293+
self.assertEqual(outputs["accept_num"][0], 4)
294+
self.assertEqual(outputs["accept_tokens"][0, 3], -1)
279295

280296
def test_match_spanning_pre_ids_and_accept(self):
281297
"""Stop seq spans token_ids_all (pre_ids) and accept_tokens."""
@@ -288,27 +304,32 @@ def test_match_spanning_pre_ids_and_accept(self):
288304
seed=20,
289305
)
290306
inputs["prompt_lens"][:] = 0
291-
inputs["step_idx"][:] = 6
307+
# New semantics: step_idx = old_step(6) + accept_num(3) = 9
308+
inputs["step_idx"][:] = 9
292309
inputs["accept_num"][:] = 3
293-
# Kernel matching at accept_idx=2 (3rd token, 0-indexed):
294-
# i=2(last): stop_seq_len-1-i=0 < accept_idx(2) -> accept_tokens[2-0-1]=accept_tokens[1]
295-
# i=1: stop_seq_len-1-i=1 < accept_idx(2) -> accept_tokens[2-1-1]=accept_tokens[0]
296-
# i=0: stop_seq_len-1-i=2 >= accept_idx(2) -> pre_ids[step_idx+2-(3-1-0)]=pre_ids[6]
297-
# So stop_seq should be [pre_ids[6], accept_tokens[0], accept_tokens[1]]
298-
inputs["token_ids_all"][0, 6] = 99
310+
# New kernel matching at accept_idx=1 (for loop: 0..accept_num-2=1):
311+
# i=2(last): j=0, 0<=1 -> accept_tokens[1-0]=accept_tokens[1]=22
312+
# i=1: j=1, 1<=1 -> accept_tokens[1-1]=accept_tokens[0]=11
313+
# i=0: j=2, 2<=1 false -> pre_ids[9-3+1-2]=pre_ids[5]
314+
# So stop_seq should be [pre_ids[5], accept_tokens[0], accept_tokens[1]]
315+
inputs["token_ids_all"][0, 5] = 99
299316
inputs["accept_tokens"][0, :3] = [11, 22, 33]
300317
inputs["stop_seqs"][0, 0, :3] = [99, 11, 22]
301318
inputs["stop_seqs_len"][0, 0] = 3
302319
inputs["stop_flags"][:] = False
303320
inputs["min_tokens"][:] = 0
304321
outputs = self._run_and_get(inputs)
305322
self._check_all_outputs(inputs, outputs)
306-
# Match at accept_idx=2, loop increments to 3
323+
# Match at accept_idx=1, loop increments to 2, keep_count=3
307324
self.assertEqual(outputs["accept_num"][0], 3)
308325
self.assertEqual(outputs["accept_tokens"][0, 2], -1)
309326

310-
def test_match_in_pre_ids_only(self):
311-
"""Stop seq found entirely within token_ids_all (pre_ids), matching at accept_idx=0."""
327+
def test_match_mostly_in_pre_ids(self):
328+
"""Stop seq mostly in pre_ids, last token in accept_tokens[0], matching at accept_idx=0.
329+
330+
New kernel: at accept_idx=0, stop_seq[last] always comes from accept_tokens[0],
331+
remaining tokens come from pre_ids.
332+
"""
312333
inputs = gen_inputs(
313334
real_bsz=1,
314335
accept_tokens_len=5,
@@ -318,25 +339,27 @@ def test_match_in_pre_ids_only(self):
318339
seed=30,
319340
)
320341
inputs["prompt_lens"][:] = 0
321-
inputs["step_idx"][:] = 8
342+
# New semantics: step_idx = old_step(8) + accept_num(3) = 11
343+
inputs["step_idx"][:] = 11
322344
inputs["accept_num"][:] = 3
323-
# pre_ids at step_idx positions: token_ids_all[0, 6]=50, [0,7]=60, [0,8]=70
324-
# stop_seq = [50, 60, 70], all 3 tokens are in pre_ids
325-
# For accept_idx=0: step_idx_now + 0 + 1 = 9 >= stop_seq_len=3, so we check
326-
# i=2: pre_ids_idx = 8+0-(3-1-2) = 8 -> pre_ids_now[8] = 70
327-
# i=1: pre_ids_idx = 8+0-(3-1-1) = 7 -> pre_ids_now[7] = 60
328-
# i=0: pre_ids_idx = 8+0-(3-1-0) = 6 -> pre_ids_now[6] = 50
345+
# At accept_idx=0 with stop_seq_len=3:
346+
# i=2: j=0, 0<=0 -> accept_tokens[0]=70
347+
# i=1: j=1, 1<=0 false -> pre_ids[11-3+0-1]=pre_ids[7]=60
348+
# i=0: j=2, 2<=0 false -> pre_ids[11-3+0-2]=pre_ids[6]=50
329349
inputs["token_ids_all"][0, 6] = 50
330350
inputs["token_ids_all"][0, 7] = 60
331-
inputs["token_ids_all"][0, 8] = 70
332-
inputs["accept_tokens"][0, :3] = [1, 2, 3]
351+
inputs["accept_tokens"][0, :3] = [70, 2, 3]
333352
inputs["stop_seqs"][0, 0, :3] = [50, 60, 70]
334353
inputs["stop_seqs_len"][0, 0] = 3
335354
inputs["stop_flags"][:] = False
336355
inputs["min_tokens"][:] = 0
337356
outputs = self._run_and_get(inputs)
338357
self._check_all_outputs(inputs, outputs)
339-
self.assertEqual(outputs["accept_num"][0], 1)
358+
# Match at accept_idx=0, loop increments to 1, keep_count=2
359+
# discarded = 3 - 2 = 1, step_idx rolled back by 1 (11->10)
360+
self.assertEqual(outputs["accept_num"][0], 2)
361+
self.assertEqual(outputs["accept_tokens"][0, 1], -1)
362+
self.assertEqual(outputs["step_idx"][0], 10)
340363

341364
def test_already_stopped(self):
342365
"""Kernel skips sequences with stop_flags=True."""
@@ -346,9 +369,10 @@ def test_already_stopped(self):
346369
inputs["stop_seqs_len"][:] = 2
347370
outputs = self._run_and_get(inputs)
348371
self._check_all_outputs(inputs, outputs)
349-
# accept_tokens and accept_num should be unchanged
372+
# accept_tokens, accept_num and step_idx should be unchanged
350373
np.testing.assert_array_equal(outputs["accept_tokens"], inputs["accept_tokens"])
351374
np.testing.assert_array_equal(outputs["accept_num"], inputs["accept_num"])
375+
np.testing.assert_array_equal(outputs["step_idx"], inputs["step_idx"])
352376

353377
def test_min_tokens_blocks_stop(self):
354378
"""Kernel skips stop check when step_idx < min_tokens."""
@@ -361,17 +385,17 @@ def test_min_tokens_blocks_stop(self):
361385
seed=50,
362386
)
363387
inputs["prompt_lens"][:] = 0
364-
inputs["step_idx"][:] = 8
388+
# New semantics: step_idx = old_step(8) + accept_num(3) = 11
389+
inputs["step_idx"][:] = 11
365390
inputs["accept_num"][:] = 3
366-
# Same setup that would match (like test_match_in_pre_ids_only)
391+
# Same setup that would match (like test_match_mostly_in_pre_ids)
367392
inputs["token_ids_all"][0, 6] = 50
368393
inputs["token_ids_all"][0, 7] = 60
369-
inputs["token_ids_all"][0, 8] = 70
370-
inputs["accept_tokens"][0, :3] = [1, 2, 3]
394+
inputs["accept_tokens"][0, :3] = [70, 2, 3]
371395
inputs["stop_seqs"][0, 0, :3] = [50, 60, 70]
372396
inputs["stop_seqs_len"][0, 0] = 3
373397
inputs["stop_flags"][:] = False
374-
inputs["min_tokens"][:] = 100 # step_idx=8 < 100, should NOT stop
398+
inputs["min_tokens"][:] = 100 # step_idx=11 < 100, should NOT stop
375399
outputs = self._run_and_get(inputs)
376400
self._check_all_outputs(inputs, outputs)
377401

@@ -386,17 +410,17 @@ def test_min_tokens_allows_stop(self):
386410
seed=60,
387411
)
388412
inputs["prompt_lens"][:] = 0
389-
inputs["step_idx"][:] = 8
413+
# New semantics: step_idx = old_step(8) + accept_num(3) = 11
414+
inputs["step_idx"][:] = 11
390415
inputs["accept_num"][:] = 3
391-
# Put stop_seq entirely in pre_ids (same pattern as test_match_in_pre_ids_only)
416+
# Same setup as test_match_mostly_in_pre_ids
392417
inputs["token_ids_all"][0, 6] = 50
393418
inputs["token_ids_all"][0, 7] = 60
394-
inputs["token_ids_all"][0, 8] = 70
395-
inputs["accept_tokens"][0, :3] = [1, 2, 3]
419+
inputs["accept_tokens"][0, :3] = [70, 2, 3]
396420
inputs["stop_seqs"][0, 0, :3] = [50, 60, 70]
397421
inputs["stop_seqs_len"][0, 0] = 3
398422
inputs["stop_flags"][:] = False
399-
inputs["min_tokens"][:] = 5 # step_idx=8 >= 5, should stop
423+
inputs["min_tokens"][:] = 5 # step_idx=11 >= 5, should stop
400424
outputs = self._run_and_get(inputs)
401425
self._check_all_outputs(inputs, outputs)
402426

@@ -411,11 +435,12 @@ def test_multiple_stop_seqs_second_matches(self):
411435
seed=70,
412436
)
413437
inputs["prompt_lens"][:] = 0
414-
inputs["step_idx"][:] = 8
438+
# New semantics: step_idx = old_step(8) + accept_num(3) = 11
439+
inputs["step_idx"][:] = 11
415440
inputs["accept_num"][:] = 3
416-
# accept_tokens: stop_seq[20,30] matches at accept_idx=2:
417-
# i=1: accept_tokens[2-0-1]=accept_tokens[1]=30 vs stop_seq[1]=30 OK
418-
# i=0: accept_tokens[2-1-1]=accept_tokens[0]=20 vs stop_seq[0]=20 OK
441+
# accept_tokens: stop_seq[20,30] matches at accept_idx=1:
442+
# i=1(last): j=0, 0<=1 -> accept_tokens[1-0]=accept_tokens[1]=30
443+
# i=0: j=1, 1<=1 -> accept_tokens[1-1]=accept_tokens[0]=20
419444
inputs["accept_tokens"][0, :3] = [20, 30, 40]
420445
# First stop seq doesn't match
421446
inputs["stop_seqs"][0, 0, :3] = [99, 98, 97]
@@ -440,17 +465,15 @@ def test_nonzero_prompt_lens(self):
440465
)
441466
prompt_len = 10
442467
inputs["prompt_lens"][:] = prompt_len
443-
inputs["step_idx"][:] = 5
468+
# New semantics: step_idx = old_step(5) + accept_num(2) = 7
469+
inputs["step_idx"][:] = 7
444470
inputs["accept_num"][:] = 2
445471
inputs["accept_tokens"][0, :2] = [55, 66]
446472
# pre_ids_now starts at token_ids_all[0, prompt_len:]
447-
# stop_seq = [X, 55] where X = token_ids_all[0, prompt_len + step_idx]
448-
# For accept_idx=0: pre_ids_idx = step_idx + 0 - (2-1-0) = 5-1 = 4
449-
# -> pre_ids_now[4] = token_ids_all[0, prompt_len + 4]
450-
# For accept_idx=1 (second token is accept_tokens[0,0]=55):
451-
# i=1: accept_tokens_now[1-(2-1-1)-1] = accept_tokens_now[0] = 55
452-
# i=0: pre_ids_idx = step_idx + 1 - (2-1-0) = 5+1-1 = 5 -> pre_ids_now[5]
453-
target_val = int(inputs["token_ids_all"][0, prompt_len + 5])
473+
# At accept_idx=0 with stop_seq_len=2:
474+
# i=1(last): j=0, 0<=0 -> accept_tokens[0]=55
475+
# i=0: j=1, 1<=0 false -> pre_ids[7-2+0-1]=pre_ids[4]=token_ids_all[0, prompt_len+4]
476+
target_val = int(inputs["token_ids_all"][0, prompt_len + 4])
454477
inputs["stop_seqs"][0, 0, :2] = [target_val, 55]
455478
inputs["stop_seqs_len"][0, 0] = 2
456479
inputs["stop_flags"][:] = False

0 commit comments

Comments
 (0)