lonelygsh
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length.cu
Lines changed: 5 additions & 12 deletions
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_set_stop_value_multi_seqs.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_set_stop_value_multi_seqs.cu
Lines changed: 7 additions & 7 deletions
diff --git a/docs/get_started/installation/kunlunxin_xpu.md b/docs/get_started/installation/kunlunxin_xpu.md
Lines changed: 5 additions & 5 deletions
@@ -34,7 +34,7 @@ __global__ void speculate_limit_thinking_content_length_kernel(
     int64_t* next_tokens,          // [bs, tokens_per_step]
     const int* max_think_lens,     // [bs]
     int* max_reply_lens,           // [bs]
-    int64_t* step_idx,             // [bs]
+    const int64_t* step_idx,       // [bs]
     const int64_t* eos_token_ids,  // [eos_len]
     int* limit_status,             // [bs]
     int* accept_num,               // [bs]
@@ -68,7 +68,7 @@ __global__ void speculate_limit_thinking_content_length_kernel(
   int new_accept_num = original_accept_num;
 
   // 本 step 的 token offset 对应的绝对 step
-  const int64_t current_base_step = step_idx[bid] - original_accept_num + 1;
+  const int64_t current_base_step = step_idx[bid] + 1;
 
   for (int token_offset = 0; token_offset < original_accept_num;
        token_offset++) {
@@ -100,8 +100,8 @@ __global__ void speculate_limit_thinking_content_length_kernel(
         // inject_token_ids[0]）
         if (status == 0 &&
             (current_step - 1) ==
-                max_think_len) {  // current_step - 1 是因为 speculate_verify 里
-                                  // step_idx + 1 了
+                max_think_len) {  // current_step - 1 : 已输出 current_step-1
+                                  // 个thinking token
           status = (inject_len > 0) ? 1 : done_status;
         }
       } else if (max_think_len == 0) {
@@ -181,13 +181,6 @@ __global__ void speculate_limit_thinking_content_length_kernel(
     }
   }
 
-  // 更新 step_idx / accept_num（被截断的 token 需要回退
-  // step_idx）
-  const int discarded_tokens = original_accept_num - new_accept_num;
-  if (discarded_tokens > 0) {
-    step_idx[bid] -= discarded_tokens;
-  }
-
   accept_num[bid] = new_accept_num;
   limit_status[bid] = status;
   max_reply_lens[bid] = max_reply_len;
@@ -221,7 +214,7 @@ void SpeculateLimitThinkingContentLength(
       const_cast<int64_t*>(next_tokens.data<int64_t>()),
       max_think_lens.data<int>(),
       const_cast<int*>(max_reply_lens.data<int>()),
-      const_cast<int64_t*>(step_idx.data<int64_t>()),
+      step_idx.data<int64_t>(),
       eos_token_ids.data<int64_t>(),
       const_cast<int*>(limit_status.data<int>()),
       const_cast<int*>(accept_num.data<int>()),
 
@@ -51,17 +51,18 @@ __global__ void spec_set_value_by_stop_seqs(bool *stop_flags,
     const int64_t step_idx_now = step_idx[bid];
     const int64_t min_token_limit = min_tokens[bid];
 
-    const bool can_stop = (step_idx_now >= min_token_limit);
+    const bool can_stop = (step_idx_now + accept_num >= min_token_limit);
     if (!can_stop) return;
     if (!stop_flags[bid]) {
       int accept_idx = 0;
       bool is_end = false;
+
       // 遍历起始位置
       for (; accept_idx <= accept_num - 1 && !is_end; accept_idx++) {
         if (step_idx_now + accept_idx + 1 < stop_seq_len) {
 #ifdef DEBUG_SPEC_STOP_SEQS
           printf("num %d < stop_seq_len %d\n",
-                 step_idx_now - accept_num + accept_idx + 1,
+                 step_idx_now + accept_idx + 1,
                  stop_seq_len);
 #endif
           continue;
@@ -71,7 +72,7 @@ __global__ void spec_set_value_by_stop_seqs(bool *stop_flags,
           int64_t cur_token_idx = -1;
 
           // 通过当前值判断 token 是在 pre_ids 还是 accept_token 里
-          if (stop_seq_len - 1 - i < accept_idx) {
+          if (stop_seq_len - 1 - i <= accept_idx) {
 #ifdef DEBUG_SPEC_STOP_SEQS
             printf(
                 "AcceptTokens bid:%d. tid:%d, accept_idx:%d, "
@@ -80,10 +81,10 @@ __global__ void spec_set_value_by_stop_seqs(bool *stop_flags,
                 bid,
                 tid,
                 accept_idx,
-                accept_idx - (stop_seq_len - 1 - i) - 1);
+                accept_idx - (stop_seq_len - 1 - i));
 #endif
             cur_token_idx =
-                accept_tokens_now[accept_idx - (stop_seq_len - 1 - i) - 1];
+                accept_tokens_now[accept_idx - (stop_seq_len - 1 - i)];
           } else {
 #ifdef DEBUG_SPEC_STOP_SEQS
             printf(
@@ -94,8 +95,7 @@ __global__ void spec_set_value_by_stop_seqs(bool *stop_flags,
                 tid,
                 step_idx_now,
                 accept_idx,
-                step_idx_now - accept_num + accept_idx -
-                    (stop_seq_len - 1 - i));
+                step_idx_now + accept_idx - (stop_seq_len - 1 - i));
 #endif
             int pre_ids_idx =
                 step_idx_now + accept_idx - (stop_seq_len - 1 - i);
 
@@ -28,9 +28,9 @@ Verified platform:
 ```bash
 mkdir Work
 cd Work
-docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.4.0
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.5.0
 docker run --name fastdeploy-xpu --net=host -itd --privileged -v $PWD:/Work -w /Work \
-    ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.4.0 \
+    ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-xpu:2.5.0 \
     /bin/bash
 docker exec -it fastdeploy-xpu /bin/bash
 ```
@@ -40,7 +40,7 @@ docker exec -it fastdeploy-xpu /bin/bash
 ### Install PaddlePaddle
 
 ```bash
-python -m pip install paddlepaddle-xpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
+python -m pip install paddlepaddle-xpu==3.3.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
 ```
 
 Alternatively, you can install the latest version of PaddlePaddle (Not recommended)
@@ -52,7 +52,7 @@ python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/
 ### Install FastDeploy (**Do NOT install via PyPI source**)
 
 ```bash
-python -m pip install fastdeploy-xpu==2.4.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+python -m pip install fastdeploy-xpu==2.5.0 -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-xpu-p800/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
 ```
 
 Alternatively, you can install the latest version of FastDeploy (Not recommended)
@@ -66,7 +66,7 @@ python -m pip install --pre fastdeploy-xpu -i https://www.paddlepaddle.org.cn/pa
 ### Install PaddlePaddle
 
 ```bash
-python -m pip install paddlepaddle-xpu==3.3.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
+python -m pip install paddlepaddle-xpu==3.3.1 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
 ```
 
 Alternatively, you can install the latest version of PaddlePaddle (Not recommended)