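Fix MTP Triton sampling seed: pass the topp_seed returned by build_sampling_params to _random_sample instead of sampling_metadata.seed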

ckl117 · ckl117 · commit b8b7f35ad91f · 2026-05-26T21:38:10.000+08:00
diff --git a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_triton.py b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_triton.py
@@ -28,6 +28,10 @@
 import paddle
 from paddle.utils.deprecated import VisibleDeprecationWarning
 
+from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
+    enable_compat_on_triton_kernel,
+)
+
 # Suppress the VisibleDeprecationWarning from use_triton_in_paddle that fires
 # on every Triton kernel launch (paddle.device.cuda.current_stream /
 # synchronize).  In serving hot-paths this produces thousands of log lines per
@@ -112,6 +116,7 @@ def _update_min_larger_stats(data, above_mask, min_larger, num_min_larger, senti
     return min_larger, num_min_larger
 
 
+@enable_compat_on_triton_kernel
 @triton.jit
 def _topk_topp_kernel(
     LOGITS,
@@ -936,6 +941,7 @@ def apply_top_k_top_p_triton(
     return logits
 
 
+@enable_compat_on_triton_kernel
 @triton.jit
 def _seeded_gumbel_kernel(
     OUT_ptr,
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -1097,6 +1097,7 @@ def _verify_and_sample(
         increment_value: int,
         accept_all_drafts: bool = False,
         reject_all_drafts: bool = False,
+        topp_seed: Optional[paddle.Tensor] = None,
     ) -> SamplerOutput:
         """
         Verify draft tokens against target model output and produce final samples.
@@ -1129,7 +1130,7 @@ def _verify_and_sample(
 
         if self.verify_strategy == VerifyStrategy.TARGET_MATCH:
             if FD_SAMPLING_CLASS.lower() == "triton":
-                target_tokens = _random_sample(probs, topp_seed=sampling_metadata.seed)
+                target_tokens = _random_sample(probs, topp_seed=topp_seed)
             else:
                 # Only TARGET_MATCH needs stochastic sampling
                 top_p, top_k, topp_seed = build_sampling_params(
@@ -1208,6 +1209,7 @@ def _normal_sample(
         probs: paddle.Tensor,
         sampling_metadata: SamplingMetadata,
         share_inputs: List[paddle.Tensor],
+        topp_seed: Optional[paddle.Tensor],
     ) -> SamplerOutput:
         """
         Normal sampling without draft token verification.
@@ -1230,7 +1232,7 @@ def _normal_sample(
 
         # Sample tokens
         if FD_SAMPLING_CLASS.lower() == "triton":
-            next_tokens = _random_sample(probs, topp_seed=sampling_metadata.seed)
+            next_tokens = _random_sample(probs, topp_seed=topp_seed)
         else:
             next_tokens = _sample_from_probs(
                 probs,
@@ -1333,9 +1335,10 @@ def forward_cuda(
             )
 
         logits_ori = None
+        topp_seed = None
         if FD_SAMPLING_CLASS.lower() == "triton":
             logits_ori = logits.clone()
-            top_p, top_k, _ = build_sampling_params(
+            top_p, top_k, topp_seed = build_sampling_params(
                 sampling_metadata.top_p,
                 sampling_metadata.top_k,
                 sampling_metadata.seed,
@@ -1356,7 +1359,7 @@ def forward_cuda(
         # Route based on spec_method
         is_naive = self.spec_method is None or self.spec_method == SpecMethod.NAIVE
         if is_naive:
-            sampler_output = self._normal_sample(logits, probs, sampling_metadata, share_inputs)
+            sampler_output = self._normal_sample(logits, probs, sampling_metadata, share_inputs, topp_seed=topp_seed)
         else:
             sampler_output = self._verify_and_sample(
                 logits,
@@ -1368,6 +1371,7 @@ def forward_cuda(
                 increment_value,
                 accept_all_drafts,
                 reject_all_drafts,
+                topp_seed=topp_seed,
             )
 
         keep_sampling_mask = sampling_metadata.keep_sampling_mask