PaddlePaddle · ckl117 · May 12, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/.flake8 b/.flake8
@@ -5,3 +5,4 @@ max-line-length = 119
 # E402: module level import not at top of file
 per-file-ignores =
     __init__.py:F401,F403,E402
+    fastdeploy/model_executor/layers/sample/ops/top_k_top_p_triton.py:E241,E121,E131,E266
diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
@@ -72,7 +72,7 @@ def _validate_split_kv_size(value: int) -> int:
     "FD_ATTENTION_BACKEND": lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
     # enable decode attention
     "USE_DECODE_UNIFIED_ATTENTION": lambda: bool(int(os.getenv("USE_DECODE_UNIFIED_ATTENTION", "0"))),
-    # Set sampling class. "base", "base_non_truncated", "air" and "rejection" can be set currently.
+    # Set sampling class. "base", "base_non_truncated", "air", "rejection" and "triton" can be set currently.
     "FD_SAMPLING_CLASS": lambda: os.getenv("FD_SAMPLING_CLASS", "base"),
     # Set moe backend."cutlass","marlin", "triton", "flashinfer-cutlass", "flashinfer-cutedsl" and "flashinfer-trtllm" can be set currently.
     "FD_MOE_BACKEND": lambda: os.getenv("FD_MOE_BACKEND", "cutlass"),
@@ -289,6 +289,8 @@ def _validate_split_kv_size(value: int) -> int:
     "FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
     # Whether to enable FP8 quantization with pow2scale.
     "FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))),
+    # Whether to enable the top_p=1.0 optimization (enabled by default; set to 0 to disable).
+    "FD_ENABLE_TOP_P_ONE_OPT": lambda: bool(int(os.getenv("FD_ENABLE_TOP_P_ONE_OPT", "1"))),
 }
 
 

diff --git a/fastdeploy/model_executor/layers/sample/meta_data.py b/fastdeploy/model_executor/layers/sample/meta_data.py
@@ -42,6 +42,7 @@ class SamplingMetadata:
     step_idx: paddle.Tensor
 
     top_p: paddle.Tensor
+    top_p_list: Optional[list] = None
     # only GPU used
     bad_words_token_len: Optional[paddle.Tensor] = None
     top_k: Optional[paddle.Tensor] = None

diff --git a/fastdeploy/model_executor/layers/sample/ops/__init__.py b/fastdeploy/model_executor/layers/sample/ops/__init__.py
@@ -23,7 +23,11 @@
     speculate_get_accept_tokens_and_logits,
     speculate_insert_first_token,
 )
-from .top_k_top_p_sampling import min_p_sampling, top_k_top_p_sampling
+from .top_k_top_p_sampling import (
+    dispatch_top_k_renorm_probs,
+    min_p_sampling,
+    top_k_top_p_sampling,
+)
 
 __all__ = [
     "apply_penalty_multi_scores",
@@ -33,4 +37,5 @@
     "min_p_sampling",
     "speculate_get_accept_tokens_and_logits",
     "speculate_insert_first_token",
+    "dispatch_top_k_renorm_probs",
 ]
diff --git a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
@@ -34,6 +34,20 @@ def _reset_cuda_generator_for_determinism():
     paddle.framework.core.default_cuda_generator(0).manual_seed(_DETERMINISTIC_RNG_SEED)
 
 
+def dispatch_top_k_renorm_probs(probs, top_k):  # -> top_k_renorm_probs(probs, top_k), or probs on ImportError
+    try:
+        if current_platform.is_iluvatar():
+            from fastdeploy.model_executor.ops.iluvatar import top_k_renorm_probs
+        else:
+            from fastdeploy.model_executor.ops.gpu import top_k_renorm_probs
+        probs = top_k_renorm_probs(probs, top_k)
+    # The op is imported at call time from the iluvatar or gpu ops module, chosen by the platform check above.
+    except ImportError:
+        logger.warning("top_k sampling is not supported on current platform, skipping top_k filtering.")
+    # After an ImportError `probs` is still the caller's input, so it is returned without top_k filtering.
+    return probs
+
+
 def top_k_top_p_sampling(
     x: paddle.Tensor,
     top_p: paddle.Tensor,
@@ -70,7 +84,6 @@ def top_k_top_p_sampling(
 
     """
     top_p_class = envs.FD_SAMPLING_CLASS.lower()
-    topp_seed_device = None
 
     # In deterministic mode, reset CUDA generator offset before sampling.
     # paddle.tensor.top_p_sampling uses the global GPU generator offset even
@@ -85,29 +98,17 @@ def top_k_top_p_sampling(
         _ = None
     else:
         if top_k_list and any(x > 0 for x in top_k_list):
-            try:
-                if current_platform.is_iluvatar():
-                    from fastdeploy.model_executor.ops.iluvatar import (
-                        top_k_renorm_probs,
-                    )
-                else:
-                    from fastdeploy.model_executor.ops.gpu import top_k_renorm_probs
-                x = top_k_renorm_probs(x, top_k)
-            except ImportError:
-                logger.warning("top_k sampling is not supported on current platform, skipping top_k filtering.")
+            x = dispatch_top_k_renorm_probs(x, top_k)
 
         if top_p_class == "air":
             _, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)
 
         elif top_p_class == "base_non_truncated":
-            if topp_seed is not None:
-                topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
-                topp_seed_device.copy_(topp_seed, False)
             _, ids = paddle.tensor.top_p_sampling(
                 x,
                 top_p,
                 threshold=threshold,
-                topp_seed=topp_seed_device,
+                topp_seed=topp_seed,
                 seed=seed,
                 k=k,
                 mode="non-truncated",
@@ -122,14 +123,11 @@ def top_k_top_p_sampling(
 
                 _, ids = native_top_p_sampling(x, top_p)
             else:
-                if topp_seed is not None:
-                    topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
-                    topp_seed_device.copy_(topp_seed, False)
                 _, ids = paddle.tensor.top_p_sampling(
                     x,
                     top_p,
                     threshold=threshold,
-                    topp_seed=topp_seed_device,
+                    topp_seed=topp_seed,
                     seed=seed,
                     k=k,
                     mode="truncated",