Skip to content

Commit 1e7ee22

Browse files
ckl117zeroRainsEmmonsCurse
authored
[Cherry-Pick] [Optimization] TopP=1.0 using _random_sample (#7892) and Triton SamplerBackend (#7639) (#7910)
* [CP][Feature] support new sampler backend with triton (#7639) * [Optimization] TopP=1.0 using _random_sample (#7892) * code check * add env FD_ENABLE_TOP_P_ONE_OPT to control the top_p=1 optimization * default FD_ENABLE_TOP_P_ONE_OPT=0 * change default to FD_ENABLE_TOP_P_ONE_OPT=1 * fix mtp triton seed * change triton seed to int64 * fix triton sampler * add seed for mtp triton sampler --------- Co-authored-by: Zero Rains <linjunlu@zerorains.top> Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
1 parent 2b0fd53 commit 1e7ee22

11 files changed

Lines changed: 1652 additions & 56 deletions

File tree

.flake8

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ max-line-length = 119
55
# E402: module level import not at top of file
66
per-file-ignores =
77
__init__.py:F401,F403,E402
8+
fastdeploy/model_executor/layers/sample/ops/top_k_top_p_triton.py:E241,E121,E131,E266

fastdeploy/envs.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def _validate_split_kv_size(value: int) -> int:
7272
"FD_ATTENTION_BACKEND": lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
7373
# enable decode attention
7474
"USE_DECODE_UNIFIED_ATTENTION": lambda: bool(int(os.getenv("USE_DECODE_UNIFIED_ATTENTION", "0"))),
75-
# Set sampling class. "base", "base_non_truncated", "air" and "rejection" can be set currently.
75+
# Set sampling class. "base", "base_non_truncated", "air", "rejection" and "triton" can be set currently.
7676
"FD_SAMPLING_CLASS": lambda: os.getenv("FD_SAMPLING_CLASS", "base"),
7777
# Set moe backend. "cutlass", "marlin", "triton", "flashinfer-cutlass", "flashinfer-cutedsl" and "flashinfer-trtllm" can be set currently.
7878
"FD_MOE_BACKEND": lambda: os.getenv("FD_MOE_BACKEND", "cutlass"),
@@ -293,6 +293,8 @@ def _validate_split_kv_size(value: int) -> int:
293293
"FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
294294
# Whether to enable FP8 quantization with pow2scale.
295295
"FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))),
296+
# Whether to enable top_p=1.0 optimization.
297+
"FD_ENABLE_TOP_P_ONE_OPT": lambda: bool(int(os.getenv("FD_ENABLE_TOP_P_ONE_OPT", "1"))),
296298
}
297299

298300

fastdeploy/model_executor/layers/sample/meta_data.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class SamplingMetadata:
4242
step_idx: paddle.Tensor
4343

4444
top_p: paddle.Tensor
45+
top_p_list: Optional[list] = None
4546
# only GPU used
4647
bad_words_token_len: Optional[paddle.Tensor] = None
4748
top_k: Optional[paddle.Tensor] = None

fastdeploy/model_executor/layers/sample/ops/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,11 @@
2323
speculate_get_accept_tokens_and_logits,
2424
speculate_insert_first_token,
2525
)
26-
from .top_k_top_p_sampling import min_p_sampling, top_k_top_p_sampling
26+
from .top_k_top_p_sampling import (
27+
dispatch_top_k_renorm_probs,
28+
min_p_sampling,
29+
top_k_top_p_sampling,
30+
)
2731

2832
__all__ = [
2933
"apply_penalty_multi_scores",
@@ -33,4 +37,5 @@
3337
"min_p_sampling",
3438
"speculate_get_accept_tokens_and_logits",
3539
"speculate_insert_first_token",
40+
"dispatch_top_k_renorm_probs",
3641
]

fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,20 @@ def _reset_cuda_generator_for_determinism():
3434
paddle.framework.core.default_cuda_generator(0).manual_seed(_DETERMINISTIC_RNG_SEED)
3535

3636

37+
def dispatch_top_k_renorm_probs(probs, top_k):
    """Apply the platform-specific ``top_k_renorm_probs`` op to ``probs``.

    The op is imported lazily from the Iluvatar ops package when
    ``current_platform.is_iluvatar()`` is true, and from the GPU ops package
    otherwise.

    Args:
        probs: Probabilities forwarded to ``top_k_renorm_probs`` as its first
            argument.
        top_k: Top-k setting forwarded to ``top_k_renorm_probs`` as its second
            argument.

    Returns:
        The result of ``top_k_renorm_probs(probs, top_k)``. If an
        ``ImportError`` is raised while importing or running the op, a warning
        is logged and ``probs`` is returned unchanged.
    """
    try:
        # Import inside the function so that platforms lacking the compiled
        # op only fail here, where the failure is downgraded to a warning.
        if not current_platform.is_iluvatar():
            from fastdeploy.model_executor.ops.gpu import top_k_renorm_probs as renorm_op
        else:
            from fastdeploy.model_executor.ops.iluvatar import top_k_renorm_probs as renorm_op
        return renorm_op(probs, top_k)
    except ImportError:
        logger.warning("top_k sampling is not supported on current platform, skipping top_k filtering.")
        return probs
49+
50+
3751
def top_k_top_p_sampling(
3852
x: paddle.Tensor,
3953
top_p: paddle.Tensor,
@@ -70,7 +84,6 @@ def top_k_top_p_sampling(
7084
7185
"""
7286
top_p_class = envs.FD_SAMPLING_CLASS.lower()
73-
topp_seed_device = None
7487

7588
# In deterministic mode, reset CUDA generator offset before sampling.
7689
# paddle.tensor.top_p_sampling uses the global GPU generator offset even
@@ -85,29 +98,17 @@ def top_k_top_p_sampling(
8598
_ = None
8699
else:
87100
if top_k_list and any(x > 0 for x in top_k_list):
88-
try:
89-
if current_platform.is_iluvatar():
90-
from fastdeploy.model_executor.ops.iluvatar import (
91-
top_k_renorm_probs,
92-
)
93-
else:
94-
from fastdeploy.model_executor.ops.gpu import top_k_renorm_probs
95-
x = top_k_renorm_probs(x, top_k)
96-
except ImportError:
97-
logger.warning("top_k sampling is not supported on current platform, skipping top_k filtering.")
101+
x = dispatch_top_k_renorm_probs(x, top_k)
98102

99103
if top_p_class == "air":
100104
_, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)
101105

102106
elif top_p_class == "base_non_truncated":
103-
if topp_seed is not None:
104-
topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
105-
topp_seed_device.copy_(topp_seed, False)
106107
_, ids = paddle.tensor.top_p_sampling(
107108
x,
108109
top_p,
109110
threshold=threshold,
110-
topp_seed=topp_seed_device,
111+
topp_seed=topp_seed,
111112
seed=seed,
112113
k=k,
113114
mode="non-truncated",
@@ -122,14 +123,11 @@ def top_k_top_p_sampling(
122123

123124
_, ids = native_top_p_sampling(x, top_p)
124125
else:
125-
if topp_seed is not None:
126-
topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
127-
topp_seed_device.copy_(topp_seed, False)
128126
_, ids = paddle.tensor.top_p_sampling(
129127
x,
130128
top_p,
131129
threshold=threshold,
132-
topp_seed=topp_seed_device,
130+
topp_seed=topp_seed,
133131
seed=seed,
134132
k=k,
135133
mode="truncated",

0 commit comments

Comments
 (0)