Skip to content
Open
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ max-line-length = 119
# E402: module level import not at top of file
per-file-ignores =
__init__.py:F401,F403,E402
fastdeploy/model_executor/layers/sample/ops/top_k_top_p_triton.py:E241,E121,E131,E266
4 changes: 3 additions & 1 deletion fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def _validate_split_kv_size(value: int) -> int:
"FD_ATTENTION_BACKEND": lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
# enable decode attention
"USE_DECODE_UNIFIED_ATTENTION": lambda: bool(int(os.getenv("USE_DECODE_UNIFIED_ATTENTION", "0"))),
# Set sampling class. "base", "base_non_truncated", "air" and "rejection" can be set currently.
# Set sampling class. "base", "base_non_truncated", "air", "rejection" and "triton" can be set currently.
"FD_SAMPLING_CLASS": lambda: os.getenv("FD_SAMPLING_CLASS", "base"),
# Set moe backend. "cutlass", "marlin", "triton", "flashinfer-cutlass", "flashinfer-cutedsl" and "flashinfer-trtllm" can be set currently.
"FD_MOE_BACKEND": lambda: os.getenv("FD_MOE_BACKEND", "cutlass"),
Expand Down Expand Up @@ -289,6 +289,8 @@ def _validate_split_kv_size(value: int) -> int:
"FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
# Whether to enable FP8 quantization with pow2scale.
"FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))),
# Whether to enable top_p=1.0 optimization.

This comment was marked as outdated.

"FD_ENABLE_TOP_P_ONE_OPT": lambda: bool(int(os.getenv("FD_ENABLE_TOP_P_ONE_OPT", "1"))),
}


Expand Down
1 change: 1 addition & 0 deletions fastdeploy/model_executor/layers/sample/meta_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class SamplingMetadata:
step_idx: paddle.Tensor

top_p: paddle.Tensor
top_p_list: Optional[list] = None

This comment was marked as outdated.

# Used only on GPU

This comment was marked as outdated.

This comment was marked as outdated.

bad_words_token_len: Optional[paddle.Tensor] = None
top_k: Optional[paddle.Tensor] = None
Expand Down
7 changes: 6 additions & 1 deletion fastdeploy/model_executor/layers/sample/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
speculate_get_accept_tokens_and_logits,
speculate_insert_first_token,
)
from .top_k_top_p_sampling import min_p_sampling, top_k_top_p_sampling
from .top_k_top_p_sampling import (
dispatch_top_k_renorm_probs,
min_p_sampling,
top_k_top_p_sampling,
)

__all__ = [
"apply_penalty_multi_scores",
Expand All @@ -33,4 +37,5 @@
"min_p_sampling",
"speculate_get_accept_tokens_and_logits",
"speculate_insert_first_token",
"dispatch_top_k_renorm_probs",
]
36 changes: 17 additions & 19 deletions fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,20 @@ def _reset_cuda_generator_for_determinism():
paddle.framework.core.default_cuda_generator(0).manual_seed(_DETERMINISTIC_RNG_SEED)


def dispatch_top_k_renorm_probs(probs, top_k):
    """Run the platform-specific ``top_k_renorm_probs`` op on ``probs``.

    The op is imported lazily: from the Iluvatar ops package when
    ``current_platform.is_iluvatar()`` is true, otherwise from the GPU ops
    package.

    Args:
        probs: Input passed as the first argument to ``top_k_renorm_probs``.
        top_k: Passed as the second argument to ``top_k_renorm_probs``.

    Returns:
        The result of ``top_k_renorm_probs(probs, top_k)``. If an
        ``ImportError`` is raised while importing or running the op, a warning
        is logged and ``probs`` is returned unchanged.
    """
    try:
        if not current_platform.is_iluvatar():
            from fastdeploy.model_executor.ops.gpu import top_k_renorm_probs as renorm_op
        else:
            from fastdeploy.model_executor.ops.iluvatar import top_k_renorm_probs as renorm_op
        return renorm_op(probs, top_k)
    except ImportError:
        # Best-effort: without the custom op, top-k filtering is skipped rather than failing.
        logger.warning("top_k sampling is not supported on current platform, skipping top_k filtering.")
        return probs


def top_k_top_p_sampling(
x: paddle.Tensor,
top_p: paddle.Tensor,
Expand Down Expand Up @@ -70,7 +84,6 @@ def top_k_top_p_sampling(

"""
top_p_class = envs.FD_SAMPLING_CLASS.lower()
topp_seed_device = None

# In deterministic mode, reset CUDA generator offset before sampling.
# paddle.tensor.top_p_sampling uses the global GPU generator offset even
Expand All @@ -85,29 +98,17 @@ def top_k_top_p_sampling(
_ = None
else:
if top_k_list and any(x > 0 for x in top_k_list):
try:
if current_platform.is_iluvatar():
from fastdeploy.model_executor.ops.iluvatar import (
top_k_renorm_probs,
)
else:
from fastdeploy.model_executor.ops.gpu import top_k_renorm_probs
x = top_k_renorm_probs(x, top_k)
except ImportError:
logger.warning("top_k sampling is not supported on current platform, skipping top_k filtering.")
x = dispatch_top_k_renorm_probs(x, top_k)

if top_p_class == "air":
_, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)

elif top_p_class == "base_non_truncated":
if topp_seed is not None:
topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
topp_seed_device.copy_(topp_seed, False)
_, ids = paddle.tensor.top_p_sampling(

This comment was marked as outdated.

x,

This comment was marked as outdated.

top_p,
threshold=threshold,
topp_seed=topp_seed_device,
topp_seed=topp_seed,
seed=seed,
k=k,
mode="non-truncated",
Expand All @@ -122,14 +123,11 @@ def top_k_top_p_sampling(

_, ids = native_top_p_sampling(x, top_p)
else:
if topp_seed is not None:
topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
topp_seed_device.copy_(topp_seed, False)
_, ids = paddle.tensor.top_p_sampling(
x,
top_p,
threshold=threshold,

This comment was marked as outdated.

topp_seed=topp_seed_device,
topp_seed=topp_seed,

This comment was marked as outdated.

seed=seed,
k=k,
mode="truncated",
Expand Down
Loading
Loading