Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ max-line-length = 119
# E402: module level import not at top of file
per-file-ignores =
__init__.py:F401,F403,E402
fastdeploy/model_executor/layers/sample/ops/top_k_top_p_triton.py:E241,E121,E131,E266
4 changes: 3 additions & 1 deletion fastdeploy/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def _validate_split_kv_size(value: int) -> int:
"FD_ATTENTION_BACKEND": lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
# enable decode attention
"USE_DECODE_UNIFIED_ATTENTION": lambda: bool(int(os.getenv("USE_DECODE_UNIFIED_ATTENTION", "0"))),
# Set sampling class. "base", "base_non_truncated", "air" and "rejection" can be set currently.
# Set sampling class. "base", "base_non_truncated", "air", "rejection" and "triton" can be set currently.
"FD_SAMPLING_CLASS": lambda: os.getenv("FD_SAMPLING_CLASS", "base"),
# Set moe backend. "cutlass", "marlin", "triton", "flashinfer-cutlass", "flashinfer-cutedsl" and "flashinfer-trtllm" can be set currently.
"FD_MOE_BACKEND": lambda: os.getenv("FD_MOE_BACKEND", "cutlass"),
Expand Down Expand Up @@ -293,6 +293,8 @@ def _validate_split_kv_size(value: int) -> int:
"FD_SiluAndMul_USE_PHI_SWIGLU": lambda: bool(int(os.getenv("FD_SiluAndMul_USE_PHI_SWIGLU", "0"))),
# Whether to enable FP8 quantization with pow2scale.
"FD_FP8_QUANT_WITH_POW2SCALE": lambda: bool(int(os.getenv("FD_FP8_QUANT_WITH_POW2SCALE", "0"))),
# Whether to enable top_p=1.0 optimization.

This comment was marked as outdated.

"FD_ENABLE_TOP_P_ONE_OPT": lambda: bool(int(os.getenv("FD_ENABLE_TOP_P_ONE_OPT", "1"))),
}


Expand Down
1 change: 1 addition & 0 deletions fastdeploy/model_executor/layers/sample/meta_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class SamplingMetadata:
step_idx: paddle.Tensor

top_p: paddle.Tensor
top_p_list: Optional[list] = None

This comment was marked as outdated.

# Used only on GPU

This comment was marked as outdated.

This comment was marked as outdated.

bad_words_token_len: Optional[paddle.Tensor] = None
top_k: Optional[paddle.Tensor] = None
Expand Down
7 changes: 6 additions & 1 deletion fastdeploy/model_executor/layers/sample/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@
speculate_get_accept_tokens_and_logits,
speculate_insert_first_token,
)
from .top_k_top_p_sampling import min_p_sampling, top_k_top_p_sampling
from .top_k_top_p_sampling import (
dispatch_top_k_renorm_probs,
min_p_sampling,
top_k_top_p_sampling,
)

__all__ = [
"apply_penalty_multi_scores",
Expand All @@ -33,4 +37,5 @@
"min_p_sampling",
"speculate_get_accept_tokens_and_logits",
"speculate_insert_first_token",
"dispatch_top_k_renorm_probs",
]
36 changes: 17 additions & 19 deletions fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,20 @@ def _reset_cuda_generator_for_determinism():
paddle.framework.core.default_cuda_generator(0).manual_seed(_DETERMINISTIC_RNG_SEED)


def dispatch_top_k_renorm_probs(probs, top_k):
    """
    Run the platform-specific ``top_k_renorm_probs`` op on ``probs``.

    The op is imported lazily: from the Iluvatar ops package when
    ``current_platform.is_iluvatar()`` is true, otherwise from the GPU ops
    package.

    Args:
        probs: Forwarded to ``top_k_renorm_probs`` as its first argument.
        top_k: Forwarded to ``top_k_renorm_probs`` as its second argument.

    Returns:
        The result of ``top_k_renorm_probs(probs, top_k)``. If an
        ``ImportError`` is raised along the way, a warning is logged and
        ``probs`` is returned unchanged.
    """
    try:
        if current_platform.is_iluvatar():
            from fastdeploy.model_executor.ops.iluvatar import (
                top_k_renorm_probs as renorm_op,
            )
        else:
            from fastdeploy.model_executor.ops.gpu import (
                top_k_renorm_probs as renorm_op,
            )
        # The call stays inside the try so that an ImportError raised while
        # invoking the op is handled the same way as a failed import.
        return renorm_op(probs, top_k)
    except ImportError:
        logger.warning("top_k sampling is not supported on current platform, skipping top_k filtering.")
        return probs


def top_k_top_p_sampling(
x: paddle.Tensor,
top_p: paddle.Tensor,
Expand Down Expand Up @@ -70,7 +84,6 @@ def top_k_top_p_sampling(

"""
top_p_class = envs.FD_SAMPLING_CLASS.lower()
topp_seed_device = None

# In deterministic mode, reset CUDA generator offset before sampling.
# paddle.tensor.top_p_sampling uses the global GPU generator offset even
Expand All @@ -85,29 +98,17 @@ def top_k_top_p_sampling(
_ = None
else:
if top_k_list and any(x > 0 for x in top_k_list):
try:
if current_platform.is_iluvatar():
from fastdeploy.model_executor.ops.iluvatar import (
top_k_renorm_probs,
)
else:
from fastdeploy.model_executor.ops.gpu import top_k_renorm_probs
x = top_k_renorm_probs(x, top_k)
except ImportError:
logger.warning("top_k sampling is not supported on current platform, skipping top_k filtering.")
x = dispatch_top_k_renorm_probs(x, top_k)

if top_p_class == "air":
_, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)

elif top_p_class == "base_non_truncated":
if topp_seed is not None:
topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
topp_seed_device.copy_(topp_seed, False)
_, ids = paddle.tensor.top_p_sampling(

This comment was marked as outdated.

x,

This comment was marked as outdated.

top_p,
threshold=threshold,
topp_seed=topp_seed_device,
topp_seed=topp_seed,
seed=seed,
k=k,
mode="non-truncated",
Expand All @@ -122,14 +123,11 @@ def top_k_top_p_sampling(

_, ids = native_top_p_sampling(x, top_p)
else:
if topp_seed is not None:
topp_seed_device = paddle.empty(shape=topp_seed.shape, dtype=topp_seed.dtype)
topp_seed_device.copy_(topp_seed, False)
_, ids = paddle.tensor.top_p_sampling(
x,
top_p,
threshold=threshold,

This comment was marked as outdated.

topp_seed=topp_seed_device,
topp_seed=topp_seed,

This comment was marked as outdated.

seed=seed,
k=k,
mode="truncated",
Expand Down
Loading
Loading