Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ else
fi

pip install /wheels/*.whl
pip install dlblas==0.0.7 dlslime==0.0.2.post1
pip install dlslime==0.0.2.post1

pip install ninja einops packaging

Expand Down
4 changes: 2 additions & 2 deletions lmdeploy/pytorch/backends/cuda/graph_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def prepare_inputs_for_generation(
"""Prepare inputs."""

if get_deepep_state().enabled():
from dlblas.layers.moe.token_dispatcher import DeepEPBuffer, DeepEPMode
from lmdeploy.pytorch.backends.cuda.token_dispatcher import DeepEPBuffer, DeepEPMode
deepep_mode = DeepEPMode.LOW_LATENCY if context.global_is_decoding() else DeepEPMode.NORMAL
DeepEPBuffer.set_deepep_mode(deepep_mode)

Expand All @@ -322,7 +322,7 @@ def reset(self):
"""Remove all graphs to prevent hanging on exit."""
self._runner_map.clear()
if get_deepep_state().enabled():
from dlblas.layers.moe.token_dispatcher import DeepEPBuffer
from lmdeploy.pytorch.backends.cuda.token_dispatcher import DeepEPBuffer

if hasattr(DeepEPBuffer, 'destroy'):
from torch import distributed as dist
Expand Down
294 changes: 282 additions & 12 deletions lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py

Large diffs are not rendered by default.

44 changes: 28 additions & 16 deletions lmdeploy/pytorch/backends/cuda/moe/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,9 @@ def __init__(
layer_index: int = 0,
top_k: int = 8,
out_dtype: torch.dtype = torch.bfloat16,
num_max_dispatch_tokens_per_rank: int = 128,
):
from dlblas.layers.moe.token_dispatcher import DeepEPTokenDispatcherNormal
from lmdeploy.pytorch.backends.cuda.token_dispatcher import DeepEPTokenDispatcherNormal
self.layer_index = layer_index
self.top_k = top_k
self.num_experts = num_experts
Expand All @@ -94,6 +95,7 @@ def __init__(
num_local_experts=self.num_local_experts,
hidden_size=hidden_dim,
params_dtype=out_dtype,
num_max_dispatch_tokens_per_rank=num_max_dispatch_tokens_per_rank,
)

def forward(
Expand Down Expand Up @@ -148,7 +150,7 @@ def fusedmoe_forward(self, state, up_weight, down_weight):


def _disposible_tensor(tensor):
from dlblas.utils.utils import DisposibleTensor
from lmdeploy.pytorch.backends.cuda.token_dispatcher import DisposibleTensor
if isinstance(tensor, torch.Tensor):
tensor = DisposibleTensor(tensor)
else:
Expand Down Expand Up @@ -237,8 +239,9 @@ def __init__(
hidden_dim: int,
layer_index: int,
out_dtype: torch.dtype = torch.bfloat16,
num_max_dispatch_tokens_per_rank: int = 128,
):
from dlblas.layers.moe.token_dispatcher import DeepEPTokenDispatcherLowLatency
from lmdeploy.pytorch.backends.cuda.token_dispatcher import DeepEPTokenDispatcherLowLatency
self.num_experts = num_experts
self.layer_index = layer_index
self.out_dtype = out_dtype
Expand All @@ -248,6 +251,7 @@ def __init__(
num_local_experts=num_experts // ep_size,
hidden_size=hidden_dim,
params_dtype=out_dtype,
num_max_dispatch_tokens_per_rank=num_max_dispatch_tokens_per_rank,
)

def experts(
Expand All @@ -258,8 +262,7 @@ def experts(
masked_m: torch.Tensor,
expected_m: int,
):
from dlblas.utils.utils import DisposibleTensor

from lmdeploy.pytorch.backends.cuda.token_dispatcher import DisposibleTensor
from lmdeploy.pytorch.kernels.cuda.activation import silu_and_mul_moe_ep
from lmdeploy.pytorch.third_party.deep_gemm import m_grouped_bf16_gemm_nt_masked
num_groups, m, _ = hidden_states.shape
Expand Down Expand Up @@ -339,22 +342,25 @@ def build_deepep_moe(
top_k: int,
layer_idx: int = 0,
out_dtype: torch.dtype = torch.bfloat16,
num_max_dispatch_tokens_per_rank: int = 128,
):
if low_latency_mode:
return FusedMoELowLatency(ep_size=ep_size,
ep_group=ep_group,
num_experts=num_experts,
hidden_dim=hidden_dim,
layer_index=layer_idx,
out_dtype=out_dtype)
out_dtype=out_dtype,
num_max_dispatch_tokens_per_rank=num_max_dispatch_tokens_per_rank)
else:
return FusedMoENormal(ep_size=ep_size,
ep_group=ep_group,
num_experts=num_experts,
hidden_dim=hidden_dim,
layer_index=layer_idx,
top_k=top_k,
out_dtype=out_dtype)
out_dtype=out_dtype,
num_max_dispatch_tokens_per_rank=num_max_dispatch_tokens_per_rank)


class FusedMoEEPImpl(TritonFusedMoEImpl):
Expand All @@ -370,6 +376,7 @@ def __init__(
renormalize: bool = False,
layer_idx: int = 0,
out_dtype: torch.dtype = torch.bfloat16,
num_max_dispatch_tokens_per_rank: int = 128,
):
super().__init__(top_k, num_experts, renormalize)
self.num_experts = num_experts
Expand All @@ -378,19 +385,21 @@ def __init__(
self.hidden_dim = hidden_dim
self.layer_idx = layer_idx
self.out_dtype = out_dtype
self.num_max_dispatch_tokens_per_rank = num_max_dispatch_tokens_per_rank

try:
import deep_gemm # noqa: F401
except ImportError:
logger.exception('DeepGEMM is required for DeepEP MoE implementation.')
raise

try:
from dlblas.layers.moe.token_dispatcher import DeepEPBuffer, DeepEPMode, use_deepep # noqa: F401
get_deepep_state().enable()
if hasattr(DeepEPBuffer, 'set_explicitly_destroy'):
DeepEPBuffer.set_explicitly_destroy()
except ImportError:
logger.warning('For higher performance, please install DeepEP https://github.com/deepseek-ai/DeepEP')
from lmdeploy.pytorch.backends.cuda.token_dispatcher import DeepEPBuffer, use_deepep
if not use_deepep:
raise ImportError('DeepEP is required for DeepEP MoE implementation. Please install '
'https://github.com/deepseek-ai/DeepEP.')
get_deepep_state().enable()
if hasattr(DeepEPBuffer, 'set_explicitly_destroy'):
DeepEPBuffer.set_explicitly_destroy()

# pre-allocate buffer
self.fusedmoe_build(True)
Expand Down Expand Up @@ -440,7 +449,8 @@ def fusedmoe_build(self, low_latency_mode: bool = False):
self.hidden_dim,
self.top_k,
layer_idx=self.layer_idx,
out_dtype=self.out_dtype)
out_dtype=self.out_dtype,
num_max_dispatch_tokens_per_rank=self.num_max_dispatch_tokens_per_rank)
return deepep_moe


Expand All @@ -457,6 +467,7 @@ def build(
ep_group: dist.ProcessGroup = None,
layer_idx: int = 0,
out_dtype: torch.dtype = torch.bfloat16,
num_max_dispatch_tokens_per_rank: int = 128,
):
"""Build from mlp."""
if ep_size > 1:
Expand All @@ -467,5 +478,6 @@ def build(
hidden_dim=hidden_dim,
renormalize=renormalize,
layer_idx=layer_idx,
out_dtype=out_dtype)
out_dtype=out_dtype,
num_max_dispatch_tokens_per_rank=num_max_dispatch_tokens_per_rank)
return TritonFusedMoEImpl(top_k=top_k, num_experts=num_experts, renormalize=renormalize)
Loading
Loading