Skip to content

Commit 41f3947

Browse files
author
niushengxiao
committed
feat: deep_ep v2 and upgrade cuda 13.0
1 parent e696aed commit 41f3947

23 files changed

Lines changed: 724 additions & 251 deletions

File tree

docker/Dockerfile

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
1-
ARG CUDA_VERSION=12.8.0
1+
ARG CUDA_VERSION=13.0.0
22
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
33

44
ARG PYTHON_VERSION=3.10
55
ARG MAMBA_VERSION=24.7.1-0
6-
ARG VLLM_VERSION=0.16.0
6+
ARG VLLM_VERSION=0.21.0
7+
ARG NIXL_REF=v1.1.0
78
ARG FLASH_MLA_REF=47c35a7
9+
ARG DEEPGEMM_REF=891d57b4db1071624b5c8fa0d1e51cb317fa709f
810
ARG TARGETPLATFORM
911
ARG ENABLE_DEEPEP=1
1012
ARG ENABLE_NIXL=1
1113
ARG ENABLE_CACHE=1
14+
ARG ENABLE_SM100=0
1215

1316
ENV PATH=/opt/conda/bin:$PATH \
1417
CONDA_PREFIX=/opt/conda
@@ -44,13 +47,18 @@ WORKDIR /root
4447

4548
COPY ./requirements.txt /lightllm/requirements.txt
4649
RUN pip install -U pip
47-
RUN pip install -r /lightllm/requirements.txt --no-cache-dir
48-
RUN pip install --no-cache-dir vllm==${VLLM_VERSION}
49-
RUN git clone https://github.com/deepseek-ai/FlashMLA.git /root/FlashMLA && \
50+
RUN pip install --no-cache-dir \
51+
--extra-index-url https://download.pytorch.org/whl/cu130 \
52+
vllm==${VLLM_VERSION}
53+
RUN pip install -r /lightllm/requirements.txt --no-cache-dir \
54+
--extra-index-url https://download.pytorch.org/whl/cu130
55+
RUN export CPATH=/usr/local/cuda/targets/x86_64-linux/include/cccl:/usr/local/cuda/targets/x86_64-linux/include${CPATH:+:${CPATH}} && \
56+
git clone https://github.com/deepseek-ai/FlashMLA.git /root/FlashMLA && \
5057
cd /root/FlashMLA && \
5158
git checkout ${FLASH_MLA_REF} && \
5259
git submodule update --init --recursive && \
53-
FLASH_MLA_DISABLE_SM100=1 pip install --no-cache-dir .
60+
FLASH_MLA_DISABLE_SM100="$(if [ "${ENABLE_SM100}" = "1" ]; then echo 0; else echo 1; fi)" \
61+
pip install --no-cache-dir .
5462

5563
RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*
5664

@@ -78,27 +86,20 @@ RUN if [ "${ENABLE_NIXL}" = "1" ] || [ "${ENABLE_DEEPEP}" = "1" ]; then \
7886
RUN if [ "${ENABLE_DEEPEP}" = "1" ]; then \
7987
set -e; \
8088
ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so; \
81-
NVSHMEM_VERSION=3.3.9; \
82-
CUDA_ARCHS=90; \
83-
wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
84-
&& tar -xf nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz && mv nvshmem_src nvshmem \
85-
&& cd nvshmem \
86-
&& rm -f /root/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
87-
&& NVSHMEM_SHMEM_SUPPORT=0 \
88-
NVSHMEM_UCX_SUPPORT=0 \
89-
NVSHMEM_USE_NCCL=0 \
90-
NVSHMEM_MPI_SUPPORT=0 \
91-
NVSHMEM_IBGDA_SUPPORT=1 \
92-
NVSHMEM_PMIX_SUPPORT=0 \
93-
NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
94-
NVSHMEM_USE_GDRCOPY=1 \
95-
cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS} \
96-
&& cmake --build build --target install -j64; \
97-
DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58; \
98-
cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..; \
99-
cd /root/DeepEP && NVSHMEM_DIR=/root/nvshmem/install python setup.py install; \
89+
python -m pip install --upgrade --no-deps \
90+
"nvidia-nccl-cu13==2.30.4" \
91+
"nvidia-nvshmem-cu13==3.6.5"; \
92+
cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout b306af06afd412c88e51e71802951606e40b7358; \
93+
ln -sf /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so.3 /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so; \
94+
ln -sf /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl/lib/libnccl.so.2 /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl/lib/libnccl.so; \
95+
pip install --no-build-isolation .; \
10096
fi
10197

98+
RUN cd /root && git clone https://github.com/deepseek-ai/DeepGEMM.git && \
99+
cd DeepGEMM && git checkout ${DEEPGEMM_REF} && \
100+
git submodule update --init --recursive && \
101+
pip install --no-build-isolation .
102+
102103
RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
103104
apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
104105
DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
@@ -126,7 +127,7 @@ RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
126127
apt-get update && apt-get install -y pkg-config tmux net-tools && \
127128
cd /usr/local/src; \
128129
pip install --upgrade meson pybind11 patchelf; \
129-
git clone https://github.com/ai-dynamo/nixl.git -b main && \
130+
git clone https://github.com/ai-dynamo/nixl.git -b ${NIXL_REF} && \
130131
cd nixl && \
131132
rm -rf build && \
132133
mkdir build && \

docker/scripts/build.sh

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,23 @@ set -euo pipefail
1818
# --no-nixl Disable NIXL (default: enabled)
1919
# --no-cache Disable cache (default: enabled)
2020
# --lite Disable DEEPEP, NIXL and cache in one shot
21-
# --cuda-version <ver> CUDA version (default: 12.8.0)
21+
# --cuda-version <ver> CUDA version (default: 13.0.0)
2222
# --image-prefix <name> Image prefix (default: lightllm)
2323
# --image-tag <tag> Image tag (default: generated from enabled features)
24+
# --enable-sm100 Enable SM100 support (default: disabled)
2425
# -h / --help Show help
2526

2627
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
2728
cd "${ROOT_DIR}"
2829

2930
IMAGE_PREFIX="${IMAGE_PREFIX:-lightllm}"
30-
CUDA_VERSION="${CUDA_VERSION:-12.8.0}"
31+
CUDA_VERSION="${CUDA_VERSION:-13.0.0}"
3132
IMAGE_TAG="${IMAGE_TAG:-}"
3233

3334
ENABLE_DEEPEP="${ENABLE_DEEPEP:-1}"
3435
ENABLE_NIXL="${ENABLE_NIXL:-1}"
3536
ENABLE_CACHE="${ENABLE_CACHE:-1}"
37+
ENABLE_SM100="${ENABLE_SM100:-0}"
3638

3739
print_help() {
3840
sed -n '1,80p' "$0" | sed 's/^# \{0,1\}//'
@@ -43,6 +45,7 @@ while [[ $# -gt 0 ]]; do
4345
--no-deepep) ENABLE_DEEPEP=0 ;;
4446
--no-nixl) ENABLE_NIXL=0 ;;
4547
--no-cache) ENABLE_CACHE=0 ;;
48+
--enable-sm100) ENABLE_SM100=1 ;;
4649
--lite)
4750
ENABLE_DEEPEP=0
4851
ENABLE_NIXL=0
@@ -78,13 +81,16 @@ done
7881
# - Other combos: composed from enabled feature names
7982
if [[ -z "${IMAGE_TAG}" ]]; then
8083
tag_parts=()
84+
if [[ "${ENABLE_SM100}" -eq 1 ]]; then
85+
tag_parts+=("sm100")
86+
fi
8187
if [[ "${ENABLE_NIXL}" -eq 1 ]]; then
8288
tag_parts+=("nixl")
8389
fi
8490
if [[ "${ENABLE_DEEPEP}" -eq 1 ]]; then
8591
tag_parts+=("deepep")
8692
fi
87-
if [[ "${ENABLE_NIXL}" -eq 1 && "${ENABLE_DEEPEP}" -eq 1 && "${ENABLE_CACHE}" -eq 1 ]]; then
93+
if [[ "${ENABLE_SM100}" -eq 0 && "${ENABLE_NIXL}" -eq 1 && "${ENABLE_DEEPEP}" -eq 1 && "${ENABLE_CACHE}" -eq 1 ]]; then
8894
IMAGE_TAG="cuda${CUDA_VERSION}"
8995
else
9096
prefix=""
@@ -100,6 +106,6 @@ DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile \
100106
--build-arg ENABLE_DEEPEP="${ENABLE_DEEPEP}" \
101107
--build-arg ENABLE_NIXL="${ENABLE_NIXL}" \
102108
--build-arg ENABLE_CACHE="${ENABLE_CACHE}" \
109+
--build-arg ENABLE_SM100="${ENABLE_SM100}" \
103110
--progress=plain \
104111
-t "${IMAGE_PREFIX}:${IMAGE_TAG}" .
105-

lightllm/common/basemodel/layer_infer/cache_tensor_manager.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ class BufNode:
3333
inner_tensor: torch.Tensor
3434
shape_key: Tuple[int, torch.dtype]
3535
storage_weak_ptr: int
36+
free_use_count_bias: int = 0
3637
shape_to_tensor: Dict[Union[torch.Size, Iterable[int]], torch.Tensor] = field(default_factory=dict)
3738

3839
def __del__(self):
@@ -99,7 +100,8 @@ def alloc_tensor(
99100
# 回收可能消亡的 tensor
100101
for ptr in self.changed_ptr:
101102
t_buf_node = self.ptr_to_bufnode[ptr]
102-
if self.use_count(ptr) == 1 + len(t_buf_node.shape_to_tensor):
103+
free_use_count = t_buf_node.free_use_count_bias + 1 + len(t_buf_node.shape_to_tensor)
104+
if self.use_count(ptr) <= free_use_count:
103105
self.free_shape_dtype_to_bufs[t_buf_node.shape_key].append(t_buf_node)
104106
self.changed_ptr.clear()
105107

@@ -131,6 +133,7 @@ def alloc_tensor(
131133
self.ptr_to_bufnode[storage_weak_ptr] = buf_node
132134
if shape not in buf_node.shape_to_tensor:
133135
buf_node.shape_to_tensor[shape] = buf_node.inner_tensor.view(shape)
136+
buf_node.free_use_count_bias = self.use_count(storage_weak_ptr) - (1 + len(buf_node.shape_to_tensor))
134137
mark_tensor = buf_node.shape_to_tensor[shape]
135138
ans = mark_tensor.data # 返回一个新的引用, 否则引用计数会无法判断
136139
ans.storage_weak_ptr = buf_node.storage_weak_ptr

lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/fused_moe_weight.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from lightllm.common.basemodel.layer_weights.meta_weights.fused_moe.impl import select_fuse_moe_impl
1212
from lightllm.common.quantization.quantize_method import QuantizationMethod
1313
from lightllm.utils.envs_utils import get_redundancy_expert_ids, get_redundancy_expert_num, get_env_start_args
14+
from lightllm.utils.device_utils import is_sm100_gpu
1415
from lightllm.utils.dist_utils import get_global_world_size, get_global_rank
1516
from lightllm.utils.log_utils import init_logger
1617

@@ -52,6 +53,7 @@ def __init__(
5253
self.quant_method = quant_method
5354
assert num_fused_shared_experts in [0, 1], "num_fused_shared_experts can only support 0 or 1 now."
5455
self.enable_ep_moe = get_env_start_args().enable_ep_moe
56+
self.quant_method = self._maybe_upgrade_quant_method_for_ep_moe(self.quant_method)
5557
self.n_routed_experts = n_routed_experts
5658
self.num_fused_shared_experts = num_fused_shared_experts
5759
self._init_config(network_config)
@@ -70,6 +72,28 @@ def __init__(
7072
self.lock = threading.Lock()
7173
self._create_weight()
7274

75+
def _maybe_upgrade_quant_method_for_ep_moe(self, quant_method: QuantizationMethod) -> QuantizationMethod:
76+
if not self.enable_ep_moe:
77+
return quant_method
78+
79+
target_method = "deepgemm-fp8fp4-b32" if is_sm100_gpu() else "deepgemm-fp8w8a8-b128"
80+
if quant_method.method_name == "none":
81+
from lightllm.common.quantization.registry import QUANTMETHODS
82+
83+
logger.info(
84+
f"enable_ep_moe requires DeepGEMM MoE expert weights; "
85+
f"auto-upgrading fused_moe quantization from `none` to `{target_method}`."
86+
)
87+
quant_method = QUANTMETHODS.get(target_method)
88+
89+
if quant_method.method_name != target_method:
90+
raise ValueError(
91+
f"enable_ep_moe currently requires `{target_method}` for fused_moe on this GPU, "
92+
f"but got `{quant_method.method_name}`."
93+
)
94+
95+
return quant_method
96+
7397
def _init_config(self, network_config: Dict[str, Any]):
7498
self.n_group = network_config.get("n_group", 0)
7599
self.use_grouped_topk = self.n_group > 0
@@ -152,6 +176,9 @@ def experts(
152176
per_expert_scale=self.per_expert_scale,
153177
)
154178

179+
def use_sm100_mega_moe(self) -> bool:
180+
return bool(getattr(self.fuse_moe_impl, "_use_sm100_fp4_moe", lambda: False)())
181+
155182
def low_latency_dispatch(
156183
self,
157184
hidden_states: torch.Tensor,

0 commit comments

Comments
 (0)