ModelTC
diff --git a/docker/Dockerfile b/docker/Dockerfile
Lines changed: 27 additions & 26 deletions
diff --git a/docker/scripts/build.sh b/docker/scripts/build.sh
Lines changed: 10 additions & 4 deletions
diff --git a/docs/CN/source/tutorial/api_server_args.rst b/docs/CN/source/tutorial/api_server_args.rst
Lines changed: 8 additions & 0 deletions
diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst
Lines changed: 8 additions & 0 deletions
diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py
Lines changed: 2 additions & 1 deletion
diff --git a/lightllm/common/basemodel/layer_infer/cache_tensor_manager.py b/lightllm/common/basemodel/layer_infer/cache_tensor_manager.py
Lines changed: 4 additions & 1 deletion
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/deepgemm_impl.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe/impl/deepgemm_impl.py
Lines changed: 36 additions & 55 deletions
@@ -1,14 +1,17 @@
-ARG CUDA_VERSION=12.8.0
+ARG CUDA_VERSION=13.0.0
 FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04
 
 ARG PYTHON_VERSION=3.10
 ARG MAMBA_VERSION=24.7.1-0
-ARG VLLM_VERSION=0.16.0
+ARG VLLM_VERSION=0.21.0
+ARG NIXL_REF=v1.1.0
 ARG FLASH_MLA_REF=47c35a7
+ARG DEEPGEMM_REF=891d57b4db1071624b5c8fa0d1e51cb317fa709f
 ARG TARGETPLATFORM
 ARG ENABLE_DEEPEP=1
 ARG ENABLE_NIXL=1
 ARG ENABLE_CACHE=1
+ARG ENABLE_SM100=0
 
 ENV PATH=/opt/conda/bin:$PATH \
     CONDA_PREFIX=/opt/conda
@@ -44,13 +47,18 @@ WORKDIR /root
 
 COPY ./requirements.txt /lightllm/requirements.txt
 RUN pip install -U pip
-RUN pip install -r /lightllm/requirements.txt --no-cache-dir
-RUN pip install --no-cache-dir vllm==${VLLM_VERSION}
-RUN git clone https://github.com/deepseek-ai/FlashMLA.git /root/FlashMLA && \
+RUN pip install --no-cache-dir \
+    --extra-index-url https://download.pytorch.org/whl/cu130 \
+    vllm==${VLLM_VERSION}
+RUN pip install -r /lightllm/requirements.txt --no-cache-dir \
+    --extra-index-url https://download.pytorch.org/whl/cu130
+RUN export CPATH=/usr/local/cuda/targets/x86_64-linux/include/cccl:/usr/local/cuda/targets/x86_64-linux/include${CPATH:+:${CPATH}} && \
+    git clone https://github.com/deepseek-ai/FlashMLA.git /root/FlashMLA && \
     cd /root/FlashMLA && \
     git checkout ${FLASH_MLA_REF} && \
     git submodule update --init --recursive && \
-    FLASH_MLA_DISABLE_SM100=1 pip install --no-cache-dir .
+    FLASH_MLA_DISABLE_SM100="$(if [ "${ENABLE_SM100}" = "1" ]; then echo 0; else echo 1; fi)" \
+    pip install --no-cache-dir .
 
 RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/*
 
@@ -78,27 +86,20 @@ RUN if [ "${ENABLE_NIXL}" = "1" ] || [ "${ENABLE_DEEPEP}" = "1" ]; then \
 RUN if [ "${ENABLE_DEEPEP}" = "1" ]; then \
       set -e; \
       ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so; \
-      NVSHMEM_VERSION=3.3.9; \
-      CUDA_ARCHS=90; \
-      wget https://developer.download.nvidia.com/compute/redist/nvshmem/${NVSHMEM_VERSION}/source/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
-      && tar -xf nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz && mv nvshmem_src nvshmem \
-      && cd nvshmem \
-      && rm -f /root/nvshmem_src_cuda12-all-all-${NVSHMEM_VERSION}.tar.gz \
-      && NVSHMEM_SHMEM_SUPPORT=0 \
-         NVSHMEM_UCX_SUPPORT=0 \
-         NVSHMEM_USE_NCCL=0 \
-         NVSHMEM_MPI_SUPPORT=0 \
-         NVSHMEM_IBGDA_SUPPORT=1 \
-         NVSHMEM_PMIX_SUPPORT=0 \
-         NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
-         NVSHMEM_USE_GDRCOPY=1 \
-         cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/root/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS} \
-      && cmake --build build --target install -j64; \
-      DEEPEP_COMMIT=b6ce310bb0b75079682d09bc2ebc063a074fbd58; \
-      cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout ${DEEPEP_COMMIT} && cd ..; \
-      cd /root/DeepEP && NVSHMEM_DIR=/root/nvshmem/install python setup.py install; \
+      python -m pip install --upgrade --no-deps \
+        "nvidia-nccl-cu13==2.30.4" \
+        "nvidia-nvshmem-cu13==3.6.5"; \
+      cd /root && git clone https://github.com/deepseek-ai/DeepEP.git && cd DeepEP && git checkout b306af06afd412c88e51e71802951606e40b7358; \
+      ln -sf /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so.3 /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvshmem/lib/libnvshmem_host.so; \
+      ln -sf /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl/lib/libnccl.so.2 /opt/conda/lib/python${PYTHON_VERSION}/site-packages/nvidia/nccl/lib/libnccl.so; \
+      pip install --no-build-isolation .; \
     fi
 
+RUN cd /root && git clone https://github.com/deepseek-ai/DeepGEMM.git && \
+    cd DeepGEMM && git checkout ${DEEPGEMM_REF} && \
+    git submodule update --init --recursive && \
+    pip install --no-build-isolation .
+
 RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
       apt-get update && apt-get install -y cmake automake autotools-dev libtool libz-dev && \
       DEBIAN_FRONTEND=noninteractive apt-get -y install --reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev; \
@@ -126,7 +127,7 @@ RUN if [ "${ENABLE_NIXL}" = "1" ]; then \
       apt-get update && apt-get install -y pkg-config tmux net-tools && \
       cd /usr/local/src; \
       pip install --upgrade meson pybind11 patchelf; \
-      git clone https://github.com/ai-dynamo/nixl.git -b main && \
+      git clone https://github.com/ai-dynamo/nixl.git -b ${NIXL_REF} && \
       cd nixl && \
       rm -rf build && \
       mkdir build && \
 
@@ -18,21 +18,23 @@ set -euo pipefail
 #   --no-nixl                 Disable NIXL (default: enabled)
 #   --no-cache                Disable cache (default: enabled)
 #   --lite                    Disable DEEPEP, NIXL and cache in one shot
-#   --cuda-version <ver>      CUDA version (default: 12.8.0)
+#   --cuda-version <ver>      CUDA version (default: 13.0.0)
 #   --image-prefix <name>     Image prefix (default: lightllm)
 #   --image-tag <tag>         Image tag (default: generated from enabled features)
+#   --enable-sm100            Enable SM100 support (default: disabled)
 #   -h / --help               Show help
 
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
 cd "${ROOT_DIR}"
 
 IMAGE_PREFIX="${IMAGE_PREFIX:-lightllm}"
-CUDA_VERSION="${CUDA_VERSION:-12.8.0}"
+CUDA_VERSION="${CUDA_VERSION:-13.0.0}"
 IMAGE_TAG="${IMAGE_TAG:-}"
 
 ENABLE_DEEPEP="${ENABLE_DEEPEP:-1}"
 ENABLE_NIXL="${ENABLE_NIXL:-1}"
 ENABLE_CACHE="${ENABLE_CACHE:-1}"
+ENABLE_SM100="${ENABLE_SM100:-0}"
 
 print_help() {
   sed -n '1,80p' "$0" | sed 's/^# \{0,1\}//'
@@ -43,6 +45,7 @@ while [[ $# -gt 0 ]]; do
     --no-deepep) ENABLE_DEEPEP=0 ;;
     --no-nixl) ENABLE_NIXL=0 ;;
     --no-cache) ENABLE_CACHE=0 ;;
+    --enable-sm100) ENABLE_SM100=1 ;;
     --lite)
       ENABLE_DEEPEP=0
       ENABLE_NIXL=0
@@ -78,13 +81,16 @@ done
 # - Other combos: composed from enabled feature names
 if [[ -z "${IMAGE_TAG}" ]]; then
   tag_parts=()
+  if [[ "${ENABLE_SM100}" -eq 1 ]]; then
+    tag_parts+=("sm100")
+  fi
   if [[ "${ENABLE_NIXL}" -eq 1 ]]; then
     tag_parts+=("nixl")
   fi
   if [[ "${ENABLE_DEEPEP}" -eq 1 ]]; then
     tag_parts+=("deepep")
   fi
-  if [[ "${ENABLE_NIXL}" -eq 1 && "${ENABLE_DEEPEP}" -eq 1 && "${ENABLE_CACHE}" -eq 1 ]]; then
+  if [[ "${ENABLE_SM100}" -eq 0 && "${ENABLE_NIXL}" -eq 1 && "${ENABLE_DEEPEP}" -eq 1 && "${ENABLE_CACHE}" -eq 1 ]]; then
     IMAGE_TAG="cuda${CUDA_VERSION}"
   else
     prefix=""
@@ -100,6 +106,6 @@ DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile \
   --build-arg ENABLE_DEEPEP="${ENABLE_DEEPEP}" \
   --build-arg ENABLE_NIXL="${ENABLE_NIXL}" \
   --build-arg ENABLE_CACHE="${ENABLE_CACHE}" \
+  --build-arg ENABLE_SM100="${ENABLE_SM100}" \
   --progress=plain \
   -t "${IMAGE_PREFIX}:${IMAGE_TAG}" . 
-
 
@@ -464,6 +464,14 @@ PD 分离模式参数
 
     示例可以在 test/advanced_config/mixed_quantization/llamacls-mix-down.yaml 中找到。
 
+.. option:: --expert_dtype
+
+    EP MoE 专家量化类型，可选值：
+
+    * ``fp8``
+    * ``fp4``，仅支持 SM100 GPU
+    * ``None`` (默认)
+
 .. option:: --vit_quant_type
 
     ViT 量化方法，可选值：
 
@@ -465,6 +465,14 @@ Quantization Parameters
 
     Examples can be found in test/advanced_config/mixed_quantization/llamacls-mix-down.yaml.
 
+.. option:: --expert_dtype
+
+    Expert quantization dtype for EP MoE, optional values:
+
+    * ``fp8``
+    * ``fp4``: SM100 GPUs only
+    * ``None`` (default)
+
 .. option:: --vit_quant_type
 
     ViT quantization method, optional values:
 
@@ -85,6 +85,7 @@ def __init__(self, kvargs):
         self.disable_cudagraph = kvargs.get("disable_cudagraph", False)
         self.quant_type = kvargs.get("quant_type", "none")
         self.quant_cfg_path = kvargs.get("quant_cfg", None)
+        self.expert_dtype = kvargs.get("expert_dtype", None)
         self.mem_fraction = kvargs.get("mem_fraction", 0.9)
         self.tp_world_size_ = get_dp_world_size()
         self.enable_tpsp_mix_mode = get_env_start_args().enable_tpsp_mix_mode
@@ -156,7 +157,7 @@ def _verify_params(self):
         return
 
     def _init_quant(self):
-        self.quant_cfg = Quantcfg(self.config, self.quant_type, self.quant_cfg_path)
+        self.quant_cfg = Quantcfg(self.config, self.quant_type, self.quant_cfg_path, self.expert_dtype)
         logger.info(f"Initial quantization. " f"The default quantization method is {self.quant_cfg.quant_type}")
 
     def _init_weights(self, start_layer_index=0):
 
@@ -33,6 +33,7 @@ class BufNode:
         inner_tensor: torch.Tensor
         shape_key: Tuple[int, torch.dtype]
         storage_weak_ptr: int
+        free_use_count_bias: int = 0
         shape_to_tensor: Dict[Union[torch.Size, Iterable[int]], torch.Tensor] = field(default_factory=dict)
 
         def __del__(self):
@@ -99,7 +100,8 @@ def alloc_tensor(
             # 回收可能消亡的 tensor
             for ptr in self.changed_ptr:
                 t_buf_node = self.ptr_to_bufnode[ptr]
-                if self.use_count(ptr) == 1 + len(t_buf_node.shape_to_tensor):
+                free_use_count = t_buf_node.free_use_count_bias + 1 + len(t_buf_node.shape_to_tensor)
+                if self.use_count(ptr) <= free_use_count:
                     self.free_shape_dtype_to_bufs[t_buf_node.shape_key].append(t_buf_node)
             self.changed_ptr.clear()
 
@@ -131,6 +133,7 @@ def alloc_tensor(
             self.ptr_to_bufnode[storage_weak_ptr] = buf_node
             if shape not in buf_node.shape_to_tensor:
                 buf_node.shape_to_tensor[shape] = buf_node.inner_tensor.view(shape)
+            buf_node.free_use_count_bias = self.use_count(storage_weak_ptr) - (1 + len(buf_node.shape_to_tensor))
             mark_tensor = buf_node.shape_to_tensor[shape]
             ans = mark_tensor.data  # 返回一个新的引用, 否则引用计数会无法判断
             ans.storage_weak_ptr = buf_node.storage_weak_ptr
 
@@ -4,11 +4,16 @@
 from lightllm.distributed import dist_group_manager
 from lightllm.common.triton_utils.autotuner import Autotuner
 from lightllm.common.quantization.quantize_method import WeightPack
-from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
+from lightllm.utils.envs_utils import (
+    get_deepep_num_max_dispatch_tokens_per_rank_prefill,
+    get_deepep_num_max_dispatch_tokens_per_rank_decode,
+)
 from lightllm.common.basemodel.triton_kernel.fused_moe.grouped_fused_moe_ep import (
-    fused_experts_impl,
+    fused_experts,
+    get_ep_num_sms,
     masked_group_gemm,
-    _deepgemm_grouped_fp8_nt_contiguous,
+    deepgemm_grouped_fp8_nt_contiguous,
+    quantize_fused_experts_input,
 )
 from lightllm.common.basemodel.triton_kernel.quantization.fp8act_quant_kernel import (
     per_token_group_quant_fp8,
@@ -72,23 +77,15 @@ def _fused_experts(
         router_logits: Optional[torch.Tensor] = None,
         is_prefill: Optional[bool] = None,
     ):
-        w13_weight, w13_scale = w13.weight, w13.weight_scale
-        w2_weight, w2_scale = w2.weight, w2.weight_scale
-        use_fp8_w8a8 = self.quant_method.method_name != "none"
-        output = fused_experts_impl(
+        output = fused_experts(
             hidden_states=input_tensor,
-            w1=w13_weight,
-            w2=w2_weight,
+            w13=w13,
+            w2=w2,
             topk_weights=topk_weights,
             topk_idx=topk_ids.to(torch.long),
             num_experts=self.total_expert_num_contain_redundancy,  # number of all experts contain redundancy
-            buffer=dist_group_manager.ep_buffer,
+            quant_method=self.quant_method,
             is_prefill=is_prefill,
-            use_fp8_w8a8=use_fp8_w8a8,
-            use_fp8_all2all=use_fp8_w8a8,
-            use_int8_w8a16=False,  # default to False
-            w1_scale=w13_scale,
-            w2_scale=w2_scale,
             previous_event=None,  # for overlap
         )
         return output
@@ -118,13 +115,13 @@ def low_latency_dispatch(
         )
 
         topk_idx = topk_idx.to(torch.long)
-        num_max_dispatch_tokens_per_rank = get_deepep_num_max_dispatch_tokens_per_rank()
+        num_max_dispatch_tokens_per_rank = get_deepep_num_max_dispatch_tokens_per_rank_decode()
         use_fp8_w8a8 = self.quant_method.method_name != "none"
-        recv_x, masked_m, handle, event, hook = dist_group_manager.ep_buffer.low_latency_dispatch(
-            hidden_states,
-            topk_idx,
-            num_max_dispatch_tokens_per_rank,
-            self.total_expert_num_contain_redundancy,
+        recv_x, masked_m, handle, event, hook = dist_group_manager.ep_low_latency_buffer.low_latency_dispatch(
+            topk_idx=topk_idx,
+            x=hidden_states,
+            num_max_dispatch_tokens_per_rank=num_max_dispatch_tokens_per_rank,
+            num_experts=self.total_expert_num_contain_redundancy,
             use_fp8=use_fp8_w8a8,
             async_finish=False,
             return_recv_hook=True,
@@ -155,13 +152,8 @@ def select_experts_and_quant_input(
             num_expert_group=n_group,
             scoring_func=scoring_func,
         )
-        w13_weight, w13_scale = w13.weight, w13.weight_scale
-        block_size_k = 0
-        if w13_weight.ndim == 3:
-            block_size_k = w13_weight.shape[2] // w13_scale.shape[2]
-        assert block_size_k == 128, "block_size_k must be 128"
-        qinput_tensor, input_scale = per_token_group_quant_fp8(hidden_states, block_size_k, dtype=w13_weight.dtype)
-        return topk_weights, topk_idx.to(torch.long), (qinput_tensor, input_scale)
+        qinput_tensor = quantize_fused_experts_input(hidden_states, w13, self.quant_method)
+        return topk_weights, topk_idx.to(torch.long), qinput_tensor
 
     def dispatch(
         self,
@@ -171,38 +163,26 @@ def dispatch(
         overlap_event: Optional[Any] = None,
     ):
         buffer = dist_group_manager.ep_buffer
-        # get_dispatch_layout
-        (
-            num_tokens_per_rank,
-            num_tokens_per_rdma_rank,
-            num_tokens_per_expert,
-            is_token_in_rank,
-            previous_event,
-        ) = buffer.get_dispatch_layout(
-            topk_idx,
-            self.total_expert_num_contain_redundancy,
-            previous_event=overlap_event,
-            async_finish=True,
-            allocate_on_comm_stream=True,
-        )
-        recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = buffer.dispatch(
+        num_max_tokens_per_rank = get_deepep_num_max_dispatch_tokens_per_rank_prefill()
+        recv_x, recv_topk_idx, recv_topk_weights, handle, event = buffer.dispatch(
             qinput_tensor,
             topk_idx=topk_idx,
             topk_weights=topk_weights,
-            num_tokens_per_rank=num_tokens_per_rank,
-            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
-            is_token_in_rank=is_token_in_rank,
-            num_tokens_per_expert=num_tokens_per_expert,
-            previous_event=previous_event,
-            async_finish=True,
-            allocate_on_comm_stream=True,
+            num_experts=self.total_expert_num_contain_redundancy,
+            num_max_tokens_per_rank=num_max_tokens_per_rank,
             expert_alignment=128,
+            num_sms=get_ep_num_sms(),
+            previous_event=overlap_event,
+            async_with_compute_stream=True,
+            allocate_on_comm_stream=True,
+            do_cpu_sync=True,
+            do_handle_copy=False,
         )
 
         def hook():
             event.current_stream_wait()
 
-        return recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, hook
+        return recv_x, recv_topk_idx, recv_topk_weights, handle.num_recv_tokens_per_expert_list, handle, hook
 
     def masked_group_gemm(
         self,
@@ -281,7 +261,7 @@ def prefilled_group_gemm(
             # groupgemm (contiguous layout)
             gemm_out_a = torch.empty((all_tokens, N), device=device, dtype=hidden_dtype)
 
-            _deepgemm_grouped_fp8_nt_contiguous(input_tensor, (w13_weight, w13_scale), gemm_out_a, m_indices)
+            deepgemm_grouped_fp8_nt_contiguous(input_tensor, (w13_weight, w13_scale), gemm_out_a, m_indices)
 
             # silu_and_mul_fwd + quant
             # TODO fused kernel
@@ -295,7 +275,7 @@ def prefilled_group_gemm(
             # groupgemm (contiguous layout)
             gemm_out_b = torch.empty((all_tokens, K), device=device, dtype=hidden_dtype)
 
-            _deepgemm_grouped_fp8_nt_contiguous(
+            deepgemm_grouped_fp8_nt_contiguous(
                 (qsilu_out, qsilu_out_scale), (w2_weight, w2_scale), gemm_out_b, m_indices
             )
             # gather and local reduce
@@ -319,7 +299,7 @@ def low_latency_combine(
         topk_weights: torch.Tensor,
         handle: Any,
     ):
-        combined_x, event_overlap, hook = dist_group_manager.ep_buffer.low_latency_combine(
+        combined_x, event_overlap, hook = dist_group_manager.ep_low_latency_buffer.low_latency_combine(
             gemm_out_b, topk_idx, topk_weights, handle, async_finish=False, return_recv_hook=True
         )
         return combined_x, hook
@@ -335,8 +315,9 @@ def combine(
             gemm_out_b,
             handle,
             topk_weights=None,
-            async_finish=True,
+            num_sms=get_ep_num_sms(),
             previous_event=overlap_event,
+            async_with_compute_stream=True,
             allocate_on_comm_stream=True,
         )