Skip to content

Commit 61aad7c

Browse files
authored
feat: implement DeepSeek-V4 model (#1651)
* feat: implement DeepSeek-V4 model Signed-off-by: AlpinDale <alpindale@gmail.com> * fix: namespace issues Signed-off-by: AlpinDale <alpindale@gmail.com> * fix: cmake context Signed-off-by: AlpinDale <alpindale@gmail.com> --------- Signed-off-by: AlpinDale <alpindale@gmail.com>
1 parent 19b1e78 commit 61aad7c

132 files changed

Lines changed: 12181 additions & 646 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CMakeLists.txt

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,6 @@ set(APHRODITE_EXT_SRC
281281
"csrc/topk.cu"
282282
"csrc/cuda_view.cu"
283283
"csrc/dsv3_fused_a_gemm.cu"
284-
"csrc/minimax_reduce_rms_kernel.cu"
285284
"csrc/quantization/gptq/q_gemm.cu"
286285
"csrc/quantization/w8a8/int8/scaled_quant.cu"
287286
"csrc/quantization/w8a8/fp8/common.cu"
@@ -293,7 +292,11 @@ set(APHRODITE_EXT_SRC
293292
"csrc/cpu/dry.cpp"
294293
"csrc/torch_bindings.cpp")
295294

296-
if(APHRODITE_GPU_LANG STREQUAL "CUDA")
295+
if(APHRODITE_GPU_LANG STREQUAL "CUDA")
296+
list(APPEND APHRODITE_EXT_SRC
297+
"csrc/minimax_reduce_rms_kernel.cu"
298+
"csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu")
299+
297300
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
298301

299302
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
@@ -945,7 +948,8 @@ if(APHRODITE_GPU_LANG STREQUAL "CUDA")
945948
list(APPEND APHRODITE_MOE_EXT_SRC
946949
"csrc/moe/moe_wna16.cu"
947950
"csrc/moe/grouped_topk_kernels.cu"
948-
"csrc/moe/router_gemm.cu")
951+
"csrc/moe/router_gemm.cu"
952+
"csrc/moe/topk_softplus_sqrt_kernels.cu")
949953
endif()
950954

951955
if(APHRODITE_GPU_LANG STREQUAL "CUDA")

aphrodite/_aiter_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1104,7 +1104,7 @@ class rocm_aiter_ops:
11041104
The environment variables are assigned when the module is imported,
11051105
so you can't change the environment variables after the module is imported.
11061106
This is done out of performance consideration. Accessing environment variables
1107-
is expensive as described in issue https://github.com/aphrodite-project/aphrodite/issues/17067
1107+
is expensive as described in issue https://github.com/vllm-project/vllm/issues/17067
11081108
so we don't want to do it repeatedly, especially in the hot path (the forward pass).
11091109
You can call the refresh_env_variables() function to reload the env variables
11101110
after monkey patching the env variables in the unit test.

aphrodite/_custom_ops.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -646,8 +646,22 @@ def rotary_embedding(
646646
head_size: int,
647647
cos_sin_cache: torch.Tensor,
648648
is_neox: bool,
649+
rope_dim_offset: int = 0,
650+
inverse: bool = False,
649651
) -> None:
650-
torch.ops._C.rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox)
652+
if rope_dim_offset == 0 and not inverse:
653+
torch.ops._C.rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox)
654+
else:
655+
torch.ops._C.rotary_embedding(
656+
positions,
657+
query,
658+
key,
659+
head_size,
660+
cos_sin_cache,
661+
is_neox,
662+
rope_dim_offset,
663+
inverse,
664+
)
651665

652666

653667
# layer norm ops
@@ -2672,6 +2686,30 @@ def topk_sigmoid(
26722686
)
26732687

26742688

2689+
def topk_hash_softplus_sqrt(
2690+
topk_weights: torch.Tensor,
2691+
topk_indices: torch.Tensor,
2692+
token_expert_indices: torch.Tensor,
2693+
gating_output: torch.Tensor,
2694+
renormalize: bool = False,
2695+
routed_scaling_factor: float = 1.0,
2696+
e_score_correction_bias: torch.Tensor | None = None,
2697+
input_tokens: torch.Tensor | None = None,
2698+
hash_indices_table: torch.Tensor | None = None,
2699+
) -> None:
2700+
torch.ops._moe_C.topk_softplus_sqrt(
2701+
topk_weights,
2702+
topk_indices,
2703+
token_expert_indices,
2704+
gating_output,
2705+
renormalize,
2706+
routed_scaling_factor,
2707+
e_score_correction_bias,
2708+
input_tokens,
2709+
hash_indices_table,
2710+
)
2711+
2712+
26752713
def grouped_topk(
26762714
scores: torch.Tensor,
26772715
num_expert_group: int,

aphrodite/config/aphrodite.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ class OptimizationLevel(IntEnum):
9191
# if model_config is not None:
9292
# IS_QUANTIZED = lambda c: c.model_config.is_quantized()
9393
# IS_DENSE = lambda c: not c.model_config.is_model_moe()
94-
# See https://github.com/aphrodite-project/aphrodite/issues/25689.
94+
# See https://github.com/vllm-project/vllm/issues/25689.
9595

9696

9797
def enable_norm_fusion(cfg: "AphroditeConfig") -> bool:
@@ -129,10 +129,10 @@ def enable_allreduce_rms_fusion(cfg: "AphroditeConfig") -> bool:
129129
and has_flashinfer()
130130
and (current_platform.is_device_capability_family(100) or current_platform.is_device_capability(90))
131131
# tp-dp combination broken:
132-
# https://github.com/aphrodite-project/aphrodite/issues/34458
132+
# https://github.com/vllm-project/vllm/issues/34458
133133
and cfg.parallel_config.data_parallel_size == 1
134134
# tp-pp combination broken:
135-
# https://github.com/aphrodite-project/aphrodite/issues/35426
135+
# https://github.com/vllm-project/vllm/issues/35426
136136
and cfg.parallel_config.pipeline_parallel_size == 1
137137
)
138138

@@ -881,7 +881,7 @@ def has_blocked_weights():
881881
# Enable quant_fp8 CUDA ops (TODO disable in follow up)
882882
# On H100 the CUDA kernel is faster than
883883
# native implementation
884-
# https://github.com/aphrodite-project/aphrodite/issues/25094
884+
# https://github.com/vllm-project/vllm/issues/25094
885885
if has_blocked_weights():
886886
custom_ops = self.compilation_config.custom_ops
887887
if "-quant_fp8" not in custom_ops:
@@ -1103,7 +1103,7 @@ def has_blocked_weights():
11031103
# incorrect residual shape.
11041104
# Use custom rms norm to unblock. In the future,
11051105
# the pass will operate on higher-level IR to avoid the issue.
1106-
# TODO: https://github.com/aphrodite-project/aphrodite/issues/27894
1106+
# TODO: https://github.com/vllm-project/vllm/issues/27894
11071107
if self.compilation_config.mode != CompilationMode.APHRODITE_COMPILE:
11081108
logger.warning(
11091109
"Sequence parallelism is enabled, but running in wrong aphrodite compile mode: %s.",
@@ -1246,7 +1246,7 @@ def has_blocked_weights():
12461246
# Enable quant_fp8 CUDA ops (TODO disable in follow up)
12471247
# On H100 the CUDA kernel is faster than
12481248
# native implementation
1249-
# https://github.com/aphrodite-project/aphrodite/issues/25094
1249+
# https://github.com/vllm-project/vllm/issues/25094
12501250
if has_blocked_weights():
12511251
custom_ops = self.compilation_config.custom_ops
12521252
if "-quant_fp8" not in custom_ops:
@@ -1657,18 +1657,18 @@ def _validate_v2_model_runner(self) -> None:
16571657
unsupported.append("dual batch overlap")
16581658

16591659
if self.model_config is not None and self.model_config.enable_return_routed_experts:
1660-
# Will be added by https://github.com/aphrodite-project/aphrodite/pull/38163
1660+
# Will be added by https://github.com/vllm-project/vllm/pull/38163
16611661
unsupported.append("routed experts capture")
16621662

16631663
if self.model_config is not None and self.model_config.logits_processors:
16641664
unsupported.append("custom logits processors")
16651665

16661666
if self.cache_config.kv_sharing_fast_prefill:
1667-
# Will be added by https://github.com/aphrodite-project/aphrodite/pull/35045
1667+
# Will be added by https://github.com/vllm-project/vllm/pull/35045
16681668
unsupported.append("KV sharing fast prefill")
16691669

16701670
if self.ec_transfer_config is not None:
1671-
# Will be added by https://github.com/aphrodite-project/aphrodite/pull/38390
1671+
# Will be added by https://github.com/vllm-project/vllm/pull/38390
16721672
unsupported.append("EC transfer")
16731673

16741674
if unsupported:

aphrodite/config/attention.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ class AttentionConfig:
5151
use_prefill_query_quantization: bool = False
5252
"""If set, quantize query for attention in prefill."""
5353

54+
use_fp4_indexer_cache: bool = False
55+
"""If set, use an FP4 indexer cache for DeepSeek-V3.2 family models (not supported yet)."""
56+
5457
def compute_hash(self) -> str:
5558
"""
5659
Provide a hash that uniquely identifies all the configs

aphrodite/config/cache.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,18 @@ class CacheConfig:
5151
"""Whether block_size was explicitly provided. Derived automatically."""
5252
user_specified_mamba_block_size: bool = field(default=False, init=False)
5353
"""Whether mamba_block_size was explicitly provided. Derived automatically."""
54+
hash_block_size: SkipValidation[int] | None = None # type: ignore
55+
"""Block size (in tokens) used for computing Request's block_hashes.
56+
57+
This can be set to a finer granularity than the physical KV cache block
58+
sizes (e.g. 8) as long as every KV cache group's `block_size` is divisible
59+
by it. This enables prefix-caching keys to be computed at the finest common
60+
granularity and then merged for larger physical block sizes.
61+
62+
This option has no static default. If left unspecified, Aphrodite will choose a
63+
default based on the resolved KV cache groups (typically the smallest KV
64+
cache block size when there are multiple groups).
65+
"""
5466
gpu_memory_utilization: float = Field(default=0.92, gt=0, le=1)
5567
"""The fraction of GPU memory to be used for the model executor, which can
5668
range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
@@ -182,6 +194,8 @@ def compute_hash(self) -> str:
182194
"num_gpu_blocks_override",
183195
"enable_prefix_caching",
184196
"prefix_caching_hash_algo",
197+
# Prefix-caching implementation detail (doesn't affect compiled graph).
198+
"hash_block_size",
185199
"mamba_page_size_padded",
186200
"user_specified_block_size",
187201
"user_specified_mamba_block_size",

aphrodite/config/compilation.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,7 @@ class CompilationConfig:
739739
"aphrodite::kda_attention",
740740
"aphrodite::sparse_attn_indexer",
741741
"aphrodite::rocm_aiter_sparse_attn_indexer",
742+
"aphrodite::deepseek_v4_attention",
742743
]
743744

744745
def compute_hash(self) -> str:

aphrodite/config/kernel.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ def with_default(cls, default: list[str], /, **kwargs: list[str]) -> "IrOpPriori
101101
"auto",
102102
"triton",
103103
"deep_gemm",
104+
"deep_gemm_mega_moe",
104105
"cutlass",
105106
"flashinfer_trtllm",
106107
"flashinfer_cutlass",
@@ -130,6 +131,7 @@ class KernelConfig:
130131
- "auto": Automatically select the best backend based on model and hardware
131132
- "triton": Use Triton-based fused MoE kernels
132133
- "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
134+
- "deep_gemm_mega_moe": Use DeepGEMM mega MoE kernels
133135
- "cutlass": Use Aphrodite CUTLASS kernels
134136
- "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
135137
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels

aphrodite/config/model.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,10 @@
8181

8282
logger = init_logger(__name__)
8383

84-
RunnerOption = Literal["auto", RunnerType]
84+
RunnerOption = Literal["auto", "generate", "pooling", "draft"]
8585
ConvertType = Literal["none", "embed", "classify"]
8686
ConvertOption = Literal["auto", ConvertType]
87-
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
87+
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32", "deepseek_v4"]
8888
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
8989
LogprobsMode = Literal["raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"]
9090
HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig]
@@ -131,6 +131,7 @@ class ModelConfig:
131131
- "slow" will always use the slow tokenizer.
132132
- "mistral" will always use the tokenizer from `mistral_common`.
133133
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
134+
- "deepseek_v4" will always use the tokenizer from `deepseek_v4`.
134135
- "qwen_vl" will always use the tokenizer from `qwen_vl`.
135136
- Other custom values can be supported via plugins."""
136137
trust_remote_code: bool = False
@@ -555,6 +556,8 @@ def __post_init__(
555556
self.tokenizer_mode = "qwen_vl"
556557
elif arch == "DeepseekV32ForCausalLM":
557558
self.tokenizer_mode = "deepseek_v32"
559+
elif arch == "DeepseekV4ForCausalLM":
560+
self.tokenizer_mode = "deepseek_v4"
558561

559562
if self.tokenizer_mode != "auto":
560563
logger.info(
@@ -924,6 +927,7 @@ def _verify_quantization(self) -> None:
924927
# imports during override detection (e.g., MXFP4 imports Triton)
925928
"mxfp4",
926929
"gpt_oss_mxfp4",
930+
"deepseek_v4_fp8",
927931
"cpu_awq",
928932
"humming",
929933
"gguf",

aphrodite/config/speculative.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,11 +272,19 @@ def compute_hash(self) -> str:
272272
@staticmethod
273273
def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
274274
initial_architecture = hf_config.architectures[0]
275-
if hf_config.model_type in ("deepseek_v3", "deepseek_v32", "glm_moe_dsa"):
275+
if hf_config.model_type in (
276+
"deepseek_v3",
277+
"deepseek_v32",
278+
"glm_moe_dsa",
279+
):
276280
hf_config.model_type = "deepseek_mtp"
277281
if hf_config.model_type == "deepseek_mtp":
278282
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
279283
hf_config.update({"n_predict": n_predict, "architectures": ["DeepSeekMTPModel"]})
284+
if hf_config.model_type == "deepseek_v4":
285+
hf_config.model_type = "deepseek_mtp"
286+
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
287+
hf_config.update({"n_predict": n_predict, "architectures": ["DeepSeekV4MTPModel"]})
280288
if hf_config.model_type in ("pangu_ultra_moe"):
281289
hf_config.model_type = "pangu_ultra_moe_mtp"
282290
if hf_config.model_type == "pangu_ultra_moe_mtp":

0 commit comments

Comments
 (0)