dphnAI
diff --git a/‎CMakeLists.txt‎
Lines changed: 7 additions & 3 deletions b/‎CMakeLists.txt‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎aphrodite/_aiter_ops.py‎
Lines changed: 1 addition & 1 deletion b/‎aphrodite/_aiter_ops.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎aphrodite/_custom_ops.py‎
Lines changed: 39 additions & 1 deletion b/‎aphrodite/_custom_ops.py‎
Lines changed: 39 additions & 1 deletion
diff --git a/‎aphrodite/config/aphrodite.py‎
Lines changed: 9 additions & 9 deletions b/‎aphrodite/config/aphrodite.py‎
Lines changed: 9 additions & 9 deletions
diff --git a/‎aphrodite/config/attention.py‎
Lines changed: 3 additions & 0 deletions b/‎aphrodite/config/attention.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎aphrodite/config/cache.py‎
Lines changed: 14 additions & 0 deletions b/‎aphrodite/config/cache.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎aphrodite/config/compilation.py‎
Lines changed: 1 addition & 0 deletions b/‎aphrodite/config/compilation.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎aphrodite/config/kernel.py‎
Lines changed: 2 additions & 0 deletions b/‎aphrodite/config/kernel.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎aphrodite/config/model.py‎
Lines changed: 6 additions & 2 deletions b/‎aphrodite/config/model.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎aphrodite/config/speculative.py‎
Lines changed: 9 additions & 1 deletion b/‎aphrodite/config/speculative.py‎
Lines changed: 9 additions & 1 deletion
@@ -281,7 +281,6 @@ set(APHRODITE_EXT_SRC
   "csrc/topk.cu"
   "csrc/cuda_view.cu"
   "csrc/dsv3_fused_a_gemm.cu"
-  "csrc/minimax_reduce_rms_kernel.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/w8a8/int8/scaled_quant.cu"
   "csrc/quantization/w8a8/fp8/common.cu"
@@ -293,7 +292,11 @@ set(APHRODITE_EXT_SRC
   "csrc/cpu/dry.cpp"
   "csrc/torch_bindings.cpp")
 
-  if(APHRODITE_GPU_LANG STREQUAL "CUDA")
+if(APHRODITE_GPU_LANG STREQUAL "CUDA")
+  list(APPEND APHRODITE_EXT_SRC
+    "csrc/minimax_reduce_rms_kernel.cu"
+    "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu")
+
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
@@ -945,7 +948,8 @@ if(APHRODITE_GPU_LANG STREQUAL "CUDA")
   list(APPEND APHRODITE_MOE_EXT_SRC
     "csrc/moe/moe_wna16.cu"
     "csrc/moe/grouped_topk_kernels.cu"
-    "csrc/moe/router_gemm.cu")
+    "csrc/moe/router_gemm.cu"
+    "csrc/moe/topk_softplus_sqrt_kernels.cu")
 endif()
 
 if(APHRODITE_GPU_LANG STREQUAL "CUDA")
 
@@ -1104,7 +1104,7 @@ class rocm_aiter_ops:
         The environment variables are assigned when the module is imported,
         so you can't change the environment variables after the module is imported.
         This is done out of performance consideration. Accessing environment variables
-        is expensive as described in issue https://github.com/aphrodite-project/aphrodite/issues/17067
+        is expensive as described in issue https://github.com/vllm-project/vllm/issues/17067
         so we don't want to do it repeatedly, especially in the hot path (the forward pass).
         You can call the refresh_env_variables() function to reload the env variables
         after monkey patching the env variables in the unit test.
 
@@ -646,8 +646,22 @@ def rotary_embedding(
     head_size: int,
     cos_sin_cache: torch.Tensor,
     is_neox: bool,
+    rope_dim_offset: int = 0,
+    inverse: bool = False,
 ) -> None:
-    torch.ops._C.rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox)
+    if rope_dim_offset == 0 and not inverse:
+        torch.ops._C.rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox)
+    else:
+        torch.ops._C.rotary_embedding(
+            positions,
+            query,
+            key,
+            head_size,
+            cos_sin_cache,
+            is_neox,
+            rope_dim_offset,
+            inverse,
+        )
 
 
 # layer norm ops
@@ -2672,6 +2686,30 @@ def topk_sigmoid(
     )
 
 
+def topk_hash_softplus_sqrt(
+    topk_weights: torch.Tensor,
+    topk_indices: torch.Tensor,
+    token_expert_indices: torch.Tensor,
+    gating_output: torch.Tensor,
+    renormalize: bool = False,
+    routed_scaling_factor: float = 1.0,
+    e_score_correction_bias: torch.Tensor | None = None,
+    input_tokens: torch.Tensor | None = None,
+    hash_indices_table: torch.Tensor | None = None,
+) -> None:
+    torch.ops._moe_C.topk_softplus_sqrt(
+        topk_weights,
+        topk_indices,
+        token_expert_indices,
+        gating_output,
+        renormalize,
+        routed_scaling_factor,
+        e_score_correction_bias,
+        input_tokens,
+        hash_indices_table,
+    )
+
+
 def grouped_topk(
     scores: torch.Tensor,
     num_expert_group: int,
 
@@ -91,7 +91,7 @@ class OptimizationLevel(IntEnum):
 # if model_config is not None:
 #     IS_QUANTIZED = lambda c: c.model_config.is_quantized()
 #     IS_DENSE = lambda c: not c.model_config.is_model_moe()
-# See https://github.com/aphrodite-project/aphrodite/issues/25689.
+# See https://github.com/vllm-project/vllm/issues/25689.
 
 
 def enable_norm_fusion(cfg: "AphroditeConfig") -> bool:
@@ -129,10 +129,10 @@ def enable_allreduce_rms_fusion(cfg: "AphroditeConfig") -> bool:
         and has_flashinfer()
         and (current_platform.is_device_capability_family(100) or current_platform.is_device_capability(90))
         # tp-dp combination broken:
-        # https://github.com/aphrodite-project/aphrodite/issues/34458
+        # https://github.com/vllm-project/vllm/issues/34458
         and cfg.parallel_config.data_parallel_size == 1
         # tp-pp combination broken:
-        # https://github.com/aphrodite-project/aphrodite/issues/35426
+        # https://github.com/vllm-project/vllm/issues/35426
         and cfg.parallel_config.pipeline_parallel_size == 1
     )
 
@@ -881,7 +881,7 @@ def has_blocked_weights():
         # Enable quant_fp8 CUDA ops (TODO disable in follow up)
         # On H100 the CUDA kernel is faster than
         # native implementation
-        # https://github.com/aphrodite-project/aphrodite/issues/25094
+        # https://github.com/vllm-project/vllm/issues/25094
         if has_blocked_weights():
             custom_ops = self.compilation_config.custom_ops
             if "-quant_fp8" not in custom_ops:
@@ -1103,7 +1103,7 @@ def has_blocked_weights():
             # incorrect residual shape.
             # Use custom rms norm to unblock. In the future,
             # the pass will operate on higher-level IR to avoid the issue.
-            # TODO: https://github.com/aphrodite-project/aphrodite/issues/27894
+            # TODO: https://github.com/vllm-project/vllm/issues/27894
             if self.compilation_config.mode != CompilationMode.APHRODITE_COMPILE:
                 logger.warning(
                     "Sequence parallelism is enabled, but running in wrong aphrodite compile mode: %s.",
@@ -1246,7 +1246,7 @@ def has_blocked_weights():
         # Enable quant_fp8 CUDA ops (TODO disable in follow up)
         # On H100 the CUDA kernel is faster than
         # native implementation
-        # https://github.com/aphrodite-project/aphrodite/issues/25094
+        # https://github.com/vllm-project/vllm/issues/25094
         if has_blocked_weights():
             custom_ops = self.compilation_config.custom_ops
             if "-quant_fp8" not in custom_ops:
@@ -1657,18 +1657,18 @@ def _validate_v2_model_runner(self) -> None:
             unsupported.append("dual batch overlap")
 
         if self.model_config is not None and self.model_config.enable_return_routed_experts:
-            # Will be added by https://github.com/aphrodite-project/aphrodite/pull/38163
+            # Will be added by https://github.com/vllm-project/vllm/pull/38163
             unsupported.append("routed experts capture")
 
         if self.model_config is not None and self.model_config.logits_processors:
             unsupported.append("custom logits processors")
 
         if self.cache_config.kv_sharing_fast_prefill:
-            # Will be added by https://github.com/aphrodite-project/aphrodite/pull/35045
+            # Will be added by https://github.com/vllm-project/vllm/pull/35045
             unsupported.append("KV sharing fast prefill")
 
         if self.ec_transfer_config is not None:
-            # Will be added by https://github.com/aphrodite-project/aphrodite/pull/38390
+            # Will be added by https://github.com/vllm-project/vllm/pull/38390
             unsupported.append("EC transfer")
 
         if unsupported:
 
@@ -51,6 +51,9 @@ class AttentionConfig:
     use_prefill_query_quantization: bool = False
     """If set, quantize query for attention in prefill."""
 
+    use_fp4_indexer_cache: bool = False
+    """If set, use fp4 indexer cache for dsv32 family models (not supported yet)."""
+
     def compute_hash(self) -> str:
         """
         Provide a hash that uniquely identifies all the configs
 
@@ -51,6 +51,18 @@ class CacheConfig:
     """Whether block_size was explicitly provided. Derived automatically."""
     user_specified_mamba_block_size: bool = field(default=False, init=False)
     """Whether mamba_block_size was explicitly provided. Derived automatically."""
+    hash_block_size: SkipValidation[int] | None = None  # type: ignore
+    """Block size (in tokens) used for computing Request's block_hashes.
+
+    This can be set to a finer granularity than the physical KV cache block
+    sizes (e.g. 8) as long as every KV cache group's `block_size` is divisible
+    by it. This enables prefix-caching keys to be computed at the finest common
+    granularity and then merged for larger physical block sizes.
+
+    This config has no static default. If left unspecified, vLLM will choose a
+    default based on the resolved KV cache groups (typically the smallest KV
+    cache block size when there are multiple groups).
+    """
     gpu_memory_utilization: float = Field(default=0.92, gt=0, le=1)
     """The fraction of GPU memory to be used for the model executor, which can
     range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
@@ -182,6 +194,8 @@ def compute_hash(self) -> str:
             "num_gpu_blocks_override",
             "enable_prefix_caching",
             "prefix_caching_hash_algo",
+            # Prefix-caching implementation detail (doesn't affect compiled graph).
+            "hash_block_size",
             "mamba_page_size_padded",
             "user_specified_block_size",
             "user_specified_mamba_block_size",
 
@@ -739,6 +739,7 @@ class CompilationConfig:
         "aphrodite::kda_attention",
         "aphrodite::sparse_attn_indexer",
         "aphrodite::rocm_aiter_sparse_attn_indexer",
+        "aphrodite::deepseek_v4_attention",
     ]
 
     def compute_hash(self) -> str:
 
@@ -101,6 +101,7 @@ def with_default(cls, default: list[str], /, **kwargs: list[str]) -> "IrOpPriori
     "auto",
     "triton",
     "deep_gemm",
+    "deep_gemm_mega_moe",
     "cutlass",
     "flashinfer_trtllm",
     "flashinfer_cutlass",
@@ -130,6 +131,7 @@ class KernelConfig:
     - "auto": Automatically select the best backend based on model and hardware
     - "triton": Use Triton-based fused MoE kernels
     - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
+    - "deep_gemm_mega_moe": Use DeepGEMM mega MoE kernels
     - "cutlass": Use Aphrodite CUTLASS kernels
     - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
     - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
 
@@ -81,10 +81,10 @@
 
 logger = init_logger(__name__)
 
-RunnerOption = Literal["auto", RunnerType]
+RunnerOption = Literal["auto", "generate", "pooling", "draft"]
 ConvertType = Literal["none", "embed", "classify"]
 ConvertOption = Literal["auto", ConvertType]
-TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
+TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32", "deepseek_v4"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
 LogprobsMode = Literal["raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"]
 HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig]
@@ -131,6 +131,7 @@ class ModelConfig:
     - "slow" will always use the slow tokenizer.
     - "mistral" will always use the tokenizer from `mistral_common`.
     - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
+    - "deepseek_v4" will always use the tokenizer from `deepseek_v4`.
     - "qwen_vl" will always use the tokenizer from `qwen_vl`.
     - Other custom values can be supported via plugins."""
     trust_remote_code: bool = False
@@ -555,6 +556,8 @@ def __post_init__(
                 self.tokenizer_mode = "qwen_vl"
             elif arch == "DeepseekV32ForCausalLM":
                 self.tokenizer_mode = "deepseek_v32"
+            elif arch == "DeepseekV4ForCausalLM":
+                self.tokenizer_mode = "deepseek_v4"
 
             if self.tokenizer_mode != "auto":
                 logger.info(
@@ -924,6 +927,7 @@ def _verify_quantization(self) -> None:
                 # imports during override detection (e.g., MXFP4 imports Triton)
                 "mxfp4",
                 "gpt_oss_mxfp4",
+                "deepseek_v4_fp8",
                 "cpu_awq",
                 "humming",
                 "gguf",
 
@@ -272,11 +272,19 @@ def compute_hash(self) -> str:
     @staticmethod
     def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig:
         initial_architecture = hf_config.architectures[0]
-        if hf_config.model_type in ("deepseek_v3", "deepseek_v32", "glm_moe_dsa"):
+        if hf_config.model_type in (
+            "deepseek_v3",
+            "deepseek_v32",
+            "glm_moe_dsa",
+        ):
             hf_config.model_type = "deepseek_mtp"
         if hf_config.model_type == "deepseek_mtp":
             n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
             hf_config.update({"n_predict": n_predict, "architectures": ["DeepSeekMTPModel"]})
+        if hf_config.model_type == "deepseek_v4":
+            hf_config.model_type = "deepseek_mtp"
+            n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
+            hf_config.update({"n_predict": n_predict, "architectures": ["DeepSeekV4MTPModel"]})
         if hf_config.model_type in ("pangu_ultra_moe"):
             hf_config.model_type = "pangu_ultra_moe_mtp"
         if hf_config.model_type == "pangu_ultra_moe_mtp":
Original file line number	Diff line number	Diff line change
`@@ -739,6 +739,7 @@ class CompilationConfig:`
`739`	`739`	`"aphrodite::kda_attention",`
`740`	`740`	`"aphrodite::sparse_attn_indexer",`
`741`	`741`	`"aphrodite::rocm_aiter_sparse_attn_indexer",`
	`742`	`+ "aphrodite::deepseek_v4_attention",`
`742`	`743`	`]`
`743`	`744`
`744`	`745`	`def compute_hash(self) -> str:`