From c24103b83b493adddeecc893d3baa0976150801b Mon Sep 17 00:00:00 2001 From: AlpinDale Date: Tue, 4 Nov 2025 10:38:35 +0000 Subject: [PATCH 1/2] [spec] fix DeepSeek v3.2 MTP metadata and cuda graph Signed-off-by: AlpinDale --- aphrodite/v1/spec_decode/eagle.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/aphrodite/v1/spec_decode/eagle.py b/aphrodite/v1/spec_decode/eagle.py index 3d2b394684..a528f9b395 100644 --- a/aphrodite/v1/spec_decode/eagle.py +++ b/aphrodite/v1/spec_decode/eagle.py @@ -89,6 +89,7 @@ def __init__( (sorted(self.aphrodite_config.compilation_config.cudagraph_capture_sizes)) if self.use_cuda_graph else [] ) + self.use_cuda_graph = self.use_cuda_graph and bool(self.cudagraph_batch_sizes) # persistent buffers for cuda graph self.input_ids = torch.zeros(self.max_num_tokens, dtype=torch.int32, device=device) self.uses_mrope = self.aphrodite_config.model_config.uses_mrope @@ -824,7 +825,7 @@ def load_model(self, target_model: nn.Module) -> None: ) indexer_layers = get_layers_from_aphrodite_config(self.aphrodite_config, DeepseekV32IndexerCache) draft_indexer_layer_names = indexer_layers.keys() - target_indexer_layer_names - self.attn_layer_names = list(draft_attn_layer_names) + self.attn_layer_names = list(draft_attn_layer_names - draft_indexer_layer_names) self.indexer_layer_names = list(draft_indexer_layer_names) if self.indexer_layer_names: @@ -907,14 +908,16 @@ def dummy_run( num_tokens: int, use_cudagraphs=True, ) -> None: - if use_cudagraphs and num_tokens <= self.cudagraph_batch_sizes[-1]: + # Determine if CUDA graphs should be used for this run. + cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph + if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]: num_tokens = self.aphrodite_config.pad_for_cudagraph(num_tokens) with set_forward_context( None, self.aphrodite_config, num_tokens=num_tokens, - cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE if use_cudagraphs else CUDAGraphMode.NONE, + cudagraph_runtime_mode=(CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE), ): if self.supports_mm_inputs: input_ids = None From 1a4743722f4304f86691fb1f46e45949cce75bf9 Mon Sep 17 00:00:00 2001 From: AlpinDale Date: Tue, 4 Nov 2025 10:40:04 +0000 Subject: [PATCH 2/2] some oversights from previous PR Signed-off-by: AlpinDale --- aphrodite/platforms/cuda.py | 1 + aphrodite/platforms/xpu.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/aphrodite/platforms/cuda.py b/aphrodite/platforms/cuda.py index 249356d34e..75c05c0d36 100644 --- a/aphrodite/platforms/cuda.py +++ b/aphrodite/platforms/cuda.py @@ -380,6 +380,7 @@ def get_attn_backend_cls( logger.info_once( "Using FlexAttention backend for %s.", ", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()), + scope="global", ) return FLEX_ATTENTION_V1 diff --git a/aphrodite/platforms/xpu.py b/aphrodite/platforms/xpu.py index 89f850aaad..82cb8d3638 100644 --- a/aphrodite/platforms/xpu.py +++ b/aphrodite/platforms/xpu.py @@ -71,9 +71,7 @@ def get_attn_backend_cls( logger.info_once("Using Flash Attention backend.", scope="global") return FLASH_ATTN elif selected_backend: - raise ValueError( - f"Invalid attention backend for {cls.device_name}, with use_v1: {use_v1} use_mla: {use_mla}" - ) + raise ValueError(f"Invalid attention backend for {cls.device_name}, with use_mla: {use_mla}") logger.info_once("Using Flash Attention backend.", scope="global") return "aphrodite.v1.attention.backends.flash_attn.FlashAttentionBackend"