[spec] fix DeepSeek v3.2 MTP metadata and cuda graph (#1591)

AlpinDale · web-flow · commit 7fd7f4a53552 · 2025-11-04T15:12:29.000+04:30
* [spec] fix DeepSeek v3.2 MTP metadata and cuda graph

Signed-off-by: AlpinDale <alpindale@gmail.com>

* fix some oversights from the previous PR

Signed-off-by: AlpinDale <alpindale@gmail.com>

---------

Signed-off-by: AlpinDale <alpindale@gmail.com>
diff --git a/aphrodite/platforms/cuda.py b/aphrodite/platforms/cuda.py
@@ -380,6 +380,7 @@ def get_attn_backend_cls(
         logger.info_once(
             "Using FlexAttention backend for %s.",
             ", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()),
+            scope="global",
         )
         return FLEX_ATTENTION_V1
 
diff --git a/aphrodite/platforms/xpu.py b/aphrodite/platforms/xpu.py
@@ -71,9 +71,7 @@ def get_attn_backend_cls(
             logger.info_once("Using Flash Attention backend.", scope="global")
             return FLASH_ATTN
         elif selected_backend:
-            raise ValueError(
-                f"Invalid attention backend for {cls.device_name}, with use_v1: {use_v1} use_mla: {use_mla}"
-            )
+            raise ValueError(f"Invalid attention backend for {cls.device_name}, with use_mla: {use_mla}")
 
         logger.info_once("Using Flash Attention backend.", scope="global")
         return "aphrodite.v1.attention.backends.flash_attn.FlashAttentionBackend"
diff --git a/aphrodite/v1/spec_decode/eagle.py b/aphrodite/v1/spec_decode/eagle.py
@@ -89,6 +89,7 @@ def __init__(
             (sorted(self.aphrodite_config.compilation_config.cudagraph_capture_sizes)) if self.use_cuda_graph else []
         )
 
+        self.use_cuda_graph = self.use_cuda_graph and bool(self.cudagraph_batch_sizes)
         # persistent buffers for cuda graph
         self.input_ids = torch.zeros(self.max_num_tokens, dtype=torch.int32, device=device)
         self.uses_mrope = self.aphrodite_config.model_config.uses_mrope
@@ -824,7 +825,7 @@ def load_model(self, target_model: nn.Module) -> None:
         )
         indexer_layers = get_layers_from_aphrodite_config(self.aphrodite_config, DeepseekV32IndexerCache)
         draft_indexer_layer_names = indexer_layers.keys() - target_indexer_layer_names
-        self.attn_layer_names = list(draft_attn_layer_names)
+        self.attn_layer_names = list(draft_attn_layer_names - draft_indexer_layer_names)
         self.indexer_layer_names = list(draft_indexer_layer_names)
 
         if self.indexer_layer_names:
@@ -907,14 +908,16 @@ def dummy_run(
         num_tokens: int,
         use_cudagraphs=True,
     ) -> None:
-        if use_cudagraphs and num_tokens <= self.cudagraph_batch_sizes[-1]:
+        # Determine if CUDA graphs should be used for this run.
+        cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph
+        if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]:
             num_tokens = self.aphrodite_config.pad_for_cudagraph(num_tokens)
 
         with set_forward_context(
             None,
             self.aphrodite_config,
             num_tokens=num_tokens,
-            cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE if use_cudagraphs else CUDAGraphMode.NONE,
+            cudagraph_runtime_mode=(CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE),
         ):
             if self.supports_mm_inputs:
                 input_ids = None

Original file line number	Diff line number	Diff line change
`@@ -380,6 +380,7 @@ def get_attn_backend_cls(`
`380`	`380`	`logger.info_once(`
`381`	`381`	`"Using FlexAttention backend for %s.",`
`382`	`382`	`", ".join(f"{k}={v}" for k, v in use_flex_attention_reason.items()),`
	`383`	`+ scope="global",`
`383`	`384`	`)`
`384`	`385`	`return FLEX_ATTENTION_V1`
`385`	`386`