[Spec] fold can_run_cuda_graph into EagleVerifyOutput; drop dead extend-after-decode check (sgl-project#25566)

hnyls2002 · web-flow · commit f01a1d2cf9ca · 2026-05-18T14:24:54.000-07:00
diff --git a/python/sglang/srt/speculative/eagle_info.py b/python/sglang/srt/speculative/eagle_info.py
@@ -1012,6 +1012,10 @@ class EagleVerifyOutput:
     num_correct_drafts_per_req_cpu: List[int]
     # Accepted indices from logits_output.next_token_logits
     accept_indices: torch.Tensor
+    # Whether the target verify forward ran a captured cuda graph. Set by
+    # the worker after `EagleVerifyInput.sample` returns; default kept so
+    # idle / direct constructions don't have to pass it.
+    can_run_cuda_graph: bool = False
 
     @classmethod
     def create_idle(
diff --git a/python/sglang/srt/speculative/eagle_worker.py b/python/sglang/srt/speculative/eagle_worker.py
@@ -5,7 +5,6 @@
 
 import torch
 
-from sglang.srt.distributed import get_tp_group
 from sglang.srt.hardware_backend.npu.graph_runner.eagle_draft_npu_graph_runner import (
     EAGLEDraftNpuGraphRunner,
 )
@@ -162,7 +161,7 @@ def __init__(
                 server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-                pp_rank=0,  # FIXME
+                pp_rank=0,  # spec workers don't support pipeline parallelism
                 dp_rank=dp_rank,
                 moe_ep_rank=moe_ep_rank,
                 attn_cp_rank=attn_cp_rank,
@@ -492,8 +491,9 @@ def forward_batch_generation(self, batch: ScheduleBatch) -> GenerationBatchResul
             set_time_batch(batch.reqs, "set_spec_draft_end_time", trace_only=True)
             set_time_batch(batch.reqs, "set_spec_verify_start_time", trace_only=True)
 
+            # Install verify_input as `batch.spec_info` for the verify forward.
             batch.spec_info = verify_input
-            logits_output, verify_output, can_run_cuda_graph = self.verify(batch)
+            verify_output = self.verify(batch)
 
             if get_global_tracing_enabled():
                 for idx, req in enumerate(batch.reqs):
@@ -520,8 +520,9 @@ def forward_batch_generation(self, batch: ScheduleBatch) -> GenerationBatchResul
                     self.server_args.enable_dp_attention
                     or draft_extend_input.input_ids.shape[0] > 0
                 ):
-                    # decode is not finished; stash for extend, then restash
-                    # the next-iter EagleDraftInput it returns.
+                    # decode is not finished; install draft_extend_input for
+                    # the extend forward, then install the next-iter
+                    # EagleDraftInput it returns.
                     batch.spec_info = draft_extend_input
                     next_draft_input = self.forward_draft_extend_after_decode(batch)
                     batch.spec_info = next_draft_input
@@ -542,31 +543,13 @@ def forward_batch_generation(self, batch: ScheduleBatch) -> GenerationBatchResul
                 )
 
             return GenerationBatchResult(
-                logits_output=logits_output,
+                logits_output=verify_output.logits_output,
                 next_token_ids=verify_output.accept_tokens,
                 num_correct_drafts=sum(verify_output.num_correct_drafts_per_req_cpu),
                 num_correct_drafts_per_req_cpu=verify_output.num_correct_drafts_per_req_cpu,
-                can_run_cuda_graph=can_run_cuda_graph,
+                can_run_cuda_graph=verify_output.can_run_cuda_graph,
             )
 
-    def check_forward_draft_extend_after_decode(self, verify_output: EagleVerifyOutput):
-        local_need_forward = verify_output.draft_extend_input.input_ids.shape[0] > 0
-        if not self.server_args.enable_dp_attention:
-            return local_need_forward
-
-        global_need_forward = torch.tensor(
-            [
-                (local_need_forward),
-            ],
-            dtype=torch.int64,
-        )
-        torch.distributed.all_reduce(
-            global_need_forward, group=get_tp_group().cpu_group
-        )
-        global_need_forward_cnt = global_need_forward[0].item()
-        need_forward = global_need_forward_cnt > 0
-        return need_forward
-
     def forward_target_extend(
         self, batch: ScheduleBatch
     ) -> Tuple[LogitsProcessorOutput, torch.Tensor, Optional[torch.Tensor], bool]:
@@ -1015,7 +998,8 @@ def verify(self, batch: ScheduleBatch):
             ForwardMode.DECODE if not batch.forward_mode.is_idle() else ForwardMode.IDLE
         )
 
-        return logits_output, res, can_run_cuda_graph
+        res.can_run_cuda_graph = can_run_cuda_graph
+        return res
 
     def _mamba_verify_update(
         self,
diff --git a/python/sglang/srt/speculative/eagle_worker_v2.py b/python/sglang/srt/speculative/eagle_worker_v2.py
@@ -154,7 +154,7 @@ def __init__(
                 server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-                pp_rank=0,  # FIXME
+                pp_rank=0,  # spec workers don't support pipeline parallelism
                 dp_rank=dp_rank,
                 moe_ep_rank=moe_ep_rank,
                 attn_cp_rank=attn_cp_rank,
diff --git a/python/sglang/srt/speculative/frozen_kv_mtp_worker.py b/python/sglang/srt/speculative/frozen_kv_mtp_worker.py
@@ -449,8 +449,9 @@ def forward_batch_generation(self, batch: ScheduleBatch) -> GenerationBatchResul
         set_time_batch(batch.reqs, "set_spec_draft_end_time", trace_only=True)
         set_time_batch(batch.reqs, "set_spec_verify_start_time", trace_only=True)
 
+        # Install verify_input as `batch.spec_info` for the verify forward.
         batch.spec_info = verify_input
-        logits_output, verify_output, can_run_cuda_graph = self.verify(batch)
+        verify_output = self.verify(batch)
 
         if get_global_tracing_enabled():
             for idx, req in enumerate(batch.reqs):
@@ -470,18 +471,19 @@ def forward_batch_generation(self, batch: ScheduleBatch) -> GenerationBatchResul
                 self.server_args.enable_dp_attention
                 or draft_extend_input.input_ids.shape[0] > 0
             ):
-                # Stash for the seed step; _run_assistant_seed_step swaps in
-                # a fresh FrozenKVMTPDraftInput for next iter.
+                # Install draft_extend_input as `batch.spec_info` for the seed
+                # step; `_run_assistant_seed_step` replaces it with a fresh
+                # `FrozenKVMTPDraftInput` for next iter.
                 batch.spec_info = draft_extend_input
                 self.forward_draft_extend_after_decode(batch)
         set_time_batch(batch.reqs, "set_spec_draft_extend_end_time", trace_only=True)
 
         return GenerationBatchResult(
-            logits_output=logits_output,
+            logits_output=verify_output.logits_output,
             next_token_ids=verify_output.accept_tokens,
             num_correct_drafts=sum(verify_output.num_correct_drafts_per_req_cpu),
             num_correct_drafts_per_req_cpu=verify_output.num_correct_drafts_per_req_cpu,
-            can_run_cuda_graph=can_run_cuda_graph,
+            can_run_cuda_graph=verify_output.can_run_cuda_graph,
         )
 
     def forward_target_extend(
@@ -518,7 +520,7 @@ def forward_draft_extend_after_decode(self, batch: ScheduleBatch) -> None:
         input_is_idle = batch.forward_mode.is_idle()
 
         if not input_is_idle and draft_extend_input.input_ids.shape[0] == 0:
-            # All reqs finished; stash an idle FrozenKVMTPDraftInput so the
+            # All reqs finished. Install an idle FrozenKVMTPDraftInput so the
             # next-iter draft sees a valid spec_info.
             batch = batch.copy()
             batch.prepare_for_idle()
@@ -775,4 +777,5 @@ def verify(self, batch: ScheduleBatch):
         )
 
         del seq_lens_pre_verify
-        return logits_output, res, can_run_cuda_graph
+        res.can_run_cuda_graph = can_run_cuda_graph
+        return res
diff --git a/python/sglang/srt/speculative/multi_layer_eagle_worker.py b/python/sglang/srt/speculative/multi_layer_eagle_worker.py
@@ -18,7 +18,6 @@
 
 import torch
 
-from sglang.srt.distributed import get_tp_group
 from sglang.srt.layers.dp_attention import get_attention_tp_group
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.layers.moe.utils import speculative_moe_backend_context
@@ -136,7 +135,7 @@ def __init__(
                 server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-                pp_rank=0,  # FIXME
+                pp_rank=0,  # spec workers don't support pipeline parallelism
                 dp_rank=dp_rank,
                 moe_ep_rank=moe_ep_rank,
                 attn_cp_rank=attn_cp_rank,
@@ -293,8 +292,9 @@ def forward_batch_generation(self, batch: ScheduleBatch) -> GenerationBatchResul
             set_time_batch(batch.reqs, "set_spec_draft_end_time", trace_only=True)
             set_time_batch(batch.reqs, "set_spec_verify_start_time", trace_only=True)
 
+            # Install verify_input as `batch.spec_info` for the verify forward.
             batch.spec_info = verify_input
-            logits_output, verify_output, can_run_cuda_graph = self.verify(batch)
+            verify_output = self.verify(batch)
 
             if get_global_tracing_enabled():
                 for idx, req in enumerate(batch.reqs):
@@ -320,8 +320,9 @@ def forward_batch_generation(self, batch: ScheduleBatch) -> GenerationBatchResul
                     self.server_args.enable_dp_attention
                     or draft_extend_input.input_ids.shape[0] > 0
                 ):
-                    # decode is not finished; stash for extend, then restash
-                    # the next-iter EagleDraftInput it returns.
+                    # decode is not finished; install draft_extend_input for
+                    # the extend forward, then install the next-iter
+                    # EagleDraftInput it returns.
                     batch.spec_info = draft_extend_input
                     next_draft_input = self.forward_draft_extend_after_decode(batch)
                     batch.spec_info = next_draft_input
@@ -337,31 +338,13 @@ def forward_batch_generation(self, batch: ScheduleBatch) -> GenerationBatchResul
             )
 
             return GenerationBatchResult(
-                logits_output=logits_output,
+                logits_output=verify_output.logits_output,
                 next_token_ids=verify_output.accept_tokens,
                 num_correct_drafts=sum(verify_output.num_correct_drafts_per_req_cpu),
                 num_correct_drafts_per_req_cpu=verify_output.num_correct_drafts_per_req_cpu,
-                can_run_cuda_graph=can_run_cuda_graph,
+                can_run_cuda_graph=verify_output.can_run_cuda_graph,
             )
 
-    def check_forward_draft_extend_after_decode(self, verify_output: EagleVerifyOutput):
-        local_need_forward = verify_output.draft_extend_input.input_ids.shape[0] > 0
-        if not self.server_args.enable_dp_attention:
-            return local_need_forward
-
-        global_need_forward = torch.tensor(
-            [
-                (local_need_forward),
-            ],
-            dtype=torch.int64,
-        )
-        torch.distributed.all_reduce(
-            global_need_forward, group=get_tp_group().cpu_group
-        )
-        global_need_forward_cnt = global_need_forward[0].item()
-        need_forward = global_need_forward_cnt > 0
-        return need_forward
-
     def forward_target_extend(
         self, batch: ScheduleBatch
     ) -> Tuple[LogitsProcessorOutput, torch.Tensor, Optional[torch.Tensor], bool]:
@@ -644,7 +627,8 @@ def verify(self, batch: ScheduleBatch):
             ForwardMode.DECODE if not batch.forward_mode.is_idle() else ForwardMode.IDLE
         )
 
-        return logits_output, res, can_run_cuda_graph
+        res.can_run_cuda_graph = can_run_cuda_graph
+        return res
 
     def forward_draft_extend(
         self,
diff --git a/python/sglang/srt/speculative/multi_layer_eagle_worker_v2.py b/python/sglang/srt/speculative/multi_layer_eagle_worker_v2.py
@@ -128,7 +128,7 @@ def __init__(
                 server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-                pp_rank=0,  # FIXME
+                pp_rank=0,  # spec workers don't support pipeline parallelism
                 dp_rank=dp_rank,
                 moe_ep_rank=moe_ep_rank,
                 attn_cp_rank=attn_cp_rank,
diff --git a/python/sglang/srt/speculative/spec_info.py b/python/sglang/srt/speculative/spec_info.py
@@ -232,9 +232,11 @@ class SpecInput(ABC):
     def __init__(self, spec_input_type: SpecInputType):
         self.spec_input_type = spec_input_type
 
+    # Cross-algorithm phase guards. Used by attention backends and
+    # ForwardBatch padding logic to dispatch on phase without hardcoding the
+    # specific algo class (EAGLE / FROZEN_KV_MTP / DFLASH / NGRAM each have
+    # their own draft / verify SpecInput subclasses).
     def is_draft_input(self) -> bool:
-        # FIXME: remove this function which is only used for assertion
-        # or use another variable name like `draft_input` to substitute `spec_info`
         return self.spec_input_type in {
             SpecInputType.EAGLE_DRAFT,
             SpecInputType.EAGLE_DRAFT_EXTEND,
diff --git a/python/sglang/srt/speculative/standalone_worker.py b/python/sglang/srt/speculative/standalone_worker.py
@@ -87,7 +87,7 @@ def __init__(
                 server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-                pp_rank=0,  # FIXME
+                pp_rank=0,  # spec workers don't support pipeline parallelism
                 dp_rank=dp_rank,
                 moe_ep_rank=moe_ep_rank,
                 attn_cp_rank=attn_cp_rank,
diff --git a/python/sglang/srt/speculative/standalone_worker_v2.py b/python/sglang/srt/speculative/standalone_worker_v2.py
@@ -93,7 +93,7 @@ def __init__(
                 server_args=server_args,
                 gpu_id=gpu_id,
                 tp_rank=tp_rank,
-                pp_rank=0,  # FIXME
+                pp_rank=0,  # spec workers don't support pipeline parallelism
                 dp_rank=dp_rank,
                 moe_ep_rank=moe_ep_rank,
                 attn_cp_rank=attn_cp_rank,