fix: disable sequence parallelism for piecewise compilation (#1650)

AlpinDale · web-flow · commit 19b1e78131d9 · 2026-04-27T13:42:35.000+04:30
Signed-off-by: AlpinDale <alpindale@gmail.com>
diff --git a/aphrodite/compilation/passes/fusion/collective_fusion.py b/aphrodite/compilation/passes/fusion/collective_fusion.py
@@ -368,13 +368,12 @@ def __init__(self, config: AphroditeConfig) -> None:
         self.dump_patterns(config, self.patterns)
 
     def is_applicable_for_range(self, compile_range: Range) -> bool:
-        # This pass is applied on top of the sequence parallelism pass.
-        # It inherits the same applicability condition as `SequenceParallelismPass`.
-        # See `SequenceParallelismPass.is_applicable` for more details.
-        if not self.compilation_config.splitting_ops or self.compilation_config.use_inductor_graph_partition:
-            return True
-        tp_size = get_tensor_model_parallel_world_size()
-        return bool(compile_range.is_single_size() and compile_range.end % tp_size == 0)
+        # This pass is applied on top of the sequence parallelism pass,
+        # which is only supported in fullgraph compilation mode.
+        assert self.compilation_config.use_inductor_graph_partition or not self.compilation_config.splitting_ops, (
+            "AsyncTPPass requires full-graph compilation"
+        )
+        return True
 
     @AphroditeInductorPass.time_and_log
     def __call__(self, graph: fx.Graph) -> None:
diff --git a/aphrodite/compilation/passes/fusion/sequence_parallelism.py b/aphrodite/compilation/passes/fusion/sequence_parallelism.py
@@ -330,21 +330,18 @@ class SequenceParallelismPass(AphroditePatternMatcherPass):
     performance.
 
 
-    This pass splits up the residual tensor across TP ranks and hence divides its size.
-    Because the pattern matcher starts at the end of the graph, the replacement
-    contains a slice that temporarily conforms the input residual to the correct size.
-    After all patterns have been matched, we use a NoOpEliminationPass to clean up
-    what have now become no-op slices.
-
-    Note that an older version of the pass did not need this as it operated only on
-    custom rms_norm and fused_rms_norm_add custom ops which did not complain about
-    mismatched shapes during replacement. So this approach has the same assumption that
-    correctness is only maintained if all rms_norm operations are split across ranks.
-
-    Correctness-wise, this is approach strictly better than before - before,
-    the graph was incorrect semantically and shape-wise during the pass.
-    With this approach there's only semantic incorrectness during the pass.
-    Both approaches restore a correct graph once all patterns are matched.
+    This pass is only supported when compiling the whole graph (fullgraph
+    mode, i.e. using Inductor graph partition or empty splitting_ops).
+    Piecewise compilation is not supported because the residual tensor
+    gets split across TP ranks, causing size mismatches at subgraph
+    boundaries.
+
+    This pass splits up the residual tensor across TP ranks and hence
+    divides its size. Because the pattern matcher starts at the end of
+    the graph, the replacement contains a slice that temporarily conforms
+    the input residual to the correct size. After all patterns have been
+    matched, we use a NoOpEliminationPass to clean up what have now
+    become no-op slices.
     """
 
     @enable_fake_mode
@@ -397,16 +394,12 @@ def is_applicable_for_range(self, compile_range: Range) -> bool:
         and gathering tensors across TP ranks outweighs the benefits.
 
         Returns False (SP disabled) when:
-        - Using piecewise compilation with non-concrete or TP-indivisible sizes
         - min_token_num is None (SP disabled for this device/config)
         - The compile range starts below the minimum token threshold
         """
-        # For piecewise compilation (not using inductor graph partition),
-        # we need concrete sizes that are divisible by TP for correct splitting
-        if not self.compilation_config.use_inductor_graph_partition and self.compilation_config.splitting_ops:
-            tp_size = get_tensor_model_parallel_world_size()
-            if not compile_range.is_single_size() or compile_range.end % tp_size != 0:
-                return False
+        assert self.compilation_config.use_inductor_graph_partition or not self.compilation_config.splitting_ops, (
+            "SequenceParallelismPass requires full-graph compilation"
+        )
 
         # min_token_num is None when SP is disabled for this device/config
         # (e.g., non-CUDA platform, unsupported GPU, or small hidden_size)
diff --git a/aphrodite/config/aphrodite.py b/aphrodite/config/aphrodite.py
@@ -931,19 +931,16 @@ def has_blocked_weights():
             )
             self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
-        # async tp is built on top of sequence parallelism
-        # and requires it to be enabled.
-        if self.compilation_config.pass_config.fuse_gemm_comms:
-            self.compilation_config.pass_config.enable_sp = True
-        if self.compilation_config.pass_config.enable_sp:
+        # async tp is built on top of sequence parallelism and requires it.
+        pass_config = self.compilation_config.pass_config
+        if pass_config.fuse_gemm_comms:
+            pass_config.enable_sp = True
+        if pass_config.enable_sp:
             if self.parallel_config.tensor_parallel_size == 1:
                 logger.warning("Sequence Parallelism requires TP>1, disabling")
-                self.compilation_config.pass_config.enable_sp = False
-                self.compilation_config.pass_config.fuse_gemm_comms = False
+                pass_config.enable_sp = False
+                pass_config.fuse_gemm_comms = False
             else:
-                # Compute SP threshold early; disable if None (model too
-                # small for SP to be beneficial).
-                pass_config = self.compilation_config.pass_config
                 if pass_config.sp_min_token_num is None:
                     from aphrodite.compilation.passes.fusion.sequence_parallelism import (
                         get_sequence_parallelism_threshold,
@@ -963,8 +960,8 @@ def has_blocked_weights():
                         "threshold heuristic, disabling. To force SP, "
                         "set pass_config.sp_min_token_num manually."
                     )
-                    self.compilation_config.pass_config.enable_sp = False
-                    self.compilation_config.pass_config.fuse_gemm_comms = False
+                    pass_config.enable_sp = False
+                    pass_config.fuse_gemm_comms = False
 
         from aphrodite.utils.torch_utils import HAS_OPAQUE_TYPE
 
@@ -1102,8 +1099,8 @@ def has_blocked_weights():
         )
 
         if self.compilation_config.pass_config.enable_sp:
-            # With pipeline parallelism or dynamo partitioning,
-            # native rms norm tracing errors due to incorrect residual shape.
+            # With pipeline parallelism, native rms norm tracing errors due to
+            # incorrect residual shape.
             # Use custom rms norm to unblock. In the future,
             # the pass will operate on higher-level IR to avoid the issue.
             # TODO: https://github.com/aphrodite-project/aphrodite/issues/27894
@@ -1113,20 +1110,15 @@ def has_blocked_weights():
                     self.compilation_config.mode,
                 )
 
-            is_fullgraph = (
-                self.compilation_config.use_inductor_graph_partition
-                or len(self.compilation_config.splitting_ops or []) == 0
-            )
-            if self.parallel_config.pipeline_parallel_size > 1 or not is_fullgraph:
+            if self.parallel_config.pipeline_parallel_size > 1:
                 if "-rms_norm" not in self.compilation_config.custom_ops:
                     self.compilation_config.custom_ops.append("+rms_norm")
                 else:
-                    regime = "Dynamo partition" if not is_fullgraph else "pipeline parallelism"
                     logger.warning_once(
                         "Sequence parallelism not supported with "
                         "native rms_norm when using %s, "
                         "this will likely lead to an error.",
-                        regime,
+                        "pipeline parallelism",
                     )
 
         # final check of cudagraph mode after all possible updates
@@ -1138,9 +1130,9 @@ def has_blocked_weights():
                 and not self.compilation_config.cudagraph_mode.has_piecewise_cudagraphs()  # noqa: E501
             ):
                 logger.warning_once(
-                    "No piecewise cudagraph for executing cascade attention."
-                    " Will fall back to eager execution if a batch runs "
-                    "into cascade attentions."
+                    "No piecewise cudagraph for executing cascade attention. "
+                    "Will fall back to eager execution if a batch runs into "
+                    "cascade attentions."
                 )
 
             if self.compilation_config.cudagraph_mode.requires_piecewise_compilation():
diff --git a/aphrodite/config/compilation.py b/aphrodite/config/compilation.py
@@ -1105,6 +1105,25 @@ def set_splitting_ops_for_v1(self, all2all_backend: str, data_parallel_size: int
                     self.cudagraph_mode = CUDAGraphMode.FULL
                 self.splitting_ops = []
 
+        if (
+            not self.use_inductor_graph_partition
+            and (self.pass_config.enable_sp or self.pass_config.fuse_gemm_comms)
+            and self.splitting_ops
+        ):
+            logger.warning_once(
+                "Sequence parallelism requires full-graph compilation when "
+                "use_inductor_graph_partition is off. Setting splitting_ops "
+                "to an empty list to preserve SP and async TP."
+            )
+            self.splitting_ops = []
+            if self.cudagraph_mode.has_piecewise_cudagraphs():
+                logger.warning_once(
+                    "Sequence parallelism is incompatible with piecewise "
+                    "cudagraph when use_inductor_graph_partition is off. "
+                    "Setting cudagraph_mode to FULL."
+                )
+                self.cudagraph_mode = CUDAGraphMode.FULL
+
         # Disable CUDA graphs for DeepEP high-throughput since its not CG compatible
         if (
             all2all_backend == "deepep_high_throughput"
diff --git a/aphrodite/v1/worker/utils.py b/aphrodite/v1/worker/utils.py
@@ -491,12 +491,8 @@ def is_residual_scattered_for_sp(aphrodite_config: AphroditeConfig, num_input_to
     """Check if the residual tensor is scattered for sequence parallelism.
 
     The residual tensor is scattered across tensor parallel ranks when sequence
-    parallelism and tensor parallelism is enabled.
-
-    This follows the same logic as SequenceParallelismPass.is_applicable_for_range():
-    - In full-graph compilation mode (no splitting ops or using inductor graph
-      partition), SP is always applied
-    - Otherwise, SP is only applied for specific shapes in compile_sizes
+    parallelism and tensor parallelism are enabled. SP is only supported in
+    full-graph compilation mode.
     """
     if not aphrodite_config.compilation_config.pass_config.enable_sp:
         return False
@@ -506,16 +502,13 @@ def is_residual_scattered_for_sp(aphrodite_config: AphroditeConfig, num_input_to
     if tp == 1:
         return False
 
+    assert (
+        aphrodite_config.compilation_config.use_inductor_graph_partition
+        or not aphrodite_config.compilation_config.splitting_ops
+    ), "Sequence parallelism requires full-graph compilation"
+
     # When sequence parallelism is enabled, we always pad num_input_tokens
     # to be a multiple of tensor_parallel_size (tp) earlier.
     assert num_input_tokens % tp == 0
 
-    if (
-        not aphrodite_config.compilation_config.splitting_ops
-        or aphrodite_config.compilation_config.use_inductor_graph_partition
-    ):
-        return True
-    compile_sizes = aphrodite_config.compilation_config.compile_sizes
-    if compile_sizes is None:
-        return False
-    return num_input_tokens in compile_sizes
+    return True