@@ -931,19 +931,16 @@ def has_blocked_weights():
931931 )
932932 self .compilation_config .cudagraph_mode = CUDAGraphMode .NONE
933933
934- # async tp is built on top of sequence parallelism
935- # and requires it to be enabled.
936- if self . compilation_config . pass_config .fuse_gemm_comms :
937- self . compilation_config . pass_config .enable_sp = True
938- if self . compilation_config . pass_config .enable_sp :
934+ # async tp is built on top of sequence parallelism and requires it.
935+ pass_config = self . compilation_config . pass_config
936+ if pass_config .fuse_gemm_comms :
937+ pass_config .enable_sp = True
938+ if pass_config .enable_sp :
939939 if self .parallel_config .tensor_parallel_size == 1 :
940940 logger .warning ("Sequence Parallelism requires TP>1, disabling" )
941- self . compilation_config . pass_config .enable_sp = False
942- self . compilation_config . pass_config .fuse_gemm_comms = False
941+ pass_config .enable_sp = False
942+ pass_config .fuse_gemm_comms = False
943943 else :
944- # Compute SP threshold early; disable if None (model too
945- # small for SP to be beneficial).
946- pass_config = self .compilation_config .pass_config
947944 if pass_config .sp_min_token_num is None :
948945 from aphrodite .compilation .passes .fusion .sequence_parallelism import (
949946 get_sequence_parallelism_threshold ,
@@ -963,8 +960,8 @@ def has_blocked_weights():
963960 "threshold heuristic, disabling. To force SP, "
964961 "set pass_config.sp_min_token_num manually."
965962 )
966- self . compilation_config . pass_config .enable_sp = False
967- self . compilation_config . pass_config .fuse_gemm_comms = False
963+ pass_config .enable_sp = False
964+ pass_config .fuse_gemm_comms = False
968965
969966 from aphrodite .utils .torch_utils import HAS_OPAQUE_TYPE
970967
@@ -1102,8 +1099,8 @@ def has_blocked_weights():
11021099 )
11031100
11041101 if self .compilation_config .pass_config .enable_sp :
1105- # With pipeline parallelism or dynamo partitioning,
1106- # native rms norm tracing errors due to incorrect residual shape.
1102+ # With pipeline parallelism, native rms norm tracing errors due to
1103+ # incorrect residual shape.
11071104 # Use custom rms norm to unblock. In the future,
11081105 # the pass will operate on higher-level IR to avoid the issue.
11091106 # TODO: https://github.com/aphrodite-project/aphrodite/issues/27894
@@ -1113,20 +1110,15 @@ def has_blocked_weights():
11131110 self .compilation_config .mode ,
11141111 )
11151112
1116- is_fullgraph = (
1117- self .compilation_config .use_inductor_graph_partition
1118- or len (self .compilation_config .splitting_ops or []) == 0
1119- )
1120- if self .parallel_config .pipeline_parallel_size > 1 or not is_fullgraph :
1113+ if self .parallel_config .pipeline_parallel_size > 1 :
11211114 if "-rms_norm" not in self .compilation_config .custom_ops :
11221115 self .compilation_config .custom_ops .append ("+rms_norm" )
11231116 else :
1124- regime = "Dynamo partition" if not is_fullgraph else "pipeline parallelism"
11251117 logger .warning_once (
11261118 "Sequence parallelism not supported with "
11271119 "native rms_norm when using %s, "
11281120 "this will likely lead to an error." ,
1129- regime ,
1121+ "pipeline parallelism" ,
11301122 )
11311123
11321124 # final check of cudagraph mode after all possible updates
@@ -1138,9 +1130,9 @@ def has_blocked_weights():
11381130 and not self .compilation_config .cudagraph_mode .has_piecewise_cudagraphs () # noqa: E501
11391131 ):
11401132 logger .warning_once (
1141- "No piecewise cudagraph for executing cascade attention."
1142- " Will fall back to eager execution if a batch runs "
1143- "into cascade attentions."
1133+ "No piecewise cudagraph for executing cascade attention. "
1134+ "Will fall back to eager execution if a batch runs into "
1135+ "cascade attentions."
11441136 )
11451137
11461138 if self .compilation_config .cudagraph_mode .requires_piecewise_compilation ():
0 commit comments