@@ -1980,6 +1980,7 @@ def expand_bsz_map(real_bsz_to_captured_size):
19801980 int (envs .ENABLE_V1_KVCACHE_SCHEDULER ) == 0
19811981 and self .model_config is not None
19821982 and self .model_config .enable_mm
1983+ and self .deploy_modality != DeployModality .TEXT
19831984 ):
19841985 self .max_prefill_batch = 1 # TODO:当前V0多模prefill阶段只支持并行度为1,待优化
19851986 else :
@@ -2019,6 +2020,16 @@ def expand_bsz_map(real_bsz_to_captured_size):
20192020 self .check ()
20202021 # self.print() # NOTE: it's better to explicitly call .print() when FDConfig is initialized
20212022
@property
def enable_mm_runtime(self) -> bool:
    """Tell whether the multimodal runtime path is active for this config.

    Returns:
        True only when ``self.model_config`` is set, its ``enable_mm`` flag
        is truthy, and ``self.deploy_modality`` is not ``DeployModality.TEXT``.
        The result is always a real ``bool``.
    """
    model_config = self.model_config
    if model_config is None:
        return False
    # Coerce explicitly: a bare ``and`` chain would hand back the raw
    # ``enable_mm`` value (e.g. None or 0) instead of the annotated bool.
    return bool(model_config.enable_mm) and bool(self.deploy_modality != DeployModality.TEXT)
2026+
@property
def enable_rope_3d_runtime(self) -> bool:
    """Tell whether 3D RoPE should be used at runtime.

    Returns:
        True when ``self.enable_mm_runtime`` is truthy and ``self.model_config``
        carries a truthy ``rope_3d`` or ``use_3d_rope`` attribute. A missing
        attribute counts as False. The result is always a real ``bool``.
    """
    if not self.enable_mm_runtime:
        return False
    model_config = self.model_config
    # Either attribute name enables 3D RoPE; bool() keeps the annotated
    # return type even when the stored flag is None, an int, etc.
    return bool(getattr(model_config, "rope_3d", False) or getattr(model_config, "use_3d_rope", False))
2032+
20222033 def _disable_sequence_parallel_moe_if_needed (self , mode_name ):
20232034 if self .parallel_config .use_sequence_parallel_moe and self .graph_opt_config .use_cudagraph :
20242035 self .parallel_config .use_sequence_parallel_moe = False
@@ -2057,9 +2068,21 @@ def postprocess(self):
20572068 if self .long_prefill_token_threshold == 0 :
20582069 self .long_prefill_token_threshold = int (self .model_config .max_model_len * 0.04 )
20592070
2071+ if (
2072+ self .model_config is not None
2073+ and self .model_config .enable_mm
2074+ and self .deploy_modality == DeployModality .TEXT
2075+ ):
2076+ if getattr (self .model_config , "rope_3d" , False ) or getattr (self .model_config , "use_3d_rope" , False ):
2077+ logger .info (
2078+ "Deploy modality is text; forcing the multimodal-capable model onto the 1D RoPE runtime path."
2079+ )
2080+ setattr (self .model_config , "rope_3d" , False )
2081+ setattr (self .model_config , "use_3d_rope" , False )
2082+
20602083 self .cache_config .max_block_num_per_seq = int (self .model_config .max_model_len // self .cache_config .block_size )
20612084 self .cache_config .postprocess (self .get_max_chunk_tokens (), self .scheduler_config .max_num_seqs )
2062- if self .model_config is not None and self .model_config . enable_mm and not envs .ENABLE_V1_KVCACHE_SCHEDULER :
2085+ if self .model_config is not None and self .enable_mm_runtime and not envs .ENABLE_V1_KVCACHE_SCHEDULER :
20632086 self .cache_config .enable_prefix_caching = False
20642087 if (
20652088 self .structured_outputs_config is not None
@@ -2085,7 +2108,7 @@ def postprocess(self):
20852108 f"Guided decoding backend '{ self .structured_outputs_config .guided_decoding_backend } ' is not implemented. [auto, xgrammar, guidance, off]"
20862109 )
20872110
2088- if self .model_config . enable_mm :
2111+ if self .enable_mm_runtime :
20892112 if self .cache_config .max_encoder_cache is None or self .cache_config .max_encoder_cache < 0 :
20902113 self .cache_config .max_encoder_cache = self .scheduler_config .max_num_batched_tokens
20912114 elif self .cache_config .max_encoder_cache != 0 :
@@ -2392,7 +2415,7 @@ def get_max_chunk_tokens(self, mm_max_tokens_per_item=None):
23922415 num_tokens = self .scheduler_config .max_num_seqs
23932416 else :
23942417 num_tokens = self .scheduler_config .max_num_batched_tokens
2395- if mm_max_tokens_per_item is not None and self . deploy_modality != DeployModality . TEXT :
2418+ if self . enable_mm_runtime and mm_max_tokens_per_item is not None :
23962419 max_mm_tokens = max (
23972420 mm_max_tokens_per_item .get ("image" , 0 ),
23982421 mm_max_tokens_per_item .get ("video" , 0 ),
0 commit comments