PaddlePaddle
diff --git a/‎fastdeploy/config.py‎
Lines changed: 26 additions & 3 deletions b/‎fastdeploy/config.py‎
Lines changed: 26 additions & 3 deletions
diff --git a/‎fastdeploy/engine/async_llm.py‎
Lines changed: 2 additions & 1 deletion b/‎fastdeploy/engine/async_llm.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎fastdeploy/engine/common_engine.py‎
Lines changed: 18 additions & 3 deletions b/‎fastdeploy/engine/common_engine.py‎
Lines changed: 18 additions & 3 deletions
diff --git a/‎fastdeploy/engine/sched/resource_manager_v1.py‎
Lines changed: 3 additions & 3 deletions b/‎fastdeploy/engine/sched/resource_manager_v1.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎fastdeploy/entrypoints/engine_client.py‎
Lines changed: 16 additions & 1 deletion b/‎fastdeploy/entrypoints/engine_client.py‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎fastdeploy/input/preprocess.py‎
Lines changed: 4 additions & 1 deletion b/‎fastdeploy/input/preprocess.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎fastdeploy/inter_communicator/engine_worker_queue.py‎
Lines changed: 17 additions & 1 deletion b/‎fastdeploy/inter_communicator/engine_worker_queue.py‎
Lines changed: 17 additions & 1 deletion
diff --git a/‎fastdeploy/model_executor/layers/attention/append_attn_backend.py‎
Lines changed: 1 addition & 3 deletions b/‎fastdeploy/model_executor/layers/attention/append_attn_backend.py‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎fastdeploy/model_executor/layers/attention/dsa_attention_backend.py‎
Lines changed: 1 addition & 1 deletion b/‎fastdeploy/model_executor/layers/attention/dsa_attention_backend.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎fastdeploy/model_executor/layers/attention/flash_attn_backend.py‎
Lines changed: 1 addition & 3 deletions b/‎fastdeploy/model_executor/layers/attention/flash_attn_backend.py‎
Lines changed: 1 addition & 3 deletions
@@ -1980,6 +1980,7 @@ def expand_bsz_map(real_bsz_to_captured_size):
                 int(envs.ENABLE_V1_KVCACHE_SCHEDULER) == 0
                 and self.model_config is not None
                 and self.model_config.enable_mm
+                and self.deploy_modality != DeployModality.TEXT
             ):
                 self.max_prefill_batch = 1  # TODO:当前V0多模prefill阶段只支持并行度为1,待优化
         else:
@@ -2019,6 +2020,16 @@ def expand_bsz_map(real_bsz_to_captured_size):
         self.check()
         # self.print()    # NOTE: it's better to explicitly call .print() when FDConfig is initialized
 
+    @property  # Recomputed from model_config and deploy_modality on every access; nothing is cached here.
+    def enable_mm_runtime(self) -> bool:  # Truthy only when model_config is set, its enable_mm is truthy, and deploy_modality is not TEXT.
+        return self.model_config is not None and self.model_config.enable_mm and self.deploy_modality != DeployModality.TEXT
+
+    @property  # Recomputed on every access; builds on enable_mm_runtime.
+    def enable_rope_3d_runtime(self) -> bool:  # Truthy only when enable_mm_runtime holds and model_config has a truthy "rope_3d" or "use_3d_rope".
+        return self.enable_mm_runtime and (
+            getattr(self.model_config, "rope_3d", False) or getattr(self.model_config, "use_3d_rope", False)  # a missing attribute is treated as False
+        )
+
     def _disable_sequence_parallel_moe_if_needed(self, mode_name):
         if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph:
             self.parallel_config.use_sequence_parallel_moe = False
@@ -2057,9 +2068,21 @@ def postprocess(self):
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.model_config.max_model_len * 0.04)
 
+        if (
+            self.model_config is not None
+            and self.model_config.enable_mm
+            and self.deploy_modality == DeployModality.TEXT
+        ):
+            if getattr(self.model_config, "rope_3d", False) or getattr(self.model_config, "use_3d_rope", False):
+                logger.info(
+                    "Deploy modality is text; forcing the multimodal-capable model onto the 1D RoPE runtime path."
+                )
+            setattr(self.model_config, "rope_3d", False)
+            setattr(self.model_config, "use_3d_rope", False)
+
         self.cache_config.max_block_num_per_seq = int(self.model_config.max_model_len // self.cache_config.block_size)
         self.cache_config.postprocess(self.get_max_chunk_tokens(), self.scheduler_config.max_num_seqs)
-        if self.model_config is not None and self.model_config.enable_mm and not envs.ENABLE_V1_KVCACHE_SCHEDULER:
+        if self.model_config is not None and self.enable_mm_runtime and not envs.ENABLE_V1_KVCACHE_SCHEDULER:
             self.cache_config.enable_prefix_caching = False
         if (
             self.structured_outputs_config is not None
@@ -2085,7 +2108,7 @@ def postprocess(self):
                     f"Guided decoding backend '{self.structured_outputs_config.guided_decoding_backend}' is not implemented. [auto, xgrammar, guidance, off]"
                 )
 
-        if self.model_config.enable_mm:
+        if self.enable_mm_runtime:
             if self.cache_config.max_encoder_cache is None or self.cache_config.max_encoder_cache < 0:
                 self.cache_config.max_encoder_cache = self.scheduler_config.max_num_batched_tokens
             elif self.cache_config.max_encoder_cache != 0:
@@ -2392,7 +2415,7 @@ def get_max_chunk_tokens(self, mm_max_tokens_per_item=None):
                 num_tokens = self.scheduler_config.max_num_seqs
         else:
             num_tokens = self.scheduler_config.max_num_batched_tokens
-            if mm_max_tokens_per_item is not None and self.deploy_modality != DeployModality.TEXT:
+            if self.enable_mm_runtime and mm_max_tokens_per_item is not None:
                 max_mm_tokens = max(
                     mm_max_tokens_per_item.get("image", 0),
                     mm_max_tokens_per_item.get("video", 0),
 
@@ -294,6 +294,7 @@ def __init__(self, cfg, pid):
             cfg.limit_mm_per_prompt,
             cfg.mm_processor_kwargs,
             cfg.tool_parser,
+            enable_mm_runtime=cfg.enable_mm_runtime,
         )
         # Create data processor
         self.data_processor = self.input_processor.create_processor()
@@ -446,7 +447,7 @@ async def add_request(
                 )
             if envs.ZMQ_SEND_BATCH_DATA and self.connection_manager is not None:
                 request["zmq_worker_pid"] = self.connection_manager.worker_pid
-            if self.cfg.model_config.enable_mm:
+            if self.cfg.enable_mm_runtime:
                 self.request_client.send_pyobj(request)
             else:
                 self.request_client.send_json(request)
 
@@ -330,6 +330,7 @@ def create_data_processor(self):
             self.cfg.limit_mm_per_prompt,
             self.cfg.mm_processor_kwargs,
             self.cfg.tool_parser,
+            enable_mm_runtime=self.cfg.enable_mm_runtime,
         )
         self.data_processor = self.input_processor.create_processor()
         self.mm_max_tokens_per_item = self.data_processor.get_mm_max_tokens_per_item(
@@ -601,7 +602,7 @@ def insert_tasks(self, tasks: List[Request], current_id=-1):
                         LoggingEventName.RESCHEDULED_INFERENCE_START, task.request_id, getattr(task, "user", "")
                     )
             if not is_prefill:
-                if not self.cfg.model_config.enable_mm:
+                if not self.cfg.enable_mm_runtime:
                     self.update_requests_chunk_size(tasks)
                 else:
                     self.update_mm_requests_chunk_size(tasks)
@@ -1218,7 +1219,7 @@ def _insert_zmq_task_to_scheduler(self):
         while self.running:
             try:
                 block = True if len(added_requests) == 0 else False
-                if not self.cfg.model_config.enable_mm:
+                if not self.cfg.enable_mm_runtime:
                     err, data = self.recv_request_server.receive_json_once(block)
                 else:
                     err, data = self.recv_request_server.receive_pyobj_once(block)
@@ -1276,6 +1277,20 @@ def _insert_zmq_task_to_scheduler(self):
                     err_msg = None
                     try:
                         request = Request.from_dict(data)
+
+                        # [DEBUG] engine 接收到的请求关键字段
+                        print(
+                            f"[DEBUG][engine_recv] req_id={request.request_id} "
+                            f"prompt_token_ids_len={len(request.prompt_token_ids) if request.prompt_token_ids else 0} "
+                            f"has_multimodal_inputs={request.multimodal_inputs is not None} "
+                            f"max_tokens={request.sampling_params.max_tokens} "
+                            f"min_tokens={request.sampling_params.min_tokens} "
+                            f"prompt_token_ids_first20={request.prompt_token_ids[:20] if request.prompt_token_ids else []} "
+                            f"prompt_token_ids_last20={request.prompt_token_ids[-20:] if request.prompt_token_ids else []}"
+                            f"prompt_token_ids_ALL={request.prompt_token_ids if request.prompt_token_ids else []} ",
+                            flush=True,
+                        )
+
                         request.metrics.scheduler_recv_req_time = time.time()
                         main_process_metrics.requests_number.inc()
                         trace_carrier = data.get("trace_carrier")
@@ -2355,7 +2370,7 @@ def _setting_environ_variables(self):
             if self.cfg.scheduler_config.splitwise_role == "prefill":
                 variables["FLAGS_fmt_write_cache_completed_signal"] = 1
 
-        if self.cfg.model_config.enable_mm:
+        if self.cfg.enable_mm_runtime:
             variables["FLAGS_max_partition_size"] = 1024
 
         command_prefix = ""
 
@@ -205,11 +205,11 @@ def __init__(self, max_num_seqs, config, tensor_parallel_size, splitwise_role, l
         self.need_block_num_map = dict()
 
         self.encoder_cache = None
-        if config.model_config.enable_mm and config.cache_config.max_encoder_cache > 0:
+        if config.enable_mm_runtime and config.cache_config.max_encoder_cache > 0:
             self.encoder_cache = EncoderCacheManager(config.cache_config.max_encoder_cache)
 
         self.processor_cache = None
-        if config.model_config.enable_mm and config.cache_config.max_processor_cache > 0:
+        if config.enable_mm_runtime and config.cache_config.max_processor_cache > 0:
             max_processor_cache_in_bytes = int(config.cache_config.max_processor_cache * 1024 * 1024 * 1024)
             self.processor_cache = ProcessorCacheManager(max_processor_cache_in_bytes)
 
@@ -550,7 +550,7 @@ def _get_num_new_tokens(self, request, token_budget):
             num_new_tokens = token_budget // self.config.cache_config.block_size * self.config.cache_config.block_size
         request.with_image = False
 
-        if not self.config.model_config.enable_mm:
+        if not self.config.enable_mm_runtime:
             return num_new_tokens
 
         inputs = request.multimodal_inputs
 
@@ -84,7 +84,7 @@ class EngineClient:
     def __init__(self, pid: int | str, port: int | str, fd_config: FDConfig, workers: int = 1, max_logprobs: int = 20):
         self.fd_config = fd_config
         self.tensor_parallel_size = self.fd_config.parallel_config.tensor_parallel_size
-        self.enable_mm = self.fd_config.model_config.enable_mm
+        self.enable_mm = self.fd_config.enable_mm_runtime
         self.max_logprobs = max_logprobs
         input_processor = InputPreprocessor(
             self.fd_config.model_config,
@@ -93,6 +93,7 @@ def __init__(self, pid: int | str, port: int | str, fd_config: FDConfig, workers
             self.fd_config.mm_processor_kwargs,
             self.fd_config.tool_parser,
             self.enable_mm and self.fd_config.cache_config.max_processor_cache > 0,
+            enable_mm_runtime=self.enable_mm,
         )
         self.enable_logprob = self.fd_config.model_config.enable_logprob
         self.data_processor = input_processor.create_processor()
@@ -358,6 +359,20 @@ async def add_requests(self, task):
 
             task["max_tokens"] = min(self.max_model_len - input_ids_len, task.get("max_tokens"))
             min_tokens = task.get("min_tokens", 1)
+
+            # [DEBUG] 发送到 engine 前的关键字段
+            print(
+                f"[DEBUG][engine_client] req_id={task.get('request_id', 'N/A')} "
+                f"enable_mm={self.enable_mm} "
+                f"input_ids_len={input_ids_len} "
+                f"max_tokens={task['max_tokens']} "
+                f"min_tokens={min_tokens} "
+                f"has_multimodal_inputs={'multimodal_inputs' in task and task['multimodal_inputs'] is not None} "
+                f"prompt_token_ids_first20={task['prompt_token_ids'][:20]} "
+                f"prompt_token_ids_last20={task['prompt_token_ids'][-20:]}",
+                flush=True,
+            )
+
             if "messages" in task:
                 task["messages"] = None
             api_server_logger.info(f"task['max_tokens']:{task['max_tokens']}")
 
@@ -48,6 +48,7 @@ def __init__(
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
         tool_parser: str = None,
         enable_processor_cache: bool = False,
+        enable_mm_runtime: Optional[bool] = None,
     ) -> None:
         self.model_config = model_config
         self.model_name_or_path = self.model_config.model
@@ -56,6 +57,7 @@ def __init__(
         self.mm_processor_kwargs = mm_processor_kwargs
         self.tool_parser = tool_parser
         self.enable_processor_cache = enable_processor_cache
+        self.enable_mm_runtime = self.model_config.enable_mm if enable_mm_runtime is None else enable_mm_runtime
 
     def create_processor(self):
         reasoning_parser_obj = None
@@ -77,10 +79,11 @@ def create_processor(self):
                 reasoning_parser_obj=reasoning_parser_obj,
                 tool_parser_obj=tool_parser_obj,
                 mm_processor_kwargs=self.mm_processor_kwargs,
+                enable_mm_runtime=self.enable_mm_runtime,
             )
         except Exception as e:
             logger.info(f"Plugin input processor not available ({e}), using built-in processor")
-            if not self.model_config.enable_mm:
+            if not self.enable_mm_runtime:
                 from fastdeploy.input.text_processor import TextProcessor
 
                 tokenizer_type = "ernie4_5" if ErnieArchitectures.contains_ernie_arch(architecture) else "auto"
 
@@ -545,10 +545,18 @@ def put_tasks(self, tasks: List[Any]) -> None:
             tasks: Tasks to be added to the queue
         """
         self.lock.acquire()
+        wait_start = time.perf_counter()
         while sum(self.client_read_flag) < self.num_client:
             self.lock.release()
             time.sleep(0.001)
             self.lock.acquire()
+        wait_ms = (time.perf_counter() - wait_start) * 1000.0
+        print(
+            "[engine_worker_queue.put_tasks.wait_all_clients_read] "
+            f"client_id={self.client_id} wait_ms={wait_ms:.3f} "
+            f"client_read_flag={list(self.client_read_flag)}",
+            flush=True,
+        )
 
         if envs.FD_ENABLE_MAX_PREFILL or envs.FD_ENABLE_E2W_TENSOR_CONVERT:
             # multimodal input numpy -> tensor
@@ -571,14 +579,22 @@ def get_tasks(self) -> Tuple[List[Any], bool]:
         """
         tasks: List[Any] = list()
         self.lock.acquire()
-
+        read_start = time.perf_counter()
         tasks.extend(self.tasks)
+        read_ms = (time.perf_counter() - read_start) * 1000.0
         self.client_read_flag[self.client_id] = 1
         all_client_read: bool = np.sum(self.client_read_flag) == self.num_client
         if all_client_read:
             self.tasks[:] = list()
             self.set_exist_tasks(False)
         self.lock.release()
+        print(
+            "[engine_worker_queue.get_tasks] "
+            f"client_id={self.client_id} proxy_type={type(self.tasks).__name__} "
+            f"read_ms={read_ms:.3f} all_client_read={all_client_read} "
+            f"client_read_flag={list(self.client_read_flag)}",
+            flush=True,
+        )
         llm_logger.debug(f"get_tasks: tasks={tasks}")
         return tasks, all_client_read
 
 
@@ -138,9 +138,7 @@ def __init__(
         self.rope_theta: float = (
             10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
         )
-        self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False) or getattr(
-            fd_config.model_config, "use_3d_rope", False
-        )
+        self.rope_3d: bool = fd_config.enable_rope_3d_runtime
         if fd_config.speculative_config.model_type != "main":
             self.rope_3d = False
         self.causal: bool = getattr(fd_config.model_config, "causal", True)
 
@@ -136,7 +136,7 @@ def __init__(
         self.rope_theta: float = (
             10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
         )
-        self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False)
+        self.rope_3d: bool = fd_config.enable_rope_3d_runtime
         self.causal: bool = getattr(fd_config.model_config, "causal", True)
         self.speculative_method: str = fd_config.speculative_config.method
         self.use_speculate: bool = self.speculative_method is not None
 
@@ -267,9 +267,7 @@ def __init__(
 
         self.rank, self.device_id = init_rank_and_device_id(fd_config)
 
-        self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False) or getattr(
-            fd_config.model_config, "use_3d_rope", False
-        )
+        self.rope_3d: bool = fd_config.enable_rope_3d_runtime
         if fd_config.speculative_config.model_type != "main":
             self.rope_3d = False
         # Note(ZKK): here must be consistent with append_attn_backend.py
Original file line number	Diff line number	Diff line change
`@@ -136,7 +136,7 @@ def __init__(`
`136`	`136`	`self.rope_theta: float = (`
`137`	`137`	`10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta`
`138`	`138`	`)`
`139`		`- self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False)`
	`139`	`+ self.rope_3d: bool = fd_config.enable_rope_3d_runtime`
`140`	`140`	`self.causal: bool = getattr(fd_config.model_config, "causal", True)`
`141`	`141`	`self.speculative_method: str = fd_config.speculative_config.method`
`142`	`142`	`self.use_speculate: bool = self.speculative_method is not None`