@@ -342,6 +342,7 @@ def create_data_processor(self):
342342 self .cfg .limit_mm_per_prompt ,
343343 self .cfg .mm_processor_kwargs ,
344344 self .cfg .tool_parser ,
345+ enable_mm_runtime = self .cfg .enable_mm_runtime ,
345346 )
346347 self .data_processor = self .input_processor .create_processor ()
347348 self .mm_max_tokens_per_item = self .data_processor .get_mm_max_tokens_per_item (
@@ -611,7 +612,7 @@ def insert_tasks(self, tasks: List[Request], current_id=-1):
611612 LoggingEventName .RESCHEDULED_INFERENCE_START , task .request_id , getattr (task , "user" , "" )
612613 )
613614 if not is_prefill :
614- if not self .cfg .model_config . enable_mm :
615+ if not self .cfg .enable_mm_runtime :
615616 self .update_requests_chunk_size (tasks )
616617 else :
617618 self .update_mm_requests_chunk_size (tasks )
@@ -1260,7 +1261,7 @@ def _insert_zmq_task_to_scheduler(self):
12601261 while self .running :
12611262 try :
12621263 block = True if len (added_requests ) == 0 else False
1263- if not self .cfg .model_config . enable_mm :
1264+ if not self .cfg .enable_mm_runtime :
12641265 err , data = self .recv_request_server .receive_json_once (block )
12651266 else :
12661267 err , data = self .recv_request_server .receive_pyobj_once (block )
@@ -1482,22 +1483,25 @@ def _control_pause(self, control_request: ControlRequest):
14821483 self ._send_error_response (req .request_id , "Request is aborted since engine is paused." )
14831484 self .scheduler .reset ()
14841485
1485- # pause cache transfer
1486- if self .cfg .cache_config .num_cpu_blocks > 0 or self .cfg .cache_config .kvcache_storage_backend :
1487- self .llm_logger .info ("Start to pause cache transfer." )
1488- pause_transfer_request = ControlRequest (
1489- request_id = f"{ control_request .request_id } _pause_transfer" , method = "pause"
1490- )
1491- self .cache_task_queue .put_transfer_task ((CacheStatus .CTRL , pause_transfer_request ))
1492- # Wait for cache_transfer responses
1493- asyncio .run (
1494- self ._wait_for_control_responses (
1495- f"{ pause_transfer_request .request_id } " , 60 , executors = ["cache_transfer" ]
1486+ if envs .ENABLE_V1_KVCACHE_MANAGER :
1487+ self .resource_manager .cache_manager .reset_cache ()
1488+ else :
1489+ # pause cache transfer
1490+ if self .cfg .cache_config .num_cpu_blocks > 0 or self .cfg .cache_config .kvcache_storage_backend :
1491+ self .llm_logger .info ("Start to pause cache transfer." )
1492+ pause_transfer_request = ControlRequest (
1493+ request_id = f"{ control_request .request_id } _pause_transfer" , method = "pause"
14961494 )
1497- )
1498- self .llm_logger .info ("Successfully paused cache transfer." )
1495+ self .cache_task_queue .put_transfer_task ((CacheStatus .CTRL , pause_transfer_request ))
1496+ # Wait for cache_transfer responses
1497+ asyncio .run (
1498+ self ._wait_for_control_responses (
1499+ f"{ pause_transfer_request .request_id } " , 60 , executors = ["cache_transfer" ]
1500+ )
1501+ )
1502+ self .llm_logger .info ("Successfully paused cache transfer." )
14991503
1500- self .resource_manager .cache_manager .reset ()
1504+ self .resource_manager .cache_manager .reset ()
15011505 self .llm_logger .info ("Successfully paused request generation." )
15021506 return None
15031507
@@ -1791,10 +1795,14 @@ def _control_sleep(self, control_request: ControlRequest):
17911795 executors .add ("worker" )
17921796 if "kv_cache" in tags :
17931797 executors .add ("worker" )
1794- if self .cfg .cache_config .num_cpu_blocks > 0 or self .cfg .cache_config .kvcache_storage_backend :
1795- executors .add ("cache_transfer" )
1796- if self .cfg .cache_config .enable_prefix_caching :
1797- self .resource_manager .cache_manager .reset ()
1798+ if envs .ENABLE_V1_KVCACHE_MANAGER :
1799+ if self .cfg .cache_config .enable_prefix_caching :
1800+ self .resource_manager .cache_manager .reset_cache ()
1801+ else :
1802+ if self .cfg .cache_config .num_cpu_blocks > 0 or self .cfg .cache_config .kvcache_storage_backend :
1803+ executors .add ("cache_transfer" )
1804+ if self .cfg .cache_config .enable_prefix_caching :
1805+ self .resource_manager .cache_manager .reset ()
17981806
17991807 # Dispatch sleep request to executors
18001808 self .llm_logger .info (f"Dispatch sleep request to executors: { list (executors )} " )
@@ -1989,6 +1997,11 @@ def _decode_token(self, token_ids, req_id, is_end):
19891997 token_ids = cum_tokens [prefix_offset :read_offset ]
19901998 else :
19911999 token_ids = []
2000+
2001+ if is_end and delta_text == "" and len (cum_tokens ) > 0 :
2002+ read_offset = self .data_processor .decode_status [req_id ][1 ]
2003+ token_ids = cum_tokens [read_offset :]
2004+
19922005 if is_end :
19932006 del self .data_processor .decode_status [req_id ]
19942007 return delta_text , token_ids
@@ -2444,7 +2457,7 @@ def _setting_environ_variables(self):
24442457 if self .cfg .scheduler_config .splitwise_role == "prefill" :
24452458 variables ["FLAGS_fmt_write_cache_completed_signal" ] = 1
24462459
2447- if self .cfg .model_config . enable_mm :
2460+ if self .cfg .enable_mm_runtime :
24482461 variables ["FLAGS_max_partition_size" ] = 1024
24492462
24502463 command_prefix = ""
@@ -2545,6 +2558,7 @@ def _start_worker_service(self):
25452558 f" --early_stop_config '{ self .cfg .early_stop_config .to_json_string ()} '"
25462559 f" --reasoning_parser { self .cfg .structured_outputs_config .reasoning_parser } "
25472560 f" --load_choices { self .cfg .load_config .load_choices } "
2561+ f" --model_loader_extra_config '{ json .dumps (self .cfg .load_config .model_loader_extra_config )} '"
25482562 f" --plas_attention_config '{ self .cfg .plas_attention_config .to_json_string ()} '"
25492563 f" --ips { ips } "
25502564 f" --cache-transfer-protocol { self .cfg .cache_config .cache_transfer_protocol } "
@@ -2577,6 +2591,7 @@ def _start_worker_service(self):
25772591 "moe_gate_fp32" : self .cfg .model_config .moe_gate_fp32 ,
25782592 "enable_entropy" : self .cfg .model_config .enable_entropy ,
25792593 "enable_overlap_schedule" : self .cfg .scheduler_config .enable_overlap_schedule ,
2594+ "enable_flashinfer_allreduce_fusion" : self .cfg .parallel_config .enable_flashinfer_allreduce_fusion ,
25802595 }
25812596 for worker_flag , value in worker_store_true_flag .items ():
25822597 if value :
0 commit comments