Skip to content

Commit a2f636e

Browse files
authored
[Optimization] Deduplicate reasoning_status reset in insert_tasks_v1 (#7665)
1 parent: 70b4972 · commit: a2f636e

3 files changed

Lines changed: 4 additions & 8 deletions

File tree

fastdeploy/worker/gpu_model_runner.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -849,9 +849,9 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
849849
enable_thinking = bool(request.get("enable_thinking"))
850850
logger.debug(f"request {request.request_id} with {enable_thinking=} at idx {idx}")
851851
self.share_inputs["enable_thinking"][idx : idx + 1, :] = enable_thinking
852+
async_set_value(self.share_inputs["reasoning_status"][idx : idx + 1], 0)
852853
if enable_thinking:
853854
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
854-
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
855855
if request.get("reasoning_max_tokens") is not None:
856856
# Enable thinking
857857
self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get(
@@ -871,7 +871,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
871871
self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
872872
self.share_inputs["max_reply_lens"][idx : idx + 1, :] = -1
873873
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
874-
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
875874

876875
if isinstance(request.prompt_token_ids, np.ndarray):
877876
prompt_token_ids = request.prompt_token_ids.tolist()

fastdeploy/worker/metax_model_runner.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -719,9 +719,9 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
719719
enable_thinking = bool(request.get("enable_thinking"))
720720
logger.debug(f"request {request.request_id} with {enable_thinking=} at idx {idx}")
721721
self.share_inputs["enable_thinking"][idx : idx + 1, :] = enable_thinking
722+
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
722723
if enable_thinking:
723724
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
724-
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
725725
if request.get("reasoning_max_tokens") is not None:
726726
# Enable thinking
727727
self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get(
@@ -741,7 +741,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
741741
self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
742742
self.share_inputs["max_reply_lens"][idx : idx + 1, :] = -1
743743
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
744-
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
745744

746745
if isinstance(request.prompt_token_ids, np.ndarray):
747746
prompt_token_ids = request.prompt_token_ids.tolist()

fastdeploy/worker/xpu_model_runner.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -595,16 +595,15 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int):
595595
prefill_start_index = request.prefill_start_index
596596
prefill_end_index = request.prefill_end_index
597597
length = prefill_end_index - prefill_start_index
598+
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
598599
if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
599600
# Enable thinking
600601
self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens")
601602
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
602-
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
603603
else:
604604
# Disable thinking
605605
self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
606606
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
607-
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
608607

609608
if (
610609
hasattr(request, "sampling_params")
@@ -796,16 +795,15 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests:
796795
)[0]
797796
self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
798797

798+
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
799799
if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
800800
# Enable thinking
801801
self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens")
802802
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
803-
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
804803
else:
805804
# Disable thinking
806805
self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
807806
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
808-
self.share_inputs["reasoning_status"][idx : idx + 1, :] = 0
809807

810808
def get_attr_from_request(request, attr, default_value=None):
811809
res = request.get(attr, default_value)

0 commit comments

Comments
 (0)