Skip to content

Commit d1acd62

Browse files
authored
fix(disagg): unstuck decode aborts under prealloc pressure (#25561)
1 parent 0ab427d commit d1acd62

3 files changed

Lines changed: 18 additions & 2 deletions

File tree

python/sglang/srt/disaggregation/decode.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -609,7 +609,11 @@ def _update_handshake_waiters(
609609
if not self.queue:
610610
return
611611

612-
if all(decode_req.waiting_for_input for decode_req in self.queue):
612+
# Still poll if any receiver was aborted, otherwise it stays stuck.
613+
if all(decode_req.waiting_for_input for decode_req in self.queue) and not any(
614+
getattr(decode_req.kv_receiver, "conclude_state", None) == KVPoll.Failed
615+
for decode_req in self.queue
616+
):
613617
return
614618

615619
polls = poll_and_all_reduce(

python/sglang/srt/managers/scheduler.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3499,12 +3499,16 @@ def abort_request(self, recv_req: AbortReq):
34993499
if recv_req.abort_all or decode_req.req.rid.startswith(recv_req.rid):
35003500
logger.debug(f"Abort prealloc queue request. {decode_req.req.rid=}")
35013501
decode_req.kv_receiver.abort()
3502+
if not isinstance(decode_req.req.finished_reason, FINISH_ABORT):
3503+
decode_req.req.finished_reason = FINISH_ABORT()
35023504

35033505
# Abort requests waiting for kvcache to release tree cache
35043506
for decode_req in self.disagg_decode_transfer_queue.queue:
35053507
if recv_req.abort_all or decode_req.req.rid.startswith(recv_req.rid):
35063508
logger.debug(f"Abort transfer queue request. {decode_req.req.rid=}")
35073509
decode_req.kv_receiver.abort()
3510+
if not isinstance(decode_req.req.finished_reason, FINISH_ABORT):
3511+
decode_req.req.finished_reason = FINISH_ABORT()
35083512

35093513
# Abort requests already retracted to CPU cache
35103514
if self.disagg_decode_prealloc_queue.retracted_queue:

python/sglang/srt/managers/tokenizer_manager.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1491,7 +1491,15 @@ async def _handle_batch_request(
14911491
pass
14921492

14931493
def abort_request(self, rid: str = "", abort_all: bool = False):
1494-
if not abort_all and rid not in self.rid_to_state:
1494+
# Empty rid would startswith-match every request on the scheduler.
1495+
if not abort_all and not rid:
1496+
logger.warning("Ignore abort_request with empty rid and abort_all=False")
1497+
return
1498+
if (
1499+
not abort_all
1500+
and self.server_args.tokenizer_worker_num == 1
1501+
and rid not in self.rid_to_state
1502+
):
14951503
return
14961504
req = AbortReq(rid=rid, abort_all=abort_all)
14971505
self.send_to_scheduler.send_pyobj(req)

0 commit comments

Comments
 (0)