Skip to content

Commit 4e78ae1

Browse files
ma-hangwangpengcheng
authored and committed
issue/407 - fix: early token budget check
1 parent e7e3d8e commit 4e78ae1

1 file changed

Lines changed: 17 additions & 5 deletions

File tree

python/infinilm/llm/scheduler.py

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -98,15 +98,27 @@ def schedule(self) -> Optional[SchedulerOutput]:
9898
)
9999
)
100100

101+
num_tokens_this_step = (
102+
req.get_prompt_length() - req.num_cached_tokens
103+
)
104+
if (
105+
current_num_batched_tokens + num_tokens_this_step
106+
>= self.max_num_batched_tokens
107+
):
108+
if req.num_cached_tokens > 0:
109+
self.cache_manager.free_blocks(req.block_table)
110+
req.block_table = []
111+
req.slot_mapping = []
112+
req.num_cached_tokens = 0
113+
114+
self.waiting_queue.sync_q.put(req)
115+
break
116+
117+
current_num_batched_tokens += num_tokens_this_step
101118
req.num_blocks = len(req.block_table)
102119
req.status = RequestStatus.RUNNING
103120
scheduled_requests.append(req)
104121

105-
# TODO
106-
# num_tokens_this_step = req.get_prompt_length() - req.num_cached_tokens
107-
# current_num_batched_tokens += num_tokens_this_step
108-
assert False
109-
110122
# Return prefill batch if any waiting requests were scheduled
111123
if scheduled_requests:
112124
is_prefill = True

0 commit comments

Comments
 (0)