Skip to content

Commit f455380

Browse files
committed
Move .cpu() calls into asyncio.to_thread worker closures
Avoid blocking the main asyncio event loop on CUDA synchronization by moving the .cpu() transfers inside the worker closures that run via asyncio.to_thread, as suggested in PR #4559 review comment r3371701042.
1 parent 9f63ee5 commit f455380

1 file changed

Lines changed: 4 additions & 4 deletions

File tree

lmdeploy/pytorch/spec_decode/guided_spec_helper.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -105,9 +105,9 @@ async def accept_draft_tokens(self, draft_token_ids: torch.Tensor,
105105
"""
106106
if not processors or self._mgr is None:
107107
return
108-
cpu_ids = draft_token_ids[:, 0].cpu()
109108

110109
def _accept():
110+
cpu_ids = draft_token_ids[:, 0].cpu()
111111
for idx, proc in processors.items():
112112
self._mgr.accept_token(proc, cpu_ids[idx].item())
113113

@@ -190,11 +190,11 @@ async def accept_rejection_sampled_tokens(
190190
"""
191191
if not processors or self._mgr is None:
192192
return
193-
cpu_num_rejected = num_rejected.cpu() if num_rejected.is_cuda else num_rejected
194-
cpu_output_token_ids = output_token_ids.cpu() if output_token_ids.is_cuda else output_token_ids
195-
cpu_next_token_ids = next_token_ids.cpu() if next_token_ids.is_cuda else next_token_ids
196193

197194
def _accept():
195+
cpu_num_rejected = num_rejected.cpu() if num_rejected.is_cuda else num_rejected
196+
cpu_output_token_ids = output_token_ids.cpu() if output_token_ids.is_cuda else output_token_ids
197+
cpu_next_token_ids = next_token_ids.cpu() if next_token_ids.is_cuda else next_token_ids
198198
for idx, processor in processors.items():
199199
n_rejected = cpu_num_rejected[idx].item()
200200
n_valid_draft = num_spec_tokens - n_rejected

0 commit comments

Comments
 (0)