Commit 07ef34f

update patch for gsaondevice
1 parent 2cc678f commit 07ef34f

1 file changed: ucm/integration/vllm/patch/0.9.2/vllm-adapt-sparse.patch
Lines changed: 66 additions & 56 deletions

@@ -1,23 +1,22 @@
-From 8cb493f9ece884cbc2ba71e367bed2b4116ae1b3 Mon Sep 17 00:00:00 2001
+From d24997f66572362fa3a46ee76f5af65e027b856e Mon Sep 17 00:00:00 2001
 From: wenxinwang <wangwenxin21@huawei.com>
-Date: Tue, 23 Dec 2025 19:44:21 -0800
-Subject: [PATCH] kvcomp qwen deepseek
+Date: Mon, 19 Jan 2026 19:26:05 -0800
+Subject: [PATCH] update patch for gsaondevice + sparse + cache blend
 
 ---
- vllm/attention/layer.py                    | 63 ++++++++++++++++-
+ vllm/attention/layer.py                    | 63 +++++++++++++++-
  vllm/model_executor/models/llama.py        | 21 +++++-
- vllm/model_executor/models/qwen2.py        | 23 ++++++-
- vllm/v1/attention/backends/flash_attn.py   |  7 ++
+ vllm/model_executor/models/qwen2.py        | 23 +++++-
 vllm/v1/attention/backends/mla/common.py   | 15 +++-
 vllm/v1/attention/backends/mla/flashmla.py | 18 ++++-
 vllm/v1/core/kv_cache_manager.py           |  7 +-
 vllm/v1/core/kv_cache_utils.py             | 13 ++++
- vllm/v1/core/sched/output.py               |  3 +
- vllm/v1/core/sched/scheduler.py            | 30 +++++++-
+ vllm/v1/core/sched/output.py               |  7 +-
+ vllm/v1/core/sched/scheduler.py            | 34 ++++++++-
 vllm/v1/worker/block_table.py              | 13 ++++
- vllm/v1/worker/gpu_model_runner.py         | 80 +++++++++++++++++++---
+ vllm/v1/worker/gpu_model_runner.py         | 87 +++++++++++++++++++---
 vllm/v1/worker/gpu_worker.py               |  2 +
- 13 files changed, 275 insertions(+), 20 deletions(-)
+ 12 files changed, 281 insertions(+), 22 deletions(-)
 
 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
 index f0ad68b16..ba93960de 100644
@@ -237,31 +236,6 @@ index 7ef9d248d..e35ab2fdc 100644
          if not get_pp_group().is_last_rank:
              return IntermediateTensors({
                  "hidden_states": hidden_states,
-diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
-index fbc13c06c..2b2244949 100755
---- a/vllm/v1/attention/backends/flash_attn.py
-+++ b/vllm/v1/attention/backends/flash_attn.py
-@@ -16,6 +16,8 @@ from vllm.attention.ops.merge_attn_states import merge_attn_states
- from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8,
-                                            get_flash_attn_version,
-                                            is_flash_attn_varlen_func_available)
-+from ucm.sparse.state import get_ucm_sparse, has_ucm_sparse
-+import os
- 
- if is_flash_attn_varlen_func_available():
-     from vllm.attention.utils.fa_utils import (flash_attn_varlen_func,
-@@ -221,6 +223,11 @@ class FlashAttentionMetadataBuilder(
-         block_table = self.block_table
-         block_table_tensor = block_table.get_device_tensor()[:num_reqs]
- 
-+        if has_ucm_sparse():
-+            ucm_sparse = get_ucm_sparse()
-+            if os.getenv("VLLM_HASH_ATTENTION") == "1":
-+                decode_mask, topk_seq_lens = ucm_sparse.build_decode_attention_meta(query_start_loc, seq_lens, block_table_tensor)
-+
-         block_table.slot_mapping[:num_actual_tokens].copy_(
-             block_table.slot_mapping_cpu[:num_actual_tokens],
-             non_blocking=True)
 diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
 index f2aaf59a4..439bb9b14 100644
 --- a/vllm/v1/attention/backends/mla/common.py
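
Worth noting: this commit drops the vllm/v1/attention/backends/flash_attn.py section from the patch altogether; it had injected decode-attention metadata construction gated by the VLLM_HASH_ATTENTION environment variable. A small self-contained sketch of that gating pattern as the removed hunk used it — StubSparse is an illustrative stand-in for the object returned by get_ucm_sparse(), and its return values are assumptions (the real build_decode_attention_meta produces mask and top-k sequence-length tensors):

import os

class StubSparse:
    # Stand-in for the UCM sparse algorithm object; the real method builds
    # (decode_mask, topk_seq_lens) from query/sequence/block-table tensors.
    def build_decode_attention_meta(self, query_start_loc, seq_lens, block_table):
        return None, None

def maybe_build_sparse_meta(sparse, query_start_loc, seq_lens, block_table):
    # Mirrors the removed hunk: sparse decode metadata is built only
    # when the VLLM_HASH_ATTENTION flag is set to "1".
    if os.getenv("VLLM_HASH_ATTENTION") == "1":
        return sparse.build_decode_attention_meta(
            query_start_loc, seq_lens, block_table)
    return None, None

decode_mask, topk_seq_lens = maybe_build_sparse_meta(StubSparse(), None, None, None)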
@@ -454,18 +428,29 @@ index 2fbcb569e..40c199563 100644
          # All layers have the same KV cache spec, so we create one kv cache group
          # for all layers.
 diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
-index d34f39327..141d750b3 100644
+index d34f39327..0f60ac77d 100644
 --- a/vllm/v1/core/sched/output.py
 +++ b/vllm/v1/core/sched/output.py
-@@ -155,3 +155,6 @@ class SchedulerOutput:
+@@ -3,7 +3,7 @@
+ 
+ from __future__ import annotations
+ 
+-from dataclasses import dataclass
++from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Optional
+ 
+ if TYPE_CHECKING:
+@@ -155,3 +155,8 @@ class SchedulerOutput:
  
      # KV Cache Connector metadata.
      kv_connector_metadata: Optional[KVConnectorMetadata] = None
 +
 +    # modified slots by sparse algorithm
 +    req_sparsed_slots: dict[str, int] = None
++    # The number of tokens computed externally for each request
++    num_external_computed_tokens_per_req: dict[str, int] = field(default_factory=dict)
 diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
-index fe552db74..0d8a67eba 100644
+index fe552db74..7d98745c8 100644
 --- a/vllm/v1/core/sched/scheduler.py
 +++ b/vllm/v1/core/sched/scheduler.py
 @@ -34,6 +34,10 @@ from vllm.v1.outputs import ModelRunnerOutput
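
The sched/output.py hunk above switches the import to `dataclass, field` because the new num_external_computed_tokens_per_req default is a mutable dict. A minimal standalone sketch of why field(default_factory=dict) is needed — the Output class here is an illustrative stand-in, not vllm's SchedulerOutput:

from dataclasses import dataclass, field

@dataclass
class Output:
    # A bare `= {}` default is rejected by dataclasses
    # ("mutable default ... is not allowed"); default_factory
    # gives every instance its own independent dict.
    num_external_computed_tokens_per_req: dict[str, int] = field(default_factory=dict)

a, b = Output(), Output()
a.num_external_computed_tokens_per_req["req-1"] = 4
assert b.num_external_computed_tokens_per_req == {}  # b is unaffected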
@@ -524,7 +509,14 @@ index fe552db74..0d8a67eba 100644
          if new_blocks is None:
              # The request cannot be scheduled.
              # Preempt the lowest-priority request.
-@@ -337,6 +355,10 @@ class Scheduler(SchedulerInterface):
+@@ -331,12 +349,17 @@ class Scheduler(SchedulerInterface):
+         skipped_waiting_requests = create_request_queue(self.policy)
+ 
+         # Next, schedule the WAITING requests.
++        num_external_computed_tokens_per_req: dict[str, int] = {}
+         if not preempted_reqs:
+             while self.waiting and token_budget > 0:
+                 if len(self.running) == self.max_num_running_reqs:
                      break
  
                  request = self.waiting.peek_request()
@@ -535,23 +527,33 @@ index fe552db74..0d8a67eba 100644
  
              # KVTransfer: skip request if still waiting for remote kvs.
              if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
-@@ -446,6 +468,7 @@ class Scheduler(SchedulerInterface):
+@@ -387,7 +410,7 @@ class Scheduler(SchedulerInterface):
+                 num_external_computed_tokens, load_kv_async = (
+                     self.connector.get_num_new_matched_tokens(
+                         request, num_new_local_computed_tokens))
+-
++                num_external_computed_tokens_per_req.update({request.request_id: num_external_computed_tokens})
+                 # Total computed tokens (local + external).
+                 num_computed_tokens = (num_new_local_computed_tokens +
+                                        num_external_computed_tokens)
+@@ -446,6 +469,7 @@ class Scheduler(SchedulerInterface):
                  new_computed_blocks,
                  num_lookahead_tokens=self.num_lookahead_tokens,
                  delay_cache_blocks=load_kv_async,
 +                num_slots_sparsed=num_slots_sparsed
              )
              if new_blocks is None:
                  # The request cannot be scheduled.
-@@ -559,6 +582,7 @@ class Scheduler(SchedulerInterface):
+@@ -559,6 +583,8 @@ class Scheduler(SchedulerInterface):
              scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
              scheduled_encoder_inputs=scheduled_encoder_inputs,
              num_common_prefix_blocks=num_common_prefix_blocks,
 +            req_sparsed_slots=req_sparsed_slots,
++            num_external_computed_tokens_per_req = num_external_computed_tokens_per_req,
              # finished_req_ids is an existing state in the scheduler,
              # instead of being newly scheduled in this step.
              # It contains the request IDs that are finished in between
-@@ -927,6 +951,8 @@ class Scheduler(SchedulerInterface):
+@@ -927,6 +953,8 @@ class Scheduler(SchedulerInterface):
      def add_request(self, request: Request) -> None:
          self.waiting.add_request(request)
          self.requests[request.request_id] = request
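
Read together, the scheduler hunks above accumulate a per-request count of externally computed tokens while the WAITING queue is scanned, then hand the map to SchedulerOutput. A condensed, runnable sketch of that flow; Request and StubConnector are illustrative stand-ins (vllm's connector does expose get_num_new_matched_tokens, but the fixed return values here are assumptions):

from dataclasses import dataclass

@dataclass
class Request:
    request_id: str

class StubConnector:
    def get_num_new_matched_tokens(self, request, num_new_local_computed_tokens):
        # Pretend the external KV store matched 16 tokens, no async load.
        return 16, False

connector = StubConnector()
num_external_computed_tokens_per_req: dict[str, int] = {}

for request in [Request("req-0"), Request("req-1")]:  # abbreviated WAITING scan
    num_external_computed_tokens, load_kv_async = (
        connector.get_num_new_matched_tokens(request, 0))
    # Same bookkeeping the patch adds before computing total tokens.
    num_external_computed_tokens_per_req[request.request_id] = (
        num_external_computed_tokens)

print(num_external_computed_tokens_per_req)  # {'req-0': 16, 'req-1': 16}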
@@ -560,7 +562,7 @@ index fe552db74..0d8a67eba 100644
          if self.log_stats:
              request.record_event(EngineCoreEventType.QUEUED)
  
-@@ -976,6 +1002,8 @@ class Scheduler(SchedulerInterface):
+@@ -976,6 +1004,8 @@ class Scheduler(SchedulerInterface):
  
      def _free_request(self, request: Request) -> Optional[dict[str, Any]]:
          assert request.is_finished()
@@ -601,7 +603,7 @@ index 8f4e8d64c..f45e39f5c 100644
          for i, block_table in enumerate(self.block_tables):
              block_table.add_row(block_ids[i], row_idx)
 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
-index 5a26e88db..6a39240d2 100644
+index 5a26e88db..41544a077 100644
 --- a/vllm/v1/worker/gpu_model_runner.py
 +++ b/vllm/v1/worker/gpu_model_runner.py
 @@ -15,6 +15,7 @@ import torch.nn as nn
@@ -622,15 +624,17 @@ index 5a26e88db..6a39240d2 100644
  if TYPE_CHECKING:
      import xgrammar as xgr
      import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile  # noqa: E501
-@@ -365,6 +369,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -364,7 +368,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+         new/resumed/paused/finished request in the batch.
          """
          # Remove finished requests from the cached states.
++        self.ucm_sparse_update_states(scheduler_output)
          for req_id in scheduler_output.finished_req_ids:
 +            self.ucm_sparse_request_finished_in_worker(req_id)
              self.requests.pop(req_id, None)
              self.encoder_cache.pop(req_id, None)
          # Remove the finished requests from the persistent batch.
-@@ -468,11 +473,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -468,11 +474,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
          # Update the states of the running/resumed requests.
          is_last_rank = get_pp_group().is_last_rank
          req_data = scheduler_output.scheduled_cached_reqs
@@ -644,7 +648,7 @@ index 5a26e88db..6a39240d2 100644
  
              # Update the cached states.
              req_state.num_computed_tokens = num_computed_tokens
-@@ -494,15 +501,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -494,15 +502,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                          new_token_ids[-num_new_tokens:])
  
              # Update the block IDs.
@@ -666,7 +670,7 @@ index 5a26e88db..6a39240d2 100644
  
          req_index = self.input_batch.req_id_to_index.get(req_id)
          if req_index is None:
-@@ -515,6 +522,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -515,6 +523,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
              # Update the persistent batch.
              self.input_batch.num_computed_tokens_cpu[req_index] = (
                  num_computed_tokens)
@@ -675,7 +679,7 @@ index 5a26e88db..6a39240d2 100644
              self.input_batch.block_table.append_row(new_block_ids, req_index)
  
          # For the last rank, we don't need to update the token_ids_cpu
-@@ -623,6 +632,19 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -623,6 +633,19 @@ class GPUModelRunner(LoRAModelRunnerMixin):
          if self.uses_mrope:
              self._calc_mrope_positions(scheduler_output)
  
@@ -695,7 +699,7 @@ index 5a26e88db..6a39240d2 100644
          # Get token indices.
          # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
          # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
-@@ -652,11 +674,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -652,11 +675,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
          # block_size.
          block_table_indices = (
              req_indices * block_table.max_num_blocks_per_req +
@@ -709,7 +713,7 @@ index 5a26e88db..6a39240d2 100644
          np.add(
              block_numbers * block_size,
              block_offsets,
-@@ -666,9 +688,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -666,9 +689,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
          self.query_start_loc_np[0] = 0
          self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
  
@@ -724,7 +728,7 @@ index 5a26e88db..6a39240d2 100644
  
          # Copy the tensors to the GPU.
          self.input_ids[:total_num_scheduled_tokens].copy_(
-@@ -680,6 +704,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -680,6 +705,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
              non_blocking=True)
          else:
              # Common case (1D positions)
@@ -733,15 +737,15 @@ index 5a26e88db..6a39240d2 100644
          self.positions[:total_num_scheduled_tokens].copy_(
              self.positions_cpu[:total_num_scheduled_tokens],
              non_blocking=True)
-@@ -1370,6 +1396,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -1370,6 +1397,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
              skip_cuda_graphs=skip_cuda_graphs,
          ):
              self.maybe_setup_kv_connector(scheduler_output)
 +            self.maybe_execute_ucm_sparse_begin(scheduler_output, attn_metadata)
  
              model_output = self.model(
                  input_ids=input_ids,
-@@ -1379,6 +1406,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -1379,6 +1407,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
              )
  
              self.maybe_wait_for_kv_save()
@@ -750,7 +754,7 @@ index 5a26e88db..6a39240d2 100644
          finished_sending, finished_recving = (
              self.get_finished_kv_transfers(scheduler_output))
  
-@@ -1723,6 +1752,30 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -1723,6 +1753,36 @@ class GPUModelRunner(LoRAModelRunnerMixin):
          if has_kv_transfer_group():
              get_kv_transfer_group().wait_for_save()
  
@@ -777,11 +781,17 @@ index 5a26e88db..6a39240d2 100644
 +            return
 +        ucm_sparse = get_ucm_sparse()
 +        ucm_sparse.request_finished_in_worker(request_id)
++
++    def ucm_sparse_update_states(self, scheduler_output: "SchedulerOutput"):
++        if not has_ucm_sparse():
++            return
++        ucm_sparse = get_ucm_sparse()
++        ucm_sparse.update_states(scheduler_output)
 +
      @staticmethod
      def get_finished_kv_transfers(
          scheduler_output: "SchedulerOutput",
-@@ -2570,6 +2623,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
+@@ -2570,6 +2630,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
          kv_caches = self._reshape_kv_cache_tensors(kv_cache_config,
                                                     kv_cache_raw_tensors)
  
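
All of the worker-side hooks this patch adds (ucm_sparse_update_states, ucm_sparse_request_finished_in_worker, maybe_execute_ucm_sparse_begin) share one guard pattern: return immediately unless a UCM sparse algorithm is registered. A self-contained sketch of that pattern; the module-level _UCM_SPARSE registry and StubRunner are illustrative stand-ins for ucm.sparse.state and GPUModelRunner:

# Stub registry mirroring ucm.sparse.state.{has_ucm_sparse, get_ucm_sparse}.
_UCM_SPARSE = None  # set to a sparse-algorithm object when UCM sparse is enabled

def has_ucm_sparse() -> bool:
    return _UCM_SPARSE is not None

def get_ucm_sparse():
    assert _UCM_SPARSE is not None, "no UCM sparse algorithm registered"
    return _UCM_SPARSE

class StubRunner:
    # Mirrors the patched GPUModelRunner hook: a no-op when sparse is off,
    # otherwise the scheduler output is forwarded to the sparse algorithm.
    def ucm_sparse_update_states(self, scheduler_output) -> None:
        if not has_ucm_sparse():
            return
        get_ucm_sparse().update_states(scheduler_output)

StubRunner().ucm_sparse_update_states(scheduler_output=None)  # no-op: sparse unset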