1- From 8cb493f9ece884cbc2ba71e367bed2b4116ae1b3 Mon Sep 17 00:00:00 2001
1+ From d24997f66572362fa3a46ee76f5af65e027b856e Mon Sep 17 00:00:00 2001
22From: wenxinwang <wangwenxin21@huawei.com>
3- Date: Tue, 23 Dec 2025 19:44:21 -0800
4- Subject: [PATCH] kvcomp qwen deepseek
3+ Date: Mon, 19 Jan 2026 19:26:05 -0800
4+ Subject: [PATCH] update patch for gsaondevice + sparse + cache blend
55
66---
7- vllm/attention/layer.py | 63 ++++++++++++++++-
7+ vllm/attention/layer.py | 63 +++++++++++++++-
88 vllm/model_executor/models/llama.py | 21 +++++-
9- vllm/model_executor/models/qwen2.py | 23 ++++++-
10- vllm/v1/attention/backends/flash_attn.py | 7 ++
9+ vllm/model_executor/models/qwen2.py | 23 +++++-
1110 vllm/v1/attention/backends/mla/common.py | 15 +++-
1211 vllm/v1/attention/backends/mla/flashmla.py | 18 ++++-
1312 vllm/v1/core/kv_cache_manager.py | 7 +-
1413 vllm/v1/core/kv_cache_utils.py | 13 ++++
15- vllm/v1/core/sched/output.py | 3 +
16- vllm/v1/core/sched/scheduler.py | 30 +++++++-
14+ vllm/v1/core/sched/output.py | 7 +-
15+ vllm/v1/core/sched/scheduler.py | 34 ++++++++-
1716 vllm/v1/worker/block_table.py | 13 ++++
18- vllm/v1/worker/gpu_model_runner.py | 80 +++++++++++++++++++---
17+ vllm/v1/worker/gpu_model_runner.py | 87 +++++++++++++++++++---
1918 vllm/v1/worker/gpu_worker.py | 2 +
20- 13 files changed, 275 insertions(+), 20 deletions(-)
19+ 12 files changed, 281 insertions(+), 22 deletions(-)
2120
2221diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
2322index f0ad68b16..ba93960de 100644
@@ -237,31 +236,6 @@ index 7ef9d248d..e35ab2fdc 100644
237236 if not get_pp_group().is_last_rank:
238237 return IntermediateTensors({
239238 "hidden_states": hidden_states,
240- diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
241- index fbc13c06c..2b2244949 100755
242- --- a/vllm/v1/attention/backends/flash_attn.py
243- +++ b/vllm/v1/attention/backends/flash_attn.py
244- @@ -16,6 +16,8 @@ from vllm.attention.ops.merge_attn_states import merge_attn_states
245- from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8,
246- get_flash_attn_version,
247- is_flash_attn_varlen_func_available)
248- + from ucm.sparse.state import get_ucm_sparse, has_ucm_sparse
249- + import os
250-
251- if is_flash_attn_varlen_func_available():
252- from vllm.attention.utils.fa_utils import (flash_attn_varlen_func,
253- @@ -221,6 +223,11 @@ class FlashAttentionMetadataBuilder(
254- block_table = self.block_table
255- block_table_tensor = block_table.get_device_tensor()[:num_reqs]
256-
257- + if has_ucm_sparse():
258- + ucm_sparse = get_ucm_sparse()
259- + if os.getenv("VLLM_HASH_ATTENTION") == "1":
260- + decode_mask, topk_seq_lens = ucm_sparse.build_decode_attention_meta(query_start_loc, seq_lens, block_table_tensor)
261- +
262- block_table.slot_mapping[:num_actual_tokens].copy_(
263- block_table.slot_mapping_cpu[:num_actual_tokens],
264- non_blocking=True)
265239diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
266240index f2aaf59a4..439bb9b14 100644
267241--- a/vllm/v1/attention/backends/mla/common.py
@@ -454,18 +428,29 @@ index 2fbcb569e..40c199563 100644
454428 # All layers have the same KV cache spec, so we create one kv cache group
455429 # for all layers.
456430diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
457- index d34f39327..141d750b3 100644
431+ index d34f39327..0f60ac77d 100644
458432--- a/vllm/v1/core/sched/output.py
459433+++ b/vllm/v1/core/sched/output.py
460- @@ -155,3 +155,6 @@ class SchedulerOutput:
434+ @@ -3,7 +3,7 @@
435+
436+ from __future__ import annotations
437+
438+ - from dataclasses import dataclass
439+ + from dataclasses import dataclass, field
440+ from typing import TYPE_CHECKING, Optional
441+
442+ if TYPE_CHECKING:
443+ @@ -155,3 +155,8 @@ class SchedulerOutput:
461444
462445 # KV Cache Connector metadata.
463446 kv_connector_metadata: Optional[KVConnectorMetadata] = None
464447+
465448+ # modified slots by sparse algorithm
466449+ req_sparsed_slots: dict[str, int] = None
450+ + # The number of tokens computed externally for each request
451+ + num_external_computed_tokens_per_req: dict[str, int] = field(default_factory=dict)
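For reference, a minimal standalone sketch of the two fields this hunk adds to SchedulerOutput. The field names, comments, and defaults are copied from the hunk above; every other field of the real dataclass is elided, and the usage lines at the end are purely illustrative.

from __future__ import annotations

from dataclasses import dataclass, field


@dataclass
class SchedulerOutput:
    # ... existing scheduling fields elided ...

    # modified slots by sparse algorithm (the None default mirrors the hunk)
    req_sparsed_slots: dict[str, int] = None

    # The number of tokens computed externally for each request
    num_external_computed_tokens_per_req: dict[str, int] = field(
        default_factory=dict)


# Illustrative usage only.
out = SchedulerOutput()
out.num_external_computed_tokens_per_req["req-0"] = 128
print(out.req_sparsed_slots, out.num_external_computed_tokens_per_req)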
467452diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
468- index fe552db74..0d8a67eba 100644
453+ index fe552db74..7d98745c8 100644
469454--- a/vllm/v1/core/sched/scheduler.py
470455+++ b/vllm/v1/core/sched/scheduler.py
471456@@ -34,6 +34,10 @@ from vllm.v1.outputs import ModelRunnerOutput
@@ -524,7 +509,14 @@ index fe552db74..0d8a67eba 100644
524509 if new_blocks is None:
525510 # The request cannot be scheduled.
526511 # Preempt the lowest-priority request.
527- @@ -337,6 +355,10 @@ class Scheduler(SchedulerInterface):
512+ @@ -331,12 +349,17 @@ class Scheduler(SchedulerInterface):
513+ skipped_waiting_requests = create_request_queue(self.policy)
514+
515+ # Next, schedule the WAITING requests.
516+ + num_external_computed_tokens_per_req: dict[str, int] = {}
517+ if not preempted_reqs:
518+ while self.waiting and token_budget > 0:
519+ if len(self.running) == self.max_num_running_reqs:
528520 break
529521
530522 request = self.waiting.peek_request()
@@ -535,23 +527,33 @@ index fe552db74..0d8a67eba 100644
535527
536528 # KVTransfer: skip request if still waiting for remote kvs.
537529 if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
538- @@ -446,6 +468,7 @@ class Scheduler(SchedulerInterface):
530+ @@ -387,7 +410,7 @@ class Scheduler(SchedulerInterface):
531+ num_external_computed_tokens, load_kv_async = (
532+ self.connector.get_num_new_matched_tokens(
533+ request, num_new_local_computed_tokens))
534+ -
535+ + num_external_computed_tokens_per_req.update({request.request_id: num_external_computed_tokens})
536+ # Total computed tokens (local + external).
537+ num_computed_tokens = (num_new_local_computed_tokens +
538+ num_external_computed_tokens)
539+ @@ -446,6 +469,7 @@ class Scheduler(SchedulerInterface):
539540 new_computed_blocks,
540541 num_lookahead_tokens=self.num_lookahead_tokens,
541542 delay_cache_blocks=load_kv_async,
542543+ num_slots_sparsed=num_slots_sparsed
543544 )
544545 if new_blocks is None:
545546 # The request cannot be scheduled.
546- @@ -559,6 +582,7 @@ class Scheduler(SchedulerInterface):
547+ @@ -559,6 +583,8 @@ class Scheduler(SchedulerInterface):
547548 scheduled_spec_decode_tokens=scheduled_spec_decode_tokens,
548549 scheduled_encoder_inputs=scheduled_encoder_inputs,
549550 num_common_prefix_blocks=num_common_prefix_blocks,
550551+ req_sparsed_slots=req_sparsed_slots,
552+ + num_external_computed_tokens_per_req = num_external_computed_tokens_per_req,
551553 # finished_req_ids is an existing state in the scheduler,
552554 # instead of being newly scheduled in this step.
553555 # It contains the request IDs that are finished in between
554- @@ -927,6 +951,8 @@ class Scheduler(SchedulerInterface):
556+ @@ -927,6 +953,8 @@ class Scheduler(SchedulerInterface):
555557 def add_request(self, request: Request) -> None:
556558 self.waiting.add_request(request)
557559 self.requests[request.request_id] = request
@@ -560,7 +562,7 @@ index fe552db74..0d8a67eba 100644
560562 if self.log_stats:
561563 request.record_event(EngineCoreEventType.QUEUED)
562564
563- @@ -976,6 +1002,8 @@ class Scheduler(SchedulerInterface):
565+ @@ -976,6 +1004,8 @@ class Scheduler(SchedulerInterface):
564566
565567 def _free_request(self, request: Request) -> Optional[dict[str, Any]]:
566568 assert request.is_finished()
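The scheduler hunks above amount to one piece of bookkeeping: while the waiting queue is walked, the external-token count reported by the KV connector is recorded per request and later forwarded to SchedulerOutput as num_external_computed_tokens_per_req (alongside req_sparsed_slots from the sparse algorithm). A standalone sketch of that flow with the connector replaced by a stub; get_num_new_matched_tokens is the connector call used in the hunk, while the stub's return values and request ids are made up.

def get_num_new_matched_tokens_stub(request_id: str,
                                    num_new_local_computed_tokens: int
                                    ) -> tuple[int, bool]:
    # Stand-in for self.connector.get_num_new_matched_tokens(request, ...):
    # pretend 64 tokens per request are already computed externally and
    # no asynchronous KV load is pending.
    return 64, False


def collect_external_tokens(waiting: list[str]) -> dict[str, int]:
    num_external_computed_tokens_per_req: dict[str, int] = {}
    for request_id in waiting:
        num_external, _load_kv_async = get_num_new_matched_tokens_stub(
            request_id, num_new_local_computed_tokens=0)
        num_external_computed_tokens_per_req[request_id] = num_external
    # In the patch this dict is handed to SchedulerOutput.
    return num_external_computed_tokens_per_req


print(collect_external_tokens(["req-0", "req-1"]))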
@@ -601,7 +603,7 @@ index 8f4e8d64c..f45e39f5c 100644
601603 for i, block_table in enumerate(self.block_tables):
602604 block_table.add_row(block_ids[i], row_idx)
603605diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
604- index 5a26e88db..6a39240d2 100644
606+ index 5a26e88db..41544a077 100644
605607--- a/vllm/v1/worker/gpu_model_runner.py
606608+++ b/vllm/v1/worker/gpu_model_runner.py
607609@@ -15,6 +15,7 @@ import torch.nn as nn
@@ -622,15 +624,17 @@ index 5a26e88db..6a39240d2 100644
622624 if TYPE_CHECKING:
623625 import xgrammar as xgr
624626 import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile # noqa: E501
625- @@ -365,6 +369,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
627+ @@ -364,7 +368,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
628+ new/resumed/paused/finished request in the batch.
626629 """
627630 # Remove finished requests from the cached states.
631+ + self.ucm_sparse_update_states(scheduler_output)
628632 for req_id in scheduler_output.finished_req_ids:
629633+ self.ucm_sparse_request_finished_in_worker(req_id)
630634 self.requests.pop(req_id, None)
631635 self.encoder_cache.pop(req_id, None)
632636 # Remove the finished requests from the persistent batch.
633- @@ -468,11 +473,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
637+ @@ -468,11 +474,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
634638 # Update the states of the running/resumed requests.
635639 is_last_rank = get_pp_group().is_last_rank
636640 req_data = scheduler_output.scheduled_cached_reqs
@@ -644,7 +648,7 @@ index 5a26e88db..6a39240d2 100644
644648
645649 # Update the cached states.
646650 req_state.num_computed_tokens = num_computed_tokens
647- @@ -494,15 +501,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
651+ @@ -494,15 +502,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
648652 new_token_ids[-num_new_tokens:])
649653
650654 # Update the block IDs.
@@ -666,7 +670,7 @@ index 5a26e88db..6a39240d2 100644
666670
667671 req_index = self.input_batch.req_id_to_index.get(req_id)
668672 if req_index is None:
669- @@ -515,6 +522,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
673+ @@ -515,6 +523,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
670674 # Update the persistent batch.
671675 self.input_batch.num_computed_tokens_cpu[req_index] = (
672676 num_computed_tokens)
@@ -675,7 +679,7 @@ index 5a26e88db..6a39240d2 100644
675679 self.input_batch.block_table.append_row(new_block_ids, req_index)
676680
677681 # For the last rank, we don't need to update the token_ids_cpu
678- @@ -623,6 +632,19 @@ class GPUModelRunner(LoRAModelRunnerMixin):
682+ @@ -623,6 +633,19 @@ class GPUModelRunner(LoRAModelRunnerMixin):
679683 if self.uses_mrope:
680684 self._calc_mrope_positions(scheduler_output)
681685
@@ -695,7 +699,7 @@ index 5a26e88db..6a39240d2 100644
695699 # Get token indices.
696700 # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
697701 # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2]
698- @@ -652,11 +674,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
702+ @@ -652,11 +675,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
699703 # block_size.
700704 block_table_indices = (
701705 req_indices * block_table.max_num_blocks_per_req +
@@ -709,7 +713,7 @@ index 5a26e88db..6a39240d2 100644
709713 np.add(
710714 block_numbers * block_size,
711715 block_offsets,
712- @@ -666,9 +688,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
716+ @@ -666,9 +689,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
713717 self.query_start_loc_np[0] = 0
714718 self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
715719
@@ -724,7 +728,7 @@ index 5a26e88db..6a39240d2 100644
724728
725729 # Copy the tensors to the GPU.
726730 self.input_ids[:total_num_scheduled_tokens].copy_(
727- @@ -680,6 +704,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
731+ @@ -680,6 +705,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
728732 non_blocking=True)
729733 else:
730734 # Common case (1D positions)
@@ -733,15 +737,15 @@ index 5a26e88db..6a39240d2 100644
733737 self.positions[:total_num_scheduled_tokens].copy_(
734738 self.positions_cpu[:total_num_scheduled_tokens],
735739 non_blocking=True)
736- @@ -1370,6 +1396,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
740+ @@ -1370,6 +1397,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
737741 skip_cuda_graphs=skip_cuda_graphs,
738742 ):
739743 self.maybe_setup_kv_connector(scheduler_output)
740744+ self.maybe_execute_ucm_sparse_begin(scheduler_output, attn_metadata)
741745
742746 model_output = self.model(
743747 input_ids=input_ids,
744- @@ -1379,6 +1406,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
748+ @@ -1379,6 +1407,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
745749 )
746750
747751 self.maybe_wait_for_kv_save()
@@ -750,7 +754,7 @@ index 5a26e88db..6a39240d2 100644
750754 finished_sending, finished_recving = (
751755 self.get_finished_kv_transfers(scheduler_output))
752756
753- @@ -1723,6 +1752,30 @@ class GPUModelRunner(LoRAModelRunnerMixin):
757+ @@ -1723,6 +1753,36 @@ class GPUModelRunner(LoRAModelRunnerMixin):
754758 if has_kv_transfer_group():
755759 get_kv_transfer_group().wait_for_save()
756760
@@ -777,11 +781,17 @@ index 5a26e88db..6a39240d2 100644
777781+ return
778782+ ucm_sparse = get_ucm_sparse()
779783+ ucm_sparse.request_finished_in_worker(request_id)
784+ +
785+ + def ucm_sparse_update_states(self, scheduler_output: "SchedulerOutput"):
786+ + if not has_ucm_sparse():
787+ + return
788+ + ucm_sparse = get_ucm_sparse()
789+ + ucm_sparse.update_states(scheduler_output)
780790+
781791 @staticmethod
782792 def get_finished_kv_transfers(
783793 scheduler_output: "SchedulerOutput",
784- @@ -2570,6 +2623,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
794+ @@ -2570,6 +2630,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
785795 kv_caches = self._reshape_kv_cache_tensors(kv_cache_config,
786796 kv_cache_raw_tensors)
787797