Skip to content

Commit 2cc678f

Browse files
[gsaonDevice] bugfix & optimization
1 parent b1e95c6 commit 2cc678f

3 files changed

Lines changed: 356 additions & 69 deletions

File tree

examples/offline_inference_kvcomphbm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def build_llm_with_uc(module_path: str, name: str, model: str):
7777
},
7878
}
7979
],
80-
"ucm_sparse_config": {"KvCompOnDevice": {}},
80+
"ucm_sparse_config": {"GSAOnDevice": {}},
8181
},
8282
)
8383

ucm/sparse/base.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,77 @@ class UcmSparseMetadata(ABC): # noqa: B024
5353
pass
5454

5555

56+
class UcmSparseCpuGpuBuffer:
    """Buffer to easily copy tensors between CPU and GPU. Inferred by vLLM.

    Holds a pinned CPU tensor, a device-side mirror of the same shape, and
    (optionally) a NumPy view that shares storage with the CPU tensor. The
    first dimension acts as an append buffer: ``self.n`` tracks how many
    rows are currently valid.

    Args:
        *size: Tensor shape (ints or ``torch.SymInt``), as for ``torch.zeros``.
        dtype: Element dtype for both CPU and GPU tensors.
        device: Target device for the GPU-side tensor.
        pin_memory: Allocate the CPU tensor in pinned (page-locked) memory
            so the non-blocking copies below can be truly asynchronous.
        with_numpy: Also expose the CPU tensor as a NumPy array view.

    Raises:
        ValueError: If ``with_numpy=True`` with ``dtype=torch.bfloat16``
            (bfloat16 has no NumPy equivalent).
    """

    def __init__(
        self,
        *size: Union[int, torch.SymInt],
        dtype: torch.dtype,
        device: torch.device,
        pin_memory: bool = True,
        with_numpy: bool = True,
    ) -> None:
        self.cpu = torch.zeros(*size, dtype=dtype, device="cpu", pin_memory=pin_memory)
        self.gpu = self.cpu.to(device)
        # Initialize to None so append_numpy's guard fails with the intended
        # AssertionError (not AttributeError) when with_numpy=False.
        self.np: Optional[np.ndarray] = None
        # Number of valid rows currently stored along dim 0.
        self.n = 0

        if with_numpy:
            if dtype == torch.bfloat16:
                raise ValueError(
                    "Bfloat16 torch tensors cannot be directly cast to a "
                    "numpy array, so call UcmSparseCpuGpuBuffer with with_numpy=False"
                )
            # Zero-copy view: writes through self.np land in self.cpu.
            self.np = self.cpu.numpy()

    def copy_to_gpu(self, n: Optional[int] = None) -> None:
        """Copy the first ``n`` rows (default: ``self.n``) from CPU to GPU.

        Non-blocking; the caller is responsible for any needed stream
        synchronization before reading the GPU tensor.
        """
        # TODO: replace with esa_copy
        if n is None:
            n = self.n
        if n <= 0:
            return
        self.gpu[:n].copy_(self.cpu[:n], non_blocking=True)

    def copy_to_cpu(self, n: Optional[int] = None) -> None:
        """Copy the first ``n`` rows (default: ``self.n``) from GPU to CPU.

        NOTE: Because this method is non-blocking, explicit synchronization
        is needed to ensure the data is copied to CPU.
        """
        # TODO: replace with esa_copy
        if n is None:
            n = self.n
        if n <= 0:
            return
        self.cpu[:n].copy_(self.gpu[:n], non_blocking=True)

    def append_numpy(self, data: List[Any]) -> None:
        """Append ``data`` after the current valid region of the CPU buffer.

        Requires the buffer to have been built with ``with_numpy=True``.
        """
        size = len(data)
        assert (
            self.np is not None
        ), "append_numpy need to be initialized by with_numpy=True."
        # <= (not <): appending may fill the buffer exactly to capacity.
        assert self.n + size <= self.cpu.shape[0], "append_numpy data out of range."
        self.np[self.n : self.n + size] = data
        self.n += size

    def clear(self) -> None:
        """Reset the valid-row counter; the underlying storage is kept."""
        self.n = 0

    @property
    def size(self) -> int:
        """Number of valid rows currently stored."""
        return self.n

    @property
    def valid_np(self) -> np.ndarray:
        """NumPy view of the valid rows (shares storage with ``cpu``)."""
        return self.np[: self.n]

    @property
    def valid_cpu(self) -> torch.Tensor:
        """CPU tensor view of the valid rows."""
        return self.cpu[: self.n]

    @property
    def valid_gpu(self) -> torch.Tensor:
        """GPU tensor view of the valid rows."""
        return self.gpu[: self.n]
56127
class UcmSparseBase(ABC):
57128
"""
58129
An general interface for impl sparse attention algorithm in vLLM

0 commit comments

Comments
 (0)