fix: mooncake buffer registration, CUDA tensor put, and merge_contiguous_memory naming

0oshowero0 · 0oshowero0 · commit 20d3ba7ac929 · 2026-04-25T12:30:53.000+08:00
Signed-off-by: 0oshowero0 <o0shower0o@outlook.com>
diff --git a/transfer_queue/storage/clients/mooncake_client.py b/transfer_queue/storage/clients/mooncake_client.py
@@ -24,7 +24,7 @@
 
 from transfer_queue.storage.clients.base import TransferQueueStorageKVClient
 from transfer_queue.storage.clients.factory import StorageClientFactory
-from transfer_queue.utils.tensor_utils import allocate_empty_tensors, get_nbytes, merge_continues_memory
+from transfer_queue.utils.tensor_utils import allocate_empty_tensors, get_nbytes, merge_contiguous_memory
 
 logger = logging.getLogger(__name__)
 logger.setLevel(os.getenv("TQ_LOGGING_LEVEL", logging.WARNING))
@@ -145,7 +145,7 @@ def _put_tensors_thread_worker(self, batch_keys: list[str], batch_tensors: list[
         """Worker thread for putting batch of tensors to MooncakeStore."""
 
         batch_ptrs, batch_sizes, contiguous_tensors = self._preprocess_tensors_for_put(batch_tensors)
-        batch_ptr_reduced, batch_sizes_reduced = merge_continues_memory(batch_ptrs, batch_sizes)
+        batch_ptr_reduced, batch_sizes_reduced = merge_contiguous_memory(batch_ptrs, batch_sizes)
         self._register_all_buffers(batch_ptr_reduced, batch_sizes_reduced)
 
         try:
@@ -159,14 +159,14 @@ def _put_tensors_thread_worker(self, batch_keys: list[str], batch_tensors: list[
         finally:
             self._unregister_all_buffers(batch_ptr_reduced)
 
-    def _put_bytes_thread_worker(self, batch_keys: list[str], batch_values: list[bytes]):
+    def _put_bytes_thread_worker(self, batch_keys: list[str], batch_values: list[Any]):
         """Worker thread for putting batch of non-tensors to MooncakeStore."""
 
         batch_values = [pickle.dumps(v, protocol=pickle.HIGHEST_PROTOCOL) for v in batch_values]
 
         ret = self._store.upsert_batch(batch_keys, batch_values, self.replica_config)
         if ret != 0:
-            raise RuntimeError(f"put_batch failed with error code: {ret}")
+            raise RuntimeError(f"upsert_batch failed with error code: {ret}")
 
     def get(
         self,
@@ -232,9 +232,11 @@ def _get_tensors_thread_worker(
         self, batch_keys: list[str], batch_shapes: list[tuple], batch_dtypes: list[torch.dtype], indexes: list[int]
     ) -> tuple[list[Tensor], list[int]]:
         batch_nbytes = get_nbytes(batch_dtypes, batch_shapes)
-        batch_buffer_tensors, batch_buffer_ptrs = allocate_empty_tensors(batch_dtypes, batch_shapes)
+        batch_buffer_tensors, batch_buffer_ptrs, region_ptrs, region_sizes = allocate_empty_tensors(
+            batch_dtypes, batch_shapes
+        )
 
-        self._register_all_buffers(batch_buffer_ptrs, batch_nbytes)
+        self._register_all_buffers(region_ptrs, region_sizes)
         try:
             ret_codes = self._store.batch_get_into(batch_keys, batch_buffer_ptrs, batch_nbytes)
             if len(ret_codes) != len(batch_keys):
@@ -243,7 +245,7 @@ def _get_tensors_thread_worker(
                 if ret < 0:
                     raise RuntimeError(f"batch_get_into failed for key `{batch_keys[i]}` with error code: {ret}")
         finally:
-            self._unregister_all_buffers(batch_buffer_ptrs)
+            self._unregister_all_buffers(region_ptrs)
 
         return batch_buffer_tensors, indexes
 
@@ -283,6 +285,11 @@ def _preprocess_tensors_for_put(values: list[Tensor]) -> tuple[list[Any], list[A
         size_list = []
         tensor_list = []  # hold reference for the contiguous tensor
         for t in values:
+            # TODO: support GPU Direct RDMA and use a separate data path for it.
+            #       For now, GPU tensors are copied to the CPU first, since the
+            #       registration overhead is much higher for GPU than for CPU.
+            if t.device.type == "cuda":
+                t = t.cpu()
             t = t.contiguous()
             tensor_list.append(t)
             ptr_list.append(t.data_ptr())
diff --git a/transfer_queue/utils/tensor_utils.py b/transfer_queue/utils/tensor_utils.py
@@ -16,6 +16,7 @@
 import logging
 import operator
 import os
+import warnings
 from functools import reduce
 
 import torch
@@ -25,7 +26,9 @@
 logger.setLevel(os.getenv("TQ_LOGGING_LEVEL", logging.WARNING))
 
 
-def allocate_empty_tensors(dtypes: list[torch.dtype], shapes: list[tuple]) -> tuple[list[Tensor], list[int]]:
+def allocate_empty_tensors(
+    dtypes: list[torch.dtype], shapes: list[tuple]
+) -> tuple[list[Tensor], list[int], list[int], list[int]]:
     """Allocate empty tensors, grouping same dtypes into shared memory blocks.
 
     Instead of allocating each tensor separately, this function groups tensors
@@ -40,17 +43,19 @@ def allocate_empty_tensors(dtypes: list[torch.dtype], shapes: list[tuple]) -> tu
         A tuple containing:
             - List of tensors sharing memory within their dtype groups.
             - List of memory pointers (data_ptr) for each tensor.
+            - List of base pointers for each allocated memory region (one per dtype).
+            - List of total bytes for each allocated memory region (one per dtype).
 
     Example:
         >>> dtypes = [torch.float32, torch.float32, torch.int32, torch.float32]
         >>> shapes = [(10,), (20,), (5,), (15,)]
-        >>> tensors, ptrs = allocate_empty_tensors(dtypes, shapes)
+        >>> tensors, ptrs, region_ptrs, region_sizes = allocate_empty_tensors(dtypes, shapes)
         >>> # tensors[0], [1], [3] share the same dtype and memory block
     """
     assert len(dtypes) == len(shapes), "dtypes and shapes must have the same length"
 
     if len(dtypes) == 0:
-        return [], []
+        return [], [], [], []
 
     # Group indices by dtype
     dtype_groups: dict[torch.dtype, list[int]] = {}
@@ -61,6 +66,8 @@ def allocate_empty_tensors(dtypes: list[torch.dtype], shapes: list[tuple]) -> tu
 
     tensor_list = [torch.empty(()) for _ in range(len(dtypes))]
     ptr_list = [0] * len(dtypes)
+    region_ptrs: list[int] = []
+    region_sizes: list[int] = []
 
     # For each dtype group, allocate one big tensor and create views
     for dtype, indices in dtype_groups.items():
@@ -69,13 +76,15 @@ def allocate_empty_tensors(dtypes: list[torch.dtype], shapes: list[tuple]) -> tu
         shape_info = []  # Store (index, shape, num_elements, offset)
 
         for idx in indices:
-            shape = shapes[idx]
-            num_elements = reduce(operator.mul, shape)
+            shape = tuple(shapes[idx])
+            num_elements = reduce(operator.mul, shape, 1)
             shape_info.append((idx, shape, num_elements, total_elements))
             total_elements += num_elements
 
         # Allocate one big contiguous memory block for this dtype
         big_tensor = torch.empty(total_elements, dtype=dtype)
+        region_ptrs.append(big_tensor.data_ptr())
+        region_sizes.append(big_tensor.nbytes)
 
         # Create views into the big tensor for each small tensor
         for idx, shape, num_elements, offset in shape_info:
@@ -84,7 +93,7 @@ def allocate_empty_tensors(dtypes: list[torch.dtype], shapes: list[tuple]) -> tu
             tensor_list[idx] = small_tensor
             ptr_list[idx] = small_tensor.data_ptr()
 
-    return tensor_list, ptr_list
+    return tensor_list, ptr_list, region_ptrs, region_sizes
 
 
 def compute_stride(shape: tuple[int, ...]) -> tuple[int, ...]:
@@ -115,36 +124,37 @@ def get_nbytes(dtypes, shapes) -> list[int]:
     nbytes = []
     for i in range(len(dtypes)):
         elem_size = torch.tensor([], dtype=dtypes[i]).element_size()
-        numel = reduce(operator.mul, shapes[i])
+        shape = tuple(shapes[i])
+        numel = reduce(operator.mul, shape, 1)
         nbytes.append(elem_size * numel)
 
     return nbytes
 
 
-def merge_continues_memory(ptrs: list[int], sizes: list[int]) -> tuple[list[int], list[int]]:
-    """Merge continuous memory regions to reduce register_buffer overhead
+def merge_contiguous_memory(ptrs: list[int], sizes: list[int]) -> tuple[list[int], list[int]]:
+    """Merge contiguous memory regions to reduce register_buffer overhead.
 
     Args:
         ptrs: List of memory pointers (starting addresses).
         sizes: List of memory region sizes corresponding to each pointer.
 
     Returns:
-        A tuple of (merged_ptrs, merged_sizes) where continuous regions
+        A tuple of (merged_ptrs, merged_sizes) where contiguous regions
         have been merged into single regions.
 
     Example:
-        >>> merge_continues_memory([0, 10, 30], [10, 20, 10])
+        >>> merge_contiguous_memory([0, 10, 30], [10, 20, 10])
         ([0, 30], [30, 10])
 
-        >>> merge_continues_memory([0, 5, 20], [5, 5, 10])
+        >>> merge_contiguous_memory([0, 5, 20], [5, 5, 10])
         ([0, 20], [10, 10])
     """
-    if not ptrs or not sizes:
-        return [], []
-
     if len(ptrs) != len(sizes):
         raise ValueError("ptrs and sizes must have the same length")
 
+    if not ptrs:
+        return [], []
+
     # Create list of (ptr, size) pairs and sort by pointer address
     regions = sorted(zip(ptrs, sizes, strict=False), key=lambda x: x[0])
 
@@ -171,3 +181,13 @@ def merge_continues_memory(ptrs: list[int], sizes: list[int]) -> tuple[list[int]
     merged_sizes.append(current_size)
 
     return merged_ptrs, merged_sizes
+
+
+def merge_continues_memory(ptrs: list[int], sizes: list[int]) -> tuple[list[int], list[int]]:
+    """Deprecated alias for :func:`merge_contiguous_memory`."""
+    warnings.warn(
+        "merge_continues_memory is deprecated, use merge_contiguous_memory instead",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return merge_contiguous_memory(ptrs, sizes)