[feat] Introduce Zero-Copy to use YuanrongStorageClient for transmitting CPU Tensors

ascend-robot · ascend-robot · commit b221e2a585ba · 2026-01-26T11:40:19.000+08:00
Co-authored-by: liwenlin<liwenlin8@huawei.com> # message auto-generated for no-merge-commit merge: !10 merge ds_zero_copy into main [feat] Introduce Zero-Copy to use YuanrongStorageClient for transmitting CPU Tensors Created-by: Lexie-7 Commit-by: liwenlin Merged-by: ascend-robot Description: ### Summary When the YuanrongStorageClient backend is used, zero-copy is enabled to improve transmission speed. ### Change 1. Modified `transfer_queue/storage/clients/yuanrong_client.py` to call the zero-copy interface and to perform serialization and packing. 2. Added unit tests for mget and mset: `tests/test_yuanrong_storage_client.py`. ### Testing - Test on CPU: `pytest tests/test_yuanrong_storage_client.py` ### Result When transmitting 512 pieces of data, each 32 MB in size, with a total data volume of 16 GB: End-to-end **Get** took **10s** and the bandwidth was **1.6 GB/s**. The time spent calling the **YuanrongStorageClient** interface was **2.27s** with a bandwidth of **7.05 GB/s**. End-to-end **Put** took **3.42s** and the bandwidth was **4.68 GB/s**. The time spent calling the **YuanrongStorageClient** interface was **3.32s** with a bandwidth of **4.83 GB/s**. ### Related Links - Previous issue: [[Feat]: Try zero-copy serialize objects that can be converted to memoryview](TransferQueue/TransferQueue#147) - Yuanrong Datasystem PR: [https://atomgit.com/openeuler/yuanrong-datasystem/pull/141](https://atomgit.com/openeuler/yuanrong-datasystem/pull/141) See merge request: Ascend/TransferQueue!10
diff --git a/tests/test_yuanrong_storage_manager.py b/tests/test_yuanrong_storage_manager.py
@@ -0,0 +1,92 @@
+# Copyright 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2025 The TransferQueue Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+import torch
+
+parent_dir = Path(__file__).resolve().parent.parent
+sys.path.append(str(parent_dir))
+
+from transfer_queue.storage.clients.yuanrong_client import (  # noqa: E402
+    YuanrongStorageClient,
+)
+
+
+class MockBuffer:
+    """In-process stand-in for a datasystem buffer, backed by a bytearray."""
+
+    def __init__(self, size):
+        self.data = bytearray(size)
+
+    def MutableData(self):  # noqa: N802 - must match the method name the client calls
+        """Return a writable view over the backing bytearray."""
+        return memoryview(self.data)
+
+    def mutable_data(self):
+        """Snake-case accessor kept from the original mock; returns the bytearray itself."""
+        return self.data
+
+
+class TestYuanrongStorageZCopy:
+    """Round-trip test for mset_zcopy / mget_zcopy against a mocked KV client."""
+
+    @pytest.fixture
+    def mock_kv_client(self, mocker):
+        mock_client = MagicMock()
+        mock_client.init.return_value = None
+
+        mocker.patch("yr.datasystem.KVClient", return_value=mock_client)
+        mocker.patch("yr.datasystem.DsTensorClient")
+        mocker.patch("transfer_queue.storage.clients.yuanrong_client.TORCH_NPU_IMPORTED", False)
+
+        return mock_client
+
+    @pytest.fixture
+    def storage_client(self, mock_kv_client):
+        return YuanrongStorageClient({"host": "127.0.0.1", "port": 31501})
+
+    def test_mset_mget_p2p(self, storage_client, mocker):
+        # Stand-ins for the encoder/decoder: one buffer per object.
+        def mock_serialization(obj):
+            if isinstance(obj, torch.Tensor):
+                return [obj.numpy().tobytes()]
+            return [str(obj).encode("utf-8")]
+
+        def mock_deserialization(items):
+            data = items[0]
+            # 12 bytes == three float32 values, i.e. the tensor stored below.
+            if len(data) == 12:
+                return torch.from_numpy(np.frombuffer(data, dtype=np.float32).copy())
+            try:
+                return data.tobytes().decode("utf-8")
+            except UnicodeDecodeError:
+                return data
+
+        # The client calls _encoder.encode / _decoder.decode, so those are the
+        # names that have to be replaced on the client module.
+        client_module = "transfer_queue.storage.clients.yuanrong_client"
+        mocker.patch(f"{client_module}._encoder").encode.side_effect = mock_serialization
+        mocker.patch(f"{client_module}._decoder").decode.side_effect = mock_deserialization
+
+        stored_raw_buffers = []
+
+        def side_effect_mcreate(keys, sizes):
+            buffers = [MockBuffer(size) for size in sizes]
+            for b in buffers:
+                # Views share memory with the MockBuffer, so they see what mset_zcopy packs.
+                stored_raw_buffers.append(b.MutableData())
+            return 0, buffers
+
+        storage_client._cpu_ds_client.mcreate.side_effect = side_effect_mcreate
+        # Same list object that mcreate fills, so get_buffers returns the packed data.
+        storage_client._cpu_ds_client.get_buffers.return_value = (0, stored_raw_buffers)
+
+        keys = ["tensor_key", "string_key"]
+        storage_client.mset_zcopy(keys, [torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32), "hello yuanrong"])
+
+        # Packed size = 4-byte header + one 8-byte entry + payload (12 and 14 bytes).
+        storage_client._cpu_ds_client.mcreate.assert_called_once_with(keys, [24, 26])
+        storage_client._cpu_ds_client.mset_buffer.assert_called_once()
+
+        results = storage_client.mget_zcopy(keys)
+
+        assert torch.allclose(results[0], torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32))
+        assert results[1] == "hello yuanrong"
diff --git a/transfer_queue/storage/clients/yuanrong_client.py b/transfer_queue/storage/clients/yuanrong_client.py
@@ -16,13 +16,18 @@
 import logging
 import os
 import pickle
-from typing import Any, Optional
+import struct
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Optional, TypeAlias
 
 import torch
 from torch import Tensor
 
 from transfer_queue.storage.clients.base import TransferQueueStorageKVClient
 from transfer_queue.storage.clients.factory import StorageClientFactory
+from transfer_queue.utils.serial_utils import _decoder, _encoder
+
+# Byte-like buffer types; used as the element type of the list returned by unpack_from.
+bytestr: TypeAlias = bytes | bytearray | memoryview
 
 logger = logging.getLogger(__name__)
 logger.setLevel(os.getenv("TQ_LOGGING_LEVEL", logging.WARNING))
@@ -31,11 +36,84 @@
 CPU_DS_CLIENT_KEYS_LIMIT: int = 1999
 YUANRONG_DATASYSTEM_IMPORTED: bool = True
 TORCH_NPU_IMPORTED: bool = True
+DS_MAX_WORKERS: int = 16
 try:
     from yr import datasystem
 except ImportError:
     YUANRONG_DATASYSTEM_IMPORTED = False
 
+# Layout constants of the packed buffer used by calc_packed_size, pack_into and unpack_from.
+# Header: number of entries (uint32, little-endian)
+HEADER_FMT = "<I"
+HEADER_SIZE = struct.calcsize(HEADER_FMT)
+# Entry: (payload_offset: uint32, payload_size: uint32), both little-endian.
+# Offsets are measured from the start of the packed buffer, so the uint32 fields
+# cannot describe payload bytes located at or beyond 4 GiB.
+ENTRY_FMT = "<II"
+ENTRY_SIZE = struct.calcsize(ENTRY_FMT)
+
+
+def calc_packed_size(items: list[memoryview]) -> int:
+    """
+    Return the number of bytes pack_into needs to store ``items``.
+
+    The total is the fixed header, one entry-table record per item, and the
+    combined payload bytes of all items.
+
+    Args:
+        items: Memoryviews that are going to be packed.
+
+    Returns:
+        Required buffer size in bytes.
+    """
+    table_bytes = ENTRY_SIZE * len(items)
+    payload_bytes = 0
+    for view in items:
+        payload_bytes += view.nbytes
+    return HEADER_SIZE + table_bytes + payload_bytes
+
+
+def pack_into(target: memoryview, items: list[memoryview]):
+    """
+    Pack multiple contiguous buffers into a single buffer.
+        ┌───────────────┐
+        │ item_count    │  uint32
+        ├───────────────┤
+        │ entries       │  N * item entries
+        ├───────────────┤
+        │ payload blob  │  N * concatenated buffers
+        └───────────────┘
+
+    Args:
+        target (memoryview): A writable memoryview returned by StateValueBuffer.MutableData().
+            It must be large enough to accommodate the total number of bytes of HEADER + ENTRY_TABLE + all items.
+            This buffer is usually mapped to shared memory or Zero-Copy memory area.
+        items (List[memoryview]): List of read-only memory views (e.g., from serialized objects). Each item must support
+            the buffer protocol and be readable as raw bytes.
+
+    Raises:
+        ValueError: If ``target`` is smaller than the packed size of ``items``.
+    """
+    item_count = len(items)
+    payload_offset = HEADER_SIZE + item_count * ENTRY_SIZE
+    required_size = payload_offset + sum(item.nbytes for item in items)
+
+    # Reject an undersized target before anything is written, instead of failing
+    # part-way through with a half-filled buffer.
+    with memoryview(target) as target_view:
+        target_size = target_view.nbytes
+    if target_size < required_size:
+        raise ValueError(f"target buffer too small: need {required_size} bytes, got {target_size}")
+
+    struct.pack_into(HEADER_FMT, target, 0, item_count)
+
+    entry_offset = HEADER_SIZE
+    # Byte-wise tensor view over the target; payloads are copied through torch.
+    target_tensor = torch.frombuffer(target, dtype=torch.uint8)
+
+    for item in items:
+        item_size = item.nbytes
+        struct.pack_into(ENTRY_FMT, target, entry_offset, payload_offset, item_size)
+        # torch.frombuffer refuses zero-length buffers, so an empty item only
+        # gets its entry record and no payload copy.
+        if item_size:
+            src_tensor = torch.frombuffer(item, dtype=torch.uint8)
+            target_tensor[payload_offset : payload_offset + item_size].copy_(src_tensor)
+        entry_offset += ENTRY_SIZE
+        payload_offset += item_size
+
+
+def unpack_from(source: memoryview) -> list[bytestr]:
+    """
+    Unpack multiple contiguous buffers from a single packed buffer.
+
+    The returned items are slices of a memoryview over ``source``; no payload
+    bytes are copied.
+
+    Args:
+        source (memoryview): The packed source buffer, laid out as written by pack_into.
+    Returns:
+        list[bytestr]: List of unpacked contiguous buffers.
+    Raises:
+        ValueError: If the entry table or any payload extends past the end of ``source``.
+        struct.error: If ``source`` is too short to contain the header.
+    """
+    mv = memoryview(source)
+    # NOTE(review): offsets are byte offsets, so this assumes a 1-D byte-oriented buffer — confirm.
+    total_size = len(mv)
+    item_count = struct.unpack_from(HEADER_FMT, mv, 0)[0]
+
+    table_end = HEADER_SIZE + item_count * ENTRY_SIZE
+    if table_end > total_size:
+        raise ValueError(f"packed buffer truncated: entry table needs {table_end} bytes, got {total_size}")
+
+    items = []
+    for offset, length in struct.iter_unpack(ENTRY_FMT, mv[HEADER_SIZE:table_end]):
+        # Slicing a memoryview clamps silently, so check bounds explicitly to
+        # avoid returning a shortened payload.
+        if offset + length > total_size:
+            raise ValueError(
+                f"packed buffer truncated: payload at offset {offset} with length {length} exceeds {total_size} bytes"
+            )
+        items.append(mv[offset : offset + length])
+    return items
+
 
 @StorageClientFactory.register("YuanrongStorageClient")
 class YuanrongStorageClient(TransferQueueStorageKVClient):
@@ -106,6 +184,19 @@ def _create_empty_npu_tensorlist(self, shapes, dtypes):
             tensors.append(tensor)
         return tensors
 
+    def mset_zcopy(self, keys: list[str], objs: list[Any]):
+        """Encode objects and write them into buffers obtained from the CPU datasystem client.
+
+        Each object is encoded with ``_encoder.encode`` into a list of buffers. One
+        destination buffer per key is requested through ``mcreate`` with the exact
+        packed size, the encoded parts are written with ``pack_into`` on a thread
+        pool, and the filled buffers are handed to ``mset_buffer``.
+
+        Args:
+            keys: Storage keys, one per object.
+            objs: Objects to store; ``objs[i]`` goes under ``keys[i]``.
+
+        Raises:
+            ValueError: If ``keys`` and ``objs`` differ in length, or if ``mcreate``
+                returns a different number of buffers than there are objects.
+        """
+        if len(keys) != len(objs):
+            raise ValueError(f"keys and objs must have the same length, got {len(keys)} and {len(objs)}")
+        if not keys:
+            return
+
+        items_list = [[memoryview(b) for b in _encoder.encode(obj)] for obj in objs]
+        packed_sizes = [calc_packed_size(items) for items in items_list]
+        # NOTE(review): the returned status is not inspected; confirm the client raises on failure.
+        _status, buffers = self._cpu_ds_client.mcreate(keys, packed_sizes)
+        # strict=True: a missing buffer must fail loudly rather than drop a key silently.
+        tasks = [(target.MutableData(), items) for target, items in zip(buffers, items_list, strict=True)]
+        with ThreadPoolExecutor(max_workers=DS_MAX_WORKERS) as executor:
+            # list() drains the result iterator so an exception raised in a worker surfaces here.
+            list(executor.map(lambda p: pack_into(*p), tasks))
+        self._cpu_ds_client.mset_buffer(buffers)
+
+    def mget_zcopy(self, keys: list[str], *, timeout_ms: int = 500) -> list[Any]:
+        """Fetch packed buffers for ``keys`` and decode them back into objects.
+
+        Args:
+            keys: Storage keys to read.
+            timeout_ms: Value forwarded as ``timeout_ms`` to ``get_buffers``.
+                Defaults to 500, the value that was previously hard-coded.
+
+        Returns:
+            One entry per returned buffer: the object produced by
+            ``_decoder.decode`` on the unpacked parts, or ``None`` where the
+            client returned ``None`` for that buffer.
+        """
+        # NOTE(review): the returned status is not inspected; confirm the client raises on failure.
+        _status, buffers = self._cpu_ds_client.get_buffers(keys, timeout_ms=timeout_ms)
+        return [None if buffer is None else _decoder.decode(unpack_from(buffer)) for buffer in buffers]
+
     def _batch_put(self, keys: list[str], values: list[Any]):
         """Stores a batch of key-value pairs to remote storage, splitting by device type.
 
@@ -125,17 +216,15 @@ def _batch_put(self, keys: list[str], values: list[Any]):
             cpu_values = []
 
             for key, value in zip(keys, values, strict=True):
-                if isinstance(value, Tensor) and value.device.type == "npu":
+                if isinstance(value, torch.Tensor) and value.device.type == "npu":
                     if not value.is_contiguous():
                         raise ValueError(f"NPU Tensor is not contiguous: {value}")
                     npu_keys.append(key)
                     npu_values.append(value)
 
                 else:
                     cpu_keys.append(key)
-                    # TODO: Optimize serialization of tensors
-                    # Serializing slice of tensors results in entire tensors being serialized
-                    cpu_values.append(pickle.dumps(value.clone() if isinstance(value, Tensor) else value))
+                    cpu_values.append(pickle.dumps(value))
 
             # put NPU data
             for i in range(0, len(npu_keys), NPU_DS_CLIENT_KEYS_LIMIT):
@@ -157,11 +246,10 @@ def _batch_put(self, keys: list[str], values: list[Any]):
 
         else:
             #  All data goes through CPU path
-            pickled_values = [pickle.dumps(v.clone() if isinstance(v, Tensor) else v) for v in values]
             for i in range(0, len(keys), CPU_DS_CLIENT_KEYS_LIMIT):
                 batch_keys = keys[i : i + CPU_DS_CLIENT_KEYS_LIMIT]
-                batch_vals = pickled_values[i : i + CPU_DS_CLIENT_KEYS_LIMIT]
-                self._cpu_ds_client.mset(batch_keys, batch_vals)
+                batch_vals = values[i : i + CPU_DS_CLIENT_KEYS_LIMIT]
+                self.mset_zcopy(batch_keys, batch_vals)
 
     def put(self, keys: list[str], values: list[Any]) -> Optional[list[Any]]:
         """Stores multiple key-value pairs to remote storage.
@@ -253,16 +341,17 @@ def _batch_get(self, keys: list[str], shapes: list, dtypes: list) -> list[Any]:
                     results[idx] = pickle.loads(raw_val)
 
             return results
+
         else:
-            # npu is not available, goes through cpu_ds_client
             results = [None] * len(keys)
-            idx = 0
+            cpu_indices = list(range(len(keys)))
+
             for i in range(0, len(keys), CPU_DS_CLIENT_KEYS_LIMIT):
                 batch_keys = keys[i : i + CPU_DS_CLIENT_KEYS_LIMIT]
-                raw_values = self._cpu_ds_client.get(batch_keys)
-                for raw_val in raw_values:
-                    results[idx] = pickle.loads(raw_val)
-                    idx += 1
+                batch_indices = cpu_indices[i : i + CPU_DS_CLIENT_KEYS_LIMIT]
+                objects = self.mget_zcopy(batch_keys)
+                for idx, obj in zip(batch_indices, objects, strict=False):
+                    results[idx] = obj
             return results
 
     def get(self, keys: list[str], shapes=None, dtypes=None, custom_meta=None) -> list[Any]: