[perf] Add zmq.proxy to accelerate request processing for SimpleStorageUnit (Ascend#37)

0oshowero0 · web-flow · commit ba5710ec54ac · 2026-02-28T11:28:38.000+08:00
## Background Previously, `SimpleStorageUnit` relied on a single-threaded event loop for request processing. This design could lead to bottlenecks and increased latency when multiple requests arrived simultaneously, as operations like ZMQ message deserialization and memory I/O would block the main socket loop from receiving new requests. ## Key Changes 1. Refactored `SimpleStorageUnit` to utilize a native `zmq.proxy`. This acts as a highly efficient, C-level load balancer between a frontend `ROUTER` socket (handling external client connections) and an internal backend `DEALER` socket (inproc://). 2. ~~Introduced a worker thread pool where each worker binds its own independent `DEALER` socket to process `PUT/GET/CLEAR` requests concurrently. This preserves ZMQ's "share-nothing" concurrency philosophy.~~ 3. ~~Added a `threading.Lock()` to `StorageUnitData` to prevent race conditions introduced by multiple threads.~~ 4. ~~Added `num_worker_threads` as an explicit input parameter for `SimpleStorageUnit` (configurable via TQ system config items).~~ > During performance testing, we were surprised to find that the refactored multi-threaded code achieves its best performance with `num_worker_threads=1`. The introduction of the native C-level `zmq.proxy` offloads the high-frequency I/O from the main Python thread. Therefore, we retired the multi-threaded version and kept only the `zmq.proxy` optimization. 
## Architecture ### Old Version <img width="1067" height="1760" alt="mermaid-diagram-2026-02-26-192209" src="https://github.com/user-attachments/assets/3a61673b-9e91-4cc9-9930-b20e6cd06217" /> ### New Version <img width="1374" height="3104" alt="mermaid-diagram-2026-02-26-220631" src="https://github.com/user-attachments/assets/824386e0-5b57-4a7c-a15c-ac3c6258d9ad" /> ## Performance Gain We provide a simple benchmark script for this PR: ```python3 import argparse import multiprocessing import time import ray import torch import zmq import tensordict # Ensure this runs in the repository root directory, otherwise sys.path.append might be needed from transfer_queue.storage.simple_backend import SimpleStorageUnit from transfer_queue.utils.zmq_utils import ZMQMessage, ZMQRequestType class StorageClient: """Independent test client that interacts directly with the frontend ROUTER of SimpleStorageUnit""" def __init__(self, address): self.context = zmq.Context() self.socket = self.context.socket(zmq.DEALER) self.socket.setsockopt(zmq.RCVTIMEO, 20000) # Timeout set to 20s to prevent timeouts under heavy concurrency self.socket.connect(address) def send_put(self, client_id, local_indexes, field_data): msg = ZMQMessage.create( request_type=ZMQRequestType.PUT_DATA, sender_id=f"bench_client_{client_id}", body={"local_indexes": local_indexes, "data": field_data}, ) self.socket.send_multipart(msg.serialize()) return ZMQMessage.deserialize(self.socket.recv_multipart()) def close(self): self.socket.close() self.context.term() def client_worker(worker_id, address, num_requests, batch_size): """Worker process task: Continuously bombard the Storage Unit with PUT requests""" client = StorageClient(address) start_time = time.time() # Construct Dummy Tensor data to simulate actual memory and serialization overhead # As noted in the PR description, serialization and memory I/O are the bottlenecks blocking the main loop field_data = { "dummy_tensor": [torch.randn(256, 256) for _ in 
range(batch_size)] } for i in range(num_requests): local_indexes = list(range(i * batch_size, (i + 1) * batch_size)) client.send_put(worker_id, local_indexes, field_data) elapsed = time.time() - start_time client.close() print(f"[Worker {worker_id}] Completed {num_requests} write requests, took {elapsed:.3f} seconds " f"(QPS: {num_requests / elapsed:.2f} req/s)") def main(num_clients, storage_threads, requests_per_client): # Initialize Ray and global settings ray.init(ignore_reinit_error=True) tensordict.set_list_to_stack(True).set() try: print(f"🚀 Launching SimpleStorageUnit, internal worker threads (num_worker_threads): {storage_threads} ...") # Launch the backend Actor. PR 37 exposes the num_worker_threads parameter storage_actor = SimpleStorageUnit.options( max_concurrency=50, num_cpus=2 ).remote( storage_unit_size=1000000, num_worker_threads=storage_threads # comment this line for old version comparison ) zmq_info = ray.get(storage_actor.get_zmq_server_info.remote()) put_get_address = zmq_info.to_addr("put_get_socket") print(f"✅ Storage unit ready, ZMQ Address: {put_get_address}") # Wait for zmq.proxy and all worker threads to bind to the inproc port time.sleep(2) print(f"🔥 Spawning {num_clients} independent concurrent write processes...") processes = [] batch_size = 256 start_time = time.time() # 1. Create and start multiple processes for i in range(num_clients): p = multiprocessing.Process( target=client_worker, args=(i, put_get_address, requests_per_client, batch_size) ) p.start() processes.append(p) # 2. 
Wait for all concurrent processes to complete for p in processes: p.join() total_time = time.time() - start_time total_requests = num_clients * requests_per_client print("\n" + "=" * 50) print(f" 📊 Benchmark Results") print("=" * 50) print(f" SimpleStorageUnit internal threads : {storage_threads}") print(f" External concurrent clients : {num_clients}") print(f" Total processed requests (Batches) : {total_requests} (Batch Size: {batch_size})") print(f" Total benchmark duration : {total_time:.3f} seconds") print(f" 🚀 Overall Throughput : {total_requests / total_time:.2f} req/s") print("=" * 50 + "\n") finally: # Resource cleanup if 'storage_actor' in locals(): ray.kill(storage_actor) ray.shutdown() if __name__ == "__main__": parser = argparse.ArgumentParser(description="PR Ascend#37 Performance Benchmark") parser.add_argument("--clients", type=int, default=8, help="Number of concurrent client processes") parser.add_argument("--threads", type=int, default=4, help="Number of processing threads in SimpleStorageUnit") parser.add_argument("--requests", type=int, default=300, help="Number of requests sent per client") args = parser.parse_args() main(args.clients, args.threads, args.requests) ``` ### Small Scale Test (`batch_size=20`, `clients=4`) On a mac mini with M2 chip with 24GB memory: #### Old Version ```bash python benchmark.py --clients 4 ``` <img width="680" height="343" alt="image" src="https://github.com/user-attachments/assets/0e5fedc4-a185-4d34-94d0-8cde007d1a74" /> #### New Version ```bash python benchmark.py --clients 4 --threads 1 ``` <img width="663" height="342" alt="image" src="https://github.com/user-attachments/assets/c325bc27-0ad7-485a-9717-9255662b3733" /> ```bash python benchmark.py --clients 4 --threads 2 ``` <img width="663" height="343" alt="image" src="https://github.com/user-attachments/assets/66e64858-08ac-4358-b8f9-8b0f56506ffa" /> ### Middle Scale Test (`batch_size=256`, `clients=4`) On a mac mini with M2 chip with 24GB memory: #### Old 
Version ```bash python benchmark.py --clients 4 ``` <img width="683" height="327" alt="image" src="https://github.com/user-attachments/assets/47b4b8a7-d81a-4572-9235-14c3c68059f7" /> #### New Version ```bash python benchmark.py --clients 4 --threads 1 ``` <img width="731" height="343" alt="image" src="https://github.com/user-attachments/assets/ae22115e-9433-4a80-a4d3-238beba9fec1" /> ```bash python benchmark.py --clients 4 --threads 2 ``` <img width="716" height="341" alt="image" src="https://github.com/user-attachments/assets/ba9ff4c6-9d0c-45cd-83c5-881be2b5c118" /> ### Large Scale Test (`batch_size=256`, `clients=50`) On an Ubuntu server with Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz x 128 cores: Note: 1. The benchmark script has also been modified to measure `get` performance. 2. We export the following env vars: ```bash export OMP_NUM_THREADS=1 export MKL_NUM_THREADS=1 export OPENBLAS_NUM_THREADS=1 export VECLIB_MAXIMUM_THREADS=1 export NUMEXPR_NUM_THREADS=1 export TORCH_NUM_THREADS=1 export TQ_ZERO_COPY_SERIALIZATION=True ``` #### Old Version ```bash python benchmark.py --clients 50 ``` <img width="555" height="196" alt="image" src="https://github.com/user-attachments/assets/f47397d6-1819-4230-bb46-3073d36a1633" /> #### New Version ```bash python benchmark.py --clients 50 --threads 1 ``` <img width="551" height="195" alt="image" src="https://github.com/user-attachments/assets/0a9dcee1-326e-43eb-901a-5e1f9b1a75f1" /> ```bash python benchmark.py --clients 50 --threads 2 ``` <img width="556" height="195" alt="image" src="https://github.com/user-attachments/assets/b8a6daaf-5644-4607-b8f9-a4d00c7a8b34" /> ```bash python benchmark.py --clients 50 --threads 4 ``` <img width="526" height="190" alt="image" src="https://github.com/user-attachments/assets/1e0a5e3f-c13b-4ba0-ac95-4ee54f30be79" /> --------- Signed-off-by: 0oshowero0 <o0shower0o@outlook.com>
diff --git a/scripts/performance_test.py b/scripts/performance_test.py
@@ -30,14 +30,11 @@
 parent_dir = Path(__file__).resolve().parent.parent.parent
 sys.path.append(str(parent_dir))
 
-
-from transfer_queue import (  # noqa: E402
-    SimpleStorageUnit,
-    TransferQueueClient,
-    TransferQueueController,
-    process_zmq_server_info,
-)
+from transfer_queue.client import TransferQueueClient  # noqa: E402
+from transfer_queue.controller import TransferQueueController  # noqa: E402
+from transfer_queue.storage.simple_backend import SimpleStorageUnit  # noqa: E402
 from transfer_queue.utils.common import get_placement_group  # noqa: E402
+from transfer_queue.utils.zmq_utils import process_zmq_server_info  # noqa: E402
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logger = logging.getLogger(__name__)
diff --git a/transfer_queue/controller.py b/transfer_queue/controller.py
@@ -1523,7 +1523,7 @@ def kv_retrieve_keys(
         )
         data_fields = []
         for fname, col_idx in partition.field_name_mapping.items():
-            if col_mask[col_idx]:
+            if col_idx < len(col_mask) and col_mask[col_idx]:
                 data_fields.append(fname)
 
         metadata = self.generate_batch_meta(partition_id, verified_global_indexes, data_fields, mode="force_fetch")
diff --git a/transfer_queue/interface.py b/transfer_queue/interface.py
@@ -82,7 +82,9 @@ def _maybe_create_transferqueue_storage(conf: DictConfig) -> DictConfig:
                     placement_group_bundle_index=storage_unit_rank,
                     name=f"TransferQueueStorageUnit#{storage_unit_rank}",
                     lifetime="detached",
-                ).remote(storage_unit_size=math.ceil(total_storage_size / num_data_storage_units))
+                ).remote(
+                    storage_unit_size=math.ceil(total_storage_size / num_data_storage_units),
+                )
                 _TRANSFER_QUEUE_STORAGE[f"TransferQueueStorageUnit#{storage_unit_rank}"] = storage_node
                 logger.info(f"TransferQueueStorageUnit#{storage_unit_rank} has been created.")
 
diff --git a/transfer_queue/storage/simple_backend.py b/transfer_queue/storage/simple_backend.py
@@ -16,10 +16,12 @@
 import dataclasses
 import logging
 import os
+import time
+import weakref
 from dataclasses import dataclass
 from operator import itemgetter
-from threading import Thread
-from typing import Any
+from threading import Event, Thread
+from typing import Any, Optional
 from uuid import uuid4
 
 import ray
@@ -173,16 +175,41 @@ def __init__(self, storage_unit_size: int):
 
         self.storage_data = StorageUnitData(self.storage_unit_size)
 
+        # Internal communication address for proxy and workers
+        self._inproc_addr = f"inproc://simple_storage_workers_{self.storage_unit_id}"
+
+        # Shutdown event for graceful termination
+        self._shutdown_event = Event()
+
+        # Placeholder for zmq_context, proxy_thread and worker_threads
+        self.zmq_context: Optional[zmq.Context] = None
+        self.put_get_socket: Optional[zmq.Socket] = None
+        self.proxy_thread: Optional[Thread] = None
+        self.worker_thread: Optional[Thread] = None
+
         self._init_zmq_socket()
         self._start_process_put_get()
 
+        # Register finalizer for graceful cleanup when garbage collected
+        self._finalizer = weakref.finalize(
+            self,
+            self._shutdown_resources,
+            self._shutdown_event,
+            self.worker_thread,
+            self.proxy_thread,
+            self.zmq_context,
+            self.put_get_socket,
+        )
+
     def _init_zmq_socket(self) -> None:
         """
         Initialize ZMQ socket connections between storage unit and controller/clients:
-        - put_get_socket:
-            Handle put/get requests from clients.
+        - put_get_socket (ROUTER): Handle put/get requests from clients.
+        - worker_socket (DEALER): Backend socket for worker communication.
         """
         self.zmq_context = zmq.Context()
+
+        # Frontend: ROUTER for receiving client requests
         self.put_get_socket = create_zmq_socket(self.zmq_context, zmq.ROUTER)
         self._node_ip = get_node_ip_address()
 
@@ -195,6 +222,10 @@ def _init_zmq_socket(self) -> None:
                 logger.warning(f"[{self.storage_unit_id}]: Try to bind ZMQ sockets failed, retrying...")
                 continue
 
+        # Backend: DEALER for worker communication (connected via zmq.proxy)
+        self.worker_socket = create_zmq_socket(self.zmq_context, zmq.DEALER)
+        self.worker_socket.bind(self._inproc_addr)
+
         self.zmq_server_info = ZMQServerInfo(
             role=TransferQueueRole.STORAGE,
             id=str(self.storage_unit_id),
@@ -203,33 +234,78 @@ def _init_zmq_socket(self) -> None:
         )
 
     def _start_process_put_get(self) -> None:
-        """Create a daemon thread and start put/get process."""
-        self.process_put_get_thread = Thread(
-            target=self._process_put_get, name=f"StorageUnitProcessPutGetThread-{self.storage_unit_id}", daemon=True
+        """Start worker threads and ZMQ proxy for handling requests."""
+
+        # Start worker thread
+        self.worker_thread = Thread(
+            target=self._worker_routine,
+            name=f"StorageUnitWorkerThread-{self.storage_unit_id}",
+            daemon=True,
+        )
+        self.worker_thread.start()
+
+        time.sleep(0.5)  # make sure worker thread is ready before zmq.proxy forwarding messages
+
+        # Start proxy thread (ROUTER <-> DEALER)
+        self.proxy_thread = Thread(
+            target=self._proxy_routine,
+            name=f"StorageUnitProxyThread-{self.storage_unit_id}",
+            daemon=True,
         )
-        self.process_put_get_thread.start()
+        self.proxy_thread.start()
+
+    def _proxy_routine(self) -> None:
+        """ZMQ proxy for message forwarding between frontend ROUTER and backend DEALER."""
+        logger.info(f"[{self.storage_unit_id}]: start ZMQ proxy...")
+        try:
+            zmq.proxy(self.put_get_socket, self.worker_socket)
+        except zmq.ContextTerminated:
+            logger.info(f"[{self.storage_unit_id}]: ZMQ Proxy stopped gracefully (Context Terminated)")
+        except Exception as e:
+            if self._shutdown_event.is_set():
+                logger.info(f"[{self.storage_unit_id}]: ZMQ Proxy shutting down...")
+            else:
+                logger.error(f"[{self.storage_unit_id}]: ZMQ Proxy unexpected error: {e}")
+
+    def _worker_routine(self) -> None:
+        """Worker thread for processing requests."""
+        # Each worker must have its own socket
+        worker_socket = create_zmq_socket(self.zmq_context, zmq.DEALER)
+        worker_socket.connect(self._inproc_addr)
 
-    def _process_put_get(self) -> None:
-        """Process put_get_socket request."""
         poller = zmq.Poller()
-        poller.register(self.put_get_socket, zmq.POLLIN)
+        poller.register(worker_socket, zmq.POLLIN)
 
-        logger.info(f"[{self.storage_unit_id}]: start processing put/get requests...")
+        logger.info(f"[{self.storage_unit_id}]: worker thread started...")
+        perf_monitor = IntervalPerfMonitor(caller_name=f"{self.storage_unit_id}")
+
+        while not self._shutdown_event.is_set():
+            try:
+                socks = dict(poller.poll(TQ_STORAGE_POLLER_TIMEOUT * 1000))
+            except zmq.error.ContextTerminated:
+                # ZMQ context was terminated, exit gracefully
+                logger.info(f"[{self.storage_unit_id}]: worker stopped gracefully (Context Terminated)")
+                break
+            except Exception as e:
+                logger.warning(f"[{self.storage_unit_id}]: worker poll error: {e}")
+                continue
 
-        perf_monitor = IntervalPerfMonitor(caller_name=self.storage_unit_id)
+            if self._shutdown_event.is_set():
+                break
 
-        while True:
-            socks = dict(poller.poll(TQ_STORAGE_POLLER_TIMEOUT * 1000))
+            if worker_socket in socks:
+                # Messages received from proxy: [identity, serialized_msg_frame1, ...]
+                messages = worker_socket.recv_multipart()
+                identity = messages[0]
+                serialized_msg = messages[1:]
 
-            if self.put_get_socket in socks:
-                messages = self.put_get_socket.recv_multipart()
-                identity = messages.pop(0)
-                serialized_msg = messages
                 request_msg = ZMQMessage.deserialize(serialized_msg)
                 operation = request_msg.request_type
+
                 try:
-                    logger.debug(f"[{self.storage_unit_id}]: receive operation: {operation}, message: {request_msg}")
+                    logger.debug(f"[{self.storage_unit_id}]: worker received operation: {operation}")
 
+                    # Process request
                     if operation == ZMQRequestType.PUT_DATA:
                         with perf_monitor.measure(op_type="PUT_DATA"):
                             response_msg = self._handle_put(request_msg)
@@ -253,12 +329,17 @@ def _process_put_get(self) -> None:
                         request_type=ZMQRequestType.PUT_GET_ERROR,
                         sender_id=self.storage_unit_id,
                         body={
-                            "message": f"Storage unit id #{self.storage_unit_id} occur error in processing "
-                            f"put/get/clear request, detail error message: {str(e)}."
+                            "message": f"{self.storage_unit_id}, worker encountered error "
+                            f"during operation {operation}: {str(e)}."
                         },
                     )
 
-                self.put_get_socket.send_multipart([identity, *response_msg.serialize()], copy=False)
+                # Send response back with identity for routing
+                worker_socket.send_multipart([identity] + response_msg.serialize(), copy=False)
+
+        logger.info(f"[{self.storage_unit_id}]: worker stopped.")
+        poller.unregister(worker_socket)
+        worker_socket.close(linger=0)
 
     def _handle_put(self, data_parts: ZMQMessage) -> ZMQMessage:
         """
@@ -365,6 +446,36 @@ def _handle_clear(self, data_parts: ZMQMessage) -> ZMQMessage:
             )
         return response_msg
 
+    @staticmethod
+    def _shutdown_resources(
+        shutdown_event: Event,
+        worker_thread: Optional[Thread],
+        proxy_thread: Optional[Thread],
+        zmq_context: Optional[zmq.Context],
+        put_get_socket: Optional[zmq.Socket],
+    ) -> None:
+        """Clean up resources on garbage collection."""
+        logger.info("Shutting down SimpleStorageUnit resources...")
+
+        # Signal all threads to stop
+        shutdown_event.set()
+
+        # Terminate put_get_socket
+        if put_get_socket:
+            put_get_socket.close(linger=0)
+
+        # Terminate ZMQ context to unblock proxy and workers
+        if zmq_context:
+            zmq_context.term()
+
+        # Wait for threads to finish (with timeout)
+        if worker_thread and worker_thread.is_alive():
+            worker_thread.join(timeout=5)
+        if proxy_thread and proxy_thread.is_alive():
+            proxy_thread.join(timeout=5)
+
+        logger.info("SimpleStorageUnit resources shutdown complete.")
+
     def get_zmq_server_info(self) -> ZMQServerInfo:
         """Get the ZMQ server information for this storage unit.
 

Original file line number	Diff line number	Diff line change
`@@ -1523,7 +1523,7 @@ def kv_retrieve_keys(`
`1523`	`1523`	`)`
`1524`	`1524`	`data_fields = []`
`1525`	`1525`	`for fname, col_idx in partition.field_name_mapping.items():`
`1526`		`- if col_mask[col_idx]:`
	`1526`	`+ if col_idx < len(col_mask) and col_mask[col_idx]:`
`1527`	`1527`	`data_fields.append(fname)`
`1528`	`1528`
`1529`	`1529`	`metadata = self.generate_batch_meta(partition_id, verified_global_indexes, data_fields, mode="force_fetch")`