Commit ba5710e
authored
## Background
Previously, `SimpleStorageUnit` relied on a single-threaded event loop
for request processing. This design could lead to bottlenecks and
increased latency when multiple requests arrived simultaneously, as
operations like ZMQ message deserialization and memory I/O would block
the main socket loop from receiving new requests.
## Key Changes
1. Refactored `SimpleStorageUnit` to utilize a native `zmq.proxy`. This
acts as a highly efficient, C-level load balancer between a frontend
`ROUTER` socket (handling external client connections) and an internal
backend `DEALER` socket (inproc://).
2. ~~Introduced a worker thread pool where each worker binds its own
independent `DEALER` socket to process `PUT/GET/CLEAR` requests
concurrently. This preserves ZMQ's "share-nothing" concurrency
philosophy.~~
3. ~~Added a `threading.Lock()` to `StorageUnitData` to prevent race
conditions introduced by multi-threading.~~
4. ~~Added `num_worker_threads` as an explicit input parameter for
`SimpleStorageUnit` (configurable via TQ system config items).~~
> During performance testing, we were surprised to find that the refactored
multi-threaded code achieves its best performance with
`num_worker_threads=1`. The introduction of the native C-level
`zmq.proxy` offloads the high-frequency I/O from the main Python thread.
Therefore, we retired the multi-threaded version and kept only the
`zmq.proxy` optimization.
## Architecture
### Old Version
<img width="1067" height="1760" alt="mermaid-diagram-2026-02-26-192209"
src="https://github.com/user-attachments/assets/3a61673b-9e91-4cc9-9930-b20e6cd06217"
/>
### New Version
<img width="1374" height="3104" alt="mermaid-diagram-2026-02-26-220631"
src="https://github.com/user-attachments/assets/824386e0-5b57-4a7c-a15c-ac3c6258d9ad"
/>
## Performance Gain
We provide a simple benchmark script for this PR:
```python3
import argparse
import multiprocessing
import time
import ray
import torch
import zmq
import tensordict
# Ensure this runs in the repository root directory, otherwise sys.path.append might be needed
from transfer_queue.storage.simple_backend import SimpleStorageUnit
from transfer_queue.utils.zmq_utils import ZMQMessage, ZMQRequestType
class StorageClient:
    """Independent test client that interacts directly with the frontend ROUTER of SimpleStorageUnit.

    Each instance owns its own ZMQ context and a single DEALER socket
    connected to the given address. Call :meth:`close` when done.
    """

    def __init__(self, address):
        """Create the ZMQ context and connect a DEALER socket to ``address``.

        Args:
            address: ZMQ endpoint string handed to ``socket.connect``.
        """
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.DEALER)
        # 20 s receive timeout: generous enough for heavy concurrency, while
        # still ensuring a stalled server cannot block recv() forever.
        self.socket.setsockopt(zmq.RCVTIMEO, 20000)
        self.socket.connect(address)

    def send_put(self, client_id, local_indexes, field_data):
        """Send one PUT_DATA request and block until the reply arrives.

        Args:
            client_id: Identifier embedded in the message's ``sender_id``.
            local_indexes: Indexes placed under the ``"local_indexes"`` body key.
            field_data: Payload placed under the ``"data"`` body key.

        Returns:
            The reply deserialized via ``ZMQMessage.deserialize``.
        """
        msg = ZMQMessage.create(
            request_type=ZMQRequestType.PUT_DATA,
            sender_id=f"bench_client_{client_id}",
            body={"local_indexes": local_indexes, "data": field_data},
        )
        self.socket.send_multipart(msg.serialize())
        return ZMQMessage.deserialize(self.socket.recv_multipart())

    def close(self):
        """Close the socket and terminate the context."""
        # linger=0 drops any still-queued outbound message so that
        # context.term() cannot block indefinitely (e.g. after a recv timeout).
        self.socket.close(linger=0)
        self.context.term()
def client_worker(worker_id, address, num_requests, batch_size):
    """Worker process task: Continuously bombard the Storage Unit with PUT requests.

    Args:
        worker_id: Identifier used in the sender id and in the printed summary.
        address: ZMQ endpoint of the storage unit.
        num_requests: Number of PUT requests to send sequentially.
        batch_size: Number of samples (and dummy tensors) per request.
    """
    # Construct Dummy Tensor data to simulate actual memory and serialization overhead.
    # As noted in the PR description, serialization and memory I/O are the bottlenecks
    # blocking the main loop. Built BEFORE the timer starts so that payload creation
    # is not counted as request latency.
    field_data = {
        "dummy_tensor": [torch.randn(256, 256) for _ in range(batch_size)]
    }

    client = StorageClient(address)
    try:
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        for i in range(num_requests):
            local_indexes = list(range(i * batch_size, (i + 1) * batch_size))
            client.send_put(worker_id, local_indexes, field_data)
        elapsed = time.perf_counter() - start_time
    finally:
        # Always release the socket/context, even if a request raised.
        client.close()

    # Guard against a zero-length interval (e.g. num_requests == 0 on a coarse clock).
    qps = num_requests / elapsed if elapsed > 0 else float("inf")
    print(f"[Worker {worker_id}] Completed {num_requests} write requests, took {elapsed:.3f} seconds "
          f"(QPS: {qps:.2f} req/s)")
def main(num_clients, storage_threads, requests_per_client):
    """Launch a SimpleStorageUnit actor and benchmark it with concurrent PUT clients.

    Args:
        num_clients: Number of client processes to spawn.
        storage_threads: Value forwarded as ``num_worker_threads`` to the storage unit.
        requests_per_client: Number of PUT requests each client sends.
    """
    # Initialize Ray and global settings
    ray.init(ignore_reinit_error=True)
    tensordict.set_list_to_stack(True).set()

    # Sentinel so the cleanup path knows whether the actor was ever created.
    storage_actor = None
    try:
        print(f"🚀 Launching SimpleStorageUnit, internal worker threads (num_worker_threads): {storage_threads} ...")

        # Launch the backend Actor. PR 37 exposes the num_worker_threads parameter.
        # NOTE(review): the PR description says the multi-thread version was later
        # retired — confirm the constructor still accepts `num_worker_threads`.
        storage_actor = SimpleStorageUnit.options(
            max_concurrency=50, num_cpus=2
        ).remote(
            storage_unit_size=1000000,
            num_worker_threads=storage_threads  # comment this line for old version comparison
        )

        zmq_info = ray.get(storage_actor.get_zmq_server_info.remote())
        put_get_address = zmq_info.to_addr("put_get_socket")
        print(f"✅ Storage unit ready, ZMQ Address: {put_get_address}")

        # Wait for zmq.proxy and all worker threads to bind to the inproc port
        time.sleep(2)

        print(f"🔥 Spawning {num_clients} independent concurrent write processes...")
        processes = []
        batch_size = 256

        # Monotonic clock: immune to wall-clock adjustments during the run.
        start_time = time.perf_counter()

        # 1. Create and start multiple processes
        for i in range(num_clients):
            p = multiprocessing.Process(
                target=client_worker,
                args=(i, put_get_address, requests_per_client, batch_size)
            )
            p.start()
            processes.append(p)

        # 2. Wait for all concurrent processes to complete
        for p in processes:
            p.join()

        total_time = time.perf_counter() - start_time
        total_requests = num_clients * requests_per_client

        # A crashed client did not finish its requests; surface that instead of
        # silently reporting an inflated throughput figure.
        failed_clients = sum(1 for p in processes if p.exitcode != 0)
        if failed_clients:
            print(f"⚠️ {failed_clients} client process(es) exited abnormally; "
                  f"the throughput below is overstated")

        throughput = total_requests / total_time if total_time > 0 else float("inf")

        print("\n" + "=" * 50)
        print(" 📊 Benchmark Results")
        print("=" * 50)
        print(f" SimpleStorageUnit internal threads : {storage_threads}")
        print(f" External concurrent clients : {num_clients}")
        print(f" Total processed requests (Batches) : {total_requests} (Batch Size: {batch_size})")
        print(f" Total benchmark duration : {total_time:.3f} seconds")
        print(f" 🚀 Overall Throughput : {throughput:.2f} req/s")
        print("=" * 50 + "\n")
    finally:
        # Resource cleanup
        if storage_actor is not None:
            ray.kill(storage_actor)
        ray.shutdown()
if __name__ == "__main__":
    # Command-line entry point: every option is an integer with a default,
    # so the flags are declared from a single table.
    cli = argparse.ArgumentParser(description="PR Ascend#37 Performance Benchmark")
    int_options = (
        ("--clients", 8, "Number of concurrent client processes"),
        ("--threads", 4, "Number of processing threads in SimpleStorageUnit"),
        ("--requests", 300, "Number of requests sent per client"),
    )
    for flag, default_value, help_text in int_options:
        cli.add_argument(flag, type=int, default=default_value, help=help_text)

    opts = cli.parse_args()
    main(opts.clients, opts.threads, opts.requests)
```
### Small Scale Test (`batch_size=20`, `clients=4`)
On a Mac mini with an M2 chip and 24 GB of memory:
#### Old Version
```bash
python benchmark.py --clients 4
```
<img width="680" height="343" alt="image"
src="https://github.com/user-attachments/assets/0e5fedc4-a185-4d34-94d0-8cde007d1a74"
/>
#### New Version
```bash
python benchmark.py --clients 4 --threads 1
```
<img width="663" height="342" alt="image"
src="https://github.com/user-attachments/assets/c325bc27-0ad7-485a-9717-9255662b3733"
/>
```bash
python benchmark.py --clients 4 --threads 2
```
<img width="663" height="343" alt="image"
src="https://github.com/user-attachments/assets/66e64858-08ac-4358-b8f9-8b0f56506ffa"
/>
### Medium Scale Test (`batch_size=256`, `clients=4`)
On a Mac mini with an M2 chip and 24 GB of memory:
#### Old Version
```bash
python benchmark.py --clients 4
```
<img width="683" height="327" alt="image"
src="https://github.com/user-attachments/assets/47b4b8a7-d81a-4572-9235-14c3c68059f7"
/>
#### New Version
```bash
python benchmark.py --clients 4 --threads 1
```
<img width="731" height="343" alt="image"
src="https://github.com/user-attachments/assets/ae22115e-9433-4a80-a4d3-238beba9fec1"
/>
```bash
python benchmark.py --clients 4 --threads 2
```
<img width="716" height="341" alt="image"
src="https://github.com/user-attachments/assets/ba9ff4c6-9d0c-45cd-83c5-881be2b5c118"
/>
### Large Scale Test (`batch_size=256`, `clients=50`)
On an Ubuntu server with Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz x
128 cores:
Note:
1. The benchmark script has also been modified to measure `get`
performance.
2. We export the following env vars:
```bash
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
export OPENBLAS_NUM_THREADS=1
export VECLIB_MAXIMUM_THREADS=1
export NUMEXPR_NUM_THREADS=1
export TORCH_NUM_THREADS=1
export TQ_ZERO_COPY_SERIALIZATION=True
```
#### Old Version
```bash
python benchmark.py --clients 50
```
<img width="555" height="196" alt="image"
src="https://github.com/user-attachments/assets/f47397d6-1819-4230-bb46-3073d36a1633"
/>
#### New Version
```bash
python benchmark.py --clients 50 --threads 1
```
<img width="551" height="195" alt="image"
src="https://github.com/user-attachments/assets/0a9dcee1-326e-43eb-901a-5e1f9b1a75f1"
/>
```bash
python benchmark.py --clients 50 --threads 2
```
<img width="556" height="195" alt="image"
src="https://github.com/user-attachments/assets/b8a6daaf-5644-4607-b8f9-a4d00c7a8b34"
/>
```bash
python benchmark.py --clients 50 --threads 4
```
<img width="526" height="190" alt="image"
src="https://github.com/user-attachments/assets/1e0a5e3f-c13b-4ba0-ac95-4ee54f30be79"
/>
---------
Signed-off-by: 0oshowero0 <o0shower0o@outlook.com>
1 parent bd31c02 commit ba5710e
4 files changed
Lines changed: 142 additions & 32 deletions
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
30 | 30 | | |
31 | 31 | | |
32 | 32 | | |
33 | | - | |
34 | | - | |
35 | | - | |
36 | | - | |
37 | | - | |
38 | | - | |
39 | | - | |
| 33 | + | |
| 34 | + | |
| 35 | + | |
40 | 36 | | |
| 37 | + | |
41 | 38 | | |
42 | 39 | | |
43 | 40 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1523 | 1523 | | |
1524 | 1524 | | |
1525 | 1525 | | |
1526 | | - | |
| 1526 | + | |
1527 | 1527 | | |
1528 | 1528 | | |
1529 | 1529 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
82 | 82 | | |
83 | 83 | | |
84 | 84 | | |
85 | | - | |
| 85 | + | |
| 86 | + | |
| 87 | + | |
86 | 88 | | |
87 | 89 | | |
88 | 90 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
16 | 16 | | |
17 | 17 | | |
18 | 18 | | |
| 19 | + | |
| 20 | + | |
19 | 21 | | |
20 | 22 | | |
21 | | - | |
22 | | - | |
| 23 | + | |
| 24 | + | |
23 | 25 | | |
24 | 26 | | |
25 | 27 | | |
| |||
173 | 175 | | |
174 | 176 | | |
175 | 177 | | |
| 178 | + | |
| 179 | + | |
| 180 | + | |
| 181 | + | |
| 182 | + | |
| 183 | + | |
| 184 | + | |
| 185 | + | |
| 186 | + | |
| 187 | + | |
| 188 | + | |
| 189 | + | |
176 | 190 | | |
177 | 191 | | |
178 | 192 | | |
| 193 | + | |
| 194 | + | |
| 195 | + | |
| 196 | + | |
| 197 | + | |
| 198 | + | |
| 199 | + | |
| 200 | + | |
| 201 | + | |
| 202 | + | |
| 203 | + | |
179 | 204 | | |
180 | 205 | | |
181 | 206 | | |
182 | | - | |
183 | | - | |
| 207 | + | |
| 208 | + | |
184 | 209 | | |
185 | 210 | | |
| 211 | + | |
| 212 | + | |
186 | 213 | | |
187 | 214 | | |
188 | 215 | | |
| |||
195 | 222 | | |
196 | 223 | | |
197 | 224 | | |
| 225 | + | |
| 226 | + | |
| 227 | + | |
| 228 | + | |
198 | 229 | | |
199 | 230 | | |
200 | 231 | | |
| |||
203 | 234 | | |
204 | 235 | | |
205 | 236 | | |
206 | | - | |
207 | | - | |
208 | | - | |
| 237 | + | |
| 238 | + | |
| 239 | + | |
| 240 | + | |
| 241 | + | |
| 242 | + | |
| 243 | + | |
| 244 | + | |
| 245 | + | |
| 246 | + | |
| 247 | + | |
| 248 | + | |
| 249 | + | |
| 250 | + | |
| 251 | + | |
| 252 | + | |
| 253 | + | |
209 | 254 | | |
210 | | - | |
| 255 | + | |
| 256 | + | |
| 257 | + | |
| 258 | + | |
| 259 | + | |
| 260 | + | |
| 261 | + | |
| 262 | + | |
| 263 | + | |
| 264 | + | |
| 265 | + | |
| 266 | + | |
| 267 | + | |
| 268 | + | |
| 269 | + | |
| 270 | + | |
| 271 | + | |
| 272 | + | |
| 273 | + | |
| 274 | + | |
211 | 275 | | |
212 | | - | |
213 | | - | |
214 | 276 | | |
215 | | - | |
| 277 | + | |
216 | 278 | | |
217 | | - | |
| 279 | + | |
| 280 | + | |
| 281 | + | |
| 282 | + | |
| 283 | + | |
| 284 | + | |
| 285 | + | |
| 286 | + | |
| 287 | + | |
| 288 | + | |
| 289 | + | |
| 290 | + | |
| 291 | + | |
218 | 292 | | |
219 | | - | |
| 293 | + | |
| 294 | + | |
220 | 295 | | |
221 | | - | |
222 | | - | |
| 296 | + | |
| 297 | + | |
| 298 | + | |
| 299 | + | |
| 300 | + | |
223 | 301 | | |
224 | | - | |
225 | | - | |
226 | | - | |
227 | | - | |
228 | 302 | | |
229 | 303 | | |
| 304 | + | |
230 | 305 | | |
231 | | - | |
| 306 | + | |
232 | 307 | | |
| 308 | + | |
233 | 309 | | |
234 | 310 | | |
235 | 311 | | |
| |||
253 | 329 | | |
254 | 330 | | |
255 | 331 | | |
256 | | - | |
257 | | - | |
| 332 | + | |
| 333 | + | |
258 | 334 | | |
259 | 335 | | |
260 | 336 | | |
261 | | - | |
| 337 | + | |
| 338 | + | |
| 339 | + | |
| 340 | + | |
| 341 | + | |
| 342 | + | |
262 | 343 | | |
263 | 344 | | |
264 | 345 | | |
| |||
365 | 446 | | |
366 | 447 | | |
367 | 448 | | |
| 449 | + | |
| 450 | + | |
| 451 | + | |
| 452 | + | |
| 453 | + | |
| 454 | + | |
| 455 | + | |
| 456 | + | |
| 457 | + | |
| 458 | + | |
| 459 | + | |
| 460 | + | |
| 461 | + | |
| 462 | + | |
| 463 | + | |
| 464 | + | |
| 465 | + | |
| 466 | + | |
| 467 | + | |
| 468 | + | |
| 469 | + | |
| 470 | + | |
| 471 | + | |
| 472 | + | |
| 473 | + | |
| 474 | + | |
| 475 | + | |
| 476 | + | |
| 477 | + | |
| 478 | + | |
368 | 479 | | |
369 | 480 | | |
370 | 481 | | |
| |||
0 commit comments