Skip to content

Commit 2096991

Browse files
authored
perf: add multiplexing performance tests for AsyncMultiRangeDownloader (#16501)
## Overview This PR introduces new microbenchmarks to measure and expose the performance bottleneck caused by lock contention in the `AsyncMultiRangeDownloader`. It provides a concrete way to compare the previous serialized implementation against the new multiplexed architecture. ## Before vs. After: The Performance Gap ### Before (Serialized via Lock) In the previous implementation, `download_ranges` used a shared lock to prevent concurrent access to the bidi-gRPC stream. This meant that even with multiple coroutines, only one could "own" the stream at a time. The entire download cycle (Send -> Receive All) had to complete before another task could start. **Execution Flow:** ```mermaid sequenceDiagram participant C1 as Coroutine 1 participant C2 as Coroutine 2 participant S as gRPC Stream C1->>C1: Acquire Lock C1->>S: Send Requests S-->>C1: Receive Data (Streaming...) S-->>C1: End of Range C1->>C1: Release Lock Note over C2: Waiting for Lock... C2->>C2: Acquire Lock C2->>S: Send Requests S-->>C2: Receive Data (Streaming...) S-->>C2: End of Range C2->>C2: Release Lock ``` ### After (Multiplexed Concurrent) With the introduction of the `_StreamMultiplexer`, multiple coroutines can now share the same stream concurrently. Requests are interleaved, and a background receiver loop routes incoming data to the correct task using `read_id`. **Execution Flow:** ```mermaid sequenceDiagram participant C1 as Coroutine 1 participant C2 as Coroutine 2 participant M as Multiplexer participant S as gRPC Stream C1->>M: Send Requests M->>S: Forward Req 1 C2->>M: Send Requests M->>S: Forward Req 2 Note over C1,C2: Tasks wait on their own queues S-->>M: Data for C1 M-->>C1: Route to Q1 S-->>M: Data for C2 M-->>C2: Route to Q2 S-->>M: Data for C1 M-->>C1: Route to Q1 ``` ## How the Benchmark Works This PR adds a `read_rand_multi_coro` workload that: 1. Spawns multiple asynchronous tasks (coroutines). 2. Shares a single `AsyncMultiRangeDownloader` instance across all tasks. 3. 
Simulates the old serialized behavior by explicitly passing a `shared_lock` to `download_ranges`. 4. Measures total throughput (MiB/s) and resource utilization. ## Key Changes - **`test_reads.py`**: Refactored to support launching concurrent coroutines within a single worker process. - **`config.yaml`**: Added `read_rand_multi_coro` with 1 and 16 coroutines to stress the downloader. - **`config.py`**: Updated the naming convention to include the coroutine count (e.g., `16c`) in reports so parameter sets are easier to tell apart.
1 parent d3d6840 commit 2096991

File tree

5 files changed

+45
-40
lines changed

5 files changed

+45
-40
lines changed

packages/google-cloud-storage/output.json

Whitespace-only changes.

packages/google-cloud-storage/tests/perf/microbenchmarks/time_based/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@
1717
@pytest.fixture
1818
def workload_params(request):
1919
params = request.param
20-
files_names = [f"fio-go_storage_fio.0.{i}" for i in range(0, params.num_processes)]
20+
files_names = [f"fio-go_storage_fio.0.{i}" for i in range(0, params.num_files)]
2121
return params, files_names

packages/google-cloud-storage/tests/perf/microbenchmarks/time_based/reads/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,10 @@ def _get_params() -> Dict[str, List[TimeBasedReadParameters]]:
8080
chunk_size_bytes = chunk_size_kib * 1024
8181
bucket_name = bucket_map[bucket_type]
8282

83-
num_files = num_processes * num_coros
83+
num_files = num_processes
8484

8585
# Create a descriptive name for the parameter set
86-
name = f"{pattern}_{bucket_type}_{num_processes}p_{file_size_mib}MiB_{chunk_size_kib}KiB_{num_ranges_val}ranges"
86+
name = f"{pattern}_{bucket_type}_{num_processes}p_{num_coros}c_{file_size_mib}MiB_{chunk_size_kib}KiB_{num_ranges_val}ranges"
8787

8888
params[workload_name].append(
8989
TimeBasedReadParameters(

packages/google-cloud-storage/tests/perf/microbenchmarks/time_based/reads/config.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@ workload:
2020

2121
- name: "read_rand_multi_process"
2222
pattern: "rand"
23-
coros: [1]
23+
coros: [1, 16]
2424
processes: [1]
2525

26+
2627
defaults:
2728
DEFAULT_RAPID_ZONAL_BUCKET: "chandrasiri-benchmarks-zb"
2829
DEFAULT_STANDARD_BUCKET: "chandrasiri-benchmarks-rb"

packages/google-cloud-storage/tests/perf/microbenchmarks/time_based/reads/test_reads.py

Lines changed: 40 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -115,47 +115,51 @@ def _download_time_based_json(client, filename, params):
115115

116116

117117
async def _download_time_based_async(client, filename, params):
118-
total_bytes_downloaded = 0
119-
120118
mrd = AsyncMultiRangeDownloader(client, params.bucket_name, filename)
121119
await mrd.open()
122120

123-
offset = 0
124-
is_warming_up = True
125-
start_time = time.monotonic()
126-
warmup_end_time = start_time + params.warmup_duration
127-
test_end_time = warmup_end_time + params.duration
128-
129-
while time.monotonic() < test_end_time:
130-
current_time = time.monotonic()
131-
if is_warming_up and current_time >= warmup_end_time:
132-
is_warming_up = False
133-
total_bytes_downloaded = 0 # Reset counter after warmup
134-
135-
ranges = []
136-
if params.pattern == "rand":
137-
for _ in range(params.num_ranges):
138-
offset = random.randint(
139-
0, params.file_size_bytes - params.chunk_size_bytes
140-
)
141-
ranges.append((offset, params.chunk_size_bytes, BytesIO()))
142-
else: # seq
143-
for _ in range(params.num_ranges):
144-
ranges.append((offset, params.chunk_size_bytes, BytesIO()))
145-
offset += params.chunk_size_bytes
146-
if offset + params.chunk_size_bytes > params.file_size_bytes:
147-
offset = 0 # Reset offset if end of file is reached
148-
149-
await mrd.download_ranges(ranges)
150-
151-
bytes_in_buffers = sum(r[2].getbuffer().nbytes for r in ranges)
152-
assert bytes_in_buffers == params.chunk_size_bytes * params.num_ranges
153-
154-
if not is_warming_up:
155-
total_bytes_downloaded += params.chunk_size_bytes * params.num_ranges
121+
async def _worker_coro():
122+
total_bytes_downloaded = 0
123+
offset = 0
124+
is_warming_up = True
125+
start_time = time.monotonic()
126+
warmup_end_time = start_time + params.warmup_duration
127+
test_end_time = warmup_end_time + params.duration
128+
129+
while time.monotonic() < test_end_time:
130+
current_time = time.monotonic()
131+
if is_warming_up and current_time >= warmup_end_time:
132+
is_warming_up = False
133+
total_bytes_downloaded = 0 # Reset counter after warmup
134+
135+
ranges = []
136+
if params.pattern == "rand":
137+
for _ in range(params.num_ranges):
138+
offset = random.randint(
139+
0, params.file_size_bytes - params.chunk_size_bytes
140+
)
141+
ranges.append((offset, params.chunk_size_bytes, BytesIO()))
142+
else: # seq
143+
for _ in range(params.num_ranges):
144+
ranges.append((offset, params.chunk_size_bytes, BytesIO()))
145+
offset += params.chunk_size_bytes
146+
if offset + params.chunk_size_bytes > params.file_size_bytes:
147+
offset = 0 # Reset offset if end of file is reached
148+
149+
await mrd.download_ranges(ranges)
150+
151+
bytes_in_buffers = sum(r[2].getbuffer().nbytes for r in ranges)
152+
assert bytes_in_buffers == params.chunk_size_bytes * params.num_ranges
153+
154+
if not is_warming_up:
155+
total_bytes_downloaded += params.chunk_size_bytes * params.num_ranges
156+
return total_bytes_downloaded
157+
158+
tasks = [asyncio.create_task(_worker_coro()) for _ in range(params.num_coros)]
159+
results = await asyncio.gather(*tasks)
156160

157161
await mrd.close()
158-
return total_bytes_downloaded
162+
return sum(results)
159163

160164

161165
def _download_files_worker(process_idx, filename, params, bucket_type):

0 commit comments

Comments
 (0)