-
Notifications
You must be signed in to change notification settings - Fork 1.7k
perf: add microbenchmarks for crc32c and MRD reads #17392
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from 1 commit
Commits
Show all changes
18 commits
Select commit
Hold shift + click to select a range
75d0127
perf: add microbenchmarks for crc32c and MRD reads
chandra-siri 02fa59c
perf: add warmup phase to MRD reads benchmark
chandra-siri 891a981
perf: update warmup chunk size to 10MiB
chandra-siri 88f0e21
perf: make GCS object size configurable in MRD reads benchmark
chandra-siri 1dbb5f2
perf: add % change when checksum disabled column to MRD reads report
chandra-siri fc85a68
perf: support pre-upload and full range downloads in MRD reads benchmark
chandra-siri 0334d2c
perf: align upload/download sizes and use random temp objects in MRD …
chandra-siri f268e4f
perf: migrate prints to stderr logging, introducing --debug option fo…
chandra-siri 1fe6c69
perf: skip Full-1 case when checksum validation is disabled
chandra-siri 9403ed4
perf: add pytest-benchmark test for checksum overhead in MRD reads
chandra-siri bc0ac8d
perf: calculate and report average throughput in test_checksum_overhead
chandra-siri 3efb011
perf: convert test parameter to (object_size, download_size) tuple, t…
chandra-siri 183e297
perf: upload fresh object for each enable_chk iteration in MRD reads …
chandra-siri 39d633c
perf: compare Full-1 throughput with Full baseline in MRD reads bench…
chandra-siri ec174cd
perf: make test_checksum_overhead rounds configurable via BENCHMARK_R…
chandra-siri 9db8398
perf: calculate standard deviation in throughput and elapsed time, ad…
chandra-siri 56c3968
perf: add standard deviation to throughput reporting in MRD reads ben…
chandra-siri 3d67e6e
chore: fix formatting and lint issues
chandra-siri File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
107 changes: 107 additions & 0 deletions
107
packages/google-cloud-storage/tests/perf/microbenchmarks/benchmark_crc32c.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,107 @@ | ||
| import argparse | ||
| import os | ||
| import statistics | ||
| import sys | ||
| import time | ||
|
|
||
| try: | ||
| import google_crc32c | ||
| except ImportError: | ||
| print("Error: google_crc32c package is not installed in the python environment.", file=sys.stderr) | ||
| sys.exit(1) | ||
|
|
||
|
|
||
def parse_size(size_str: str) -> int:
    """Convert a human-readable size such as ``"2MiB"`` into a byte count.

    Matching is case-insensitive and ignores surrounding whitespace.
    Supported suffixes are the binary units ``KiB``/``MiB``/``GiB``
    (powers of 1024), the decimal units ``KB``/``MB``/``GB`` (powers of
    1000) and ``B``. A bare integer is interpreted as bytes. Fractional
    quantities are allowed only with a multi-byte unit (``"1.5MiB"``) and
    the result is truncated to a whole number of bytes.

    Args:
        size_str: The textual size, e.g. ``"100KiB"``, ``"1.5MB"``, ``"42"``.

    Returns:
        The size in bytes.

    Raises:
        ValueError: If the text is not a recognised size, is not finite,
            or is negative.
    """
    text = size_str.strip().upper()

    # Ordered so that the three-letter binary suffixes are tried before the
    # shorter suffixes they end with ("KIB" also ends with "B").
    units = (
        ("KIB", 1024),
        ("MIB", 1024**2),
        ("GIB", 1024**3),
        ("KB", 1000),
        ("MB", 1000**2),
        ("GB", 1000**3),
        ("B", 1),
    )
    number, factor = text, 1
    for suffix, multiplier in units:
        if text.endswith(suffix):
            number, factor = text[: -len(suffix)], multiplier
            break

    try:
        # Integer arithmetic keeps large whole-number sizes exact.
        quantity = int(number)
    except ValueError:
        if factor == 1:
            # A fractional number of bytes is meaningless.
            raise ValueError(f"Unknown size format: {text}") from None
        try:
            quantity = float(number)
        except ValueError:
            raise ValueError(f"Unknown size format: {text}") from None

    try:
        size = int(quantity * factor)
    except (OverflowError, ValueError):
        # int() raises OverflowError for inf and ValueError for nan; report
        # both as ValueError so callers only need to catch one type.
        raise ValueError(f"Unknown size format: {text}") from None

    if quantity < 0:
        raise ValueError(f"Size must not be negative: {text}")
    return size
|
|
||
|
|
||
def format_time(seconds: float) -> str:
    """Render a duration in seconds with an appropriate unit.

    The value is shown with two decimals in nanoseconds, microseconds,
    milliseconds or seconds. The unit is chosen from the *rounded* value,
    so a duration just under a unit boundary is promoted to the next unit
    (``"1.00 ms"``) instead of being printed as ``"1000.00"`` of the
    smaller unit.

    Args:
        seconds: The duration in seconds.

    Returns:
        The formatted duration, e.g. ``"250.00 ms"``.
    """
    # (exclusive upper bound in seconds, multiplier, unit label)
    scales = (
        (1e-6, 1e9, "ns"),
        (1e-3, 1e6, "\u03bcs"),
        (1.0, 1e3, "ms"),
    )
    for limit, factor, unit in scales:
        if seconds >= limit:
            continue
        rendered = f"{seconds * factor:.2f}"
        # Rounding to two decimals can push the value up to 1000.00; in
        # that case fall through to the next larger unit.
        if float(rendered) < 1000.0:
            return f"{rendered} {unit}"
    return f"{seconds:.2f} s"
|
|
||
|
|
||
def main():
    """Time ``google_crc32c.value`` over random payloads of several sizes.

    Command-line options:
        --sizes: Comma-separated payload sizes understood by ``parse_size``.
        --iterations: Number of timed calls per size; must be at least 1.

    For each size a random payload is generated once, ``google_crc32c.value``
    is timed ``--iterations`` times on it, and one table row with the min,
    max, mean and median duration is printed to stdout.

    Exits with status 1 if ``google_crc32c.implementation`` is not ``"c"``
    or a size cannot be parsed, and with the argparse usage error status if
    ``--iterations`` is not positive.
    """
    parser = argparse.ArgumentParser(description="Benchmark google_crc32c.value execution time.")
    parser.add_argument(
        "--sizes",
        type=str,
        default="1KiB,100KiB,2MiB",
        help="Comma-separated list of sizes (e.g. '1KiB,100KiB,2MiB')"
    )
    parser.add_argument(
        "--iterations",
        type=int,
        default=100,
        help="Number of iterations for benchmark (default: 100)"
    )
    args = parser.parse_args()

    # With no samples, min()/statistics.mean() below would raise; reject the
    # value up front with a normal usage error instead of a traceback.
    if args.iterations < 1:
        parser.error("--iterations must be a positive integer")

    # Ensure google_crc32c uses accelerated C code
    impl = getattr(google_crc32c, "implementation", None)
    print(f"google_crc32c implementation: {impl}")
    if impl != "c":
        print(f"Error: google_crc32c is not using the accelerated C code (got '{impl}').", file=sys.stderr)
        sys.exit(1)

    # Parse every size before benchmarking so a typo fails fast.
    sizes_to_test = []
    for s in args.sizes.split(","):
        try:
            sizes_to_test.append((s.strip(), parse_size(s)))
        except ValueError as e:
            print(f"Error parsing size '{s}': {e}", file=sys.stderr)
            sys.exit(1)

    print(f"Benchmarking google_crc32c.value(data) with {args.iterations} iterations:")
    print("-" * 80)
    print(f"{'Size (String)':<15} | {'Size (Bytes)':<12} | {'Min':<10} | {'Max':<10} | {'Mean':<10} | {'Median':<10}")
    print("-" * 80)

    for size_str, size_bytes in sizes_to_test:
        data = os.urandom(size_bytes)

        durations = []
        for _ in range(args.iterations):
            # Only the checksum call itself sits between the two clock reads.
            start = time.perf_counter()
            google_crc32c.value(data)
            end = time.perf_counter()
            durations.append(end - start)

        min_time = min(durations)
        max_time = max(durations)
        mean_time = statistics.mean(durations)
        median_time = statistics.median(durations)

        print(
            f"{size_str:<15} | {size_bytes:<12} | "
            f"{format_time(min_time):<10} | {format_time(max_time):<10} | "
            f"{format_time(mean_time):<10} | {format_time(median_time):<10}"
        )
|
|
||
|
|
||
# Run the benchmark only when this file is executed as a script, so that
# importing the module has no side effects beyond the definitions above.
if __name__ == "__main__":
    main()
168 changes: 168 additions & 0 deletions
168
packages/google-cloud-storage/tests/perf/microbenchmarks/benchmark_mrd_reads.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,168 @@ | ||
| import argparse | ||
| import asyncio | ||
| import os | ||
| import random | ||
| import statistics | ||
| import sys | ||
| import time | ||
|
|
||
| try: | ||
| import google_crc32c | ||
| except ImportError: | ||
| print("Error: google_crc32c package is not installed in the python environment.", file=sys.stderr) | ||
| sys.exit(1) | ||
|
|
||
| from google.cloud.storage.asyncio.async_grpc_client import AsyncGrpcClient | ||
| from google.cloud.storage.asyncio.async_multi_range_downloader import AsyncMultiRangeDownloader | ||
|
|
||
|
|
||
class VoidBuffer:
    """Writable sink that keeps only a running byte count.

    Nothing passed to ``write`` is retained, so arbitrarily large downloads
    can be directed here without the payload accumulating in memory.
    """

    def __init__(self):
        # Total number of bytes accepted so far; also reported by tell().
        self.size = 0

    def tell(self) -> int:
        """Return the number of bytes written so far."""
        return self.size

    def write(self, data: bytes) -> int:
        """Discard ``data``, add its length to the count and return that length."""
        accepted = len(data)
        self.size = self.size + accepted
        return accepted
|
|
||
|
|
||
def parse_size(size_str: str) -> int:
    """Convert a human-readable size such as ``"2MiB"`` into a byte count.

    Matching is case-insensitive and ignores surrounding whitespace.
    Supported suffixes are the binary units ``KiB``/``MiB``/``GiB``
    (powers of 1024), the decimal units ``KB``/``MB``/``GB`` (powers of
    1000) and ``B``. A bare integer is interpreted as bytes. Fractional
    quantities are allowed only with a multi-byte unit (``"1.5MiB"``) and
    the result is truncated to a whole number of bytes.

    Args:
        size_str: The textual size, e.g. ``"100KiB"``, ``"1.5MB"``, ``"42"``.

    Returns:
        The size in bytes.

    Raises:
        ValueError: If the text is not a recognised size, is not finite,
            or is negative.
    """
    text = size_str.strip().upper()

    # Ordered so that the three-letter binary suffixes are tried before the
    # shorter suffixes they end with ("KIB" also ends with "B").
    units = (
        ("KIB", 1024),
        ("MIB", 1024**2),
        ("GIB", 1024**3),
        ("KB", 1000),
        ("MB", 1000**2),
        ("GB", 1000**3),
        ("B", 1),
    )
    number, factor = text, 1
    for suffix, multiplier in units:
        if text.endswith(suffix):
            number, factor = text[: -len(suffix)], multiplier
            break

    try:
        # Integer arithmetic keeps large whole-number sizes exact.
        quantity = int(number)
    except ValueError:
        if factor == 1:
            # A fractional number of bytes is meaningless.
            raise ValueError(f"Unknown size format: {text}") from None
        try:
            quantity = float(number)
        except ValueError:
            raise ValueError(f"Unknown size format: {text}") from None

    try:
        size = int(quantity * factor)
    except (OverflowError, ValueError):
        # int() raises OverflowError for inf and ValueError for nan; report
        # both as ValueError so callers only need to catch one type.
        raise ValueError(f"Unknown size format: {text}") from None

    if quantity < 0:
        raise ValueError(f"Size must not be negative: {text}")
    return size
|
|
||
|
|
||
def format_time(seconds: float) -> str:
    """Render a duration in seconds with an appropriate unit.

    The value is shown with two decimals in nanoseconds, microseconds,
    milliseconds or seconds. The unit is chosen from the *rounded* value,
    so a duration just under a unit boundary is promoted to the next unit
    (``"1.00 ms"``) instead of being printed as ``"1000.00"`` of the
    smaller unit.

    Args:
        seconds: The duration in seconds.

    Returns:
        The formatted duration, e.g. ``"250.00 ms"``.
    """
    # (exclusive upper bound in seconds, multiplier, unit label)
    scales = (
        (1e-6, 1e9, "ns"),
        (1e-3, 1e6, "\u03bcs"),
        (1.0, 1e3, "ms"),
    )
    for limit, factor, unit in scales:
        if seconds >= limit:
            continue
        rendered = f"{seconds * factor:.2f}"
        # Rounding to two decimals can push the value up to 1000.00; in
        # that case fall through to the next larger unit.
        if float(rendered) < 1000.0:
            return f"{rendered} {unit}"
    return f"{seconds:.2f} s"
|
|
||
|
|
||
async def download_range(
    grpc_client: AsyncGrpcClient,
    bucket_name: str,
    object_name: str,
    start_byte: int,
    size: int,
    enable_checksum: bool,
) -> float:
    """Download one byte range through a fresh downloader and time it.

    A new ``AsyncMultiRangeDownloader`` is created and opened for every
    call. Only the ``download_ranges`` call is timed; opening and closing
    the downloader fall outside the measured window. The downloaded bytes
    go to a ``VoidBuffer`` and are discarded.

    Args:
        grpc_client: Client handed to the downloader.
        bucket_name: Bucket containing the object.
        object_name: Object to read from.
        start_byte: Offset of the first byte to read.
        size: Number of bytes requested.
        enable_checksum: Forwarded to ``download_ranges`` as ``enable_checksum``.

    Returns:
        Elapsed wall-clock seconds of the ``download_ranges`` call.
    """
    downloader = AsyncMultiRangeDownloader(grpc_client, bucket_name, object_name)
    try:
        await downloader.open()
        sink = VoidBuffer()
        started_at = time.perf_counter()
        await downloader.download_ranges(
            [(start_byte, size, sink)],
            enable_checksum=enable_checksum,
        )
        elapsed = time.perf_counter() - started_at
    finally:
        # Close the stream whether or not the download succeeded.
        if downloader.is_stream_open:
            await downloader.close()
    return elapsed
|
|
||
|
|
||
async def run_benchmark():
    """Benchmark MRD range reads with checksum validation on and off.

    Command-line options:
        --bucket: Bucket holding the test object.
        --object: Name of the test object.
        --object-size: Size of that object (default ``10GiB``); bounds the
            random read offsets.
        --sizes: Comma-separated read sizes understood by ``parse_size``.
        --iterations: Number of reads per size and checksum setting; must
            be at least 1.

    For each read size a list of random offsets is drawn once and reused
    for both the checksum-enabled and checksum-disabled runs. Each run
    prints per-iteration progress followed by a summary row with min, max,
    mean, median and average throughput in MiB/s. A failed iteration is
    reported and skipped; a run where every iteration failed prints a
    ``FAILED`` row.

    Exits with status 1 if ``google_crc32c.implementation`` is not ``"c"``,
    a size cannot be parsed, or a read size does not fit inside the object.
    """
    parser = argparse.ArgumentParser(description="Benchmark GCS Object Range Downloads using MRD.")
    parser.add_argument("--bucket", type=str, default="chandrasiri-benchmarks-zb", help="Bucket name")
    parser.add_argument("--object", type=str, default="large_20260507_10737418240", help="Object name (10GiB size)")
    parser.add_argument(
        "--object-size",
        type=str,
        default="10GiB",
        help="Size of the object named by --object (default: 10GiB)",
    )
    parser.add_argument("--sizes", type=str, default="1KiB,2MiB,10MiB,100MiB,1GiB", help="Sizes to benchmark")
    parser.add_argument("--iterations", type=int, default=5, help="Number of iterations per size")
    args = parser.parse_args()

    if args.iterations < 1:
        parser.error("--iterations must be a positive integer")

    impl = getattr(google_crc32c, "implementation", None)
    print(f"google_crc32c implementation: {impl}")
    if impl != "c":
        print(f"Error: google_crc32c implementation is '{impl}', expected 'c'", file=sys.stderr)
        sys.exit(1)

    try:
        object_size_bytes = parse_size(args.object_size)
    except ValueError as e:
        print(f"Error parsing size '{args.object_size}': {e}", file=sys.stderr)
        sys.exit(1)

    sizes_to_test = []
    for s in args.sizes.split(","):
        try:
            size_bytes = parse_size(s)
        except ValueError as e:
            print(f"Error parsing size '{s}': {e}", file=sys.stderr)
            sys.exit(1)
        # A read larger than the object would make random.randint() below
        # raise on an empty range; reject it with a clear message instead.
        if not 0 <= size_bytes <= object_size_bytes:
            print(
                f"Error: size '{s.strip()}' must be between 0 and the object size "
                f"({object_size_bytes} bytes)",
                file=sys.stderr,
            )
            sys.exit(1)
        sizes_to_test.append((s.strip(), size_bytes))

    grpc_client = AsyncGrpcClient()

    print(f"Benchmarking MRD Reads on gs://{args.bucket}/{args.object} with {args.iterations} iterations:")
    print("-" * 125)
    print(f"{'Size (String)':<15} | {'Checksum':<10} | {'Size (Bytes)':<12} | {'Min':<12} | {'Max':<12} | {'Mean':<12} | {'Median':<12} | {'Avg Throughput':<15}")
    print("-" * 125)

    for size_str, size_bytes in sizes_to_test:
        # Pre-generate random offsets so that both Enabled and Disabled configurations run on the exact same offsets
        offsets = [random.randint(0, object_size_bytes - size_bytes) for _ in range(args.iterations)]

        for enable_chk in [True, False]:
            chk_label = "Enabled" if enable_chk else "Disabled"
            durations = []

            for i, start_byte in enumerate(offsets):
                print(f" [{size_str} - Checksum {chk_label}] Iteration {i+1}/{args.iterations}: Downloading from offset {start_byte}...", end="", flush=True)

                try:
                    duration = await download_range(grpc_client, args.bucket, args.object, start_byte, size_bytes, enable_checksum=enable_chk)
                    durations.append(duration)
                    print(f" Done in {format_time(duration)}")
                except Exception as e:
                    # Best effort: one failed read should not abort the run.
                    print(f" Failed: {e}")
                    continue

            if not durations:
                print(f"{size_str:<15} | {chk_label:<10} | {size_bytes:<12} | {'FAILED':<12} | {'FAILED':<12} | {'FAILED':<12} | {'FAILED':<12} | {'N/A':<15}")
                continue

            min_time = min(durations)
            max_time = max(durations)
            mean_time = statistics.mean(durations)
            median_time = statistics.median(durations)

            # Throughput in MiB/s
            avg_throughput = (size_bytes / (1024 * 1024)) / mean_time

            print(
                f"{size_str:<15} | {chk_label:<10} | {size_bytes:<12} | "
                f"{format_time(min_time):<12} | {format_time(max_time):<12} | "
                f"{format_time(mean_time):<12} | {format_time(median_time):<12} | "
                f"{avg_throughput:.2f} MiB/s"
            )
        print("-" * 125)
|
|
||
|
|
||
def main():
    """Synchronous entry point: run the async benchmark to completion."""
    benchmark = run_benchmark()
    asyncio.run(benchmark)
|
|
||
|
|
||
# Run the benchmark only when this file is executed as a script, so that
# importing the module does not start any downloads.
if __name__ == "__main__":
    main()
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.