Skip to content

Commit 54b1971

Browse files
authored
fix(google-crc32c): release GIL for large buffers in crc32c operations (#16975)
Release the Global Interpreter Lock (GIL) in `_crc32c_extend` and `_crc32c_value` when processing large, immutable byte buffers (>= 1MB). This allows other Python threads to run concurrently during expensive crc32c calculations on large chunks of data. Fixes #16923 🦕 # Unit Tests ``` .nox/check-3-13/bin/pytest -v tests ==================================================================================================================== test session starts ===================================================================================================================== platform linux -- Python 3.13.12, pytest-9.0.3, pluggy-1.6.0 -- /usr/local/google/home/zhixiangli/Cloudtop/Github/zhixiangli/google-cloud-python/packages/google-crc32c/.nox/check-3-13/bin/python cachedir: .pytest_cache rootdir: /usr/local/google/home/zhixiangli/Cloudtop/Github/zhixiangli/google-cloud-python/packages/google-crc32c configfile: pyproject.toml collected 42 items tests/test___init__.py::test_extend_w_empty_chunk PASSED [ 2%] tests/test___init__.py::test_extend_w_multiple_chunks PASSED [ 4%] tests/test___init__.py::test_extend_w_reduce PASSED [ 7%] tests/test___init__.py::test_value[-0] PASSED [ 9%] tests/test___init__.py::test_value[\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00-2324772522] PASSED [ 11%] tests/test___init__.py::test_value[\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff-1655221059] PASSED [ 14%] tests/test___init__.py::test_value[\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f-1188919630] PASSED [ 16%] tests/test___init__.py::test_value[\x1f\x1e\x1d\x1c\x1b\x1a\x19\x18\x17\x16\x15\x14\x13\x12\x11\x10\x0f\x0e\r\x0c\x0b\n\t\x08\x07\x06\x05\x04\x03\x02\x01\x00-289397596] PASSED [ 19%] tests/test___init__.py::test_value[chunk5-3650501206] 
PASSED [ 21%] tests/test___init__.py::test_value[\x01\xc0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x14\x00\x00\x00\x18(\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00-3650501206] PASSED [ 23%] tests/test___init__.py::TestChecksum::test_ctor_defaults[python] PASSED [ 26%] tests/test___init__.py::TestChecksum::test_ctor_defaults[cext] PASSED [ 28%] tests/test___init__.py::TestChecksum::test_ctor_explicit[python] PASSED [ 30%] tests/test___init__.py::TestChecksum::test_ctor_explicit[cext] PASSED [ 33%] tests/test___init__.py::TestChecksum::test_update[python] PASSED [ 35%] tests/test___init__.py::TestChecksum::test_update[cext] PASSED [ 38%] tests/test___init__.py::TestChecksum::test_update_w_multiple_chunks[python] PASSED [ 40%] tests/test___init__.py::TestChecksum::test_update_w_multiple_chunks[cext] PASSED [ 42%] tests/test___init__.py::TestChecksum::test_digest_zero[python] PASSED [ 45%] tests/test___init__.py::TestChecksum::test_digest_zero[cext] PASSED [ 47%] tests/test___init__.py::TestChecksum::test_digest_nonzero[python] PASSED [ 50%] tests/test___init__.py::TestChecksum::test_digest_nonzero[cext] PASSED [ 52%] tests/test___init__.py::TestChecksum::test_hexdigest_zero[python] PASSED [ 54%] tests/test___init__.py::TestChecksum::test_hexdigest_zero[cext] PASSED [ 57%] tests/test___init__.py::TestChecksum::test_hexdigest_nonzero[python] PASSED [ 59%] tests/test___init__.py::TestChecksum::test_hexdigest_nonzero[cext] PASSED [ 61%] tests/test___init__.py::TestChecksum::test_copy[python] PASSED [ 64%] tests/test___init__.py::TestChecksum::test_copy[cext] PASSED [ 66%] tests/test___init__.py::TestChecksum::test_consume_stream[python-1] PASSED [ 69%] tests/test___init__.py::TestChecksum::test_consume_stream[python-3] PASSED [ 71%] tests/test___init__.py::TestChecksum::test_consume_stream[python-5] PASSED [ 73%] tests/test___init__.py::TestChecksum::test_consume_stream[python-7] PASSED [ 76%] 
tests/test___init__.py::TestChecksum::test_consume_stream[python-11] PASSED [ 78%] tests/test___init__.py::TestChecksum::test_consume_stream[python-13] PASSED [ 80%] tests/test___init__.py::TestChecksum::test_consume_stream[python-48] PASSED [ 83%] tests/test___init__.py::TestChecksum::test_consume_stream[cext-1] PASSED [ 85%] tests/test___init__.py::TestChecksum::test_consume_stream[cext-3] PASSED [ 88%] tests/test___init__.py::TestChecksum::test_consume_stream[cext-5] PASSED [ 90%] tests/test___init__.py::TestChecksum::test_consume_stream[cext-7] PASSED [ 92%] tests/test___init__.py::TestChecksum::test_consume_stream[cext-11] PASSED [ 95%] tests/test___init__.py::TestChecksum::test_consume_stream[cext-13] PASSED [ 97%] tests/test___init__.py::TestChecksum::test_consume_stream[cext-48] PASSED [100%] ===================================================================================================================== 42 passed in 0.08s ===================================================================================================================== ``` # Perf Tests ## Methodology Created an independent benchmark script (shown below) that measures the time taken for `crc32c.value()` on different buffer sizes (10KB to 10MB) with 1 thread and 4 threads. ## Results | Size | Threads | Before (s) | After (s) | Speedup | | :--- | :--- | :--- | :--- | :--- | | **10KB** | 1 | 0.0001 | 0.0001 | 1.0x | | | 4 | 0.0031 | 0.0033 | 0.9x | | **500KB**| 1 | 0.0023 | 0.0022 | 1.0x | | | 4 | 0.0110 | 0.0110 | 1.0x | | **1MB** | 1 | 0.0045 | 0.0045 | 1.0x | | | 4 | 0.0241 | 0.0067 | **3.6x** | | **5MB** | 1 | 0.0291 | 0.0227 | 1.3x | | | 4 | 0.1209 | 0.0245 | **4.9x** | | **10MB** | 1 | 0.0550 | 0.0451 | 1.2x | | | 4 | 0.2091 | 0.0483 | **4.3x** | ## Conclusion - **Multi-threaded performance for large buffers (>= 1MB) improved significantly.** We see speedups of 3.6x to 4.9x when using 4 threads on buffers of 1MB and larger. 
- **No regression for small buffers (< 1MB)** where the GIL is not released. - **Single-threaded performance is also slightly better** or comparable, showing no negative impact from the conditional GIL release overhead. ## Code ``` import time import concurrent.futures import os import sys try: from google_crc32c import _crc32c import google_crc32c print(f"Successfully imported _crc32c: {_crc32c}") except ImportError as e: print(f"Failed to import _crc32c: {e}") print("This benchmark requires the C extension.") sys.exit(1) def benchmark_single_threaded(data, iterations=100): start = time.time() for _ in range(iterations): google_crc32c.value(data) return time.time() - start def benchmark_multi_threaded(data, num_threads=4, iterations=100): start = time.time() def worker(): for _ in range(iterations): google_crc32c.value(data) with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor: futures = [executor.submit(worker) for _ in range(num_threads)] concurrent.futures.wait(futures) return time.time() - start sizes = { "10KB": 10 * 1024, "500KB": 500 * 1024, "1MB": 1024 * 1024, "5MB": 5 * 1024 * 1024, "10MB": 10 * 1024 * 1024, } print(f"{'Size':<10} | {'Threads':<10} | {'Time (s)':<10}") print("-" * 35) # Warmup dummy_data = os.urandom(1024) google_crc32c.value(dummy_data) for name, size in sizes.items(): data = os.urandom(size) # Single threaded t_single = benchmark_single_threaded(data, iterations=100) print(f"{name:<10} | {'1':<10} | {t_single:.4f}") # Multi threaded t_multi = benchmark_multi_threaded(data, num_threads=4, iterations=100) print(f"{name:<10} | {'4':<10} | {t_multi:.4f}") ```
1 parent 471eb13 commit 54b1971

1 file changed

Lines changed: 28 additions & 0 deletions

File tree

  • packages/google-crc32c/src/google_crc32c

packages/google-crc32c/src/google_crc32c/_crc32c.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@
22
#include <Python.h>
33
#include <crc32c/crc32c.h>
44

5+
/* The minimum buffer size in bytes (1MB) required to justify the overhead of releasing the GIL. */
6+
static const Py_ssize_t gil_threshold = 1024 * 1024;
7+
8+
static int
9+
_should_release_gil(Py_ssize_t length, PyObject *chunk_obj)
10+
{
11+
/* Checks if the chunk is immutable (bytes) to prevent concurrent modification,
12+
* and large enough to benefit from releasing the GIL. */
13+
return (length >= gil_threshold && PyBytes_Check(chunk_obj));
14+
}
515

616
/* _crc32c.extend(crc, chunk) -> int
 *
 * Extend an existing crc32c checksum `crc` over `chunk` and return the
 * updated 32-bit value.  For large immutable buffers the GIL is released
 * around the C hashing call so other Python threads can make progress.
 */
static PyObject *
_crc32c_extend(PyObject *self, PyObject *args)
{
    unsigned long crc_input;
    uint32_t crc;
    const char *chunk;
    Py_ssize_t length;
    PyThreadState *save = NULL;

    if (!PyArg_ParseTuple(args, "ky#", &crc_input, &chunk, &length))
        return NULL;

    /* `chunk` borrows the buffer of args[1]; releasing the GIL is safe
     * only for immutable `bytes` objects, which the helper checks.
     * args is guaranteed to be a 2-tuple after a successful parse. */
    if (_should_release_gil(length, PyTuple_GET_ITEM(args, 1))) {
        save = PyEval_SaveThread();
    }

    crc = crc32c_extend((uint32_t)crc_input, (const uint8_t *)chunk, length);

    if (save != NULL) {
        PyEval_RestoreThread(save);
    }

    return PyLong_FromUnsignedLong(crc);
}
2140

@@ -26,12 +45,21 @@ _crc32c_value(PyObject *self, PyObject *args)
2645
uint32_t crc;
2746
const char *chunk;
2847
Py_ssize_t length;
48+
PyThreadState *save = NULL;
2949

3050
if (!PyArg_ParseTuple(args, "y#", &chunk, &length))
3151
return NULL;
3252

53+
if (_should_release_gil(length, PyTuple_GET_ITEM(args, 0))) {
54+
save = PyEval_SaveThread();
55+
}
56+
3357
crc = crc32c_value((const uint8_t*)chunk, length);
3458

59+
if (save) {
60+
PyEval_RestoreThread(save);
61+
}
62+
3563
return PyLong_FromUnsignedLong(crc);
3664
}
3765

0 commit comments

Comments
 (0)