(improvement) cache namedtuple class in named_tuple_factory to avoid repeated exec() calls

mykaul · mykaul · commit a66228101c0b · 2026-04-07T14:02:39.000+03:00
Cache the Row namedtuple class keyed on tuple(colnames) so Python's
namedtuple() (which internally calls exec()) is only invoked once per
unique column schema. For prepared statements the column names never
change, eliminating redundant class creation on every result set.

Cache is a plain dict keyed on tuple(colnames) (raw column names before
cleaning). Error handling paths (SyntaxError, Exception) preserved
unchanged. Cache is naturally bounded by the number of distinct queries.
diff --git a/benchmarks/test_named_tuple_factory_benchmark.py b/benchmarks/test_named_tuple_factory_benchmark.py
@@ -0,0 +1,206 @@
+# Copyright ScyllaDB, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Benchmarks for named_tuple_factory with and without namedtuple class caching.
+
+Run with: pytest benchmarks/test_named_tuple_factory_benchmark.py -v
+"""
+
+import re
+import warnings
+from collections import namedtuple
+
+import pytest
+
+from cassandra.query import named_tuple_factory, _named_tuple_cache
+from cassandra.util import _sanitize_identifiers
+
+
+# ---------------------------------------------------------------------------
+# Reference: original uncached implementation (copied from master)
+# ---------------------------------------------------------------------------
+
+NON_ALPHA_REGEX = re.compile("[^a-zA-Z0-9]")
+START_BADCHAR_REGEX = re.compile("^[^a-zA-Z0-9]*")
+END_BADCHAR_REGEX = re.compile("[^a-zA-Z0-9_]*$")
+
+_clean_name_cache_old = {}
+
+
+def _clean_column_name_old(name):
+    try:
+        return _clean_name_cache_old[name]
+    except KeyError:
+        clean = NON_ALPHA_REGEX.sub(
+            "_", START_BADCHAR_REGEX.sub("", END_BADCHAR_REGEX.sub("", name))
+        )
+        _clean_name_cache_old[name] = clean
+        return clean
+
+
+def named_tuple_factory_uncached(colnames, rows):
+    """Original implementation without caching (for benchmark comparison)."""
+    clean_column_names = map(_clean_column_name_old, colnames)
+    try:
+        Row = namedtuple("Row", clean_column_names)
+    except SyntaxError:
+        raise
+    except Exception:
+        clean_column_names = list(map(_clean_column_name_old, colnames))
+        Row = namedtuple("Row", _sanitize_identifiers(clean_column_names))
+    return [Row(*row) for row in rows]
+
+
+# ---------------------------------------------------------------------------
+# Test data generators
+# ---------------------------------------------------------------------------
+
+
+def make_colnames(n):
+    return tuple(f"col_{i}" for i in range(n))
+
+
+def make_rows(ncols, nrows):
+    return [tuple(range(ncols)) for _ in range(nrows)]
+
+
+# ---------------------------------------------------------------------------
+# Correctness tests
+# ---------------------------------------------------------------------------
+
+
+class TestNamedTupleFactoryCorrectness:
+    """Verify the cached implementation matches the uncached one."""
+
+    @pytest.mark.parametrize("ncols", [1, 5, 10, 20])
+    @pytest.mark.parametrize("nrows", [1, 10, 100])
+    def test_results_match(self, ncols, nrows):
+        colnames = make_colnames(ncols)
+        rows = make_rows(ncols, nrows)
+        _named_tuple_cache.clear()
+        cached_result = named_tuple_factory(colnames, rows)
+        uncached_result = named_tuple_factory_uncached(colnames, rows)
+        assert len(cached_result) == len(uncached_result)
+        for cr, ur in zip(cached_result, uncached_result):
+            assert tuple(cr) == tuple(ur)
+            assert cr._fields == ur._fields
+
+    def test_cache_hit_returns_same_class(self):
+        colnames = ("name", "age", "email")
+        rows1 = [("Alice", 30, "a@b.com")]
+        rows2 = [("Bob", 25, "b@c.com")]
+        _named_tuple_cache.clear()
+        result1 = named_tuple_factory(colnames, rows1)
+        result2 = named_tuple_factory(colnames, rows2)
+        # Same Row class should be reused
+        assert type(result1[0]) is type(result2[0])
+
+    def test_different_schemas_get_different_classes(self):
+        _named_tuple_cache.clear()
+        result1 = named_tuple_factory(("a", "b"), [(1, 2)])
+        result2 = named_tuple_factory(("x", "y"), [(3, 4)])
+        assert type(result1[0]) is not type(result2[0])
+        assert result1[0]._fields == ("a", "b")
+        assert result2[0]._fields == ("x", "y")
+
+
+# ---------------------------------------------------------------------------
+# Benchmarks
+# ---------------------------------------------------------------------------
+
+
+class TestNamedTupleFactoryBenchmark:
+    """Benchmark cached vs uncached named_tuple_factory."""
+
+    # --- 5 columns, 100 rows ---
+
+    @pytest.mark.benchmark(group="ntf_5cols_100rows")
+    def test_uncached_5cols_100rows(self, benchmark):
+        colnames = make_colnames(5)
+        rows = make_rows(5, 100)
+        benchmark(named_tuple_factory_uncached, colnames, rows)
+
+    @pytest.mark.benchmark(group="ntf_5cols_100rows")
+    def test_cached_5cols_100rows(self, benchmark):
+        colnames = make_colnames(5)
+        rows = make_rows(5, 100)
+        _named_tuple_cache.clear()
+        # Warm the cache with one call
+        named_tuple_factory(colnames, rows)
+        benchmark(named_tuple_factory, colnames, rows)
+
+    # --- 10 columns, 100 rows ---
+
+    @pytest.mark.benchmark(group="ntf_10cols_100rows")
+    def test_uncached_10cols_100rows(self, benchmark):
+        colnames = make_colnames(10)
+        rows = make_rows(10, 100)
+        benchmark(named_tuple_factory_uncached, colnames, rows)
+
+    @pytest.mark.benchmark(group="ntf_10cols_100rows")
+    def test_cached_10cols_100rows(self, benchmark):
+        colnames = make_colnames(10)
+        rows = make_rows(10, 100)
+        _named_tuple_cache.clear()
+        named_tuple_factory(colnames, rows)
+        benchmark(named_tuple_factory, colnames, rows)
+
+    # --- 20 columns, 100 rows ---
+
+    @pytest.mark.benchmark(group="ntf_20cols_100rows")
+    def test_uncached_20cols_100rows(self, benchmark):
+        colnames = make_colnames(20)
+        rows = make_rows(20, 100)
+        benchmark(named_tuple_factory_uncached, colnames, rows)
+
+    @pytest.mark.benchmark(group="ntf_20cols_100rows")
+    def test_cached_20cols_100rows(self, benchmark):
+        colnames = make_colnames(20)
+        rows = make_rows(20, 100)
+        _named_tuple_cache.clear()
+        named_tuple_factory(colnames, rows)
+        benchmark(named_tuple_factory, colnames, rows)
+
+    # --- 5 columns, 1000 rows ---
+
+    @pytest.mark.benchmark(group="ntf_5cols_1000rows")
+    def test_uncached_5cols_1000rows(self, benchmark):
+        colnames = make_colnames(5)
+        rows = make_rows(5, 1000)
+        benchmark(named_tuple_factory_uncached, colnames, rows)
+
+    @pytest.mark.benchmark(group="ntf_5cols_1000rows")
+    def test_cached_5cols_1000rows(self, benchmark):
+        colnames = make_colnames(5)
+        rows = make_rows(5, 1000)
+        _named_tuple_cache.clear()
+        named_tuple_factory(colnames, rows)
+        benchmark(named_tuple_factory, colnames, rows)
+
+    # --- 10 columns, 1 row (measures class creation overhead most clearly) ---
+
+    @pytest.mark.benchmark(group="ntf_10cols_1row")
+    def test_uncached_10cols_1row(self, benchmark):
+        colnames = make_colnames(10)
+        rows = make_rows(10, 1)
+        benchmark(named_tuple_factory_uncached, colnames, rows)
+
+    @pytest.mark.benchmark(group="ntf_10cols_1row")
+    def test_cached_10cols_1row(self, benchmark):
+        colnames = make_colnames(10)
+        rows = make_rows(10, 1)
+        _named_tuple_cache.clear()
+        named_tuple_factory(colnames, rows)
+        benchmark(named_tuple_factory, colnames, rows)
diff --git a/cassandra/query.py b/cassandra/query.py
@@ -117,6 +117,12 @@ def pseudo_namedtuple_factory(colnames, rows):
             for od in ordered_dict_factory(colnames, rows)]
 
 
+# Cache namedtuple Row classes to avoid repeated exec() calls in namedtuple()
+# for the same column schema. Naturally bounded by the number of distinct
+# column-name tuples, which equals the number of distinct queries.
+_named_tuple_cache = {}
+
+
 def named_tuple_factory(colnames, rows):
     """
     Returns each row as a `namedtuple <https://docs.python.org/2/library/collections.html#collections.namedtuple>`_.
@@ -146,32 +152,37 @@ def named_tuple_factory(colnames, rows):
     .. versionchanged:: 2.0.0
         moved from ``cassandra.decoder`` to ``cassandra.query``
     """
-    clean_column_names = map(_clean_column_name, colnames)
+    key = tuple(colnames)
     try:
-        Row = namedtuple('Row', clean_column_names)
-    except SyntaxError:
-        warnings.warn(
-            "Failed creating namedtuple for a result because there were too "
-            "many columns. This is due to a Python limitation that affects "
-            "namedtuple in Python 3.0-3.6 (see issue18896). The row will be "
-            "created with {substitute_factory_name}, which lacks some namedtuple "
-            "features and is slower. To avoid slower performance accessing "
-            "values on row objects, Upgrade to Python 3.7, or use a different "
-            "row factory. (column names: {colnames})".format(
-                substitute_factory_name=pseudo_namedtuple_factory.__name__,
-                colnames=colnames
+        Row = _named_tuple_cache[key]
+    except KeyError:
+        clean_column_names = map(_clean_column_name, colnames)
+        try:
+            Row = namedtuple('Row', clean_column_names)
+        except SyntaxError:
+            warnings.warn(
+                "Failed creating namedtuple for a result because there were too "
+                "many columns. This is due to a Python limitation that affects "
+                "namedtuple in Python 3.0-3.6 (see issue18896). The row will be "
+                "created with {substitute_factory_name}, which lacks some namedtuple "
+                "features and is slower. To avoid slower performance accessing "
+                "values on row objects, Upgrade to Python 3.7, or use a different "
+                "row factory. (column names: {colnames})".format(
+                    substitute_factory_name=pseudo_namedtuple_factory.__name__,
+                    colnames=colnames
+                )
             )
-        )
-        return pseudo_namedtuple_factory(colnames, rows)
-    except Exception:
-        clean_column_names = list(map(_clean_column_name, colnames))  # create list because py3 map object will be consumed by first attempt
-        log.warning("Failed creating named tuple for results with column names %s (cleaned: %s) "
-                    "(see Python 'namedtuple' documentation for details on name rules). "
-                    "Results will be returned with positional names. "
-                    "Avoid this by choosing different names, using SELECT \"<col name>\" AS aliases, "
-                    "or specifying a different row_factory on your Session" %
-                    (colnames, clean_column_names))
-        Row = namedtuple('Row', _sanitize_identifiers(clean_column_names))
+            return pseudo_namedtuple_factory(colnames, rows)
+        except Exception:
+            clean_column_names = list(map(_clean_column_name, colnames))  # create list because py3 map object will be consumed by first attempt
+            log.warning("Failed creating named tuple for results with column names %s (cleaned: %s) "
+                        "(see Python 'namedtuple' documentation for details on name rules). "
+                        "Results will be returned with positional names. "
+                        "Avoid this by choosing different names, using SELECT \"<col name>\" AS aliases, "
+                        "or specifying a different row_factory on your Session" %
+                        (colnames, clean_column_names))
+            Row = namedtuple('Row', _sanitize_identifiers(clean_column_names))
+        _named_tuple_cache[key] = Row
 
     return [Row(*row) for row in rows]