Skip to content

Commit a662281

Browse files
committed
(improvement) cache namedtuple class in named_tuple_factory to avoid repeated exec() calls
Cache the Row namedtuple class in a plain dict keyed on tuple(colnames) (the raw column names, before cleaning), so that Python's namedtuple(), which internally calls exec(), is invoked only once per unique column schema. For prepared statements the column names never change, so this eliminates redundant class creation on every result set. The error-handling paths (SyntaxError, Exception) are preserved unchanged. The cache is naturally bounded by the number of distinct column-name tuples, which is at most the number of distinct queries.
1 parent caa98b6 commit a662281

2 files changed

Lines changed: 241 additions & 24 deletions

File tree

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
# Copyright ScyllaDB, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
Benchmarks for named_tuple_factory with and without namedtuple class caching.
17+
18+
Run with: pytest benchmarks/test_named_tuple_factory_benchmark.py -v
19+
"""
20+
21+
import re
22+
import warnings
23+
from collections import namedtuple
24+
25+
import pytest
26+
27+
from cassandra.query import named_tuple_factory, _named_tuple_cache
28+
from cassandra.util import _sanitize_identifiers
29+
30+
31+
# ---------------------------------------------------------------------------
32+
# Reference: original uncached implementation (copied from master)
33+
# ---------------------------------------------------------------------------
34+
35+
NON_ALPHA_REGEX = re.compile("[^a-zA-Z0-9]")
36+
START_BADCHAR_REGEX = re.compile("^[^a-zA-Z0-9]*")
37+
END_BADCHAR_REGEX = re.compile("[^a-zA-Z0-9_]*$")
38+
39+
_clean_name_cache_old = {}
40+
41+
42+
def _clean_column_name_old(name):
43+
try:
44+
return _clean_name_cache_old[name]
45+
except KeyError:
46+
clean = NON_ALPHA_REGEX.sub(
47+
"_", START_BADCHAR_REGEX.sub("", END_BADCHAR_REGEX.sub("", name))
48+
)
49+
_clean_name_cache_old[name] = clean
50+
return clean
51+
52+
53+
def named_tuple_factory_uncached(colnames, rows):
    """Reference row factory that builds a fresh ``Row`` class on every call.

    Kept only as the baseline for the benchmark comparison: the cleaned
    column names are handed to ``namedtuple`` each time.  A ``SyntaxError``
    propagates to the caller; any other failure falls back to names passed
    through ``_sanitize_identifiers``.
    """
    lazy_names = map(_clean_column_name_old, colnames)
    try:
        row_cls = namedtuple("Row", lazy_names)
    except Exception as exc:
        if isinstance(exc, SyntaxError):
            raise
        # The map object above may already be consumed, so clean the names again.
        fallback_names = [_clean_column_name_old(raw) for raw in colnames]
        row_cls = namedtuple("Row", _sanitize_identifiers(fallback_names))
    return [row_cls(*values) for values in rows]
64+
65+
66+
# ---------------------------------------------------------------------------
67+
# Test data generators
68+
# ---------------------------------------------------------------------------
69+
70+
71+
def make_colnames(n):
    """Return the tuple ``("col_0", ..., "col_<n-1>")`` of ``n`` column names."""
    return tuple(map("col_{}".format, range(n)))
73+
74+
75+
def make_rows(ncols, nrows):
    """Return a list of ``nrows`` rows, each the tuple ``(0, 1, ..., ncols - 1)``."""
    generated = []
    for _ in range(nrows):
        generated.append(tuple(range(ncols)))
    return generated
77+
78+
79+
# ---------------------------------------------------------------------------
80+
# Correctness tests
81+
# ---------------------------------------------------------------------------
82+
83+
84+
class TestNamedTupleFactoryCorrectness:
    """Check that the caching ``named_tuple_factory`` agrees with the uncached reference."""

    @pytest.mark.parametrize("ncols", [1, 5, 10, 20])
    @pytest.mark.parametrize("nrows", [1, 10, 100])
    def test_results_match(self, ncols, nrows):
        """Both factories yield rows with equal values and equal field names."""
        colnames = make_colnames(ncols)
        rows = make_rows(ncols, nrows)
        # Start from an empty cache before calling the cached factory.
        _named_tuple_cache.clear()
        from_cached = named_tuple_factory(colnames, rows)
        from_reference = named_tuple_factory_uncached(colnames, rows)
        assert len(from_cached) == len(from_reference)
        for cached_row, reference_row in zip(from_cached, from_reference):
            assert tuple(cached_row) == tuple(reference_row)
            assert cached_row._fields == reference_row._fields

    def test_cache_hit_returns_same_class(self):
        """Two calls with the same column names share one Row class."""
        schema = ("name", "age", "email")
        first_rows = [("Alice", 30, "a@b.com")]
        second_rows = [("Bob", 25, "b@c.com")]
        _named_tuple_cache.clear()
        first_batch = named_tuple_factory(schema, first_rows)
        second_batch = named_tuple_factory(schema, second_rows)
        first_cls = type(first_batch[0])
        second_cls = type(second_batch[0])
        # Same Row class should be reused
        assert first_cls is second_cls

    def test_different_schemas_get_different_classes(self):
        """Different column-name tuples produce distinct Row classes."""
        _named_tuple_cache.clear()
        ab_row = named_tuple_factory(("a", "b"), [(1, 2)])[0]
        xy_row = named_tuple_factory(("x", "y"), [(3, 4)])[0]
        assert type(ab_row) is not type(xy_row)
        assert ab_row._fields == ("a", "b")
        assert xy_row._fields == ("x", "y")
117+
118+
119+
# ---------------------------------------------------------------------------
120+
# Benchmarks
121+
# ---------------------------------------------------------------------------
122+
123+
124+
class TestNamedTupleFactoryBenchmark:
    """Benchmark the cached ``named_tuple_factory`` against the uncached reference.

    Each shape (columns x rows) has one uncached and one cached test placed
    in the same benchmark group so their timings are reported side by side.
    """

    @staticmethod
    def _run_uncached(benchmark, ncols, nrows):
        """Benchmark the uncached reference factory on an ``ncols`` x ``nrows`` result."""
        colnames = make_colnames(ncols)
        rows = make_rows(ncols, nrows)
        benchmark(named_tuple_factory_uncached, colnames, rows)

    @staticmethod
    def _run_cached(benchmark, ncols, nrows):
        """Benchmark the cached factory on an ``ncols`` x ``nrows`` result."""
        colnames = make_colnames(ncols)
        rows = make_rows(ncols, nrows)
        _named_tuple_cache.clear()
        # Warm the cache with one call
        named_tuple_factory(colnames, rows)
        benchmark(named_tuple_factory, colnames, rows)

    # --- 5 columns, 100 rows ---

    @pytest.mark.benchmark(group="ntf_5cols_100rows")
    def test_uncached_5cols_100rows(self, benchmark):
        self._run_uncached(benchmark, 5, 100)

    @pytest.mark.benchmark(group="ntf_5cols_100rows")
    def test_cached_5cols_100rows(self, benchmark):
        self._run_cached(benchmark, 5, 100)

    # --- 10 columns, 100 rows ---

    @pytest.mark.benchmark(group="ntf_10cols_100rows")
    def test_uncached_10cols_100rows(self, benchmark):
        self._run_uncached(benchmark, 10, 100)

    @pytest.mark.benchmark(group="ntf_10cols_100rows")
    def test_cached_10cols_100rows(self, benchmark):
        self._run_cached(benchmark, 10, 100)

    # --- 20 columns, 100 rows ---

    @pytest.mark.benchmark(group="ntf_20cols_100rows")
    def test_uncached_20cols_100rows(self, benchmark):
        self._run_uncached(benchmark, 20, 100)

    @pytest.mark.benchmark(group="ntf_20cols_100rows")
    def test_cached_20cols_100rows(self, benchmark):
        self._run_cached(benchmark, 20, 100)

    # --- 5 columns, 1000 rows ---

    @pytest.mark.benchmark(group="ntf_5cols_1000rows")
    def test_uncached_5cols_1000rows(self, benchmark):
        self._run_uncached(benchmark, 5, 1000)

    @pytest.mark.benchmark(group="ntf_5cols_1000rows")
    def test_cached_5cols_1000rows(self, benchmark):
        self._run_cached(benchmark, 5, 1000)

    # --- 10 columns, 1 row (measures class creation overhead most clearly) ---

    @pytest.mark.benchmark(group="ntf_10cols_1row")
    def test_uncached_10cols_1row(self, benchmark):
        self._run_uncached(benchmark, 10, 1)

    @pytest.mark.benchmark(group="ntf_10cols_1row")
    def test_cached_10cols_1row(self, benchmark):
        self._run_cached(benchmark, 10, 1)

cassandra/query.py

Lines changed: 35 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,12 @@ def pseudo_namedtuple_factory(colnames, rows):
117117
for od in ordered_dict_factory(colnames, rows)]
118118

119119

120+
# Cache namedtuple Row classes to avoid repeated exec() calls in namedtuple()
121+
# for the same column schema. Naturally bounded by the number of distinct
122+
# column-name tuples, which is at most the number of distinct queries.
123+
_named_tuple_cache = {}
124+
125+
120126
def named_tuple_factory(colnames, rows):
121127
"""
122128
Returns each row as a `namedtuple <https://docs.python.org/2/library/collections.html#collections.namedtuple>`_.
@@ -146,32 +152,37 @@ def named_tuple_factory(colnames, rows):
146152
.. versionchanged:: 2.0.0
147153
moved from ``cassandra.decoder`` to ``cassandra.query``
148154
"""
149-
clean_column_names = map(_clean_column_name, colnames)
155+
key = tuple(colnames)
150156
try:
151-
Row = namedtuple('Row', clean_column_names)
152-
except SyntaxError:
153-
warnings.warn(
154-
"Failed creating namedtuple for a result because there were too "
155-
"many columns. This is due to a Python limitation that affects "
156-
"namedtuple in Python 3.0-3.6 (see issue18896). The row will be "
157-
"created with {substitute_factory_name}, which lacks some namedtuple "
158-
"features and is slower. To avoid slower performance accessing "
159-
"values on row objects, Upgrade to Python 3.7, or use a different "
160-
"row factory. (column names: {colnames})".format(
161-
substitute_factory_name=pseudo_namedtuple_factory.__name__,
162-
colnames=colnames
157+
Row = _named_tuple_cache[key]
158+
except KeyError:
159+
clean_column_names = map(_clean_column_name, colnames)
160+
try:
161+
Row = namedtuple('Row', clean_column_names)
162+
except SyntaxError:
163+
warnings.warn(
164+
"Failed creating namedtuple for a result because there were too "
165+
"many columns. This is due to a Python limitation that affects "
166+
"namedtuple in Python 3.0-3.6 (see issue18896). The row will be "
167+
"created with {substitute_factory_name}, which lacks some namedtuple "
168+
"features and is slower. To avoid slower performance accessing "
169+
"values on row objects, Upgrade to Python 3.7, or use a different "
170+
"row factory. (column names: {colnames})".format(
171+
substitute_factory_name=pseudo_namedtuple_factory.__name__,
172+
colnames=colnames
173+
)
163174
)
164-
)
165-
return pseudo_namedtuple_factory(colnames, rows)
166-
except Exception:
167-
clean_column_names = list(map(_clean_column_name, colnames)) # create list because py3 map object will be consumed by first attempt
168-
log.warning("Failed creating named tuple for results with column names %s (cleaned: %s) "
169-
"(see Python 'namedtuple' documentation for details on name rules). "
170-
"Results will be returned with positional names. "
171-
"Avoid this by choosing different names, using SELECT \"<col name>\" AS aliases, "
172-
"or specifying a different row_factory on your Session" %
173-
(colnames, clean_column_names))
174-
Row = namedtuple('Row', _sanitize_identifiers(clean_column_names))
175+
return pseudo_namedtuple_factory(colnames, rows)
176+
except Exception:
177+
clean_column_names = list(map(_clean_column_name, colnames)) # create list because py3 map object will be consumed by first attempt
178+
log.warning("Failed creating named tuple for results with column names %s (cleaned: %s) "
179+
"(see Python 'namedtuple' documentation for details on name rules). "
180+
"Results will be returned with positional names. "
181+
"Avoid this by choosing different names, using SELECT \"<col name>\" AS aliases, "
182+
"or specifying a different row_factory on your Session" %
183+
(colnames, clean_column_names))
184+
Row = namedtuple('Row', _sanitize_identifiers(clean_column_names))
185+
_named_tuple_cache[key] = Row
175186

176187
return [Row(*row) for row in rows]
177188

0 commit comments

Comments
 (0)