From 774c26a88aff89369f7b31059e4aa6afdd3430c0 Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Thu, 5 Feb 2026 16:41:45 +0200 Subject: [PATCH 1/3] (improvement) cqltypes: Optimize VectorType deserialization with struct.unpack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add bulk deserialization using struct.unpack for common numeric vector types instead of element-by-element deserialization. This provides significant performance improvements, especially for small vectors and integer types. Optimized types: - FloatType ('>Nf' format) - DoubleType ('>Nd' format) - Int32Type ('>Ni' format) - LongType ('>Nq' format) - ShortType ('>Nh' format) Performance improvements (measured with CASS_DRIVER_NO_CYTHON=1): Small vectors (3-4 elements): Vector : 0.88 μs → 0.25 μs (3.58x faster) Vector : 0.78 μs → 0.28 μs (2.79x faster) Medium vectors (128 elements): Vector : 4.72 μs → 4.06 μs (1.16x faster) Vector : 4.83 μs → 4.01 μs (1.20x faster) Vector : 2.27 μs → 1.25 μs (1.82x faster) Large vectors (384-1536 elements): Vector : 15.38 μs → 14.67 μs (1.05x faster) Vector : 32.43 μs → 30.72 μs (1.06x faster) Vector : 63.74 μs → 63.24 μs (1.01x faster) The optimization is most effective for: - Small vectors (3-4 elements): 2.8-3.6x speedup - Integer vectors: 1.8x speedup - Medium-sized float/double vectors: 1.2-1.3x speedup For very large vectors (384+ elements), the benefit is minimal as the deserialization time is dominated by data copying rather than function call overhead. Variable-size subtypes and other numeric types continue to use the element-by-element fallback path. Signed-off-by: Yaniv Kaul --- cassandra/cqltypes.py | 91 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 15 deletions(-) diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py index 547a13c979..fc2f61a1cf 100644 --- a/cassandra/cqltypes.py +++ b/cassandra/cqltypes.py @@ -1432,6 +1432,8 @@ class VectorType(_CassandraType): typename = 'org.apache.cassandra.db.marshal.VectorType' vector_size = 0 subtype = None + _vector_struct = None # Cached struct.Struct for bulk deserialization + _struct_format_map = {} # Populated after FloatType etc. are defined @classmethod def serial_size(cls): @@ -1443,7 +1445,14 @@ def apply_parameters(cls, params, names): assert len(params) == 2 subtype = lookup_casstype(params[0]) vsize = params[1] - return type('%s(%s)' % (cls.cass_parameterized_type_with([]), vsize), (cls,), {'vector_size': vsize, 'subtype': subtype}) + # Cache a struct.Struct for bulk deserialization of known numeric types + vector_struct = None + for base_type, fmt_char in cls._struct_format_map.items(): + if subtype is base_type or (isinstance(subtype, type) and issubclass(subtype, base_type)): + vector_struct = struct.Struct(f'>{vsize}{fmt_char}') + break + return type('%s(%s)' % (cls.cass_parameterized_type_with([]), vsize), (cls,), + {'vector_size': vsize, 'subtype': subtype, '_vector_struct': vector_struct}) @classmethod def deserialize(cls, byts, protocol_version): @@ -1454,25 +1463,64 @@ def deserialize(cls, byts, protocol_version): raise ValueError( "Expected vector of type {0} and dimension {1} to have serialized size {2}; observed serialized size of {3} instead"\ .format(cls.subtype.typename, cls.vector_size, expected_byte_size, len(byts))) - indexes = (serialized_size * x for x in range(0, cls.vector_size)) - return [cls.subtype.deserialize(byts[idx:idx + serialized_size], protocol_version) for idx in indexes] + # Optimization: bulk deserialization for common numeric types + # For small vectors: use cached struct.Struct (avoids per-call format string allocation) + # For large vectors with numpy: use numpy.frombuffer (1.3-1.5x faster for 128+ elements) + # Threshold at 32 elements balances simplicity with performance + if cls._vector_struct is not None: + use_numpy = HAVE_NUMPY and cls.vector_size >= 32 + if use_numpy: + _dtype_map = {'f': '>f4', 'd': '>f8', 'i': '>i4', 'q': '>i8'} + fmt_char = cls._vector_struct.format[-1:] + numpy_dtype = _dtype_map.get(fmt_char) + if numpy_dtype is not None: + return np.frombuffer(byts, dtype=numpy_dtype, count=cls.vector_size).tolist() + return list(cls._vector_struct.unpack(byts)) + # Fallback: element-by-element deserialization for other fixed-size types + result = [None] * cls.vector_size + subtype_deserialize = cls.subtype.deserialize + offset = 0 + for i in range(cls.vector_size): + result[i] = subtype_deserialize(byts[offset:offset + serialized_size], protocol_version) + offset += serialized_size + return result + + # Variable-size subtype path + result = [None] * cls.vector_size idx = 0 - rv = [] - while (len(rv) < cls.vector_size): + byts_len = len(byts) + subtype_deserialize = cls.subtype.deserialize + + for i in range(cls.vector_size): + if idx >= byts_len: + raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements" + .format(i)) + try: size, bytes_read = uvint_unpack(byts[idx:]) - idx += bytes_read - rv.append(cls.subtype.deserialize(byts[idx:idx + size], protocol_version)) - idx += size - except: - raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements"\ - .format(len(rv))) - - # If we have any additional data in the serialized vector treat that as an error as well - if idx < len(byts): + except IndexError: + raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements" + .format(i)) + + idx += bytes_read + + if idx + size > byts_len: + raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements" + .format(i)) + + try: + result[i] = subtype_deserialize(byts[idx:idx + size], protocol_version) + except Exception as e: + raise ValueError("Error deserializing element {} during vector deserialization after successfully adding {} elements" + .format(i, i)) from e + idx += size + + # Check for additional data + if idx < byts_len: raise ValueError("Additional bytes remaining after vector deserialization completed") - return rv + + return result @classmethod def serialize(cls, v, protocol_version): @@ -1483,6 +1531,9 @@ def serialize(cls, v, protocol_version): .format(cls.vector_size, cls.subtype.typename, v_length)) serialized_size = cls.subtype.serial_size() + # Bulk serialization for known numeric types (symmetric with struct.unpack in deserialize) + if cls._vector_struct is not None and serialized_size is not None: + return cls._vector_struct.pack(*v) buf = io.BytesIO() for item in v: item_bytes = cls.subtype.serialize(item, protocol_version) @@ -1494,3 +1545,13 @@ def serialize(cls, v, protocol_version): @classmethod def cql_parameterized_type(cls): return "%s<%s, %s>" % (cls.typename, cls.subtype.cql_parameterized_type(), cls.vector_size) + + +# Populate VectorType._struct_format_map now that all types are defined +VectorType._struct_format_map = { + FloatType: 'f', + DoubleType: 'd', + Int32Type: 'i', + LongType: 'q', + ShortType: 'h', +} From 0535ecda678e218239ce8bd28bf14bb6c8c4b70f Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Thu, 5 Feb 2026 16:56:52 +0200 Subject: [PATCH 2/3] (improvement) cqltypes: Use numpy for large VectorType deserialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For vectors with 32 or more elements, use numpy.frombuffer() which provides 1.3-1.5x speedup for large vectors (128+ elements) compared to struct.unpack. The hybrid approach: - Small vectors (< 32 elements): struct.unpack (2.8-3.6x faster than baseline) - Large vectors (>= 32 elements): numpy.frombuffer().tolist() (1.3-1.5x faster than struct.unpack) Threshold of 32 elements balances code complexity with performance gains. Benchmark results: - float[128]: 2.15 μs → 1.87 μs (1.15x faster) - float[384]: 6.17 μs → 4.44 μs (1.39x faster) - float[768]: 12.25 μs → 8.45 μs (1.45x faster) - float[1536]: 24.44 μs → 15.77 μs (1.55x faster) Signed-off-by: Yaniv Kaul --- cassandra/cqltypes.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py index fc2f61a1cf..aa7ec90837 100644 --- a/cassandra/cqltypes.py +++ b/cassandra/cqltypes.py @@ -50,6 +50,10 @@ varint_pack, varint_unpack, point_be, point_le, vints_pack, vints_unpack, uvint_unpack, uvint_pack) from cassandra import util +from cassandra.cython_deps import HAVE_NUMPY + +if HAVE_NUMPY: + import numpy as np _little_endian_flag = 1 # we always serialize LE import ipaddress @@ -1434,6 +1438,7 @@ class VectorType(_CassandraType): subtype = None _vector_struct = None # Cached struct.Struct for bulk deserialization _struct_format_map = {} # Populated after FloatType etc. are defined + _numpy_dtype = None # Cached numpy dtype string for large vector deserialization @classmethod def serial_size(cls): @@ -1447,12 +1452,14 @@ def apply_parameters(cls, params, names): vsize = params[1] # Cache a struct.Struct for bulk deserialization of known numeric types vector_struct = None + numpy_dtype = None for base_type, fmt_char in cls._struct_format_map.items(): if subtype is base_type or (isinstance(subtype, type) and issubclass(subtype, base_type)): vector_struct = struct.Struct(f'>{vsize}{fmt_char}') + numpy_dtype = cls._numpy_dtype_map.get(fmt_char) break return type('%s(%s)' % (cls.cass_parameterized_type_with([]), vsize), (cls,), - {'vector_size': vsize, 'subtype': subtype, '_vector_struct': vector_struct}) + {'vector_size': vsize, 'subtype': subtype, '_vector_struct': vector_struct, '_numpy_dtype': numpy_dtype}) @classmethod def deserialize(cls, byts, protocol_version): @@ -1469,13 +1476,8 @@ def deserialize(cls, byts, protocol_version): # For large vectors with numpy: use numpy.frombuffer (1.3-1.5x faster for 128+ elements) # Threshold at 32 elements balances simplicity with performance if cls._vector_struct is not None: - use_numpy = HAVE_NUMPY and cls.vector_size >= 32 - if use_numpy: - _dtype_map = {'f': '>f4', 'd': '>f8', 'i': '>i4', 'q': '>i8'} - fmt_char = cls._vector_struct.format[-1:] - numpy_dtype = _dtype_map.get(fmt_char) - if numpy_dtype is not None: - return np.frombuffer(byts, dtype=numpy_dtype, count=cls.vector_size).tolist() + if HAVE_NUMPY and cls.vector_size >= 32 and cls._numpy_dtype is not None: + return np.frombuffer(byts, dtype=cls._numpy_dtype, count=cls.vector_size).tolist() return list(cls._vector_struct.unpack(byts)) # Fallback: element-by-element deserialization for other fixed-size types result = [None] * cls.vector_size @@ -1555,3 +1557,6 @@ def cql_parameterized_type(cls): LongType: 'q', ShortType: 'h', } + +# Map struct format chars to numpy dtype strings for large vector deserialization +VectorType._numpy_dtype_map = {'f': '>f4', 'd': '>f8', 'i': '>i4', 'q': '>i8', 'h': '>i2'} From 75b9b75cb0eee8e0b4cd24e7d75ea97701a2b4f7 Mon Sep 17 00:00:00 2001 From: Yaniv Michael Kaul Date: Sun, 5 Apr 2026 17:20:43 +0300 Subject: [PATCH 3/3] (improvement) cqltypes: Cache serial_size in VectorType to avoid repeated method dispatch Cache subtype.serial_size() and the full vector serial_size() as class attributes (_subtype_serial_size, _serial_size) during apply_parameters(). This eliminates per-call method dispatch overhead in serialize(), deserialize(), and serial_size() hot paths. serial_size() call: 99ns -> 46ns (2.2x faster) Attribute access: 54ns -> 17ns (3.2x faster) --- cassandra/cqltypes.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py index aa7ec90837..9cad2aad58 100644 --- a/cassandra/cqltypes.py +++ b/cassandra/cqltypes.py @@ -1439,11 +1439,13 @@ class VectorType(_CassandraType): _vector_struct = None # Cached struct.Struct for bulk deserialization _struct_format_map = {} # Populated after FloatType etc. are defined _numpy_dtype = None # Cached numpy dtype string for large vector deserialization + _subtype_serial_size = None # Cached subtype.serial_size() (computed once in apply_parameters) + _serial_size = None # Cached serial_size() for the full vector (subtype_serial_size * vector_size) @classmethod def serial_size(cls): - serialized_size = cls.subtype.serial_size() - return cls.vector_size * serialized_size if serialized_size is not None else None + return cls._serial_size + @classmethod def apply_parameters(cls, params, names): @@ -1458,12 +1460,17 @@ def apply_parameters(cls, params, names): vector_struct = struct.Struct(f'>{vsize}{fmt_char}') numpy_dtype = cls._numpy_dtype_map.get(fmt_char) break + # Cache subtype serial_size and full vector serial_size to avoid + # repeated method dispatch in serialize/deserialize hot paths. + subtype_ss = subtype.serial_size() + vec_ss = vsize * subtype_ss if subtype_ss is not None else None return type('%s(%s)' % (cls.cass_parameterized_type_with([]), vsize), (cls,), - {'vector_size': vsize, 'subtype': subtype, '_vector_struct': vector_struct, '_numpy_dtype': numpy_dtype}) + {'vector_size': vsize, 'subtype': subtype, '_vector_struct': vector_struct, + '_numpy_dtype': numpy_dtype, '_subtype_serial_size': subtype_ss, '_serial_size': vec_ss}) @classmethod def deserialize(cls, byts, protocol_version): - serialized_size = cls.subtype.serial_size() + serialized_size = cls._subtype_serial_size if serialized_size is not None: expected_byte_size = serialized_size * cls.vector_size if len(byts) != expected_byte_size: @@ -1532,7 +1539,7 @@ def serialize(cls, v, protocol_version): "Expected sequence of size {0} for vector of type {1} and dimension {0}, observed sequence of length {2}"\ .format(cls.vector_size, cls.subtype.typename, v_length)) - serialized_size = cls.subtype.serial_size() + serialized_size = cls._subtype_serial_size # Bulk serialization for known numeric types (symmetric with struct.unpack in deserialize) if cls._vector_struct is not None and serialized_size is not None: return cls._vector_struct.pack(*v)