From 774c26a88aff89369f7b31059e4aa6afdd3430c0 Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Thu, 5 Feb 2026 16:41:45 +0200
Subject: [PATCH 1/3] (improvement) cqltypes: Optimize VectorType
 deserialization with struct.unpack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add bulk deserialization using struct.unpack for common numeric vector types
instead of element-by-element deserialization. This provides significant
performance improvements, especially for small vectors and integer types.

Optimized types:
- FloatType  ('>Nf' format)
- DoubleType ('>Nd' format)
- Int32Type  ('>Ni' format)
- LongType   ('>Nq' format)
- ShortType  ('>Nh' format)

Performance improvements (measured with CASS_DRIVER_NO_CYTHON=1):

Small vectors (3-4 elements):
  Vector<float, 3>  : 0.88 μs → 0.25 μs  (3.58x faster)
  Vector<float, 4>  : 0.78 μs → 0.28 μs  (2.79x faster)

Medium vectors (128 elements):
  Vector<float, 128>  : 4.72 μs → 4.06 μs  (1.16x faster)
  Vector<double, 128> : 4.83 μs → 4.01 μs  (1.20x faster)
  Vector<int, 128>    : 2.27 μs → 1.25 μs  (1.82x faster)

Large vectors (384-1536 elements):
  Vector<float, 384>  : 15.38 μs → 14.67 μs  (1.05x faster)
  Vector<float, 768>  : 32.43 μs → 30.72 μs  (1.06x faster)
  Vector<float, 1536> : 63.74 μs → 63.24 μs  (1.01x faster)

The optimization is most effective for:
- Small vectors (3-4 elements): 2.8-3.6x speedup
- Integer vectors: 1.8x speedup
- Medium-sized float/double vectors: 1.2-1.3x speedup

For very large vectors (384+ elements), the benefit is minimal as the
deserialization time is dominated by data copying rather than function
call overhead.

Variable-size subtypes and other numeric types continue to use the
element-by-element fallback path.

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 cassandra/cqltypes.py | 91 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 76 insertions(+), 15 deletions(-)

diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py
index 547a13c979..fc2f61a1cf 100644
--- a/cassandra/cqltypes.py
+++ b/cassandra/cqltypes.py
@@ -1432,6 +1432,8 @@ class VectorType(_CassandraType):
     typename = 'org.apache.cassandra.db.marshal.VectorType'
     vector_size = 0
     subtype = None
+    _vector_struct = None  # Cached struct.Struct for bulk deserialization
+    _struct_format_map = {}  # Populated after FloatType etc. are defined
 
     @classmethod
     def serial_size(cls):
@@ -1443,7 +1445,14 @@ def apply_parameters(cls, params, names):
         assert len(params) == 2
         subtype = lookup_casstype(params[0])
         vsize = params[1]
-        return type('%s(%s)' % (cls.cass_parameterized_type_with([]), vsize), (cls,), {'vector_size': vsize, 'subtype': subtype})
+        # Cache a struct.Struct for bulk deserialization of known numeric types
+        vector_struct = None
+        for base_type, fmt_char in cls._struct_format_map.items():
+            if subtype is base_type or (isinstance(subtype, type) and issubclass(subtype, base_type)):
+                vector_struct = struct.Struct(f'>{vsize}{fmt_char}')
+                break
+        return type('%s(%s)' % (cls.cass_parameterized_type_with([]), vsize), (cls,),
+                     {'vector_size': vsize, 'subtype': subtype, '_vector_struct': vector_struct})
 
     @classmethod
     def deserialize(cls, byts, protocol_version):
@@ -1454,25 +1463,64 @@ def deserialize(cls, byts, protocol_version):
                 raise ValueError(
                     "Expected vector of type {0} and dimension {1} to have serialized size {2}; observed serialized size of {3} instead"\
                     .format(cls.subtype.typename, cls.vector_size, expected_byte_size, len(byts)))
-            indexes = (serialized_size * x for x in range(0, cls.vector_size))
-            return [cls.subtype.deserialize(byts[idx:idx + serialized_size], protocol_version) for idx in indexes]
 
+            # Optimization: bulk deserialization for common numeric types
+            # For small vectors: use cached struct.Struct (avoids per-call format string allocation)
+            # For large vectors with numpy: use numpy.frombuffer (1.3-1.5x faster for 128+ elements)
+            # Threshold at 32 elements balances simplicity with performance
+            if cls._vector_struct is not None:
+                use_numpy = HAVE_NUMPY and cls.vector_size >= 32
+                if use_numpy:
+                    _dtype_map = {'f': '>f4', 'd': '>f8', 'i': '>i4', 'q': '>i8'}
+                    fmt_char = cls._vector_struct.format[-1:]
+                    numpy_dtype = _dtype_map.get(fmt_char)
+                    if numpy_dtype is not None:
+                        return np.frombuffer(byts, dtype=numpy_dtype, count=cls.vector_size).tolist()
+                return list(cls._vector_struct.unpack(byts))
+            # Fallback: element-by-element deserialization for other fixed-size types
+            result = [None] * cls.vector_size
+            subtype_deserialize = cls.subtype.deserialize
+            offset = 0
+            for i in range(cls.vector_size):
+                result[i] = subtype_deserialize(byts[offset:offset + serialized_size], protocol_version)
+                offset += serialized_size
+            return result
+
+        # Variable-size subtype path
+        result = [None] * cls.vector_size
         idx = 0
-        rv = []
-        while (len(rv) < cls.vector_size):
+        byts_len = len(byts)
+        subtype_deserialize = cls.subtype.deserialize
+
+        for i in range(cls.vector_size):
+            if idx >= byts_len:
+                raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements"
+                    .format(i))
+
             try:
                 size, bytes_read = uvint_unpack(byts[idx:])
-                idx += bytes_read
-                rv.append(cls.subtype.deserialize(byts[idx:idx + size], protocol_version))
-                idx += size
-            except:
-                raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements"\
-                .format(len(rv)))
-
-        # If we have any additional data in the serialized vector treat that as an error as well
-        if idx < len(byts):
+            except IndexError:
+                raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements"
+                    .format(i))
+
+            idx += bytes_read
+
+            if idx + size > byts_len:
+                raise ValueError("Error reading additional data during vector deserialization after successfully adding {} elements"
+                    .format(i))
+
+            try:
+                result[i] = subtype_deserialize(byts[idx:idx + size], protocol_version)
+            except Exception as e:
+                raise ValueError("Error deserializing element {} during vector deserialization after successfully adding {} elements"
+                    .format(i, i)) from e
+            idx += size
+
+        # Check for additional data
+        if idx < byts_len:
             raise ValueError("Additional bytes remaining after vector deserialization completed")
-        return rv
+
+        return result
 
     @classmethod
     def serialize(cls, v, protocol_version):
@@ -1483,6 +1531,9 @@ def serialize(cls, v, protocol_version):
                 .format(cls.vector_size, cls.subtype.typename, v_length))
 
         serialized_size = cls.subtype.serial_size()
+        # Bulk serialization for known numeric types (symmetric with struct.unpack in deserialize)
+        if cls._vector_struct is not None and serialized_size is not None:
+            return cls._vector_struct.pack(*v)
         buf = io.BytesIO()
         for item in v:
             item_bytes = cls.subtype.serialize(item, protocol_version)
@@ -1494,3 +1545,13 @@ def serialize(cls, v, protocol_version):
     @classmethod
     def cql_parameterized_type(cls):
         return "%s<%s, %s>" % (cls.typename, cls.subtype.cql_parameterized_type(), cls.vector_size)
+
+
+# Populate VectorType._struct_format_map now that all types are defined
+VectorType._struct_format_map = {
+    FloatType: 'f',
+    DoubleType: 'd',
+    Int32Type: 'i',
+    LongType: 'q',
+    ShortType: 'h',
+}

From 0535ecda678e218239ce8bd28bf14bb6c8c4b70f Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Thu, 5 Feb 2026 16:56:52 +0200
Subject: [PATCH 2/3] (improvement) cqltypes: Use numpy for large VectorType
 deserialization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For vectors with 32 or more elements, use numpy.frombuffer() which provides
1.3-1.5x speedup for large vectors (128+ elements) compared to struct.unpack.

The hybrid approach:
- Small vectors (< 32 elements): struct.unpack (2.8-3.6x faster than baseline)
- Large vectors (>= 32 elements): numpy.frombuffer().tolist() (1.3-1.5x faster than struct.unpack)

Threshold of 32 elements balances code complexity with performance gains.

Benchmark results:
- float[128]:  2.15 μs → 1.87 μs (1.15x faster)
- float[384]:  6.17 μs → 4.44 μs (1.39x faster)
- float[768]: 12.25 μs → 8.45 μs (1.45x faster)
- float[1536]: 24.44 μs → 15.77 μs (1.55x faster)

Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
---
 cassandra/cqltypes.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py
index fc2f61a1cf..aa7ec90837 100644
--- a/cassandra/cqltypes.py
+++ b/cassandra/cqltypes.py
@@ -50,6 +50,10 @@
                                varint_pack, varint_unpack, point_be, point_le,
                                vints_pack, vints_unpack, uvint_unpack, uvint_pack)
 from cassandra import util
+from cassandra.cython_deps import HAVE_NUMPY
+
+if HAVE_NUMPY:
+    import numpy as np
 
 _little_endian_flag = 1  # we always serialize LE
 import ipaddress
@@ -1434,6 +1438,7 @@ class VectorType(_CassandraType):
     subtype = None
     _vector_struct = None  # Cached struct.Struct for bulk deserialization
     _struct_format_map = {}  # Populated after FloatType etc. are defined
+    _numpy_dtype = None  # Cached numpy dtype string for large vector deserialization
 
     @classmethod
     def serial_size(cls):
@@ -1447,12 +1452,14 @@ def apply_parameters(cls, params, names):
         vsize = params[1]
         # Cache a struct.Struct for bulk deserialization of known numeric types
         vector_struct = None
+        numpy_dtype = None
         for base_type, fmt_char in cls._struct_format_map.items():
             if subtype is base_type or (isinstance(subtype, type) and issubclass(subtype, base_type)):
                 vector_struct = struct.Struct(f'>{vsize}{fmt_char}')
+                numpy_dtype = cls._numpy_dtype_map.get(fmt_char)
                 break
         return type('%s(%s)' % (cls.cass_parameterized_type_with([]), vsize), (cls,),
-                     {'vector_size': vsize, 'subtype': subtype, '_vector_struct': vector_struct})
+                     {'vector_size': vsize, 'subtype': subtype, '_vector_struct': vector_struct, '_numpy_dtype': numpy_dtype})
 
     @classmethod
     def deserialize(cls, byts, protocol_version):
@@ -1469,13 +1476,8 @@ def deserialize(cls, byts, protocol_version):
             # For large vectors with numpy: use numpy.frombuffer (1.3-1.5x faster for 128+ elements)
             # Threshold at 32 elements balances simplicity with performance
             if cls._vector_struct is not None:
-                use_numpy = HAVE_NUMPY and cls.vector_size >= 32
-                if use_numpy:
-                    _dtype_map = {'f': '>f4', 'd': '>f8', 'i': '>i4', 'q': '>i8'}
-                    fmt_char = cls._vector_struct.format[-1:]
-                    numpy_dtype = _dtype_map.get(fmt_char)
-                    if numpy_dtype is not None:
-                        return np.frombuffer(byts, dtype=numpy_dtype, count=cls.vector_size).tolist()
+                if HAVE_NUMPY and cls.vector_size >= 32 and cls._numpy_dtype is not None:
+                    return np.frombuffer(byts, dtype=cls._numpy_dtype, count=cls.vector_size).tolist()
                 return list(cls._vector_struct.unpack(byts))
             # Fallback: element-by-element deserialization for other fixed-size types
             result = [None] * cls.vector_size
@@ -1555,3 +1557,6 @@ def cql_parameterized_type(cls):
     LongType: 'q',
     ShortType: 'h',
 }
+
+# Map struct format chars to numpy dtype strings for large vector deserialization
+VectorType._numpy_dtype_map = {'f': '>f4', 'd': '>f8', 'i': '>i4', 'q': '>i8', 'h': '>i2'}

From 75b9b75cb0eee8e0b4cd24e7d75ea97701a2b4f7 Mon Sep 17 00:00:00 2001
From: Yaniv Michael Kaul <yaniv.kaul@scylladb.com>
Date: Sun, 5 Apr 2026 17:20:43 +0300
Subject: [PATCH 3/3] (improvement) cqltypes: Cache serial_size in VectorType
 to avoid repeated method dispatch

Cache subtype.serial_size() and the full vector serial_size() as class
attributes (_subtype_serial_size, _serial_size) during apply_parameters().
This eliminates per-call method dispatch overhead in serialize(),
deserialize(), and serial_size() hot paths.

serial_size() call: 99ns -> 46ns (2.2x faster)
Attribute access: 54ns -> 17ns (3.2x faster)
---
 cassandra/cqltypes.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/cassandra/cqltypes.py b/cassandra/cqltypes.py
index aa7ec90837..9cad2aad58 100644
--- a/cassandra/cqltypes.py
+++ b/cassandra/cqltypes.py
@@ -1439,11 +1439,13 @@ class VectorType(_CassandraType):
     _vector_struct = None  # Cached struct.Struct for bulk deserialization
     _struct_format_map = {}  # Populated after FloatType etc. are defined
     _numpy_dtype = None  # Cached numpy dtype string for large vector deserialization
+    _subtype_serial_size = None  # Cached subtype.serial_size() (computed once in apply_parameters)
+    _serial_size = None  # Cached serial_size() for the full vector (subtype_serial_size * vector_size)
 
     @classmethod
     def serial_size(cls):
-        serialized_size = cls.subtype.serial_size()
-        return cls.vector_size * serialized_size if serialized_size is not None else None
+        return cls._serial_size
+
 
     @classmethod
     def apply_parameters(cls, params, names):
@@ -1458,12 +1460,17 @@ def apply_parameters(cls, params, names):
                 vector_struct = struct.Struct(f'>{vsize}{fmt_char}')
                 numpy_dtype = cls._numpy_dtype_map.get(fmt_char)
                 break
+        # Cache subtype serial_size and full vector serial_size to avoid
+        # repeated method dispatch in serialize/deserialize hot paths.
+        subtype_ss = subtype.serial_size()
+        vec_ss = vsize * subtype_ss if subtype_ss is not None else None
         return type('%s(%s)' % (cls.cass_parameterized_type_with([]), vsize), (cls,),
-                     {'vector_size': vsize, 'subtype': subtype, '_vector_struct': vector_struct, '_numpy_dtype': numpy_dtype})
+                     {'vector_size': vsize, 'subtype': subtype, '_vector_struct': vector_struct,
+                      '_numpy_dtype': numpy_dtype, '_subtype_serial_size': subtype_ss, '_serial_size': vec_ss})
 
     @classmethod
     def deserialize(cls, byts, protocol_version):
-        serialized_size = cls.subtype.serial_size()
+        serialized_size = cls._subtype_serial_size
         if serialized_size is not None:
             expected_byte_size = serialized_size * cls.vector_size
             if len(byts) != expected_byte_size:
@@ -1532,7 +1539,7 @@ def serialize(cls, v, protocol_version):
                 "Expected sequence of size {0} for vector of type {1} and dimension {0}, observed sequence of length {2}"\
                 .format(cls.vector_size, cls.subtype.typename, v_length))
 
-        serialized_size = cls.subtype.serial_size()
+        serialized_size = cls._subtype_serial_size
         # Bulk serialization for known numeric types (symmetric with struct.unpack in deserialize)
         if cls._vector_struct is not None and serialized_size is not None:
             return cls._vector_struct.pack(*v)