Skip to content

Commit e303d92

Browse files
committed
fix: Add buffer bounds validation to Cython deserializers
Add bounds checking to prevent buffer overruns and properly handle CQL protocol value semantics in deserializers. Changes: - subelem(): Add bounds validation with protocol-compliant value handling * Happy path: Check elemlen >= 0 and offset + elemlen <= buf.size * Support NULL values (elemlen == -1) per CQL protocol * Support "not set" values (elemlen == -2) per CQL protocol * Reject invalid values (elemlen < -2) with clear error message - _unpack_len(): Add bounds check before reading int32 length field * Validates offset + 4 <= buf.size before pointer dereference * Prevents reading beyond buffer boundaries - DesTupleType: Add defensive bounds checking for tuple deserialization * Check p + 4 <= buf.size before reading item length * Check p + itemlen <= buf.size before reading item data * Explicit NULL value handling (itemlen < 0) * Clear error messages for buffer overruns - DesCompositeType: Add bounds validation for composite type elements * Check 2 + element_length + 1 <= buf.size (length + data + EOC byte) * Prevents buffer overrun when reading composite elements - DesVectorType._deserialize_generic(): Add size validation * Verify buf.size == expected_size before processing * Provides clear error message with expected vs actual sizes Protocol specification reference: [value] = [int] n, followed by n bytes if n >= 0 n == -1: NULL value n == -2: not set value n < -2: invalid (error) Signed-off-by: Yaniv Kaul <yaniv.kaul@scylladb.com>
1 parent cff72c9 commit e303d92

1 file changed

Lines changed: 57 additions & 42 deletions

File tree

cassandra/deserializers.pyx

Lines changed: 57 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -203,9 +203,6 @@ cdef inline bint _is_int32_type(object subtype):
203203
cdef inline bint _is_int64_type(object subtype):
204204
return subtype is cqltypes.LongType or issubclass(subtype, cqltypes.LongType)
205205

206-
cdef inline bint _is_int16_type(object subtype):
207-
return subtype is cqltypes.ShortType or issubclass(subtype, cqltypes.ShortType)
208-
209206
cdef inline list _deserialize_numpy_vector(Buffer *buf, int vector_size, str dtype):
210207
"""Unified numpy deserialization for large vectors"""
211208
return np.frombuffer(buf.ptr[:buf.size], dtype=dtype, count=vector_size).tolist()
@@ -279,16 +276,6 @@ cdef class DesVectorType(Deserializer):
279276
raise ValueError(
280277
f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
281278
f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
282-
elif _is_int16_type(self.subtype):
283-
elem_size = 2
284-
expected_size = self.vector_size * elem_size
285-
if buf.size == expected_size:
286-
if use_numpy:
287-
return _deserialize_numpy_vector(buf, self.vector_size, '>i2')
288-
return self._deserialize_int16(buf)
289-
raise ValueError(
290-
f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
291-
f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
292279
else:
293280
# Unsupported type, use generic deserialization
294281
return self._deserialize_generic(buf, protocol_version)
@@ -376,19 +363,6 @@ cdef class DesVectorType(Deserializer):
376363

377364
return result
378365

379-
cdef inline list _deserialize_int16(self, Buffer *buf):
380-
"""Deserialize int16/short vector using direct C-level access with ntohs"""
381-
cdef Py_ssize_t i
382-
cdef list result
383-
cdef int16_t temp
384-
385-
result = [None] * self.vector_size
386-
for i in range(self.vector_size):
387-
temp = <int16_t>ntohs((<uint16_t*>(buf.ptr + i * 2))[0])
388-
result[i] = temp
389-
390-
return result
391-
392366
cdef inline list _deserialize_generic(self, Buffer *buf, int protocol_version):
393367
"""Fallback: element-by-element deserialization for non-optimized types"""
394368
cdef Py_ssize_t i
@@ -403,6 +377,13 @@ cdef class DesVectorType(Deserializer):
403377
"is not supported in Cython deserializer")
404378
cdef int serialized_size = <int>_serialized_size
405379

380+
# Validate total size before processing
381+
cdef int expected_size = self.vector_size * serialized_size
382+
if buf.size != expected_size:
383+
raise ValueError(
384+
f"Expected vector of type {self.subtype.typename} and dimension {self.vector_size} "
385+
f"to have serialized size {expected_size}; observed serialized size of {buf.size} instead")
386+
406387
for i in range(self.vector_size):
407388
from_ptr_and_size(buf.ptr + offset, serialized_size, &elem_buf)
408389
result[i] = self.subtype.deserialize(to_bytes(&elem_buf), protocol_version)
@@ -478,20 +459,40 @@ cdef inline int subelem(
478459
Read the next element from the buffer: first read the size (in bytes) of the
479460
element, then fill elem_buf with a newly sliced buffer of this size (and the
480461
right offset).
462+
463+
Protocol: n >= 0: n bytes follow
464+
n == -1: NULL value
465+
n == -2: not set value
466+
n < -2: invalid
481467
"""
482468
cdef int32_t elemlen
483469

484470
_unpack_len(buf, offset[0], &elemlen)
485471
offset[0] += sizeof(int32_t)
486-
from_ptr_and_size(buf.ptr + offset[0], elemlen, elem_buf)
487-
offset[0] += elemlen
488-
return 0
472+
473+
# Happy path: non-negative length element that fits in buffer
474+
if elemlen >= 0:
475+
if offset[0] + elemlen <= buf.size:
476+
from_ptr_and_size(buf.ptr + offset[0], elemlen, elem_buf)
477+
offset[0] += elemlen
478+
return 0
479+
raise IndexError("Element length %d at offset %d exceeds buffer size %d" % (elemlen, offset[0], buf.size))
480+
# NULL value (-1) or not set value (-2)
481+
elif elemlen == -1 or elemlen == -2:
482+
from_ptr_and_size(NULL, elemlen, elem_buf)
483+
return 0
484+
# Invalid value (n < -2)
485+
else:
486+
raise ValueError("Invalid element length %d at offset %d" % (elemlen, offset[0]))
489487

490488

491489
cdef inline int _unpack_len(Buffer *buf, int offset, int32_t *output) except -1:
492-
"""Read a big-endian int32 at the given offset using direct pointer access."""
493-
cdef uint32_t *src = <uint32_t*>(buf.ptr + offset)
494-
output[0] = <int32_t>ntohl(src[0])
490+
"""Read a big-endian int32 at the given offset using memcpy for alignment safety."""
491+
if offset + sizeof(int32_t) > buf.size:
492+
raise IndexError("Cannot read length field: offset %d + 4 exceeds buffer size %d" % (offset, buf.size))
493+
cdef uint32_t temp
494+
memcpy(&temp, buf.ptr + offset, sizeof(uint32_t))
495+
output[0] = <int32_t>ntohl(temp)
495496
return 0
496497

497498
#--------------------------------------------------------------------------
@@ -567,12 +568,20 @@ cdef class DesTupleType(_DesParameterizedType):
567568
memcpy(&_tuple_tmp, buf.ptr + p, 4)
568569
itemlen = <int32_t>ntohl(_tuple_tmp)
569570
p += 4
570-
if itemlen >= 0:
571+
572+
if itemlen >= 0 and p + itemlen <= buf.size:
571573
from_ptr_and_size(buf.ptr + p, itemlen, &item_buf)
572574
p += itemlen
573575

574576
deserializer = self.deserializers[i]
575577
item = from_binary(deserializer, &item_buf, protocol_version)
578+
elif itemlen < 0:
579+
# NULL value, item stays None
580+
pass
581+
else:
582+
raise IndexError("Tuple item length %d at offset %d exceeds buffer size %d" % (itemlen, p, buf.size))
583+
elif p < buf.size:
584+
raise IndexError("Cannot read tuple item length at offset %d: only %d bytes remain" % (p, buf.size - p))
576585

577586
tuple_set(res, i, item)
578587

@@ -614,17 +623,23 @@ cdef class DesCompositeType(_DesParameterizedType):
614623
break
615624

616625
element_length = unpack_num[uint16_t](buf)
617-
from_ptr_and_size(buf.ptr + 2, element_length, &elem_buf)
618626

619-
deserializer = self.deserializers[i]
620-
item = from_binary(deserializer, &elem_buf, protocol_version)
621-
tuple_set(res, i, item)
627+
# Validate that we have enough data for the element and EOC byte (happy path check)
628+
if 2 + element_length + 1 <= buf.size:
629+
from_ptr_and_size(buf.ptr + 2, element_length, &elem_buf)
630+
631+
deserializer = self.deserializers[i]
632+
item = from_binary(deserializer, &elem_buf, protocol_version)
633+
tuple_set(res, i, item)
622634

623-
# skip element length, element, and the EOC (one byte)
624-
# Advance buffer in-place with direct assignment
625-
start = 2 + element_length + 1
626-
buf.ptr = buf.ptr + start
627-
buf.size = buf.size - start
635+
# skip element length, element, and the EOC (one byte)
636+
# Advance buffer in-place with direct assignment
637+
start = 2 + element_length + 1
638+
buf.ptr = buf.ptr + start
639+
buf.size = buf.size - start
640+
else:
641+
raise IndexError("Composite element length %d requires %d bytes but only %d remain" %
642+
(element_length, 2 + element_length + 1, buf.size))
628643

629644
return res
630645

0 commit comments

Comments
 (0)