Skip to content

Commit f1ef71a

Browse files
alambadamreeve
andauthored
[arrow-array] use usize arithmetic in FixedSizeBinaryArray, aggressive overflow checks (#9910)
# Which issue does this PR close? - Closes #9906. # Rationale for this change `FixedSizeBinaryArray` still stores its public fixed width as `i32`, which means internal address calculations rely on repeated conversions between `i32` and pointer-sized offsets. We recently had issue with some i32 based arithmetic overflowing (see #9898) To avoid inadvertently using i32 arithmetic, this PR proposes to change the internal representation of the FixedSizeBinaryArray to use `usize` and compute byte positions using `usize` ( pointer-sized arithmetic) directly, with checked conversions only at the public API boundaries that still require `i32`. I am quite pleased it is a net reduction in lines of code (admittedly most of that was the checks added in #9872 # What changes are included in this PR? - Store the fixed-width element size as `value_size: usize` inside `FixedSizeBinaryArray`. - Rewrite internal position calculations in accessors and slicing to use `usize` arithmetic. - Remove the old `validate_lengths` invariant that existed only to keep internal `i32` offset arithmetic in range. - Remove implicit `as` casts from the implementation and replace them with checked conversions or typed bindings. # Are these changes tested? These changes are covered by CI. # Are there any user-facing changes? No. --------- Co-authored-by: Adam Reeve <adreeve@gmail.com> Co-authored-by: Adam Reeve <adam.reeve@gr-oss.io>
1 parent 4459985 commit f1ef71a

4 files changed

Lines changed: 102 additions & 131 deletions

File tree

arrow-array/src/array/fixed_size_binary_array.rs

Lines changed: 57 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,23 @@ use std::sync::Arc;
8888
///
8989
#[derive(Clone)]
9090
pub struct FixedSizeBinaryArray {
91-
data_type: DataType, // Must be DataType::FixedSizeBinary(value_length)
91+
/// Must be DataType::FixedSizeBinary(value_size)
92+
data_type: DataType,
93+
/// `len` values, each `value_size` bytes
9294
value_data: Buffer,
95+
/// Optional Null Buffer
9396
nulls: Option<NullBuffer>,
97+
/// Number of elements in the array
9498
len: usize,
95-
value_length: i32,
99+
/// size of each element, validated to fit in a positive i32
100+
///
101+
/// Corresponds to the [`byteWidth` field] in the Arrow Spec
102+
///
103+
/// note: Arrow stores `value_len` using i32. This implementation stores it
104+
/// as a usize to ensure correct offset calculations.
105+
///
106+
/// [`byteWidth` field]: https://github.com/apache/arrow/blob/2a89d03bbefd620b42126b8e00f8ae57e99cd638/format/Schema.fbs#L211
107+
value_size: usize,
96108
}
97109

98110
impl FixedSizeBinaryArray {
@@ -124,7 +136,6 @@ impl FixedSizeBinaryArray {
124136
/// * `value_length < 0`
125137
/// * `values.len() / value_length != nulls.len()`
126138
/// * `value_length == 0 && values.len() != 0`
127-
/// * `len * value_length > i32::MAX`
128139
pub fn try_new(
129140
value_length: i32,
130141
values: Buffer,
@@ -163,44 +174,15 @@ impl FixedSizeBinaryArray {
163174
}
164175
};
165176

166-
Self::validate_lengths(value_size, len)?;
167-
168177
Ok(Self {
169178
data_type,
170179
value_data: values,
171-
value_length,
180+
value_size,
172181
nulls,
173182
len,
174183
})
175184
}
176185

177-
/// Some calculations below use i32 arithmetic which can overflow when
178-
/// valid offsets are past i32::MAX. Until that is solved for real do not
179-
/// permit constructing any FixedSizeBinaryArray that has a valid offset
180-
/// past i32::MAX
181-
fn validate_lengths(value_size: usize, len: usize) -> Result<(), ArrowError> {
182-
// the offset is also calculated for the next element (i + 1) so
183-
// check `len` (not last element index) to ensure that all offsets are valid
184-
let max_offset = value_size.checked_mul(len).ok_or_else(|| {
185-
ArrowError::InvalidArgumentError(format!(
186-
"FixedSizeBinaryArray error: value size {value_size} * len {len} exceeds maximum valid offset"
187-
))
188-
})?;
189-
190-
let max_valid_offset: usize = i32::MAX.try_into().map_err(|_| {
191-
ArrowError::InvalidArgumentError(format!(
192-
"FixedSizeBinaryArray error: maximum valid offset exceeds i32::MAX, got {max_offset}"
193-
))
194-
})?;
195-
196-
if max_offset > max_valid_offset {
197-
return Err(ArrowError::InvalidArgumentError(format!(
198-
"FixedSizeBinaryArray error: value size {value_size} * length {len} exceeds maximum valid offset of {max_valid_offset}"
199-
)));
200-
};
201-
Ok(())
202-
}
203-
204186
/// Create a new [`FixedSizeBinaryArray`] of length `len` where all values are null
205187
///
206188
/// # Panics
@@ -209,26 +191,25 @@ impl FixedSizeBinaryArray {
209191
///
210192
/// * `value_length < 0`
211193
/// * `value_length * len` would overflow `usize`
212-
/// * `value_length * len > i32::MAX`
213194
/// * `value_length * len * 8` would overflow `usize`
214195
pub fn new_null(value_length: i32, len: usize) -> Self {
215196
const BITS_IN_A_BYTE: usize = 8;
216197
let value_size = value_length.to_usize().unwrap();
217-
Self::validate_lengths(value_size, len).unwrap();
218198
let capacity_in_bytes = value_size.checked_mul(len).unwrap();
219199
let capacity_in_bits = capacity_in_bytes.checked_mul(BITS_IN_A_BYTE).unwrap();
220200
Self {
221201
data_type: DataType::FixedSizeBinary(value_length),
222202
value_data: MutableBuffer::new_null(capacity_in_bits).into(),
223203
nulls: Some(NullBuffer::new_null(len)),
224-
value_length,
204+
value_size,
225205
len,
226206
}
227207
}
228208

229209
/// Deconstruct this array into its constituent parts
230210
pub fn into_parts(self) -> (i32, Buffer, Option<NullBuffer>) {
231-
(self.value_length, self.value_data, self.nulls)
211+
let value_length = self.value_length();
212+
(value_length, self.value_data, self.nulls)
232213
}
233214

234215
/// Returns the element at index `i` as a byte slice.
@@ -239,19 +220,14 @@ impl FixedSizeBinaryArray {
239220
/// # Panics
240221
/// Panics if index `i` is out of bounds.
241222
pub fn value(&self, i: usize) -> &[u8] {
223+
let len = self.len();
242224
assert!(
243-
i < self.len(),
244-
"Trying to access an element at index {} from a FixedSizeBinaryArray of length {}",
245-
i,
246-
self.len()
225+
i < len,
226+
"Trying to access an element at index {i} from a FixedSizeBinaryArray of length {len}",
247227
);
248-
let offset = i + self.offset();
228+
let position = i * self.value_size;
249229
unsafe {
250-
let pos = self.value_offset_at(offset);
251-
std::slice::from_raw_parts(
252-
self.value_data.as_ptr().offset(pos as isize),
253-
(self.value_offset_at(offset + 1) - pos) as usize,
254-
)
230+
std::slice::from_raw_parts(self.value_data.as_ptr().add(position), self.value_size)
255231
}
256232
}
257233

@@ -265,30 +241,46 @@ impl FixedSizeBinaryArray {
265241
/// Caller is responsible for ensuring that the index is within the bounds
266242
/// of the array
267243
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
268-
let offset = i + self.offset();
269-
let pos = self.value_offset_at(offset);
244+
let position = i * self.value_size;
270245
unsafe {
271-
std::slice::from_raw_parts(
272-
self.value_data.as_ptr().offset(pos as isize),
273-
(self.value_offset_at(offset + 1) - pos) as usize,
274-
)
246+
std::slice::from_raw_parts(self.value_data.as_ptr().add(position), self.value_size)
275247
}
276248
}
277249

278250
/// Returns the offset for the element at index `i`.
279251
///
280252
/// Note this doesn't do any bound checking, for performance reason.
253+
///
254+
/// # Panics
255+
///
256+
/// Panics if the computed byte offset exceeds `i32::MAX`.
257+
#[deprecated(since = "59.0.0", note = "Use i * value_size() instead")]
281258
#[inline]
282259
pub fn value_offset(&self, i: usize) -> i32 {
283-
self.value_offset_at(self.offset() + i)
260+
self.value_length() * i as i32
284261
}
285262

286263
/// Returns the length for an element.
287264
///
288265
/// All elements have the same length as the array is a fixed size.
266+
///
267+
/// Returns an `i32` to be compatible with the Arrow spec.
268+
///
269+
/// Use [`Self::value_size`] to return a `usize`.
289270
#[inline]
290271
pub fn value_length(&self) -> i32 {
291-
self.value_length
272+
// This is safe: constructor validated that value_size was a valid i32
273+
self.value_size as i32
274+
}
275+
276+
/// Return the length for an element, as a usize.
277+
///
278+
/// All elements have the same length as the array is a fixed size.
279+
///
280+
/// Note: This value will always fit, without overflow, into an i32
281+
#[inline]
282+
pub fn value_size(&self) -> usize {
283+
self.value_size
292284
}
293285

294286
/// Returns the values of this array.
@@ -311,14 +303,16 @@ impl FixedSizeBinaryArray {
311303
offset.saturating_add(len) <= self.len,
312304
"the length + offset of the sliced FixedSizeBinaryArray cannot exceed the existing length"
313305
);
314-
315-
let size = self.value_length as usize;
306+
let offset_bytes = offset
307+
.checked_mul(self.value_size)
308+
.expect("offset overflow");
309+
let len_bytes = len.checked_mul(self.value_size).expect("offset overflow");
316310

317311
Self {
318312
data_type: self.data_type.clone(),
319313
nulls: self.nulls.as_ref().map(|n| n.slice(offset, len)),
320-
value_length: self.value_length,
321-
value_data: self.value_data.slice_with_length(offset * size, len * size),
314+
value_size: self.value_size,
315+
value_data: self.value_data.slice_with_length(offset_bytes, len_bytes),
322316
len,
323317
}
324318
}
@@ -420,7 +414,6 @@ impl FixedSizeBinaryArray {
420414
let nulls = NullBuffer::from_unsliced_buffer(null_buf, len);
421415

422416
let value_size = value_size.unwrap_or(0);
423-
Self::validate_lengths(value_size, len)?;
424417
let value_length = value_size.try_into().map_err(|_| {
425418
ArrowError::InvalidArgumentError(format!(
426419
"FixedSizeBinaryArray value length exceeds i32, got {value_size}"
@@ -430,7 +423,7 @@ impl FixedSizeBinaryArray {
430423
data_type: DataType::FixedSizeBinary(value_length),
431424
value_data: buffer.into(),
432425
nulls,
433-
value_length,
426+
value_size,
434427
len,
435428
})
436429
}
@@ -514,14 +507,13 @@ impl FixedSizeBinaryArray {
514507
})?;
515508

516509
let nulls = NullBuffer::from_unsliced_buffer(null_buf, len);
517-
Self::validate_lengths(value_size, len)?;
518510

519511
Ok(Self {
520512
data_type: DataType::FixedSizeBinary(value_length),
521513
value_data: buffer.into(),
522514
nulls,
523515
len,
524-
value_length,
516+
value_size,
525517
})
526518
}
527519

@@ -583,7 +575,6 @@ impl FixedSizeBinaryArray {
583575
}
584576

585577
let value_size = value_size.unwrap_or(0);
586-
Self::validate_lengths(value_size, len)?;
587578
let value_length = value_size.try_into().map_err(|_| {
588579
ArrowError::InvalidArgumentError(format!(
589580
"FixedSizeBinaryArray value length exceeds i32, got {value_size}"
@@ -593,16 +584,11 @@ impl FixedSizeBinaryArray {
593584
data_type: DataType::FixedSizeBinary(value_length),
594585
value_data: buffer.into(),
595586
nulls: None,
596-
value_length,
587+
value_size,
597588
len,
598589
})
599590
}
600591

601-
#[inline]
602-
fn value_offset_at(&self, i: usize) -> i32 {
603-
self.value_length * i as i32
604-
}
605-
606592
/// constructs a new iterator
607593
pub fn iter(&self) -> FixedSizeBinaryIter<'_> {
608594
FixedSizeBinaryIter::new(self)
@@ -626,8 +612,6 @@ impl From<ArrayData> for FixedSizeBinaryArray {
626612
let value_size = value_length
627613
.to_usize()
628614
.expect("FixedSizeBinaryArray value length must be non-negative");
629-
Self::validate_lengths(value_size, len)
630-
.expect("FixedSizeBinaryArray offsets must fit within i32");
631615
let value_data = buffers[0].slice_with_length(
632616
offset.checked_mul(value_size).expect("offset overflow"),
633617
len.checked_mul(value_size).expect("length overflow"),
@@ -638,7 +622,7 @@ impl From<ArrayData> for FixedSizeBinaryArray {
638622
nulls,
639623
len,
640624
value_data,
641-
value_length,
625+
value_size,
642626
}
643627
}
644628
}
@@ -849,7 +833,6 @@ mod tests {
849833
fixed_size_binary_array.value(2)
850834
);
851835
assert_eq!(5, fixed_size_binary_array.value_length());
852-
assert_eq!(10, fixed_size_binary_array.value_offset(2));
853836
for i in 0..3 {
854837
assert!(fixed_size_binary_array.is_valid(i));
855838
assert!(!fixed_size_binary_array.is_null(i));
@@ -872,9 +855,7 @@ mod tests {
872855
fixed_size_binary_array.value(1)
873856
);
874857
assert_eq!(2, fixed_size_binary_array.len());
875-
assert_eq!(0, fixed_size_binary_array.value_offset(0));
876858
assert_eq!(5, fixed_size_binary_array.value_length());
877-
assert_eq!(5, fixed_size_binary_array.value_offset(1));
878859
}
879860

880861
#[test]
@@ -1122,26 +1103,6 @@ mod tests {
11221103
array.value(4);
11231104
}
11241105

1125-
#[test]
1126-
fn test_validate_lengths_allows_empty_array() {
1127-
FixedSizeBinaryArray::validate_lengths(1024, 0).unwrap();
1128-
}
1129-
1130-
#[test]
1131-
fn test_validate_lengths_allows_i32_max_offset() {
1132-
FixedSizeBinaryArray::validate_lengths(1, i32::MAX as usize).unwrap();
1133-
FixedSizeBinaryArray::validate_lengths(262_176, 8191).unwrap();
1134-
}
1135-
1136-
#[test]
1137-
fn test_validate_lengths_rejects_offset_past_i32_max() {
1138-
let err = FixedSizeBinaryArray::validate_lengths(262_177, 8192).unwrap_err();
1139-
assert_eq!(
1140-
err.to_string(),
1141-
"Invalid argument error: FixedSizeBinaryArray error: value size 262177 * length 8192 exceeds maximum valid offset of 2147483647",
1142-
);
1143-
}
1144-
11451106
#[test]
11461107
fn test_constructors() {
11471108
let buffer = Buffer::from_vec(vec![0_u8; 10]);

arrow-array/src/builder/fixed_size_binary_builder.rs

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -206,11 +206,11 @@ mod tests {
206206
assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
207207
assert_eq!(6, array.len());
208208
assert_eq!(3, array.null_count());
209-
assert_eq!(10, array.value_offset(2));
210-
assert_eq!(15, array.value_offset(3));
211209
assert_eq!(5, array.value_length());
210+
assert_eq!(b"arrow", array.value(2));
212211
assert!(array.is_null(3));
213212
assert!(array.is_null(4));
213+
assert_eq!(b"world", array.value(5));
214214
}
215215

216216
#[test]
@@ -226,8 +226,8 @@ mod tests {
226226
assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
227227
assert_eq!(3, array.len());
228228
assert_eq!(1, array.null_count());
229-
assert_eq!(10, array.value_offset(2));
230229
assert_eq!(5, array.value_length());
230+
assert_eq!(b"arrow", array.value(2));
231231

232232
// [b"finis", null, "clone"]
233233
builder.append_value(b"finis").unwrap();
@@ -239,8 +239,8 @@ mod tests {
239239
assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
240240
assert_eq!(6, array.len());
241241
assert_eq!(2, array.null_count());
242-
assert_eq!(25, array.value_offset(5));
243242
assert_eq!(5, array.value_length());
243+
assert_eq!(b"clone", array.value(5));
244244
}
245245

246246
#[test]
@@ -256,7 +256,6 @@ mod tests {
256256
assert_eq!(&DataType::FixedSizeBinary(0), array.data_type());
257257
assert_eq!(3, array.len());
258258
assert_eq!(1, array.null_count());
259-
assert_eq!(0, array.value_offset(2));
260259
assert_eq!(0, array.value_length());
261260
assert_eq!(b"", array.value(0));
262261
assert_eq!(b"", array.value(2));
@@ -307,10 +306,6 @@ mod tests {
307306
assert_eq!(&DataType::FixedSizeBinary(5), array.data_type());
308307
assert_eq!(6, array.len());
309308
assert_eq!(2, array.null_count());
310-
for i in 0..6 {
311-
assert_eq!(i * 5, array.value_offset(i as usize));
312-
}
313-
314309
assert_eq!(b"hello", array.value(0));
315310
assert!(array.is_null(1));
316311
assert_eq!(b"arrow", array.value(2));

0 commit comments

Comments
 (0)