Skip to content

Commit 7ad8e2c

Browse files
authored
fix array_repeat capacity overflow on constant scalar with large count (#22305)
## Which issue does this PR close? - Closes #22228 . ## Rationale for this change `array_repeat` still panics for oversized repeat counts in the constant-scalar path. The simplest reproducer is: ```sql SELECT array_repeat(1, 9223372036854775807) ``` Unlike the previously reported `array_repeat` overflow cases, this path does not sum counts across rows and does not multiply nested list lengths, but it still reaches an unchecked `Vec` preallocation and panics with `capacity overflow`. This change makes `array_repeat` reject oversized output lengths up front and return a normal execution error instead of panicking. ## What changes are included in this PR? This PR adds explicit bounds checks in repeat.rs so `array_repeat` validates requested output sizes before allocating buffers. The main changes are: - Move repeat-length accumulation into shared checked helpers. - Reject oversized output lengths with: `array_repeat: requested length exceeds maximum array size` - Guard both scalar and list repeat paths so they fail consistently before hitting unchecked allocation or arithmetic overflow. - Reuse precomputed outer offsets for the list path instead of rebuilding them from unchecked lengths. ## Are these changes tested? Yes. This PR adds a regression test in repeat.rs covering the constant-scalar reproducer with `i64::MAX` as the repeat count and verifies that `array_repeat` returns an execution error rather than panicking. Validated with: ```bash cargo test -p datafusion-functions-nested scalar_count_exceeding_max_array_size_returns_error --lib ``` ## Are there any user-facing changes? Yes. Previously, oversized `array_repeat` calls could panic the process. After this change, they return a regular execution error: ```text array_repeat: requested length exceeds maximum array size ```
1 parent 94c58d0 commit 7ad8e2c

2 files changed

Lines changed: 111 additions & 35 deletions

File tree

datafusion/functions-nested/src/repeat.rs

Lines changed: 108 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,12 @@ use datafusion_expr::{
3838
};
3939
use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
4040
use datafusion_macros::user_doc;
41+
use std::mem::size_of;
4142
use std::sync::Arc;
4243

44+
const ARRAY_REPEAT_LENGTH_EXCEEDED: &str =
45+
"array_repeat: requested length exceeds maximum array size";
46+
4347
make_udf_expr_and_func!(
4448
ArrayRepeat,
4549
array_repeat,
@@ -175,28 +179,12 @@ fn general_repeat<O: OffsetSizeTrait>(
175179
array: &ArrayRef,
176180
count_array: &Int64Array,
177181
) -> Result<ArrayRef> {
178-
let total_repeated_values: usize = (0..count_array.len())
179-
.map(|i| get_count_with_validity(count_array, i))
180-
.sum();
182+
let (offsets, total_repeated_values) = build_repeat_offsets::<O>(count_array)?;
181183

182184
let mut take_indices = Vec::with_capacity(total_repeated_values);
183-
let mut offsets = Vec::with_capacity(count_array.len() + 1);
184-
offsets.push(O::zero());
185-
let mut running_offset = 0usize;
186185

187186
for idx in 0..count_array.len() {
188187
let count = get_count_with_validity(count_array, idx);
189-
running_offset = running_offset.checked_add(count).ok_or_else(|| {
190-
DataFusionError::Execution(
191-
"array_repeat: running_offset overflowed usize".to_string(),
192-
)
193-
})?;
194-
let offset = O::from_usize(running_offset).ok_or_else(|| {
195-
DataFusionError::Execution(format!(
196-
"array_repeat: offset {running_offset} exceeds the maximum value for offset type"
197-
))
198-
})?;
199-
offsets.push(offset);
200188
take_indices.extend(std::iter::repeat_n(idx as u64, count));
201189
}
202190

@@ -231,23 +219,23 @@ fn general_list_repeat<O: OffsetSizeTrait>(
231219
count_array: &Int64Array,
232220
) -> Result<ArrayRef> {
233221
let list_offsets = list_array.value_offsets();
222+
let (outer_offsets, outer_total) = build_repeat_offsets::<O>(count_array)?;
234223

235224
// calculate capacities for pre-allocation
236-
let mut outer_total = 0usize;
237225
let mut inner_total = 0usize;
238226
for i in 0..count_array.len() {
239227
let count = get_count_with_validity(count_array, i);
240-
if count > 0 {
241-
outer_total += count;
242-
if list_array.is_valid(i) {
243-
let len = list_offsets[i + 1].to_usize().unwrap()
244-
- list_offsets[i].to_usize().unwrap();
245-
inner_total += len * count;
246-
}
228+
if count > 0 && list_array.is_valid(i) {
229+
let len = list_offsets[i + 1].to_usize().unwrap()
230+
- list_offsets[i].to_usize().unwrap();
231+
inner_total =
232+
checked_repeat_len_add(inner_total, checked_repeat_len_mul(len, count)?)?;
233+
ensure_array_repeat_output_len::<O>(inner_total)?;
247234
}
248235
}
249236

250237
// Build inner structures
238+
ensure_vec_capacity::<O>(checked_repeat_len_add(outer_total, 1)?)?;
251239
let mut inner_offsets = Vec::with_capacity(outer_total + 1);
252240
let mut take_indices = Vec::with_capacity(inner_total);
253241
let mut inner_nulls = BooleanBufferBuilder::new(outer_total);
@@ -262,11 +250,8 @@ fn general_list_repeat<O: OffsetSizeTrait>(
262250
let row_len = end - start;
263251

264252
for _ in 0..count {
265-
inner_running = inner_running.checked_add(row_len).ok_or_else(|| {
266-
DataFusionError::Execution(
267-
"array_repeat: inner offset overflowed usize".to_string(),
268-
)
269-
})?;
253+
inner_running = checked_repeat_len_add(inner_running, row_len)?;
254+
ensure_array_repeat_output_len::<O>(inner_running)?;
270255
let offset = O::from_usize(inner_running).ok_or_else(|| {
271256
DataFusionError::Execution(format!(
272257
"array_repeat: offset {inner_running} exceeds the maximum value for offset type"
@@ -299,16 +284,85 @@ fn general_list_repeat<O: OffsetSizeTrait>(
299284
list_array.data_type().to_owned(),
300285
true,
301286
)),
302-
OffsetBuffer::<O>::from_lengths(
303-
count_array
304-
.iter()
305-
.map(|c| c.map(|v| if v > 0 { v as usize } else { 0 }).unwrap_or(0)),
306-
),
287+
OffsetBuffer::new(outer_offsets.into()),
307288
Arc::new(inner_list),
308289
count_array.nulls().cloned(),
309290
)?))
310291
}
311292

293+
fn build_repeat_offsets<O: OffsetSizeTrait>(
294+
count_array: &Int64Array,
295+
) -> Result<(Vec<O>, usize)> {
296+
let mut offsets = Vec::with_capacity(count_array.len() + 1);
297+
offsets.push(O::zero());
298+
let mut running_offset = 0usize;
299+
300+
for idx in 0..count_array.len() {
301+
let count = get_count_with_validity(count_array, idx);
302+
running_offset = checked_repeat_len_add(running_offset, count)?;
303+
ensure_array_repeat_output_len::<O>(running_offset)?;
304+
let offset = O::from_usize(running_offset).ok_or_else(|| {
305+
DataFusionError::Execution(format!(
306+
"array_repeat: offset {running_offset} exceeds the maximum value for offset type"
307+
))
308+
})?;
309+
offsets.push(offset);
310+
}
311+
312+
Ok((offsets, running_offset))
313+
}
314+
315+
fn checked_repeat_len_add(lhs: usize, rhs: usize) -> Result<usize> {
316+
lhs.checked_add(rhs).ok_or_else(|| {
317+
DataFusionError::Execution(ARRAY_REPEAT_LENGTH_EXCEEDED.to_string())
318+
})
319+
}
320+
321+
fn checked_repeat_len_mul(lhs: usize, rhs: usize) -> Result<usize> {
322+
lhs.checked_mul(rhs).ok_or_else(|| {
323+
DataFusionError::Execution(ARRAY_REPEAT_LENGTH_EXCEEDED.to_string())
324+
})
325+
}
326+
327+
fn ensure_array_repeat_output_len<O: OffsetSizeTrait>(len: usize) -> Result<()> {
328+
if len > max_array_repeat_output_len::<O>() {
329+
return Err(DataFusionError::Execution(
330+
ARRAY_REPEAT_LENGTH_EXCEEDED.to_string(),
331+
));
332+
}
333+
334+
Ok(())
335+
}
336+
337+
fn ensure_vec_capacity<T>(len: usize) -> Result<()> {
338+
if len > max_vec_elements::<T>() {
339+
return Err(DataFusionError::Execution(
340+
ARRAY_REPEAT_LENGTH_EXCEEDED.to_string(),
341+
));
342+
}
343+
344+
Ok(())
345+
}
346+
347+
fn max_array_repeat_output_len<O: OffsetSizeTrait>() -> usize {
348+
max_offset_elements::<O>().min(max_vec_elements::<u64>())
349+
}
350+
351+
fn max_offset_elements<O: OffsetSizeTrait>() -> usize {
352+
if size_of::<O>() == size_of::<i32>() {
353+
i32::MAX as usize
354+
} else {
355+
i64::MAX as usize
356+
}
357+
}
358+
359+
fn max_vec_elements<T>() -> usize {
360+
let element_size = size_of::<T>();
361+
(isize::MAX as usize)
362+
.checked_div(element_size)
363+
.unwrap_or(usize::MAX)
364+
}
365+
312366
/// Helper function to get count from count_array at given index
313367
/// Return 0 for null values or non-positive count.
314368
#[inline]
@@ -320,3 +374,22 @@ fn get_count_with_validity(count_array: &Int64Array, idx: usize) -> usize {
320374
if c > 0 { c as usize } else { 0 }
321375
}
322376
}
377+
378+
#[cfg(test)]
379+
mod tests {
380+
use super::array_repeat_inner;
381+
use arrow::array::{ArrayRef, Int64Array};
382+
use std::sync::Arc;
383+
384+
#[test]
385+
fn scalar_count_exceeding_max_array_size_returns_error() {
386+
let element: ArrayRef = Arc::new(Int64Array::from(vec![1]));
387+
let count: ArrayRef = Arc::new(Int64Array::from(vec![i64::MAX]));
388+
389+
let err = array_repeat_inner(&[element, count]).unwrap_err();
390+
assert_eq!(
391+
err.to_string(),
392+
"Execution error: array_repeat: requested length exceeds maximum array size"
393+
);
394+
}
395+
}

datafusion/sqllogictest/test_files/array/array_repeat.slt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ select
4343
----
4444
[[1], [1], [1], [1], [1]] [[1.1, 2.2, 3.3], [1.1, 2.2, 3.3], [1.1, 2.2, 3.3]] [[NULL, NULL], [NULL, NULL], [NULL, NULL]] [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]
4545

46+
query error DataFusion error: Execution error: array_repeat: requested length exceeds maximum array size
47+
select array_repeat(1, 9223372036854775807);
48+
4649
query ????
4750
select
4851
array_repeat(arrow_cast([1], 'LargeList(Int64)'), 5),

0 commit comments

Comments
 (0)