Skip to content

Commit 423835f

Browse files
authored
fix: prevent arithmetic overflow in U64Segment encoding selection for sparse/extreme row id ranges (#6516)
`U64Segment::from_stats_and_sequence` crashes when row IDs span a large range or include values near `u64::MAX`. Fixes #6515. There are two independent overflow classes: 1. **Cost estimation**: `n_holes()` and `sorted_sequence_sizes()` compute range spans in `u64`/`usize` that wrap for large ranges, making infeasible encodings (RangeWithHoles, RangeWithBitmap) appear cheapest. The code then attempts to materialize billions of holes or allocate multi-exabyte bitmaps. 2. **Exclusive-end**: All range-backed encodings construct `Range<u64>` with `stats.max + 1` as the exclusive end. When `max == u64::MAX`, this overflows even for small, memory-feasible sets (e.g., `[u64::MAX - 3, u64::MAX - 1, u64::MAX]`). Both classes cause process aborts in debug and OOM in release. Across JNI this kills the JVM with no recoverable exception. ### Fix **`n_holes()` → `u128` return type**: The total slot count `max - min + 1` can be up to `2^64`, which exceeds `u64::MAX`. Widening to `u128` gives the correct value instead of wrapping. **`sorted_sequence_sizes()` → `u128` arithmetic**: All cost estimates computed in `u128` with saturating arithmetic, then converted via `usize::try_from(...).unwrap_or(usize::MAX)`. Infeasible encodings saturate and always lose the `min()` comparison. **`from_stats_and_sequence()` → `checked_add(1)` gate**: `exclusive_end = stats.max.checked_add(1)` computed once and used as a gate for all range-backed branches. When `None` (i.e., `max == u64::MAX`), falls through to `SortedArray`. The bare expression `stats.max + 1` no longer appears in the function.
1 parent ed8bde5 commit 423835f

1 file changed

Lines changed: 179 additions & 18 deletions

File tree

rust/lance-table/src/rowids/segment.rs

Lines changed: 179 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@ use std::ops::{Range, RangeInclusive};
66
use super::{bitmap::Bitmap, encoded_array::EncodedU64Array};
77
use deepsize::DeepSizeOf;
88

9+
/// Convert an estimated serialized byte cost from `u128` to `usize`, saturating
10+
/// at [`usize::MAX`] when the value does not fit (infeasible encodings).
11+
#[inline]
12+
fn u128_byte_cost_to_usize(v: u128) -> usize {
13+
usize::try_from(v).unwrap_or(usize::MAX)
14+
}
15+
916
/// Different ways to represent a sequence of distinct u64s.
1017
///
1118
/// This is designed to be especially efficient for sequences that are sorted,
@@ -88,13 +95,17 @@ struct SegmentStats {
8895
}
8996

9097
impl SegmentStats {
91-
fn n_holes(&self) -> u64 {
98+
/// Number of missing values ("holes") in the range `[min, max]`.
99+
///
100+
/// Returns `u128` because the total slot count `max - min + 1` can be up
101+
/// to `2^64` (when `min = 0, max = u64::MAX`), which exceeds `u64::MAX`.
102+
fn n_holes(&self) -> u128 {
92103
debug_assert!(self.sorted);
93104
if self.count == 0 {
94105
0
95106
} else {
96-
let total_slots = self.max - self.min + 1;
97-
total_slots - self.count
107+
let total_slots = self.max as u128 - self.min as u128 + 1;
108+
total_slots - self.count as u128
98109
}
99110
}
100111
}
@@ -149,15 +160,24 @@ impl U64Segment {
149160
}
150161
}
151162

163+
/// Estimate the serialized byte size of each sorted encoding variant.
164+
///
165+
/// All arithmetic is performed in `u128` to avoid overflow when the range
166+
/// span `max - min + 1` approaches or exceeds `2^64`. Infeasible sizes
167+
/// saturate to `usize::MAX` so they always lose the `min()` comparison.
152168
fn sorted_sequence_sizes(stats: &SegmentStats) -> [usize; 3] {
153169
let n_holes = stats.n_holes();
154-
let total_slots = stats.max - stats.min + 1;
170+
let total_slots = stats.max as u128 - stats.min as u128 + 1;
155171

156-
let range_with_holes = 24 + 4 * n_holes as usize;
157-
let range_with_bitmap = 24 + (total_slots as f64 / 8.0).ceil() as usize;
158-
let sorted_array = 24 + 2 * stats.count as usize;
172+
let range_with_holes = 24u128.saturating_add(4u128.saturating_mul(n_holes));
173+
let range_with_bitmap = 24u128.saturating_add(total_slots.div_ceil(8));
174+
let sorted_array = 24u128.saturating_add(2u128.saturating_mul(stats.count as u128));
159175

160-
[range_with_holes, range_with_bitmap, sorted_array]
176+
[
177+
u128_byte_cost_to_usize(range_with_holes),
178+
u128_byte_cost_to_usize(range_with_bitmap),
179+
u128_byte_cost_to_usize(sorted_array),
180+
]
161181
}
162182

163183
fn from_stats_and_sequence(
@@ -166,38 +186,41 @@ impl U64Segment {
166186
) -> Self {
167187
if stats.sorted {
168188
let n_holes = stats.n_holes();
189+
// Range-backed encodings store an exclusive end as `Range<u64>`,
190+
// which cannot represent `u64::MAX + 1`. Compute the end once and
191+
// gate all range-backed branches on its representability.
192+
let exclusive_end = stats.max.checked_add(1);
169193
if stats.count == 0 {
170194
Self::Range(0..0)
171-
} else if n_holes == 0 {
172-
Self::Range(stats.min..(stats.max + 1))
173-
} else {
195+
} else if n_holes == 0 && exclusive_end.is_some() {
196+
Self::Range(stats.min..exclusive_end.unwrap())
197+
} else if let Some(end) = exclusive_end {
174198
let sizes = Self::sorted_sequence_sizes(&stats);
175199
let min_size = sizes.iter().min().unwrap();
176200
if min_size == &sizes[0] {
177-
let range = stats.min..(stats.max + 1);
201+
let range = stats.min..end;
178202
let mut holes =
179203
Self::holes_in_slice(stats.min..=stats.max, sequence).collect::<Vec<_>>();
180204
holes.sort_unstable();
181205
let holes = EncodedU64Array::from(holes);
182-
183206
Self::RangeWithHoles { range, holes }
184207
} else if min_size == &sizes[1] {
185-
let range = stats.min..(stats.max + 1);
208+
let range = stats.min..end;
186209
let mut bitmap = Bitmap::new_full((stats.max - stats.min) as usize + 1);
187-
188210
for hole in Self::holes_in_slice(stats.min..=stats.max, sequence) {
189211
let offset = (hole - stats.min) as usize;
190212
bitmap.clear(offset);
191213
}
192-
193214
Self::RangeWithBitmap { range, bitmap }
194215
} else {
195-
// Must use array, but at least it's sorted
196216
Self::SortedArray(EncodedU64Array::from_iter(sequence))
197217
}
218+
} else {
219+
// max == u64::MAX: exclusive end is unrepresentable in Range<u64>,
220+
// so no range-backed encoding can be used.
221+
Self::SortedArray(EncodedU64Array::from_iter(sequence))
198222
}
199223
} else {
200-
// Must use array
201224
Self::Array(EncodedU64Array::from_iter(sequence))
202225
}
203226
}
@@ -707,6 +730,144 @@ mod test {
707730
);
708731
}
709732

733+
#[test]
734+
fn test_segment_overflow_boundary() {
735+
// Sparse range spanning i64::MAX — the original overflow reproducer.
736+
// n_holes ≈ 2^63, which overflows `4 * n_holes as usize` without u128 arithmetic.
737+
let values: Vec<u64> = vec![0, 1, 2, 100, i64::MAX as u64];
738+
let segment = U64Segment::from_slice(&values);
739+
assert!(
740+
matches!(segment, U64Segment::SortedArray(_)),
741+
"sparse range spanning i64::MAX should be SortedArray, got {:?}",
742+
std::mem::discriminant(&segment)
743+
);
744+
assert_eq!(segment.len(), 5);
745+
assert_eq!(segment.iter().collect::<Vec<_>>(), values);
746+
747+
// Two values at u64 extremes — triggers n_holes() total_slots overflow
748+
// (u64::MAX - 0 + 1 wraps to 0 without u128).
749+
let values: Vec<u64> = vec![0, u64::MAX];
750+
let segment = U64Segment::from_slice(&values);
751+
assert!(
752+
matches!(segment, U64Segment::SortedArray(_)),
753+
"full u64 span should be SortedArray, got {:?}",
754+
std::mem::discriminant(&segment)
755+
);
756+
assert_eq!(segment.len(), 2);
757+
assert_eq!(segment.iter().collect::<Vec<_>>(), values);
758+
759+
// Small dense set near u64::MAX — cost estimation correctly prefers a
760+
// range-backed encoding, but Range<u64> cannot represent u64::MAX + 1
761+
// as the exclusive end. Must fall back to SortedArray.
762+
let values: Vec<u64> = vec![u64::MAX - 3, u64::MAX - 1, u64::MAX];
763+
let segment = U64Segment::from_slice(&values);
764+
assert!(
765+
matches!(segment, U64Segment::SortedArray(_)),
766+
"dense set near u64::MAX should be SortedArray (exclusive end unrepresentable), got {:?}",
767+
std::mem::discriminant(&segment)
768+
);
769+
assert_eq!(segment.len(), 3);
770+
assert_eq!(segment.iter().collect::<Vec<_>>(), values);
771+
772+
// Single value at u64::MAX — contiguous range with n_holes == 0, but
773+
// exclusive end u64::MAX + 1 overflows.
774+
let values: Vec<u64> = vec![u64::MAX];
775+
let segment = U64Segment::from_slice(&values);
776+
assert!(
777+
matches!(segment, U64Segment::SortedArray(_)),
778+
"single u64::MAX should be SortedArray, got {:?}",
779+
std::mem::discriminant(&segment)
780+
);
781+
assert_eq!(segment.len(), 1);
782+
assert_eq!(segment.iter().collect::<Vec<_>>(), values);
783+
784+
// Contiguous range ending just below u64::MAX — exclusive end is
785+
// representable, so Range encoding should still be used.
786+
let values: Vec<u64> = vec![u64::MAX - 3, u64::MAX - 2, u64::MAX - 1];
787+
let segment = U64Segment::from_slice(&values);
788+
assert_eq!(segment, U64Segment::Range((u64::MAX - 3)..u64::MAX));
789+
assert_eq!(segment.len(), 3);
790+
assert_eq!(segment.iter().collect::<Vec<_>>(), values);
791+
792+
// Regression: normal dense range with few holes still picks RangeWithHoles.
793+
// Needs total_slots > 32 * n_holes for RangeWithHoles to beat RangeWithBitmap.
794+
let values: Vec<u64> = (100..1100).filter(|&x| x != 500).collect();
795+
let segment = U64Segment::from_slice(&values);
796+
assert_eq!(
797+
segment,
798+
U64Segment::RangeWithHoles {
799+
range: 100..1100,
800+
holes: vec![500].into(),
801+
}
802+
);
803+
assert_eq!(segment.len(), 999);
804+
assert_eq!(segment.iter().collect::<Vec<_>>(), values);
805+
806+
// Regression: small dense range with hole picks RangeWithBitmap.
807+
let values: Vec<u64> = vec![100, 101, 102, 103, 105];
808+
let segment = U64Segment::from_slice(&values);
809+
assert!(
810+
matches!(segment, U64Segment::RangeWithBitmap { .. }),
811+
"small dense range with hole should be RangeWithBitmap, got {:?}",
812+
std::mem::discriminant(&segment)
813+
);
814+
assert_eq!(segment.len(), 5);
815+
assert_eq!(segment.iter().collect::<Vec<_>>(), values);
816+
}
817+
818+
#[test]
819+
fn test_u128_byte_cost_to_usize() {
820+
assert_eq!(super::u128_byte_cost_to_usize(0), 0);
821+
assert_eq!(super::u128_byte_cost_to_usize(42), 42);
822+
assert_eq!(
823+
super::u128_byte_cost_to_usize(usize::MAX as u128),
824+
usize::MAX
825+
);
826+
assert_eq!(super::u128_byte_cost_to_usize(u128::MAX), usize::MAX);
827+
}
828+
829+
#[test]
830+
fn test_sorted_sequence_sizes_sparse_span_saturates_range_with_holes_cost() {
831+
let stats = super::SegmentStats {
832+
min: 0,
833+
max: i64::MAX as u64,
834+
count: 5,
835+
sorted: true,
836+
};
837+
let sizes = U64Segment::sorted_sequence_sizes(&stats);
838+
assert_eq!(sizes[0], usize::MAX);
839+
assert!(sizes[2] < sizes[0]);
840+
}
841+
842+
#[test]
843+
fn test_sorted_sequence_sizes_sorted_array_cost_saturates() {
844+
// Nearly full [0, u64::MAX] with one hole: count = u64::MAX, n_holes = 1.
845+
// SortedArray cost 24 + 2 * u64::MAX does not fit in usize on 64-bit.
846+
let stats = super::SegmentStats {
847+
min: 0,
848+
max: u64::MAX,
849+
count: u64::MAX,
850+
sorted: true,
851+
};
852+
let sizes = U64Segment::sorted_sequence_sizes(&stats);
853+
assert_eq!(sizes[2], usize::MAX);
854+
}
855+
856+
#[test]
857+
fn test_sorted_sequence_sizes_full_span_bitmap_cost() {
858+
// Synthetic stats: full [0, u64::MAX] slot space; exercises `range_with_bitmap`
859+
// cost path (always fits in `usize` on 64-bit targets).
860+
let stats = super::SegmentStats {
861+
min: 0,
862+
max: u64::MAX,
863+
count: 1,
864+
sorted: true,
865+
};
866+
let sizes = U64Segment::sorted_sequence_sizes(&stats);
867+
assert!(sizes[1] < sizes[0]);
868+
assert!(sizes[1] < usize::MAX);
869+
}
870+
710871
#[test]
711872
fn test_with_new_high() {
712873
// Test Range: contiguous sequence

0 commit comments

Comments
 (0)