@@ -6,6 +6,13 @@ use std::ops::{Range, RangeInclusive};
66use super :: { bitmap:: Bitmap , encoded_array:: EncodedU64Array } ;
77use deepsize:: DeepSizeOf ;
88
/// Convert an estimated serialized byte cost from `u128` to `usize`.
///
/// Values that do not fit in `usize` saturate to [`usize::MAX`], so an
/// infeasible encoding is never reported as cheaper than a feasible one.
fn u128_byte_cost_to_usize(v: u128) -> usize {
    usize::try_from(v).unwrap_or(usize::MAX)
}
15+
916/// Different ways to represent a sequence of distinct u64s.
1017///
1118/// This is designed to be especially efficient for sequences that are sorted,
@@ -88,13 +95,17 @@ struct SegmentStats {
8895}
8996
9097impl SegmentStats {
91- fn n_holes ( & self ) -> u64 {
98+ /// Number of missing values ("holes") in the range `[min, max]`.
99+ ///
100+ /// Returns `u128` because the total slot count `max - min + 1` can be up
101+ /// to `2^64` (when `min = 0, max = u64::MAX`), which exceeds `u64::MAX`.
102+ fn n_holes ( & self ) -> u128 {
92103 debug_assert ! ( self . sorted) ;
93104 if self . count == 0 {
94105 0
95106 } else {
96- let total_slots = self . max - self . min + 1 ;
97- total_slots - self . count
107+ let total_slots = self . max as u128 - self . min as u128 + 1 ;
108+ total_slots - self . count as u128
98109 }
99110 }
100111}
@@ -149,15 +160,24 @@ impl U64Segment {
149160 }
150161 }
151162
163+ /// Estimate the serialized byte size of each sorted encoding variant.
164+ ///
165+ /// All arithmetic is performed in `u128` to avoid overflow when the range
166+ /// span `max - min + 1` approaches or exceeds `2^64`. Infeasible sizes
167+ /// saturate to `usize::MAX` so they always lose the `min()` comparison.
152168 fn sorted_sequence_sizes ( stats : & SegmentStats ) -> [ usize ; 3 ] {
153169 let n_holes = stats. n_holes ( ) ;
154- let total_slots = stats. max - stats. min + 1 ;
170+ let total_slots = stats. max as u128 - stats. min as u128 + 1 ;
155171
156- let range_with_holes = 24 + 4 * n_holes as usize ;
157- let range_with_bitmap = 24 + ( total_slots as f64 / 8.0 ) . ceil ( ) as usize ;
158- let sorted_array = 24 + 2 * stats. count as usize ;
172+ let range_with_holes = 24u128 . saturating_add ( 4u128 . saturating_mul ( n_holes) ) ;
173+ let range_with_bitmap = 24u128 . saturating_add ( total_slots. div_ceil ( 8 ) ) ;
174+ let sorted_array = 24u128 . saturating_add ( 2u128 . saturating_mul ( stats. count as u128 ) ) ;
159175
160- [ range_with_holes, range_with_bitmap, sorted_array]
176+ [
177+ u128_byte_cost_to_usize ( range_with_holes) ,
178+ u128_byte_cost_to_usize ( range_with_bitmap) ,
179+ u128_byte_cost_to_usize ( sorted_array) ,
180+ ]
161181 }
162182
163183 fn from_stats_and_sequence (
@@ -166,38 +186,41 @@ impl U64Segment {
166186 ) -> Self {
167187 if stats. sorted {
168188 let n_holes = stats. n_holes ( ) ;
189+ // Range-backed encodings store an exclusive end as `Range<u64>`,
190+ // which cannot represent `u64::MAX + 1`. Compute the end once and
191+ // gate all range-backed branches on its representability.
192+ let exclusive_end = stats. max . checked_add ( 1 ) ;
169193 if stats. count == 0 {
170194 Self :: Range ( 0 ..0 )
171- } else if n_holes == 0 {
172- Self :: Range ( stats. min ..( stats . max + 1 ) )
173- } else {
195+ } else if n_holes == 0 && exclusive_end . is_some ( ) {
196+ Self :: Range ( stats. min ..exclusive_end . unwrap ( ) )
197+ } else if let Some ( end ) = exclusive_end {
174198 let sizes = Self :: sorted_sequence_sizes ( & stats) ;
175199 let min_size = sizes. iter ( ) . min ( ) . unwrap ( ) ;
176200 if min_size == & sizes[ 0 ] {
177- let range = stats. min ..( stats . max + 1 ) ;
201+ let range = stats. min ..end ;
178202 let mut holes =
179203 Self :: holes_in_slice ( stats. min ..=stats. max , sequence) . collect :: < Vec < _ > > ( ) ;
180204 holes. sort_unstable ( ) ;
181205 let holes = EncodedU64Array :: from ( holes) ;
182-
183206 Self :: RangeWithHoles { range, holes }
184207 } else if min_size == & sizes[ 1 ] {
185- let range = stats. min ..( stats . max + 1 ) ;
208+ let range = stats. min ..end ;
186209 let mut bitmap = Bitmap :: new_full ( ( stats. max - stats. min ) as usize + 1 ) ;
187-
188210 for hole in Self :: holes_in_slice ( stats. min ..=stats. max , sequence) {
189211 let offset = ( hole - stats. min ) as usize ;
190212 bitmap. clear ( offset) ;
191213 }
192-
193214 Self :: RangeWithBitmap { range, bitmap }
194215 } else {
195- // Must use array, but at least it's sorted
196216 Self :: SortedArray ( EncodedU64Array :: from_iter ( sequence) )
197217 }
218+ } else {
219+ // max == u64::MAX: exclusive end is unrepresentable in Range<u64>,
220+ // so no range-backed encoding can be used.
221+ Self :: SortedArray ( EncodedU64Array :: from_iter ( sequence) )
198222 }
199223 } else {
200- // Must use array
201224 Self :: Array ( EncodedU64Array :: from_iter ( sequence) )
202225 }
203226 }
@@ -707,6 +730,144 @@ mod test {
707730 ) ;
708731 }
709732
#[test]
fn test_segment_overflow_boundary() {
    // Sparse range spanning i64::MAX — the original overflow reproducer.
    // n_holes ≈ 2^63, which overflows `4 * n_holes as usize` without u128 arithmetic.
    let values: Vec<u64> = vec![0, 1, 2, 100, i64::MAX as u64];
    let segment = U64Segment::from_slice(&values);
    assert!(
        matches!(segment, U64Segment::SortedArray(_)),
        "sparse range spanning i64::MAX should be SortedArray, got {:?}",
        std::mem::discriminant(&segment)
    );
    assert_eq!(segment.len(), 5);
    assert_eq!(segment.iter().collect::<Vec<_>>(), values);

    // Two values at u64 extremes — triggers n_holes() total_slots overflow
    // (u64::MAX - 0 + 1 wraps to 0 without u128).
    let values: Vec<u64> = vec![0, u64::MAX];
    let segment = U64Segment::from_slice(&values);
    assert!(
        matches!(segment, U64Segment::SortedArray(_)),
        "full u64 span should be SortedArray, got {:?}",
        std::mem::discriminant(&segment)
    );
    assert_eq!(segment.len(), 2);
    assert_eq!(segment.iter().collect::<Vec<_>>(), values);

    // Small dense set near u64::MAX — cost estimation correctly prefers a
    // range-backed encoding, but Range<u64> cannot represent u64::MAX + 1
    // as the exclusive end. Must fall back to SortedArray.
    let values: Vec<u64> = vec![u64::MAX - 3, u64::MAX - 1, u64::MAX];
    let segment = U64Segment::from_slice(&values);
    assert!(
        matches!(segment, U64Segment::SortedArray(_)),
        "dense set near u64::MAX should be SortedArray (exclusive end unrepresentable), got {:?}",
        std::mem::discriminant(&segment)
    );
    assert_eq!(segment.len(), 3);
    assert_eq!(segment.iter().collect::<Vec<_>>(), values);

    // Single value at u64::MAX — contiguous range with n_holes == 0, but
    // exclusive end u64::MAX + 1 overflows.
    let values: Vec<u64> = vec![u64::MAX];
    let segment = U64Segment::from_slice(&values);
    assert!(
        matches!(segment, U64Segment::SortedArray(_)),
        "single u64::MAX should be SortedArray, got {:?}",
        std::mem::discriminant(&segment)
    );
    assert_eq!(segment.len(), 1);
    assert_eq!(segment.iter().collect::<Vec<_>>(), values);

    // Contiguous range ending just below u64::MAX — exclusive end is
    // representable, so Range encoding should still be used.
    let values: Vec<u64> = vec![u64::MAX - 3, u64::MAX - 2, u64::MAX - 1];
    let segment = U64Segment::from_slice(&values);
    assert_eq!(segment, U64Segment::Range((u64::MAX - 3)..u64::MAX));
    assert_eq!(segment.len(), 3);
    assert_eq!(segment.iter().collect::<Vec<_>>(), values);

    // Regression: normal dense range with few holes still picks RangeWithHoles.
    // Needs total_slots > 32 * n_holes for RangeWithHoles to beat RangeWithBitmap.
    let values: Vec<u64> = (100..1100).filter(|&x| x != 500).collect();
    let segment = U64Segment::from_slice(&values);
    assert_eq!(
        segment,
        U64Segment::RangeWithHoles {
            range: 100..1100,
            holes: vec![500].into(),
        }
    );
    assert_eq!(segment.len(), 999);
    assert_eq!(segment.iter().collect::<Vec<_>>(), values);

    // Regression: small dense range with hole picks RangeWithBitmap.
    let values: Vec<u64> = vec![100, 101, 102, 103, 105];
    let segment = U64Segment::from_slice(&values);
    assert!(
        matches!(segment, U64Segment::RangeWithBitmap { .. }),
        "small dense range with hole should be RangeWithBitmap, got {:?}",
        std::mem::discriminant(&segment)
    );
    assert_eq!(segment.len(), 5);
    assert_eq!(segment.iter().collect::<Vec<_>>(), values);
}
817+
#[test]
fn test_u128_byte_cost_to_usize() {
    // Values that fit in `usize` pass through unchanged.
    assert_eq!(super::u128_byte_cost_to_usize(0), 0);
    assert_eq!(super::u128_byte_cost_to_usize(42), 42);
    assert_eq!(
        super::u128_byte_cost_to_usize(usize::MAX as u128),
        usize::MAX
    );
    // Values beyond `usize::MAX` saturate instead of truncating.
    assert_eq!(super::u128_byte_cost_to_usize(u128::MAX), usize::MAX);
}
828+
#[test]
fn test_sorted_sequence_sizes_sparse_span_saturates_range_with_holes_cost() {
    // Five values spread over [0, i64::MAX]: n_holes is about 2^63, so the
    // RangeWithHoles cost `24 + 4 * n_holes` exceeds `usize::MAX` and saturates.
    let stats = super::SegmentStats {
        min: 0,
        max: i64::MAX as u64,
        count: 5,
        sorted: true,
    };
    let sizes = U64Segment::sorted_sequence_sizes(&stats);
    assert_eq!(sizes[0], usize::MAX);
    assert!(sizes[2] < sizes[0]);
}
841+
#[test]
fn test_sorted_sequence_sizes_sorted_array_cost_saturates() {
    // Nearly full [0, u64::MAX] with one hole: count = u64::MAX, n_holes = 1.
    // SortedArray cost 24 + 2 * u64::MAX is about 2^65, which does not fit in
    // `usize` even on 64-bit targets, so it must saturate.
    let stats = super::SegmentStats {
        min: 0,
        max: u64::MAX,
        count: u64::MAX,
        sorted: true,
    };
    let sizes = U64Segment::sorted_sequence_sizes(&stats);
    assert_eq!(sizes[2], usize::MAX);
}
855+
#[test]
// The bitmap cost for the full span is 24 + 2^61 bytes, which only fits in
// `usize` when it is 64 bits wide. On narrower targets it saturates to
// `usize::MAX` like the other costs and the strict comparisons below fail.
#[cfg(target_pointer_width = "64")]
fn test_sorted_sequence_sizes_full_span_bitmap_cost() {
    // Synthetic stats: full [0, u64::MAX] slot space; exercises the
    // `range_with_bitmap` cost path.
    let stats = super::SegmentStats {
        min: 0,
        max: u64::MAX,
        count: 1,
        sorted: true,
    };
    let sizes = U64Segment::sorted_sequence_sizes(&stats);
    assert!(sizes[1] < sizes[0]);
    assert!(sizes[1] < usize::MAX);
}
870+
710871 #[ test]
711872 fn test_with_new_high ( ) {
712873 // Test Range: contiguous sequence
0 commit comments