@@ -36,10 +36,8 @@ use vortex_mask::Mask;
3636use crate :: FSST ;
3737use crate :: FSSTArray ;
3838
39- /// FSST worst case: every input byte expands to an escape + literal (2x) plus a small
40- /// per-string header.
39+ /// FSST worst case: every input byte expands to an escape + literal (2x).
4140const FSST_PER_BYTE_OVERHEAD : usize = 2 ;
42- const FSST_PER_ROW_OVERHEAD : usize = 7 ;
4341
4442/// Starting capacity for the per-row `compress_into` scratch buffer; grown monotonically.
4543const DEFAULT_BUFFER_LEN : usize = 1024 * 1024 ;
@@ -87,7 +85,6 @@ fn compress_varbinview(
8785) -> VortexResult < FSSTArray > {
8886 let mask = strings. validity ( ) ?. execute_mask ( strings. len ( ) , ctx) ?;
8987 let views = strings. views ( ) ;
90- let non_null = mask. true_count ( ) ;
9188
9289 let total_input_bytes = match mask. bit_buffer ( ) {
9390 AllOr :: All => views. iter ( ) . map ( |v| v. len ( ) as usize ) . sum ( ) ,
@@ -100,7 +97,7 @@ fn compress_varbinview(
10097 . sum ( ) ,
10198 } ;
10299
103- if fsst_output_fits_in_i32_offsets ( total_input_bytes, non_null ) {
100+ if fsst_output_fits_in_i32_offsets ( total_input_bytes) {
104101 compress_views :: < i32 > ( strings, & mask, compressor, ctx)
105102 } else {
106103 compress_views :: < i64 > ( strings, & mask, compressor, ctx)
@@ -120,9 +117,8 @@ fn compress_varbin_array(
120117 let last: usize = off[ off. len( ) - 1 ] . as_( ) ;
121118 last - first
122119 } ) ;
123- let non_null = mask. true_count ( ) ;
124120
125- if fsst_output_fits_in_i32_offsets ( total_input_bytes, non_null ) {
121+ if fsst_output_fits_in_i32_offsets ( total_input_bytes) {
126122 compress_varbin :: < i32 > ( strings, & offsets, & mask, compressor, ctx)
127123 } else {
128124 compress_varbin :: < i64 > ( strings, & offsets, & mask, compressor, ctx)
@@ -179,10 +175,8 @@ fn train_varbin_array(
179175}
180176
181177#[ inline]
182- fn fsst_output_fits_in_i32_offsets ( total_input_bytes : usize , non_null : usize ) -> bool {
183- let worst = total_input_bytes
184- . saturating_mul ( FSST_PER_BYTE_OVERHEAD )
185- . saturating_add ( non_null. saturating_mul ( FSST_PER_ROW_OVERHEAD ) ) ;
178+ fn fsst_output_fits_in_i32_offsets ( total_input_bytes : usize ) -> bool {
179+ let worst = total_input_bytes. saturating_mul ( FSST_PER_BYTE_OVERHEAD ) ;
186180 worst <= i32:: MAX as usize
187181}
188182
@@ -305,7 +299,7 @@ impl<'c, O: IntegerPType + 'static> FsstSink<'c, O> {
305299 i32:: try_from ( s. len ( ) ) . vortex_expect ( "per-row uncompressed length must fit in i32" ) ,
306300 ) ;
307301
308- let target = FSST_PER_BYTE_OVERHEAD * s. len ( ) + FSST_PER_ROW_OVERHEAD ;
302+ let target = FSST_PER_BYTE_OVERHEAD * s. len ( ) ;
309303 if target > self . buffer . len ( ) {
310304 self . buffer . reserve ( target - self . buffer . len ( ) ) ;
311305 }
@@ -332,6 +326,7 @@ impl<'c, O: IntegerPType + 'static> FsstSink<'c, O> {
332326#[ cfg( test) ]
333327mod tests {
334328 use vortex_array:: VortexSessionExecute ;
329+ use vortex_array:: array_session;
335330 use vortex_array:: arrays:: VarBinViewArray ;
336331 use vortex_array:: arrays:: varbin:: VarBinArrayExt ;
337332 use vortex_array:: dtype:: PType ;
@@ -347,18 +342,18 @@ mod tests {
347342 #[ test]
348343 fn offset_width_boundary ( ) {
349344 let m = i32:: MAX as usize ;
350- assert ! ( fsst_output_fits_in_i32_offsets( m / 2 - 7 , 1 ) ) ;
351- assert ! ( ! fsst_output_fits_in_i32_offsets( m / 2 , 1 ) ) ;
352- assert ! ( fsst_output_fits_in_i32_offsets( 0 , 0 ) ) ;
353- assert ! ( !fsst_output_fits_in_i32_offsets( usize :: MAX , 1 ) ) ;
345+ assert ! ( fsst_output_fits_in_i32_offsets( m / 2 - 7 ) ) ;
346+ assert ! ( fsst_output_fits_in_i32_offsets( m / 2 ) ) ;
347+ assert ! ( fsst_output_fits_in_i32_offsets( 0 ) ) ;
348+ assert ! ( !fsst_output_fits_in_i32_offsets( usize :: MAX ) ) ;
354349 }
355350
356351 /// Small inputs fit the i32 bound, so `fsst_compress` must pick i32 offsets.
357352 /// The i64 branch is covered by `tests::fsst_compress_offsets_overflow_i32`.
358353 #[ test]
359354 fn codes_offsets_dtype_small_input_is_i32 ( ) -> VortexResult < ( ) > {
360355 let array = VarBinViewArray :: from_iter_str ( [ "hello" , "world" , "fsst encoded" ] ) ;
361- let mut ctx = vortex_array :: array_session ( ) . create_execution_ctx ( ) ;
356+ let mut ctx = array_session ( ) . create_execution_ctx ( ) ;
362357 let compressor = fsst_train_compressor ( array. as_array ( ) , & mut ctx) ?;
363358 let fsst = fsst_compress ( array. as_array ( ) , & compressor, & mut ctx) ?;
364359 assert_eq ! ( fsst. codes( ) . offsets( ) . dtype( ) . as_ptype( ) , PType :: I32 ) ;
0 commit comments