11// SPDX-License-Identifier: Apache-2.0
22// SPDX-FileCopyrightText: Copyright the Vortex contributors
33
4- use arrayref:: array_mut_ref;
54use fastlanes:: RLE ;
65use vortex_array:: IntoArray ;
76use vortex_array:: ToCanonical ;
@@ -12,6 +11,7 @@ use vortex_array::match_each_native_ptype;
1211use vortex_array:: validity:: Validity ;
1312use vortex_buffer:: BitBufferMut ;
1413use vortex_buffer:: BufferMut ;
14+ use vortex_error:: VortexExpect ;
1515use vortex_error:: VortexResult ;
1616
1717use crate :: FL_CHUNK_SIZE ;
@@ -48,46 +48,58 @@ where
4848 let mut values_idx_offsets = BufferMut :: < u64 > :: with_capacity ( len. div_ceil ( FL_CHUNK_SIZE ) ) ;
4949
5050 let values_uninit = values_buf. spare_capacity_mut ( ) ;
51- let indices_uninit = indices_buf. spare_capacity_mut ( ) ;
51+ let ( indices_uninit, _) = indices_buf
52+ . spare_capacity_mut ( )
53+ . as_chunks_mut :: < FL_CHUNK_SIZE > ( ) ;
5254 let mut value_count_acc = 0 ; // Chunk value count prefix sum.
5355
5456 let ( chunks, remainder) = values. as_chunks :: < FL_CHUNK_SIZE > ( ) ;
5557
56- let mut process_chunk = |chunk_start_idx : usize , input : & [ T ; FL_CHUNK_SIZE ] | {
58+ let mut process_chunk = |chunk_start_idx : usize ,
59+ input : & [ T ; FL_CHUNK_SIZE ] ,
60+ rle_idxs : & mut [ u16 ; FL_CHUNK_SIZE ] | {
5761 // SAFETY: NativeValue is repr(transparent)
5862 let input: & [ NativeValue < T > ; FL_CHUNK_SIZE ] = unsafe { std:: mem:: transmute ( input) } ;
5963
6064 // SAFETY: `MaybeUninit<NativeValue<T>>` and `NativeValue<T>` have the same layout.
6165 let rle_vals: & mut [ NativeValue < T > ] =
6266 unsafe { std:: mem:: transmute ( & mut values_uninit[ value_count_acc..] [ ..FL_CHUNK_SIZE ] ) } ;
6367
64- // SAFETY: `MaybeUninit<u16>` and `u16` have the same layout.
65- let rle_idxs: & mut [ u16 ] =
66- unsafe { std:: mem:: transmute ( & mut indices_uninit[ chunk_start_idx..] [ ..FL_CHUNK_SIZE ] ) } ;
67-
6868 // Capture chunk start indices. This is necessary as indices
6969 // returned from `T::encode` are relative to the chunk.
7070 values_idx_offsets. push ( value_count_acc as u64 ) ;
7171
7272 let value_count = NativeValue :: < T > :: encode (
7373 input,
74- array_mut_ref ! [ rle_vals, 0 , FL_CHUNK_SIZE ] ,
75- array_mut_ref ! [ rle_idxs, 0 , FL_CHUNK_SIZE ] ,
74+ unsafe { & mut * ( rle_vals. as_mut_ptr ( ) as * mut [ _ ; FL_CHUNK_SIZE ] ) } ,
75+ rle_idxs,
7676 ) ;
7777
7878 value_count_acc += value_count;
7979 } ;
8080
81- for ( chunk_idx, chunk_slice) in chunks. iter ( ) . enumerate ( ) {
82- process_chunk ( chunk_idx * FL_CHUNK_SIZE , chunk_slice) ;
81+ for ( chunk_idx, ( chunk_slice, rle_idxs) ) in
82+ chunks. iter ( ) . zip ( indices_uninit. iter_mut ( ) ) . enumerate ( )
83+ {
84+ // SAFETY: `MaybeUninit<u16>` and `u16` have the same layout.
85+ process_chunk ( chunk_idx * FL_CHUNK_SIZE , chunk_slice, unsafe {
86+ std:: mem:: transmute ( rle_idxs)
87+ } ) ;
8388 }
8489
8590 if !remainder. is_empty ( ) {
8691 // Repeat the last value for padding to prevent
8792 // accounting for an additional value change.
8893 let mut padded_chunk = [ values[ len - 1 ] ; FL_CHUNK_SIZE ] ;
8994 padded_chunk[ ..remainder. len ( ) ] . copy_from_slice ( remainder) ;
90- process_chunk ( ( len / FL_CHUNK_SIZE ) * FL_CHUNK_SIZE , & padded_chunk) ;
95+ let last_idx_chunk = indices_uninit
96+ . last_mut ( )
97+ . vortex_expect ( "Must have the trailing chunk" ) ;
98+ process_chunk (
99+ ( len / FL_CHUNK_SIZE ) * FL_CHUNK_SIZE ,
100+ & padded_chunk,
101+ unsafe { std:: mem:: transmute ( last_idx_chunk) } ,
102+ ) ;
91103 }
92104
93105 unsafe {
@@ -137,10 +149,14 @@ mod tests {
137149 use rstest:: rstest;
138150 use vortex_array:: IntoArray ;
139151 use vortex_array:: ToCanonical ;
152+ use vortex_array:: arrays:: ConstantArray ;
153+ use vortex_array:: arrays:: MaskedArray ;
154+ use vortex_array:: arrays:: PrimitiveArray ;
140155 use vortex_array:: assert_arrays_eq;
141156 use vortex_array:: dtype:: half:: f16;
142157 use vortex_buffer:: Buffer ;
143158 use vortex_buffer:: buffer;
159+ use vortex_error:: VortexResult ;
144160
145161 use super :: * ;
146162
@@ -258,6 +274,89 @@ mod tests {
258274 assert_arrays_eq ! ( decoded, expected) ;
259275 }
260276
277+ /// Rebuilds `rle` with its indices child replaced by `MaskedArray(ConstantArray(1u16), validity)`, reusing the original indices' validity.
278+ ///
279+ /// Simulates a compressor that represents indices as a masked constant.
280+ /// Valid when every chunk has at least two RLE dictionary entries (the
281+ /// fill-forward default at index 0 and the actual value at index 1), which
282+ /// holds whenever the first position of each chunk is null.
283+ fn with_masked_constant_indices ( rle : & RLEArray ) -> VortexResult < RLEArray > {
284+ let indices_prim = rle. indices ( ) . to_primitive ( ) ; // primitive view of the indices; only its length and validity are used below
285+ let masked_indices = MaskedArray :: try_new (
286+ ConstantArray :: new ( 1u16 , indices_prim. len ( ) ) . into_array ( ) , // every slot holds the constant index 1
287+ indices_prim. validity ( ) , // keep the null mask of the original indices
288+ ) ?
289+ . into_array ( ) ;
290+ // SAFETY: we only replace the indices child; dtype and length are unchanged
291+ // and index 1 is valid in every chunk because each has ≥ 2 dictionary entries.
292+ unsafe {
293+ RLEArray :: try_from_data ( RLEData :: new_unchecked (
294+ rle. values ( ) . clone ( ) ,
295+ masked_indices,
296+ rle. values_idx_offsets ( ) . clone ( ) ,
297+ rle. dtype ( ) . clone ( ) ,
298+ rle. offset ( ) ,
299+ rle. len ( ) ,
300+ ) )
301+ }
302+ }
303+
304+ #[ test]
305+ fn test_encode_all_null_chunk ( ) -> VortexResult < ( ) > {
306+ let values: Vec < Option < u32 > > = vec ! [ None ; FL_CHUNK_SIZE ] ; // FL_CHUNK_SIZE elements (one full chunk), every slot null
307+ let original = PrimitiveArray :: from_option_iter ( values) ;
308+ let rle = RLEData :: encode ( & original) ?;
309+ let decoded = with_masked_constant_indices ( & rle) ?; // still an RLEArray; only its indices child was swapped
310+ assert_arrays_eq ! ( decoded, original) ; // NOTE(review): presumably compares logical contents across encodings — confirm
311+ Ok ( ( ) )
312+ }
313+
314+ #[ test]
315+ fn test_encode_all_null_chunk_then_value_chunk ( ) -> VortexResult < ( ) > {
316+ // Two full chunks: the first is entirely null; the second holds a single value (42) at offset 100, with nulls on both sides of it.
317+ let mut values: Vec < Option < u32 > > = vec ! [ None ; 2 * FL_CHUNK_SIZE ] ;
318+ values[ FL_CHUNK_SIZE + 100 ] = Some ( 42 ) ;
319+ let original = PrimitiveArray :: from_option_iter ( values) ;
320+ let rle = RLEData :: encode ( & original) ?;
321+ let decoded = with_masked_constant_indices ( & rle) ?; // still an RLEArray; only its indices child was swapped
322+ assert_arrays_eq ! ( decoded, original) ;
323+ Ok ( ( ) )
324+ }
325+
326+ #[ test]
327+ fn test_encode_one_value_near_end ( ) -> VortexResult < ( ) > {
328+ // One chunk that is null everywhere except a single value (42) near the end of the chunk.
329+ let mut values: Vec < Option < u32 > > = vec ! [ None ; FL_CHUNK_SIZE ] ;
330+ values[ 1000 ] = Some ( 42 ) ; // NOTE(review): "near the end" assumes FL_CHUNK_SIZE is 1024 — confirm
331+ let original = PrimitiveArray :: from_option_iter ( values) ;
332+ let rle = RLEData :: encode ( & original) ?;
333+ let decoded = with_masked_constant_indices ( & rle) ?; // still an RLEArray; only its indices child was swapped
334+ assert_arrays_eq ! ( decoded, original) ;
335+ Ok ( ( ) )
336+ }
337+
338+ #[ test]
339+ fn test_encode_value_chunk_then_all_null_remainder ( ) -> VortexResult < ( ) > {
340+ // 1085 elements: one full 1024-element chunk plus a 61-element remainder, which the encoder pads out to a full chunk.
341+ // Chunk 0 has -1i16 at scattered positions (273..=366), rest null.
342+ // Chunk 1 (the remainder) is entirely null.
343+ const NEG1_POSITIONS : & [ usize ] = & [
344+ 273 , 276 , 277 , 278 , 279 , 281 , 282 , 284 , 285 , 286 , 287 , 288 , 289 , 291 , 292 , 293 , 296 ,
345+ 298 , 299 , 302 , 304 , 308 , 310 , 311 , 313 , 314 , 315 , 317 , 318 , 322 , 324 , 325 , 334 , 335 ,
346+ 336 , 337 , 338 , 339 , 340 , 341 , 342 , 343 , 344 , 346 , 347 , 348 , 350 , 352 , 353 , 355 , 358 ,
347+ 359 , 362 , 363 , 364 , 366 ,
348+ ] ;
349+ let mut values: Vec < Option < i16 > > = vec ! [ None ; 1085 ] ;
350+ for & pos in NEG1_POSITIONS { // all listed positions are below 1024, so only chunk 0 receives values
351+ values[ pos] = Some ( -1 ) ;
352+ }
353+ let original = PrimitiveArray :: from_option_iter ( values) ;
354+ let rle = RLEData :: encode ( & original) ?;
355+ let decoded = with_masked_constant_indices ( & rle) ?; // still an RLEArray; only its indices child was swapped
356+ assert_arrays_eq ! ( decoded, original) ;
357+ Ok ( ( ) )
358+ }
359+
261360 // Regression test: RLE compression properly supports decoding pos/neg zeros
262361 // See <https://github.com/vortex-data/vortex/issues/6491>
263362 #[ rstest]
0 commit comments