@@ -8,8 +8,13 @@ use vortex_array::IntoArray;
88use vortex_array:: VortexSessionExecute ;
99use vortex_array:: accessor:: ArrayAccessor ;
1010use vortex_array:: arrays:: ConstantArray ;
11+ use vortex_array:: arrays:: PrimitiveArray ;
1112use vortex_array:: arrays:: VarBinArray ;
1213use vortex_array:: arrays:: VarBinViewArray ;
14+ use vortex_array:: arrays:: filter:: FilterKernel ;
15+ use vortex_array:: match_each_integer_ptype;
16+ use vortex_array:: validity:: Validity ;
17+ use vortex_buffer:: BufferMut ;
1318use vortex_array:: arrays:: scalar_fn:: ScalarFnFactoryExt ;
1419use vortex_array:: builtins:: ArrayBuiltins ;
1520use vortex_array:: dtype:: DType ;
@@ -282,7 +287,6 @@ fn test_onpair_filter_shares_dict() {
282287 . filter_map ( |( i, s) | keep[ i] . then_some ( s. as_str ( ) ) )
283288 . collect ( ) ;
284289
285- use vortex_array:: arrays:: filter:: FilterKernel ;
286290 let mut filter_ctx = SESSION . create_execution_ctx ( ) ;
287291 let filtered = <OnPair as FilterKernel >:: filter ( arr. as_view ( ) , & mask, & mut filter_ctx)
288292 . unwrap ( )
@@ -314,3 +318,133 @@ fn test_onpair_filter_shares_dict() {
314318 } )
315319 . unwrap ( ) ;
316320}
321+
322+ /// Rebuild an OnPair array, swapping `codes_offsets` for a narrowed
323+ /// (smaller-ptype) primitive copy. Used by the narrowed-child
324+ /// regression tests below.
325+ fn narrow_codes_offsets (
326+ arr : & crate :: OnPairArray ,
327+ target : PType ,
328+ ) -> crate :: OnPairArray {
329+ let view = arr. as_view ( ) ;
330+ let mut ctx = SESSION . create_execution_ctx ( ) ;
331+ let original = view
332+ . codes_offsets ( )
333+ . clone ( )
334+ . execute :: < PrimitiveArray > ( & mut ctx)
335+ . unwrap ( ) ;
336+
337+ let narrowed_array = match_each_integer_ptype ! ( original. ptype( ) , |SRC | {
338+ let src = original. as_slice:: <SRC >( ) ;
339+ match_each_integer_ptype!( target, |DST | {
340+ let mut buf = BufferMut :: <DST >:: with_capacity( src. len( ) ) ;
341+ for & v in src {
342+ buf. push( DST :: try_from( v as u64 ) . expect( "value must fit in target ptype" ) ) ;
343+ }
344+ PrimitiveArray :: new( buf. freeze( ) , Validity :: NonNullable ) . into_array( )
345+ } )
346+ } ) ;
347+
348+ unsafe {
349+ OnPair :: new_unchecked (
350+ view. dtype ( ) . clone ( ) ,
351+ view. dict_bytes_handle ( ) . clone ( ) ,
352+ view. dict_offsets ( ) . clone ( ) ,
353+ view. codes ( ) . clone ( ) ,
354+ narrowed_array,
355+ view. uncompressed_lengths ( ) . clone ( ) ,
356+ view. array_validity ( ) ,
357+ view. bits ( ) ,
358+ )
359+ }
360+ }
361+
362+ /// Regression: the cascading compressor can narrow `codes_offsets`
363+ /// from u32 → u16 when every row's token count is small. The previous
364+ /// `filter` impl read the child as `as_slice::<u32>()` and panicked
365+ /// with `Other error: Attempted to get slice of type u32 from array
366+ /// of type u16`. The fix dispatches via `match_each_integer_ptype!`.
367+ #[ cfg_attr( miri, ignore) ]
368+ #[ test]
369+ fn test_onpair_filter_with_narrowed_codes_offsets_u16 ( ) {
370+ let n = 200usize ;
371+ // Short rows so per-row token counts stay small and codes_offsets
372+ // values fit in u16. (We narrow manually below regardless — this
373+ // matches the shape the cascading compressor produces in the
374+ // wild.)
375+ let strings: Vec < String > = ( 0 ..n) . map ( |i| format ! ( "r{:03}" , i) ) . collect ( ) ;
376+ let varbin = VarBinArray :: from_iter (
377+ strings. iter ( ) . map ( |s| Some ( s. as_bytes ( ) ) ) ,
378+ DType :: Utf8 ( Nullability :: NonNullable ) ,
379+ ) ;
380+ let arr =
381+ onpair_compress ( & varbin, varbin. len ( ) , varbin. dtype ( ) , DEFAULT_DICT12_CONFIG ) . unwrap ( ) ;
382+
383+ // Force `codes_offsets` to u16 so the panicking pre-fix
384+ // `as_slice::<u32>()` would fire.
385+ let arr = narrow_codes_offsets ( & arr, PType :: U16 ) ;
386+ assert_eq ! (
387+ arr. as_view( ) . codes_offsets( ) . dtype( ) . as_ptype( ) ,
388+ PType :: U16 ,
389+ "codes_offsets must be u16 to exercise the regression path"
390+ ) ;
391+
392+ let keep: Vec < bool > = ( 0 ..n) . map ( |i| i % 3 == 0 ) . collect ( ) ;
393+ let mask = vortex_mask:: Mask :: from_iter ( keep. iter ( ) . copied ( ) ) ;
394+ let expected: Vec < & str > = strings
395+ . iter ( )
396+ . enumerate ( )
397+ . filter_map ( |( i, s) | keep[ i] . then_some ( s. as_str ( ) ) )
398+ . collect ( ) ;
399+
400+ let mut filter_ctx = SESSION . create_execution_ctx ( ) ;
401+ // Pre-fix: this call panics with "Attempted to get slice of type
402+ // u32 from array of type u16". Post-fix: succeeds.
403+ let filtered = <OnPair as FilterKernel >:: filter ( arr. as_view ( ) , & mask, & mut filter_ctx)
404+ . unwrap ( )
405+ . expect ( "OnPair filter must return Some" ) ;
406+ let typed = filtered. try_downcast :: < OnPair > ( ) . expect ( "OnPair" ) ;
407+ assert_eq ! ( typed. len( ) , expected. len( ) ) ;
408+
409+ let mut ctx = SESSION . create_execution_ctx ( ) ;
410+ let canonical = typed
411+ . into_array ( )
412+ . execute :: < VarBinViewArray > ( & mut ctx)
413+ . unwrap ( ) ;
414+ canonical
415+ . with_iterator ( |iter| {
416+ let got: Vec < Option < Vec < u8 > > > = iter. map ( |b| b. map ( |s| s. to_vec ( ) ) ) . collect ( ) ;
417+ assert_eq ! ( got. len( ) , expected. len( ) ) ;
418+ for ( i, want) in expected. iter ( ) . enumerate ( ) {
419+ assert_eq ! ( got[ i] . as_deref( ) , Some ( want. as_bytes( ) ) , "row {i}" ) ;
420+ }
421+ Ok :: < _ , vortex_error:: VortexError > ( ( ) )
422+ } )
423+ . unwrap ( ) ;
424+ }
425+
426+ /// Same regression, narrowed to u8 (smallest possible ptype) — extra
427+ /// coverage that the macro dispatch handles every integer ptype the
428+ /// cascading compressor might pick.
429+ #[ cfg_attr( miri, ignore) ]
430+ #[ test]
431+ fn test_onpair_filter_with_narrowed_codes_offsets_u8 ( ) {
432+ let n = 100usize ;
433+ let strings: Vec < String > = ( 0 ..n) . map ( |i| format ! ( "{i}" ) ) . collect ( ) ;
434+ let varbin = VarBinArray :: from_iter (
435+ strings. iter ( ) . map ( |s| Some ( s. as_bytes ( ) ) ) ,
436+ DType :: Utf8 ( Nullability :: NonNullable ) ,
437+ ) ;
438+ let arr =
439+ onpair_compress ( & varbin, varbin. len ( ) , varbin. dtype ( ) , DEFAULT_DICT12_CONFIG ) . unwrap ( ) ;
440+ let arr = narrow_codes_offsets ( & arr, PType :: U8 ) ;
441+ assert_eq ! ( arr. as_view( ) . codes_offsets( ) . dtype( ) . as_ptype( ) , PType :: U8 ) ;
442+
443+ let mask = vortex_mask:: Mask :: from_iter ( ( 0 ..n) . map ( |i| i % 2 == 0 ) ) ;
444+
445+ let mut filter_ctx = SESSION . create_execution_ctx ( ) ;
446+ let filtered = <OnPair as FilterKernel >:: filter ( arr. as_view ( ) , & mask, & mut filter_ctx)
447+ . unwrap ( )
448+ . expect ( "OnPair filter must return Some" ) ;
449+ assert_eq ! ( filtered. len( ) , n / 2 ) ;
450+ }
0 commit comments