Skip to content

Commit a1ba67f

Browse files
committed
OnPair: regression tests for narrowed codes_offsets in filter
Two tests rebuild a compressed array with `codes_offsets` deliberately narrowed (u32 → u16, then u32 → u8) — the shape the cascading compressor produces for short-row corpora — and assert that `<OnPair as FilterKernel>::filter` succeeds and returns the expected rows. Pre-fix (`as_slice::<u32>()` hard-coded), both tests panic with "Other error: Attempted to get slice of type u32 from array of type u16". Post-fix (match_each_integer_ptype! dispatch), both pass. Also drops a redundant function-scoped `use FilterKernel` since the trait is now imported at module scope. Signed-off-by: claude <claude@anthropic.com>
1 parent 87011ec commit a1ba67f

1 file changed

Lines changed: 135 additions & 1 deletion

File tree

encodings/onpair/src/tests.rs

Lines changed: 135 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,13 @@ use vortex_array::IntoArray;
88
use vortex_array::VortexSessionExecute;
99
use vortex_array::accessor::ArrayAccessor;
1010
use vortex_array::arrays::ConstantArray;
11+
use vortex_array::arrays::PrimitiveArray;
1112
use vortex_array::arrays::VarBinArray;
1213
use vortex_array::arrays::VarBinViewArray;
14+
use vortex_array::arrays::filter::FilterKernel;
15+
use vortex_array::match_each_integer_ptype;
16+
use vortex_array::validity::Validity;
17+
use vortex_buffer::BufferMut;
1318
use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt;
1419
use vortex_array::builtins::ArrayBuiltins;
1520
use vortex_array::dtype::DType;
@@ -282,7 +287,6 @@ fn test_onpair_filter_shares_dict() {
282287
.filter_map(|(i, s)| keep[i].then_some(s.as_str()))
283288
.collect();
284289

285-
use vortex_array::arrays::filter::FilterKernel;
286290
let mut filter_ctx = SESSION.create_execution_ctx();
287291
let filtered = <OnPair as FilterKernel>::filter(arr.as_view(), &mask, &mut filter_ctx)
288292
.unwrap()
@@ -314,3 +318,133 @@ fn test_onpair_filter_shares_dict() {
314318
})
315319
.unwrap();
316320
}
321+
322+
/// Rebuild an OnPair array, swapping `codes_offsets` for a narrowed
323+
/// (smaller-ptype) primitive copy. Used by the narrowed-child
324+
/// regression tests below.
325+
fn narrow_codes_offsets(
326+
arr: &crate::OnPairArray,
327+
target: PType,
328+
) -> crate::OnPairArray {
329+
let view = arr.as_view();
330+
let mut ctx = SESSION.create_execution_ctx();
331+
let original = view
332+
.codes_offsets()
333+
.clone()
334+
.execute::<PrimitiveArray>(&mut ctx)
335+
.unwrap();
336+
337+
let narrowed_array = match_each_integer_ptype!(original.ptype(), |SRC| {
338+
let src = original.as_slice::<SRC>();
339+
match_each_integer_ptype!(target, |DST| {
340+
let mut buf = BufferMut::<DST>::with_capacity(src.len());
341+
for &v in src {
342+
buf.push(DST::try_from(v as u64).expect("value must fit in target ptype"));
343+
}
344+
PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array()
345+
})
346+
});
347+
348+
unsafe {
349+
OnPair::new_unchecked(
350+
view.dtype().clone(),
351+
view.dict_bytes_handle().clone(),
352+
view.dict_offsets().clone(),
353+
view.codes().clone(),
354+
narrowed_array,
355+
view.uncompressed_lengths().clone(),
356+
view.array_validity(),
357+
view.bits(),
358+
)
359+
}
360+
}
361+
362+
/// Regression: the cascading compressor can narrow `codes_offsets`
363+
/// from u32 → u16 when every row's token count is small. The previous
364+
/// `filter` impl read the child as `as_slice::<u32>()` and panicked
365+
/// with `Other error: Attempted to get slice of type u32 from array
366+
/// of type u16`. The fix dispatches via `match_each_integer_ptype!`.
367+
#[cfg_attr(miri, ignore)]
368+
#[test]
369+
fn test_onpair_filter_with_narrowed_codes_offsets_u16() {
370+
let n = 200usize;
371+
// Short rows so per-row token counts stay small and codes_offsets
372+
// values fit in u16. (We narrow manually below regardless — this
373+
// matches the shape the cascading compressor produces in the
374+
// wild.)
375+
let strings: Vec<String> = (0..n).map(|i| format!("r{:03}", i)).collect();
376+
let varbin = VarBinArray::from_iter(
377+
strings.iter().map(|s| Some(s.as_bytes())),
378+
DType::Utf8(Nullability::NonNullable),
379+
);
380+
let arr =
381+
onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap();
382+
383+
// Force `codes_offsets` to u16 so the panicking pre-fix
384+
// `as_slice::<u32>()` would fire.
385+
let arr = narrow_codes_offsets(&arr, PType::U16);
386+
assert_eq!(
387+
arr.as_view().codes_offsets().dtype().as_ptype(),
388+
PType::U16,
389+
"codes_offsets must be u16 to exercise the regression path"
390+
);
391+
392+
let keep: Vec<bool> = (0..n).map(|i| i % 3 == 0).collect();
393+
let mask = vortex_mask::Mask::from_iter(keep.iter().copied());
394+
let expected: Vec<&str> = strings
395+
.iter()
396+
.enumerate()
397+
.filter_map(|(i, s)| keep[i].then_some(s.as_str()))
398+
.collect();
399+
400+
let mut filter_ctx = SESSION.create_execution_ctx();
401+
// Pre-fix: this call panics with "Attempted to get slice of type
402+
// u32 from array of type u16". Post-fix: succeeds.
403+
let filtered = <OnPair as FilterKernel>::filter(arr.as_view(), &mask, &mut filter_ctx)
404+
.unwrap()
405+
.expect("OnPair filter must return Some");
406+
let typed = filtered.try_downcast::<OnPair>().expect("OnPair");
407+
assert_eq!(typed.len(), expected.len());
408+
409+
let mut ctx = SESSION.create_execution_ctx();
410+
let canonical = typed
411+
.into_array()
412+
.execute::<VarBinViewArray>(&mut ctx)
413+
.unwrap();
414+
canonical
415+
.with_iterator(|iter| {
416+
let got: Vec<Option<Vec<u8>>> = iter.map(|b| b.map(|s| s.to_vec())).collect();
417+
assert_eq!(got.len(), expected.len());
418+
for (i, want) in expected.iter().enumerate() {
419+
assert_eq!(got[i].as_deref(), Some(want.as_bytes()), "row {i}");
420+
}
421+
Ok::<_, vortex_error::VortexError>(())
422+
})
423+
.unwrap();
424+
}
425+
426+
/// Same regression, narrowed to u8 (smallest possible ptype) — extra
427+
/// coverage that the macro dispatch handles every integer ptype the
428+
/// cascading compressor might pick.
429+
#[cfg_attr(miri, ignore)]
430+
#[test]
431+
fn test_onpair_filter_with_narrowed_codes_offsets_u8() {
432+
let n = 100usize;
433+
let strings: Vec<String> = (0..n).map(|i| format!("{i}")).collect();
434+
let varbin = VarBinArray::from_iter(
435+
strings.iter().map(|s| Some(s.as_bytes())),
436+
DType::Utf8(Nullability::NonNullable),
437+
);
438+
let arr =
439+
onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap();
440+
let arr = narrow_codes_offsets(&arr, PType::U8);
441+
assert_eq!(arr.as_view().codes_offsets().dtype().as_ptype(), PType::U8);
442+
443+
let mask = vortex_mask::Mask::from_iter((0..n).map(|i| i % 2 == 0));
444+
445+
let mut filter_ctx = SESSION.create_execution_ctx();
446+
let filtered = <OnPair as FilterKernel>::filter(arr.as_view(), &mask, &mut filter_ctx)
447+
.unwrap()
448+
.expect("OnPair filter must return Some");
449+
assert_eq!(filtered.len(), n / 2);
450+
}

0 commit comments

Comments
 (0)