vortex-data
diff --git a/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎encodings/onpair/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎encodings/onpair/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎encodings/onpair/benches/decode.rs‎
Lines changed: 88 additions & 3 deletions b/‎encodings/onpair/benches/decode.rs‎
Lines changed: 88 additions & 3 deletions
diff --git a/‎encodings/onpair/src/compute/compare.rs‎
Lines changed: 33 additions & 35 deletions b/‎encodings/onpair/src/compute/compare.rs‎
Lines changed: 33 additions & 35 deletions
diff --git a/‎encodings/onpair/src/compute/filter.rs‎
Lines changed: 83 additions & 15 deletions b/‎encodings/onpair/src/compute/filter.rs‎
Lines changed: 83 additions & 15 deletions
@@ -17,6 +17,7 @@ version = { workspace = true }
 workspace = true
 
 [dependencies]
+memchr = { version = "2.8.0" }
 parking_lot = { workspace = true }
 prost = { workspace = true }
 vortex-array = { workspace = true }
 
@@ -20,21 +20,31 @@
     clippy::panic,
     clippy::tests_outside_test_module,
     clippy::redundant_clone,
-    clippy::missing_safety_doc
+    clippy::missing_safety_doc,
+    clippy::unwrap_used,
+    clippy::expect_used
 )]
 
 use std::sync::LazyLock;
 
 use divan::Bencher;
 use vortex_array::IntoArray;
 use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::ConstantArray;
 use vortex_array::arrays::VarBinArray;
 use vortex_array::arrays::VarBinViewArray;
+use vortex_array::arrays::filter::FilterKernel;
 use vortex_array::dtype::DType;
 use vortex_array::dtype::Nullability;
+use vortex_array::scalar_fn::fns::binary::CompareKernel;
+use vortex_array::scalar_fn::fns::like::LikeKernel;
+use vortex_array::scalar_fn::fns::like::LikeOptions;
+use vortex_array::scalar_fn::fns::operators::CompareOperator;
 use vortex_array::session::ArraySession;
+use vortex_mask::Mask;
 use vortex_onpair::DEFAULT_DICT12_CONFIG;
 use vortex_onpair::MAX_TOKEN_SIZE;
+use vortex_onpair::OnPair;
 use vortex_onpair::OnPairArray;
 use vortex_onpair::decode::OwnedDecodeInputs;
 use vortex_onpair::onpair_compress;
@@ -83,8 +93,7 @@ fn corpus(n: usize, shape: Shape) -> Vec<String> {
             }
         }
         Shape::Short => {
-            let templates: &[&str] =
-                &["alpha", "beta", "gamma", "delta", "eps", "zeta", "eta"];
+            let templates: &[&str] = &["alpha", "beta", "gamma", "delta", "eps", "zeta", "eta"];
             for _ in 0..n {
                 let s = next();
                 out.push(templates[(s as usize) % templates.len()].to_string());
@@ -179,6 +188,82 @@ fn canonicalize_to_varbinview(bencher: Bencher, case: (Shape, usize)) {
         });
 }
 
+// ─── Compute kernels ─────────────────────────────────────────────────────
+
+const COMPUTE_CASES: &[(Shape, usize)] = &[(Shape::UrlLog, 100_000), (Shape::UrlLog, 1_000_000)];
+
+/// `Eq` against a literal (token-aware fast path: no row decode, just
+/// `&[u16]` comparison).
+#[divan::bench(args = COMPUTE_CASES)]
+fn eq_constant(bencher: Bencher, case: (Shape, usize)) {
+    let (shape, n) = case;
+    let arr = compress(n, shape);
+    let strings = corpus(n, shape);
+    // Pick the very first row's value as the needle so we always hit at
+    // least one match.
+    let needle = strings[0].clone();
+    bencher.bench_local(|| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let result = <OnPair as CompareKernel>::compare(
+            arr.as_view(),
+            &ConstantArray::new(needle.as_str(), n).into_array(),
+            CompareOperator::Eq,
+            &mut ctx,
+        )
+        .unwrap()
+        .unwrap();
+        divan::black_box(result);
+    });
+}
+
+/// `LIKE 'prefix%'` — byte-streaming row prefix check.
+#[divan::bench(args = COMPUTE_CASES)]
+fn like_prefix(bencher: Bencher, case: (Shape, usize)) {
+    let (shape, n) = case;
+    let arr = compress(n, shape);
+    bencher.bench_local(|| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let pattern = ConstantArray::new("https://www.%", n).into_array();
+        let result =
+            <OnPair as LikeKernel>::like(arr.as_view(), &pattern, LikeOptions::default(), &mut ctx)
+                .unwrap()
+                .unwrap();
+        divan::black_box(result);
+    });
+}
+
+/// `LIKE '%substring%'` — `memchr::memmem::Finder` over decoded rows.
+#[divan::bench(args = COMPUTE_CASES)]
+fn like_contains(bencher: Bencher, case: (Shape, usize)) {
+    let (shape, n) = case;
+    let arr = compress(n, shape);
+    bencher.bench_local(|| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let pattern = ConstantArray::new("%example.com%", n).into_array();
+        let result =
+            <OnPair as LikeKernel>::like(arr.as_view(), &pattern, LikeOptions::default(), &mut ctx)
+                .unwrap()
+                .unwrap();
+        divan::black_box(result);
+    });
+}
+
+/// Filter — share-dict path. Builds a 1-in-7 mask so we keep ~14 % of
+/// rows; the cost is dominated by the `codes` segment copy + offsets.
+#[divan::bench(args = COMPUTE_CASES)]
+fn filter_share_dict(bencher: Bencher, case: (Shape, usize)) {
+    let (shape, n) = case;
+    let arr = compress(n, shape);
+    let mask = Mask::from_iter((0..n).map(|i| i % 7 == 0));
+    bencher.bench_local(|| {
+        let mut ctx = SESSION.create_execution_ctx();
+        let result = <OnPair as FilterKernel>::filter(arr.as_view(), &mask, &mut ctx)
+            .unwrap()
+            .unwrap();
+        divan::black_box(result);
+    });
+}
+
 fn main() {
     divan::main();
 }
@@ -1,9 +1,19 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 //
-//! `Eq` / `NotEq` against a constant. Each row's decoded bytes are streamed
-//! through `DecodeView::for_each_dict_slice`, comparing prefix-wise against
-//! the needle, so most non-matches short-circuit before any decode work.
+//! `Eq` / `NotEq` against a constant via **token-aware** comparison.
+//!
+//! OnPair's compressor encodes every byte string deterministically via
+//! greedy LPM against the same dictionary, so two byte strings are
+//! equal **iff** their LPM token sequences are equal. We tokenise the
+//! needle once and then compare the row's `codes[lo..hi]` slice
+//! directly against the tokenised needle as `&[u16]` — no row decode.
+//!
+//! Edge case: if the needle contains a byte that has no dict entry at
+//! all (degenerate dict; OnPair training normally guarantees every
+//! single-byte token), no row can possibly equal the needle, since
+//! every row was compressed against the same dict. We return an
+//! all-zeros bitmap (or all-ones for `NotEq`).
 
 use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
@@ -19,8 +29,9 @@ use vortex_buffer::ByteBuffer;
 use vortex_error::VortexResult;
 
 use crate::OnPair;
-use crate::decode::DecodeView;
 use crate::decode::OwnedDecodeInputs;
+use crate::lpm::DictIndex;
+use crate::lpm::tokenize_needle;
 
 impl CompareKernel for OnPair {
     fn compare(
@@ -43,11 +54,26 @@ impl CompareKernel for OnPair {
         let dv = inputs.view();
         let n = lhs.array().len();
         let mut bytes = vec![0u8; n.div_ceil(8)];
-        for row in 0..n {
-            if row_equals_needle(&dv, row, &needle) {
-                bytes[row / 8] |= 1u8 << (row % 8);
+
+        let index = DictIndex::build(&dv);
+        if let Some(needle_toks) = tokenize_needle(&dv, &index, &needle) {
+            let codes = dv.codes;
+            let codes_offsets = dv.codes_offsets;
+            for r in 0..n {
+                let lo = codes_offsets[r] as usize;
+                let hi = codes_offsets[r + 1] as usize;
+                // SAFETY: codes_offsets validated at construction time.
+                let row_toks = unsafe { codes.get_unchecked(lo..hi) };
+                if row_toks == needle_toks.as_slice() {
+                    bytes[r / 8] |= 1u8 << (r % 8);
+                }
             }
         }
+        // If `tokenize_needle` returned None, no row can equal the
+        // needle (every row was compressed against the same dict, so
+        // any byte not in the dict can't appear in any row either).
+        // Leave the bitmap zeroed.
+
         let mut bool_buf = BitBuffer::new(ByteBuffer::from(bytes), n);
         if operator == CompareOperator::NotEq {
             bool_buf = !bool_buf;
@@ -67,31 +93,3 @@ fn needle_bytes(scalar: &Scalar) -> Option<Vec<u8>> {
         _ => None,
     }
 }
-
-/// True iff row `r` decodes to exactly `needle`.
-fn row_equals_needle(dv: &DecodeView<'_>, r: usize, needle: &[u8]) -> bool {
-    let mut pos = 0usize;
-    let n = needle.len();
-    let needle_ptr = needle.as_ptr();
-    let ok = dv.for_each_dict_slice(r, |slice| {
-        let take = slice.len();
-        // Fast-path: bail on length overflow first so we never compare a
-        // partial slice that would walk past `needle`.
-        if pos + take > n {
-            return false;
-        }
-        // SAFETY: `pos + take <= n`, `take == slice.len()`. Compares
-        // `needle[pos..pos+take]` with `slice` via raw `memcmp`-style
-        // pointer math. The branch on length above is the only check.
-        let eq = unsafe {
-            let lhs = needle_ptr.add(pos);
-            std::slice::from_raw_parts(lhs, take) == slice
-        };
-        if !eq {
-            return false;
-        }
-        pos += take;
-        true
-    });
-    ok && pos == n
-}
@@ -1,40 +1,108 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 //
-//! Filter is implemented as a re-compress through canonical because OnPair's
-//! `codes` for surviving rows would also need to be re-laid out (the codes
-//! belong to whole rows, not single elements), and re-training keeps the
-//! resulting dictionary tight to the surviving data. Slice is cheaper — see
-//! `slice.rs` — because we can just sub-slice `codes_offsets` /
-//! `uncompressed_lengths`.
+//! Filter that **shares the dictionary**. The previous implementation
+//! decoded the whole array, filtered the canonical bytes, and re-trained
+//! a brand-new OnPair dictionary on the surviving rows — order-of-
+//! magnitude regressions on TPC-H Q22 at SF=10 traced back to that cost
+//! (the customer table's `c_phone` column gets two consecutive filters,
+//! each of which was paying full `Column::compress` training overhead).
+//!
+//! FSST-shape filter: keep `dict_bytes` + `dict_offsets` **identical**
+//! to the input; rebuild only `codes`, `codes_offsets`,
+//! `uncompressed_lengths`, and validity by walking the mask. No decode,
+//! no retrain, no C++ call on the read path.
 
 use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
-use vortex_array::Canonical;
 use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
+use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::filter::FilterKernel;
+use vortex_array::match_each_integer_ptype;
+use vortex_buffer::BufferMut;
 use vortex_error::VortexResult;
+use vortex_error::vortex_err;
 use vortex_mask::Mask;
 
 use crate::OnPair;
-use crate::compress::DEFAULT_DICT12_CONFIG;
-use crate::compress::onpair_compress_array;
+use crate::OnPairArrayExt;
 
 impl FilterKernel for OnPair {
     fn filter(
         array: ArrayView<'_, Self>,
         mask: &Mask,
         ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<ArrayRef>> {
-        let canonical = array
-            .array()
+        let n_in = array.array().len();
+        let n_out = mask.true_count();
+
+        // Materialise the per-row offset arrays we walk during filtering.
+        // The codes themselves we read through whatever ptype the
+        // cascading compressor narrowed to — match_each_integer_ptype
+        // dispatches on it below.
+        let codes_offsets_arr = array
+            .codes_offsets()
             .clone()
-            .execute::<Canonical>(ctx)?
-            .into_array();
-        let filtered = canonical.filter(mask.clone())?;
+            .execute::<PrimitiveArray>(ctx)?;
+        let codes_arr = array.codes().clone().execute::<PrimitiveArray>(ctx)?;
+        let codes_offsets = codes_offsets_arr.as_slice::<u32>();
+
+        // First pass: sum the surviving token count so we reserve once.
+        let mut new_codes_len: usize = 0;
+        for r in 0..n_in {
+            if mask.value(r) {
+                new_codes_len += (codes_offsets[r + 1] - codes_offsets[r]) as usize;
+            }
+        }
+
+        let mut new_codes_offsets = BufferMut::<u32>::with_capacity(n_out + 1);
+        // SAFETY: capacity reserved.
+        unsafe { new_codes_offsets.push_unchecked(0u32) };
+
+        let new_codes: ArrayRef = match_each_integer_ptype!(codes_arr.ptype(), |P| {
+            let codes = codes_arr.as_slice::<P>();
+            let mut out = BufferMut::<P>::with_capacity(new_codes_len);
+            let mut cursor: u32 = 0;
+            for r in 0..n_in {
+                if mask.value(r) {
+                    let lo = codes_offsets[r] as usize;
+                    let hi = codes_offsets[r + 1] as usize;
+                    // SAFETY: codes_offsets validated at construction.
+                    let segment = unsafe { codes.get_unchecked(lo..hi) };
+                    out.extend_from_slice(segment);
+                    let segment_len = u32::try_from(hi - lo)
+                        .map_err(|_| vortex_err!("token segment overflows u32"))?;
+                    cursor = cursor
+                        .checked_add(segment_len)
+                        .ok_or_else(|| vortex_err!("codes_offsets overflow u32"))?;
+                    // SAFETY: capacity reserved (n_out + 1 entries).
+                    unsafe { new_codes_offsets.push_unchecked(cursor) };
+                }
+            }
+            out.freeze().into_array()
+        });
+
+        // uncompressed_lengths + validity flow through the standard
+        // primitive filter — these are short integer arrays so the cost
+        // is negligible compared to the (avoided) recompress.
+        let uncompressed_lengths = array.uncompressed_lengths().clone().filter(mask.clone())?;
+        let validity = array.array_validity().filter(mask)?;
+
         Ok(Some(
-            onpair_compress_array(&filtered, DEFAULT_DICT12_CONFIG, ctx)?.into_array(),
+            unsafe {
+                OnPair::new_unchecked(
+                    array.dtype().clone(),
+                    array.dict_bytes_handle().clone(),
+                    array.dict_offsets().clone(),
+                    new_codes,
+                    new_codes_offsets.freeze().into_array(),
+                    uncompressed_lengths,
+                    validity,
+                    array.bits(),
+                )
+            }
+            .into_array(),
         ))
     }
 }