vortex-data
diff --git a/‎Cargo.lock‎
Lines changed: 0 additions & 1 deletion b/‎Cargo.lock‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎vortex-tensor/benches/similarity_search.rs‎
Lines changed: 3 additions & 8 deletions b/‎vortex-tensor/benches/similarity_search.rs‎
Lines changed: 3 additions & 8 deletions
diff --git a/‎vortex-tensor/benches/similarity_search_common/mod.rs‎
Lines changed: 26 additions & 94 deletions b/‎vortex-tensor/benches/similarity_search_common/mod.rs‎
Lines changed: 26 additions & 94 deletions
diff --git a/‎vortex-tensor/public-api.lock‎
Lines changed: 8 additions & 0 deletions b/‎vortex-tensor/public-api.lock‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎vortex-tensor/src/lib.rs‎
Lines changed: 2 additions & 0 deletions b/‎vortex-tensor/src/lib.rs‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎vortex-tensor/src/scalar_fns/inner_product.rs‎
Lines changed: 48 additions & 12 deletions b/‎vortex-tensor/src/scalar_fns/inner_product.rs‎
Lines changed: 48 additions & 12 deletions
@@ -26,6 +26,7 @@ mod common;
 
 use common::Variant;
 use common::build_similarity_search_tree;
+use common::build_variant;
 use common::extract_row_as_query;
 use common::generate_random_vectors;
 
@@ -64,14 +65,8 @@ fn bench_variant(bencher: Bencher<'_, '_>, variant: Variant) {
             // the query identical across all three variants.
             let raw = generate_random_vectors(NUM_ROWS, DIM, SEED);
             let query = extract_row_as_query(&raw, 0, DIM);
-            let data = match variant {
-                Variant::Uncompressed => raw,
-                Variant::DefaultCompression => {
-                    common::compress_default(raw).vortex_expect("default compression succeeds")
-                }
-                Variant::TurboQuant => common::compress_turboquant(raw, &mut ctx)
-                    .vortex_expect("turboquant compression succeeds"),
-            };
+            let data = build_variant(variant, NUM_ROWS, DIM, SEED, &mut ctx)
+                .vortex_expect("variant build succeeds");
 
             // println!(
             //     "\n\n{}: {}\n\n",
 
@@ -30,36 +30,30 @@ use vortex_array::ArrayRef;
 use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
 use vortex_array::VortexSessionExecute;
-use vortex_array::arrays::ConstantArray;
 use vortex_array::arrays::Extension;
 use vortex_array::arrays::ExtensionArray;
 use vortex_array::arrays::FixedSizeListArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::extension::ExtensionArrayExt;
 use vortex_array::arrays::fixed_size_list::FixedSizeListArrayExt;
 use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
-use vortex_array::builtins::ArrayBuiltins;
-use vortex_array::dtype::DType;
-use vortex_array::dtype::Nullability;
-use vortex_array::dtype::PType;
 use vortex_array::dtype::extension::ExtDType;
 use vortex_array::extension::EmptyMetadata;
-use vortex_array::scalar::Scalar;
-use vortex_array::scalar_fn::fns::operators::Operator;
 use vortex_array::session::ArraySession;
 use vortex_array::validity::Validity;
 use vortex_btrblocks::BtrBlocksCompressor;
 use vortex_buffer::BufferMut;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
+use vortex_error::vortex_bail;
 use vortex_error::vortex_panic;
 use vortex_session::VortexSession;
 use vortex_tensor::encodings::turboquant::TurboQuantConfig;
 use vortex_tensor::encodings::turboquant::turboquant_encode_unchecked;
-use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity;
 use vortex_tensor::scalar_fns::l2_denorm::L2Denorm;
 use vortex_tensor::scalar_fns::l2_denorm::normalize_as_l2_denorm;
 use vortex_tensor::vector::Vector;
+pub use vortex_tensor::vector_search::build_similarity_search_tree;
 
 /// A shared [`VortexSession`] pre-loaded with the builtin [`ArraySession`] so both bench and
 /// example can create execution contexts cheaply.
@@ -146,65 +140,16 @@ pub fn extract_row_as_query(vectors: &ArrayRef, row: usize, dim: u32) -> Vec<f32
     slice[start..start + dim_usize].to_vec()
 }
 
-/// Build a `Vector<dim, f32>` extension array whose storage is a [`ConstantArray`] broadcasting a
-/// single query vector across `num_rows` rows. This is how we hand a single query vector to
-/// `CosineSimilarity` on the `rhs` side -- `ScalarFnArray` requires both children to have the
-/// same length, so we broadcast the query instead of hand-rolling a 1-row input.
-fn build_constant_query_vector(query: &[f32], num_rows: usize) -> VortexResult<ArrayRef> {
-    let element_dtype = DType::Primitive(PType::F32, Nullability::NonNullable);
-
-    let children: Vec<Scalar> = query
-        .iter()
-        .map(|&v| Scalar::primitive(v, Nullability::NonNullable))
-        .collect();
-    let storage_scalar = Scalar::fixed_size_list(element_dtype, children, Nullability::NonNullable);
-
-    let storage = ConstantArray::new(storage_scalar, num_rows).into_array();
-
-    let ext_dtype = ExtDType::<Vector>::try_new(EmptyMetadata, storage.dtype().clone())?.erased();
-    Ok(ExtensionArray::new(ext_dtype, storage).into_array())
-}
-
-/// Compresses a raw `Vector<dim, f32>` array with the default BtrBlocks pipeline.
-///
-/// [`BtrBlocksCompressor`] walks into the extension array and recursively compresses the
-/// underlying FSL storage child. TurboQuant is *not* exercised by this path -- it is not
-/// registered in the default scheme set -- so this measures "generic" lossless compression
-/// applied to float vectors.
-pub fn compress_default(data: ArrayRef) -> VortexResult<ArrayRef> {
-    BtrBlocksCompressor::default().compress(&data)
-}
-
-/// Compresses a raw `Vector<dim, f32>` array with the TurboQuant pipeline by hand, producing the
-/// same tree shape that
-/// [`vortex_tensor::encodings::turboquant::TurboQuantScheme`] would:
-///
-/// ```text
-/// L2Denorm(SorfTransform(FSL(Dict(codes, centroids))), norms)
-/// ```
-///
-/// Calling the encode helpers directly (instead of going through
-/// `BtrBlocksCompressorBuilder::with_turboquant()`) lets this example avoid depending on the
-/// `unstable_encodings` feature flag.
-///
-/// See `vortex-tensor/src/encodings/turboquant/tests/mod.rs::normalize_and_encode` for the same
-/// canonical recipe.
-pub fn compress_turboquant(data: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef> {
+fn normalize_vectors(
+    data: ArrayRef,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<(ArrayRef, ArrayRef, usize)> {
     let l2_denorm = normalize_as_l2_denorm(data, ctx)?;
     let normalized = l2_denorm.child_at(0).clone();
     let norms = l2_denorm.child_at(1).clone();
     let num_rows = l2_denorm.len();
 
-    let normalized_ext = normalized
-        .as_opt::<Extension>()
-        .vortex_expect("normalized child should be an Extension array");
-
-    let config = TurboQuantConfig::default();
-    // SAFETY: `normalize_as_l2_denorm` guarantees every row is unit-norm (or zero), which is the
-    // invariant `turboquant_encode_unchecked` expects.
-    let tq = unsafe { turboquant_encode_unchecked(normalized_ext, &config, ctx) }?;
-
-    Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array())
+    Ok((normalized, norms, num_rows))
 }
 
 /// Dispatch helper that builds the data array for the requested [`Variant`], starting from a
@@ -220,37 +165,24 @@ pub fn build_variant(
     let raw = generate_random_vectors(num_rows, dim, seed);
     match variant {
         Variant::Uncompressed => Ok(raw),
-        Variant::DefaultCompression => compress_default(raw),
-        Variant::TurboQuant => compress_turboquant(raw, ctx),
-    }
-}
-
-/// Build the lazy similarity-search array tree for a prepared data array and a single query
-/// vector. The returned tree is a boolean array of length `data.len()` where position `i` is
-/// `true` iff `cosine_similarity(data[i], query) > threshold`.
-///
-/// The tree shape is:
-///
-/// ```text
-/// Binary(Gt, [
-///     CosineSimilarity([data, ConstantArray(query_vec, n)]),
-///     ConstantArray(threshold, n),
-/// ])
-/// ```
-///
-/// This function does no execution; it is safe to call inside a benchmark setup closure.
-pub fn build_similarity_search_tree(
-    data: ArrayRef,
-    query: &[f32],
-    threshold: f32,
-) -> VortexResult<ArrayRef> {
-    let num_rows = data.len();
-    let query_vec = build_constant_query_vector(query, num_rows)?;
-
-    let cosine = CosineSimilarity::try_new_array(data, query_vec, num_rows)?.into_array();
-
-    let threshold_scalar = Scalar::primitive(threshold, Nullability::NonNullable);
-    let threshold_array = ConstantArray::new(threshold_scalar, num_rows).into_array();
+        Variant::DefaultCompression => {
+            let (normalized, norms, num_rows) = normalize_vectors(raw, ctx)?;
+            let compressed = BtrBlocksCompressor::default().compress(&normalized)?;
 
-    cosine.binary(threshold_array, Operator::Gt)
+            Ok(unsafe { L2Denorm::new_array_unchecked(compressed, norms, num_rows) }?.into_array())
+        }
+        Variant::TurboQuant => {
+            let (normalized, norms, num_rows) = normalize_vectors(raw, ctx)?;
+            let Some(normalized_ext) = normalized.as_opt::<Extension>() else {
+                vortex_bail!("normalize_as_l2_denorm must produce an Extension array child");
+            };
+
+            let config = TurboQuantConfig::default();
+            // SAFETY: `normalize_as_l2_denorm` guarantees every row is unit-norm (or zero),
+            // which is the invariant `turboquant_encode_unchecked` expects.
+            let tq = unsafe { turboquant_encode_unchecked(normalized_ext, &config, ctx) }?;
+
+            Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array())
+        }
+    }
 }
@@ -550,4 +550,12 @@ impl core::marker::Copy for vortex_tensor::vector::VectorMatcherMetadata
 
 impl core::marker::StructuralPartialEq for vortex_tensor::vector::VectorMatcherMetadata
 
+pub mod vortex_tensor::vector_search
+
+pub fn vortex_tensor::vector_search::build_constant_query_vector<T: vortex_array::dtype::ptype::NativePType + core::convert::Into<vortex_array::scalar::typed_view::primitive::pvalue::PValue>>(query: &[T], num_rows: usize) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_tensor::vector_search::build_similarity_search_tree<T: vortex_array::dtype::ptype::NativePType + core::convert::Into<vortex_array::scalar::typed_view::primitive::pvalue::PValue>>(data: vortex_array::array::erased::ArrayRef, query: &[T], threshold: T) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_tensor::vector_search::compress_turboquant(data: vortex_array::array::erased::ArrayRef, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
 pub fn vortex_tensor::initialize(session: &vortex_session::VortexSession)
@@ -25,6 +25,8 @@ pub mod vector;
 
 pub mod encodings;
 
+pub mod vector_search;
+
 mod utils;
 
 /// Initialize the Vortex tensor library with a Vortex session.
 
@@ -527,18 +527,9 @@ impl InnerProduct {
         let values: &[f32] = values_prim.as_slice::<f32>();
         debug_assert_eq!(codes.len(), len * padded_dim);
 
-        // Direct codebook lookup in the hot loop. See the function doc comment for why this
-        // beats an explicit product table here.
-        let mut out = BufferMut::<f32>::with_capacity(len);
-        for row in 0..len {
-            let row_codes = &codes[row * padded_dim..(row + 1) * padded_dim];
-            let mut acc = 0.0f32;
-            for j in 0..padded_dim {
-                acc += q[j] * values[row_codes[j] as usize];
-            }
-            // SAFETY: we reserved `len` slots above and push exactly once per row.
-            unsafe { out.push_unchecked(acc) };
-        }
+        // The hot loop is extracted into [`execute_dict_constant_inner_product`], which walks
+        // each row with `chunks_exact` and four independent accumulators (indexing into the
+        // codebook stays bounds-checked).
+        let out = execute_dict_constant_inner_product(q, values, codes, len, padded_dim);
 
         // SAFETY: the buffer length equals `len`, which matches the validity length.
         let result = unsafe { PrimitiveArray::new_unchecked(out.freeze(), validity) }.into_array();
@@ -556,6 +547,51 @@ fn inner_product_row<T: Float + NativePType>(a: &[T], b: &[T]) -> T {
         .fold(T::zero(), |acc, v| acc + v)
 }
 
+/// Compute inner products between a constant query vector and dictionary-encoded rows.
+///
+/// For each of the `num_rows` rows, computes `sum(q[j] * values[codes[row * dim + j]])` over
+/// `j in 0..dim`, using the codebook `values` directly instead of decoding the dictionary
+/// into dense vectors. The returned buffer holds exactly `num_rows` results in row order.
+///
+/// The inner loop uses four independent accumulators so the CPU can pipeline FP additions
+/// instead of waiting for each `fadd` to retire before starting the next. Because the four
+/// partial sums are only combined at the end of a row, the result can differ in the last
+/// bits from a strictly sequential summation.
+///
+/// Only the first `dim` elements of `q` and the first `num_rows * dim` elements of `codes`
+/// are read; any extra trailing elements are ignored.
+///
+/// # Panics
+///
+/// Panics if `q` has fewer than `dim` elements, if `codes` has fewer than `num_rows * dim`
+/// elements, if `num_rows * dim` overflows `usize`, or if a code indexes past the end of
+/// `values`.
+fn execute_dict_constant_inner_product(
+    q: &[f32],
+    values: &[f32],
+    codes: &[u8],
+    num_rows: usize,
+    dim: usize,
+) -> BufferMut<f32> {
+    let mut out = BufferMut::<f32>::with_capacity(num_rows);
+
+    if dim == 0 {
+        // `chunks_exact(0)` panics, and an empty row has an inner product of zero.
+        for _ in 0..num_rows {
+            // SAFETY: we reserved `num_rows` slots and this loop pushes exactly `num_rows`
+            // times before returning.
+            unsafe { out.push_unchecked(0.0f32) };
+        }
+        return out;
+    }
+
+    let total_codes = num_rows
+        .checked_mul(dim)
+        .expect("num_rows * dim must not overflow usize");
+
+    // Trim both inputs to the exact extent the kernel reads. Slicing panics when an input is
+    // too short, and guarantees that `chunks_exact(dim)` below yields exactly `num_rows` rows
+    // and that the query chunks line up element-for-element with every row's code chunks.
+    let codes = &codes[..total_codes];
+    let q = &q[..dim];
+
+    for row_codes in codes.chunks_exact(dim) {
+        let mut acc0 = 0.0f32;
+        let mut acc1 = 0.0f32;
+        let mut acc2 = 0.0f32;
+        let mut acc3 = 0.0f32;
+
+        let code_chunks = row_codes.chunks_exact(4);
+        let q_chunks = q.chunks_exact(4);
+        let code_rem = code_chunks.remainder();
+        let q_rem = q_chunks.remainder();
+
+        for (cc, qc) in code_chunks.zip(q_chunks) {
+            acc0 += qc[0] * values[cc[0] as usize];
+            acc1 += qc[1] * values[cc[1] as usize];
+            acc2 += qc[2] * values[cc[2] as usize];
+            acc3 += qc[3] * values[cc[3] as usize];
+        }
+
+        // Tail of fewer than four elements when `dim` is not a multiple of four.
+        for (&code, &q_val) in code_rem.iter().zip(q_rem.iter()) {
+            acc0 += q_val * values[code as usize];
+        }
+
+        // SAFETY: `codes` was trimmed to `num_rows * dim` elements, so this loop runs exactly
+        // `num_rows` times, matching the `num_rows` slots reserved above.
+        unsafe { out.push_unchecked(acc0 + acc1 + acc2 + acc3) };
+    }
+
+    out
+}
+
+
 #[cfg(test)]
 mod tests {
     use std::sync::LazyLock;