vortex-data
diff --git a/‎vortex-tensor/public-api.lock‎
Lines changed: 61 additions & 3 deletions b/‎vortex-tensor/public-api.lock‎
Lines changed: 61 additions & 3 deletions
diff --git a/‎vortex-tensor/src/encodings/turboquant/compress.rs‎
Lines changed: 1 addition & 1 deletion b/‎vortex-tensor/src/encodings/turboquant/compress.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎vortex-tensor/src/lib.rs‎
Lines changed: 3 additions & 0 deletions b/‎vortex-tensor/src/lib.rs‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎vortex-tensor/src/normalized_vector/matcher.rs‎
Lines changed: 102 additions & 0 deletions b/‎vortex-tensor/src/normalized_vector/matcher.rs‎
Lines changed: 102 additions & 0 deletions
diff --git a/‎vortex-tensor/src/normalized_vector/mod.rs‎
Lines changed: 125 additions & 0 deletions b/‎vortex-tensor/src/normalized_vector/mod.rs‎
Lines changed: 125 additions & 0 deletions
@@ -250,6 +250,64 @@ pub type vortex_tensor::matcher::AnyTensor::Match<'a> = vortex_tensor::matcher::
 
 pub fn vortex_tensor::matcher::AnyTensor::try_match<'a>(ext_dtype: &'a vortex_array::dtype::extension::erased::ExtDTypeRef) -> core::option::Option<Self::Match>
 
+pub mod vortex_tensor::normalized_vector
+
+pub struct vortex_tensor::normalized_vector::AnyNormalizedVector
+
+impl vortex_array::dtype::extension::matcher::Matcher for vortex_tensor::normalized_vector::AnyNormalizedVector
+
+pub type vortex_tensor::normalized_vector::AnyNormalizedVector::Match<'a> = vortex_tensor::vector::VectorMatcherMetadata
+
+pub fn vortex_tensor::normalized_vector::AnyNormalizedVector::try_match<'a>(ext_dtype: &'a vortex_array::dtype::extension::erased::ExtDTypeRef) -> core::option::Option<Self::Match>
+
+pub struct vortex_tensor::normalized_vector::NormalizedVector
+
+impl vortex_tensor::normalized_vector::NormalizedVector
+
+pub unsafe fn vortex_tensor::normalized_vector::NormalizedVector::new_unchecked(storage: vortex_array::array::erased::ArrayRef) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::try_new(storage: vortex_array::array::erased::ArrayRef, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+impl core::clone::Clone for vortex_tensor::normalized_vector::NormalizedVector
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::clone(&self) -> vortex_tensor::normalized_vector::NormalizedVector
+
+impl core::cmp::Eq for vortex_tensor::normalized_vector::NormalizedVector
+
+impl core::cmp::PartialEq for vortex_tensor::normalized_vector::NormalizedVector
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::eq(&self, other: &vortex_tensor::normalized_vector::NormalizedVector) -> bool
+
+impl core::default::Default for vortex_tensor::normalized_vector::NormalizedVector
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::default() -> vortex_tensor::normalized_vector::NormalizedVector
+
+impl core::fmt::Debug for vortex_tensor::normalized_vector::NormalizedVector
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::hash::Hash for vortex_tensor::normalized_vector::NormalizedVector
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::hash<__H: core::hash::Hasher>(&self, state: &mut __H)
+
+impl core::marker::StructuralPartialEq for vortex_tensor::normalized_vector::NormalizedVector
+
+impl vortex_array::dtype::extension::vtable::ExtVTable for vortex_tensor::normalized_vector::NormalizedVector
+
+pub type vortex_tensor::normalized_vector::NormalizedVector::Metadata = vortex_array::extension::EmptyMetadata
+
+pub type vortex_tensor::normalized_vector::NormalizedVector::NativeValue<'a> = &'a vortex_array::scalar::scalar_value::ScalarValue
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::deserialize_metadata(&self, _metadata: &[u8]) -> vortex_error::VortexResult<Self::Metadata>
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::id(&self) -> vortex_array::dtype::extension::ExtId
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::serialize_metadata(&self, _metadata: &Self::Metadata) -> vortex_error::VortexResult<alloc::vec::Vec<u8>>
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::unpack_native<'a>(_ext_dtype: &'a vortex_array::dtype::extension::typed::ExtDType<Self>, storage_value: &'a vortex_array::scalar::scalar_value::ScalarValue) -> vortex_error::VortexResult<Self::NativeValue>
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::validate_dtype(ext_dtype: &vortex_array::dtype::extension::typed::ExtDType<Self>) -> vortex_error::VortexResult<()>
+
 pub mod vortex_tensor::scalar_fns
 
 pub mod vortex_tensor::scalar_fns::cosine_similarity
@@ -382,8 +440,6 @@ pub fn vortex_tensor::scalar_fns::l2_denorm::L2Denorm::validity(&self, _options:
 
 pub fn vortex_tensor::scalar_fns::l2_denorm::normalize_as_l2_denorm(input: vortex_array::array::erased::ArrayRef, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::arrays::scalar_fn::vtable::ScalarFnArray>
 
-pub fn vortex_tensor::scalar_fns::l2_denorm::validate_l2_normalized_rows_against_norms(normalized: &vortex_array::array::erased::ArrayRef, norms: core::option::Option<&vortex_array::array::erased::ArrayRef>, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
-
 pub mod vortex_tensor::scalar_fns::l2_norm
 
 pub struct vortex_tensor::scalar_fns::l2_norm::L2Norm
@@ -574,7 +630,9 @@ pub fn vortex_tensor::vector::VectorMatcherMetadata::dimensions(&self) -> u32
 
 pub fn vortex_tensor::vector::VectorMatcherMetadata::element_ptype(&self) -> vortex_array::dtype::ptype::PType
 
-pub fn vortex_tensor::vector::VectorMatcherMetadata::try_new(element_ptype: vortex_array::dtype::ptype::PType, dimensions: u32) -> vortex_error::VortexResult<Self>
+pub fn vortex_tensor::vector::VectorMatcherMetadata::is_normalized(&self) -> bool
+
+pub fn vortex_tensor::vector::VectorMatcherMetadata::try_new(element_ptype: vortex_array::dtype::ptype::PType, dimensions: u32, is_normalized: bool) -> vortex_error::VortexResult<Self>
 
 impl core::clone::Clone for vortex_tensor::vector::VectorMatcherMetadata
 
 
@@ -101,7 +101,7 @@ pub fn turboquant_encode(
     let tq = unsafe { turboquant_encode_unchecked(normalized_ext, config, ctx) }?;
 
     // SAFETY: TurboQuant is a lossy approximation of the normalized child, so we intentionally
-    // bypass the strict normalized-row validation when reattaching the stored norms.
+    // bypass the strict normalized-row and zero-row validation when reattaching the stored norms.
     Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array())
 }
 
 
@@ -12,6 +12,7 @@ use vortex_array::session::ArraySessionExt;
 use vortex_session::VortexSession;
 
 use crate::fixed_shape::FixedShapeTensor;
+use crate::normalized_vector::NormalizedVector;
 use crate::scalar_fns::cosine_similarity::CosineSimilarity;
 use crate::scalar_fns::inner_product::InnerProduct;
 use crate::scalar_fns::l2_denorm::L2Denorm;
@@ -23,6 +24,7 @@ pub mod matcher;
 pub mod scalar_fns;
 
 pub mod fixed_shape;
+pub mod normalized_vector;
 pub mod vector;
 
 pub mod encodings;
@@ -41,6 +43,7 @@ pub const SCALAR_FN_ARRAY_TENSOR_PLUGIN_ENV: &str = "VX_SCALAR_FN_ARRAY_TENSOR_P
 /// Initialize the Vortex tensor library with a Vortex session.
 pub fn initialize(session: &VortexSession) {
     session.dtypes().register(Vector);
+    session.dtypes().register(NormalizedVector);
     session.dtypes().register(FixedShapeTensor);
 
     let session_fns = session.scalar_fns();
 
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use vortex_array::dtype::DType;
+use vortex_array::dtype::extension::ExtDTypeRef;
+use vortex_array::dtype::extension::Matcher;
+use vortex_error::VortexExpect;
+use vortex_error::vortex_panic;
+
+use crate::normalized_vector::NormalizedVector;
+use crate::vector::VectorMatcherMetadata;
+
+/// Matcher that accepts only the [`NormalizedVector`] extension type.
+///
+/// Use this when a consumer must reject plain [`Vector`](crate::vector::Vector) inputs. Callers
+/// that can accept either should use [`AnyVector`](crate::vector::AnyVector) instead.
+pub struct AnyNormalizedVector;
+
+impl Matcher for AnyNormalizedVector {
+    type Match<'a> = VectorMatcherMetadata;
+    /// Returns `None` for any other extension type; panics if the storage dtype is malformed.
+    fn try_match<'a>(ext_dtype: &'a ExtDTypeRef) -> Option<Self::Match<'a>> {
+        if !ext_dtype.is::<NormalizedVector>() {
+            return None;
+        }
+
+        let DType::FixedSizeList(element_dtype, list_size, _) = ext_dtype.storage_dtype() else {
+            vortex_panic!(
+                "`NormalizedVector` type somehow did not have a `FixedSizeList` storage type"
+            )
+        };
+        assert!(element_dtype.is_float(), "element dtype must be float");
+        assert!(
+            !element_dtype.is_nullable(),
+            "element dtype must be non-nullable"
+        );
+        // `true` marks the match as normalized (the `is_normalized` argument).
+        let metadata = VectorMatcherMetadata::try_new(element_dtype.as_ptype(), *list_size, true)
+            .vortex_expect("`NormalizedVector` type somehow did not have float elements");
+
+        Some(metadata)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use vortex_array::dtype::DType;
+    use vortex_array::dtype::Nullability;
+    use vortex_array::dtype::PType;
+    use vortex_array::dtype::extension::ExtDType;
+    use vortex_array::extension::EmptyMetadata;
+    use vortex_error::VortexResult;
+
+    use super::*;
+    use crate::vector::AnyVector;
+    use crate::vector::Vector;
+
+    fn storage_dtype(element_ptype: PType, dimensions: u32) -> DType {
+        DType::FixedSizeList(
+            Arc::new(DType::Primitive(element_ptype, Nullability::NonNullable)),
+            dimensions,
+            Nullability::NonNullable,
+        )
+    }
+
+    #[test]
+    fn matches_normalized_vector_dtype() -> VortexResult<()> {
+        let ext_dtype =
+            ExtDType::<NormalizedVector>::try_new(EmptyMetadata, storage_dtype(PType::F32, 128))?
+                .erased();
+        // The matcher should recognize its own extension type and surface the storage shape.
+        let metadata = ext_dtype.metadata::<AnyNormalizedVector>();
+        assert_eq!(metadata.element_ptype(), PType::F32);
+        assert_eq!(metadata.dimensions(), 128);
+        assert!(metadata.is_normalized());
+        Ok(())
+    }
+
+    #[test]
+    fn rejects_plain_vector() -> VortexResult<()> {
+        let ext_dtype =
+            ExtDType::<Vector>::try_new(EmptyMetadata, storage_dtype(PType::F32, 128))?.erased();
+        // Same storage dtype as the normalized case; only the extension type differs.
+        assert!(ext_dtype.metadata_opt::<AnyNormalizedVector>().is_none());
+        Ok(())
+    }
+
+    #[test]
+    fn any_vector_matches_normalized_vector() -> VortexResult<()> {
+        let ext_dtype =
+            ExtDType::<NormalizedVector>::try_new(EmptyMetadata, storage_dtype(PType::F32, 128))?
+                .erased();
+        // `AnyVector` must also accept the refinement and report it as normalized.
+        let metadata = ext_dtype.metadata::<AnyVector>();
+        assert_eq!(metadata.element_ptype(), PType::F32);
+        assert_eq!(metadata.dimensions(), 128);
+        assert!(metadata.is_normalized());
+        Ok(())
+    }
+}
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Normalized vector extension type: a refinement of [`Vector`](crate::vector::Vector) whose
+//! rows are guaranteed (or asserted, for lossy encodings) to have unit L2 norm.
+
+use num_traits::ToPrimitive;
+use vortex_array::ArrayRef;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::arrays::ExtensionArray;
+use vortex_array::arrays::extension::ExtensionArrayExt;
+use vortex_array::dtype::PType;
+use vortex_array::extension::EmptyMetadata;
+use vortex_array::match_each_float_ptype;
+use vortex_error::VortexResult;
+use vortex_error::vortex_ensure;
+
+use crate::utils::extract_flat_elements;
+use crate::utils::validate_tensor_float_input;
+
+/// Refinement of [`Vector`](crate::vector::Vector) that asserts every valid row is L2-normalized
+/// (unit-norm) or the zero vector.
+///
+/// The storage shape is identical to [`Vector`](crate::vector::Vector): a
+/// `FixedSizeList<float, dim, nullability>` with non-nullable float elements. Operators such as
+/// [`L2Denorm`](crate::scalar_fns::l2_denorm::L2Denorm),
+/// [`L2Norm`](crate::scalar_fns::l2_norm::L2Norm),
+/// [`InnerProduct`](crate::scalar_fns::inner_product::InnerProduct), and
+/// [`CosineSimilarity`](crate::scalar_fns::cosine_similarity::CosineSimilarity) short-circuit
+/// arithmetic when they see this refinement.
+#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)]
+pub struct NormalizedVector;
+
+impl NormalizedVector {
+    /// Wraps `storage` as a [`NormalizedVector`] extension array after checking that every valid
+    /// row is unit-norm or the zero vector.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the extension dtype rejects `storage`, if `storage` is not a tensor
+    /// with float elements, or if any valid row's L2 norm is neither exactly `0.0` nor within
+    /// the element-precision tolerance of `1.0`.
+    pub fn try_new(storage: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef> {
+        let ext = ExtensionArray::try_new_from_vtable(NormalizedVector, EmptyMetadata, storage)?
+            .into_array();
+        validate_unit_norm_rows(&ext, ctx)?;
+        Ok(ext)
+    }
+
+    /// Wraps `storage` as a [`NormalizedVector`] extension array **without** validating that
+    /// rows are unit-norm.
+    ///
+    /// # Safety
+    ///
+    /// Every valid row must be unit-norm or the zero vector. Lossy approximations (e.g.
+    /// TurboQuant) deliberately relax this, but downstream consumers still treat the claim
+    /// as authoritative. Violating it does not cause memory unsafety, but it will produce
+    /// silently incorrect results.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the extension dtype rejects `storage` (e.g. non-FSL storage, wrong
+    /// element dtype, or nullable elements).
+    pub unsafe fn new_unchecked(storage: ArrayRef) -> VortexResult<ArrayRef> {
+        Ok(
+            ExtensionArray::try_new_from_vtable(NormalizedVector, EmptyMetadata, storage)?
+                .into_array(),
+        )
+    }
+}
+
+/// Returns the largest accepted `|norm - 1.0|` for rows stored with `element_ptype` elements.
+pub(crate) fn unit_norm_tolerance(element_ptype: PType) -> f64 {
+    match element_ptype {
+        PType::F16 => 2e-3,
+        PType::F32 => 2e-6,
+        PType::F64 => 1e-10,
+        _ => unreachable!("NormalizedVector requires float elements, got {element_ptype:?}"),
+    }
+}
+
+/// Checks that every valid row of `array` has L2 norm exactly `0.0` or within the
+/// element-precision tolerance of `1.0`; null rows are skipped and empty arrays pass.
+fn validate_unit_norm_rows(array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<()> {
+    let row_count = array.len();
+    if row_count == 0 {
+        return Ok(());
+    }
+
+    let tensor_match = validate_tensor_float_input(array.dtype())?;
+    let element_ptype = tensor_match.element_ptype();
+    let tolerance = unit_norm_tolerance(element_ptype);
+    let tensor_flat_size = tensor_match.list_size() as usize;
+
+    let ext: ExtensionArray = array.clone().execute(ctx)?;
+    let validity = ext.as_ref().validity()?;
+    let flat = extract_flat_elements(ext.storage_array(), tensor_flat_size, ctx)?;
+
+    match_each_float_ptype!(element_ptype, |T| {
+        for i in 0..row_count {
+            if !validity.is_valid(i)? {
+                continue;
+            }
+            // Sum in f64; an unconvertible element becomes NaN and fails the check below.
+            let row_norm_sq = flat.row::<T>(i).iter().fold(0.0f64, |sum_sq, x| {
+                let value = ToPrimitive::to_f64(x).unwrap_or(f64::NAN);
+                sum_sq + value * value
+            });
+            let row_norm = row_norm_sq.sqrt();
+            // Full precision: `{:.6}` could render a failing norm as `1.000000`.
+            vortex_ensure!(
+                row_norm == 0.0 || (row_norm - 1.0).abs() <= tolerance,
+                "NormalizedVector row {i} has L2 norm {row_norm}, expected 1.0 or 0.0",
+            );
+        }
+    });
+
+    Ok(())
+}
+
+mod matcher;
+mod vtable;
+
+pub use matcher::AnyNormalizedVector;
Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@ pub fn turboquant_encode(`
`101`	`101`	`let tq = unsafe { turboquant_encode_unchecked(normalized_ext, config, ctx) }?;`
`102`	`102`
`103`	`103`	`// SAFETY: TurboQuant is a lossy approximation of the normalized child, so we intentionally`
`104`		`- // bypass the strict normalized-row validation when reattaching the stored norms.`
	`104`	`+ // bypass the strict normalized-row and zero-row validation when reattaching the stored norms.`
`105`	`105`	`Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array())`
`106`	`106`	`}`
`107`	`107`