vortex-data
diff --git a/vortex-tensor/public-api.lock b/vortex-tensor/public-api.lock
Lines changed: 61 additions & 3 deletions
diff --git a/vortex-tensor/src/encodings/turboquant/compress.rs b/vortex-tensor/src/encodings/turboquant/compress.rs
Lines changed: 1 addition & 1 deletion
diff --git a/vortex-tensor/src/lib.rs b/vortex-tensor/src/lib.rs
Lines changed: 3 additions & 0 deletions
diff --git a/vortex-tensor/src/scalar_fns/cosine_similarity.rs b/vortex-tensor/src/scalar_fns/cosine_similarity.rs
Lines changed: 83 additions & 40 deletions
@@ -250,6 +250,64 @@ pub type vortex_tensor::matcher::AnyTensor::Match<'a> = vortex_tensor::matcher::
 
 pub fn vortex_tensor::matcher::AnyTensor::try_match<'a>(ext_dtype: &'a vortex_array::dtype::extension::erased::ExtDTypeRef) -> core::option::Option<Self::Match>
 
+pub mod vortex_tensor::normalized_vector
+
+pub struct vortex_tensor::normalized_vector::AnyNormalizedVector
+
+impl vortex_array::dtype::extension::matcher::Matcher for vortex_tensor::normalized_vector::AnyNormalizedVector
+
+pub type vortex_tensor::normalized_vector::AnyNormalizedVector::Match<'a> = vortex_tensor::vector::VectorMatcherMetadata
+
+pub fn vortex_tensor::normalized_vector::AnyNormalizedVector::try_match<'a>(ext_dtype: &'a vortex_array::dtype::extension::erased::ExtDTypeRef) -> core::option::Option<Self::Match>
+
+pub struct vortex_tensor::normalized_vector::NormalizedVector
+
+impl vortex_tensor::normalized_vector::NormalizedVector
+
+pub unsafe fn vortex_tensor::normalized_vector::NormalizedVector::new_unchecked(storage: vortex_array::array::erased::ArrayRef) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::try_new(storage: vortex_array::array::erased::ArrayRef, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::array::erased::ArrayRef>
+
+impl core::clone::Clone for vortex_tensor::normalized_vector::NormalizedVector
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::clone(&self) -> vortex_tensor::normalized_vector::NormalizedVector
+
+impl core::cmp::Eq for vortex_tensor::normalized_vector::NormalizedVector
+
+impl core::cmp::PartialEq for vortex_tensor::normalized_vector::NormalizedVector
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::eq(&self, other: &vortex_tensor::normalized_vector::NormalizedVector) -> bool
+
+impl core::default::Default for vortex_tensor::normalized_vector::NormalizedVector
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::default() -> vortex_tensor::normalized_vector::NormalizedVector
+
+impl core::fmt::Debug for vortex_tensor::normalized_vector::NormalizedVector
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result
+
+impl core::hash::Hash for vortex_tensor::normalized_vector::NormalizedVector
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::hash<__H: core::hash::Hasher>(&self, state: &mut __H)
+
+impl core::marker::StructuralPartialEq for vortex_tensor::normalized_vector::NormalizedVector
+
+impl vortex_array::dtype::extension::vtable::ExtVTable for vortex_tensor::normalized_vector::NormalizedVector
+
+pub type vortex_tensor::normalized_vector::NormalizedVector::Metadata = vortex_array::extension::EmptyMetadata
+
+pub type vortex_tensor::normalized_vector::NormalizedVector::NativeValue<'a> = &'a vortex_array::scalar::scalar_value::ScalarValue
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::deserialize_metadata(&self, _metadata: &[u8]) -> vortex_error::VortexResult<Self::Metadata>
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::id(&self) -> vortex_array::dtype::extension::ExtId
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::serialize_metadata(&self, _metadata: &Self::Metadata) -> vortex_error::VortexResult<alloc::vec::Vec<u8>>
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::unpack_native<'a>(_ext_dtype: &'a vortex_array::dtype::extension::typed::ExtDType<Self>, storage_value: &'a vortex_array::scalar::scalar_value::ScalarValue) -> vortex_error::VortexResult<Self::NativeValue>
+
+pub fn vortex_tensor::normalized_vector::NormalizedVector::validate_dtype(ext_dtype: &vortex_array::dtype::extension::typed::ExtDType<Self>) -> vortex_error::VortexResult<()>
+
 pub mod vortex_tensor::scalar_fns
 
 pub mod vortex_tensor::scalar_fns::cosine_similarity
@@ -382,8 +440,6 @@ pub fn vortex_tensor::scalar_fns::l2_denorm::L2Denorm::validity(&self, _options:
 
 pub fn vortex_tensor::scalar_fns::l2_denorm::normalize_as_l2_denorm(input: vortex_array::array::erased::ArrayRef, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<vortex_array::arrays::scalar_fn::vtable::ScalarFnArray>
 
-pub fn vortex_tensor::scalar_fns::l2_denorm::validate_l2_normalized_rows_against_norms(normalized: &vortex_array::array::erased::ArrayRef, norms: core::option::Option<&vortex_array::array::erased::ArrayRef>, ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()>
-
 pub mod vortex_tensor::scalar_fns::l2_norm
 
 pub struct vortex_tensor::scalar_fns::l2_norm::L2Norm
@@ -574,7 +630,9 @@ pub fn vortex_tensor::vector::VectorMatcherMetadata::dimensions(&self) -> u32
 
 pub fn vortex_tensor::vector::VectorMatcherMetadata::element_ptype(&self) -> vortex_array::dtype::ptype::PType
 
-pub fn vortex_tensor::vector::VectorMatcherMetadata::try_new(element_ptype: vortex_array::dtype::ptype::PType, dimensions: u32) -> vortex_error::VortexResult<Self>
+pub fn vortex_tensor::vector::VectorMatcherMetadata::is_normalized(&self) -> bool
+
+pub fn vortex_tensor::vector::VectorMatcherMetadata::try_new(element_ptype: vortex_array::dtype::ptype::PType, dimensions: u32, is_normalized: bool) -> vortex_error::VortexResult<Self>
 
 impl core::clone::Clone for vortex_tensor::vector::VectorMatcherMetadata
 
 
@@ -101,7 +101,7 @@ pub fn turboquant_encode(
     let tq = unsafe { turboquant_encode_unchecked(normalized_ext, config, ctx) }?;
 
     // SAFETY: TurboQuant is a lossy approximation of the normalized child, so we intentionally
-    // bypass the strict normalized-row validation when reattaching the stored norms.
+    // bypass the strict normalized-row and zero-row validation when reattaching the stored norms.
     Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array())
 }
 
 
@@ -17,6 +17,7 @@ use crate::scalar_fns::l2_denorm::L2Denorm;
 use crate::scalar_fns::l2_norm::L2Norm;
 use crate::scalar_fns::sorf_transform::SorfTransform;
 use crate::types::fixed_shape::FixedShapeTensor;
+use crate::types::normalized_vector::NormalizedVector;
 use crate::types::vector::Vector;
 
 pub mod matcher;
@@ -25,6 +26,7 @@ pub mod scalar_fns;
 mod types;
 
 pub use types::fixed_shape;
+pub use types::normalized_vector;
 pub use types::vector;
 
 pub mod encodings;
@@ -43,6 +45,7 @@ pub const SCALAR_FN_ARRAY_TENSOR_PLUGIN_ENV: &str = "VX_SCALAR_FN_ARRAY_TENSOR_P
 /// Initialize the Vortex tensor library with a Vortex session.
 pub fn initialize(session: &VortexSession) {
     session.dtypes().register(Vector);
+    session.dtypes().register(NormalizedVector);
     session.dtypes().register(FixedShapeTensor);
 
     let session_fns = session.scalar_fns();
 
@@ -35,10 +35,9 @@ use vortex_session::VortexSession;
 
 use crate::scalar_fns::inner_product::BinaryTensorOpMetadata;
 use crate::scalar_fns::inner_product::InnerProduct;
-use crate::scalar_fns::l2_denorm::DenormOrientation;
+use crate::scalar_fns::l2_denorm::NormalForm;
 use crate::scalar_fns::l2_denorm::try_build_constant_l2_denorm;
 use crate::scalar_fns::l2_norm::L2Norm;
-use crate::utils::extract_l2_denorm_children;
 use crate::utils::validate_binary_tensor_float_inputs;
 
 /// Cosine similarity between two columns.
@@ -141,15 +140,21 @@ impl ScalarFnVTable for CosineSimilarity {
             rhs_ref = sfn.into_array();
         }
 
-        // Take any L2Denorm-wrapped fast path that applies.
-        match DenormOrientation::classify(&lhs_ref, &rhs_ref) {
-            DenormOrientation::Both { lhs, rhs } => {
-                return self.execute_both_denorm(lhs, rhs, len);
+        // Classify each operand by its normal form. When both operands carry a known unit-norm
+        // representation, cosine similarity collapses to the dot product of the unit vectors.
+        let lhs_form = NormalForm::classify(&lhs_ref);
+        let rhs_form = NormalForm::classify(&rhs_ref);
+        match (lhs_form.unit_array(), rhs_form.unit_array()) {
+            (Some(unit_lhs), Some(unit_rhs)) => {
+                return self.execute_both_unit(unit_lhs, unit_rhs, &lhs_ref, &rhs_ref, len);
             }
-            DenormOrientation::One { denorm, plain } => {
-                return self.execute_one_denorm(denorm, plain, len, ctx);
+            (Some(unit_lhs), None) => {
+                return self.execute_one_unit(unit_lhs, &rhs_ref, &lhs_ref, len, ctx);
             }
-            DenormOrientation::Neither => {}
+            (None, Some(unit_rhs)) => {
+                return self.execute_one_unit(unit_rhs, &lhs_ref, &rhs_ref, len, ctx);
+            }
+            (None, None) => {}
         }
 
         // Compute combined validity.
@@ -242,22 +247,20 @@ impl ScalarFnArrayVTable for CosineSimilarity {
 }
 
 impl CosineSimilarity {
-    /// Both sides are `L2Denorm`: treat the normalized children as authoritative, so
-    /// `cosine_similarity = dot(n_l, n_r)`.
-    fn execute_both_denorm(
+    /// Both sides carry a known unit-norm representation: cosine similarity collapses to the
+    /// dot product of the unit children.
+    fn execute_both_unit(
         &self,
+        unit_lhs: &ArrayRef,
+        unit_rhs: &ArrayRef,
         lhs_ref: &ArrayRef,
         rhs_ref: &ArrayRef,
         len: usize,
     ) -> VortexResult<ArrayRef> {
         let validity = lhs_ref.validity()?.and(rhs_ref.validity()?)?;
 
-        let (normalized_l, _) = extract_l2_denorm_children(lhs_ref);
-        let (normalized_r, _) = extract_l2_denorm_children(rhs_ref);
-
-        // `L2Denorm` makes the normalized children authoritative, so their dot product is the
-        // cosine similarity even for lossy storage wrappers.
-        let dot = InnerProduct::try_new_array(normalized_l, normalized_r, len)?.into_array();
+        let dot =
+            InnerProduct::try_new_array(unit_lhs.clone(), unit_rhs.clone(), len)?.into_array();
 
         if !matches!(validity, Validity::NonNullable) {
             // Masking always changes the nullability to nullable.
@@ -267,22 +270,21 @@ impl CosineSimilarity {
         }
     }
 
-    /// One side is `L2Denorm`: treat the normalized child as authoritative, so
-    /// `cosine_similarity = dot(n, b) / ||b||`.
-    ///
-    /// The caller must pass the denorm array as `denorm_ref` and the plain array as `plain_ref`.
-    fn execute_one_denorm(
+    /// Exactly one side carries a unit-norm representation: cosine similarity reduces to
+    /// `dot(unit, other) / ||other||`. The unit side's norm is either implicitly `1.0` (a naked
+    /// `NormalizedVector`) or stored on the outer `L2Denorm` wrapper; in both cases it is not
+    /// needed here because cosine similarity ignores magnitude.
+    fn execute_one_unit(
         &self,
-        denorm_ref: &ArrayRef,
+        unit: &ArrayRef,
         plain_ref: &ArrayRef,
+        unit_ref: &ArrayRef,
         len: usize,
         ctx: &mut ExecutionCtx,
     ) -> VortexResult<ArrayRef> {
-        let validity = denorm_ref.validity()?.and(plain_ref.validity()?)?;
+        let validity = unit_ref.validity()?.and(plain_ref.validity()?)?;
 
-        let (normalized, _) = extract_l2_denorm_children(denorm_ref);
-
-        let dot_arr = InnerProduct::try_new_array(normalized, plain_ref.clone(), len)?;
+        let dot_arr = InnerProduct::try_new_array(unit.clone(), plain_ref.clone(), len)?;
         let dot: PrimitiveArray = dot_arr.into_array().execute(ctx)?;
 
         let norm_arr = L2Norm::try_new_array(plain_ref.clone(), len)?;
@@ -331,6 +333,7 @@ mod tests {
     use crate::utils::test_helpers::assert_close;
     use crate::utils::test_helpers::constant_tensor_array;
     use crate::utils::test_helpers::l2_denorm_array;
+    use crate::utils::test_helpers::normalized_vector_array;
     use crate::utils::test_helpers::tensor_array;
     use crate::utils::test_helpers::vector_array;
 
@@ -519,13 +522,25 @@ mod tests {
         Ok(())
     }
 
+    /// Naked [`NormalizedVector`](crate::normalized_vector::NormalizedVector) operands take the
+    /// fast path: cosine similarity collapses to the dot product without computing norms.
+    #[test]
+    fn naked_normalized_vector_cosine() -> VortexResult<()> {
+        let mut ctx = SESSION.create_execution_ctx();
+        let lhs = normalized_vector_array(2, &[0.6, 0.8, 1.0, 0.0], &mut ctx)?;
+        let rhs = normalized_vector_array(2, &[0.6, 0.8, 0.0, 1.0], &mut ctx)?;
+        // Row 0: identical -> 1.0, Row 1: orthogonal -> 0.0.
+        assert_close(&eval_cosine_similarity(lhs, rhs, 2)?, &[1.0, 0.0]);
+        Ok(())
+    }
+
     #[test]
     fn both_denorm_self_similarity() -> VortexResult<()> {
         // [3.0, 4.0] has norm 5.0, normalized [0.6, 0.8].
         // [1.0, 0.0] has norm 1.0, normalized [1.0, 0.0].
         let mut ctx = SESSION.create_execution_ctx();
-        let lhs = l2_denorm_array(&[2], &[0.6, 0.8, 1.0, 0.0], &[5.0, 1.0], &mut ctx)?;
-        let rhs = l2_denorm_array(&[2], &[0.6, 0.8, 1.0, 0.0], &[5.0, 1.0], &mut ctx)?;
+        let lhs = l2_denorm_array(2, &[0.6, 0.8, 1.0, 0.0], &[5.0, 1.0], &mut ctx)?;
+        let rhs = l2_denorm_array(2, &[0.6, 0.8, 1.0, 0.0], &[5.0, 1.0], &mut ctx)?;
 
         // Self-similarity should always be 1.0.
         assert_close(&eval_cosine_similarity(lhs, rhs, 2)?, &[1.0, 1.0]);
@@ -537,8 +552,8 @@ mod tests {
         // [3.0, 0.0] normalized [1.0, 0.0], norm 3.0.
         // [0.0, 4.0] normalized [0.0, 1.0], norm 4.0.
         let mut ctx = SESSION.create_execution_ctx();
-        let lhs = l2_denorm_array(&[2], &[1.0, 0.0], &[3.0], &mut ctx)?;
-        let rhs = l2_denorm_array(&[2], &[0.0, 1.0], &[4.0], &mut ctx)?;
+        let lhs = l2_denorm_array(2, &[1.0, 0.0], &[3.0], &mut ctx)?;
+        let rhs = l2_denorm_array(2, &[0.0, 1.0], &[4.0], &mut ctx)?;
 
         assert_close(&eval_cosine_similarity(lhs, rhs, 1)?, &[0.0]);
         Ok(())
@@ -548,8 +563,8 @@ mod tests {
     fn both_denorm_zero_norm() -> VortexResult<()> {
         // Zero-norm row: normalized is [0.0, 0.0], norm is 0.0.
         let mut ctx = SESSION.create_execution_ctx();
-        let lhs = l2_denorm_array(&[2], &[0.6, 0.8, 0.0, 0.0], &[5.0, 0.0], &mut ctx)?;
-        let rhs = l2_denorm_array(&[2], &[0.6, 0.8, 1.0, 0.0], &[5.0, 1.0], &mut ctx)?;
+        let lhs = l2_denorm_array(2, &[0.6, 0.8, 0.0, 0.0], &[5.0, 0.0], &mut ctx)?;
+        let rhs = l2_denorm_array(2, &[0.6, 0.8, 1.0, 0.0], &[5.0, 1.0], &mut ctx)?;
 
         // Row 0: dot([0.6, 0.8], [0.6, 0.8]) = 1.0, row 1: dot([0,0], [1,0]) = 0.0.
         assert_close(&eval_cosine_similarity(lhs, rhs, 2)?, &[1.0, 0.0]);
@@ -562,8 +577,8 @@ mod tests {
         // RHS is plain [3.0, 4.0].
         // cosine_similarity([3.0, 4.0], [3.0, 4.0]) = 1.0.
         let mut ctx = SESSION.create_execution_ctx();
-        let lhs = l2_denorm_array(&[2], &[0.6, 0.8], &[5.0], &mut ctx)?;
-        let rhs = tensor_array(&[2], &[3.0, 4.0])?;
+        let lhs = l2_denorm_array(2, &[0.6, 0.8], &[5.0], &mut ctx)?;
+        let rhs = vector_array(2, &[3.0, 4.0])?;
 
         assert_close(&eval_cosine_similarity(lhs, rhs, 1)?, &[1.0]);
         Ok(())
@@ -574,8 +589,8 @@ mod tests {
         // LHS is plain [1.0, 0.0], RHS is L2Denorm([0.6, 0.8], 5.0) representing [3.0, 4.0].
         // cosine_similarity([1.0, 0.0], [3.0, 4.0]) = 3.0 / (1.0 * 5.0) = 0.6.
         let mut ctx = SESSION.create_execution_ctx();
-        let lhs = tensor_array(&[2], &[1.0, 0.0])?;
-        let rhs = l2_denorm_array(&[2], &[0.6, 0.8], &[5.0], &mut ctx)?;
+        let lhs = vector_array(2, &[1.0, 0.0])?;
+        let rhs = l2_denorm_array(2, &[0.6, 0.8], &[5.0], &mut ctx)?;
 
         assert_close(&eval_cosine_similarity(lhs, rhs, 1)?, &[0.6]);
         Ok(())
@@ -585,9 +600,9 @@ mod tests {
     fn both_denorm_null_norms() -> VortexResult<()> {
         // Row 0: valid, row 1: null (via nullable norms on rhs).
         let mut ctx = SESSION.create_execution_ctx();
-        let lhs = l2_denorm_array(&[2], &[0.6, 0.8, 1.0, 0.0], &[5.0, 1.0], &mut ctx)?;
+        let lhs = l2_denorm_array(2, &[0.6, 0.8, 1.0, 0.0], &[5.0, 1.0], &mut ctx)?;
 
-        let normalized_r = tensor_array(&[2], &[0.6, 0.8, 1.0, 0.0])?;
+        let normalized_r = normalized_vector_array(2, &[0.6, 0.8, 1.0, 0.0], &mut ctx)?;
         let norms_r = PrimitiveArray::from_option_iter([Some(5.0f64), None]).into_array();
         let rhs = L2Denorm::try_new_array(normalized_r, norms_r, 2, &mut ctx)?.into_array();
 
@@ -703,6 +718,34 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn serde_round_trip_mixed_vector_and_normalized_vector() -> VortexResult<()> {
+        let mut ctx = SESSION.create_execution_ctx();
+        let lhs = normalized_vector_array(2, &[0.6, 0.8, 1.0, 0.0], &mut ctx)?;
+        let rhs = vector_array(2, &[3.0, 4.0, 0.0, 1.0])?;
+        let original = CosineSimilarity::try_new_array(lhs.clone(), rhs.clone(), 2)?.into_array();
+
+        let plugin = ScalarFnArrayPlugin::new(CosineSimilarity);
+        let metadata = plugin
+            .serialize(&original, &SESSION)?
+            .expect("CosineSimilarity serialize must produce metadata");
+
+        let children = vec![lhs, rhs];
+        let recovered = plugin.deserialize(
+            original.dtype(),
+            original.len(),
+            &metadata,
+            &[],
+            &children,
+            &SESSION,
+        )?;
+
+        assert_eq!(recovered.dtype(), original.dtype());
+        assert_eq!(recovered.len(), original.len());
+        assert_eq!(recovered.encoding_id(), original.encoding_id());
+        Ok(())
+    }
+
     #[rstest]
     #[case::vector(
         vector_array(3, &[1.0, 0.0, 0.0, 3.0, 4.0, 0.0]).unwrap(),
Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,7 @@ pub fn turboquant_encode(`
`101`	`101`	`let tq = unsafe { turboquant_encode_unchecked(normalized_ext, config, ctx) }?;`
`102`	`102`
`103`	`103`	`// SAFETY: TurboQuant is a lossy approximation of the normalized child, so we intentionally`
`104`		`- // bypass the strict normalized-row validation when reattaching the stored norms.`
	`104`	`+ // bypass the strict normalized-row and zero-row validation when reattaching the stored norms.`
`105`	`105`	`Ok(unsafe { L2Denorm::new_array_unchecked(tq, norms, num_rows) }?.into_array())`
`106`	`106`	`}`
`107`	`107`