vortex-data
diff --git a/‎vortex-turboquant/Cargo.toml‎
Lines changed: 1 addition & 1 deletion b/‎vortex-turboquant/Cargo.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎vortex-turboquant/src/lib.rs‎
Lines changed: 12 additions & 19 deletions b/‎vortex-turboquant/src/lib.rs‎
Lines changed: 12 additions & 19 deletions
diff --git a/‎vortex-turboquant/src/scalar_fns/compute/l2_norm.rs‎
Lines changed: 17 additions & 23 deletions b/‎vortex-turboquant/src/scalar_fns/compute/l2_norm.rs‎
Lines changed: 17 additions & 23 deletions
diff --git a/‎vortex-turboquant/src/scalar_fns/compute/mod.rs‎
Lines changed: 9 additions & 4 deletions b/‎vortex-turboquant/src/scalar_fns/compute/mod.rs‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎vortex-turboquant/src/scalar_fns/decode.rs‎
Lines changed: 12 additions & 0 deletions b/‎vortex-turboquant/src/scalar_fns/decode.rs‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎vortex-turboquant/src/sorf/splitmix64.rs‎
Lines changed: 3 additions & 1 deletion b/‎vortex-turboquant/src/sorf/splitmix64.rs‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎vortex-turboquant/src/tests/encode_decode.rs‎
Lines changed: 124 additions & 0 deletions b/‎vortex-turboquant/src/tests/encode_decode.rs‎
Lines changed: 124 additions & 0 deletions
diff --git a/‎vortex-turboquant/src/tests/file.rs‎
Lines changed: 51 additions & 0 deletions b/‎vortex-turboquant/src/tests/file.rs‎
Lines changed: 51 additions & 0 deletions
@@ -32,7 +32,7 @@ vortex-utils = { workspace = true, features = ["dashmap"] }
 divan = { workspace = true }
 rand = { workspace = true }
 rstest = { workspace = true }
-vortex-array = { path = "../vortex-array", features = ["_test-harness"] }
+vortex-array = { workspace = true, features = ["_test-harness"] }
 vortex-file = { workspace = true }
 vortex-io = { workspace = true }
 vortex-layout = { workspace = true }
 
@@ -34,31 +34,24 @@
 //! )
 //! ```
 //!
-//! Stored norms are authoritative for future TurboQuant-aware scalar functions. Scalar quantization
-//! perturbs the transformed unit vector, and inverse SORF plus truncation can leave the decoded
-//! quantized direction with norm different from `1.0`. If decode only multiplied that direction by
-//! the original row norm, `L2Norm(TQDecode(_))` would not equal the norm of the vector returned by
-//! `TQDecode`. TurboQuant therefore stores `inv_direction_norms = 1 / ||decoded_direction||` so
-//! decode can first renormalize the lossy quantized direction and then apply the original norm.
-//!
-//! Storing the correction also keeps future query kernels cheap. Inner product and cosine kernels can
-//! rotate a query once and gather against centroids directly; the per-row scale they need is already
-//! available as `norms * inv_direction_norms` for inner product and `inv_direction_norms` for cosine.
-//! Without this field, those kernels would have to recompute the inverse SORF/truncated norm per row
-//! or give up the `TQDecode` norm-preservation invariant.
+//! Stored norms are authoritative for future TurboQuant-aware scalar functions. The rationale
+//! for the `inv_direction_norms` correction field lives next to the storage layout; see
+//! `vector/storage.rs`.
 //!
 //! # Source map
 //!
 //! Implementation details are documented next to the code that owns them:
 //!
-//! - `vector/storage.rs`: physical storage shape, full-length child arrays, and field-level
-//!   validity for null vectors.
-//! - `vector/normalize.rs`: TurboQuant-local normalization and how it differs from the tensor
-//!   crate's null-row zeroing helper.
-//! - `vector/quantize.rs`: SORF transform, centroid lookup, and why invalid rows are skipped rather
-//!   than quantized.
+//! - `vector/storage.rs`: physical storage shape and parsing.
+//! - `vector/normalize.rs`: TurboQuant-local normalization and the encode-time finite-norm
+//!   guard.
+//! - `vector/quantize.rs`: SORF transform, centroid lookup, and the per-row
+//!   `inv_direction_norm` computation.
+//! - `scalar_fns/compute/`: session-scoped optimizer kernels that intercept canonical scalar
+//!   functions over TurboQuant inputs (currently `L2Norm(TQDecode(_))`).
 //! - `centroids.rs`: deterministic Max-Lloyd centroid computation and process-local caching.
-//! - `sorf/`: the Walsh-Hadamard-based structured transform and the stable SplitMix64 sign stream.
+//! - `sorf/`: Walsh-Hadamard-based structured transform plus the stable SplitMix64 sign
+//!   stream.
 //!
 //! The current encoding is intentionally MSE-only. It does not yet implement the paper's QJL
 //! residual correction for unbiased inner-product estimation, and it still uses internal
 
@@ -11,17 +11,18 @@ use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::ScalarFn;
 use vortex_array::arrays::scalar_fn::ExactScalarFn;
 use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
+use vortex_array::dtype::Nullability;
 use vortex_array::optimizer::kernels::ArrayKernelsExt;
 use vortex_array::optimizer::kernels::ExecuteParentFn;
 use vortex_array::scalar_fn::ScalarFnVTable;
+use vortex_array::validity::Validity;
 use vortex_error::VortexResult;
 use vortex_error::vortex_ensure_eq;
 use vortex_session::VortexSession;
 use vortex_tensor::scalar_fns::l2_norm::L2Norm;
 
 use crate::TQDecode;
-use crate::vector::storage::parse_storage;
-use crate::vtable::TurboQuant;
+use crate::vector::storage::parse_storage_norms_only;
 
 /// Register the `L2Norm(TQDecode(_))` execute-parent kernel on the session.
 pub(super) fn register(session: &VortexSession) {
@@ -34,13 +35,14 @@ pub(super) fn register(session: &VortexSession) {
 
 /// Intercepts `L2Norm(TQDecode(tq_arr))` and returns the stored TurboQuant `norms` field.
 ///
-/// The kernel only fires when both the parent matches `ExactScalarFn<L2Norm>` and the child
-/// matches `ExactScalarFn<TQDecode>`. Returns `Ok(None)` for any other shape so the canonical
-/// `L2Norm` path runs unchanged.
-//
-// This is semantically correct because TurboQuant stores per-row inverse direction norms and
-// `TQDecode` applies that correction before re-applying the original row norm. In other words,
-// valid nonzero decoded rows preserve the stored L2 norm even though coordinates are lossy.
+/// Semantically valid because [`TQDecode`] renormalizes the lossy quantized direction with the
+/// stored inverse direction-norm before re-applying the original row norm, so decoded rows
+/// preserve the stored L2 norm. The kernel returns `Ok(None)` for any non-matching parent /
+/// child pair so the canonical `L2Norm` path runs unchanged.
+///
+/// The result's nullability is coerced to the parent's expected dtype because the stored
+/// `norms` child may carry wider nullability than the outer struct (a shape
+/// [`parse_storage_norms_only`] accepts).
 fn l2_norm_tq_decode_execute_parent(
     child: &ArrayRef,
     parent: &ArrayRef,
@@ -55,24 +57,16 @@ fn l2_norm_tq_decode_execute_parent(
     }
 
     let tq_array = child.as_::<ScalarFn>().child_at(0).clone();
+    let parsed = parse_storage_norms_only(tq_array, ctx)?;
 
-    // Defensive: TQDecode's signature already guarantees this, but a misregistration or a
-    // future TQDecode that takes a wrapped child should fall back to the canonical path.
-    if tq_array
-        .dtype()
-        .as_extension_opt()
-        .and_then(|d| d.metadata_opt::<TurboQuant>())
-        .is_none()
-    {
-        return Ok(None);
-    }
-
-    let parsed = parse_storage(tq_array, ctx)?;
-    let norms_validity = parsed.norms.validity()?;
+    let norms_validity = match parent.dtype().nullability() {
+        Nullability::NonNullable => Validity::NonNullable,
+        Nullability::Nullable => parsed.vector_validity,
+    };
     let norms = PrimitiveArray::from_buffer_handle(
         parsed.norms.buffer_handle().clone(),
         parsed.norms.ptype(),
-        norms_validity.and(parsed.vector_validity)?,
+        norms_validity,
     )
     .into_array();
 
 
@@ -3,15 +3,20 @@
 
 //! TurboQuant-specific session-scoped optimizer kernels.
 //!
-//! Each kernel module owns its own [`ArrayKernelsExt::register_execute_parent`] call. New
-//! kernels (e.g. for `InnerProduct` or `CosineSimilarity`) should be added as sibling modules
-//! and threaded through [`register_kernels`] with a single line.
+//! Each kernel module owns its own
+//! [`register_execute_parent`](vortex_array::optimizer::kernels::ArrayKernelsExt::register_execute_parent)
+//! call. New kernels (for example `InnerProduct` or `CosineSimilarity`) should be added as
+//! sibling modules and threaded through [`register_kernels`] with a single line.
 
 mod l2_norm;
 
 use vortex_session::VortexSession;
 
-/// Register every TurboQuant kernel on `session`.
+/// Register every TurboQuant-specific optimizer kernel on `session`.
+///
+/// Called from the crate-level [`crate::initialize`] after the TurboQuant extension type and
+/// the `TQEncode` / `TQDecode` scalar functions are registered, so kernels can resolve the
+/// scalar-fn ids they intercept.
 pub(crate) fn register_kernels(session: &VortexSession) {
     l2_norm::register(session);
 }
@@ -205,12 +205,24 @@ fn build_empty_vector(
     })
 }
 
+/// Borrowed bundle of the per-array decode inputs passed to the typed inner loop.
+///
+/// Packaged as a struct rather than positional arguments because `decode_typed` runs through
+/// [`vortex_array::match_each_float_ptype!`] which expands once per supported element ptype.
+/// Each expansion takes the same set of inputs, and the struct keeps the call site short.
 struct DecodeInputs<'a> {
+    /// TurboQuant metadata recovered from the input extension dtype.
     metadata: &'a TurboQuantMetadata,
+    /// SORF transform reconstructed from `metadata.seed` and `metadata.num_rounds`.
     sorf_matrix: &'a SorfMatrix,
+    /// Centroid codebook for `(padded_dim, bit_width)`, in f32.
     centroids: &'a [f32],
+    /// Per-row stored L2 norm of the original input vector, in the element ptype.
     norms: &'a PrimitiveArray,
+    /// Per-row reciprocal of the decoded direction's L2 norm, always in f32. See
+    /// [`crate::vector::storage`] for the sentinel semantics.
     inv_direction_norms: &'a PrimitiveArray,
+    /// Flat per-row centroid indices, `num_vectors * padded_dim` bytes.
     codes: &'a PrimitiveArray,
 }
 
 
@@ -19,7 +19,9 @@ const SPLITMIX64_MUL1: u64 = 0xBF58_476D_1CE4_E5B9;
 /// Second SplitMix64 mixing multiplier from the reference implementation.
 const SPLITMIX64_MUL2: u64 = 0x94D0_49BB_1331_11EB;
 
-/// Frozen local SplitMix64 stream used to define SORF sign diagonals.
+/// Frozen local SplitMix64 stream used to define SORF sign diagonals. The sign stream is part
+/// of the encoding's wire contract, so this implementation must stay bit-identical to the
+/// reference implementation linked at the module top.
 pub(crate) struct SplitMix64 {
     state: u64,
 }
 
@@ -200,6 +200,130 @@ fn decode_preserves_original_l2_norms_for_non_power_of_two_dimensions() -> Vorte
     Ok(())
 }
 
+/// Encoding must fail for a row whose L2 norm is not finite. Without this guard, a row whose
+/// squared sum overflows would normalize to all-zero placeholders and decode-vs-kernel would
+/// silently diverge (`NaN` vs `+inf`).
+#[test]
+fn encode_rejects_non_finite_norms() -> VortexResult<()> {
+    let session = test_session();
+    let mut ctx = session.create_execution_ctx();
+
+    // 128 copies of `1e30`: the squared sum is `128 * 1e60 ≈ 1.28e62`, far beyond the `f32`
+    // maximum (≈ 3.4e38), so an `f32` `L2Norm` evaluates to `+inf`.
+    let overflowing_row = vec![1e30f32; 128];
+    let input = vector_array(128, &overflowing_row, Validity::NonNullable)?;
+
+    // A successful encode is the failure mode under test, so bail out loudly on `Ok`.
+    let Err(failure) = execute_tq_encode(input, &TurboQuantConfig::default(), &mut ctx) else {
+        panic!("encode must reject non-finite norms (overflow case)");
+    };
+
+    // The rejection must come from the finite-norm guard, not from an unrelated error.
+    let error = failure.to_string();
+    assert!(
+        error.contains("non-finite"),
+        "expected non-finite error, got: {error}"
+    );
+    Ok(())
+}
+
+/// Encoding must fail for a row that contains a `NaN` element, since the `NaN` propagates
+/// through `L2Norm` and would otherwise be stored as a `NaN` norm.
+#[test]
+fn encode_rejects_nan_input() -> VortexResult<()> {
+    let session = test_session();
+    let mut ctx = session.create_execution_ctx();
+
+    // A row of 128 ones with a single `NaN` in the first slot.
+    let values: Vec<f32> = (0..128)
+        .map(|i| if i == 0 { f32::NAN } else { 1.0 })
+        .collect();
+    let input = vector_array(128, &values, Validity::NonNullable)?;
+
+    let outcome = execute_tq_encode(input, &TurboQuantConfig::default(), &mut ctx);
+    assert!(outcome.is_err(), "encode must reject NaN input rows");
+    Ok(())
+}
+
+/// Decode preserves the original L2 norms across element ptypes and across dimensions that
+/// are and are not powers of two.
+///
+/// Each case supplies the element ptype, the vector dimension, and the relative tolerance used
+/// when comparing the norm of each decoded row against a reference norm recomputed from the
+/// raw `f32` input.
+#[rstest]
+#[case::f16_dim_128(PType::F16, 128_u32, 1e-2_f32)]
+#[case::f16_dim_129(PType::F16, 129_u32, 1e-2_f32)]
+#[case::f32_dim_128(PType::F32, 128_u32, 1e-4_f32)]
+#[case::f32_dim_129(PType::F32, 129_u32, 1e-4_f32)]
+#[case::f32_dim_257(PType::F32, 257_u32, 1e-4_f32)]
+#[case::f64_dim_128(PType::F64, 128_u32, 1e-4_f32)]
+#[case::f64_dim_129(PType::F64, 129_u32, 1e-4_f32)]
+fn decode_preserves_original_l2_norms_across_ptypes_and_dims(
+    #[case] ptype: PType,
+    #[case] dim: u32,
+    #[case] tolerance: f32,
+) -> VortexResult<()> {
+    let session = tensor_test_session();
+    let mut ctx = session.create_execution_ctx();
+    let rows = 3;
+    // Every value is a multiple of 0.25 in [-2, 2], so the f16 and f64 conversions below are
+    // exact and all three ptypes encode the same logical input.
+    let raw = (0..rows * dim as usize)
+        .map(|i| (i % 17) as f32 - 8.0)
+        .map(|v| v * 0.25)
+        .collect::<Vec<_>>();
+    let input = match ptype {
+        PType::F16 => {
+            let values: Vec<half::f16> = raw.iter().copied().map(half::f16::from_f32).collect();
+            vector_array(dim, &values, Validity::NonNullable)?
+        }
+        PType::F32 => vector_array(dim, &raw, Validity::NonNullable)?,
+        PType::F64 => {
+            let values: Vec<f64> = raw.iter().copied().map(f64::from).collect();
+            vector_array(dim, &values, Validity::NonNullable)?
+        }
+        _ => unreachable!("ptype must be float"),
+    };
+    let config = TurboQuantConfig::try_new(3, 42, 3)?;
+
+    let encoded = execute_tq_encode(input, &config, &mut ctx)?;
+    let decoded = execute_tq_decode(encoded, &mut ctx)?;
+    let decoded_norms: PrimitiveArray = L2Norm::try_new_array(decoded, rows)?
+        .into_array()
+        .execute(&mut ctx)?;
+
+    // L2Norm returns the element ptype; convert to f32 for comparison (this widens f16 and
+    // narrows f64).
+    let actuals: Vec<f32> = match ptype {
+        PType::F16 => decoded_norms
+            .as_slice::<half::f16>()
+            .iter()
+            .map(|v| f32::from(*v))
+            .collect(),
+        PType::F32 => decoded_norms.as_slice::<f32>().to_vec(),
+        PType::F64 => decoded_norms
+            .as_slice::<f64>()
+            .iter()
+            .map(|v| {
+                #[expect(
+                    clippy::cast_possible_truncation,
+                    reason = "norms are bounded by the test's input magnitudes (~|raw| * dim^0.5), \
+                              well within f32 range"
+                )]
+                let narrowed = *v as f32;
+                narrowed
+            })
+            .collect(),
+        _ => unreachable!("ptype must be float"),
+    };
+
+    // Recompute expected from the raw f32 input to avoid coupling to internal storage.
+    let expected: Vec<f32> = (0..rows)
+        .map(|i| {
+            let row = &raw[i * dim as usize..][..dim as usize];
+            row.iter().map(|v| v * v).sum::<f32>().sqrt()
+        })
+        .collect();
+
+    // `zip` stops at the shorter side, so pin the row count first; otherwise a result with
+    // missing rows would pass the per-row comparison vacuously.
+    assert_eq!(
+        actuals.len(),
+        expected.len(),
+        "L2Norm returned an unexpected number of rows (ptype {ptype:?}, dim {dim})"
+    );
+
+    // Relative tolerance, floored at an absolute `tolerance` for norms below 1.0.
+    for (actual, exp) in actuals.iter().zip(expected.iter()) {
+        assert!(
+            (*actual - *exp).abs() <= tolerance * exp.max(1.0),
+            "decoded norm {actual} did not match expected {exp} (ptype {ptype:?}, dim {dim})"
+        );
+    }
+    Ok(())
+}
+
 #[test]
 fn normalize_as_l2_denorm_preserves_child_validity() -> VortexResult<()> {
     let session = test_session();
 
@@ -3,13 +3,15 @@
 
 use vortex_array::IntoArray;
 use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::PrimitiveArray;
 use vortex_array::stream::ArrayStreamExt;
 use vortex_array::validity::Validity;
 use vortex_error::VortexResult;
 use vortex_file::OpenOptionsSessionExt;
 use vortex_file::VortexWriteOptions;
 use vortex_io::runtime::BlockingRuntime;
 use vortex_io::runtime::single::SingleThreadRuntime;
+use vortex_tensor::scalar_fns::l2_norm::L2Norm;
 use vortex_tensor::vector::Vector;
 
 use super::execute_tq_decode_from_metadata;
@@ -19,6 +21,7 @@ use super::file_session;
 use super::vector_validity;
 use crate::TQDecode;
 use crate::TurboQuantConfig;
+use crate::vector::storage::parse_storage;
 use crate::vtable::tq_metadata;
 
 #[test]
@@ -46,6 +49,54 @@ fn file_roundtrip_with_initialize_session() -> VortexResult<()> {
     Ok(())
 }
 
+/// File-roundtrip preserves `inv_direction_norms` and the `L2Norm(TQDecode(_))` fast-path
+/// invariant. A regression that silently dropped or altered the field at serialization would
+/// only show up downstream as norm divergence; this test surfaces it at the IO layer.
+///
+/// Three things are checked on the array read back from the file:
+/// 1. every `inv_direction_norms` entry is finite and positive,
+/// 2. the `inv_direction_norms` values equal the ones stored before the write, and
+/// 3. `L2Norm(TQDecode(_))` returns exactly the row norms stored before the write.
+#[test]
+fn file_roundtrip_preserves_inv_direction_norms_and_l2_norm_invariant() -> VortexResult<()> {
+    let runtime = SingleThreadRuntime::default();
+    let session = file_session(&runtime);
+    let mut ctx = session.create_execution_ctx();
+    let input = f32_vector_array(128, 4, 0.25, Validity::NonNullable)?;
+    let config = TurboQuantConfig::try_new(3, 42, 3)?;
+    let encoded = execute_tq_encode(input, &config, &mut ctx)?;
+
+    // Capture both stored per-row fields before writing so the read-back values can be
+    // compared against them, not merely sanity-checked.
+    let original = parse_storage(encoded.clone(), &mut ctx)?;
+    let original_norms: PrimitiveArray = original.norms;
+    let original_inv_direction_norms = original.inv_direction_norms;
+
+    let mut file_bytes = Vec::new();
+    VortexWriteOptions::new(session.clone())
+        .blocking(&runtime)
+        .write(&mut file_bytes, encoded.to_array_iterator())?;
+
+    let file = session.open_options().open_buffer(file_bytes)?;
+    let read = runtime.block_on(async { file.scan()?.into_array_stream()?.read_all().await })?;
+
+    // The inv_direction_norms field must survive serialization with finite-positive values for
+    // every valid row.
+    let parsed = parse_storage(read.clone(), &mut ctx)?;
+    let inv_direction_norms = parsed.inv_direction_norms.as_slice::<f32>();
+    assert_eq!(inv_direction_norms.len(), 4);
+    for &v in inv_direction_norms {
+        assert!(
+            v.is_finite() && v > 0.0,
+            "inv_direction_norm {v} after file roundtrip is not finite-positive"
+        );
+    }
+
+    // Finite-positive alone would also accept a field that came back with different values;
+    // the fast-path check below reads only `norms`, so compare the field directly.
+    assert_eq!(
+        inv_direction_norms,
+        original_inv_direction_norms.as_slice::<f32>(),
+        "inv_direction_norms changed across the file roundtrip"
+    );
+
+    // Fast-path `L2Norm(TQDecode(_))` must still return the originally stored row norms after
+    // the file roundtrip. If the kernel or the stored `norms` field had silently broken at
+    // serialization, this is where it would surface.
+    let decoded = TQDecode::try_new_array(read)?.into_array();
+    let kernel_norms: PrimitiveArray = L2Norm::try_new_array(decoded, 4)?
+        .into_array()
+        .execute(&mut ctx)?;
+    assert_eq!(
+        kernel_norms.as_slice::<f32>(),
+        original_norms.as_slice::<f32>(),
+        "L2Norm(TQDecode(read_back)) must equal the originally stored row norms"
+    );
+    Ok(())
+}
+
 #[test]
 fn file_roundtrip_lazy_decode_scalar_fn_with_initialize_session() -> VortexResult<()> {
     let runtime = SingleThreadRuntime::default();