vortex-data
diff --git a/‎vortex-turboquant/src/lib.rs‎
Lines changed: 15 additions & 4 deletions b/‎vortex-turboquant/src/lib.rs‎
Lines changed: 15 additions & 4 deletions
diff --git a/‎vortex-turboquant/src/scalar_fns/compute/l2_norm.rs‎
Lines changed: 3 additions & 7 deletions b/‎vortex-turboquant/src/scalar_fns/compute/l2_norm.rs‎
Lines changed: 3 additions & 7 deletions
diff --git a/‎vortex-turboquant/src/scalar_fns/decode.rs‎
Lines changed: 9 additions & 4 deletions b/‎vortex-turboquant/src/scalar_fns/decode.rs‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎vortex-turboquant/src/scalar_fns/encode.rs‎
Lines changed: 16 additions & 2 deletions b/‎vortex-turboquant/src/scalar_fns/encode.rs‎
Lines changed: 16 additions & 2 deletions
diff --git a/‎vortex-turboquant/src/tests/encode_decode.rs‎
Lines changed: 66 additions & 0 deletions b/‎vortex-turboquant/src/tests/encode_decode.rs‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎vortex-turboquant/src/tests/kernels.rs‎
Lines changed: 3 additions & 3 deletions b/‎vortex-turboquant/src/tests/kernels.rs‎
Lines changed: 3 additions & 3 deletions
@@ -19,22 +19,33 @@
 //! The [`TQEncode`] scalar function first computes and stores the original L2 norm for each vector
 //! row, then normalizes each valid nonzero row internally before SORF transform and scalar
 //! quantization. The [`TQDecode`] scalar function dequantizes through deterministic centroids,
-//! applies the inverse SORF transform, truncates back to the original dimension, and re-applies the
-//! stored norm.
+//! applies the inverse SORF transform, truncates back to the original dimension, and applies a
+//! stored inverse direction-norm correction before re-applying the stored norm.
 //!
 //! The encoded storage is a row-aligned extension tree:
 //!
 //! ```text
 //! Extension<TurboQuant>(
 //!     Struct {
 //!         norms: Primitive<element_ptype, vector_validity>,
+//!         inv_direction_norms: Primitive<f32, vector_validity>,
 //!         codes: FixedSizeList<Primitive<u8>, padded_dim, vector_validity>,
 //!     }
 //! )
 //! ```
 //!
-//! Stored norms are authoritative for future TurboQuant-aware scalar functions. Decoded quantized
-//! directions are not guaranteed to have unit norm after scalar quantization and inverse transform.
+//! Stored norms are authoritative for future TurboQuant-aware scalar functions. Scalar quantization
+//! perturbs the transformed unit vector, and inverse SORF plus truncation can leave the decoded
+//! quantized direction with a norm other than `1.0`. If decode only multiplied that direction by
+//! the original row norm, the `L2Norm(TQDecode(_))` fast path (which returns stored norms) would
+//! be inexact. TurboQuant therefore stores `inv_direction_norms = 1 / ||decoded_direction||` so
+//! decode can first renormalize the lossy quantized direction and then apply the original norm.
+//!
+//! Storing the correction also keeps future query kernels cheap. Inner product and cosine kernels can
+//! rotate a query once and gather against centroids directly; the per-row scale they need is already
+//! available as `norms * inv_direction_norms` for inner product and `inv_direction_norms` for cosine.
+//! Without this field, those kernels would have to recompute the inverse SORF/truncated norm per row
+//! or give up the `TQDecode` norm-preservation invariant.
 //!
 //! # Source map
 //!
 
@@ -38,13 +38,9 @@ pub(super) fn register(session: &VortexSession) {
 /// matches `ExactScalarFn<TQDecode>`. Returns `Ok(None)` for any other shape so the canonical
 /// `L2Norm` path runs unchanged.
 //
-// TODO(vortex-data/vortex#TODO): The TurboQuant storage `norms` field is pre-quantization — it
-// is the L2 norm of each original vector before SORF transform and scalar quantization. The
-// lossy contract (see `vortex-turboquant/src/lib.rs`) means decoded vectors are not guaranteed
-// to be unit-norm, so strictly `l2_norm(tq_decode(x))` may differ slightly from the stored
-// norm. We treat the stored norms as authoritative here for parity with the `L2Denorm` fast
-// path in `vortex-tensor/src/scalar_fns/l2_norm.rs`. A future fix should recompute norms
-// post-quantization.
+// Returning the stored norms is correct because TurboQuant stores per-row inverse direction norms
+// `TQDecode` applies that correction before re-applying the original row norm. In other words,
+// valid nonzero decoded rows preserve the stored L2 norm even though coordinates are lossy.
 fn l2_norm_tq_decode_execute_parent(
     child: &ArrayRef,
     parent: &ArrayRef,
 
@@ -153,9 +153,10 @@ impl ScalarFnVTable for TQDecode {
 
 /// Decode a `TurboQuant` extension array back into a `Vector` extension array.
 ///
-/// The decoded directions are inverse-transformed, truncated to the original dimension, and
-/// multiplied by the stored row norms. The conversion is lossy and does not roundtrip with
-/// [`TQEncode`](crate::TQEncode).
+/// The decoded directions are inverse-transformed, truncated to the original dimension, normalized
+/// by the stored inverse direction norms, and multiplied by the stored row norms. The conversion is
+/// lossy and does not roundtrip with [`TQEncode`](crate::TQEncode), but valid nonzero decoded rows
+/// preserve the original stored L2 norm.
 pub(crate) fn decode_vector(input: ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef> {
     let parsed = parse_storage(input, ctx)?;
     let metadata = parsed.metadata;
@@ -177,6 +178,7 @@ pub(crate) fn decode_vector(input: ArrayRef, ctx: &mut ExecutionCtx) -> VortexRe
                 sorf_matrix: &transform,
                 centroids: &centroids,
                 norms: &parsed.norms,
+                inv_direction_norms: &parsed.inv_direction_norms,
                 codes: &parsed.codes,
             },
             parsed.vector_validity,
@@ -208,6 +210,7 @@ struct DecodeInputs<'a> {
     sorf_matrix: &'a SorfMatrix,
     centroids: &'a [f32],
     norms: &'a PrimitiveArray,
+    inv_direction_norms: &'a PrimitiveArray,
     codes: &'a PrimitiveArray,
 }
 
@@ -226,6 +229,7 @@ where
     let padded_dim = decode.sorf_matrix.padded_dim();
     let centroids = decode.centroids;
     let norms = decode.norms.as_slice::<T>();
+    let inv_direction_norms = decode.inv_direction_norms.as_slice::<f32>();
     let codes = decode.codes.as_slice::<u8>();
     let mask = vector_validity.execute_mask(num_vectors, ctx)?;
 
@@ -249,11 +253,12 @@ where
         decode.sorf_matrix.inverse_transform(&decoded, &mut inverse);
 
         let norm = norms[i];
+        let inv_direction_norm = inv_direction_norms[i];
         for &value in inverse.iter().take(dimensions) {
             // `T::from_f32` is infallible for the supported float ptypes (`f16`, `f32`,
             // `f64`): values outside `f16` range saturate to `±inf` rather than returning
             // `None`.
-            let value = T::from_f32(value)
+            let value = T::from_f32(value * inv_direction_norm)
                 .vortex_expect("from_f32 is infallible for supported float types");
 
             // SAFETY: total pushes across all match arms equal `output_len`.
 
@@ -12,6 +12,7 @@ use vortex_array::IntoArray;
 use vortex_array::arrays::Extension;
 use vortex_array::arrays::ExtensionArray;
 use vortex_array::arrays::FixedSizeListArray;
+use vortex_array::arrays::PrimitiveArray;
 use vortex_array::arrays::ScalarFnArray;
 use vortex_array::arrays::extension::ExtensionArrayExt;
 use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
@@ -209,7 +210,14 @@ pub(crate) fn encode_vector(
         // SAFETY: `tq_normalize_as_l2_denorm` returned this normalized Vector child.
         unsafe { turboquant_quantize_core(&normalized_fsl, config, ctx)? }
     };
-    let codes = build_codes_child(num_vectors, core, vector_validity.clone())?;
+    let inv_direction_norms =
+        PrimitiveArray::new::<f32>(core.inv_direction_norms, vector_validity.clone()).into_array();
+    let codes = build_codes_child(
+        num_vectors,
+        core.all_indices,
+        core.padded_dim,
+        vector_validity.clone(),
+    )?;
 
     let metadata = TurboQuantMetadata {
         element_ptype,
@@ -218,7 +226,13 @@ pub(crate) fn encode_vector(
         seed: config.seed(),
         num_rounds: config.num_rounds(),
     };
-    let storage = build_storage(norms, codes, num_vectors, vector_validity)?;
+    let storage = build_storage(
+        norms,
+        inv_direction_norms,
+        codes,
+        num_vectors,
+        vector_validity,
+    )?;
 
     Ok(ExtensionArray::try_new_from_vtable(TurboQuant, metadata, storage)?.into_array())
 }
@@ -16,10 +16,12 @@ use vortex_array::dtype::PType;
 use vortex_array::validity::Validity;
 use vortex_buffer::Buffer;
 use vortex_error::VortexResult;
+use vortex_tensor::scalar_fns::l2_norm::L2Norm;
 
 use super::execute_tq_decode;
 use super::execute_tq_encode;
 use super::f32_vector_array;
+use super::tensor_test_session;
 use super::test_session;
 use super::turboquant_storage;
 use super::vector_array;
@@ -29,6 +31,7 @@ use super::vector_values_f32;
 use crate::TurboQuantConfig;
 use crate::centroids::compute_or_get_centroids;
 use crate::vector::normalize::tq_normalize_as_l2_denorm;
+use crate::vector::storage::parse_storage;
 
 #[rstest]
 #[case::zero_bits(0, 42, 3)]
@@ -105,6 +108,10 @@ fn encode_stores_norms_and_struct_validity() -> VortexResult<()> {
         .unmasked_field_by_name("norms")?
         .clone()
         .execute(&mut ctx)?;
+    let inv_direction_norms: PrimitiveArray = storage
+        .unmasked_field_by_name("inv_direction_norms")?
+        .clone()
+        .execute(&mut ctx)?;
     let codes: FixedSizeListArray = storage
         .unmasked_field_by_name("codes")?
         .clone()
@@ -114,13 +121,21 @@ fn encode_stores_norms_and_struct_validity() -> VortexResult<()> {
     assert!(!mask.value(1));
     assert!(mask.value(2));
     assert_eq!(norms.validity()?.nullability(), Nullability::Nullable);
+    assert_eq!(
+        inv_direction_norms.validity()?.nullability(),
+        Nullability::Nullable
+    );
     assert_eq!(codes.validity()?.nullability(), Nullability::Nullable);
 
     let norms_validity = norms.validity()?.execute_mask(3, &mut ctx)?;
+    let inv_direction_norms_validity = inv_direction_norms.validity()?.execute_mask(3, &mut ctx)?;
     let codes_validity = codes.validity()?.execute_mask(3, &mut ctx)?;
     assert!(norms_validity.value(0));
     assert!(!norms_validity.value(1));
     assert!(norms_validity.value(2));
+    assert!(inv_direction_norms_validity.value(0));
+    assert!(!inv_direction_norms_validity.value(1));
+    assert!(inv_direction_norms_validity.value(2));
     assert!(codes_validity.value(0));
     assert!(!codes_validity.value(1));
     assert!(codes_validity.value(2));
@@ -134,6 +149,57 @@ fn encode_stores_norms_and_struct_validity() -> VortexResult<()> {
     Ok(())
 }
 
+#[test]
+fn encode_stores_zero_inv_direction_norm_for_zero_rows() -> VortexResult<()> {
+    let session = test_session();
+    let mut ctx = session.create_execution_ctx();
+    let mut values = vec![0.0f32; 3 * 128];
+    values[0] = 3.0;
+    values[1] = 4.0;
+    values[256] = 1.0;
+    let input = vector_array(128, &values, Validity::NonNullable)?;
+
+    let encoded = execute_tq_encode(input, &TurboQuantConfig::default(), &mut ctx)?;
+    let storage = turboquant_storage(encoded, &mut ctx)?;
+    let inv_direction_norms: PrimitiveArray = storage
+        .unmasked_field_by_name("inv_direction_norms")?
+        .clone()
+        .execute(&mut ctx)?;
+
+    let values = inv_direction_norms.as_slice::<f32>();
+    assert!(values[0].is_finite() && values[0] > 0.0);
+    assert_eq!(values[1], 0.0);
+    assert!(values[2].is_finite() && values[2] > 0.0);
+    Ok(())
+}
+
+#[test]
+fn decode_preserves_original_l2_norms_for_non_power_of_two_dimensions() -> VortexResult<()> {
+    let session = tensor_test_session();
+    let mut ctx = session.create_execution_ctx();
+    let input = f32_vector_array(129, 3, 0.25, Validity::NonNullable)?;
+    let config = TurboQuantConfig::try_new(3, 42, 3)?;
+
+    let encoded = execute_tq_encode(input, &config, &mut ctx)?;
+    let expected_norms = parse_storage(encoded.clone(), &mut ctx)?.norms;
+    let decoded = execute_tq_decode(encoded, &mut ctx)?;
+    let decoded_norms: PrimitiveArray = L2Norm::try_new_array(decoded, 3)?
+        .into_array()
+        .execute(&mut ctx)?;
+
+    for (actual, expected) in decoded_norms
+        .as_slice::<f32>()
+        .iter()
+        .zip(expected_norms.as_slice::<f32>())
+    {
+        assert!(
+            (*actual - *expected).abs() <= 1e-4 * expected.max(1.0),
+            "decoded norm {actual} did not match stored norm {expected}"
+        );
+    }
+    Ok(())
+}
+
 #[test]
 fn normalize_as_l2_denorm_preserves_child_validity() -> VortexResult<()> {
     let session = test_session();
 
@@ -30,9 +30,9 @@ const DIM: u32 = 128;
 
 /// Fast path: `L2Norm(TQDecode(tq_arr))` returns the storage `norms` field bit-for-bit.
 ///
-/// The slow path would recompute norms from lossily decoded vectors, which only approximately
-/// match the stored norms. Bit-exact equality is the strongest invariant that confirms the
-/// session-registered kernel fired.
+/// `TQDecode` applies the stored inverse direction-norm correction, so decoded vectors preserve
+/// these norms. Bit-exact equality is the strongest invariant that confirms the session-registered
+/// kernel fired instead of recomputing.
 #[test]
 fn l2_norm_over_tq_decode_returns_stored_norms() -> VortexResult<()> {
     let session = tensor_test_session();