 //! [`turboquant_pack()`]: crate::turboquant_pack
 //! [`turboquant_unpack()`]: crate::turboquant_unpack
 //!
-//! The full packed tree is:
+//! The packed storage is a row-aligned extension tree:
 //!
 //! ```text
 //! Extension<TurboQuant>(
 //!     Struct {
-//!         norms: Primitive<element_ptype>,
-//!         codes: FixedSizeList<Primitive<u8>, padded_dim>,
+//!         norms: Primitive<element_ptype, row_validity>,
+//!         codes: FixedSizeList<Primitive<u8>, padded_dim, row_validity>,
 //!     }
 //! )
 //! ```
 //!
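The layout above implies a simple per-vector footprint: `padded_dim` u8 codes plus one stored norm in the element float type, with non-power-of-2 dimensions padded up for the structured rotation. A minimal sketch of that arithmetic, assuming a hypothetical helper name (`packed_bytes_per_vector` is illustrative, not a Vortex API):

```rust
// Sketch: logical per-vector size of the packed layout, before the general
// Vortex compressor sees the child arrays. `packed_bytes_per_vector` is a
// hypothetical helper for illustration only.
fn packed_bytes_per_vector(dim: usize, norm_bytes: usize) -> usize {
    // Non-power-of-2 dimensions are padded to the next power of 2 for the
    // structured rotation, which affects the storage size.
    let padded_dim = dim.next_power_of_two();
    // One u8 quantized code per padded coordinate, plus one stored norm.
    padded_dim + norm_bytes
}

fn main() {
    // A 768-dim f32 embedding pads to 1024 codes plus a 4-byte norm.
    assert_eq!(packed_bytes_per_vector(768, 4), 1028);
    // Power-of-2 dimensions need no padding.
    assert_eq!(packed_bytes_per_vector(1024, 4), 1028);
    println!("ok");
}
```

Physical compression of the `norms` and `codes` children is left to the normal Vortex compressor, so this is the logical size, not the on-disk size.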
-//! Row validity is stored on the `StructArray`, preserving original vector nulls. The `norms` and
-//! `codes` children are non-nullable and may contain deterministic placeholder values for null
-//! rows. Centroids are not stored; they are deterministically derived from the padded dimension and
-//! bit width, and cached process-locally.
-//!
 //! Stored norms are authoritative for future TurboQuant-aware scalar functions. Decoded quantized
 //! directions are not guaranteed to have unit norm after scalar quantization and inverse rotation.
 //!
-//! The TurboQuant paper analyzes a full random orthogonal rotation. The current Vortex
-//! implementation instead uses a fixed 3-round Walsh-Hadamard-based structured transform with
-//! random sign diagonals generated by Vortex's frozen local SplitMix64 stream. This is a practical
-//! approximation chosen for encode/decode efficiency, and should be understood as an
-//! implementation choice rather than the exact construction used in the paper's proofs.
-//!
-//! The current encoding is also intentionally MSE-only. It does not yet implement the paper's QJL
-//! residual correction for unbiased inner-product estimation, and it still uses internal
-//! power-of-2 padding rather than the block decomposition proposed in RFC 0033.
-//!
-//! # Theoretical error bounds
+//! # Source map
 //!
-//! For unit-norm vectors quantized at `b` bits per coordinate, the paper's Theorem 1
-//! guarantees normalized MSE distortion:
+//! Implementation details are documented next to the code that owns them:
 //!
-//! > `E[||x - x_hat||² / ||x||²] <= (√3 · π / 2) / 4^b`
+//! - `vector/storage.rs`: physical storage shape, full-length child arrays, and field-level
+//!   validity for null vectors.
+//! - `vector/normalize.rs`: TurboQuant-local normalization and how it differs from the tensor
+//!   crate's null-row zeroing helper.
+//! - `vector/quantize.rs`: SORF rotation, centroid lookup, and why invalid rows are skipped rather
+//!   than quantized.
+//! - `centroids.rs`: deterministic Max-Lloyd centroid computation and process-local caching.
+//! - `sorf/`: the Walsh-Hadamard-based structured rotation and the stable SplitMix64 sign stream.
 //!
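The rotation the removed paragraph describes (a fixed number of rounds, each a random ±1 sign diagonal followed by a Walsh-Hadamard transform, with signs drawn from a SplitMix64 stream) can be sketched in plain Rust. The SplitMix64 constants below are the standard published ones; the seed, the 3-round count, and the exact round structure of the real `sorf/` module are assumptions here, not its actual parameters:

```rust
// Sketch of a SORF-style structured rotation: per round, flip signs with a
// SplitMix64-derived diagonal, then apply a normalized Walsh-Hadamard
// transform. Seed and round structure are illustrative assumptions.

/// Standard SplitMix64 step: advances `state` and returns the next u64.
fn splitmix64(state: &mut u64) -> u64 {
    *state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
    let mut z = *state;
    z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    z ^ (z >> 31)
}

/// In-place fast Walsh-Hadamard transform; `v.len()` must be a power of two.
fn fwht(v: &mut [f32]) {
    let n = v.len();
    debug_assert!(n.is_power_of_two());
    let mut h = 1;
    while h < n {
        for i in (0..n).step_by(2 * h) {
            for j in i..i + h {
                let (a, b) = (v[j], v[j + h]);
                v[j] = a + b;
                v[j + h] = a - b;
            }
        }
        h *= 2;
    }
}

/// Hypothetical 3-round rotation over an already power-of-2-padded vector.
fn sorf_rotate(v: &mut [f32], seed: u64) {
    let mut state = seed;
    let scale = 1.0 / (v.len() as f32).sqrt();
    for _ in 0..3 {
        // Random ±1 sign diagonal: one stream bit per coordinate.
        for x in v.iter_mut() {
            if splitmix64(&mut state) & 1 == 1 {
                *x = -*x;
            }
        }
        fwht(v);
        // 1/sqrt(n) makes each round orthogonal, preserving the L2 norm.
        for x in v.iter_mut() {
            *x *= scale;
        }
    }
}

fn main() {
    let mut v = vec![1.0_f32, -2.0, 0.5, 3.0, 0.0, 0.0, 0.0, 0.0];
    let before: f32 = v.iter().map(|x| x * x).sum();
    sorf_rotate(&mut v, 42);
    let after: f32 = v.iter().map(|x| x * x).sum();
    // Orthogonal transform: squared norm preserved up to float rounding.
    assert!((before - after).abs() < 1e-4);
    println!("ok");
}
```

Because every round is orthogonal, the rotation preserves vector norms exactly in real arithmetic, which is why the stored norm can be taken before or after rotation without changing its value.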
-//! | Bits | MSE bound | Quality        |
-//! |------|-----------|----------------|
-//! | 1    | 6.80e-01  | Poor           |
-//! | 2    | 1.70e-01  | Usable for ANN |
-//! | 3    | 4.25e-02  | Good           |
-//! | 4    | 1.06e-02  | Very good      |
-//! | 5    | 2.66e-03  | Excellent      |
-//! | 6    | 6.64e-04  | Near-lossless  |
-//! | 7    | 1.66e-04  | Near-lossless  |
-//! | 8    | 4.15e-05  | Near-lossless  |
-//!
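The removed table is just the Theorem 1 bound `(√3 · π / 2) / 4^b` evaluated per bit width; a quick check confirms its entries (the tolerances below are mine, chosen to match the table's two significant figures):

```rust
// Verify the removed MSE-bound table against Theorem 1's expression
// (sqrt(3) * pi / 2) / 4^b for b bits per coordinate.
fn mse_bound(bits: u32) -> f64 {
    (3.0_f64.sqrt() * std::f64::consts::PI / 2.0) / 4.0_f64.powi(bits as i32)
}

fn main() {
    // Spot-check a few table rows at two significant figures.
    assert!((mse_bound(1) - 6.80e-1).abs() < 5e-3);
    assert!((mse_bound(4) - 1.06e-2).abs() < 5e-5);
    assert!((mse_bound(8) - 4.15e-5).abs() < 5e-7);
    for b in 1..=8 {
        println!("{b} bits: {:.2e}", mse_bound(b));
    }
}
```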
-//! # Storage notes
-//!
-//! Each vector is logically stored as `padded_dim` u8 quantized codes plus one stored norm in the
-//! vector's element float type. Non-power-of-2 dimensions are padded to the next power of 2 for
-//! the structured rotation, which affects the storage size. Physical compression of those child
-//! arrays is left to the normal Vortex compressor rather than implemented as a TurboQuant-specific
-//! compressor scheme.
+//! The current encoding is intentionally MSE-only. It does not yet implement the paper's QJL
+//! residual correction for unbiased inner-product estimation, and it still uses internal
+//! power-of-2 padding rather than the block decomposition proposed in RFC 0033.
 
 mod centroids;
 mod config;