vortex-data
diff --git a/vortex-tensor/src/encodings/turboquant/array/data.rs b/vortex-tensor/src/encodings/turboquant/array/data.rs
Lines changed: 19 additions & 18 deletions
diff --git a/vortex-tensor/src/encodings/turboquant/compress.rs b/vortex-tensor/src/encodings/turboquant/compress.rs
Lines changed: 21 additions & 15 deletions
diff --git a/vortex-tensor/src/encodings/turboquant/decompress.rs b/vortex-tensor/src/encodings/turboquant/decompress.rs
Lines changed: 9 additions & 5 deletions
@@ -37,13 +37,15 @@ pub struct TurboQuantData {
 
     /// Child arrays stored as slots. See [`Slot`] for positions:
     ///
-    /// - [`Codes`](Slot::Codes): `FixedSizeListArray<u8>` with `list_size == padded_dim`. Each row
-    ///   holds one u8 centroid index per padded coordinate. The cascade compressor handles packing
-    ///   to the actual `bit_width` on disk. The validity of the entire array is stored with this.
+    /// - [`Codes`](Slot::Codes): Non-nullable `FixedSizeListArray<u8>` with
+    ///   `list_size == padded_dim`. Each row holds one u8 centroid index per padded coordinate.
+    ///   Null vectors are represented by all-zero codes. The cascade compressor handles packing
+    ///   to the actual `bit_width` on disk.
     ///
     /// - [`Norms`](Slot::Norms): Per-vector L2 norms, one per row. The dtype matches the element
-    ///   type of the Vector (e.g., f64 norms for f64 vectors). Exact norms are stored during
-    ///   compression, enabling O(1) L2 norm readthrough without decompression.
+    ///   type of the Vector (e.g., f64 norms for f64 vectors) and carries the nullability of the
+    ///   parent dtype. Null vectors have null norms, so this child determines the validity of the
+    ///   entire array. Exact norms give O(1) L2 norm readthrough without decompression.
     ///
     /// - [`Centroids`](Slot::Centroids): `PrimitiveArray<f32>` codebook with `2^bit_width` entries
     ///   that is shared across all rows. We always store these as f32 regardless of the input
@@ -101,10 +103,11 @@ impl TurboQuantData {
     ///
     /// - `dtype` is a [`Vector`](crate::vector::Vector) extension type whose storage list size
     ///   is >= 3.
-    /// - `codes` is a `FixedSizeListArray<u8>` with `list_size == padded_dim` and
-    ///   `codes.len() == norms.len()`.
+    /// - `codes` is a non-nullable `FixedSizeListArray<u8>` with `list_size == padded_dim` and
+    ///   `codes.len() == norms.len()`. Null vectors are represented by all-zero codes.
     /// - `norms` is a primitive array whose ptype matches the element type of the Vector's storage
-    ///   dtype. This must match the validity of the `codes` array.
+    ///   dtype. The nullability must match `dtype.nullability()`. Norms carry the validity of the
+    ///   entire array, since null vectors have null norms.
     /// - `centroids` is a non-nullable `PrimitiveArray<f32>` whose length is a power of 2 in
     ///   `[2, 256]` (i.e., `2^bit_width` for bit_width 1-8), or empty for degenerate arrays.
     /// - `rotation_signs` has `3 * padded_dim` elements, or is empty for degenerate arrays.
@@ -166,11 +169,12 @@ impl TurboQuantData {
         let dimension = extension_list_size(ext)?;
         let padded_dim = dimension.next_power_of_two();
 
-        // Codes must be a FixedSizeList<u8> with list_size == padded_dim.
+        // Codes must be a non-nullable FixedSizeList<u8> with list_size == padded_dim.
+        // Null vectors are represented by all-zero codes since validity lives in the norms array.
         let expected_codes_dtype = DType::FixedSizeList(
-            Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)), // FIX THIS!!!
+            Arc::new(DType::Primitive(PType::U8, Nullability::NonNullable)),
             padded_dim,
-            dtype.nullability(),
+            Nullability::NonNullable,
         );
         vortex_ensure_eq!(
             *codes.dtype(),
@@ -185,10 +189,6 @@ impl TurboQuantData {
             "norms length must match codes length",
         );
 
-        // TODO(connor): Should we check that the codes and norms have the same validity? We could
-        // also make it so that norms holds the validity and any null vectors encoded as codes is
-        // just 0...
-
         // Degenerate (empty) case: all children must be empty, and bit_width is 0.
         if num_rows == 0 {
             vortex_ensure!(
@@ -219,13 +219,14 @@ impl TurboQuantData {
             "derived bit_width must be 1-8, got {bit_width}"
         );
 
-        // Norms dtype must match the element ptype of the Vector.
+        // Norms dtype must match the element ptype of the Vector, with the parent's nullability.
+        // Norms carry the validity of the entire TurboQuant array.
         let element_ptype = extension_element_ptype(ext)?;
-        let expected_norms_dtype = DType::Primitive(element_ptype, Nullability::NonNullable); // FIX THIS!!!
+        let expected_norms_dtype = DType::Primitive(element_ptype, dtype.nullability());
         vortex_ensure_eq!(
             *norms.dtype(),
             expected_norms_dtype,
-            "norms dtype does not match expected (must match Vector element type)",
+            "norms dtype does not match expected {expected_norms_dtype}",
         );
 
         // Centroids are always f32 regardless of element type.
 
@@ -76,7 +76,8 @@ struct QuantizationResult {
     rotation: RotationMatrix,
     centroids: Vec<f32>,
     all_indices: BufferMut<u8>,
-    /// Native-precision norms (matching the Vector element type).
+    /// Native-precision norms (matching the Vector element type). Carries validity: null vectors
+    /// have null norms.
     norms_array: ArrayRef,
     padded_dim: usize,
 }
@@ -85,19 +86,22 @@ struct QuantizationResult {
 /// normalize/rotate/quantize all rows.
 ///
 /// Norms are computed in the native element precision via the [`L2Norm`] scalar function.
-/// The rotation and centroid lookup happen in f32.
+/// The rotation and centroid lookup happen in f32. Null rows (per the input validity) produce
+/// all-zero codes.
 #[allow(clippy::cast_possible_truncation)]
 fn turboquant_quantize_core(
     ext: &ExtensionArray,
     fsl: &FixedSizeListArray,
     seed: u64,
     bit_width: u8,
+    validity: &Validity,
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<QuantizationResult> {
     let dimension = fsl.list_size() as usize;
     let num_rows = fsl.len();
 
-    // Compute native-precision norms via the L2Norm scalar fn.
+    // Compute native-precision norms via the L2Norm scalar fn. L2Norm propagates validity from
+    // the input, so null vectors get null norms automatically.
     let norms_sfn = L2Norm::try_new_array(&ApproxOptions::Exact, ext.as_ref().clone(), num_rows)?;
     let norms_array: ArrayRef = norms_sfn.into_array().execute(ctx)?;
     let norms_prim: PrimitiveArray = norms_array.to_canonical()?.into_primitive();
@@ -125,6 +129,12 @@ fn turboquant_quantize_core(
 
     let f32_slice = f32_elements.as_slice::<f32>();
     for row in 0..num_rows {
+        // Null vectors get all-zero codes.
+        if !validity.is_valid(row)? {
+            all_indices.extend(std::iter::repeat_n(0u8, padded_dim));
+            continue;
+        }
+
         let x = &f32_slice[row * dimension..(row + 1) * dimension];
         let norm = f32_norms[row];
 
@@ -189,12 +199,10 @@ fn build_turboquant(
     )
 }
 
-/// Encode a [`Vector`] extension array into a `TurboQuantArray`.
-///
-/// The input must be a non-nullable [`Vector`] extension array. TurboQuant is a lossy encoding
-/// that does not preserve null positions; callers must handle validity externally.
+/// Encode a [`Vector`](crate::vector::Vector) extension array into a `TurboQuantArray`.
 ///
-/// [`Vector`]: crate::vector::Vector
+/// Nullable inputs are supported: null vectors get all-zero codes and null norms. The validity
+/// of the resulting TurboQuant array is carried by the norms child.
 pub fn turboquant_encode(
     ext: &ExtensionArray,
     config: &TurboQuantConfig,
@@ -204,10 +212,6 @@ pub fn turboquant_encode(
     let storage = ext.storage_array();
     let fsl = storage.to_canonical()?.into_fixed_size_list();
 
-    vortex_ensure!(
-        fsl.dtype().nullability() == Nullability::NonNullable,
-        "TurboQuant requires non-nullable input, got nullable FixedSizeListArray"
-    );
     vortex_ensure!(
         config.bit_width >= 1 && config.bit_width <= 8,
         "bit_width must be 1-8, got {}",
@@ -228,10 +232,11 @@ pub fn turboquant_encode(
             0,
         )?;
 
-        // Norms dtype matches the element type.
+        // Norms dtype matches the element type and carries the parent's nullability.
         let element_ptype = fsl.elements().dtype().as_ptype();
+        let norms_nullability = ext_dtype.nullability();
         let empty_norms: ArrayRef = match_each_float_ptype!(element_ptype, |T| {
-            PrimitiveArray::empty::<T>(Nullability::NonNullable).into_array()
+            PrimitiveArray::empty::<T>(norms_nullability).into_array()
         });
 
         let empty_centroids = PrimitiveArray::empty::<f32>(Nullability::NonNullable);
@@ -246,8 +251,9 @@ pub fn turboquant_encode(
         .into_array());
     }
 
+    let validity = ext.as_ref().validity()?;
     let seed = config.seed.unwrap_or(42);
-    let core = turboquant_quantize_core(ext, &fsl, seed, config.bit_width, ctx)?;
+    let core = turboquant_quantize_core(ext, &fsl, seed, config.bit_width, &validity, ctx)?;
 
     Ok(build_turboquant(&fsl, core, ext_dtype)?.into_array())
 }
 
@@ -13,6 +13,7 @@ use vortex_array::arrays::ExtensionArray;
 use vortex_array::arrays::FixedSizeListArray;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::dtype::NativePType;
+use vortex_array::dtype::Nullability;
 use vortex_array::match_each_float_ptype;
 use vortex_array::validity::Validity;
 use vortex_buffer::BufferMut;
@@ -39,15 +40,17 @@ pub fn execute_decompress(
     let element_ptype = extension_element_ptype(&ext_dtype)?;
 
     if num_rows == 0 {
-        let nn = vortex_array::dtype::Nullability::NonNullable;
+        let fsl_validity = Validity::from(ext_dtype.storage_dtype().nullability());
+
         match_each_float_ptype!(element_ptype, |T| {
-            let elements = PrimitiveArray::empty::<T>(nn);
+            let elements = PrimitiveArray::empty::<T>(Nullability::NonNullable);
             let fsl = FixedSizeListArray::try_new(
                 elements.into_array(),
                 array.dimension(),
-                Validity::NonNullable,
+                fsl_validity,
                 0,
             )?;
+
             return Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array());
         })
     }
@@ -70,8 +73,9 @@ pub fn execute_decompress(
     let codes_prim = codes_fsl.elements().to_canonical()?.into_primitive();
     let indices = codes_prim.as_slice::<u8>();
 
-    // Read norms in their native precision.
+    // Read norms in their native precision. Norms carry the validity of the array.
     let norms_prim = array.norms().clone().execute::<PrimitiveArray>(ctx)?;
+    let output_validity = array.norms().validity()?;
 
     // MSE decode: dequantize (f32) -> inverse rotate (f32) -> scale by norm -> cast to T.
     // The rotation and centroid lookup always happen in f32. The final output is cast to the
@@ -90,7 +94,7 @@ pub fn execute_decompress(
             let fsl = FixedSizeListArray::try_new(
                 elements.into_array(),
                 array.dimension(),
-                Validity::NonNullable,
+                output_validity,
                 num_rows,
             )?;
             Ok(ExtensionArray::new(ext_dtype, fsl.into_array()).into_array())