Revert "share rotation matrix between MSE and QJL"

lwwmanning · lwwmanning · commit f978d7b936a9 · 2026-03-30T17:55:26.000-04:00
This reverts commit 0c5e8e73af9afc001e20405c91d11d59a8129796.

Signed-off-by: Will Manning <will@willmanning.io>
diff --git a/encodings/turboquant/src/array.rs b/encodings/turboquant/src/array.rs
@@ -37,17 +37,15 @@ pub struct TurboQuantMetadata {
 }
 
 /// Optional QJL (Quantized Johnson-Lindenstrauss) correction for unbiased
-/// inner product estimation. When present, adds 2 additional children.
-///
-/// The QJL correction reuses the MSE rotation matrix (stored in `rotation_signs`)
-/// rather than maintaining a separate rotation. This halves the rotation sign
-/// storage and avoids reconstructing a second `RotationMatrix` at decode time.
+/// inner product estimation. When present, adds 3 additional children.
 #[derive(Clone, Debug)]
 pub struct QjlCorrection {
-    /// Sign bits: `BitPackedArray` (1-bit), length `num_rows * padded_dim`.
+    /// Sign bits: 1-bit `BitPackedArray` (u8 0/1), length `num_rows * padded_dim`.
     pub(crate) signs: ArrayRef,
     /// Residual norms: `PrimitiveArray<f32>`, length `num_rows`.
     pub(crate) residual_norms: ArrayRef,
+    /// QJL rotation signs: 1-bit `BitPackedArray` (u8 0/1), length `3 * padded_dim` (inverse order).
+    pub(crate) rotation_signs: ArrayRef,
 }
 
 impl QjlCorrection {
@@ -60,6 +58,11 @@ impl QjlCorrection {
     pub fn residual_norms(&self) -> &ArrayRef {
         &self.residual_norms
     }
+
+    /// The QJL rotation signs (1-bit `BitPackedArray`, inverse application order).
+    pub fn rotation_signs(&self) -> &ArrayRef {
+        &self.rotation_signs
+    }
 }
 
 /// TurboQuant array.
@@ -68,11 +71,12 @@ impl QjlCorrection {
 /// - 0: `codes` — `BitPackedArray` or `PrimitiveArray<u8>` (quantized indices)
 /// - 1: `norms` — `PrimitiveArray<f32>` (one per vector row)
 /// - 2: `centroids` — `PrimitiveArray<f32>` (codebook, length 2^bit_width)
-/// - 3: `rotation_signs` — `BitPackedArray` (3 * padded_dim, 1-bit u8 0/1, inverse order)
+/// - 3: `rotation_signs` — `BitPackedArray` (3 * padded_dim, 1-bit u8 0/1, inverse application order)
 ///
 /// Optional QJL children (when `has_qjl` is true):
-/// - 4: `qjl_signs` — `BitPackedArray` (num_rows * padded_dim, 1-bit u8 0/1)
+/// - 4: `qjl_signs` — `BitPackedArray` (num_rows * padded_dim, 1-bit u8 0/1, one sign per projected coordinate)
 /// - 5: `qjl_residual_norms` — `PrimitiveArray<f32>` (one per row)
+/// - 6: `qjl_rotation_signs` — `BitPackedArray` (3 * padded_dim, 1-bit u8 0/1, QJL rotation, inverse order)
 #[derive(Clone, Debug)]
 pub struct TurboQuantArray {
     pub(crate) dtype: DType,
diff --git a/encodings/turboquant/src/compress.rs b/encodings/turboquant/src/compress.rs
@@ -237,9 +237,9 @@ pub fn turboquant_encode_qjl(
     let core = turboquant_quantize_core(fsl, seed, mse_bit_width)?;
     let padded_dim = core.padded_dim;
 
-    // QJL reuses the MSE rotation matrix. This saves one stored rotation child
-    // and one RotationMatrix reconstruction at decode time. Empirically verified
-    // via the qjl_inner_product_bias test suite to not introduce significant bias.
+    // QJL uses a different rotation than the MSE stage to ensure statistical
+    // independence between the quantization noise and the sign projection.
+    let qjl_rotation = RotationMatrix::try_new(seed.wrapping_add(25), dim)?;
 
     let num_rows = fsl.len();
     let mut residual_norms_buf = BufferMut::<f32>::with_capacity(num_rows);
@@ -281,9 +281,9 @@ pub fn turboquant_encode_qjl(
             let residual_norm = l2_norm(&residual[..dim]);
             residual_norms_buf.push(residual_norm);
 
-            // QJL: sign(S · r), reusing the MSE rotation S.
+            // QJL: sign(S · r).
             if residual_norm > 0.0 {
-                core.rotation.rotate(&residual, &mut projected);
+                qjl_rotation.rotate(&residual, &mut projected);
             } else {
                 projected.fill(0.0);
             }
@@ -297,16 +297,17 @@ pub fn turboquant_encode_qjl(
     // Build the MSE part.
     let mut array = build_turboquant_mse(fsl, core, mse_bit_width)?;
 
-    // Attach QJL correction. The QJL reuses the MSE rotation matrix (already
-    // stored as rotation_signs), so we only need to store signs and residual norms.
+    // Attach QJL correction.
     let residual_norms_array =
         PrimitiveArray::new::<f32>(residual_norms_buf.freeze(), Validity::NonNullable);
     let qjl_signs_prim = PrimitiveArray::new::<u8>(qjl_sign_u8.freeze(), Validity::NonNullable);
     let qjl_signs_packed = bitpack_encode(&qjl_signs_prim, 1, None)?.into_array();
+    let qjl_rotation_signs = bitpack_rotation_signs(&qjl_rotation)?;
 
     array.qjl = Some(QjlCorrection {
         signs: qjl_signs_packed,
         residual_norms: residual_norms_array.into_array(),
+        rotation_signs: qjl_rotation_signs,
     });
 
     Ok(array.into_array())
diff --git a/encodings/turboquant/src/decompress.rs b/encodings/turboquant/src/decompress.rs
@@ -28,9 +28,8 @@ fn qjl_correction_scale(padded_dim: usize) -> f32 {
 /// Decompress a `TurboQuantArray` into a `FixedSizeListArray` of floats.
 ///
 /// Reads stored centroids and rotation signs from the array's children,
-/// avoiding any recomputation. If QJL correction is present, the MSE decode
-/// and QJL correction are fused into a single pass over rows to avoid an
-/// intermediate buffer allocation and extra memory traffic.
+/// avoiding any recomputation. If QJL correction is present, applies
+/// the residual correction after MSE decoding.
 pub fn execute_decompress(
     array: TurboQuantArray,
     ctx: &mut ExecutionCtx,
@@ -55,7 +54,8 @@ pub fn execute_decompress(
     let centroids = centroids_prim.as_slice::<f32>();
 
     // FastLanes SIMD-unpacks the 1-bit bitpacked rotation signs into u8 0/1 values,
-    // then we expand to u32 XOR masks once (amortized over all rows).
+    // then we expand to u32 XOR masks once (amortized over all rows). This enables
+    // branchless XOR-based sign application in the per-row SRHT hot loop.
     let signs_prim = array
         .rotation_signs
         .clone()
@@ -69,57 +69,73 @@ pub fn execute_decompress(
     let norms_prim = array.norms.clone().execute::<PrimitiveArray>(ctx)?;
     let norms = norms_prim.as_slice::<f32>();
 
-    // Prepare QJL data (if present) before entering the row loop.
-    // QJL reuses the MSE rotation matrix — no separate rotation to reconstruct.
-    let qjl_scale = qjl_correction_scale(padded_dim);
-    let qjl_data = if let Some(qjl) = &array.qjl {
-        let qjl_signs_prim = qjl.signs.clone().execute::<PrimitiveArray>(ctx)?;
-        let residual_norms_prim = qjl.residual_norms.clone().execute::<PrimitiveArray>(ctx)?;
-        Some((qjl_signs_prim, residual_norms_prim))
-    } else {
-        None
-    };
-
-    // Single fused loop: MSE decode + optional QJL correction per row.
-    let mut output = BufferMut::<f32>::with_capacity(num_rows * dim);
+    // MSE decode: dequantize → inverse rotate → scale by norm.
+    let mut mse_output = BufferMut::<f32>::with_capacity(num_rows * dim);
     let mut dequantized = vec![0.0f32; padded_dim];
     let mut unrotated = vec![0.0f32; padded_dim];
-    // QJL scratch buffers (only used when qjl_data is Some).
-    let mut qjl_signs_vec = vec![0.0f32; padded_dim];
-    let mut qjl_projected = vec![0.0f32; padded_dim];
 
     for row in 0..num_rows {
         let row_indices = &indices[row * padded_dim..(row + 1) * padded_dim];
         let norm = norms[row];
 
-        // MSE: dequantize → inverse rotate → scale by norm.
         for idx in 0..padded_dim {
             dequantized[idx] = centroids[row_indices[idx] as usize];
         }
+
         rotation.inverse_rotate(&dequantized, &mut unrotated);
+
         for idx in 0..dim {
             unrotated[idx] *= norm;
         }
 
-        if let Some((ref qjl_signs_prim, ref residual_norms_prim)) = qjl_data {
-            // QJL: apply residual correction inline, reusing the MSE rotation.
-            let qjl_signs_u8 = qjl_signs_prim.as_slice::<u8>();
-            let residual_norms = residual_norms_prim.as_slice::<f32>();
-            let residual_norm = residual_norms[row];
-
-            let row_signs = &qjl_signs_u8[row * padded_dim..(row + 1) * padded_dim];
-            for idx in 0..padded_dim {
-                qjl_signs_vec[idx] = if row_signs[idx] != 0 { 1.0 } else { -1.0 };
-            }
-
-            rotation.inverse_rotate(&qjl_signs_vec, &mut qjl_projected);
-            let scale = qjl_scale * residual_norm;
-
-            for idx in 0..dim {
-                output.push(unrotated[idx] + scale * qjl_projected[idx]);
-            }
-        } else {
-            output.extend_from_slice(&unrotated[..dim]);
+        mse_output.extend_from_slice(&unrotated[..dim]);
+    }
+
+    // If no QJL correction, we're done.
+    let Some(qjl) = &array.qjl else {
+        let elements = PrimitiveArray::new::<f32>(mse_output.freeze(), Validity::NonNullable);
+        return Ok(FixedSizeListArray::try_new(
+            elements.into_array(),
+            array.dimension(),
+            Validity::NonNullable,
+            num_rows,
+        )?
+        .into_array());
+    };
+
+    // Apply QJL residual correction.
+    // FastLanes SIMD-unpacks the 1-bit bitpacked QJL signs into u8 0/1 values.
+    let qjl_signs_prim = qjl.signs.clone().execute::<PrimitiveArray>(ctx)?;
+    let qjl_signs_u8 = qjl_signs_prim.as_slice::<u8>();
+
+    let residual_norms_prim = qjl.residual_norms.clone().execute::<PrimitiveArray>(ctx)?;
+    let residual_norms = residual_norms_prim.as_slice::<f32>();
+
+    let qjl_rot_signs_prim = qjl.rotation_signs.clone().execute::<PrimitiveArray>(ctx)?;
+    let qjl_rot = RotationMatrix::from_u8_slice(qjl_rot_signs_prim.as_slice::<u8>(), dim)?;
+
+    let qjl_scale = qjl_correction_scale(padded_dim);
+    let mse_elements = mse_output.as_ref();
+
+    let mut output = BufferMut::<f32>::with_capacity(num_rows * dim);
+    let mut qjl_signs_vec = vec![0.0f32; padded_dim];
+    let mut qjl_projected = vec![0.0f32; padded_dim];
+
+    for row in 0..num_rows {
+        let mse_row = &mse_elements[row * dim..(row + 1) * dim];
+        let residual_norm = residual_norms[row];
+
+        // Convert u8 0/1 → f32 ±1.0 for this row's signs.
+        let row_signs = &qjl_signs_u8[row * padded_dim..(row + 1) * padded_dim];
+        for idx in 0..padded_dim {
+            qjl_signs_vec[idx] = if row_signs[idx] != 0 { 1.0 } else { -1.0 };
+        }
+
+        qjl_rot.inverse_rotate(&qjl_signs_vec, &mut qjl_projected);
+        let scale = qjl_scale * residual_norm;
+
+        for idx in 0..dim {
+            output.push(mse_row[idx] + scale * qjl_projected[idx]);
         }
     }
 
diff --git a/encodings/turboquant/src/vtable.rs b/encodings/turboquant/src/vtable.rs
@@ -42,7 +42,7 @@ use crate::array::TurboQuantMetadata;
 use crate::decompress::execute_decompress;
 
 const MSE_CHILDREN: usize = 4;
-const QJL_CHILDREN: usize = 2;
+const QJL_CHILDREN: usize = 3;
 
 impl VTable for TurboQuant {
     type Array = TurboQuantArray;
@@ -86,6 +86,7 @@ impl VTable for TurboQuant {
         if let Some(qjl) = &array.qjl {
             qjl.signs.array_hash(state, precision);
             qjl.residual_norms.array_hash(state, precision);
+            qjl.rotation_signs.array_hash(state, precision);
         }
     }
 
@@ -104,6 +105,7 @@ impl VTable for TurboQuant {
                 (Some(a), Some(b)) => {
                     a.signs.array_eq(&b.signs, precision)
                         && a.residual_norms.array_eq(&b.residual_norms, precision)
+                        && a.rotation_signs.array_eq(&b.rotation_signs, precision)
                 }
                 (None, None) => true,
                 _ => false,
@@ -148,6 +150,12 @@ impl VTable for TurboQuant {
                 .vortex_expect("QJL child requested but has_qjl is false")
                 .residual_norms
                 .clone(),
+            6 => array
+                .qjl
+                .as_ref()
+                .vortex_expect("QJL child requested but has_qjl is false")
+                .rotation_signs
+                .clone(),
             _ => vortex_panic!("TurboQuantArray child index {idx} out of bounds"),
         }
     }
@@ -160,6 +168,7 @@ impl VTable for TurboQuant {
             3 => "rotation_signs".to_string(),
             4 => "qjl_signs".to_string(),
             5 => "qjl_residual_norms".to_string(),
+            6 => "qjl_rotation_signs".to_string(),
             _ => vortex_panic!("TurboQuantArray child_name index {idx} out of bounds"),
         }
     }
@@ -213,9 +222,11 @@ impl VTable for TurboQuant {
         let qjl = if metadata.has_qjl {
             let qjl_signs = children.get(4, &signs_dtype, len * padded_dim)?;
             let qjl_residual_norms = children.get(5, &norms_dtype, len)?;
+            let qjl_rotation_signs = children.get(6, &signs_dtype, 3 * padded_dim)?;
             Some(QjlCorrection {
                 signs: qjl_signs,
                 residual_norms: qjl_residual_norms,
+                rotation_signs: qjl_rotation_signs,
             })
         } else {
             None
@@ -253,6 +264,7 @@ impl VTable for TurboQuant {
         if let Some(qjl) = &mut array.qjl {
             qjl.signs = iter.next().vortex_expect("qjl_signs child");
             qjl.residual_norms = iter.next().vortex_expect("qjl_residual_norms child");
+            qjl.rotation_signs = iter.next().vortex_expect("qjl_rotation_signs child");
         }
         Ok(())
     }