Skip to content

Commit 334d31e

Browse files
lwwmanning and claude committed
perf[turboquant]: restore fast SIMD-friendly decode by expanding stored signs
The bit-packed apply_inverse_srht_from_bits path introduced a ~20% decode throughput regression vs the original f32 sign multiply path, because per-element bit extraction + conditional negate is hard for the compiler to autovectorize.

Fix: expand the stored BoolArray signs into f32 ±1.0 vectors once at decode start via RotationMatrix::from_bool_array(), then use the original inverse_rotate() with its SIMD-friendly apply_signs() inner loop. The expansion costs 3 × padded_dim × 4 bytes of temporary memory (12KB for dim=1024), amortized over all rows.

We still store signs as 1-bit BoolArray on disk (32x space savings), but recover full autovectorized throughput at decode time.

The apply_inverse_srht_from_bits function is retained (with tests) for potential future use with explicit SIMD bit-extraction intrinsics.

Signed-off-by: Will Manning <will@spiraldb.com>
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Will Manning <will@willmanning.io>
1 parent 2d84cbf commit 334d31e

1 file changed

Lines changed: 8 additions & 14 deletions

File tree

encodings/turboquant/src/decompress.rs

Lines changed: 8 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,6 @@ use vortex_error::VortexResult;
1616
use crate::mse::array::TurboQuantMSEArray;
1717
use crate::qjl::array::TurboQuantQJLArray;
1818
use crate::rotation::RotationMatrix;
19-
use crate::rotation::apply_inverse_srht_from_bits;
2019

2120
/// Decompress a `TurboQuantMSEArray` into a `FixedSizeListArray` of floats.
2221
///
@@ -45,11 +44,11 @@ pub fn execute_decompress_mse(
4544
let centroids_prim = array.centroids.clone().execute::<PrimitiveArray>(ctx)?;
4645
let centroids = centroids_prim.as_slice::<f32>();
4746

48-
// Read stored rotation signs — no recomputation.
47+
// Expand stored rotation signs into f32 ±1.0 vectors once (amortized over all rows).
48+
// This costs 3 × padded_dim × 4 bytes of temporary memory (e.g. 12KB for dim=1024)
49+
// but enables autovectorized f32 multiply in the per-row SRHT hot loop.
4950
let signs_bool = array.rotation_signs.clone().execute::<BoolArray>(ctx)?;
50-
let bit_buf = signs_bool.to_bit_buffer();
51-
let (_, _, raw_signs) = bit_buf.into_inner();
52-
let norm_factor = 1.0 / (padded_dim as f32 * (padded_dim as f32).sqrt());
51+
let rotation = RotationMatrix::from_bool_array(&signs_bool, dim)?;
5352

5453
// Unpack codes.
5554
let codes_prim = array.codes.clone().execute::<PrimitiveArray>(ctx)?;
@@ -60,6 +59,7 @@ pub fn execute_decompress_mse(
6059

6160
let mut output = BufferMut::<f32>::with_capacity(num_rows * dim);
6261
let mut dequantized = vec![0.0f32; padded_dim];
62+
let mut unrotated = vec![0.0f32; padded_dim];
6363

6464
for row in 0..num_rows {
6565
let row_indices = &indices[row * padded_dim..(row + 1) * padded_dim];
@@ -69,19 +69,13 @@ pub fn execute_decompress_mse(
6969
dequantized[idx] = centroids[row_indices[idx] as usize];
7070
}
7171

72-
// Inverse rotate using stored sign bits (hot path).
73-
apply_inverse_srht_from_bits(
74-
&mut dequantized,
75-
raw_signs.as_ref(),
76-
padded_dim,
77-
norm_factor,
78-
);
72+
rotation.inverse_rotate(&dequantized, &mut unrotated);
7973

8074
for idx in 0..dim {
81-
dequantized[idx] *= norm;
75+
unrotated[idx] *= norm;
8276
}
8377

84-
output.extend_from_slice(&dequantized[..dim]);
78+
output.extend_from_slice(&unrotated[..dim]);
8579
}
8680

8781
let elements = PrimitiveArray::new::<f32>(output.freeze(), Validity::NonNullable);

0 commit comments

Comments
 (0)