perf(alp): avoid per-combination allocations when searching exponents (#8565)

miniex · web-flow · commit 4457a4a2330b · 2026-06-24T14:39:39.000+01:00
## Summary `find_best_exponents` ran a full `encode` for every `(e, f)` candidate just to size the result, allocating the encoded, patch and chunk-offset buffers across the whole `e`/`f` search space. It now estimates the size in a single allocation-free pass via `estimate_encoded_size_for_exponents`, which reproduces the existing `estimate_encoded_size` exactly, so the chosen exponents (and therefore the compressed output) are bit-for-bit unchanged. This is the throughput half of #919; the two-level stratified sampling / top-N reuse the issue describes would change which exponents are picked and depends on per-column scheme state (#8434), so I left it out here. ## Benchmark `encodings/alp/benches/alp_compress.rs`, compress medians (type, length, patch-fraction, valid-fraction): | case | before | after | speedup | | --- | --- | --- | --- | | f64 (1000, 0.0, 1.0) | 39.1 us | 14.0 us | 2.8x | | f64 (1000, 0.1, 0.25) | 45.0 us | 16.5 us | 2.7x | | f32 (1000, 0.0, 1.0) | 14.6 us | 5.8 us | 2.5x | | f64 (10000, 0.0, 1.0) | 53.1 us | 28.6 us | 1.9x | | f64 (10000, 0.1, 0.25) | 73.7 us | 49.4 us | 1.5x | ## Testing `cargo nextest run -p vortex-alp -p vortex-btrblocks` passes, including a new equivalence test (`estimate_for_exponents_matches_full_encode`) that asserts the allocation-free estimate equals a full `encode` + `estimate_encoded_size` for every candidate exponent across kept, patched and all-patched inputs. `fmt` + `clippy --all-targets --all-features` + `cargo doc` clean. --- I'm Korean, so sorry if any wording reads a little awkward. --------- Signed-off-by: Han Damin <miniex@daminstudio.net>
diff --git a/encodings/alp/src/alp/mod.rs b/encodings/alp/src/alp/mod.rs
@@ -30,8 +30,49 @@ mod tests {
     use vortex_array::patches::PatchesMetadata;
     use vortex_array::test_harness::check_metadata;
 
+    use crate::alp::ALPFloat;
+    use crate::alp::Exponents;
     use crate::alp::array::ALPMetadata;
 
+    // The allocation-free estimate must match a full encode + estimate for every candidate
+    // exponent pair, so `find_best_exponents` picks the same exponents and compression is unchanged.
+    fn check_estimate_matches<T: ALPFloat>(values: &[T]) {
+        for e in 0..T::MAX_EXPONENT {
+            for f in 0..e {
+                let exp = Exponents { e, f };
+                let lightweight = T::estimate_encoded_size_for_exponents(values, exp);
+                let (_, encoded, _, patches, _) = T::encode(values, Some(exp));
+                let full = T::estimate_encoded_size(&encoded, &patches);
+                assert_eq!(
+                    lightweight,
+                    full,
+                    "mismatch at e={e}, f={f}, len={}",
+                    values.len()
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn estimate_for_exponents_matches_full_encode() {
+        // Clean 2-decimal values (mostly kept), repeating decimals (many patches), large
+        // magnitudes, constants, and a single element.
+        let mut f64s: Vec<f64> = (0..200).map(|i| i as f64 / 100.0).collect();
+        f64s.extend((0..60).map(|i| i as f64 / 7.0));
+        f64s.extend([1e17, -1e17, 0.0, 123.0]);
+        check_estimate_matches(&f64s);
+        check_estimate_matches::<f64>(&[123.456; 5]);
+        check_estimate_matches::<f64>(&[42.0]);
+        // Every value patches at every exponent -> exercises the all-patched branch.
+        check_estimate_matches::<f64>(&[1.0 / 3.0; 8]);
+
+        let mut f32s: Vec<f32> = (0..200).map(|i| i as f32 / 100.0).collect();
+        f32s.extend((0..60).map(|i| i as f32 / 7.0));
+        f32s.extend([1e9, -1e9, 0.0, 123.0]);
+        check_estimate_matches(&f32s);
+        check_estimate_matches::<f32>(&[1.0 / 3.0; 8]);
+    }
+
     #[cfg_attr(miri, ignore)]
     #[test]
     fn test_alp_metadata() {
@@ -88,6 +129,13 @@ mod private {
     impl Sealed for f64 {}
 }
 
+/// Widen a running `(min, max)` bound to include `value`, seeding it on the first value.
+fn update_bounds<I: Ord + Copy>(bounds: &mut Option<(I, I)>, value: I) {
+    *bounds = Some(bounds.map_or((value, value), |(min, max)| {
+        (min.min(value), max.max(value))
+    }));
+}
+
 pub trait ALPFloat: private::Sealed + Float + Display + NativePType {
     type ALPInt: PrimInt + Display + ToPrimitive + Copy + NativePType + Into<PValue>;
 
@@ -120,27 +168,69 @@ pub trait ALPFloat: private::Sealed + Float + Display + NativePType {
                 .cloned()
                 .collect_vec()
         });
+        let sample = sample.as_deref().unwrap_or(values);
 
         for e in (0..Self::MAX_EXPONENT).rev() {
             for f in 0..e {
-                let (_, encoded, _, exc_patches, _) = Self::encode(
-                    sample.as_deref().unwrap_or(values),
-                    Some(Exponents { e, f }),
-                );
-
-                let size = Self::estimate_encoded_size(&encoded, &exc_patches);
+                let exp = Exponents { e, f };
+                let size = Self::estimate_encoded_size_for_exponents(sample, exp);
                 if size < best_nbytes {
                     best_nbytes = size;
-                    best_exp = Exponents { e, f };
+                    best_exp = exp;
                 } else if size == best_nbytes && e - f < best_exp.e - best_exp.f {
-                    best_exp = Exponents { e, f };
+                    best_exp = exp;
                 }
             }
         }
 
         best_exp
     }
 
+    /// Size estimate for `values` under `exponents` matching a full [`Self::encode`] plus
+    /// [`Self::estimate_encoded_size`], but without the per-candidate allocations.
+    fn estimate_encoded_size_for_exponents(values: &[Self], exponents: Exponents) -> usize {
+        // `kept` is the (min, max) over values that round-trip exactly (kept inline by `encode`);
+        // `all` is the (min, max) over every encoded value. `encode` fills patched slots in-range,
+        // so its emitted range is `kept`, except with all values patched (no fill) where `all` wins.
+        let mut kept: Option<(Self::ALPInt, Self::ALPInt)> = None;
+        let mut all: Option<(Self::ALPInt, Self::ALPInt)> = None;
+        let mut patch_count = 0usize;
+
+        for &value in values {
+            let encoded = Self::encode_single_unchecked(value, exponents);
+            update_bounds(&mut all, encoded);
+            if Self::decode_single(encoded, exponents).is_eq(value) {
+                update_bounds(&mut kept, encoded);
+            } else {
+                patch_count += 1;
+            }
+        }
+
+        let range = if patch_count == values.len() {
+            all
+        } else {
+            kept
+        };
+
+        let bits_per_encoded = range
+            .and_then(|(min, max)| max.checked_sub(&min))
+            .and_then(|range_size| range_size.to_u64())
+            .and_then(|range_size| {
+                range_size
+                    .checked_ilog2()
+                    .map(|bits| (bits + 1) as usize)
+                    .or(Some(0))
+            })
+            .unwrap_or(size_of::<Self::ALPInt>() * 8);
+
+        let encoded_bytes = (values.len() * bits_per_encoded).div_ceil(8);
+        // each patch is a value + a position
+        // in practice, patch positions are in [0, u16::MAX] because of how we chunk
+        let patch_bytes = patch_count * (size_of::<Self>() + size_of::<u16>());
+
+        encoded_bytes + patch_bytes
+    }
+
     #[inline]
     fn estimate_encoded_size(encoded: &[Self::ALPInt], patches: &[Self]) -> usize {
         let bits_per_encoded = encoded