Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
54bc343
Add alp to the thrift definitions
sdf-jkl Feb 6, 2026
622841b
Add alp to the encoding enum
sdf-jkl Feb 6, 2026
bf754b2
Add alp header and page layout structs
sdf-jkl Feb 12, 2026
8337fcd
Add Interleaved Vectors​ support
sdf-jkl Feb 15, 2026
5349a9e
Merge branch 'main' into alp
sdf-jkl Feb 15, 2026
148b1d5
Add bit unpacking, reverse FOR and exceptions patching
sdf-jkl Feb 18, 2026
5f0222e
Decode page works
sdf-jkl Feb 18, 2026
027172f
Add more checks and tests
sdf-jkl Feb 19, 2026
8ee1b7e
Make decode page lazy + some tests
sdf-jkl Feb 20, 2026
18f885e
Find vectors using offsets at decode time, not parse time
sdf-jkl Feb 20, 2026
a9fa0c3
get decode fast path
sdf-jkl Feb 20, 2026
7754ebc
Simplify decode_page_values
sdf-jkl Feb 20, 2026
6b7e923
remove `version` from page header
sdf-jkl Mar 12, 2026
a13f244
change `log_vetctor_size` range consts
sdf-jkl Mar 12, 2026
6e86d6f
redo the integration test using a test file from the c++ pr
sdf-jkl Mar 12, 2026
ac56cc2
cache parsed vectors
sdf-jkl Mar 12, 2026
07ca6f0
use `Bytes` for `packed_values`
sdf-jkl Mar 12, 2026
def5c09
clippy too many args
sdf-jkl Mar 12, 2026
5e5c8d2
cargo fmt
sdf-jkl Mar 12, 2026
6060ca2
Merge branch 'main' of https://github.com/apache/arrow-rs into alp
sdf-jkl Mar 16, 2026
5ed70ae
fix decode to two-step multiplication
sdf-jkl Mar 16, 2026
e328d7a
make vector byte count more strict
sdf-jkl Mar 16, 2026
1807d32
add stricter offset validation
sdf-jkl Mar 16, 2026
11c7905
fmt
sdf-jkl Mar 16, 2026
e9827e5
Merge tag '58.2.0' into alp
alamb May 8, 2026
cacacf4
Fix up compilation
alamb May 8, 2026
578481c
Refactor: Pull common header structures into common module
alamb May 8, 2026
131851a
Merge pull request #3 from alamb/alamb/move_common_structures
sdf-jkl May 9, 2026
f10279c
fmt
sdf-jkl May 9, 2026
fbf4ad2
small fix
sdf-jkl May 9, 2026
f65ff9a
Align structure names with spec
alamb May 9, 2026
c912f4e
Merge pull request #5 from alamb/alamb/alp_names
sdf-jkl May 9, 2026
26c382c
fmt
sdf-jkl May 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions parquet/src/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,10 @@ enum Encoding {
/// afterwards. Note that the use of this encoding with FIXED_LEN_BYTE_ARRAY(N) data may
/// perform poorly for large values of N.
BYTE_STREAM_SPLIT = 9;
/// Adaptive Lossless floating-Point encoding (ALP).
///
/// Currently specified for FLOAT and DOUBLE.
ALP = 10;
}
);

Expand All @@ -654,6 +658,7 @@ impl FromStr for Encoding {
"DELTA_BYTE_ARRAY" | "delta_byte_array" => Ok(Encoding::DELTA_BYTE_ARRAY),
"RLE_DICTIONARY" | "rle_dictionary" => Ok(Encoding::RLE_DICTIONARY),
"BYTE_STREAM_SPLIT" | "byte_stream_split" => Ok(Encoding::BYTE_STREAM_SPLIT),
"ALP" | "alp" => Ok(Encoding::ALP),
_ => Err(general_err!("unknown encoding: {}", s)),
}
}
Expand Down Expand Up @@ -791,6 +796,7 @@ fn i32_to_encoding(val: i32) -> Encoding {
7 => Encoding::DELTA_BYTE_ARRAY,
8 => Encoding::RLE_DICTIONARY,
9 => Encoding::BYTE_STREAM_SPLIT,
10 => Encoding::ALP,
_ => panic!("Impossible encoding {val}"),
}
}
Expand Down Expand Up @@ -2137,6 +2143,7 @@ mod tests {
);
assert_eq!(Encoding::DELTA_BYTE_ARRAY.to_string(), "DELTA_BYTE_ARRAY");
assert_eq!(Encoding::RLE_DICTIONARY.to_string(), "RLE_DICTIONARY");
assert_eq!(Encoding::ALP.to_string(), "ALP");
}

#[test]
Expand Down Expand Up @@ -2438,6 +2445,8 @@ mod tests {
assert_eq!(encoding, Encoding::RLE_DICTIONARY);
encoding = "BYTE_STREAM_SPLIT".parse().unwrap();
assert_eq!(encoding, Encoding::BYTE_STREAM_SPLIT);
encoding = "alp".parse().unwrap();
assert_eq!(encoding, Encoding::ALP);

// test lowercase
encoding = "byte_stream_split".parse().unwrap();
Expand Down Expand Up @@ -2573,6 +2582,7 @@ mod tests {
Encoding::PLAIN_DICTIONARY,
Encoding::RLE_DICTIONARY,
Encoding::BYTE_STREAM_SPLIT,
Encoding::ALP,
];
encodings_roundtrip(encodings.into());
}
Expand Down
336 changes: 336 additions & 0 deletions parquet/src/encodings/alp.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,336 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! ALP (Adaptive Lossless floating-Point) Encoding
//!
//! Based on the draft Parquet spec: <https://github.com/apache/parquet-format/pull/557>

use crate::util::bit_util::{FromBitpacked, FromBytes};

pub(crate) const ALP_HEADER_SIZE: usize = 7;
pub(crate) const ALP_COMPRESSION_MODE: u8 = 0;
pub(crate) const ALP_INTEGER_ENCODING_FOR_BIT_PACK: u8 = 0;
pub(crate) const ALP_MIN_LOG_VECTOR_SIZE: u8 = 3;
pub(crate) const ALP_MAX_LOG_VECTOR_SIZE: u8 = 15;
pub(crate) const ALP_MAX_EXPONENT_F32: u8 = 10;
pub(crate) const ALP_MAX_EXPONENT_F64: u8 = 18;

/// Page-level ALP header (7 bytes).
///
/// ```text
/// Byte: 0 1 2 3 4 5 6
/// +----------------+---------------+--------------+----+----+----+----+
/// | compression | integer | log_vector | num_elements |
/// | _mode | _encoding | _size | (int32 LE) |
/// +----------------+---------------+--------------+----+----+----+----+
/// ```
///
/// Layout in bytes:
/// - `[0]` `compression_mode`
/// - `[1]` `integer_encoding`
/// - `[2]` `log_vector_size`
/// - `[3..7]` `num_elements` (little-endian `i32`)
#[derive(Debug, Clone, Copy)]
pub(crate) struct AlpHeader {
pub(crate) compression_mode: u8,
pub(crate) integer_encoding: u8,
pub(crate) log_vector_size: u8,
pub(crate) num_elements: i32,
}

impl AlpHeader {
pub(crate) fn num_elements_usize(&self) -> usize {
self.num_elements as usize
}

fn vector_size(&self) -> usize {
1usize << self.log_vector_size
}

pub(crate) fn num_vectors(&self) -> usize {
if self.num_elements == 0 {
0
} else {
self.num_elements_usize().div_ceil(self.vector_size())
}
}

pub(crate) fn vector_num_elements(&self, vector_index: usize) -> u16 {
let vector_size = self.vector_size();
let num_full_vectors = self.num_elements_usize() / vector_size;
let remainder = self.num_elements_usize() % vector_size;
if vector_index < num_full_vectors {
vector_size as u16
} else if vector_index == num_full_vectors && remainder > 0 {
remainder as u16
} else {
0
}
}
}

/// Per-vector ALP metadata (4 bytes), equivalent to C++ `AlpEncodedVectorInfo`.
///
/// ##### AlpInfo (4 bytes, both types)
///
/// ```text
/// Byte: 0 1 2 3
/// +----------+----------+---------+---------+
/// | exponent | factor | num_exceptions |
/// | (uint8) | (uint8) | (uint16 LE) |
/// +----------+----------+---------+---------+
/// ```
#[derive(Debug, Clone, Copy)]
pub(crate) struct AlpInfo {
pub(crate) exponent: u8,
pub(crate) factor: u8,
pub(crate) num_exceptions: u16,
}

impl AlpInfo {
pub(crate) const STORED_SIZE: usize = 4;
}

/// Per-vector FOR metadata for exact integer type (`u32` for `f32`, `u64` for `f64`).
///
/// Frame of reference (FOR) encoding
///
/// ###### ForInfo for FLOAT (5 bytes) / DOUBLE (9 bytes)
///
/// ```text
/// Byte: 0 1 2 3 4
/// +----+----+----+----+-----------+
/// | frame_of_reference | bit_width |
/// | (int32 LE) | (uint8) |
/// +----+----+----+----+-----------+
/// ```
#[derive(Debug, Clone, Copy)]
pub(crate) struct ForInfo<Exact: AlpExact> {
pub(crate) frame_of_reference: Exact,
pub(crate) bit_width: u8,
}

impl<Exact: AlpExact> ForInfo<Exact> {
pub(crate) fn stored_size() -> usize {
Exact::WIDTH + 1
}

pub(crate) fn get_bit_packed_size(&self, num_elements: u16) -> usize {
(self.bit_width as usize * num_elements as usize).div_ceil(8)
}

pub(crate) fn get_data_stored_size(&self, num_elements: u16, num_exceptions: u16) -> usize {
let bit_packed_size = self.get_bit_packed_size(num_elements);
bit_packed_size
+ num_exceptions as usize * std::mem::size_of::<u16>()
+ num_exceptions as usize * Exact::WIDTH
}
}

/// Exact integer type used by FOR reconstruction.
///
/// This mirrors C++:
/// - `float` -> `uint32_t`
/// - `double` -> `uint64_t`
///
/// Why unsigned (not `i32`/`i64`)?
/// - FOR stores non-negative deltas optimized for bitpacking.
/// - Unsigned arithmetic avoids signed-overflow edge cases in FOR stage.
/// - Signed interpretation is applied later during decimal reconstruction.
pub(crate) trait AlpExact: Copy + std::fmt::Debug + FromBitpacked {
const WIDTH: usize;
type Signed: Copy;
fn from_le_slice(slice: &[u8]) -> Self;
fn zero() -> Self;
fn wrapping_add(self, rhs: Self) -> Self;
fn reinterpret_as_signed(self) -> Self::Signed;
}

impl AlpExact for u32 {
const WIDTH: usize = 4;
type Signed = i32;

fn from_le_slice(slice: &[u8]) -> Self {
u32::from_le_bytes([slice[0], slice[1], slice[2], slice[3]])
}

fn zero() -> Self {
0
}

fn wrapping_add(self, rhs: Self) -> Self {
self.wrapping_add(rhs)
}

fn reinterpret_as_signed(self) -> Self::Signed {
i32::from_ne_bytes(self.to_ne_bytes())
}
}

impl AlpExact for u64 {
const WIDTH: usize = 8;
type Signed = i64;

fn from_le_slice(slice: &[u8]) -> Self {
u64::from_le_bytes([
slice[0], slice[1], slice[2], slice[3], slice[4], slice[5], slice[6], slice[7],
])
}

fn zero() -> Self {
0
}

fn wrapping_add(self, rhs: Self) -> Self {
self.wrapping_add(rhs)
}

fn reinterpret_as_signed(self) -> Self::Signed {
i64::from_ne_bytes(self.to_ne_bytes())
}
}
pub(crate) const ALP_POW10_F32: [f32; 11] = [
1.0,
10.0,
100.0,
1000.0,
10000.0,
100000.0,
1000000.0,
10000000.0,
100000000.0,
1000000000.0,
10000000000.0,
];

pub(crate) const ALP_POW10_F64: [f64; 19] = [
1.0,
10.0,
100.0,
1000.0,
10000.0,
100000.0,
1000000.0,
10000000.0,
100000000.0,
1000000000.0,
10000000000.0,
100000000000.0,
1000000000000.0,
10000000000000.0,
100000000000000.0,
1000000000000000.0,
10000000000000000.0,
100000000000000000.0,
1000000000000000000.0,
];

pub(crate) const ALP_NEG_POW10_F32: [f32; 11] = [
1.0,
0.1,
0.01,
0.001,
0.0001,
0.00001,
0.000001,
0.0000001,
0.00000001,
0.000000001,
0.0000000001,
];

pub(crate) const ALP_NEG_POW10_F64: [f64; 19] = [
1.0,
0.1,
0.01,
0.001,
0.0001,
0.00001,
0.000001,
0.0000001,
0.00000001,
0.000000001,
0.0000000001,
0.00000000001,
0.000000000001,
0.0000000000001,
0.00000000000001,
0.000000000000001,
0.0000000000000001,
0.00000000000000001,
0.000000000000000001,
];

pub(crate) trait AlpFloat: Copy + Default {
type Exact: AlpExact + FromBytes;
type Scale: Copy;

/// Precompute vector-level ALP decimal scale constants for:
/// `value = (encoded * 10^(factor)) * 10^(-exponent)`.
///
/// Preconditions are validated during page parse.
fn decode_scale(exponent: u8, factor: u8) -> Self::Scale;

/// Decode one signed exact integer using a precomputed two-step scale.
fn decode_value(signed_encoded: <Self::Exact as AlpExact>::Signed, scale: Self::Scale) -> Self;

fn from_exact_bits(bits: Self::Exact) -> Self;
}

impl AlpFloat for f32 {
type Exact = u32;
type Scale = (f32, f32);

fn decode_scale(exponent: u8, factor: u8) -> Self::Scale {
debug_assert!(exponent <= ALP_MAX_EXPONENT_F32);
debug_assert!(factor <= exponent);
(
ALP_POW10_F32[factor as usize],
ALP_NEG_POW10_F32[exponent as usize],
)
}

fn decode_value(signed_encoded: i32, scale: Self::Scale) -> Self {
((signed_encoded as f32) * scale.0) * scale.1
}

fn from_exact_bits(bits: Self::Exact) -> Self {
f32::from_bits(bits)
}
}

impl AlpFloat for f64 {
type Exact = u64;
type Scale = (f64, f64);

fn decode_scale(exponent: u8, factor: u8) -> Self::Scale {
debug_assert!(exponent <= ALP_MAX_EXPONENT_F64);
debug_assert!(factor <= exponent);
(
ALP_POW10_F64[factor as usize],
ALP_NEG_POW10_F64[exponent as usize],
)
}

fn decode_value(signed_encoded: i64, scale: Self::Scale) -> Self {
((signed_encoded as f64) * scale.0) * scale.1
}

fn from_exact_bits(bits: Self::Exact) -> Self {
f64::from_bits(bits)
}
}
Loading
Loading