Skip to content

Commit a51c8e9

Browse files
committed
encodings/onpair-rs: pure-Rust port of OnPair training + encoding
New crate `onpair-lib` at `encodings/onpair-rs/` that mirrors the subset of `vortex-onpair-sys` actually consumed by `vortex-onpair`: BPE-style dictionary training plus LSB-first bit-packed token encoding, exposed via `Column::compress` and `Column::parts` with the same shape as the FFI crate. Decode, LIKE, and EQ remain in `vortex-onpair` (already pure Rust) and read the same `(dict_bytes, dict_offsets, codes_packed, codes_boundaries, bits)` layout. Modules ported from `gargiulofrancesco/onpair_cpp`: * types, dict, store, bit_writer, bit_unpack * lpm (flat HashMap keyed by (u128, u8); behaviourally equivalent replacement for the C++ short/long bucket split) * trainer (BPE pair-discovery + DynamicThresholdController + sort) * parser, column Tests: * 162 unit tests ported from the C++ GoogleTest suite (types, dictionary, store, bit_writer, lpm, trainer, parser, column round-trip across all 8 bit widths). * 8 cross-impl tests in `tests/cross_impl.rs` against `vortex-onpair-sys`: structural parity, decompression equivalence, eq / starts_with / contains predicate equivalence on a shared decode loop, and dictionary invariants (covers all 256 bytes, lex-sorted). Known divergence from C++: bit-exact dictionary equality is not asserted because the two implementations use different RNGs (`std::mt19937_64` vs Rust's `StdRng`). Every observable downstream operation matches. Signed-off-by: Claude <noreply@anthropic.com>
1 parent a1ba67f commit a51c8e9

16 files changed

Lines changed: 3660 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ members = [
4949
"encodings/datetime-parts",
5050
"encodings/fsst",
5151
"encodings/onpair",
52+
"encodings/onpair-rs",
5253
"encodings/onpair-sys",
5354
"encodings/pco",
5455
"encodings/sparse",

encodings/onpair-rs/Cargo.toml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
[package]
2+
name = "onpair-lib"
3+
description = "Pure-Rust port of the OnPair short-string compression library"
4+
authors = { workspace = true }
5+
categories = { workspace = true }
6+
edition = { workspace = true }
7+
homepage = { workspace = true }
8+
include = { workspace = true }
9+
keywords = { workspace = true }
10+
license = { workspace = true }
11+
readme = "README.md"
12+
repository = { workspace = true }
13+
rust-version = { workspace = true }
14+
version = { workspace = true }
15+
16+
[lints]
17+
workspace = true
18+
19+
[dependencies]
20+
hashbrown = { workspace = true }
21+
rand = { workspace = true }
22+
23+
[dev-dependencies]
24+
rstest = { workspace = true }
25+
vortex-onpair-sys = { workspace = true }
26+
27+
[[test]]
28+
name = "cross_impl"
29+
path = "tests/cross_impl.rs"

encodings/onpair-rs/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# onpair-lib
2+
3+
Pure-Rust port of the training + encoding parts of
4+
[`onpair_cpp`](https://github.com/gargiulofrancesco/onpair_cpp).
5+
6+
Scope is limited to what `vortex-onpair` actually consumes from
7+
`vortex-onpair-sys`: `Column::compress` (BPE-style dictionary training plus
8+
LSB-first bit-packed token encoding) and raw access to the resulting parts
9+
(dictionary bytes/offsets, packed token stream, per-row boundaries). Decode,
10+
LIKE, and EQ predicates are already pure Rust in `vortex-onpair` and reuse the
11+
same `parts()` layout.
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
//
4+
// Pure-Rust reader for the LSB-first bit-packed token stream produced by
5+
// `BitWriter`. The implementation is identical to `vortex-onpair-sys`'s
6+
// helper of the same name; we keep a local copy so this crate doesn't depend
7+
// on the C++ FFI crate.
8+
9+
/// Extract one `bits`-wide field (1..=16 bits) from the LSB-first bit stream
/// `packed`, starting at absolute bit position `bit_pos`.
///
/// Bit `bit_pos % 64` of word `bit_pos / 64` is the least-significant bit of
/// the result, which is the layout OnPair's `BitWriter` produces. A field that
/// does not fit in the remainder of its word continues in the low bits of the
/// following word.
///
/// # Panics
///
/// Panics if `bit_pos / 64` is not a valid index into `packed`, or if the
/// field crosses a word boundary and the following word does not exist.
/// Debug builds additionally assert that `bits` is in `1..=16`.
#[inline]
pub fn read_bits_lsb(packed: &[u64], bit_pos: usize, bits: u32) -> u16 {
    debug_assert!((1..=16).contains(&bits));
    let (word, shift) = (bit_pos / 64, (bit_pos % 64) as u32);
    let keep: u64 = (1u64 << bits) - 1;

    // Start with whatever part of the field lives in the first word.
    let mut value = packed[word] >> shift;
    if shift + bits > 64 {
        // The field straddles a word boundary: the next word supplies the
        // remaining high bits, placed just above the ones already read.
        value |= packed[word + 1] << (64 - shift);
    }

    // The mask clears everything above the field, so the narrowing is lossless.
    (value & keep) as u16
}
26+
27+
/// Decompress an LSB-first bit-packed token stream into a flat `Vec<u16>`,
28+
/// one element per token. Each `u16` only uses its low `bits` bits.
29+
pub fn unpack_codes_to_u16(packed: &[u64], total_tokens: usize, bits: u32) -> Vec<u16> {
30+
assert!((9..=16).contains(&bits), "bits must be in [9, 16]");
31+
let mut out = Vec::with_capacity(total_tokens);
32+
for t in 0..total_tokens {
33+
out.push(read_bits_lsb(packed, t * bits as usize, bits));
34+
}
35+
out
36+
}
37+
38+
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn unpack_roundtrips_simple_pattern() {
        // Three 12-bit tokens packed LSB-first into one u64.
        let bits = 12u32;
        let a = 0xABC_u64;
        let b = 0xDEF_u64;
        let c = 0x123_u64;
        // word0 layout: a in bits 0..12, b in 12..24, c in 24..36.
        let word = a | (b << 12) | (c << 24);
        let packed = vec![word, 0];
        assert_eq!(read_bits_lsb(&packed, 0, bits), 0xABC);
        assert_eq!(read_bits_lsb(&packed, 12, bits), 0xDEF);
        assert_eq!(read_bits_lsb(&packed, 24, bits), 0x123);

        let unpacked = unpack_codes_to_u16(&packed, 3, bits);
        assert_eq!(unpacked, vec![0xABC, 0xDEF, 0x123]);
    }

    #[test]
    fn unpack_handles_token_straddling_word_boundary() {
        // Six 12-bit tokens need 72 bits, so the sixth occupies bits 60..72:
        // its low nibble ends word 0 and its high byte starts word 1. This
        // exercises the two-word branch of `read_bits_lsb`.
        let bits = 12u32;
        let last = 0xFED_u64;
        let word0 = 0xABC_u64
            | (0xDEF_u64 << 12)
            | (0x123_u64 << 24)
            | (0x456_u64 << 36)
            | (0x789_u64 << 48)
            | ((last & 0xF) << 60);
        let word1 = last >> 4;
        let packed = vec![word0, word1];

        assert_eq!(read_bits_lsb(&packed, 48, bits), 0x789);
        assert_eq!(read_bits_lsb(&packed, 60, bits), 0xFED);

        let unpacked = unpack_codes_to_u16(&packed, 6, bits);
        assert_eq!(unpacked, vec![0xABC, 0xDEF, 0x123, 0x456, 0x789, 0xFED]);
    }
}

0 commit comments

Comments
 (0)