Skip to content

Commit 87f217f

Browse files
committed
Refactor OnPair to FSST-shape: dict-as-blob, u16 codes child, Rust decode
Replaces the previous opaque-blob layout with one that mirrors how FSST splits its symbols-as-buffer / codes-as-child encoding, and shifts every read path off the C++ FFI. Layout ------ Buffer 0 dict_bytes — dictionary blob built by C++ training Slot 0 dict_offsets u32[] — len = dict_size + 1 Slot 1 codes u16[] — one token id per element, low `bits` bits populated (FastLanes-bit-packable) Slot 2 codes_offsets u32[] — per-row token offsets, len = n + 1 Slot 3 uncompressed_lengths — i32[], len = n Slot 4 validity — optional Bool child metadata = { bits: u32, uncompressed_lengths_ptype: i32 } Decode path ----------- At compress time we call OnPair's C++ trainer to produce the dictionary and bit-packed token stream, then immediately unpack the stream into u16 codes in Rust (`vortex_onpair_sys::unpack_codes_to_u16`) and drop the C++ column. After that, nothing on the read path touches C++: decode_row(r): for c in codes[codes_offsets[r] .. codes_offsets[r+1]]: out.extend_from_slice( dict_bytes[dict_offsets[c] .. dict_offsets[c+1]] ) `canonicalize`, `scalar_at`, and the compute kernels all share a `DecodeView` over the materialised children. Compute kernels (pure Rust, no C++ scan) ---------------------------------------- * compare (Eq / NotEq): streams dict slices per row, short-circuits on the first mismatch. * like ('lit', 'pre%', '%sub%'): same streaming approach for prefix; a full row decode + memmem for contains. * filter: canonical round-trip + recompress (unchanged). * slice: zero-copy — narrows codes_offsets / uncompressed_lengths / validity and shares the dict blob + codes child. * cast: identity rewrap, no payload touched. Tests ----- All 7 unit tests + the 100 000-row big_data smoke test pass. On the smoke corpus (release): compress 147 ms, full canonicalize 7.5 ms, equals / starts_with / contains pushdown counts match a brute-force reference exactly. Signed-off-by: Claude <noreply@anthropic.com>
1 parent 0fb5929 commit 87f217f

17 files changed

Lines changed: 806 additions & 381 deletions

File tree

encodings/onpair-sys/cxx/onpair_shim.cpp

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,11 @@
1414
#include <string_view>
1515
#include <vector>
1616

17+
using onpair::DECOMPRESS_BUFFER_PADDING;
18+
using onpair::DictionaryView;
1719
using onpair::OnPairColumn;
1820
using onpair::OnPairColumnView;
19-
using onpair::DECOMPRESS_BUFFER_PADDING;
21+
using onpair::StoreView;
2022
using onpair::encoding::DynamicThreshold;
2123
using onpair::encoding::TrainingConfig;
2224

@@ -351,4 +353,43 @@ size_t onpair_column_dict_bytes(const OnPairColumnHandle* handle) {
351353
}
352354
}
353355

356+
// Decompose a trained column into borrowed raw arrays (see onpair_shim.h).
// The returned pointers reference vectors owned by the column and remain
// valid until the handle is freed; nothing is copied here.
OnPairStatus onpair_column_parts(
    const OnPairColumnHandle* handle,
    OnPairColumnParts* out_parts) {
  if (handle == nullptr || out_parts == nullptr) {
    return ONPAIR_ERR_INVALID_ARG;
  }
  // NOTE(review): get_view() presumably requires a mutable handle (lazy
  // materialisation?) — the decomposition itself only reads. Confirm.
  auto* h = const_cast<ColumnHandle*>(reinterpret_cast<const ColumnHandle*>(handle));
  try {
    const auto& view = h->get_view();
    const DictionaryView& dict = view.dictionary();
    const StoreView& store = view.store();

    const size_t dict_size = dict.num_tokens();
    const uint32_t* offsets = dict.raw_offsets();
    // `offsets` has dict_size + 1 entries; the final entry is the total
    // (unpadded) byte length of the concatenated dictionary blob.
    const size_t dict_byte_len = dict_size == 0 ? 0 : offsets[dict_size];

    const size_t num_rows = store.num_strings();
    const uint32_t bits = static_cast<uint32_t>(store.bits());
    const size_t tokens = store.num_tokens();
    // The packed stream is laid out by BitWriter as a vector<uint64_t>:
    // (tokens * bits) bits, rounded up to whole 64-bit words.
    const size_t packed_words = (tokens * bits + 63) / 64;

    out_parts->dict_bytes = dict.raw_bytes();
    out_parts->dict_bytes_len = dict_byte_len;
    out_parts->dict_offsets = offsets;
    out_parts->dict_offsets_len = dict_size + 1;
    out_parts->codes_packed = store.packed_data();
    out_parts->codes_packed_u64_len = packed_words;
    out_parts->codes_boundaries = store.boundaries();
    out_parts->codes_boundaries_len = num_rows + 1;
    out_parts->bits = bits;
    out_parts->num_rows = num_rows;
    return ONPAIR_OK;
  } catch (...) {
    return ONPAIR_ERR_INTERNAL;
  }
}
394+
354395
} // extern "C"

encodings/onpair-sys/cxx/onpair_shim.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,29 @@ OnPairStatus onpair_column_dict_copy(
124124
// Bytes occupied by the dictionary (sum of entry lengths).
125125
size_t onpair_column_dict_bytes(const OnPairColumnHandle* handle);
126126

127+
// --- Decomposition into raw arrays (Vortex layout) ------------------------
128+
//
129+
// Borrows pointers to the column's underlying Dictionary + Store vectors.
130+
// The pointers remain valid until `handle` is freed; the caller is expected
131+
// to copy them out into Vortex buffers/children and then drop the column.
132+
133+
typedef struct OnPairColumnParts {
134+
const uint8_t* dict_bytes;
135+
size_t dict_bytes_len; // = dict_offsets[dict_size] (true, unpadded)
136+
const uint32_t* dict_offsets;
137+
size_t dict_offsets_len; // = dict_size + 1
138+
const uint64_t* codes_packed; // LSB-first bit-packed token stream
139+
size_t codes_packed_u64_len; // u64 word count
140+
const uint32_t* codes_boundaries; // per-row token index
141+
size_t codes_boundaries_len; // = num_rows + 1
142+
uint32_t bits; // 9..=16
143+
size_t num_rows;
144+
} OnPairColumnParts;
145+
146+
OnPairStatus onpair_column_parts(
147+
const OnPairColumnHandle* handle,
148+
OnPairColumnParts* out_parts);
149+
127150
#ifdef __cplusplus
128151
} // extern "C"
129152
#endif

encodings/onpair-sys/src/lib.rs

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,26 @@ pub mod ffi {
115115
bytes_capacity: usize,
116116
out_offsets: *mut u64,
117117
) -> u32;
118+
119+
pub fn onpair_column_parts(
120+
handle: *const OnPairColumnHandle,
121+
out_parts: *mut OnPairColumnParts,
122+
) -> u32;
123+
}
124+
125+
/// Raw decomposition of a trained column, as filled in by
/// `onpair_column_parts`. Mirrors `OnPairColumnParts` in `onpair_shim.h`
/// field-for-field (`#[repr(C)]`); every pointer borrows memory owned by
/// the column handle that produced it.
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct OnPairColumnParts {
    /// Concatenated dictionary entry bytes (unpadded).
    pub dict_bytes: *const u8,
    pub dict_bytes_len: usize,
    /// Entry offsets; length is `dict_size + 1`.
    pub dict_offsets: *const u32,
    pub dict_offsets_len: usize,
    /// LSB-first bit-packed token stream, as 64-bit words.
    pub codes_packed: *const u64,
    pub codes_packed_u64_len: usize,
    /// Per-row token boundaries; length is `num_rows + 1`.
    pub codes_boundaries: *const u32,
    pub codes_boundaries_len: usize,
    /// Bits per packed token (9..=16).
    pub bits: u32,
    pub num_rows: usize,
}
119139
}
120140

@@ -322,8 +342,109 @@ impl Column {
322342
}
323343
}
324344

345+
impl Column {
346+
/// Borrow the column's raw decomposition: dictionary, bit-packed token
347+
/// stream, and per-row boundaries. The returned pointers reference memory
348+
/// owned by `self` and remain valid for as long as the column does.
349+
pub fn parts(&self) -> Result<Parts<'_>, Error> {
350+
let mut raw = OnPairColumnParts {
351+
dict_bytes: std::ptr::null(),
352+
dict_bytes_len: 0,
353+
dict_offsets: std::ptr::null(),
354+
dict_offsets_len: 0,
355+
codes_packed: std::ptr::null(),
356+
codes_packed_u64_len: 0,
357+
codes_boundaries: std::ptr::null(),
358+
codes_boundaries_len: 0,
359+
bits: 0,
360+
num_rows: 0,
361+
};
362+
let status = unsafe { onpair_column_parts(self.handle.as_ptr(), &raw mut raw) };
363+
Error::check(status)?;
364+
// SAFETY: the C side returns pointers into vectors owned by `self`
365+
// (the underlying `OnPairColumn`); they remain valid for `&self`.
366+
Ok(unsafe { Parts::from_raw(raw) })
367+
}
368+
}
369+
325370
impl Drop for Column {
326371
fn drop(&mut self) {
327372
unsafe { onpair_column_free(self.handle.as_ptr()) }
328373
}
329374
}
375+
376+
/// Borrowed view over a column's raw arrays. See [`Column::parts`].
#[derive(Copy, Clone)]
pub struct Parts<'a> {
    /// Concatenated dictionary entry bytes (unpadded).
    pub dict_bytes: &'a [u8],
    /// Length `dict_size + 1`; entry `i` spans `dict_bytes[dict_offsets[i]..dict_offsets[i + 1]]`.
    pub dict_offsets: &'a [u32],
    /// LSB-first bit-packed token stream, `bits` bits per token.
    pub codes_packed: &'a [u64],
    /// Length `num_rows + 1`; row `r` spans tokens `codes_boundaries[r]..codes_boundaries[r + 1]`.
    pub codes_boundaries: &'a [u32],
    /// Bits per token (9..=16).
    pub bits: u32,
    /// Number of rows described by `codes_boundaries`.
    pub num_rows: usize,
}
391+
392+
impl<'a> Parts<'a> {
393+
/// # Safety
394+
/// Caller must guarantee the pointers in `raw` are valid for `'a`.
395+
unsafe fn from_raw(raw: OnPairColumnParts) -> Self {
396+
unsafe {
397+
Self {
398+
dict_bytes: slice_or_empty(raw.dict_bytes, raw.dict_bytes_len),
399+
dict_offsets: slice_or_empty(raw.dict_offsets, raw.dict_offsets_len),
400+
codes_packed: slice_or_empty(raw.codes_packed, raw.codes_packed_u64_len),
401+
codes_boundaries: slice_or_empty(raw.codes_boundaries, raw.codes_boundaries_len),
402+
bits: raw.bits,
403+
num_rows: raw.num_rows,
404+
}
405+
}
406+
}
407+
}
408+
409+
/// Build a slice from an FFI pointer/length pair, mapping a null pointer or
/// zero length to the empty slice.
///
/// # Safety
/// When `ptr` is non-null and `len > 0`, `ptr` must point to `len`
/// consecutive initialised `T`s that stay valid for `'a`.
#[inline]
unsafe fn slice_or_empty<'a, T>(ptr: *const T, len: usize) -> &'a [T] {
    match (ptr.is_null(), len) {
        (true, _) | (_, 0) => &[],
        // SAFETY: guaranteed by the caller's contract above.
        _ => unsafe { std::slice::from_raw_parts(ptr, len) },
    }
}
417+
418+
/// Read `bits` (1..=16) bits from `packed` starting at LSB-first bit
/// position `bit_pos`. Matches OnPair's `BitWriter` layout.
///
/// # Panics
/// Panics (via slice indexing) if the read extends past the end of `packed`.
#[inline]
pub fn read_bits_lsb(packed: &[u64], bit_pos: usize, bits: u32) -> u16 {
    debug_assert!((1..=16).contains(&bits));
    let word = bit_pos / 64;
    // `bit_pos % 64` is always in 0..64, so the cast cannot truncate.
    #[allow(clippy::cast_possible_truncation)]
    let shift = (bit_pos % 64) as u32;
    let mask: u64 = (1u64 << bits) - 1;
    let mut value = packed[word] >> shift;
    if shift + bits > 64 {
        // The field straddles a word boundary: splice in the high part.
        // Here `shift > 48` (since bits <= 16), so `64 - shift` is in 1..=63
        // and the shift below is well-defined.
        value |= packed[word + 1] << (64 - shift);
    }
    // `value & mask` occupies at most `bits` (<= 16) bits, so the cast is lossless.
    #[allow(clippy::cast_possible_truncation)]
    let out = (value & mask) as u16;
    out
}
440+
441+
/// Decompress an LSB-first bit-packed token stream into a flat `Vec<u16>`,
442+
/// one element per token. Each `u16` only uses its low `bits` bits.
443+
pub fn unpack_codes_to_u16(packed: &[u64], total_tokens: usize, bits: u32) -> Vec<u16> {
444+
assert!((9..=16).contains(&bits), "bits must be in [9, 16]");
445+
let mut out = Vec::with_capacity(total_tokens);
446+
for t in 0..total_tokens {
447+
out.push(read_bits_lsb(packed, t * bits as usize, bits));
448+
}
449+
out
450+
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
 �
1+


0 commit comments

Comments
 (0)