Skip to content

Commit 803bc4e

Browse files
committed
Make OnPair the default string-fragmentation scheme + register globally
Two changes that together stop FSST from being the default and make OnPair work end-to-end through the file writer + reader. vortex-btrblocks * Remove `FSSTScheme` from `ALL_SCHEMES`. The struct and `Scheme` impl stay in place so callers can opt back in via `BtrBlocksCompressorBuilder::with_new_scheme(&FSSTScheme)`; it just isn't in the default cascade anymore. OnPair fills the string-fragmentation slot. * Tighten `only_cuda_compatible` to exclude OnPair (heavier toolchain dep) instead of FSST. * Tests: drop the FSST-vs-OnPair tie-break test; add `test_onpair_compressed` (FSST-style corpus → OnPair) and `test_fsst_opt_in_still_works` (empty builder + with_new_scheme + FSSTScheme). vortex-file * New `onpair` Cargo feature (default-on, mirrors `vortex-btrblocks`'s) that pulls in `vortex-onpair` and registers `OnPair` in both `register_default_encodings` and `ALLOWED_ENCODINGS`. Without this the normalizer rejects vortex.onpair with "normalize forbids encoding (vortex.onpair)" when round-tripping a file. Consumers without a C++ toolchain can `default-features = false`. CI / reproducibility * Pin `onpair_cpp` to a full commit SHA in `cmake/onpair_pin.cmake` (was tracking `main`). CI's `FetchContent` step is now reproducible and won't break when upstream's main branch moves. Tests: 109 across vortex-onpair, vortex-btrblocks, vortex-file; all green. Clippy clean. Signed-off-by: Claude <noreply@anthropic.com>
1 parent 70947a8 commit 803bc4e

7 files changed

Lines changed: 80 additions & 23 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Pin of gargiulofrancesco/onpair_cpp consumed by FetchContent.
2-
# Bump both fields when updating.
2+
# Bump `ONPAIR_CPP_TAG` to a full commit SHA when updating — never use a
3+
# branch name in CI, otherwise builds become non-reproducible.
34
set(ONPAIR_CPP_REPO "https://github.com/gargiulofrancesco/onpair_cpp.git")
4-
set(ONPAIR_CPP_TAG "main")
5+
set(ONPAIR_CPP_TAG "ae590713515c7bb7893e14a757b484545e5339c3")

vortex-btrblocks/src/builder.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[
5353
// String schemes.
5454
////////////////////////////////////////////////////////////////////////////////////////////////
5555
&string::StringDictScheme,
56-
&string::FSSTScheme,
5756
#[cfg(feature = "onpair")]
5857
&string::OnPairScheme,
5958
&string::StringConstantScheme,
@@ -170,14 +169,20 @@ impl BtrBlocksCompressorBuilder {
170169
/// preserves the array buffer layout for zero-conversion GPU decompression. Without it,
171170
/// interleaved Zstd compression is used.
172171
pub fn only_cuda_compatible(self) -> Self {
173-
let builder = self.exclude_schemes([
172+
// String fragmentation schemes (OnPair, FSST) require host-side
173+
// dictionary expansion at decode time, which is incompatible with
174+
// pure-GPU decompression paths. Strip whichever string-fragment
175+
// scheme is enabled by feature.
176+
let mut excluded: Vec<SchemeId> = vec![
174177
integer::SparseScheme.id(),
175178
integer::IntRLEScheme.id(),
176179
float::FloatRLEScheme.id(),
177180
float::NullDominatedSparseScheme.id(),
178181
string::StringDictScheme.id(),
179-
string::FSSTScheme.id(),
180-
]);
182+
];
183+
#[cfg(feature = "onpair")]
184+
excluded.push(string::OnPairScheme.id());
185+
let builder = self.exclude_schemes(excluded);
181186

182187
#[cfg(all(feature = "zstd", feature = "unstable_encodings"))]
183188
let builder = builder.with_new_scheme(&string::ZstdBuffersScheme);

vortex-btrblocks/src/schemes/string.rs

Lines changed: 56 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -41,17 +41,22 @@ use crate::Scheme;
4141
use crate::SchemeExt;
4242

4343
/// FSST (Fast Static Symbol Table) compression.
44+
///
45+
/// Retained for callers that want to opt back in via
46+
/// [`BtrBlocksCompressorBuilder::with_new_scheme`]; it is **not** part of the
47+
/// default [`ALL_SCHEMES`] anymore — the default string-fragmentation slot is
48+
/// filled by [`OnPairScheme`].
4449
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
4550
pub struct FSSTScheme;
4651

47-
/// OnPair short-string compression (dict-12, FSST-shape children).
52+
/// OnPair short-string compression (dict-12).
4853
///
49-
/// Targets the same workload as FSST — large columns of short-to-medium
50-
/// strings with high lexical overlap — but uses a learned dictionary of
51-
/// frequent adjacent substrings and 12-bit codes. The codes / offsets /
52-
/// uncompressed-lengths children all flow through the cascading compressor
53-
/// the same way FSST's do, so any downstream bit-packing / FoR / etc. still
54-
/// applies.
54+
/// The default string-fragmentation scheme — targets large columns of
55+
/// short-to-medium strings with high lexical overlap, like URLs or log lines.
56+
/// Uses a learned dictionary of frequent adjacent substrings (built by the
57+
/// OnPair C++ trainer at compress time) and 12-bit token codes stored as a
58+
/// u16 child, with offsets / uncompressed-lengths flowing through the
59+
/// cascading compressor like any other primitive children.
5560
#[cfg(feature = "onpair")]
5661
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
5762
pub struct OnPairScheme;
@@ -546,11 +551,12 @@ mod scheme_selection_tests {
546551
);
547552
}
548553

554+
#[cfg(feature = "onpair")]
549555
#[test]
550-
fn test_dictionary_string_scheme_compressed() -> VortexResult<()> {
556+
fn test_onpair_compressed() -> VortexResult<()> {
551557
// Dictionary-style string corpus: high lexical overlap, short rows.
552-
// FSST and OnPair both target this shape; the cascading compressor
553-
// picks whichever samples better, so accept either.
558+
// OnPair is the only string-fragmentation scheme in the default
559+
// builder, so it should win the sample-based comparison.
554560
let mut strings = Vec::with_capacity(1000);
555561
for i in 0..1000 {
556562
strings.push(Some(format!(
@@ -561,14 +567,47 @@ mod scheme_selection_tests {
561567
let array_ref = array.into_array();
562568
let compressed = BtrBlocksCompressor::default()
563569
.compress(&array_ref, &mut SESSION.create_execution_ctx())?;
564-
let is_fsst = compressed.is::<FSST>();
565-
#[cfg(feature = "onpair")]
566-
let is_onpair = compressed.is::<vortex_onpair::OnPair>();
567-
#[cfg(not(feature = "onpair"))]
568-
let is_onpair = false;
569570
assert!(
570-
is_fsst || is_onpair,
571-
"expected FSST or OnPair, got {}",
571+
compressed.is::<vortex_onpair::OnPair>(),
572+
"expected OnPair, got {}",
573+
compressed.encoding_id()
574+
);
575+
Ok(())
576+
}
577+
578+
/// FSST is no longer in the default scheme list, but `with_new_scheme`
579+
/// still lets callers opt it back in.
580+
#[test]
581+
fn test_fsst_opt_in_still_works() -> VortexResult<()> {
582+
use crate::BtrBlocksCompressorBuilder;
583+
use crate::SchemeExt;
584+
use crate::schemes::string::FSSTScheme;
585+
586+
// FSST must not be registered by default.
587+
assert!(
588+
!crate::ALL_SCHEMES.iter().any(|s| s.id() == FSSTScheme.id()),
589+
"FSSTScheme should not be in ALL_SCHEMES anymore",
590+
);
591+
592+
// ...but explicitly adding it back should still produce a compressor
593+
// that returns an FSST array for FSST-favourable input. Start from an
594+
// empty builder so the sample-based comparison can't pick OnPair.
595+
let mut strings = Vec::with_capacity(1000);
596+
for i in 0..1000 {
597+
strings.push(Some(format!(
598+
"this_is_a_common_prefix_with_some_variation_{i}_and_a_common_suffix_pattern"
599+
)));
600+
}
601+
let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable));
602+
let array_ref = array.into_array();
603+
604+
let compressor = BtrBlocksCompressorBuilder::empty()
605+
.with_new_scheme(&FSSTScheme)
606+
.build();
607+
let compressed = compressor.compress(&array_ref, &mut SESSION.create_execution_ctx())?;
608+
assert!(
609+
compressed.is::<FSST>(),
610+
"expected FSST when only FSSTScheme is registered, got {}",
572611
compressed.encoding_id()
573612
);
574613
Ok(())

vortex-file/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ vortex-io = { workspace = true }
4646
vortex-layout = { workspace = true }
4747
vortex-mask = { workspace = true }
4848
vortex-metrics = { workspace = true }
49+
vortex-onpair = { workspace = true, optional = true }
4950
vortex-pco = { workspace = true }
5051
vortex-runend = { workspace = true }
5152
vortex-scan = { workspace = true }
@@ -68,7 +69,9 @@ vortex-scan = { workspace = true }
6869
workspace = true
6970

7071
[features]
72+
default = ["onpair"]
7173
object_store = ["dep:object_store", "vortex-io/object_store", "tokio"]
74+
onpair = ["dep:vortex-onpair", "vortex-btrblocks/onpair"]
7275
tokio = [
7376
"dep:tokio",
7477
"vortex-error/tokio",

vortex-file/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ use vortex_array::arrays::patched::use_experimental_patches;
115115
use vortex_array::session::ArraySessionExt;
116116
use vortex_bytebool::ByteBool;
117117
use vortex_fsst::FSST;
118+
#[cfg(feature = "onpair")]
119+
use vortex_onpair::OnPair;
118120
use vortex_pco::Pco;
119121
use vortex_session::VortexSession;
120122
use vortex_sparse::Sparse;
@@ -163,6 +165,8 @@ pub fn register_default_encodings(session: &VortexSession) {
163165
arrays.register(ByteBool);
164166
arrays.register(Dict);
165167
arrays.register(FSST);
168+
#[cfg(feature = "onpair")]
169+
arrays.register(OnPair);
166170
arrays.register(Pco);
167171
arrays.register(Sparse);
168172
arrays.register(ZigZag);

vortex-file/src/strategy.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ use vortex_layout::layouts::repartition::RepartitionWriterOptions;
5252
use vortex_layout::layouts::table::TableStrategy;
5353
use vortex_layout::layouts::zoned::writer::ZonedLayoutOptions;
5454
use vortex_layout::layouts::zoned::writer::ZonedStrategy;
55+
#[cfg(feature = "onpair")]
56+
use vortex_onpair::OnPair;
5557
use vortex_pco::Pco;
5658
use vortex_runend::RunEnd;
5759
use vortex_sequence::Sequence;
@@ -100,6 +102,8 @@ pub static ALLOWED_ENCODINGS: LazyLock<HashSet<ArrayId>> = LazyLock::new(|| {
100102
allowed.insert(Delta.id());
101103
allowed.insert(FoR.id());
102104
allowed.insert(FSST.id());
105+
#[cfg(feature = "onpair")]
106+
allowed.insert(OnPair.id());
103107
allowed.insert(Pco.id());
104108
allowed.insert(RLE.id());
105109
allowed.insert(RunEnd.id());

0 commit comments

Comments
 (0)