Skip to content

Commit 09862bf

Browse files
committed
pluggable compressor first draft
Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent 2672e1b commit 09862bf

17 files changed

Lines changed: 1138 additions & 1437 deletions

File tree

fuzz/src/array/mod.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,10 @@ use vortex_array::search_sorted::SearchSorted;
6060
use vortex_array::search_sorted::SearchSortedSide;
6161
use vortex_btrblocks::BtrBlocksCompressor;
6262
use vortex_btrblocks::BtrBlocksCompressorBuilder;
63-
use vortex_btrblocks::FloatCode;
64-
use vortex_btrblocks::IntCode;
65-
use vortex_btrblocks::StringCode;
63+
use vortex_btrblocks::Scheme;
64+
use vortex_btrblocks::compressor::float;
65+
use vortex_btrblocks::compressor::integer;
66+
use vortex_btrblocks::compressor::string;
6667
use vortex_error::VortexExpect;
6768
use vortex_error::vortex_panic;
6869
use vortex_mask::Mask;
@@ -538,9 +539,11 @@ pub fn compress_array(array: &ArrayRef, strategy: CompressorStrategy) -> ArrayRe
538539
.compress(array)
539540
.vortex_expect("BtrBlocksCompressor compress should succeed in fuzz test"),
540541
CompressorStrategy::Compact => BtrBlocksCompressorBuilder::default()
541-
.include_string([StringCode::Zstd])
542-
.include_int([IntCode::Pco])
543-
.include_float([FloatCode::Pco])
542+
.include([
543+
string::ZstdScheme.id(),
544+
integer::PcoScheme.id(),
545+
float::PcoScheme.id(),
546+
])
544547
.build()
545548
.compress(array)
546549
.vortex_expect("Compact compress should succeed in fuzz test"),

vortex-btrblocks/src/builder.rs

Lines changed: 101 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -3,156 +3,152 @@
33

44
//! Builder for configuring `BtrBlocksCompressor` instances.
55
6-
use itertools::Itertools;
76
use vortex_utils::aliases::hash_set::HashSet;
87

98
use crate::BtrBlocksCompressor;
10-
use crate::FloatCode;
11-
use crate::IntCode;
12-
use crate::StringCode;
13-
use crate::compressor::float::ALL_FLOAT_SCHEMES;
14-
use crate::compressor::float::FloatScheme;
15-
use crate::compressor::integer::ALL_INT_SCHEMES;
16-
use crate::compressor::integer::IntegerScheme;
17-
use crate::compressor::string::ALL_STRING_SCHEMES;
18-
use crate::compressor::string::StringScheme;
9+
use crate::Scheme;
10+
use crate::SchemeId;
11+
12+
/// All available compression schemes.
13+
///
14+
/// This list is order-sensitive: the builder preserves this order when constructing
15+
/// the final scheme list, so that tie-breaking is deterministic.
16+
pub const ALL_SCHEMES: &[&dyn Scheme] = &[
17+
// Integer schemes.
18+
&crate::compressor::integer::UncompressedScheme as &dyn Scheme,
19+
&crate::compressor::integer::ConstantScheme,
20+
&crate::compressor::integer::FORScheme,
21+
&crate::compressor::integer::ZigZagScheme,
22+
&crate::compressor::integer::BitPackingScheme,
23+
&crate::compressor::integer::SparseScheme,
24+
&crate::compressor::integer::DictScheme,
25+
&crate::compressor::integer::RunEndScheme,
26+
&crate::compressor::integer::SequenceScheme,
27+
&crate::compressor::integer::RLE_INTEGER_SCHEME,
28+
#[cfg(feature = "pco")]
29+
&crate::compressor::integer::PcoScheme,
30+
// Float schemes.
31+
&crate::compressor::float::UncompressedScheme,
32+
&crate::compressor::float::ConstantScheme,
33+
&crate::compressor::float::ALPScheme,
34+
&crate::compressor::float::ALPRDScheme,
35+
&crate::compressor::float::DictScheme,
36+
&crate::compressor::float::NullDominated,
37+
&crate::compressor::float::RLE_FLOAT_SCHEME,
38+
#[cfg(feature = "pco")]
39+
&crate::compressor::float::PcoScheme,
40+
// String schemes.
41+
&crate::compressor::string::UncompressedScheme,
42+
&crate::compressor::string::DictScheme,
43+
&crate::compressor::string::FSSTScheme,
44+
&crate::compressor::string::ConstantScheme,
45+
&crate::compressor::string::NullDominated,
46+
#[cfg(feature = "zstd")]
47+
&crate::compressor::string::ZstdScheme,
48+
#[cfg(all(feature = "zstd", feature = "unstable_encodings"))]
49+
&crate::compressor::string::ZstdBuffersScheme,
50+
];
51+
52+
/// Schemes that are compiled in (their feature is enabled) but excluded by default as known-expensive.
53+
const DEFAULT_EXCLUDED: &[SchemeId] = &[
54+
#[cfg(feature = "pco")]
55+
SchemeId {
56+
name: "vortex.int.pco",
57+
},
58+
#[cfg(feature = "pco")]
59+
SchemeId {
60+
name: "vortex.float.pco",
61+
},
62+
#[cfg(feature = "zstd")]
63+
SchemeId {
64+
name: "vortex.string.zstd",
65+
},
66+
#[cfg(all(feature = "zstd", feature = "unstable_encodings"))]
67+
SchemeId {
68+
name: "vortex.string.zstd_buffers",
69+
},
70+
];
1971

2072
/// Builder for creating configured [`BtrBlocksCompressor`] instances.
2173
///
22-
/// Use this builder to configure which compression schemes are allowed for each data type.
23-
/// By default, all schemes are enabled.
74+
/// Use this builder to configure which compression schemes are allowed.
75+
/// By default, all schemes are enabled except the Pco and Zstd schemes, which are opt-in via [`Self::include`].
2476
///
2577
/// # Examples
2678
///
2779
/// ```rust
28-
/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, IntCode, FloatCode};
80+
/// use vortex_btrblocks::{BtrBlocksCompressorBuilder, Scheme};
81+
/// use vortex_btrblocks::compressor::integer::DictScheme;
2982
///
30-
/// // Default compressor - all schemes allowed
83+
/// // Default compressor - all non-excluded schemes allowed.
3184
/// let compressor = BtrBlocksCompressorBuilder::default().build();
3285
///
33-
/// // Exclude specific schemes
86+
/// // Exclude specific schemes.
3487
/// let compressor = BtrBlocksCompressorBuilder::default()
35-
/// .exclude_int([IntCode::Dict])
88+
/// .exclude([DictScheme.id()])
3689
/// .build();
3790
///
38-
/// // Exclude then re-include
91+
/// // Exclude then re-include.
3992
/// let compressor = BtrBlocksCompressorBuilder::default()
40-
/// .exclude_int([IntCode::Dict, IntCode::Rle])
41-
/// .include_int([IntCode::Dict])
93+
/// .exclude([DictScheme.id()])
94+
/// .include([DictScheme.id()])
4295
/// .build();
4396
/// ```
4497
#[derive(Debug, Clone)]
4598
pub struct BtrBlocksCompressorBuilder {
46-
int_schemes: HashSet<&'static dyn IntegerScheme>,
47-
float_schemes: HashSet<&'static dyn FloatScheme>,
48-
string_schemes: HashSet<&'static dyn StringScheme>,
99+
schemes: HashSet<&'static dyn Scheme>,
49100
}
50101

51102
impl Default for BtrBlocksCompressorBuilder {
52103
fn default() -> Self {
104+
let excluded: HashSet<SchemeId> = DEFAULT_EXCLUDED.iter().copied().collect();
53105
Self {
54-
int_schemes: ALL_INT_SCHEMES
55-
.iter()
56-
.copied()
57-
.filter(|s| s.code() != IntCode::Pco)
58-
.collect(),
59-
float_schemes: ALL_FLOAT_SCHEMES
60-
.iter()
61-
.copied()
62-
.filter(|s| s.code() != FloatCode::Pco)
63-
.collect(),
64-
string_schemes: ALL_STRING_SCHEMES
106+
schemes: ALL_SCHEMES
65107
.iter()
66108
.copied()
67-
.filter(|s| s.code() != StringCode::Zstd && s.code() != StringCode::ZstdBuffers)
109+
.filter(|s| !excluded.contains(&s.id()))
68110
.collect(),
69111
}
70112
}
71113
}
72114

73115
impl BtrBlocksCompressorBuilder {
74-
/// Create a new builder with no encodings enabled.
75-
pub fn empty() -> Self {
76-
Self {
77-
int_schemes: Default::default(),
78-
float_schemes: Default::default(),
79-
string_schemes: Default::default(),
80-
}
81-
}
82-
83-
/// Excludes the specified integer compression schemes.
84-
pub fn exclude_int(mut self, codes: impl IntoIterator<Item = IntCode>) -> Self {
85-
let codes: HashSet<_> = codes.into_iter().collect();
86-
self.int_schemes.retain(|s| !codes.contains(&s.code()));
116+
/// Excludes the specified compression schemes by their [`SchemeId`].
117+
pub fn exclude(mut self, ids: impl IntoIterator<Item = SchemeId>) -> Self {
118+
let ids: HashSet<_> = ids.into_iter().collect();
119+
self.schemes.retain(|s| !ids.contains(&s.id()));
87120
self
88121
}
89122

90-
/// Excludes the specified float compression schemes.
91-
pub fn exclude_float(mut self, codes: impl IntoIterator<Item = FloatCode>) -> Self {
92-
let codes: HashSet<_> = codes.into_iter().collect();
93-
self.float_schemes.retain(|s| !codes.contains(&s.code()));
94-
self
95-
}
96-
97-
/// Excludes the specified string compression schemes.
98-
pub fn exclude_string(mut self, codes: impl IntoIterator<Item = StringCode>) -> Self {
99-
let codes: HashSet<_> = codes.into_iter().collect();
100-
self.string_schemes.retain(|s| !codes.contains(&s.code()));
101-
self
102-
}
103-
104-
/// Includes the specified integer compression schemes.
105-
pub fn include_int(mut self, codes: impl IntoIterator<Item = IntCode>) -> Self {
106-
let codes: HashSet<_> = codes.into_iter().collect();
107-
for scheme in ALL_INT_SCHEMES {
108-
if codes.contains(&scheme.code()) {
109-
self.int_schemes.insert(*scheme);
123+
/// Includes the specified compression schemes by their [`SchemeId`].
124+
///
125+
/// Only schemes present in [`ALL_SCHEMES`] can be included; ids that match no entry are silently ignored.
126+
pub fn include(mut self, ids: impl IntoIterator<Item = SchemeId>) -> Self {
127+
let ids: HashSet<_> = ids.into_iter().collect();
128+
for scheme in ALL_SCHEMES {
129+
if ids.contains(&scheme.id()) {
130+
self.schemes.insert(*scheme);
110131
}
111132
}
112133
self
113134
}
114135

115-
/// Includes the specified float compression schemes.
116-
pub fn include_float(mut self, codes: impl IntoIterator<Item = FloatCode>) -> Self {
117-
let codes: HashSet<_> = codes.into_iter().collect();
118-
for scheme in ALL_FLOAT_SCHEMES {
119-
if codes.contains(&scheme.code()) {
120-
self.float_schemes.insert(*scheme);
121-
}
122-
}
136+
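/// Adds a single scheme to the builder. NOTE(review): `build` only emits schemes found in [`ALL_SCHEMES`], so a scheme outside that list is dropped — confirm this is intended.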
137+
pub fn with_scheme(mut self, scheme: &'static dyn Scheme) -> Self {
138+
self.schemes.insert(scheme);
123139
self
124140
}
125141

126-
/// Includes the specified string compression schemes.
127-
pub fn include_string(mut self, codes: impl IntoIterator<Item = StringCode>) -> Self {
128-
let codes: HashSet<_> = codes.into_iter().collect();
129-
for scheme in ALL_STRING_SCHEMES {
130-
if codes.contains(&scheme.code()) {
131-
self.string_schemes.insert(*scheme);
132-
}
133-
}
134-
self
135-
}
136-
137-
/// Builds the configured `BtrBlocksCompressor`.
142+
/// Builds the configured [`BtrBlocksCompressor`].
143+
///
144+
/// The resulting scheme list preserves the order of [`ALL_SCHEMES`] for deterministic
145+
/// tie-breaking.
138146
pub fn build(self) -> BtrBlocksCompressor {
139-
// Note we should apply the schemes in the same order, in case try conflict.
140-
BtrBlocksCompressor {
141-
int_schemes: self
142-
.int_schemes
143-
.into_iter()
144-
.sorted_by_key(|s| s.code())
145-
.collect_vec(),
146-
float_schemes: self
147-
.float_schemes
148-
.into_iter()
149-
.sorted_by_key(|s| s.code())
150-
.collect_vec(),
151-
string_schemes: self
152-
.string_schemes
153-
.into_iter()
154-
.sorted_by_key(|s| s.code())
155-
.collect_vec(),
156-
}
147+
let schemes = ALL_SCHEMES
148+
.iter()
149+
.copied()
150+
.filter(|s| self.schemes.contains(s))
151+
.collect();
152+
BtrBlocksCompressor { schemes }
157153
}
158154
}

0 commit comments

Comments
 (0)