Skip to content

Commit 8addf4c

Browse files
committed
rework schemes AGAIN
Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent 1fa9ebe commit 8addf4c

37 files changed

Lines changed: 1513 additions & 1238 deletions

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-btrblocks/Cargo.toml

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,5 @@ name = "compress_listview"
6565
harness = false
6666
test = false
6767

68-
[[bench]]
69-
name = "dict_encode"
70-
harness = false
71-
test = false
72-
73-
[[bench]]
74-
name = "stats_calc"
75-
harness = false
76-
test = false
77-
7868
[package.metadata.cargo-machete]
7969
ignored = ["getrandom_v03"]

vortex-btrblocks/public-api.lock

Lines changed: 19 additions & 19 deletions
Large diffs are not rendered by default.

vortex-btrblocks/src/lib.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,11 @@ pub use builder::BtrBlocksCompressorBuilder;
6666
pub use canonical_compressor::BtrBlocksCompressor;
6767
pub use schemes::patches::compress_patches;
6868
pub use vortex_compressor::CascadingCompressor;
69-
pub use vortex_compressor::builtins::integer_dictionary_encode;
7069
pub use vortex_compressor::ctx::CompressorContext;
7170
pub use vortex_compressor::ctx::MAX_CASCADE;
7271
pub use vortex_compressor::scheme::Scheme;
7372
pub use vortex_compressor::scheme::SchemeExt;
7473
pub use vortex_compressor::scheme::SchemeId;
75-
pub use vortex_compressor::scheme::estimate_compression_ratio_with_sampling;
7674
pub use vortex_compressor::stats::ArrayAndStats;
7775
pub use vortex_compressor::stats::BoolStats;
7876
pub use vortex_compressor::stats::FloatStats;

vortex-btrblocks/src/schemes/decimal.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use vortex_array::arrays::PrimitiveArray;
1111
use vortex_array::arrays::decimal::narrowed_decimal;
1212
use vortex_array::dtype::DecimalType;
1313
use vortex_decimal_byte_parts::DecimalByteParts;
14+
use vortex_compressor::estimate::CompressionEstimate;
1415
use vortex_error::VortexResult;
1516

1617
use crate::ArrayAndStats;
@@ -42,12 +43,11 @@ impl Scheme for DecimalScheme {
4243

4344
fn expected_compression_ratio(
4445
&self,
45-
_compressor: &CascadingCompressor,
4646
_data: &mut ArrayAndStats,
4747
_ctx: CompressorContext,
48-
) -> VortexResult<f64> {
48+
) -> CompressionEstimate {
4949
// Decimal compression is almost always beneficial (narrowing + primitive compression).
50-
Ok(f64::MAX)
50+
CompressionEstimate::AlwaysUse
5151
}
5252

5353
fn compress(

vortex-btrblocks/src/schemes/float.rs

Lines changed: 44 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use vortex_array::Canonical;
1111
use vortex_array::IntoArray;
1212
use vortex_array::ToCanonical;
1313
use vortex_array::dtype::PType;
14+
use vortex_compressor::estimate::CompressionEstimate;
1415
use vortex_compressor::scheme::ChildSelection;
1516
use vortex_compressor::scheme::DescendantExclusion;
1617
use vortex_error::VortexResult;
@@ -24,7 +25,6 @@ use crate::CompressorContext;
2425
use crate::Scheme;
2526
use crate::SchemeExt;
2627
use crate::compress_patches;
27-
use crate::estimate_compression_ratio_with_sampling;
2828

2929
/// ALP (Adaptive Lossless floating-Point) encoding.
3030
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
@@ -69,22 +69,21 @@ impl Scheme for ALPScheme {
6969

7070
fn expected_compression_ratio(
7171
&self,
72-
compressor: &CascadingCompressor,
7372
data: &mut ArrayAndStats,
7473
ctx: CompressorContext,
75-
) -> VortexResult<f64> {
74+
) -> CompressionEstimate {
7675
// ALP encodes floats as integers. Without integer compression afterward, the encoded ints
7776
// are the same size.
7877
if ctx.finished_cascading() {
79-
return Ok(0.0);
78+
return CompressionEstimate::Skip;
8079
}
8180

8281
// We don't support ALP for f16.
83-
if data.float_stats().source().ptype() == PType::F16 {
84-
return Ok(0.0);
82+
if data.array_as_primitive().ptype() == PType::F16 {
83+
return CompressionEstimate::Skip;
8584
}
8685

87-
estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx)
86+
CompressionEstimate::Sample
8887
}
8988

9089
fn compress(
@@ -93,9 +92,7 @@ impl Scheme for ALPScheme {
9392
data: &mut ArrayAndStats,
9493
ctx: CompressorContext,
9594
) -> VortexResult<ArrayRef> {
96-
let stats = data.float_stats();
97-
98-
let alp_encoded = alp_encode(stats.source(), None)?;
95+
let alp_encoded = alp_encode(&data.array_as_primitive(), None)?;
9996

10097
// Compress the ALP ints.
10198
let compressed_alp_ints =
@@ -120,15 +117,15 @@ impl Scheme for ALPRDScheme {
120117

121118
fn expected_compression_ratio(
122119
&self,
123-
compressor: &CascadingCompressor,
124120
data: &mut ArrayAndStats,
125-
ctx: CompressorContext,
126-
) -> VortexResult<f64> {
127-
if data.float_stats().source().ptype() == PType::F16 {
128-
return Ok(0.0);
121+
_ctx: CompressorContext,
122+
) -> CompressionEstimate {
123+
// We don't support ALPRD for f16.
124+
if data.array_as_primitive().ptype() == PType::F16 {
125+
return CompressionEstimate::Skip;
129126
}
130127

131-
estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx)
128+
CompressionEstimate::Sample
132129
}
133130

134131
fn compress(
@@ -137,15 +134,15 @@ impl Scheme for ALPRDScheme {
137134
data: &mut ArrayAndStats,
138135
_ctx: CompressorContext,
139136
) -> VortexResult<ArrayRef> {
140-
let stats = data.float_stats();
137+
let primitive_array = data.array_as_primitive();
141138

142-
let encoder = match stats.source().ptype() {
143-
PType::F32 => RDEncoder::new(stats.source().as_slice::<f32>()),
144-
PType::F64 => RDEncoder::new(stats.source().as_slice::<f64>()),
139+
let encoder = match primitive_array.ptype() {
140+
PType::F32 => RDEncoder::new(primitive_array.as_slice::<f32>()),
141+
PType::F64 => RDEncoder::new(primitive_array.as_slice::<f64>()),
145142
ptype => vortex_panic!("cannot ALPRD compress ptype {ptype}"),
146143
};
147144

148-
let alp_rd = encoder.encode(stats.source());
145+
let alp_rd = encoder.encode(&primitive_array);
149146
let dtype = alp_rd.dtype().clone();
150147
let right_bit_width = alp_rd.right_bit_width();
151148
let mut alp_rd_data = alp_rd.into_data();
@@ -193,24 +190,25 @@ impl Scheme for NullDominatedSparseScheme {
193190

194191
fn expected_compression_ratio(
195192
&self,
196-
_compressor: &CascadingCompressor,
197193
data: &mut ArrayAndStats,
198194
_ctx: CompressorContext,
199-
) -> VortexResult<f64> {
195+
) -> CompressionEstimate {
196+
let len = data.array_len() as f64;
200197
let stats = data.float_stats();
198+
let value_count = stats.value_count();
201199

202-
if stats.value_count() == 0 {
203-
// All nulls should use ConstantScheme instead of this.
204-
return Ok(0.0);
200+
// All-null arrays should be compressed as constant instead anyways.
201+
if value_count == 0 {
202+
return CompressionEstimate::Skip;
205203
}
206204

207205
// If the majority (90%) of values is null, this will compress well.
208-
if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 {
209-
return Ok(stats.source().len() as f64 / stats.value_count() as f64);
206+
if stats.null_count() as f64 / len > 0.9 {
207+
return CompressionEstimate::Ratio(len / value_count as f64);
210208
}
211209

212210
// Otherwise we don't go this route.
213-
Ok(0.0)
211+
CompressionEstimate::Skip
214212
}
215213

216214
fn compress(
@@ -219,10 +217,8 @@ impl Scheme for NullDominatedSparseScheme {
219217
data: &mut ArrayAndStats,
220218
ctx: CompressorContext,
221219
) -> VortexResult<ArrayRef> {
222-
let stats = data.float_stats();
223-
224220
// We pass None as we only run this pathway for NULL-dominated float arrays.
225-
let sparse_encoded = Sparse::encode(&stats.source().clone().into_array(), None)?;
221+
let sparse_encoded = Sparse::encode(data.array(), None)?;
226222

227223
if let Some(sparse) = sparse_encoded.as_opt::<Sparse>() {
228224
let indices = sparse.patches().indices().to_primitive().narrow()?;
@@ -252,17 +248,26 @@ impl Scheme for PcoScheme {
252248
is_float_primitive(canonical)
253249
}
254250

251+
fn expected_compression_ratio(
252+
&self,
253+
_data: &mut ArrayAndStats,
254+
_ctx: CompressorContext,
255+
) -> CompressionEstimate {
256+
CompressionEstimate::Sample
257+
}
258+
255259
fn compress(
256260
&self,
257261
_compressor: &CascadingCompressor,
258262
data: &mut ArrayAndStats,
259263
_ctx: CompressorContext,
260264
) -> VortexResult<ArrayRef> {
261-
let stats = data.float_stats();
262-
Ok(
263-
vortex_pco::Pco::from_primitive(stats.source(), pco::DEFAULT_COMPRESSION_LEVEL, 8192)?
264-
.into_array(),
265-
)
265+
Ok(vortex_pco::Pco::from_primitive(
266+
&data.array_as_primitive(),
267+
pco::DEFAULT_COMPRESSION_LEVEL,
268+
8192,
269+
)?
270+
.into_array())
266271
}
267272
}
268273

@@ -408,7 +413,8 @@ mod scheme_selection_tests {
408413
let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable);
409414
let btr = BtrBlocksCompressor::default();
410415
let compressed = btr.compress(&array.into_array())?;
411-
assert!(compressed.is::<Dict>());
416+
assert!(compressed.is::<ALP>());
417+
assert!(compressed.children()[0].is::<Dict>());
412418
Ok(())
413419
}
414420

0 commit comments

Comments
 (0)