Skip to content

Commit be65147

Browse files
authored
Rework Scheme estimation in compressor (#7230)
## Summary Tracking Issue: #7216 Adds a new `CompressionEstimate` type in `vortex-compressor/src/estimate.rs` that the `expected_compression_ratio` method now returns. Additionally, this moves some things around for clarity. Note that this is not just a refactor; subtle logic has changed in a few places (changes that I believe are improvements, though I am not fully certain). I'm happy to split some of this out into other PRs if that helps. ### Future Work - I would also like to add a variant called `Exact` that returns the fully compressed array, for the case where we can only determine whether a scheme is a candidate by compressing the whole array without any errors; the only case where we want to do this is `SequenceArray` (and there may be an argument for doing it for `ConstantArray` too, but the semantics around `ConstantArray` should be even more special regardless, in my opinion). - This might live in a `ResolvedEstimate` enum instead. - There are also a number of TODOs scattered throughout that are easily fixed, but I want to address those in a follow-up. - We probably want to hardcode the `ConstantScheme` logic into the compressor, since I cannot think of any reason you would not want a `ConstantScheme` (except for a very small array, in which case performance is not a concern anyway). ## API Changes `expected_compression_ratio` now takes only stats and the compressor context (it no longer takes the compressor at all) and returns a `CompressionEstimate`. This method must be very fast; any sampling or expensive operations are now deferred by the compressor until later. ## Testing Just a few extra tests; I am relying on the existing test suite, since no entirely new logic is introduced. Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent 06065ff commit be65147

File tree

37 files changed

+1504
-1229
lines changed

37 files changed

+1504
-1229
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-btrblocks/Cargo.toml

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,5 @@ name = "compress_listview"
6565
harness = false
6666
test = false
6767

68-
[[bench]]
69-
name = "dict_encode"
70-
harness = false
71-
test = false
72-
73-
[[bench]]
74-
name = "stats_calc"
75-
harness = false
76-
test = false
77-
7868
[package.metadata.cargo-machete]
7969
ignored = ["getrandom_v03"]

vortex-btrblocks/public-api.lock

Lines changed: 19 additions & 19 deletions
Large diffs are not rendered by default.

vortex-btrblocks/src/lib.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,11 @@ pub use builder::BtrBlocksCompressorBuilder;
6666
pub use canonical_compressor::BtrBlocksCompressor;
6767
pub use schemes::patches::compress_patches;
6868
pub use vortex_compressor::CascadingCompressor;
69-
pub use vortex_compressor::builtins::integer_dictionary_encode;
7069
pub use vortex_compressor::ctx::CompressorContext;
7170
pub use vortex_compressor::ctx::MAX_CASCADE;
7271
pub use vortex_compressor::scheme::Scheme;
7372
pub use vortex_compressor::scheme::SchemeExt;
7473
pub use vortex_compressor::scheme::SchemeId;
75-
pub use vortex_compressor::scheme::estimate_compression_ratio_with_sampling;
7674
pub use vortex_compressor::stats::ArrayAndStats;
7775
pub use vortex_compressor::stats::BoolStats;
7876
pub use vortex_compressor::stats::FloatStats;

vortex-btrblocks/src/schemes/decimal.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use vortex_array::ToCanonical;
1010
use vortex_array::arrays::PrimitiveArray;
1111
use vortex_array::arrays::decimal::narrowed_decimal;
1212
use vortex_array::dtype::DecimalType;
13+
use vortex_compressor::estimate::CompressionEstimate;
1314
use vortex_decimal_byte_parts::DecimalByteParts;
1415
use vortex_error::VortexResult;
1516

@@ -42,12 +43,11 @@ impl Scheme for DecimalScheme {
4243

4344
fn expected_compression_ratio(
4445
&self,
45-
_compressor: &CascadingCompressor,
4646
_data: &mut ArrayAndStats,
4747
_ctx: CompressorContext,
48-
) -> VortexResult<f64> {
48+
) -> CompressionEstimate {
4949
// Decimal compression is almost always beneficial (narrowing + primitive compression).
50-
Ok(f64::MAX)
50+
CompressionEstimate::AlwaysUse
5151
}
5252

5353
fn compress(

vortex-btrblocks/src/schemes/float.rs

Lines changed: 44 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use vortex_array::IntoArray;
1515
use vortex_array::ToCanonical;
1616
use vortex_array::arrays::primitive::PrimitiveArrayExt;
1717
use vortex_array::dtype::PType;
18+
use vortex_compressor::estimate::CompressionEstimate;
1819
use vortex_compressor::scheme::ChildSelection;
1920
use vortex_compressor::scheme::DescendantExclusion;
2021
use vortex_error::VortexResult;
@@ -28,7 +29,6 @@ use crate::CompressorContext;
2829
use crate::Scheme;
2930
use crate::SchemeExt;
3031
use crate::compress_patches;
31-
use crate::estimate_compression_ratio_with_sampling;
3232

3333
/// ALP (Adaptive Lossless floating-Point) encoding.
3434
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
@@ -73,22 +73,21 @@ impl Scheme for ALPScheme {
7373

7474
fn expected_compression_ratio(
7575
&self,
76-
compressor: &CascadingCompressor,
7776
data: &mut ArrayAndStats,
7877
ctx: CompressorContext,
79-
) -> VortexResult<f64> {
78+
) -> CompressionEstimate {
8079
// ALP encodes floats as integers. Without integer compression afterward, the encoded ints
8180
// are the same size.
8281
if ctx.finished_cascading() {
83-
return Ok(0.0);
82+
return CompressionEstimate::Skip;
8483
}
8584

8685
// We don't support ALP for f16.
87-
if data.float_stats().source().ptype() == PType::F16 {
88-
return Ok(0.0);
86+
if data.array_as_primitive().ptype() == PType::F16 {
87+
return CompressionEstimate::Skip;
8988
}
9089

91-
estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx)
90+
CompressionEstimate::Sample
9291
}
9392

9493
fn compress(
@@ -97,9 +96,7 @@ impl Scheme for ALPScheme {
9796
data: &mut ArrayAndStats,
9897
ctx: CompressorContext,
9998
) -> VortexResult<ArrayRef> {
100-
let stats = data.float_stats();
101-
102-
let alp_encoded = alp_encode(stats.source(), None)?;
99+
let alp_encoded = alp_encode(&data.array_as_primitive(), None)?;
103100

104101
// Compress the ALP ints.
105102
let compressed_alp_ints =
@@ -124,15 +121,15 @@ impl Scheme for ALPRDScheme {
124121

125122
fn expected_compression_ratio(
126123
&self,
127-
compressor: &CascadingCompressor,
128124
data: &mut ArrayAndStats,
129-
ctx: CompressorContext,
130-
) -> VortexResult<f64> {
131-
if data.float_stats().source().ptype() == PType::F16 {
132-
return Ok(0.0);
125+
_ctx: CompressorContext,
126+
) -> CompressionEstimate {
127+
// We don't support ALPRD for f16.
128+
if data.array_as_primitive().ptype() == PType::F16 {
129+
return CompressionEstimate::Skip;
133130
}
134131

135-
estimate_compression_ratio_with_sampling(self, compressor, data.array(), ctx)
132+
CompressionEstimate::Sample
136133
}
137134

138135
fn compress(
@@ -141,15 +138,15 @@ impl Scheme for ALPRDScheme {
141138
data: &mut ArrayAndStats,
142139
_ctx: CompressorContext,
143140
) -> VortexResult<ArrayRef> {
144-
let stats = data.float_stats();
141+
let primitive_array = data.array_as_primitive();
145142

146-
let encoder = match stats.source().ptype() {
147-
PType::F32 => RDEncoder::new(stats.source().as_slice::<f32>()),
148-
PType::F64 => RDEncoder::new(stats.source().as_slice::<f64>()),
143+
let encoder = match primitive_array.ptype() {
144+
PType::F32 => RDEncoder::new(primitive_array.as_slice::<f32>()),
145+
PType::F64 => RDEncoder::new(primitive_array.as_slice::<f64>()),
149146
ptype => vortex_panic!("cannot ALPRD compress ptype {ptype}"),
150147
};
151148

152-
let alp_rd = encoder.encode(stats.source());
149+
let alp_rd = encoder.encode(&primitive_array);
153150
let dtype = alp_rd.dtype().clone();
154151
let right_bit_width = alp_rd.right_bit_width();
155152
let mut parts = ALPRDArrayOwnedExt::into_data_parts(alp_rd);
@@ -191,24 +188,25 @@ impl Scheme for NullDominatedSparseScheme {
191188

192189
fn expected_compression_ratio(
193190
&self,
194-
_compressor: &CascadingCompressor,
195191
data: &mut ArrayAndStats,
196192
_ctx: CompressorContext,
197-
) -> VortexResult<f64> {
193+
) -> CompressionEstimate {
194+
let len = data.array_len() as f64;
198195
let stats = data.float_stats();
196+
let value_count = stats.value_count();
199197

200-
if stats.value_count() == 0 {
201-
// All nulls should use ConstantScheme instead of this.
202-
return Ok(0.0);
198+
// All-null arrays should be compressed as constant instead anyways.
199+
if value_count == 0 {
200+
return CompressionEstimate::Skip;
203201
}
204202

205203
// If the majority (90%) of values is null, this will compress well.
206-
if stats.null_count() as f64 / stats.source().len() as f64 > 0.9 {
207-
return Ok(stats.source().len() as f64 / stats.value_count() as f64);
204+
if stats.null_count() as f64 / len > 0.9 {
205+
return CompressionEstimate::Ratio(len / value_count as f64);
208206
}
209207

210208
// Otherwise we don't go this route.
211-
Ok(0.0)
209+
CompressionEstimate::Skip
212210
}
213211

214212
fn compress(
@@ -217,10 +215,8 @@ impl Scheme for NullDominatedSparseScheme {
217215
data: &mut ArrayAndStats,
218216
ctx: CompressorContext,
219217
) -> VortexResult<ArrayRef> {
220-
let stats = data.float_stats();
221-
222218
// We pass None as we only run this pathway for NULL-dominated float arrays.
223-
let sparse_encoded = Sparse::encode(&stats.source().clone().into_array(), None)?;
219+
let sparse_encoded = Sparse::encode(data.array(), None)?;
224220

225221
if let Some(sparse) = sparse_encoded.as_opt::<Sparse>() {
226222
let indices = sparse.patches().indices().to_primitive().narrow()?;
@@ -250,17 +246,26 @@ impl Scheme for PcoScheme {
250246
is_float_primitive(canonical)
251247
}
252248

249+
fn expected_compression_ratio(
250+
&self,
251+
_data: &mut ArrayAndStats,
252+
_ctx: CompressorContext,
253+
) -> CompressionEstimate {
254+
CompressionEstimate::Sample
255+
}
256+
253257
fn compress(
254258
&self,
255259
_compressor: &CascadingCompressor,
256260
data: &mut ArrayAndStats,
257261
_ctx: CompressorContext,
258262
) -> VortexResult<ArrayRef> {
259-
let stats = data.float_stats();
260-
Ok(
261-
vortex_pco::Pco::from_primitive(stats.source(), pco::DEFAULT_COMPRESSION_LEVEL, 8192)?
262-
.into_array(),
263-
)
263+
Ok(vortex_pco::Pco::from_primitive(
264+
&data.array_as_primitive(),
265+
pco::DEFAULT_COMPRESSION_LEVEL,
266+
8192,
267+
)?
268+
.into_array())
264269
}
265270
}
266271

@@ -406,7 +411,8 @@ mod scheme_selection_tests {
406411
let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable);
407412
let btr = BtrBlocksCompressor::default();
408413
let compressed = btr.compress(&array.into_array())?;
409-
assert!(compressed.is::<Dict>());
414+
assert!(compressed.is::<ALP>());
415+
assert!(compressed.children()[0].is::<Dict>());
410416
Ok(())
411417
}
412418

0 commit comments

Comments (0)