Skip to content

Commit a083533

Browse files
committed
make distinct values calc lazy + refactor typed stats
Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent 09862bf commit a083533

10 files changed

Lines changed: 353 additions & 144 deletions

File tree

vortex-btrblocks/src/canonical_compressor.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ use vortex_array::arrays::ExtensionArray;
1616
use vortex_array::arrays::FixedSizeListArray;
1717
use vortex_array::arrays::ListArray;
1818
use vortex_array::arrays::ListViewArray;
19+
use vortex_array::arrays::Primitive;
1920
use vortex_array::arrays::StructArray;
2021
use vortex_array::arrays::TemporalArray;
2122
use vortex_array::arrays::listview::list_from_list_view;
@@ -31,11 +32,16 @@ use vortex_error::VortexResult;
3132

3233
use crate::BtrBlocksCompressorBuilder;
3334
use crate::CompressorContext;
35+
use crate::CompressorStats;
36+
use crate::GenerateStatsOptions;
3437
use crate::Scheme;
3538
use crate::SchemeId;
3639
use crate::StatsCache;
3740
use crate::compressor::decimal::compress_decimal;
41+
use crate::compressor::float::FloatStats;
3842
use crate::compressor::integer::DictScheme as IntDictScheme;
43+
use crate::compressor::integer::IntegerStats;
44+
use crate::compressor::string::StringStats;
3945
use crate::compressor::temporal::compress_temporal;
4046

4147
/// The main compressor type implementing BtrBlocks-inspired compression.
@@ -217,8 +223,44 @@ impl BtrBlocksCompressor {
217223
}
218224

219225
let before_nbytes = array.nbytes();
226+
let needs_distinct = eligible.iter().any(|s| s.needs_distinct_values());
220227
let mut cache = StatsCache::new();
221228

229+
// Pre-populate the stats cache with the right `count_distinct_values` setting.
230+
// This matches the old `gen_stats` behavior where distinct values were only computed
231+
// when Dict was in the scheme list.
232+
if let Some(prim) = array.as_opt::<Primitive>() {
233+
let prim = prim.to_primitive();
234+
if prim.ptype().is_int() {
235+
cache.get_or_insert_with::<IntegerStats>(|| {
236+
IntegerStats::generate_opts(
237+
&prim,
238+
GenerateStatsOptions {
239+
count_distinct_values: needs_distinct,
240+
},
241+
)
242+
});
243+
} else {
244+
cache.get_or_insert_with::<FloatStats>(|| {
245+
FloatStats::generate_opts(
246+
&prim,
247+
GenerateStatsOptions {
248+
count_distinct_values: needs_distinct,
249+
},
250+
)
251+
});
252+
}
253+
} else if array.as_opt::<vortex_array::arrays::VarBinView>().is_some() {
254+
cache.get_or_insert_with::<StringStats>(|| {
255+
StringStats::generate_opts(
256+
&array.to_varbinview(),
257+
GenerateStatsOptions {
258+
count_distinct_values: needs_distinct,
259+
},
260+
)
261+
});
262+
}
263+
222264
if let Some(winner) = self.choose_scheme(&eligible, &array, ctx, &mut cache, excludes)? {
223265
let compressed = winner.compress(self, &array, ctx, &mut cache, excludes)?;
224266
if compressed.nbytes() < before_nbytes {

vortex-btrblocks/src/compressor/float/dictionary.rs

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,18 @@ use vortex_array::dtype::half::f16;
1212
use vortex_array::validity::Validity;
1313
use vortex_array::vtable::ValidityHelper;
1414
use vortex_buffer::Buffer;
15+
use vortex_error::VortexExpect;
1516

16-
use super::stats::ErasedDistinctValues;
17+
use super::stats::ErasedStats;
1718
use super::stats::FloatStats;
1819

1920
macro_rules! typed_encode {
2021
($stats:ident, $typed:ident, $validity:ident, $typ:ty) => {{
21-
let values: Buffer<$typ> = $typed.values.iter().map(|x| x.0).collect();
22+
let distinct = $typed.distinct.as_ref().vortex_expect(
23+
"this must be present since `DictScheme` declared that we need distinct values",
24+
);
25+
26+
let values: Buffer<$typ> = distinct.distinct_values.iter().map(|x| x.0).collect();
2227

2328
let max_code = values.len();
2429
let codes = if max_code <= u8::MAX as usize {
@@ -49,10 +54,10 @@ macro_rules! typed_encode {
4954
/// Compresses a floating-point array into a dictionary arrays according to attached stats.
5055
pub fn dictionary_encode(stats: &FloatStats) -> DictArray {
5156
let validity = stats.src.validity();
52-
match &stats.distinct_values {
53-
ErasedDistinctValues::F16(typed) => typed_encode!(stats, typed, validity, f16),
54-
ErasedDistinctValues::F32(typed) => typed_encode!(stats, typed, validity, f32),
55-
ErasedDistinctValues::F64(typed) => typed_encode!(stats, typed, validity, f64),
57+
match &stats.erased {
58+
ErasedStats::F16(typed) => typed_encode!(stats, typed, validity, f16),
59+
ErasedStats::F32(typed) => typed_encode!(stats, typed, validity, f32),
60+
ErasedStats::F64(typed) => typed_encode!(stats, typed, validity, f64),
5661
}
5762
}
5863

@@ -118,7 +123,12 @@ mod tests {
118123
Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
119124
let array = PrimitiveArray::new(values, validity);
120125

121-
let stats = FloatStats::generate(&array);
126+
let stats = FloatStats::generate_opts(
127+
&array,
128+
crate::GenerateStatsOptions {
129+
count_distinct_values: true,
130+
},
131+
);
122132
let dict_array = dictionary_encode(&stats);
123133
assert_eq!(dict_array.values().len(), 2);
124134
assert_eq!(dict_array.codes().len(), 5);

vortex-btrblocks/src/compressor/float/mod.rs

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -173,11 +173,11 @@ impl Scheme for ConstantScheme {
173173
}
174174

175175
// Can only have 1 distinct value.
176-
if stats.distinct_values_count != 1 {
177-
return Ok(0.0);
176+
if stats.distinct_count().is_some_and(|count| count == 1) {
177+
return Ok(stats.value_count as f64);
178178
}
179179

180-
Ok(stats.value_count as f64)
180+
Ok(0.0)
181181
}
182182

183183
fn compress(
@@ -350,6 +350,10 @@ impl Scheme for DictScheme {
350350
is_float_primitive(canonical)
351351
}
352352

353+
fn needs_distinct_values(&self) -> bool {
354+
true
355+
}
356+
353357
fn expected_compression_ratio(
354358
&self,
355359
compressor: &BtrBlocksCompressor,
@@ -371,13 +375,19 @@ impl Scheme for DictScheme {
371375
return Ok(0.0);
372376
}
373377

374-
// If the array is high cardinality (>50% unique values) skip.
375-
if stats.distinct_values_count > stats.value_count / 2 {
376-
return Ok(0.0);
378+
// If the array is high cardinality (>50% unique values), we do not want to compress as a
379+
// dictionary.
380+
if stats
381+
.distinct_count()
382+
.is_some_and(|count| count <= stats.value_count / 2)
383+
{
384+
// Take a sample and run compression on the sample to determine before/after size.
385+
return estimate_compression_ratio_with_sampling(
386+
self, compressor, array, ctx, excludes,
387+
);
377388
}
378389

379-
// Take a sample and run compression on the sample to determine before/after size.
380-
estimate_compression_ratio_with_sampling(self, compressor, array, ctx, excludes)
390+
Ok(0.0)
381391
}
382392

383393
fn compress(

vortex-btrblocks/src/compressor/float/stats.rs

Lines changed: 63 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -27,43 +27,58 @@ use crate::compressor::rle::RLEStats;
2727
use crate::sample::sample;
2828

2929
#[derive(Debug, Clone)]
30-
pub struct DistinctValues<T> {
31-
pub values: HashSet<NativeValue<T>, FxBuildHasher>,
30+
pub struct DistinctInfo<T> {
31+
pub(super) distinct_values: HashSet<NativeValue<T>, FxBuildHasher>,
32+
distinct_count: u32,
3233
}
3334

3435
#[derive(Debug, Clone)]
35-
pub enum ErasedDistinctValues {
36-
F16(DistinctValues<f16>),
37-
F32(DistinctValues<f32>),
38-
F64(DistinctValues<f64>),
36+
pub struct TypedStats<T> {
37+
pub(super) distinct: Option<DistinctInfo<T>>,
38+
}
39+
40+
#[derive(Debug, Clone)]
41+
pub enum ErasedStats {
42+
F16(TypedStats<f16>),
43+
F32(TypedStats<f32>),
44+
F64(TypedStats<f64>),
45+
}
46+
47+
impl ErasedStats {
48+
/// Get the count of distinct values, if we have computed it already.
49+
fn distinct_count(&self) -> Option<u32> {
50+
match self {
51+
ErasedStats::F16(x) => x.distinct.as_ref().map(|d| d.distinct_count),
52+
ErasedStats::F32(x) => x.distinct.as_ref().map(|d| d.distinct_count),
53+
ErasedStats::F64(x) => x.distinct.as_ref().map(|d| d.distinct_count),
54+
}
55+
}
3956
}
4057

4158
macro_rules! impl_from_typed {
42-
($typ:ty, $variant:path) => {
43-
impl From<DistinctValues<$typ>> for ErasedDistinctValues {
44-
fn from(value: DistinctValues<$typ>) -> Self {
45-
$variant(value)
59+
($T:ty, $variant:path) => {
60+
impl From<TypedStats<$T>> for ErasedStats {
61+
fn from(typed: TypedStats<$T>) -> Self {
62+
$variant(typed)
4663
}
4764
}
4865
};
4966
}
5067

51-
impl_from_typed!(f16, ErasedDistinctValues::F16);
52-
impl_from_typed!(f32, ErasedDistinctValues::F32);
53-
impl_from_typed!(f64, ErasedDistinctValues::F64);
68+
impl_from_typed!(f16, ErasedStats::F16);
69+
impl_from_typed!(f32, ErasedStats::F32);
70+
impl_from_typed!(f64, ErasedStats::F64);
5471

5572
/// Array of floating-point numbers and relevant stats for compression.
5673
#[derive(Debug, Clone)]
5774
pub struct FloatStats {
58-
pub(crate) src: PrimitiveArray,
75+
pub(super) src: PrimitiveArray,
5976
// cache for validity.false_count()
60-
pub(crate) null_count: u32,
77+
pub(super) null_count: u32,
6178
// cache for validity.true_count()
62-
pub(crate) value_count: u32,
63-
#[allow(dead_code)]
64-
pub(crate) average_run_length: u32,
65-
pub(crate) distinct_values: ErasedDistinctValues,
66-
pub(crate) distinct_values_count: u32,
79+
pub(super) value_count: u32,
80+
pub(super) average_run_length: u32,
81+
pub(super) erased: ErasedStats,
6782
}
6883

6984
impl FloatStats {
@@ -78,6 +93,11 @@ impl FloatStats {
7893
_ => vortex_panic!("cannot generate FloatStats from ptype {}", input.ptype()),
7994
}
8095
}
96+
97+
/// Get the count of distinct values, if we have computed it already.
98+
pub fn distinct_count(&self) -> Option<u32> {
99+
self.erased.distinct_count()
100+
}
81101
}
82102

83103
impl CompressorStats for FloatStats {
@@ -119,8 +139,8 @@ fn typed_float_stats<T: NativePType + Float>(
119139
count_distinct_values: bool,
120140
) -> VortexResult<FloatStats>
121141
where
122-
DistinctValues<T>: Into<ErasedDistinctValues>,
123142
NativeValue<T>: Hash + Eq,
143+
TypedStats<T>: Into<ErasedStats>,
124144
{
125145
// Special case: empty array
126146
if array.is_empty() {
@@ -129,23 +149,15 @@ where
129149
null_count: 0,
130150
value_count: 0,
131151
average_run_length: 0,
132-
distinct_values_count: 0,
133-
distinct_values: DistinctValues {
134-
values: HashSet::<NativeValue<T>, FxBuildHasher>::with_hasher(FxBuildHasher),
135-
}
136-
.into(),
152+
erased: TypedStats { distinct: None }.into(),
137153
});
138154
} else if array.all_invalid()? {
139155
return Ok(FloatStats {
140156
src: array.clone(),
141157
null_count: u32::try_from(array.len())?,
142158
value_count: 0,
143159
average_run_length: 0,
144-
distinct_values_count: 0,
145-
distinct_values: DistinctValues {
146-
values: HashSet::<NativeValue<T>, FxBuildHasher>::with_hasher(FxBuildHasher),
147-
}
148-
.into(),
160+
erased: TypedStats { distinct: None }.into(),
149161
});
150162
}
151163

@@ -208,7 +220,7 @@ where
208220

209221
let null_count = u32::try_from(null_count)?;
210222
let value_count = u32::try_from(value_count)?;
211-
let distinct_values_count = if count_distinct_values {
223+
let distinct_count = if count_distinct_values {
212224
u32::try_from(distinct_values.len())?
213225
} else {
214226
u32::MAX
@@ -217,11 +229,13 @@ where
217229
Ok(FloatStats {
218230
null_count,
219231
value_count,
220-
distinct_values_count,
221232
src: array.clone(),
222233
average_run_length: value_count / runs,
223-
distinct_values: DistinctValues {
224-
values: distinct_values,
234+
erased: TypedStats {
235+
distinct: Some(DistinctInfo {
236+
distinct_values,
237+
distinct_count,
238+
}),
225239
}
226240
.into(),
227241
})
@@ -243,12 +257,17 @@ mod tests {
243257
let floats = buffer![0.0f32, 1.0f32, 2.0f32].into_array();
244258
let floats = floats.to_primitive();
245259

246-
let stats = FloatStats::generate(&floats);
260+
let stats = FloatStats::generate_opts(
261+
&floats,
262+
crate::GenerateStatsOptions {
263+
count_distinct_values: true,
264+
},
265+
);
247266

248267
assert_eq!(stats.value_count, 3);
249268
assert_eq!(stats.null_count, 0);
250269
assert_eq!(stats.average_run_length, 1);
251-
assert_eq!(stats.distinct_values_count, 3);
270+
assert_eq!(stats.distinct_count().unwrap(), 3);
252271
}
253272

254273
#[test]
@@ -258,11 +277,16 @@ mod tests {
258277
Validity::from_iter([false, true, true]),
259278
);
260279

261-
let stats = FloatStats::generate(&floats);
280+
let stats = FloatStats::generate_opts(
281+
&floats,
282+
crate::GenerateStatsOptions {
283+
count_distinct_values: true,
284+
},
285+
);
262286

263287
assert_eq!(stats.value_count, 2);
264288
assert_eq!(stats.null_count, 1);
265289
assert_eq!(stats.average_run_length, 1);
266-
assert_eq!(stats.distinct_values_count, 2);
290+
assert_eq!(stats.distinct_count().unwrap(), 2);
267291
}
268292
}

0 commit comments

Comments
 (0)