@@ -27,43 +27,58 @@ use crate::compressor::rle::RLEStats;
2727use crate :: sample:: sample;
2828
2929#[ derive( Debug , Clone ) ]
30- pub struct DistinctValues < T > {
31- pub values : HashSet < NativeValue < T > , FxBuildHasher > ,
30+ pub struct DistinctInfo < T > { // pairs the set of distinct values with a stored count of them
31+ pub ( super ) distinct_values : HashSet < NativeValue < T > , FxBuildHasher > , // the distinct values themselves; visible to the parent module only
32+ distinct_count : u32 , // private; read through `ErasedStats::distinct_count`. NOTE(review): the stats-building code later in this file stores `u32::MAX` here when distinct counting is disabled, otherwise the set's length
3233}
3334
3435#[ derive( Debug , Clone ) ]
35- pub enum ErasedDistinctValues {
36- F16 ( DistinctValues < f16 > ) ,
37- F32 ( DistinctValues < f32 > ) ,
38- F64 ( DistinctValues < f64 > ) ,
36+ pub struct TypedStats < T > { // per-element-type stats; wrapped by an `ErasedStats` variant for f16 / f32 / f64
37+ pub ( super ) distinct : Option < DistinctInfo < T > > , // `None` is what the empty-array and all-invalid early returns store
38+ }
39+
40+ #[ derive( Debug , Clone ) ]
41+ pub enum ErasedStats { // type-erased wrapper: one variant per supported float width, each holding the matching `TypedStats<T>`
42+ F16 ( TypedStats < f16 > ) ,
43+ F32 ( TypedStats < f32 > ) ,
44+ F64 ( TypedStats < f64 > ) ,
45+ }
46+
47+ impl ErasedStats {
48+ /// Get the stored count of distinct values; returns `None` when the wrapped `TypedStats` holds no `DistinctInfo`.
49+ fn distinct_count ( & self ) -> Option < u32 > { // NOTE(review): the stored value is returned as-is; the stats-building code later in this file stores `u32::MAX` inside `Some(..)` when counting is disabled, so `Some` alone does not guarantee a real count
50+ match self { // every arm does the same lookup; only the element type of the wrapped `TypedStats` differs
51+ ErasedStats :: F16 ( x) => x. distinct . as_ref ( ) . map ( |d| d. distinct_count ) ,
52+ ErasedStats :: F32 ( x) => x. distinct . as_ref ( ) . map ( |d| d. distinct_count ) ,
53+ ErasedStats :: F64 ( x) => x. distinct . as_ref ( ) . map ( |d| d. distinct_count ) ,
54+ }
55+ }
3956}
4057
4158macro_rules! impl_from_typed {
42- ( $typ : ty, $variant: path) => {
43- impl From <DistinctValues <$typ >> for ErasedDistinctValues {
44- fn from( value : DistinctValues <$typ >) -> Self {
45- $variant( value )
59+ ( $T : ty, $variant: path) => { // $T: element type; $variant: the `ErasedStats` variant that wraps `TypedStats<$T>`
60+ impl From <TypedStats <$T >> for ErasedStats { // generates the conversion that the `TypedStats<T>: Into<ErasedStats>` bound on `typed_float_stats` relies on
61+ fn from( typed : TypedStats <$T >) -> Self {
62+ $variant( typed ) // wrap the typed stats in the given variant without modifying them
4663 }
4764 }
4865 } ;
4966}
5067
51- impl_from_typed ! ( f16, ErasedDistinctValues :: F16 ) ;
52- impl_from_typed ! ( f32 , ErasedDistinctValues :: F32 ) ;
53- impl_from_typed ! ( f64 , ErasedDistinctValues :: F64 ) ;
68+ impl_from_typed ! ( f16, ErasedStats :: F16 ) ; // one `From<TypedStats<T>>` impl per float width, each paired with its `ErasedStats` variant
69+ impl_from_typed ! ( f32 , ErasedStats :: F32 ) ;
70+ impl_from_typed ! ( f64 , ErasedStats :: F64 ) ;
5471
5572/// Array of floating-point numbers and relevant stats for compression.
5673#[ derive( Debug , Clone ) ]
5774pub struct FloatStats {
58- pub ( crate ) src : PrimitiveArray ,
75+ pub ( super ) src : PrimitiveArray , // the array these stats were computed from (the constructor stores a clone of its input)
5976 // cache for validity.false_count()
60- pub ( crate ) null_count : u32 ,
77+ pub ( super ) null_count : u32 ,
6178 // cache for validity.true_count()
62- pub ( crate ) value_count : u32 ,
63- #[ allow( dead_code) ]
64- pub ( crate ) average_run_length : u32 ,
65- pub ( crate ) distinct_values : ErasedDistinctValues ,
66- pub ( crate ) distinct_values_count : u32 ,
79+ pub ( super ) value_count : u32 ,
80+ pub ( super ) average_run_length : u32 , // `value_count / runs` in the stats-building code; 0 for empty or all-invalid input
81+ pub ( super ) erased : ErasedStats , // type-erased per-width stats; holds the optional distinct-value info that replaced the two removed `distinct_values*` fields
6782}
6883
6984impl FloatStats {
@@ -78,6 +93,11 @@ impl FloatStats {
7893 _ => vortex_panic ! ( "cannot generate FloatStats from ptype {}" , input. ptype( ) ) ,
7994 }
8095 }
96+
97+ /// Get the stored count of distinct values; returns `None` when no distinct-value info was recorded for this array.
98+ pub fn distinct_count ( & self ) -> Option < u32 > {
99+ self . erased . distinct_count ( ) // delegate to the type-erased stats, which pick the right float width
100+ }
81101}
82102
83103impl CompressorStats for FloatStats {
@@ -119,8 +139,8 @@ fn typed_float_stats<T: NativePType + Float>(
119139 count_distinct_values : bool ,
120140) -> VortexResult < FloatStats >
121141where
122- DistinctValues < T > : Into < ErasedDistinctValues > ,
123142 NativeValue < T > : Hash + Eq ,
143+ TypedStats < T > : Into < ErasedStats > ,
124144{
125145 // Special case: empty array
126146 if array. is_empty ( ) {
@@ -129,23 +149,15 @@ where
129149 null_count : 0 ,
130150 value_count : 0 ,
131151 average_run_length : 0 ,
132- distinct_values_count : 0 ,
133- distinct_values : DistinctValues {
134- values : HashSet :: < NativeValue < T > , FxBuildHasher > :: with_hasher ( FxBuildHasher ) ,
135- }
136- . into ( ) ,
152+ erased : TypedStats { distinct : None } . into ( ) ,
137153 } ) ;
138154 } else if array. all_invalid ( ) ? {
139155 return Ok ( FloatStats {
140156 src : array. clone ( ) ,
141157 null_count : u32:: try_from ( array. len ( ) ) ?,
142158 value_count : 0 ,
143159 average_run_length : 0 ,
144- distinct_values_count : 0 ,
145- distinct_values : DistinctValues {
146- values : HashSet :: < NativeValue < T > , FxBuildHasher > :: with_hasher ( FxBuildHasher ) ,
147- }
148- . into ( ) ,
160+ erased : TypedStats { distinct : None } . into ( ) ,
149161 } ) ;
150162 }
151163
@@ -208,7 +220,7 @@ where
208220
209221 let null_count = u32:: try_from ( null_count) ?;
210222 let value_count = u32:: try_from ( value_count) ?;
211- let distinct_values_count = if count_distinct_values {
223+ let distinct_count = if count_distinct_values {
212224 u32:: try_from ( distinct_values. len ( ) ) ?
213225 } else {
214226 u32:: MAX
@@ -217,11 +229,13 @@ where
217229 Ok ( FloatStats {
218230 null_count,
219231 value_count,
220- distinct_values_count,
221232 src : array. clone ( ) ,
222233 average_run_length : value_count / runs,
223- distinct_values : DistinctValues {
224- values : distinct_values,
234+ erased : TypedStats {
235+ distinct : Some ( DistinctInfo {
236+ distinct_values,
237+ distinct_count,
238+ } ) ,
225239 }
226240 . into ( ) ,
227241 } )
@@ -243,12 +257,17 @@ mod tests {
243257 let floats = buffer ! [ 0.0f32 , 1.0f32 , 2.0f32 ] . into_array ( ) ;
244258 let floats = floats. to_primitive ( ) ;
245259
246- let stats = FloatStats :: generate ( & floats) ;
260+ let stats = FloatStats :: generate_opts (
261+ & floats,
262+ crate :: GenerateStatsOptions {
263+ count_distinct_values : true ,
264+ } ,
265+ ) ;
247266
248267 assert_eq ! ( stats. value_count, 3 ) ;
249268 assert_eq ! ( stats. null_count, 0 ) ;
250269 assert_eq ! ( stats. average_run_length, 1 ) ;
251- assert_eq ! ( stats. distinct_values_count , 3 ) ;
270+ assert_eq ! ( stats. distinct_count ( ) . unwrap ( ) , 3 ) ;
252271 }
253272
254273 #[ test]
@@ -258,11 +277,16 @@ mod tests {
258277 Validity :: from_iter ( [ false , true , true ] ) ,
259278 ) ;
260279
261- let stats = FloatStats :: generate ( & floats) ;
280+ let stats = FloatStats :: generate_opts (
281+ & floats,
282+ crate :: GenerateStatsOptions {
283+ count_distinct_values : true ,
284+ } ,
285+ ) ;
262286
263287 assert_eq ! ( stats. value_count, 2 ) ;
264288 assert_eq ! ( stats. null_count, 1 ) ;
265289 assert_eq ! ( stats. average_run_length, 1 ) ;
266- assert_eq ! ( stats. distinct_values_count , 2 ) ;
290+ assert_eq ! ( stats. distinct_count ( ) . unwrap ( ) , 2 ) ;
267291 }
268292}
0 commit comments