Skip to content

Commit 79408c7

Browse files
committed
refactor: add build_column_statistics method to StatisticsAccumulators
Encapsulate get_col_stats parameters by adding build_column_statistics() method to StatisticsAccumulators, removing the standalone function.
1 parent 8d6a4b1 commit 79408c7

1 file changed

Lines changed: 52 additions & 59 deletions

File tree

datafusion/datasource-parquet/src/metadata.rs

Lines changed: 52 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -333,16 +333,16 @@ impl<'a> DFParquetMetadata<'a> {
333333
},
334334
);
335335

336-
get_col_stats(
337-
logical_file_schema,
338-
&null_counts_array,
339-
&mut max_accs,
340-
&mut min_accs,
341-
&mut is_max_value_exact,
342-
&mut is_min_value_exact,
343-
&column_byte_sizes,
344-
&distinct_counts_array,
345-
)
336+
let mut accumulators = StatisticsAccumulators {
337+
min_accs: &mut min_accs,
338+
max_accs: &mut max_accs,
339+
null_counts_array: &mut null_counts_array,
340+
is_min_value_exact: &mut is_min_value_exact,
341+
is_max_value_exact: &mut is_max_value_exact,
342+
column_byte_sizes: &mut column_byte_sizes,
343+
distinct_counts_array: &mut distinct_counts_array,
344+
};
345+
accumulators.build_column_statistics(logical_file_schema)
346346
} else {
347347
// Record column sizes
348348
logical_file_schema
@@ -415,55 +415,6 @@ fn create_max_min_accs(
415415
(max_values, min_values)
416416
}
417417

418-
#[expect(clippy::too_many_arguments)]
419-
fn get_col_stats(
420-
schema: &Schema,
421-
null_counts: &[Precision<usize>],
422-
max_values: &mut [Option<MaxAccumulator>],
423-
min_values: &mut [Option<MinAccumulator>],
424-
is_max_value_exact: &mut [Option<bool>],
425-
is_min_value_exact: &mut [Option<bool>],
426-
column_byte_sizes: &[Precision<usize>],
427-
distinct_counts: &[Precision<usize>],
428-
) -> Vec<ColumnStatistics> {
429-
(0..schema.fields().len())
430-
.map(|i| {
431-
let max_value = match (
432-
max_values.get_mut(i).unwrap(),
433-
is_max_value_exact.get(i).unwrap(),
434-
) {
435-
(Some(max_value), Some(true)) => {
436-
max_value.evaluate().ok().map(Precision::Exact)
437-
}
438-
(Some(max_value), Some(false)) | (Some(max_value), None) => {
439-
max_value.evaluate().ok().map(Precision::Inexact)
440-
}
441-
(None, _) => None,
442-
};
443-
let min_value = match (
444-
min_values.get_mut(i).unwrap(),
445-
is_min_value_exact.get(i).unwrap(),
446-
) {
447-
(Some(min_value), Some(true)) => {
448-
min_value.evaluate().ok().map(Precision::Exact)
449-
}
450-
(Some(min_value), Some(false)) | (Some(min_value), None) => {
451-
min_value.evaluate().ok().map(Precision::Inexact)
452-
}
453-
(None, _) => None,
454-
};
455-
ColumnStatistics {
456-
null_count: null_counts[i],
457-
max_value: max_value.unwrap_or(Precision::Absent),
458-
min_value: min_value.unwrap_or(Precision::Absent),
459-
sum_value: Precision::Absent,
460-
distinct_count: distinct_counts[i],
461-
byte_size: column_byte_sizes[i],
462-
}
463-
})
464-
.collect()
465-
}
466-
467418
/// Holds the accumulator state for collecting statistics from row groups
468419
struct StatisticsAccumulators<'a> {
469420
min_accs: &'a mut [Option<MinAccumulator>],
@@ -475,6 +426,48 @@ struct StatisticsAccumulators<'a> {
475426
distinct_counts_array: &'a mut [Precision<usize>],
476427
}
477428

429+
impl StatisticsAccumulators<'_> {
430+
/// Converts the accumulated statistics into a vector of `ColumnStatistics`
431+
fn build_column_statistics(&mut self, schema: &Schema) -> Vec<ColumnStatistics> {
432+
(0..schema.fields().len())
433+
.map(|i| {
434+
let max_value = match (
435+
self.max_accs.get_mut(i).unwrap(),
436+
self.is_max_value_exact.get(i).unwrap(),
437+
) {
438+
(Some(max_value), Some(true)) => {
439+
max_value.evaluate().ok().map(Precision::Exact)
440+
}
441+
(Some(max_value), Some(false)) | (Some(max_value), None) => {
442+
max_value.evaluate().ok().map(Precision::Inexact)
443+
}
444+
(None, _) => None,
445+
};
446+
let min_value = match (
447+
self.min_accs.get_mut(i).unwrap(),
448+
self.is_min_value_exact.get(i).unwrap(),
449+
) {
450+
(Some(min_value), Some(true)) => {
451+
min_value.evaluate().ok().map(Precision::Exact)
452+
}
453+
(Some(min_value), Some(false)) | (Some(min_value), None) => {
454+
min_value.evaluate().ok().map(Precision::Inexact)
455+
}
456+
(None, _) => None,
457+
};
458+
ColumnStatistics {
459+
null_count: self.null_counts_array[i],
460+
max_value: max_value.unwrap_or(Precision::Absent),
461+
min_value: min_value.unwrap_or(Precision::Absent),
462+
sum_value: Precision::Absent,
463+
distinct_count: self.distinct_counts_array[i],
464+
byte_size: self.column_byte_sizes[i],
465+
}
466+
})
467+
.collect()
468+
}
469+
}
470+
478471
fn summarize_column_statistics(
479472
parquet_schema: &SchemaDescriptor,
480473
logical_file_schema: &Schema,

0 commit comments

Comments
 (0)