Skip to content

Commit 0cb7d29

Browse files
feat: Add observability metrics for scan log replay (delta-io#1866)
Adds observability metrics to scan log replay for tracking performance and debugging parallel checkpoint processing. Key changes: ScanMetrics struct with atomic counters: - num_add_files_seen - add files seen during deduplication - num_active_add_files - add files that survived log replay (files to read) - num_remove_files_seen - remove files seen from delta/commit files - num_non_file_actions - protocol, metadata, etc. - num_predicate_filtered - files filtered by data skipping or partition pruning - hash_set_size - peak size of deduplication hash set - dedup_visitor_time_ns - time spent in deduplication visitor - predicate_eval_time_ns - time spent evaluating predicates Other changes: - Metrics logged at phase boundaries with log_with_message() - Counters reset between sequential and parallel phases - Uses relaxed atomic ordering for performance across parallel threads ``` INFO Completed sequential scan metadata num_adds=0 num_removes=0 num_non_file_actions=5 hash_set_size=0 data_skipping_filtered=0 partition_pruning_filtered=0 dedup_visitor_time_ms=0 data_skipping_time_ms=0 partition_pruning_time_ms=0 INFO Completed parallel scan metadata num_adds=101 num_removes=0 num_non_file_actions=0 hash_set_size=101 data_skipping_filtered=4 partition_pruning_filtered=0 dedup_visitor_time_ms=2 data_skipping_time_ms=15 partition_pruning_time_ms=0 ``` # Testing Added parameterized tests with `ExpectedMetrics` struct verifying exact counter values across 4 scenarios: * JSON-only checkpoint (no sidecars * Parquet sidecars * with/without predicates * With/without parallel phase
1 parent f6b5821 commit 0cb7d29

8 files changed

Lines changed: 671 additions & 78 deletions

File tree

kernel/src/actions/visitors.rs

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ impl RowVisitor for MetadataVisitor {
4343
#[derive(Default)]
4444
pub(crate) struct SelectionVectorVisitor {
4545
pub(crate) selection_vector: Vec<bool>,
46+
pub(crate) num_filtered: u64,
4647
}
4748

4849
/// A single non-nullable BOOL column
@@ -61,8 +62,11 @@ impl RowVisitor for SelectionVectorVisitor {
6162
))
6263
);
6364
for i in 0..row_count {
64-
self.selection_vector
65-
.push(getters[0].get(i, "selectionvector.output")?);
65+
let selected: bool = getters[0].get(i, "selectionvector.output")?;
66+
if !selected {
67+
self.num_filtered += 1;
68+
}
69+
self.selection_vector.push(selected);
6670
}
6771
Ok(())
6872
}
@@ -695,7 +699,10 @@ impl RowVisitor for InCommitTimestampVisitor {
695699
mod tests {
696700
use super::*;
697701

698-
use crate::arrow::array::StringArray;
702+
use crate::arrow::array::{BooleanArray, StringArray};
703+
use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
704+
use crate::arrow::record_batch::RecordBatch;
705+
use crate::engine::arrow_data::ArrowEngineData;
699706

700707
use crate::engine::sync::SyncEngine;
701708
use crate::expressions::{column_expr_ref, Expression};
@@ -1299,4 +1306,29 @@ mod tests {
12991306
Some(1677811178585), // Retrieved ICT
13001307
);
13011308
}
1309+
1310+
// Helper to create a boolean batch for SelectionVectorVisitor tests
1311+
fn create_boolean_batch(values: Vec<bool>) -> Box<dyn EngineData> {
1312+
let array = BooleanArray::from(values);
1313+
let arrow_schema = ArrowSchema::new(vec![Field::new("output", DataType::Boolean, false)]);
1314+
let batch = RecordBatch::try_new(Arc::new(arrow_schema), vec![Arc::new(array)]).unwrap();
1315+
Box::new(ArrowEngineData::new(batch))
1316+
}
1317+
1318+
#[rstest::rstest]
1319+
#[case::empty_batch(vec![], 0, "empty batch should have no filtered rows")]
1320+
#[case::all_selected(vec![true, true, true, true], 0, "all selected should have no filtered rows")]
1321+
#[case::all_filtered(vec![false, false, false, false, false], 5, "all filtered should count all rows")]
1322+
#[case::mixed_selection(vec![true, false, true, false, false, true], 3, "mixed selection should count false values")]
1323+
fn selection_vector_visitor_counter_accuracy(
1324+
#[case] input: Vec<bool>,
1325+
#[case] expected_filtered: u64,
1326+
#[case] _description: &str,
1327+
) {
1328+
let batch = create_boolean_batch(input.clone());
1329+
let mut visitor = SelectionVectorVisitor::default();
1330+
visitor.visit_rows_of(batch.as_ref()).unwrap();
1331+
assert_eq!(visitor.selection_vector, input);
1332+
assert_eq!(visitor.num_filtered, expected_filtered);
1333+
}
13021334
}

0 commit comments

Comments
 (0)