@@ -24,15 +24,23 @@ use crate::delete_file_index::DeleteFileIndex;
2424use crate :: expr:: { Bind , BoundPredicate , Predicate } ;
2525use crate :: io:: object_cache:: ObjectCache ;
2626use crate :: scan:: {
27- AppendSnapshotSet , BoundPredicates , ExpressionEvaluatorCache , FileScanTask ,
28- ManifestEvaluatorCache , PartitionFilterCache ,
27+ BoundPredicates , ExpressionEvaluatorCache , FileScanTask , ManifestEvaluatorCache ,
28+ PartitionFilterCache ,
2929} ;
3030use crate :: spec:: {
31- ManifestContentType , ManifestEntryRef , ManifestFile , ManifestList , ManifestStatus , SchemaRef ,
32- SnapshotRef , TableMetadataRef ,
31+ ManifestContentType , ManifestEntryRef , ManifestFile , ManifestList , SchemaRef , SnapshotRef ,
32+ TableMetadataRef ,
3333} ;
3434use crate :: { Error , ErrorKind , Result } ;
3535
36+ /// Filter applied to each [`ManifestFile`] before fetching it.
37+ /// Returns `true` to include the manifest, `false` to skip it.
38+ pub ( crate ) type ManifestFileFilter = Arc < dyn Fn ( & ManifestFile ) -> bool + Send + Sync > ;
39+
40+ /// Filter applied to each manifest entry after loading a manifest.
41+ /// Returns `true` to include the entry, `false` to skip it.
42+ pub ( crate ) type ManifestEntryFilter = Arc < dyn Fn ( & ManifestEntryRef ) -> bool + Send + Sync > ;
43+
3644/// Wraps a [`ManifestFile`] alongside the objects that are needed
3745/// to process it in a thread-safe manner
3846pub ( crate ) struct ManifestFileContext {
@@ -47,7 +55,7 @@ pub(crate) struct ManifestFileContext {
4755 expression_evaluator_cache : Arc < ExpressionEvaluatorCache > ,
4856 delete_file_index : DeleteFileIndex ,
4957 case_sensitive : bool ,
50- snapshot_range : Option < Arc < AppendSnapshotSet > > ,
58+ entry_filter : Option < ManifestEntryFilter > ,
5159}
5260
5361/// Wraps a [`ManifestEntryRef`] alongside the objects that are needed
@@ -78,33 +86,16 @@ impl ManifestFileContext {
7886 expression_evaluator_cache,
7987 delete_file_index,
8088 case_sensitive,
81- snapshot_range ,
89+ entry_filter ,
8290 } = self ;
8391
8492 let manifest = object_cache. get_manifest ( & manifest_file) . await ?;
8593
8694 for manifest_entry in manifest. entries ( ) {
87- // For incremental scans, filter entries to only include those:
88- // 1. With status ADDED (not EXISTING or DELETED)
89- // 2. With a snapshot_id that falls within the range
90- if let Some ( ref range) = snapshot_range {
91- // Only include entries with status ADDED
92- if manifest_entry. status ( ) != ManifestStatus :: Added {
95+ if let Some ( ref filter) = entry_filter {
96+ if !filter ( manifest_entry) {
9397 continue ;
9498 }
95-
96- // Only include entries from snapshots in the range
97- match manifest_entry. snapshot_id ( ) {
98- Some ( entry_snapshot_id) => {
99- if !range. contains ( entry_snapshot_id) {
100- continue ;
101- }
102- }
103- None => {
104- // Skip entries without a snapshot_id in incremental mode
105- continue ;
106- }
107- }
10899 }
109100
110101 let manifest_entry_context = ManifestEntryContext {
@@ -171,7 +162,6 @@ impl ManifestEntryContext {
171162
172163/// PlanContext wraps a [`SnapshotRef`] alongside all the other
173164/// objects that are required to perform a scan file plan.
174- #[ derive( Debug ) ]
175165pub ( crate ) struct PlanContext {
176166 pub snapshot : SnapshotRef ,
177167
@@ -186,7 +176,25 @@ pub(crate) struct PlanContext {
186176 pub partition_filter_cache : Arc < PartitionFilterCache > ,
187177 pub manifest_evaluator_cache : Arc < ManifestEvaluatorCache > ,
188178 pub expression_evaluator_cache : Arc < ExpressionEvaluatorCache > ,
189- pub snapshot_range : Option < Arc < AppendSnapshotSet > > ,
179+ pub manifest_file_filter : Option < ManifestFileFilter > ,
180+ pub manifest_entry_filter : Option < ManifestEntryFilter > ,
181+ }
182+
183+ impl std:: fmt:: Debug for PlanContext {
184+ fn fmt ( & self , f : & mut std:: fmt:: Formatter < ' _ > ) -> std:: fmt:: Result {
185+ f. debug_struct ( "PlanContext" )
186+ . field ( "snapshot" , & self . snapshot )
187+ . field ( "case_sensitive" , & self . case_sensitive )
188+ . field (
189+ "manifest_file_filter" ,
190+ & self . manifest_file_filter . as_ref ( ) . map ( |_| "..." ) ,
191+ )
192+ . field (
193+ "manifest_entry_filter" ,
194+ & self . manifest_entry_filter . as_ref ( ) . map ( |_| "..." ) ,
195+ )
196+ . finish_non_exhaustive ( )
197+ }
190198}
191199
192200impl PlanContext {
@@ -240,17 +248,8 @@ impl PlanContext {
240248 // TODO: Ideally we could ditch this intermediate Vec as we return an iterator.
241249 let mut filtered_mfcs = vec ! [ ] ;
242250 for manifest_file in manifest_files {
243- // For incremental scans, skip manifests that can't contain relevant entries:
244- // 1. Delete manifests — we only care about newly added data files.
245- // 2. Data manifests whose added_snapshot_id is outside the scan range —
246- // they can't contain entries added in the snapshots we care about.
247- // (We still keep the entry-level filter because a manifest can contain
248- // entries from multiple snapshots via manifest reuse.)
249- if let Some ( ref range) = self . snapshot_range {
250- if manifest_file. content == ManifestContentType :: Deletes {
251- continue ;
252- }
253- if !range. contains ( manifest_file. added_snapshot_id ) {
251+ if let Some ( ref filter) = self . manifest_file_filter {
252+ if !filter ( manifest_file) {
254253 continue ;
255254 }
256255 }
@@ -324,7 +323,7 @@ impl PlanContext {
324323 expression_evaluator_cache : self . expression_evaluator_cache . clone ( ) ,
325324 delete_file_index,
326325 case_sensitive : self . case_sensitive ,
327- snapshot_range : self . snapshot_range . clone ( ) ,
326+ entry_filter : self . manifest_entry_filter . clone ( ) ,
328327 }
329328 }
330329}
0 commit comments