@@ -24,15 +24,25 @@ use crate::delete_file_index::DeleteFileIndex;
2424use crate :: expr:: { Bind , BoundPredicate , Predicate } ;
2525use crate :: io:: object_cache:: ObjectCache ;
2626use crate :: scan:: {
27- AppendSnapshotSet , BoundPredicates , ExpressionEvaluatorCache , FileScanTask ,
28- ManifestEvaluatorCache , PartitionFilterCache ,
27+ BoundPredicates , ExpressionEvaluatorCache , FileScanTask , ManifestEvaluatorCache ,
28+ PartitionFilterCache ,
2929} ;
3030use crate :: spec:: {
31- ManifestContentType , ManifestEntryRef , ManifestFile , ManifestList , ManifestStatus , SchemaRef ,
32- SnapshotRef , TableMetadataRef ,
31+ ManifestContentType , ManifestEntryRef , ManifestFile , ManifestList , SchemaRef , SnapshotRef ,
32+ TableMetadataRef ,
3333} ;
3434use crate :: { Error , ErrorKind , Result } ;
3535
36+ /// Shareable, thread-safe predicate (`Arc<dyn Fn + Send + Sync>`) applied to each [`ManifestFile`] before fetching it.
37+ /// Returns `true` to include the manifest, `false` to skip it; the filter is optional, and `None` disables this check.
38+ pub ( crate ) type ManifestFileFilter =
39+ Arc < dyn Fn ( & ManifestFile ) -> bool + Send + Sync > ;
40+
41+ /// Shareable, thread-safe predicate (`Arc<dyn Fn + Send + Sync>`) applied to each manifest entry after loading a manifest.
42+ /// Returns `true` to include the entry, `false` to skip it; skipped entries never get a `ManifestEntryContext` built for them.
43+ pub ( crate ) type ManifestEntryFilter =
44+ Arc < dyn Fn ( & ManifestEntryRef ) -> bool + Send + Sync > ;
45+
3646/// Wraps a [`ManifestFile`] alongside the objects that are needed
3747/// to process it in a thread-safe manner
3848pub ( crate ) struct ManifestFileContext {
@@ -47,7 +57,7 @@ pub(crate) struct ManifestFileContext {
4757 expression_evaluator_cache : Arc < ExpressionEvaluatorCache > ,
4858 delete_file_index : DeleteFileIndex ,
4959 case_sensitive : bool ,
50- snapshot_range : Option < Arc < AppendSnapshotSet > > ,
60+ entry_filter : Option < ManifestEntryFilter > ,
5161}
5262
5363/// Wraps a [`ManifestEntryRef`] alongside the objects that are needed
@@ -78,33 +88,16 @@ impl ManifestFileContext {
7888 expression_evaluator_cache,
7989 delete_file_index,
8090 case_sensitive,
81- snapshot_range ,
91+ entry_filter ,
8292 } = self ;
8393
8494 let manifest = object_cache. get_manifest ( & manifest_file) . await ?;
8595
8696 for manifest_entry in manifest. entries ( ) {
87- // For incremental scans, filter entries to only include those:
88- // 1. With status ADDED (not EXISTING or DELETED)
89- // 2. With a snapshot_id that falls within the range
90- if let Some ( ref range) = snapshot_range {
91- // Only include entries with status ADDED
92- if manifest_entry. status ( ) != ManifestStatus :: Added {
97+ if let Some ( ref filter) = entry_filter {
98+ if !filter ( manifest_entry) {
9399 continue ;
94100 }
95-
96- // Only include entries from snapshots in the range
97- match manifest_entry. snapshot_id ( ) {
98- Some ( entry_snapshot_id) => {
99- if !range. contains ( entry_snapshot_id) {
100- continue ;
101- }
102- }
103- None => {
104- // Skip entries without a snapshot_id in incremental mode
105- continue ;
106- }
107- }
108101 }
109102
110103 let manifest_entry_context = ManifestEntryContext {
@@ -171,7 +164,6 @@ impl ManifestEntryContext {
171164
172165/// PlanContext wraps a [`SnapshotRef`] alongside all the other
173166/// objects that are required to perform a scan file plan.
174- #[ derive( Debug ) ]
175167pub ( crate ) struct PlanContext {
176168 pub snapshot : SnapshotRef ,
177169
@@ -186,7 +178,25 @@ pub(crate) struct PlanContext {
186178 pub partition_filter_cache : Arc < PartitionFilterCache > ,
187179 pub manifest_evaluator_cache : Arc < ManifestEvaluatorCache > ,
188180 pub expression_evaluator_cache : Arc < ExpressionEvaluatorCache > ,
189- pub snapshot_range : Option < Arc < AppendSnapshotSet > > ,
181+ pub manifest_file_filter : Option < ManifestFileFilter > ,
182+ pub manifest_entry_filter : Option < ManifestEntryFilter > ,
183+ }
184+
185+ impl std:: fmt:: Debug for PlanContext { // hand-written instead of derived: the filter fields hold `dyn Fn` closures, which do not implement `Debug`
186+ fn fmt ( & self , f : & mut std:: fmt:: Formatter < ' _ > ) -> std:: fmt:: Result {
187+ f. debug_struct ( "PlanContext" )
188+ . field ( "snapshot" , & self . snapshot )
189+ . field ( "case_sensitive" , & self . case_sensitive )
190+ . field (
191+ "manifest_file_filter" ,
192+ & self . manifest_file_filter . as_ref ( ) . map ( |_| "..." ) , // prints `Some("...")` when a filter is set, `None` otherwise
193+ )
194+ . field (
195+ "manifest_entry_filter" ,
196+ & self . manifest_entry_filter . as_ref ( ) . map ( |_| "..." ) , // same placeholder: only the presence of the closure is shown
197+ )
198+ . finish_non_exhaustive ( ) // marks the output as partial: the remaining fields are intentionally left out
199+ }
190200}
191201
192202impl PlanContext {
@@ -240,17 +250,8 @@ impl PlanContext {
240250 // TODO: Ideally we could ditch this intermediate Vec as we return an iterator.
241251 let mut filtered_mfcs = vec ! [ ] ;
242252 for manifest_file in manifest_files {
243- // For incremental scans, skip manifests that can't contain relevant entries:
244- // 1. Delete manifests — we only care about newly added data files.
245- // 2. Data manifests whose added_snapshot_id is outside the scan range —
246- // they can't contain entries added in the snapshots we care about.
247- // (We still keep the entry-level filter because a manifest can contain
248- // entries from multiple snapshots via manifest reuse.)
249- if let Some ( ref range) = self . snapshot_range {
250- if manifest_file. content == ManifestContentType :: Deletes {
251- continue ;
252- }
253- if !range. contains ( manifest_file. added_snapshot_id ) {
253+ if let Some ( ref filter) = self . manifest_file_filter {
254+ if !filter ( manifest_file) {
254255 continue ;
255256 }
256257 }
@@ -324,7 +325,7 @@ impl PlanContext {
324325 expression_evaluator_cache : self . expression_evaluator_cache . clone ( ) ,
325326 delete_file_index,
326327 case_sensitive : self . case_sensitive ,
327- snapshot_range : self . snapshot_range . clone ( ) ,
328+ entry_filter : self . manifest_entry_filter . clone ( ) ,
328329 }
329330 }
330331}
0 commit comments