2020
2121from pypaimon .common .core_options import CoreOptions
2222from pypaimon .common .predicate import Predicate
23- from pypaimon .common .predicate_builder import PredicateBuilder
2423from pypaimon .manifest .manifest_file_manager import ManifestFileManager
2524from pypaimon .manifest .manifest_list_manager import ManifestListManager
2625from pypaimon .manifest .schema .data_file_meta import DataFileMeta
2726from pypaimon .manifest .schema .manifest_entry import ManifestEntry
2827from pypaimon .manifest .schema .manifest_file_meta import ManifestFileMeta
2928from pypaimon .read .interval_partition import IntervalPartition , SortedRun
3029from pypaimon .read .plan import Plan
31- from pypaimon .read .push_down_utils import (extract_predicate_to_dict ,
32- extract_predicate_to_list ,
33- to_partition_predicate )
30+ from pypaimon .read .push_down_utils import (filter_and_transform_predicate )
3431from pypaimon .read .scanner .starting_scanner import StartingScanner
3532from pypaimon .read .split import Split
3633from pypaimon .snapshot .snapshot_manager import SnapshotManager
@@ -49,14 +46,11 @@ def __init__(self, table, predicate: Optional[Predicate], limit: Optional[int]):
4946 self .manifest_list_manager = ManifestListManager (table )
5047 self .manifest_file_manager = ManifestFileManager (table )
5148
52- pk_conditions = []
53- trimmed_pk = [field .name for field in self .table .table_schema .get_trimmed_primary_key_fields ()]
54- extract_predicate_to_list (pk_conditions , self .predicate , trimmed_pk )
55- self .primary_key_predicate = PredicateBuilder (self .table .fields ).and_predicates (pk_conditions )
49+ self .primary_key_predicate = filter_and_transform_predicate (
50+ self .predicate , self .table .field_names , self .table .table_schema .get_trimmed_primary_keys ())
5651
57- partition_conditions = defaultdict (list )
58- extract_predicate_to_dict (partition_conditions , self .predicate , self .table .partition_keys )
59- self .partition_key_predicate = partition_conditions
52+ self .partition_key_predicate = filter_and_transform_predicate (
53+ self .predicate , self .table .field_names , self .table .partition_keys )
6054
6155 self .target_split_size = 128 * 1024 * 1024
6256 self .open_file_cost = 4 * 1024 * 1024
@@ -82,29 +76,29 @@ def scan(self) -> Plan:
8276 splits = self ._apply_push_down_limit (splits )
8377 return Plan (splits )
8478
85- def _read_manifest_files (self ) -> List [ManifestFileMeta ]:
79+ def plan_files (self ) -> List [ManifestEntry ]:
8680 latest_snapshot = self .snapshot_manager .get_latest_snapshot ()
8781 if not latest_snapshot :
8882 return []
8983 manifest_files = self .manifest_list_manager .read_all (latest_snapshot )
90- partition_predicate = to_partition_predicate ( self .predicate , self . table . field_names , self . table . partition_keys )
84+ return self .read_manifest_entries ( manifest_files )
9185
92- def test_predicate (file : ManifestFileMeta ) -> bool :
93- if not partition_predicate :
86+ def read_manifest_entries (self , manifest_files : List [ManifestFileMeta ]) -> List [ManifestEntry ]:
87+ def filter_manifest_file (file : ManifestFileMeta ) -> bool :
88+ if not self .partition_key_predicate :
9489 return True
95- return partition_predicate .test_by_simple_stats (
90+ return self . partition_key_predicate .test_by_simple_stats (
9691 file .partition_stats ,
9792 file .num_added_files + file .num_deleted_files )
9893
99- return [file for file in manifest_files if test_predicate (file )]
100-
101- def plan_files (self ) -> List [ManifestEntry ]:
102- manifest_files = self ._read_manifest_files ()
10394 deleted_entries = set ()
10495 added_entries = []
10596 for manifest_file in manifest_files :
106- manifest_entries = self .manifest_file_manager .read (manifest_file .file_name ,
107- lambda row : self ._bucket_filter (row ))
97+ if not filter_manifest_file (manifest_file ):
98+ continue
99+ manifest_entries = self .manifest_file_manager .read (
100+ manifest_file .file_name ,
101+ lambda row : self ._filter_manifest_entry (row ))
108102 for entry in manifest_entries :
109103 if entry .kind == 0 :
110104 added_entries .append (entry )
@@ -115,8 +109,6 @@ def plan_files(self) -> List[ManifestEntry]:
115109 entry for entry in added_entries
116110 if (tuple (entry .partition .values ), entry .bucket , entry .file .file_name ) not in deleted_entries
117111 ]
118- if self .predicate :
119- file_entries = self ._filter_by_predicate (file_entries )
120112 return file_entries
121113
122114 def with_shard (self , idx_of_this_subtask , number_of_para_subtasks ) -> 'FullStartingScanner' :
@@ -203,12 +195,6 @@ def _primary_key_filter_by_shard(self, file_entries: List[ManifestEntry]) -> Lis
203195 filtered_entries .append (entry )
204196 return filtered_entries
205197
206- def _bucket_filter (self , entry : Optional [ManifestEntry ]) -> bool :
207- bucket = entry .bucket
208- if self .only_read_real_buckets and bucket < 0 :
209- return False
210- return True
211-
212198 def _apply_push_down_limit (self , splits : List [Split ]) -> List [Split ]:
213199 if self .limit is None :
214200 return splits
@@ -224,45 +210,26 @@ def _apply_push_down_limit(self, splits: List[Split]) -> List[Split]:
224210
225211 return limited_splits
226212
227- def _filter_by_predicate (self , file_entries : List [ManifestEntry ]) -> List [ManifestEntry ]:
228- if not self .predicate :
229- return file_entries
230-
231- filtered_files = []
232- for file_entry in file_entries :
233- if self .partition_key_predicate and not self ._filter_by_partition (file_entry ):
234- continue
235- if not self ._filter_by_stats (file_entry ):
236- continue
237- filtered_files .append (file_entry )
238-
239- return filtered_files
240-
241- def _filter_by_partition (self , file_entry : ManifestEntry ) -> bool :
242- partition_dict = file_entry .partition .to_dict ()
243- for field_name , conditions in self .partition_key_predicate .items ():
244- partition_value = partition_dict [field_name ]
245- for predicate in conditions :
246- if not predicate .test_by_value (partition_value ):
247- return False
248- return True
249-
250- def _filter_by_stats (self , file_entry : ManifestEntry ) -> bool :
251- if file_entry .kind != 0 :
213+ def _filter_manifest_entry (self , entry : ManifestEntry ) -> bool :
214+ if self .only_read_real_buckets and entry .bucket < 0 :
215+ return False
216+ if self .partition_key_predicate and not self .partition_key_predicate .test (entry .partition ):
252217 return False
253218 if self .table .is_primary_key_table :
254219 predicate = self .primary_key_predicate
255- stats = file_entry .file .key_stats
220+ stats = entry .file .key_stats
256221 else :
257222 predicate = self .predicate
258- stats = file_entry .file .value_stats
223+ stats = entry .file .value_stats
224+ if not predicate :
225+ return True
259226 return predicate .test_by_stats ({
260227 "min_values" : stats .min_values .to_dict (),
261228 "max_values" : stats .max_values .to_dict (),
262229 "null_counts" : {
263230 stats .min_values .fields [i ].name : stats .null_counts [i ] for i in range (len (stats .min_values .fields ))
264231 },
265- "row_count" : file_entry .file .row_count ,
232+ "row_count" : entry .file .row_count
266233 })
267234
268235 def _create_append_only_splits (self , file_entries : List [ManifestEntry ]) -> List ['Split' ]:
0 commit comments