Skip to content

Commit 3d3b097

Browse files
authored
[python] Drop stats for manifest entries reading (#6429)
1 parent b318e3c commit 3d3b097

7 files changed

Lines changed: 71 additions & 12 deletions

File tree

paimon-python/pypaimon/manifest/manifest_file_manager.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def __init__(self, table):
4141
self.primary_key_fields = self.table.table_schema.get_primary_key_fields()
4242
self.trimmed_primary_key_fields = self.table.table_schema.get_trimmed_primary_key_fields()
4343

44-
def read(self, manifest_file_name: str, manifest_entry_filter=None) -> List[ManifestEntry]:
44+
def read(self, manifest_file_name: str, manifest_entry_filter=None, drop_stats=True) -> List[ManifestEntry]:
4545
manifest_file_path = self.manifest_path / manifest_file_name
4646

4747
entries = []
@@ -107,6 +107,8 @@ def read(self, manifest_file_name: str, manifest_entry_filter=None) -> List[Mani
107107
)
108108
if manifest_entry_filter is not None and not manifest_entry_filter(entry):
109109
continue
110+
if drop_stats:
111+
entry = entry.copy_without_stats()
110112
entries.append(entry)
111113
return entries
112114

paimon-python/pypaimon/manifest/schema/data_file_meta.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,32 @@ def set_file_path(self, table_path: Path, partition: GenericRow, bucket: int):
6161
path_builder = path_builder / ("bucket-" + str(bucket)) / self.file_name
6262
self.file_path = str(path_builder)
6363

64+
def copy_without_stats(self) -> 'DataFileMeta':
65+
"""Create a new DataFileMeta without value statistics."""
66+
return DataFileMeta(
67+
file_name=self.file_name,
68+
file_size=self.file_size,
69+
row_count=self.row_count,
70+
min_key=self.min_key,
71+
max_key=self.max_key,
72+
key_stats=self.key_stats,
73+
value_stats=SimpleStats.empty_stats(),
74+
min_sequence_number=self.min_sequence_number,
75+
max_sequence_number=self.max_sequence_number,
76+
schema_id=self.schema_id,
77+
level=self.level,
78+
extra_files=self.extra_files,
79+
creation_time=self.creation_time,
80+
delete_row_count=self.delete_row_count,
81+
embedded_index=self.embedded_index,
82+
file_source=self.file_source,
83+
value_stats_cols=[],
84+
external_path=self.external_path,
85+
first_row_id=self.first_row_id,
86+
write_cols=self.write_cols,
87+
file_path=self.file_path
88+
)
89+
6490
def assign_first_row_id(self, first_row_id: int) -> 'DataFileMeta':
6591
"""Create a new DataFileMeta with the assigned first_row_id."""
6692
return DataFileMeta(

paimon-python/pypaimon/manifest/schema/manifest_entry.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,16 @@ class ManifestEntry:
3131
total_buckets: int
3232
file: DataFileMeta
3333

34+
def copy_without_stats(self) -> 'ManifestEntry':
35+
"""Create a new ManifestEntry without value statistics."""
36+
return ManifestEntry(
37+
kind=self.kind,
38+
partition=self.partition,
39+
bucket=self.bucket,
40+
total_buckets=self.total_buckets,
41+
file=self.file.copy_without_stats()
42+
)
43+
3444
def assign_first_row_id(self, first_row_id: int) -> 'ManifestEntry':
3545
"""Create a new ManifestEntry with the assigned first_row_id."""
3646
return ManifestEntry(

paimon-python/pypaimon/manifest/schema/simple_stats.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from dataclasses import dataclass
2020
from typing import List, Optional
21+
from typing import ClassVar
2122

2223
from pypaimon.table.row.generic_row import GenericRow
2324

@@ -28,6 +29,16 @@ class SimpleStats:
2829
max_values: GenericRow
2930
null_counts: Optional[List[int]]
3031

32+
_empty_stats: ClassVar[object] = None
33+
34+
@classmethod
35+
def empty_stats(cls):
36+
if cls._empty_stats is None:
37+
min_values = GenericRow([], [])
38+
max_values = GenericRow([], [])
39+
cls._empty_stats = cls(min_values, max_values, None)
40+
return cls._empty_stats
41+
3142

3243
SIMPLE_STATS_SCHEMA = {
3344
"type": "record",

paimon-python/pypaimon/tests/predicates_test.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -454,20 +454,20 @@ def test_pk_reader_with_filter(self):
454454
if split.partition.values == ["p1", 2]:
455455
count += 1
456456
self.assertEqual(len(split.files), 1)
457-
min_values = split.files[0].value_stats.min_values.to_dict()
458-
max_values = split.files[0].value_stats.max_values.to_dict()
457+
min_values = split.files[0].key_stats.min_values.to_dict()
458+
max_values = split.files[0].key_stats.max_values.to_dict()
459459
self.assertTrue(min_values["key1"] == 1 and min_values["key2"] == "e"
460460
and max_values["key1"] == 4 and max_values["key2"] == "h")
461461
elif split.partition.values == ["p2", 2]:
462462
count += 1
463-
min_values = split.files[0].value_stats.min_values.to_dict()
464-
max_values = split.files[0].value_stats.max_values.to_dict()
463+
min_values = split.files[0].key_stats.min_values.to_dict()
464+
max_values = split.files[0].key_stats.max_values.to_dict()
465465
self.assertTrue(min_values["key1"] == 5 and min_values["key2"] == "a"
466466
and max_values["key1"] == 8 and max_values["key2"] == "d")
467467
elif split.partition.values == ["p1", 1]:
468468
count += 1
469-
min_values = split.files[0].value_stats.min_values.to_dict()
470-
max_values = split.files[0].value_stats.max_values.to_dict()
469+
min_values = split.files[0].key_stats.min_values.to_dict()
470+
max_values = split.files[0].key_stats.max_values.to_dict()
471471
self.assertTrue(min_values["key1"] == max_values["key1"] == 7
472472
and max_values["key2"] == max_values["key2"] == "b")
473473
self.assertEqual(count, 3)

paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,9 @@ def test_full_data_types(self):
180180
latest_snapshot = SnapshotManager(table).get_latest_snapshot()
181181
manifest_files = table_scan.starting_scanner.manifest_list_manager.read_all(latest_snapshot)
182182
manifest_entries = table_scan.starting_scanner.manifest_file_manager.read(
183-
manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row))
183+
manifest_files[0].file_name,
184+
lambda row: table_scan.starting_scanner._filter_manifest_entry(row),
185+
drop_stats=False)
184186
min_value_stats = manifest_entries[0].file.value_stats.min_values.values
185187
max_value_stats = manifest_entries[0].file.value_stats.max_values.values
186188
expected_min_values = [col[0].as_py() for col in expect_data]
@@ -849,7 +851,7 @@ def _test_value_stats_cols_case(self, manifest_manager, table, value_stats_cols,
849851
manifest_manager.write(manifest_file_name, [entry])
850852

851853
# Read the manifest entry back
852-
entries = manifest_manager.read(manifest_file_name)
854+
entries = manifest_manager.read(manifest_file_name, drop_stats=False)
853855

854856
# Verify we have exactly one entry
855857
self.assertEqual(len(entries), 1)

paimon-python/pypaimon/tests/reader_base_test.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -210,14 +210,22 @@ def test_full_data_types(self):
210210
read_builder = table.new_read_builder()
211211
table_scan = read_builder.new_scan()
212212
table_read = read_builder.new_read()
213-
actual_data = table_read.to_arrow(table_scan.plan().splits())
213+
splits = table_scan.plan().splits()
214+
215+
# assert data file without stats
216+
first_file = splits[0].files[0]
217+
self.assertEqual(first_file.value_stats_cols, [])
218+
self.assertEqual(first_file.value_stats, SimpleStats.empty_stats())
219+
220+
# assert equal
221+
actual_data = table_read.to_arrow(splits)
214222
self.assertEqual(actual_data, expect_data)
215223

216224
# to test GenericRow ability
217225
latest_snapshot = SnapshotManager(table).get_latest_snapshot()
218226
manifest_files = table_scan.starting_scanner.manifest_list_manager.read_all(latest_snapshot)
219227
manifest_entries = table_scan.starting_scanner.manifest_file_manager.read(
220-
manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row))
228+
manifest_files[0].file_name, lambda row: table_scan.starting_scanner._filter_manifest_entry(row), False)
221229
min_value_stats = manifest_entries[0].file.value_stats.min_values.values
222230
max_value_stats = manifest_entries[0].file.value_stats.max_values.values
223231
expected_min_values = [col[0].as_py() for col in expect_data]
@@ -627,7 +635,7 @@ def _test_value_stats_cols_case(self, manifest_manager, table, value_stats_cols,
627635
manifest_manager.write(manifest_file_name, [entry])
628636

629637
# Read the manifest entry back
630-
entries = manifest_manager.read(manifest_file_name)
638+
entries = manifest_manager.read(manifest_file_name, drop_stats=False)
631639

632640
# Verify we have exactly one entry
633641
self.assertEqual(len(entries), 1)

0 commit comments

Comments
 (0)