Skip to content

Commit cc9bb8e

Browse files
[python] Fix manifest read failure when _WRITE_COLS contains system fields (#8131)
### Purpose

When reading a table whose data files have `_WRITE_COLS` containing system fields (e.g. `_ROW_ID`, `_SEQUENCE_NUMBER`), the read fails with:

    KeyError: '_ROW_ID'

This change aligns with the Java-side fix in #7797: metadata fields that are not in the table schema are skipped when resolving value stats fields from `_WRITE_COLS`.

### Tests

- `test_read_write_cols_with_system_field`
1 parent d13301c commit cc9bb8e

2 files changed

Lines changed: 47 additions & 1 deletion

File tree

paimon-python/pypaimon/manifest/manifest_file_manager.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,10 @@ def _get_value_stats_fields(self, file_dict: dict, schema_fields: list) -> List:
190190
fields = schema_fields
191191
else:
192192
read_fields = file_dict['_WRITE_COLS']
193-
fields = [self.table.field_dict[col] for col in read_fields]
193+
# writeCols may contain metadata fields (e.g. _ROW_ID, _SEQUENCE_NUMBER)
194+
data_field_dict = {f.name: f for f in schema_fields}
195+
fields = [data_field_dict[col] for col in read_fields
196+
if col in data_field_dict]
194197
else:
195198
fields = schema_fields
196199
elif not file_dict['_VALUE_STATS_COLS']:

paimon-python/pypaimon/tests/manifest/manifest_manager_test.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from pypaimon.manifest.schema.manifest_entry import ManifestEntry
3737
from pypaimon.manifest.schema.manifest_file_meta import ManifestFileMeta
3838
from pypaimon.manifest.schema.simple_stats import SimpleStats
39+
from pypaimon.schema.data_types import AtomicType, DataField
3940
from pypaimon.schema.schema import Schema
4041
from pypaimon.table.row.generic_row import GenericRow
4142

@@ -282,6 +283,48 @@ def test_filter_applied_after_read(self):
282283
"test-manifest.avro", manifest_entry_filter=lambda e: e.bucket == 0)
283284
self.assertEqual(len(result_filtered), 2)
284285

286+
def test_read_write_cols_with_system_field(self):
287+
manager = self._make_manager()
288+
289+
id_field = DataField(0, 'id', AtomicType('INT', nullable=True))
290+
min_row = GenericRow([1], [id_field])
291+
max_row = GenericRow([10], [id_field])
292+
value_stats = SimpleStats(
293+
min_values=min_row, max_values=max_row, null_counts=[2])
294+
295+
entry = ManifestEntry(
296+
kind=0,
297+
partition=_EMPTY_ROW,
298+
bucket=0,
299+
total_buckets=1,
300+
file=DataFileMeta(
301+
file_name="data-dirty.parquet", file_size=1024, row_count=50,
302+
min_key=_EMPTY_ROW, max_key=_EMPTY_ROW,
303+
key_stats=_EMPTY_STATS, value_stats=value_stats,
304+
min_sequence_number=1, max_sequence_number=50,
305+
schema_id=0, level=0, extra_files=[],
306+
creation_time=Timestamp.from_epoch_millis(0),
307+
delete_row_count=0, embedded_index=None, file_source=None,
308+
value_stats_cols=None, external_path=None,
309+
first_row_id=0,
310+
write_cols=["id", "_ROW_ID", "_SEQUENCE_NUMBER"],
311+
),
312+
)
313+
manager.write("dirty-manifest.avro", [entry])
314+
315+
entries = manager.read("dirty-manifest.avro", drop_stats=False)
316+
self.assertEqual(len(entries), 1)
317+
self.assertEqual(
318+
entries[0].file.write_cols, ["id", "_ROW_ID", "_SEQUENCE_NUMBER"])
319+
320+
read_stats = entries[0].file.value_stats
321+
stats_field_names = [f.name for f in read_stats.min_values.fields]
322+
self.assertEqual(stats_field_names, ["id"])
323+
324+
self.assertEqual(read_stats.min_values.get_field(0), 1)
325+
self.assertEqual(read_stats.max_values.get_field(0), 10)
326+
self.assertEqual(read_stats.null_counts, [2])
327+
285328

286329
class ManifestListManagerTest(_ManifestManagerSetup):
287330
"""Tests for ManifestListManager."""

0 commit comments

Comments
 (0)