[python] Align nested-leaf projection with field-id schema evolution

TheR1sing3un · TheR1sing3un · commit b4d46ecc378b · 2026-06-10T15:42:02.000+08:00
Nested-leaf projection on append-only reads pushed the leaf path down by
its LATEST name, bypassing the per-file field-id normalization. As a
result, after a sub-field rename the leaf was read as NULL from old files,
and after a sub-field type change the old and new batches carried
different types and failed to concatenate.

Mirror the merge path instead: widen the projection to the full top-level
columns so that the field-id normalization applies (renames follow the id,
missing sub-fields are padded with NULL, types are cast), then extract the
requested leaf paths back into the user's flat schema. Extraction is
batch-level via NestedLeafBatchReader, or row-level via
OuterProjectionRecordReader when a post-read filter is involved.

Add regression tests projecting a renamed and a type-changed sub-field
across old and new files.
diff --git a/paimon-python/pypaimon/read/reader/nested_leaf_batch_reader.py b/paimon-python/pypaimon/read/reader/nested_leaf_batch_reader.py
@@ -0,0 +1,64 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from typing import List, Optional
+
+import pyarrow as pa
+import pyarrow.compute as pc
+from pyarrow import RecordBatch
+
+from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
+from pypaimon.schema.data_types import DataField, PyarrowFieldParser
+
+
+class NestedLeafBatchReader(RecordBatchReader):
+    """Extract projected nested leaves from batches of full top-level columns.
+
+    The inner reader yields batches carrying the widened top-level columns,
+    already normalized to the latest schema by field id (renames followed,
+    missing sub-fields padded NULL, types cast). Each requested name path is
+    walked through the struct children (a NULL parent propagates to the
+    leaf), producing the user's flat projected schema.
+    """
+
+    def __init__(self, inner: RecordBatchReader, name_paths: List[List[str]],
+                 output_fields: List[DataField]):
+        if len(name_paths) != len(output_fields):
+            raise ValueError(
+                "name_paths length {} does not match output_fields length {}".format(
+                    len(name_paths), len(output_fields)))
+        self._inner = inner
+        self._paths = name_paths
+        self._schema = PyarrowFieldParser.from_paimon_schema(output_fields)
+
+    def read_arrow_batch(self) -> Optional[RecordBatch]:
+        batch = self._inner.read_arrow_batch()
+        if batch is None:
+            return None
+        arrays = []
+        for i, path in enumerate(self._paths):
+            column = batch.column(path[0])
+            for name in path[1:]:
+                column = pc.struct_field(column, name)
+            target_type = self._schema.field(i).type
+            if column.type != target_type:
+                column = column.cast(target_type, safe=False)
+            arrays.append(column)
+        return pa.RecordBatch.from_arrays(arrays, schema=self._schema)
+
+    def close(self) -> None:
+        self._inner.close()
diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
@@ -627,6 +627,33 @@ def _genarate_deletion_file_readers(self):
 
 
 class RawFileSplitRead(SplitRead):
+    def __init__(
+            self,
+            table,
+            predicate: Optional[Predicate],
+            read_type: List[DataField],
+            split: Split,
+            row_tracking_enabled: bool,
+            outer_extract_name_paths: Optional[List[List[str]]] = None,
+            outer_flat_read_type: Optional[List[DataField]] = None,
+            limit: Optional[int] = None):
+        # Nested-leaf projection is NOT pushed down by name: a leaf path is
+        # only valid against the latest schema, while each data file stores
+        # its own (possibly renamed / retyped) sub-fields. Instead the read
+        # widens to the full top-level columns, which the per-file field-id
+        # normalization aligns to the latest schema, and the requested leaf
+        # paths are extracted afterwards (``outer_extract_name_paths``).
+        super().__init__(
+            table=table,
+            predicate=predicate,
+            read_type=read_type,
+            split=split,
+            row_tracking_enabled=row_tracking_enabled,
+            nested_name_paths=None,
+            limit=limit)
+        self.outer_extract_name_paths = outer_extract_name_paths
+        self.outer_flat_read_type = outer_flat_read_type
+
     def raw_reader_supplier(self, file: DataFileMeta, dv_factory: Optional[Callable] = None) -> Optional[RecordReader]:
         read_fields = self._get_final_read_data_fields()
         # Check if this is a SlicedSplit to get shard_file_idx_map
@@ -676,10 +703,27 @@ def create_reader(self) -> RecordReader:
         # if the table is appendonly table, we don't need extra filter, all predicates has pushed down
         if self.table.is_primary_key_table and self.predicate_for_reader:
             reader = FilterRecordReader(concat_reader, self.predicate_for_reader)
+            if self.outer_extract_name_paths:
+                # Row-level extraction: the filter evaluates rows in the
+                # widened top-level coordinate space, so extract after it.
+                from pypaimon.read.reader.outer_projection_record_reader import \
+                    OuterProjectionRecordReader
+                reader = OuterProjectionRecordReader(
+                    reader, [f.name for f in self.read_fields],
+                    self.outer_extract_name_paths,
+                    file_io=self.table.file_io,
+                    blob_field_indices=_blob_field_indices(self.read_fields),
+                    vector_field_indices=_vector_field_indices(self.read_fields))
             if self.limit is not None:
                 reader = LimitedRecordReader(reader, self.limit)
         else:
             reader = concat_reader
+            if self.outer_extract_name_paths:
+                from pypaimon.read.reader.nested_leaf_batch_reader import \
+                    NestedLeafBatchReader
+                reader = NestedLeafBatchReader(
+                    reader, self.outer_extract_name_paths,
+                    self.outer_flat_read_type)
             if self.limit is not None:
                 reader = LimitedRecordBatchReader(reader, self.limit)
         return reader
diff --git a/paimon-python/pypaimon/read/table_read.py b/paimon-python/pypaimon/read/table_read.py
@@ -603,13 +603,26 @@ def _create_split_read(self, split: Split) -> SplitRead:
                 limit=self.limit,
             )
         else:
+            inner_read_type = self.read_type
+            outer_extract_name_paths: Optional[List[List[str]]] = None
+            if self.nested_name_paths and any(
+                    len(p) > 1 for p in self.nested_name_paths):
+                # Mirror the merge path: read the full top-level columns so
+                # the per-file field-id normalization applies (a leaf path is
+                # only valid against the latest schema, not each file's own
+                # names/types), then extract the requested sub-paths back to
+                # the user's flat schema.
+                inner_read_type = self._widen_to_top_level_for_merge()
+                outer_extract_name_paths = self.nested_name_paths
             return RawFileSplitRead(
                 table=self.table,
                 predicate=self.predicate,
-                read_type=self.read_type,
+                read_type=inner_read_type,
                 split=split,
                 row_tracking_enabled=self.table.options.row_tracking_enabled(),
-                nested_name_paths=self.nested_name_paths,
+                outer_extract_name_paths=outer_extract_name_paths,
+                outer_flat_read_type=(
+                    self.read_type if outer_extract_name_paths else None),
                 limit=self.limit,
             )
 
diff --git a/paimon-python/pypaimon/tests/schema_evolution_nested_read_test.py b/paimon-python/pypaimon/tests/schema_evolution_nested_read_test.py
@@ -351,6 +351,56 @@ def test_unsupported_subfield_cast_rejected(self):
                     ['mv', 'latest_version'], AtomicType('DATE'))], False)
         self.assertIn('cannot be converted', str(cm.exception))
 
+    def test_nested_projection_after_rename_subfield(self):
+        # Projecting a renamed leaf must follow the field id into old files,
+        # not look the new name up in the file's physical schema.
+        s0 = pa.schema([('id', pa.int64()),
+                        ('mv', pa.struct([('v', pa.int32()), ('s', pa.string())]))])
+        table = self._create('nsub_proj_rename', s0)
+        self._write(table, pa.Table.from_pylist(
+            [{'id': 1, 'mv': {'v': 10, 's': 'a'}}], schema=s0))
+        self.catalog.alter_table(
+            'default.nsub_proj_rename',
+            [SchemaChange.rename_column(['mv', 's'], 'ss')], False)
+        table = self.catalog.get_table('default.nsub_proj_rename')
+        s1 = pa.schema([('id', pa.int64()),
+                        ('mv', pa.struct([('v', pa.int32()), ('ss', pa.string())]))])
+        self._write(table, pa.Table.from_pylist(
+            [{'id': 2, 'mv': {'v': 20, 'ss': 'b'}}], schema=s1))
+
+        rows = self._read_sorted(table, projection=['id', 'mv.ss'])
+        self.assertEqual(rows, [
+            {'id': 1, 'mv_ss': 'a'},
+            {'id': 2, 'mv_ss': 'b'},
+        ])
+
+    def test_nested_projection_after_update_subfield_type(self):
+        # Projecting a type-changed leaf must cast old batches to the latest
+        # type instead of emitting mixed-type batches.
+        s0 = pa.schema([('id', pa.int64()),
+                        ('mv', pa.struct([('v', pa.int32()), ('s', pa.string())]))])
+        table = self._create('nsub_proj_type', s0)
+        self._write(table, pa.Table.from_pylist(
+            [{'id': 1, 'mv': {'v': 10, 's': 'a'}}], schema=s0))
+        self.catalog.alter_table(
+            'default.nsub_proj_type',
+            [SchemaChange.update_column_type(['mv', 'v'], AtomicType('BIGINT'))], False)
+        table = self.catalog.get_table('default.nsub_proj_type')
+        s1 = pa.schema([('id', pa.int64()),
+                        ('mv', pa.struct([('v', pa.int64()), ('s', pa.string())]))])
+        self._write(table, pa.Table.from_pylist(
+            [{'id': 2, 'mv': {'v': 20, 's': 'b'}}], schema=s1))
+
+        rb = table.new_read_builder().with_projection(['id', 'mv.v'])
+        splits = rb.new_scan().plan().splits()
+        arrow = rb.new_read().to_arrow(splits)
+        self.assertEqual(arrow.schema.field('mv_v').type, pa.int64())
+        rows = sorted(arrow.to_pylist(), key=lambda r: r['id'])
+        self.assertEqual(rows, [
+            {'id': 1, 'mv_v': 10},
+            {'id': 2, 'mv_v': 20},
+        ])
+
     def test_pk_nested_subfield_evolution_merge(self):
         s0 = pa.schema([('id', pa.int64()), ('mv', _MV_PA)])
         table = self._create('nsub_pk', s0, primary_keys=['id'], bucket='1')