[python] Re-apply dropped nested-leaf predicate after projection extraction

TheR1sing3un · TheR1sing3un · commit a3bb6a61b6e7 · 2026-06-14T13:29:14.000+08:00
When a read projects a nested struct sub-field (e.g. mv.latest_version), the
projection is widened to the full top-level column so that per-file field-id
normalization applies, and the leaf is extracted afterwards. Because the leaf
path is absent from the widened read fields, SplitRead.__init__ dropped any
predicate referencing it (predicate_for_reader=None), so the filter was
silently lost and every row was returned.

Re-evaluate the dropped predicate after extraction, where the flat columns
match the predicate fields: RawFileSplitRead (append-only / PK raw-convertible)
wraps the extracted batches with FilterRecordBatchReader; MergeFileSplitRead
(PK non-raw-convertible) filters the extracted rows with FilterRecordReader,
after rewriting the predicate's field indices to refer to the flat output row.
The predicate is trimmed to the projected columns first, so a filter on a
non-projected column keeps the existing drop semantics instead of referencing
a missing column.
diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
@@ -724,6 +724,22 @@ def create_reader(self) -> RecordReader:
                 reader = NestedLeafBatchReader(
                     reader, self.outer_extract_name_paths,
                     self.outer_flat_read_type)
+                # A predicate on a projected nested leaf cannot be pushed down:
+                # its leaf path is absent from the widened top-level read
+                # fields, so SplitRead.__init__ dropped it (predicate_for_reader
+                # is None). Without re-applying it the filter is silently lost
+                # and every row is returned. Re-evaluate it on the extracted
+                # flat batches, whose column names match the predicate fields;
+                # trim to the projected columns so a filter on a non-projected
+                # column keeps the existing "dropped" semantics rather than
+                # referencing a missing column.
+                if self.predicate is not None and self.predicate_for_reader is None:
+                    flat_names = [f.name for f in self.outer_flat_read_type]
+                    trimmed = trim_predicate_by_fields(self.predicate, flat_names)
+                    if trimmed is not None:
+                        from pypaimon.read.reader.filter_record_batch_reader \
+                            import FilterRecordBatchReader
+                        reader = FilterRecordBatchReader(reader, trimmed)
             if self.limit is not None:
                 reader = LimitedRecordBatchReader(reader, self.limit)
         return reader
@@ -743,6 +759,7 @@ def __init__(
             split: Split,
             row_tracking_enabled: bool,
             outer_extract_name_paths: Optional[List[List[str]]] = None,
+            outer_flat_read_type: Optional[List[DataField]] = None,
             limit: Optional[int] = None):
         # Merge functions need full ROW sub-structures, so nested paths
         # are not pushed down here; sub-path extraction happens above
@@ -757,6 +774,7 @@ def __init__(
             limit=limit,
         )
         self.outer_extract_name_paths = outer_extract_name_paths
+        self.outer_flat_read_type = outer_flat_read_type
         # Built once per split-read (value_fields and options are constant
         # for the object's life), not per section. ``None`` when
         # ``sequence.field`` is unset, in which case the heap falls back to
@@ -855,6 +873,21 @@ def create_reader(self) -> RecordReader:
                 file_io=self.table.file_io,
                 blob_field_indices=_blob_field_indices(inner_value_fields),
                 vector_field_indices=_vector_field_indices(inner_value_fields))
+            # A predicate on a projected nested leaf is not pushed down (its leaf
+            # path is absent from the widened-to-full-ROW read fields, so it was
+            # dropped in __init__). Without re-applying it after extraction the
+            # filter is silently lost. Evaluate it on the extracted flat rows,
+            # whose fields are outer_flat_read_type; trim to the projected
+            # columns and rewrite indices into that flat row.
+            if (self.predicate is not None and self.predicate_for_reader is None
+                    and self.outer_flat_read_type is not None):
+                flat_names = [f.name for f in self.outer_flat_read_type]
+                trimmed = trim_predicate_by_fields(self.predicate, flat_names)
+                if trimmed is not None:
+                    reader = FilterRecordReader(
+                        reader,
+                        rewrite_predicate_indices(
+                            trimmed, self.outer_flat_read_type))
         if self.limit is not None:
             reader = LimitedRecordReader(reader, self.limit)
         return reader
diff --git a/paimon-python/pypaimon/read/table_read.py b/paimon-python/pypaimon/read/table_read.py
@@ -585,6 +585,8 @@ def _create_split_read(self, split: Split) -> SplitRead:
                 split=split,
                 row_tracking_enabled=False,
                 outer_extract_name_paths=outer_extract_name_paths,
+                outer_flat_read_type=(
+                    self.read_type if outer_extract_name_paths else None),
                 limit=self.limit,
             )
         elif self.table.options.data_evolution_enabled():
diff --git a/paimon-python/pypaimon/tests/test_nested_projection_e2e.py b/paimon-python/pypaimon/tests/test_nested_projection_e2e.py
@@ -193,6 +193,23 @@ def test_partitioned_table_with_nested_projection(self):
             [{'part': 'A', 'mv_latest_version': 100, 'val': 'x'},
              {'part': 'B', 'mv_latest_version': 200, 'val': 'y'}])
 
+    def test_filter_on_projected_nested_leaf(self):
+        """A predicate on a projected nested leaf must actually filter rows.
+        The read widens the projection to the top-level struct, which drops
+        the leaf predicate from push-down (its path is absent from the read
+        fields); without re-applying it after the leaves are extracted, every
+        row leaks through."""
+        table = self._create_table('ao_nested_leaf_filter')
+        rb = table.new_read_builder().with_projection(['id', 'mv.latest_version'])
+        pred = rb.new_predicate_builder().greater_than('mv_latest_version', 150)
+        rb = rb.with_filter(pred)
+        got = rb.new_read().to_arrow(rb.new_scan().plan().splits()).to_pylist()
+        got = sorted(got, key=lambda r: r['id'])
+        self.assertEqual(
+            got,
+            [{'id': 2, 'mv_latest_version': 200},
+             {'id': 3, 'mv_latest_version': 300}])
+
     def test_avro_nested_projection_python_fallback(self):
         """Avro has no native nested column pruning; the reader walks
         each fastavro record dict by path and assembles the column
@@ -245,11 +262,43 @@ def _create_pk_table(self, name: str, file_format: str = 'parquet'):
             w.close()
         return table
 
+    def _create_pk_raw_table(self, name: str, file_format: str = 'parquet'):
+        """Single commit keeps the split raw-convertible, so the read stays on
+        the RawFileSplitRead fast path rather than the merge reader."""
+        identifier = 'default.{}'.format(name)
+        schema = Schema.from_pyarrow_schema(
+            self.pa_schema,
+            primary_keys=['id'],
+            options={'bucket': '1', 'file.format': file_format},
+        )
+        self.catalog.create_table(identifier, schema, False)
+        table = self.catalog.get_table(identifier)
+        wb = table.new_batch_write_builder()
+        w = wb.new_write()
+        w.write_arrow(pa.Table.from_pylist(self.rows, schema=self.pa_schema))
+        wb.new_commit().commit(w.prepare_commit())
+        w.close()
+        return table
+
     def _read_arrow(self, table, projection):
         rb = table.new_read_builder().with_projection(projection)
         splits = rb.new_scan().plan().splits()
         return rb.new_read().to_arrow(splits)
 
+    def test_raw_convertible_filter_on_projected_nested_leaf(self):
+        """PK raw-convertible split also widens nested projection and so drops
+        the leaf predicate from push-down. The filter must be re-applied on the
+        extracted leaves; otherwise all rows are returned (reviewer repro)."""
+        table = self._create_pk_raw_table('pk_raw_nested_leaf_filter')
+        rb = table.new_read_builder().with_projection(['id', 'mv.latest_version'])
+        pred = rb.new_predicate_builder().greater_than('mv_latest_version', 150)
+        rb = rb.with_filter(pred)
+        arrow = rb.new_read().to_arrow(rb.new_scan().plan().splits())
+        rows = sorted(zip(
+            arrow.column('id').to_pylist(),
+            arrow.column('mv_latest_version').to_pylist()))
+        self.assertEqual(rows, [(2, 200), (3, 300)])
+
     def test_extracts_single_nested_leaf(self):
         table = self._create_pk_table('pk_nested_single')
         arrow = self._read_arrow(table, ['mv.latest_version'])
@@ -301,6 +350,21 @@ def test_dotted_top_level_field_kept(self):
         got = rb.new_read().to_arrow(rb.new_scan().plan().splits()).to_pylist()
         self.assertEqual(got, [{'id': 1, 'media.left': 'hello'}])
 
+    def test_merge_filter_on_projected_nested_leaf(self):
+        """Non-raw-convertible PK splits go through the merge reader, which
+        widens the nested projection to the full ROW and so also drops the leaf
+        predicate from push-down. The filter must be re-applied on the extracted
+        leaves above the merge; otherwise all rows are returned."""
+        table = self._create_pk_table('pk_merge_nested_leaf_filter')
+        rb = table.new_read_builder().with_projection(['id', 'mv.latest_version'])
+        pred = rb.new_predicate_builder().greater_than('mv_latest_version', 150)
+        rb = rb.with_filter(pred)
+        arrow = rb.new_read().to_arrow(rb.new_scan().plan().splits())
+        rows = sorted(zip(
+            arrow.column('id').to_pylist(),
+            arrow.column('mv_latest_version').to_pylist()))
+        self.assertEqual(rows, [(2, 200), (3, 300)])
+
     def test_avro_extracts_single_nested_leaf(self):
         # Avro PK reads resolve DataFields through ``full_fields_map`` which
         # historically only covered merge-internal aliases; without the

Original file line number	Diff line number	Diff line change
`@@ -585,6 +585,8 @@ def _create_split_read(self, split: Split) -> SplitRead:`
`585`	`585`	`split=split,`
`586`	`586`	`row_tracking_enabled=False,`
`587`	`587`	`outer_extract_name_paths=outer_extract_name_paths,`
	`588`	`+ outer_flat_read_type=(`
	`589`	`+ self.read_type if outer_extract_name_paths else None),`
`588`	`590`	`limit=self.limit,`
`589`	`591`	`)`
`590`	`592`	`elif self.table.options.data_evolution_enabled():`