ray-project
diff --git a/python/ray/data/_internal/datasource_v2/parquet_datasource_v2.py b/python/ray/data/_internal/datasource_v2/parquet_datasource_v2.py
Lines changed: 0 additions & 6 deletions
diff --git a/python/ray/data/_internal/datasource_v2/readers/file_reader.py b/python/ray/data/_internal/datasource_v2/readers/file_reader.py
Lines changed: 52 additions & 28 deletions
diff --git a/python/ray/data/_internal/datasource_v2/readers/parquet_file_reader.py b/python/ray/data/_internal/datasource_v2/readers/parquet_file_reader.py
Lines changed: 174 additions & 5 deletions
@@ -272,11 +272,6 @@ def create_scanner(
         filesystem: Optional["FileSystem"] = None,
         **options: Any,
     ) -> ParquetScanner:
-        # ``filter=`` in V1 read_parquet() is the legacy pyarrow-compute
-        # predicate. Stamp it on the scanner's ``predicate`` field so it's
-        # honored at scan time (V2 does not yet dispatch Ray-level
-        # predicate pushdown rules).
-        predicate = self._arrow_parquet_args.get("filter")
         # Callers (``_read_datasource_v2``) supply the sample-resolved
         # ``Partitioning`` via ``options["partitioning"]`` so the
         # datasource itself stays immutable — fall back to the
@@ -291,5 +286,4 @@ def create_scanner(
             shuffle=self._shuffle,
             ignore_prefixes=options.get("ignore_prefixes"),
             target_block_size=DataContext.get_current().target_max_block_size,
-            predicate=predicate,
         )
@@ -1,10 +1,9 @@
 from enum import Enum
-from functools import cached_property
+from functools import cached_property, partial
 from typing import Any, Iterator, List, Optional, Set, Tuple
 
 import pyarrow as pa
 import pyarrow.dataset as pds
-from pyarrow import compute as pc
 from pyarrow.fs import FileSystem, LocalFileSystem
 
 from ray.data._internal.arrow_block import _BATCH_SIZE_PRESERVING_STUB_COL_NAME
@@ -14,6 +13,7 @@
 from ray.data._internal.util import iterate_with_retry
 from ray.data.context import DataContext
 from ray.data.datasource.partitioning import Partitioning, PathPartitionParser
+from ray.data.expressions import Expr
 from ray.util.annotations import DeveloperAPI
 
 # Synthetic column name produced when ``include_paths=True``. Shared with
@@ -58,7 +58,7 @@ def __init__(
         format: FileFormat,
         batch_size: int = _ARROW_DEFAULT_BATCH_SIZE,
         columns: Optional[List[str]] = None,
-        predicate: Optional[pc.Expression] = None,
+        predicate: Optional[Expr] = None,
         limit: Optional[int] = None,
         filesystem: Optional[FileSystem] = None,
         partitioning: Optional[Partitioning] = None,
@@ -74,7 +74,8 @@ def __init__(
             format: Format of the files to read.
             batch_size: Number of rows per batch.
             columns: Columns to read. None means all columns.
-            predicate: PyArrow compute expression for filtering.
+            predicate: Ray Data expression for filtering. Converted to a
+                PyArrow expression at the scanner-kwargs boundary.
             limit: Maximum number of rows to read.
             filesystem: Filesystem for reading files.
             partitioning: Ray ``Partitioning`` object. Partition columns are
@@ -226,18 +227,17 @@ def read(self, input_split: FileManifest) -> Iterator[pa.Table]:
 
         scanner_kwargs = {
             "columns": columns_to_read_from_file,
-            "filter": self._predicate,
+            "filter": (
+                self._predicate.to_pyarrow() if self._predicate is not None else None
+            ),
             "batch_size": self._resolve_batch_size(dataset),
             "batch_readahead": _ARROW_SCANNER_BATCH_READAHEAD,
         }
         scanner_kwargs.update(self._arrow_scanner_kwargs())
 
-        ctx = DataContext.get_current()
         rows_read = 0
-        for table, fragment_path, fragment_row_offset in iterate_with_retry(
-            lambda: self._read_fragment_batches(dataset, scanner_kwargs),
-            "read batches",
-            match=ctx.retried_io_errors,
+        for table, fragment_path, fragment_row_offset in self._read_fragment_batches(
+            dataset, scanner_kwargs
         ):
             if self._limit is not None:
                 if rows_read >= self._limit:
@@ -340,33 +340,57 @@ def _read_fragment_batches(
         one fragment at a time.
 
         ``fragment_row_offset`` is the post-filter row position of the first
-        row of ``table`` within the current fragment. Tracking it inside the
-        generator means it resets correctly whenever ``iterate_with_retry``
-        recreates the generator on a retry — outer-loop state would otherwise
-        carry stale values from the failed attempt and corrupt row hashes.
+        row of ``table`` within the current fragment. ``iterate_with_retry``
+        skips already-yielded items on retry, so ``offset`` reflects only the
+        rows that actually surface to the caller — matching V1 row-hash
+        semantics even when a fragment fails partway through.
+
+        Retry is scoped per-fragment: if a fragment fails mid-read, only
+        that fragment is re-read (skipping batches already yielded).
+        Wrapping the whole manifest in a single retry would re-iterate
+        fragments that already succeeded and double-emit their batches.
 
         Each fragment gets its own scanner so pyarrow uses the native
         per-file schema. A cross-fragment scanner would force a unified
         schema cast, which refuses extension-to-extension conversion
         (e.g. variable-shape tensors). V1 ``ParquetDatasource`` follows
         the same per-fragment pattern via ``fragment.to_batches``.
-
-        When a non-extension caller schema is available we pin it at the
-        scanner so pyarrow null-fills any column the unified schema names
-        but the fragment lacks (V1 parity). Falling back to the
-        per-fragment ``physical_schema`` preserves the variable-shape
-        tensor escape hatch already encoded in ``_file_dataset_schema``.
         """
+        ctx = DataContext.get_current()
         for fragment in dataset.get_fragments():
-            fragment_schema = (
-                self._file_dataset_schema
-                if self._file_dataset_schema is not None
-                else fragment.physical_schema
-            )
-            scanner = fragment.scanner(**scanner_kwargs, schema=fragment_schema)
             offset = 0
-            for tagged in scanner.scan_batches():
-                table = pa.Table.from_batches(batches=[tagged.record_batch])
+            for table in iterate_with_retry(
+                partial(self._iter_fragment_tables, fragment, scanner_kwargs),
+                f"read fragment {fragment.path}",
+                match=ctx.retried_io_errors,
+            ):
                 if table.num_rows > 0:
                     yield table, fragment.path, offset
                     offset += table.num_rows
+
+    def _iter_fragment_tables(
+        self,
+        fragment: pds.Fragment,
+        scanner_kwargs: dict,
+    ) -> Iterator[pa.Table]:
+        """Yield Arrow tables for a single fragment.
+
+        Subclasses override this to swap in a format-specific reader for
+        fragments that don't fit the default scanner-based path (e.g.
+        Parquet's ARROW-5030 nested-type fallback).
+
+        When a non-extension caller schema is available we pin it at the
+        scanner so pyarrow null-fills any column the unified schema names
+        but the fragment lacks (V1 parity — ``ParquetDatasource`` passes
+        ``read_schema`` to ``fragment.to_batches``). Falling back to the
+        per-fragment ``physical_schema`` preserves the variable-shape
+        tensor escape hatch already encoded in ``_file_dataset_schema``.
+        """
+        fragment_schema = (
+            self._file_dataset_schema
+            if self._file_dataset_schema is not None
+            else fragment.physical_schema
+        )
+        scanner = fragment.scanner(**scanner_kwargs, schema=fragment_schema)
+        for tagged in scanner.scan_batches():
+            yield pa.Table.from_batches(batches=[tagged.record_batch])
@@ -1,11 +1,10 @@
 import logging
 import math
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Iterator, List, Optional
 
 import pyarrow as pa
 import pyarrow.dataset as pds
 import pyarrow.parquet as pq
-from pyarrow import compute as pc
 from pyarrow.fs import FileSystem
 from typing_extensions import override
 
@@ -20,7 +19,9 @@
 from ray.data._internal.datasource_v2.readers.in_memory_size_estimator import (
     PARQUET_ENCODING_RATIO_ESTIMATE_DEFAULT,
 )
+from ray.data.expressions import Expr
 from ray.util.annotations import DeveloperAPI
+from ray.util.debug import log_once
 
 logger = logging.getLogger(__name__)
 
@@ -135,7 +136,7 @@ def __init__(
         self,
         batch_size: Optional[int] = None,
         columns: Optional[List[str]] = None,
-        predicate: Optional[pc.Expression] = None,
+        predicate: Optional[Expr] = None,
         limit: Optional[int] = None,
         filesystem: Optional[FileSystem] = None,
         partitioning: "Optional[Partitioning]" = None,
@@ -151,7 +152,7 @@ def __init__(
             batch_size: Explicit batch size override. If provided, disables
                 adaptive batch sizing.
             columns: Columns to read. None means all columns.
-            predicate: PyArrow compute expression for filtering.
+            predicate: Ray Data expression for filtering.
             limit: Maximum number of rows to read.
             filesystem: Filesystem for reading files.
             partitioning: Ray ``Partitioning`` for synthesizing partition
@@ -229,6 +230,173 @@ def _on_batch_read(self, table: pa.Table) -> None:
         row_size = table.nbytes / table.num_rows
         self._sampled_batch_size = max(math.ceil(self._target_block_size / row_size), 1)
 
+    @override
+    def _iter_fragment_tables(
+        self,
+        fragment: pds.Fragment,
+        scanner_kwargs: dict,
+    ) -> "Iterator[pa.Table]":
+        """Use V1's nested-type fallback path when the fragment has nested
+        columns whose row-group size exceeds Arrow's ~2GB chunking limit
+        (ARROW-5030).
+        """
+        import pyarrow.compute as pc
+
+        from ray.data._internal.arrow_ops.transform_pyarrow import (
+            _align_struct_fields,
+        )
+        from ray.data._internal.datasource.parquet_datasource import (
+            _get_safe_batch_size_for_nested_types,
+            _needs_nested_type_fallback,
+            _resolve_leaf_column_indices,
+            _resolve_read_columns,
+        )
+        from ray.data._internal.planner.plan_expression.expression_visitors import (
+            get_column_references,
+        )
+
+        columns = scanner_kwargs.get("columns")
+        filter_expr: pc.Expression = scanner_kwargs.get("filter")
+        # Include filter-referenced columns in the fallback check: a filter
+        # that touches a large nested column outside the projection still
+        # forces row-level decoding of that column, which would otherwise
+        # hit ARROW-5030 in the normal scanner path.
+        filter_columns = (
+            get_column_references(self._predicate)
+            if self._predicate is not None
+            else None
+        )
+        read_columns = _resolve_read_columns(columns, filter_expr, filter_columns)
+        if not _needs_nested_type_fallback(fragment, read_columns):
+            yield from super()._iter_fragment_tables(fragment, scanner_kwargs)
+            return
+
+        if log_once(f"parquet_nested_fallback_v2:{fragment.path}"):
+            logger.warning(
+                "Using pyarrow.parquet row-level batched reader for '%s' due "
+                "to Arrow nested type chunking limitation (ARROW-5030). "
+                "Consider writing Parquet files with smaller row group sizes "
+                "to avoid this.",
+                fragment.path,
+            )
+
+        batch_size = scanner_kwargs.get("batch_size")
+
+        pf = pq.ParquetFile(
+            fragment.path,
+            filesystem=fragment.filesystem,  # pyrefly: ignore[unexpected-keyword]
+        )
+
+        # Scope the safe batch-size calculation to the columns actually being
+        # decoded so we don't shrink batches based on columns we won't read.
+        leaf_indices = (
+            _resolve_leaf_column_indices(pf.metadata, read_columns)
+            if read_columns is not None and pf.metadata.num_row_groups > 0
+            else None
+        )
+        safe_batch_size = _get_safe_batch_size_for_nested_types(pf, leaf_indices)
+        fallback_batch_size = (
+            min(batch_size, safe_batch_size) if batch_size else safe_batch_size
+        )
+
+        # Apply row-group-level predicate pushdown via fragment.subset; the
+        # row-level filter is applied per-batch below since iter_batches
+        # doesn't accept a filter expression. Under schema evolution the
+        # filter may reference a column absent from this fragment's
+        # physical schema — fragment.subset uses that schema (not the
+        # unified one) and raises ArrowInvalid, so skip row-group pruning
+        # in that case and let the per-batch filter (post null-fill) do
+        # all the row-dropping.
+        fragment_physical_columns = set(fragment.physical_schema.names)
+        filter_touches_missing_column = filter_columns is not None and any(
+            c not in fragment_physical_columns for c in filter_columns
+        )
+        if filter_expr is not None and not filter_touches_missing_column:
+            subset = fragment.subset(filter=filter_expr)
+        else:
+            subset = fragment
+        row_groups = (
+            [rg.id for rg in subset.row_groups]
+            if subset.row_groups is not None
+            else None
+        )
+        if row_groups is not None and len(row_groups) == 0:
+            return
+
+        # ``pq.ParquetFile.iter_batches`` returns batches with the fragment's
+        # physical schema, so the fallback path would otherwise emit tables
+        # that differ from the scanner path (which pins
+        # ``_file_dataset_schema``) in struct field order, integer width,
+        # or missing columns. Align + cast to the same unified schema so
+        # fallback and non-fallback fragments concat cleanly downstream.
+        # Scoped to ``columns`` (not ``read_columns``) since filter-only
+        # columns are projected away before alignment.
+        file_dataset_schema = self._file_dataset_schema
+        if file_dataset_schema is not None and columns is not None:
+            align_schema = pa.schema(
+                [
+                    file_dataset_schema.field(c)
+                    for c in columns
+                    if file_dataset_schema.get_field_index(c) != -1
+                ]
+            )
+        else:
+            align_schema = file_dataset_schema
+
+        # Under schema evolution a filter-referenced column may live in
+        # the unified dataset schema but be absent from this fragment.
+        # The scanner path null-fills such columns via dataset-level
+        # schema pinning; ``pq.ParquetFile.iter_batches`` silently drops
+        # them and then ``table.filter(filter_expr)`` raises
+        # ``ArrowInvalid: No match for FieldRef.Name``. Mirror the
+        # scanner: append a null column of the unified type before the
+        # filter evaluates; ``null > 15`` yields null, which the filter
+        # drops, so the fragment contributes 0 rows.
+        columns_to_null_fill: List[str] = (
+            [c for c in read_columns if c not in fragment_physical_columns]
+            if read_columns is not None
+            else []
+        )
+        null_fill_type_by_column = {
+            column_name: (
+                file_dataset_schema.field(column_name).type
+                if file_dataset_schema is not None
+                and file_dataset_schema.get_field_index(column_name) != -1
+                else pa.null()
+            )
+            for column_name in columns_to_null_fill
+        }
+
+        for batch in pf.iter_batches(
+            batch_size=fallback_batch_size,
+            columns=read_columns,
+            use_threads=False,
+            row_groups=row_groups,
+        ):
+            table = pa.Table.from_batches([batch])
+            for column_name in columns_to_null_fill:
+                if column_name not in table.column_names:
+                    table = table.append_column(
+                        column_name,
+                        pa.nulls(
+                            table.num_rows,
+                            type=null_fill_type_by_column[column_name],
+                        ),
+                    )
+            if filter_expr is not None:
+                table = table.filter(filter_expr)
+                # Skip downstream select/align/cast on fully-filtered
+                # batches — the caller discards empty tables anyway.
+                if table.num_rows == 0:
+                    continue
+            if columns is not None:
+                table = table.select([c for c in columns if c in table.column_names])
+            if align_schema is not None:
+                table = _align_struct_fields([table], align_schema)[0].cast(
+                    align_schema
+                )
+            yield table
+
     @override
     def _arrow_scanner_kwargs(self) -> dict:
         # pre_buffer=True (pyarrow default) holds a whole fragment's worth of
@@ -239,10 +407,11 @@ def _arrow_scanner_kwargs(self) -> dict:
         # while keeping throughput equal to the default. batch_readahead=1
         # (inherited from FileReader base kwargs) plus fragment_readahead=1
         # is enough to keep decode pipelined. See apache/arrow#39808.
-        return {
+        kwargs: dict = {
             "fragment_scan_options": pds.ParquetFragmentScanOptions(
                 pre_buffer=False,
                 use_buffered_stream=True,
             ),
             "fragment_readahead": 1,
         }
+        return kwargs