dancingactor
diff --git a/‎python/ray/data/_internal/datasource_v2/parquet_datasource_v2.py‎
Lines changed: 13 additions & 0 deletions b/‎python/ray/data/_internal/datasource_v2/parquet_datasource_v2.py‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎python/ray/data/_internal/datasource_v2/readers/file_reader.py‎
Lines changed: 117 additions & 23 deletions b/‎python/ray/data/_internal/datasource_v2/readers/file_reader.py‎
Lines changed: 117 additions & 23 deletions
diff --git a/‎python/ray/data/_internal/datasource_v2/readers/parquet_file_reader.py‎
Lines changed: 4 additions & 0 deletions b/‎python/ray/data/_internal/datasource_v2/readers/parquet_file_reader.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎python/ray/data/_internal/datasource_v2/scanners/parquet_scanner.py‎
Lines changed: 4 additions & 0 deletions b/‎python/ray/data/_internal/datasource_v2/scanners/parquet_scanner.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎python/ray/data/_internal/datasource_v2/tests/test_parquet_datasource_v2.py‎
Lines changed: 34 additions & 0 deletions b/‎python/ray/data/_internal/datasource_v2/tests/test_parquet_datasource_v2.py‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎python/ray/data/_internal/logical/operators/read_operator.py‎
Lines changed: 19 additions & 0 deletions b/‎python/ray/data/_internal/logical/operators/read_operator.py‎
Lines changed: 19 additions & 0 deletions
@@ -68,6 +68,7 @@ def __init__(
         file_extensions: Optional[List[str]] = None,
         ignore_missing_paths: bool = False,
         include_paths: bool = False,
+        include_row_hash: bool = False,
         shuffle: Optional[Union[Literal["files"], "FileShuffleConfig"]] = None,
         arrow_parquet_args: Optional[dict] = None,
         schema: Optional[pa.Schema] = None,
@@ -89,6 +90,7 @@ def __init__(
         self._file_extensions = file_extensions or ParquetDatasource._FILE_EXTENSIONS
         self._ignore_missing_paths = ignore_missing_paths
         self._include_paths = include_paths
+        self._include_row_hash = include_row_hash
         self._shuffle = shuffle
         self._arrow_parquet_args = arrow_parquet_args or {}
         # User-supplied schema override. When set, ``infer_schema`` returns
@@ -245,6 +247,16 @@ def _read_schema(path: str):
         if self._include_paths and schema.get_field_index("path") == -1:
             schema = schema.append(pa.field("path", pa.string()))
 
+        if self._include_row_hash:
+            # ``row_hash`` is synthesized post-read as ``uint64``. Replace
+            # the field type when the file already has a ``row_hash``
+            # column (matches V1 ``_derive_schema``); otherwise append.
+            idx = schema.get_field_index("row_hash")
+            if idx == -1:
+                schema = schema.append(pa.field("row_hash", pa.uint64()))
+            elif schema.field(idx).type != pa.uint64():
+                schema = schema.set(idx, pa.field("row_hash", pa.uint64()))
+
         check_for_legacy_tensor_type(schema)
         return schema
 
@@ -269,6 +281,7 @@ def create_scanner(
             filesystem=filesystem or self._filesystem,
             partitioning=partitioning,
             include_paths=self._include_paths,
+            include_row_hash=self._include_row_hash,
             shuffle=self._shuffle,
             ignore_prefixes=options.get("ignore_prefixes"),
             target_block_size=DataContext.get_current().target_max_block_size,
 
@@ -8,6 +8,7 @@
 from pyarrow.fs import FileSystem, LocalFileSystem
 
 from ray.data._internal.arrow_block import _BATCH_SIZE_PRESERVING_STUB_COL_NAME
+from ray.data._internal.datasource.parquet_datasource import _compute_row_hashes
 from ray.data._internal.datasource_v2.listing.file_manifest import FileManifest
 from ray.data._internal.datasource_v2.readers.base_reader import Reader
 from ray.data._internal.util import iterate_with_retry
@@ -19,6 +20,14 @@
 # Default is specified by PyArrow.
 _ARROW_DEFAULT_BATCH_SIZE = 131_072
 
+# Small fixed readahead keeps driver memory bounded when scanning
+# uncompressed batches (jumbo tensor columns can run to multi-GB per
+# batch, and pyarrow's default 16-batch readahead would retain all of
+# them).
+_ARROW_SCANNER_BATCH_READAHEAD = 1
+
+# Name of the per-row hash column that the reader synthesizes (as
+# ``uint64``) after each batch is read when ``include_row_hash`` is set.
+_ROW_HASH_COLUMN_NAME = "row_hash"
+
 
 class FileFormat(str, Enum):
     PARQUET = "parquet"
@@ -50,6 +59,7 @@ def __init__(
         partitioning: Optional[Partitioning] = None,
         ignore_prefixes: Optional[List[str]] = None,
         include_paths: bool = False,
+        include_row_hash: bool = False,
         schema: Optional[pa.Schema] = None,
     ):
         """Initialize the reader.
@@ -68,6 +78,12 @@ def __init__(
             ignore_prefixes: Prefixes to ignore when reading files. Default is ['.', '_'] set by PyArrow.
             include_paths: If True, include the source file path in a
                 ``'path'`` column for each row.
+            include_row_hash: If True, include a deterministic uint64 hash
+                per row in a ``'row_hash'`` column. The hash is derived from
+                the source file path and the row's post-filter output
+                position within the fragment, matching V1 semantics. If a
+                ``'row_hash'`` column already exists in the file, it is
+                overwritten.
             schema: Caller-supplied unified schema used both to override
                 pyarrow's per-fragment inference (so a file whose column
                 is all-null doesn't pin the type to ``null``) and to cast
@@ -86,27 +102,53 @@ def __init__(
         )
         self._ignore_prefixes = ignore_prefixes
         self._include_paths = include_paths
+        self._include_row_hash = include_row_hash
         self._schema = schema
 
     @cached_property
     def _file_dataset_schema(self) -> Optional[pa.Schema]:
         """Schema passed to ``pds.dataset`` — partition keys and ``path``
         stripped out since those are synthesized post-read.
 
-        A caller-supplied schema overrides pyarrow's per-fragment
-        inference — without it, a file with all-null values in column X
-        pins X to ``null`` type and pyarrow can't cast string → null in
-        later files.
+        Pinning the caller-supplied schema at the pyarrow layer is how
+        we cover the "first file has an all-null column, later files
+        have the real type" case (e.g.
+        ``test_read_null_data_in_first_file``): without the pin,
+        pyarrow locks column X to ``null`` across the fragment group
+        and the later string-typed file fails the cast.
+
+        But pyarrow refuses extension-to-extension casts (e.g.
+        ``ArrowTensorTypeV2(shape=X)`` → ``ArrowVariableShapedTensor``),
+        and files with different per-file tensor shapes only unify
+        through ``ArrowVariableShapedTensor``. When the caller schema
+        contains *any* extension column we skip the pin entirely and
+        let pyarrow infer per-file — downstream concat handles the
+        heterogeneous blocks. Losing the all-null promotion in this
+        narrow case is acceptable; the combination of an all-null
+        first file *and* an extension column is uncommon, whereas
+        reading multiple files with variable-shape tensors is a
+        supported V1 feature.
+
+        When ``include_row_hash`` is set, ``row_hash`` is stripped as
+        well, since it is likewise synthesized post-read.
+
+        Returns ``None`` when no caller schema was supplied, when the
+        caller schema contains an extension-typed field, or when no
+        fields remain after stripping.
         """
         if self._schema is None:
             return None
+        if any(isinstance(f.type, pa.ExtensionType) for f in self._schema):
+            return None
         partition_keys = (
             set(self._partition_parser._scheme.field_names or [])
             if self._partition_parser is not None
             else set()
         )
+        synthesized = {"path"}
+        if self._include_row_hash:
+            # ``row_hash`` is synthesized post-read, and the schema's type
+            # (``uint64``) may not match the on-disk column's type when a
+            # file already carries a ``row_hash`` column. Strip it from the
+            # dataset schema so pyarrow doesn't try to cast.
+            synthesized.add(_ROW_HASH_COLUMN_NAME)
         fields = [
-            f for f in self._schema if f.name not in partition_keys and f.name != "path"
+            f
+            for f in self._schema
+            if f.name not in partition_keys and f.name not in synthesized
         ]
         return pa.schema(fields) if fields else None
 
@@ -146,6 +188,14 @@ def read(self, input_split: FileManifest) -> Iterator[pa.Table]:
 
         paths = list(input_split.paths)
         filesystem = self._filesystem or LocalFileSystem()
+        # Build a ``pds.Dataset`` over *all* manifest paths so pyarrow's
+        # listing + column metadata is shared, but then iterate its
+        # fragments one at a time. ``dataset.scanner(fragments=...)``
+        # at the aggregate level would force a cross-fragment cast —
+        # which breaks variable-shape tensor extensions where each
+        # file has its own ``ArrowTensorTypeV2(shape=...)``. Per-
+        # fragment scanners let pyarrow use the native per-file type,
+        # and downstream concat handles unification.
         dataset = pds.dataset(
             source=paths,
             format=self._format.value,
@@ -169,19 +219,18 @@ def read(self, input_split: FileManifest) -> Iterator[pa.Table]:
             ]
             columns_to_synthesize = set(self._columns) - on_disk_column_names
 
-        scanner_kwargs = dict(
-            columns=columns_to_read_from_file,
-            filter=self._predicate,
-            batch_size=self._resolve_batch_size(dataset),
-            batch_readahead=1,
-        )
+        scanner_kwargs = {
+            "columns": columns_to_read_from_file,
+            "filter": self._predicate,
+            "batch_size": self._resolve_batch_size(dataset),
+            "batch_readahead": _ARROW_SCANNER_BATCH_READAHEAD,
+        }
         scanner_kwargs.update(self._arrow_scanner_kwargs())
-        scanner = dataset.scanner(**scanner_kwargs)
 
         ctx = DataContext.get_current()
         rows_read = 0
-        for table, fragment_path in iterate_with_retry(
-            lambda: self._read_batches(scanner),
+        for table, fragment_path, fragment_row_offset in iterate_with_retry(
+            lambda: self._read_fragment_batches(dataset, scanner_kwargs),
             "read batches",
             match=ctx.retried_io_errors,
         ):
@@ -216,6 +265,21 @@ def read(self, input_split: FileManifest) -> Iterator[pa.Table]:
                     self._broadcast_partition_value(name, value, table.num_rows),
                 )
 
+            # Skip when projection pushdown has narrowed ``columns`` to
+            # exclude ``row_hash`` — the projection below would just drop it.
+            if self._include_row_hash and (
+                columns_to_synthesize is None
+                or _ROW_HASH_COLUMN_NAME in columns_to_synthesize
+            ):
+                hashes = _compute_row_hashes(
+                    fragment_path, fragment_row_offset, table.num_rows
+                )
+                if _ROW_HASH_COLUMN_NAME in table.column_names:
+                    table = table.drop([_ROW_HASH_COLUMN_NAME])
+                table = table.append_column(
+                    _ROW_HASH_COLUMN_NAME, pa.array(hashes, type=pa.uint64())
+                )
+
             if self._columns is not None:
                 # Project/reorder to the caller's requested column order;
                 # drop any that weren't produced (matches V1's lenient
@@ -262,12 +326,42 @@ def _arrow_scanner_kwargs(self) -> dict:
         """
         return {}
 
-    @staticmethod
-    def _read_batches(
-        scanner: pds.Scanner,
-    ) -> Iterator[tuple[pa.Table, str]]:
-        """Yield non-empty (table, fragment_path) pairs from scanner batches."""
-        for tagged in scanner.scan_batches():
-            table = pa.Table.from_batches(batches=[tagged.record_batch])
-            if table.num_rows > 0:
-                yield table, tagged.fragment.path
+    def _read_fragment_batches(
+        self,
+        dataset: pds.Dataset,
+        scanner_kwargs: dict,
+    ) -> Iterator[tuple[pa.Table, str, int]]:
+        """Yield non-empty (table, fragment_path, fragment_row_offset) triples
+        one fragment at a time.
+
+        ``fragment_row_offset`` is the post-filter row position of the first
+        row of ``table`` within the current fragment. Tracking it inside the
+        generator means it resets correctly whenever ``iterate_with_retry``
+        recreates the generator on a retry — outer-loop state would otherwise
+        carry stale values from the failed attempt and corrupt row hashes.
+
+        Each fragment gets its own scanner so pyarrow uses the native
+        per-file schema. A cross-fragment scanner would force a unified
+        schema cast, which refuses extension-to-extension conversion
+        (e.g. variable-shape tensors). V1 ``ParquetDatasource`` follows
+        the same per-fragment pattern via ``fragment.to_batches``.
+
+        When a non-extension caller schema is available we pin it at the
+        scanner so pyarrow null-fills any column the unified schema names
+        but the fragment lacks (V1 parity). Falling back to the
+        per-fragment ``physical_schema`` preserves the variable-shape
+        tensor escape hatch already encoded in ``_file_dataset_schema``.
+
+        Args:
+            dataset: Dataset whose fragments are scanned in order.
+            scanner_kwargs: Keyword arguments forwarded to every
+                per-fragment scanner (columns, filter, batch size, ...).
+
+        Yields:
+            ``(table, fragment_path, fragment_row_offset)`` for every batch
+            that contains at least one row; empty batches are skipped and
+            do not advance the offset.
+        """
+        # The pinned schema does not depend on the fragment, so resolve it
+        # once instead of re-reading the property for every fragment.
+        pinned_schema = self._file_dataset_schema
+        for fragment in dataset.get_fragments():
+            fragment_schema = (
+                pinned_schema
+                if pinned_schema is not None
+                else fragment.physical_schema
+            )
+            scanner = fragment.scanner(**scanner_kwargs, schema=fragment_schema)
+            # Rows already yielded from this fragment; restarts at zero for
+            # each fragment so hashes are stable per (path, position).
+            offset = 0
+            for tagged in scanner.scan_batches():
+                table = pa.Table.from_batches(batches=[tagged.record_batch])
+                if table.num_rows > 0:
+                    yield table, fragment.path, offset
+                    offset += table.num_rows
@@ -142,6 +142,7 @@ def __init__(
         ignore_prefixes: Optional[List[str]] = None,
         target_block_size: Optional[int] = None,
         include_paths: bool = False,
+        include_row_hash: bool = False,
         schema: Optional[pa.Schema] = None,
     ):
         """Initialize the Parquet reader.
@@ -160,6 +161,8 @@ def __init__(
                 Used for adaptive batch sizing when ``batch_size`` is not set.
             include_paths: If True, include the source file path in a
                 ``'path'`` column for each row.
+            include_row_hash: If True, include a deterministic uint64 hash
+                per row in a ``'row_hash'`` column.
             schema: Caller-supplied unified schema forwarded to the base
                 :class:`FileReader` for per-fragment inference override
                 and partition-column type casting.
@@ -174,6 +177,7 @@ def __init__(
             partitioning=partitioning,
             ignore_prefixes=ignore_prefixes,
             include_paths=include_paths,
+            include_row_hash=include_row_hash,
             schema=schema,
         )
         self._explicit_batch_size = batch_size
 
@@ -28,12 +28,15 @@ class ParquetScanner(ArrowFileScanner):
 
     target_block_size: Optional[int] = None
     include_paths: bool = False
+    include_row_hash: bool = False
 
     def read_schema(self) -> pa.Schema:
         """Return schema after column pruning and tensor check."""
         schema = super().read_schema()
         if self.include_paths and schema.get_field_index("path") == -1:
             schema = schema.append(pa.field("path", pa.string()))
+        if self.include_row_hash:
+            # ``row_hash`` is synthesized post-read as ``uint64`` and
+            # replaces any same-named column already present in the file,
+            # so report ``uint64`` here too: append the field when it is
+            # missing, otherwise promote its type (keeps this in step with
+            # ``ParquetDatasourceV2.infer_schema``).
+            idx = schema.get_field_index("row_hash")
+            if idx == -1:
+                schema = schema.append(pa.field("row_hash", pa.uint64()))
+            elif schema.field(idx).type != pa.uint64():
+                schema = schema.set(idx, pa.field("row_hash", pa.uint64()))
 
         check_for_legacy_tensor_type(schema)
         return schema
@@ -54,5 +57,6 @@ def create_reader(self) -> ParquetFileReader:
             ignore_prefixes=self.ignore_prefixes,
             target_block_size=self.target_block_size,
             include_paths=self.include_paths,
+            include_row_hash=self.include_row_hash,
             schema=self.schema,
         )
@@ -105,3 +105,37 @@ def test_paths_and_filesystem_resolved(tmp_path):
     # the caller passed None.
     assert datasource.filesystem is not None
     assert len(datasource.paths) == 1
+
+
+def test_infer_schema_with_include_row_hash(tmp_path):
+    """``infer_schema`` adds a ``uint64`` ``row_hash`` field when requested."""
+    parquet_path = str(tmp_path / "data.parquet")
+    _write_parquet(parquet_path, pa.table({"a": [1, 2]}))
+
+    source = ParquetDatasourceV2([parquet_path], include_row_hash=True)
+    inferred = source.infer_schema(_manifest_of([parquet_path]))
+
+    # The synthesized column must be present and typed as uint64.
+    assert "row_hash" in inferred.names
+    assert inferred.field("row_hash").type == pa.uint64()
+
+
+def test_infer_schema_with_include_row_hash_existing_column_promoted_to_uint64(
+    tmp_path,
+):
+    """A pre-existing ``row_hash`` column is reported as ``uint64``."""
+    parquet_path = str(tmp_path / "data.parquet")
+    # The file already carries a ``row_hash`` column of its own.
+    on_disk = pa.table({"val": [1, 2], "row_hash": [10, 20]})
+    _write_parquet(parquet_path, on_disk)
+
+    source = ParquetDatasourceV2([parquet_path], include_row_hash=True)
+    inferred = source.infer_schema(_manifest_of([parquet_path]))
+
+    assert inferred.field("row_hash").type == pa.uint64()
+
+
+def test_create_scanner_propagates_include_row_hash(tmp_path):
+    """The scanner built by the datasource inherits ``include_row_hash``."""
+    parquet_path = str(tmp_path / "data.parquet")
+    _write_parquet(parquet_path, pa.table({"a": [1]}))
+
+    source = ParquetDatasourceV2([parquet_path], include_row_hash=True)
+    inferred = source.infer_schema(_manifest_of([parquet_path]))
+    built_scanner = source.create_scanner(inferred)
+
+    assert built_scanner.include_row_hash is True
@@ -12,6 +12,7 @@
 )
 from ray.data._internal.logical.operators.map_operator import AbstractMap
 from ray.data.block import (
+    Block,
     BlockMetadata,
     BlockMetadataWithSchema,
 )
@@ -266,6 +267,12 @@ class ReadFiles(
     # renamed. The scanner only knows original names; renames are applied
     # in ``plan_read_files_op`` after each block is read.
     column_renames: Optional[Dict[str, str]] = None
+    # Optional post-read block transform. Used by ``read_parquet``'s
+    # ``_block_udf`` and ``tensor_column_schema`` (the latter is folded
+    # into a ``_block_udf`` by ``_resolve_parquet_args`` before it gets
+    # here). Applied in ``plan_read_files_op.do_read`` after each
+    # table is read and before column renames.
+    block_udf: Optional[Callable[[Block], Block]] = None
     can_modify_num_rows: bool = field(init=False, default=True)
     min_rows_per_bundled_input: Optional[int] = field(init=False, default=None)
     ray_remote_args_fn: None = field(init=False, default=None)
@@ -314,6 +321,18 @@ def infer_schema(self) -> "pa.Schema":
         # ``select_columns([])``); the stored ``self.schema`` is the
         # unprojected one and only used for construction.
         schema = self.scanner.read_schema()
+        # When a ``block_udf`` is attached (e.g. ``read_parquet`` was
+        # called with ``tensor_column_schema`` or ``_block_udf``), probe
+        # its effect on the schema so downstream consumers see the
+        # post-transform column types. Mirrors V1 ``ParquetDatasource``'s
+        # dummy-table trick. Falls back to the scanner schema if the
+        # probe fails — the UDF may require a non-empty input.
+        if self.block_udf is not None:
+            try:
+                transformed = self.block_udf(schema.empty_table()).schema
+                schema = transformed.with_metadata(schema.metadata)
+            except Exception:
+                pass
         if self.column_renames:
             import pyarrow as pa