Skip to content

Commit 661f481

Browse files
[Data] - Only return selected data columns in hive partitioned parquet files (#60236)
## Description Returning `None` when you don't have partition_columns selects all the partitions which is not the right behavior. Returning `[]` when no partition columns are selected. ## Related issues Closes #60215 ## Additional information > Optional: Add implementation details, API changes, usage examples, screenshots, etc. --------- Signed-off-by: Goutam <goutam@anyscale.com>
1 parent c86b2e8 commit 661f481

2 files changed

Lines changed: 44 additions & 1 deletion

File tree

python/ray/data/_internal/datasource/parquet_datasource.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -616,7 +616,11 @@ def _get_partition_columns(self) -> Optional[List[str]]:
616616
return None
617617

618618
if not self._partition_columns:
619-
return None
619+
# If a projection is active but the dataset has no partition columns,
620+
# then no partition columns should be included in the output.
621+
# Returning [] ensures that no partition columns are added,
622+
# `None` is interpreted as including all partition columns.
623+
return []
620624

621625
# Extract partition columns that are in the projection map
622626
partition_cols = [

python/ray/data/tests/datasource/test_parquet.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,45 @@ def test_parquet_read_partitioned_with_columns(
391391
]
392392

393393

394+
def test_parquet_read_partitioned_excludes_unrequested_partition_columns(
395+
ray_start_regular_shared, tmp_path
396+
):
397+
"""Test that partition columns are excluded when not explicitly requested.
398+
399+
This is a regression test to ensure that when a user uses select_columns()
400+
with only data columns, partition columns are NOT automatically included.
401+
"""
402+
table = pa.table(
403+
{
404+
"partition_col0": [1, 1, 2, 2],
405+
"partition_col1": ["a", "a", "b", "b"],
406+
"data_col0": [10.5, 20.3, 30.2, 25.8],
407+
"data_col1": [100, 200, 300, 400],
408+
}
409+
)
410+
411+
pq.write_to_dataset(
412+
table,
413+
root_path=tmp_path,
414+
partition_cols=["partition_col0", "partition_col1"],
415+
)
416+
417+
# Request only data columns excluding partition columns
418+
ds = ray.data.read_parquet(
419+
tmp_path,
420+
columns=["data_col0"],
421+
partitioning=Partitioning("hive"),
422+
)
423+
424+
# Verify only the requested column is present
425+
assert ds.columns() == ["data_col0"]
426+
427+
# Verify the data is correct
428+
result_df = ds.to_pandas()
429+
expected_df = pd.DataFrame({"data_col0": [10.5, 20.3, 25.8, 30.2]})
430+
assert rows_same(result_df, expected_df)
431+
432+
394433
@pytest.mark.parametrize(
395434
"fs,data_path",
396435
[

0 commit comments

Comments
 (0)