Skip to content

Commit 3e017c4

Browse files
committed
fix: raise NotImplementedError when filtering by UUID column
PyArrow does not support filtering on UUID-typed columns. This commit raises a NotImplementedError with a clear message when such a filter is attempted.
1 parent 7425bc4 commit 3e017c4

File tree

3 files changed

+27
-20
lines changed

3 files changed

+27
-20
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,10 @@
203203
MAP_VALUE_NAME = "value"
204204
DOC = "doc"
205205
UTC_ALIASES = {"UTC", "+00:00", "Etc/UTC", "Z"}
206+
UUID_FILTER_NOT_SUPPORTED_ERROR_MESSAGE = (
207+
f"Filtering on UUID columns is not supported by the installed PyArrow version ({pa.__version__})"
208+
)
209+
206210

207211
T = TypeVar("T")
208212

@@ -1641,7 +1645,12 @@ def _task_to_record_batches(
16411645
bound_row_filter, file_schema, case_sensitive=case_sensitive, projected_field_values=projected_missing_fields
16421646
)
16431647
bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
1644-
pyarrow_filter = expression_to_pyarrow(bound_file_filter, file_schema)
1648+
try:
1649+
pyarrow_filter = expression_to_pyarrow(bound_file_filter, file_schema)
1650+
except pyarrow.lib.ArrowNotImplementedError as e:
1651+
if "arrow.uuid" in str(e):
1652+
raise NotImplementedError(UUID_FILTER_NOT_SUPPORTED_ERROR_MESSAGE) from e
1653+
raise
16451654

16461655
file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
16471656

tests/integration/test_reads.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -820,17 +820,20 @@ def test_partitioned_tables(catalog: Catalog) -> None:
820820
@pytest.mark.parametrize("catalog", [lf("session_catalog_hive"), lf("session_catalog")])
821821
def test_unpartitioned_uuid_table(catalog: Catalog) -> None:
822822
unpartitioned_uuid = catalog.load_table("default.test_uuid_and_fixed_unpartitioned")
823-
arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col == '102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow()
824-
assert arrow_table_eq["uuid_col"].to_pylist() == [uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967")]
825-
826-
arrow_table_neq = unpartitioned_uuid.scan(
827-
row_filter="uuid_col != '102cb62f-e6f8-4eb0-9973-d9b012ff0967' and uuid_col != '639cccce-c9d2-494a-a78c-278ab234f024'"
828-
).to_arrow()
829-
assert arrow_table_neq["uuid_col"].to_pylist() == [
830-
uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226"),
831-
uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b"),
832-
uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e"),
833-
]
823+
try:
824+
arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col == '102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow()
825+
assert arrow_table_eq["uuid_col"].to_pylist() == [uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967")]
826+
827+
arrow_table_neq = unpartitioned_uuid.scan(
828+
row_filter="uuid_col != '102cb62f-e6f8-4eb0-9973-d9b012ff0967' and uuid_col != '639cccce-c9d2-494a-a78c-278ab234f024'"
829+
).to_arrow()
830+
assert arrow_table_neq["uuid_col"].to_pylist() == [
831+
uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226"),
832+
uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b"),
833+
uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e"),
834+
]
835+
except NotImplementedError as e:
836+
assert "Filtering on UUID columns is not supported" in str(e)
834837

835838

836839
@pytest.mark.integration
@@ -840,14 +843,11 @@ def test_unpartitioned_fixed_table(catalog: Catalog) -> None:
840843
arrow_table_eq = fixed_table.scan(row_filter=EqualTo("fixed_col", b"1234567890123456789012345")).to_arrow()
841844
assert arrow_table_eq["fixed_col"].to_pylist() == [b"1234567890123456789012345"]
842845

843-
arrow_table_neq = fixed_table.scan(
844-
row_filter=And(
845-
NotEqualTo("fixed_col", b"1234567890123456789012345"), NotEqualTo("uuid_col", "c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b")
846-
)
847-
).to_arrow()
846+
arrow_table_neq = fixed_table.scan(row_filter=NotEqualTo("fixed_col", b"1234567890123456789012345")).to_arrow()
848847
assert arrow_table_neq["fixed_col"].to_pylist() == [
849848
b"1231231231231231231231231",
850849
b"12345678901234567ass12345",
850+
b"asdasasdads12312312312111",
851851
b"qweeqwwqq1231231231231111",
852852
]
853853

tests/integration/test_writes/test_writes.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2123,9 +2123,7 @@ def test_uuid_partitioning(session_catalog: Catalog, spark: SparkSession, transf
21232123
},
21242124
schema=pa.schema(
21252125
[
2126-
# Uuid not yet supported, so we have to stick with `binary(16)`
2127-
# https://github.com/apache/arrow/issues/46468
2128-
pa.field("uuid", pa.binary(16), nullable=False),
2126+
pa.field("uuid", pa.uuid(), nullable=False),
21292127
]
21302128
),
21312129
)

0 commit comments

Comments
 (0)