Skip to content

Commit 3e017c4

Browse files
committed
fix: raise NotImplementedError when filtering by UUID column
PyArrow does not support filtering on UUID-typed columns. This commit raises a NotImplementedError with a clear message when such a filter is attempted.
1 parent 7425bc4 commit 3e017c4

File tree

3 files changed

+27
-20
lines changed

3 files changed

+27
-20
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,10 @@
203203
MAP_VALUE_NAME = "value"
204204
DOC = "doc"
205205
UTC_ALIASES = {"UTC", "+00:00", "Etc/UTC", "Z"}
206+
UUID_FILTER_NOT_SUPPORTED_ERROR_MESSAGE = (
207+
f"Filtering on UUID columns is not supported by the installed PyArrow version ({pa.__version__})"
208+
)
209+
206210

207211
T = TypeVar("T")
208212

@@ -1641,7 +1645,12 @@ def _task_to_record_batches(
16411645
bound_row_filter, file_schema, case_sensitive=case_sensitive, projected_field_values=projected_missing_fields
16421646
)
16431647
bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
1644-
pyarrow_filter = expression_to_pyarrow(bound_file_filter, file_schema)
1648+
try:
1649+
pyarrow_filter = expression_to_pyarrow(bound_file_filter, file_schema)
1650+
except pyarrow.lib.ArrowNotImplementedError as e:
1651+
if "arrow.uuid" in str(e):
1652+
raise NotImplementedError(UUID_FILTER_NOT_SUPPORTED_ERROR_MESSAGE) from e
1653+
raise
16451654

16461655
file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
16471656

tests/integration/test_reads.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -820,17 +820,20 @@ def test_partitioned_tables(catalog: Catalog) -> None:
820820
@pytest.mark.parametrize("catalog", [lf("session_catalog_hive"), lf("session_catalog")])
821821
def test_unpartitioned_uuid_table(catalog: Catalog) -> None:
822822
unpartitioned_uuid = catalog.load_table("default.test_uuid_and_fixed_unpartitioned")
823-
arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col == '102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow()
824-
assert arrow_table_eq["uuid_col"].to_pylist() == [uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967")]
825-
826-
arrow_table_neq = unpartitioned_uuid.scan(
827-
row_filter="uuid_col != '102cb62f-e6f8-4eb0-9973-d9b012ff0967' and uuid_col != '639cccce-c9d2-494a-a78c-278ab234f024'"
828-
).to_arrow()
829-
assert arrow_table_neq["uuid_col"].to_pylist() == [
830-
uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226"),
831-
uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b"),
832-
uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e"),
833-
]
823+
try:
824+
arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col == '102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow()
825+
assert arrow_table_eq["uuid_col"].to_pylist() == [uuid.UUID("102cb62f-e6f8-4eb0-9973-d9b012ff0967")]
826+
827+
arrow_table_neq = unpartitioned_uuid.scan(
828+
row_filter="uuid_col != '102cb62f-e6f8-4eb0-9973-d9b012ff0967' and uuid_col != '639cccce-c9d2-494a-a78c-278ab234f024'"
829+
).to_arrow()
830+
assert arrow_table_neq["uuid_col"].to_pylist() == [
831+
uuid.UUID("ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226"),
832+
uuid.UUID("c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b"),
833+
uuid.UUID("923dae77-83d6-47cd-b4b0-d383e64ee57e"),
834+
]
835+
except NotImplementedError as e:
836+
assert "Filtering on UUID columns is not supported" in str(e)
834837

835838

836839
@pytest.mark.integration
@@ -840,14 +843,11 @@ def test_unpartitioned_fixed_table(catalog: Catalog) -> None:
840843
arrow_table_eq = fixed_table.scan(row_filter=EqualTo("fixed_col", b"1234567890123456789012345")).to_arrow()
841844
assert arrow_table_eq["fixed_col"].to_pylist() == [b"1234567890123456789012345"]
842845

843-
arrow_table_neq = fixed_table.scan(
844-
row_filter=And(
845-
NotEqualTo("fixed_col", b"1234567890123456789012345"), NotEqualTo("uuid_col", "c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b")
846-
)
847-
).to_arrow()
846+
arrow_table_neq = fixed_table.scan(row_filter=NotEqualTo("fixed_col", b"1234567890123456789012345")).to_arrow()
848847
assert arrow_table_neq["fixed_col"].to_pylist() == [
849848
b"1231231231231231231231231",
850849
b"12345678901234567ass12345",
850+
b"asdasasdads12312312312111",
851851
b"qweeqwwqq1231231231231111",
852852
]
853853

tests/integration/test_writes/test_writes.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2123,9 +2123,7 @@ def test_uuid_partitioning(session_catalog: Catalog, spark: SparkSession, transf
21232123
},
21242124
schema=pa.schema(
21252125
[
2126-
# Uuid not yet supported, so we have to stick with `binary(16)`
2127-
# https://github.com/apache/arrow/issues/46468
2128-
pa.field("uuid", pa.binary(16), nullable=False),
2126+
pa.field("uuid", pa.uuid(), nullable=False),
21292127
]
21302128
),
21312129
)

0 commit comments

Comments
 (0)