Skip to content

Commit fd26ee4

Browse files
committed
Trino: Restructure to make Trino integration optional and modular
Address reviewer feedback from kevinjqliu:

1. Consolidated Trino infrastructure:
   - All Trino config files remain in dev/trino/ directory
   - docker-compose-trino.yml moved to dev/ (alongside integration compose)
   - run-trino.sh moved to dev/ (alongside other run scripts)

2. Removed Trino from main integration docker-compose:
   - Trino service removed from dev/docker-compose-integration.yml
   - Trino can now be spun up separately alongside main integration
   - Keeps Trino testing optional and not part of CI

3. Created dedicated test file:
   - tests/integration/test_trino.py for all Trino-specific tests
   - Moved test_schema_exists_in_trino from test_rest_catalog.py
   - Moved test_uuid_partitioning_with_trino from test_writes.py
   - Better separation of concerns and easier to maintain

4. Simplified pytest marker:
   - Changed from @pytest.mark.integration_trino to @pytest.mark.trino
   - Updated Makefile target: test-integration-trino -> test-trino
   - Updated pyproject.toml and conftest.py references

This makes Trino integration testing opt-in and follows the same pattern as other optional test suites (s3, adls, gcs).
1 parent 311f7b6 commit fd26ee4

File tree

7 files changed

+121
-100
lines changed

7 files changed

+121
-100
lines changed

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,9 @@ test-integration-rebuild: ## Rebuild integration Docker services from scratch
117117
docker compose -f dev/docker-compose-integration.yml rm -f
118118
docker compose -f dev/docker-compose-integration.yml build --no-cache
119119

120-
test-integration-trino: ## Run tests marked with @pytest.mark.integration_trino
120+
test-trino: ## Run tests marked with @pytest.mark.trino
121121
sh ./dev/run-trino.sh
122-
$(TEST_RUNNER) pytest tests/ -m integration_trino $(PYTEST_ARGS)
122+
$(TEST_RUNNER) pytest tests/ -m trino $(PYTEST_ARGS)
123123

124124
test-s3: ## Run tests marked with @pytest.mark.s3
125125
sh ./dev/run-minio.sh
@@ -134,7 +134,7 @@ test-gcs: ## Run tests marked with @pytest.mark.gcs
134134
$(TEST_RUNNER) pytest tests/ -m gcs $(PYTEST_ARGS)
135135

136136
test-coverage: COVERAGE=1
137-
test-coverage: test test-integration test-integration-trino test-s3 test-adls test-gcs coverage-report ## Run all tests with coverage and report
137+
test-coverage: test test-integration test-trino test-s3 test-adls test-gcs coverage-report ## Run all tests with coverage and report
138138

139139
coverage-report: ## Combine and report coverage
140140
uv run $(PYTHON_ARG) coverage combine

dev/docker-compose-integration.yml

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -58,20 +58,6 @@ services:
5858
- CATALOG_S3_ENDPOINT=http://minio:9000
5959
- CATALOG_JDBC_STRICT__MODE=true
6060

61-
trino:
62-
image: trinodb/trino:478
63-
container_name: pyiceberg-trino
64-
networks:
65-
iceberg_net:
66-
ports:
67-
- 8082:8080
68-
environment:
69-
- CATALOG_MANAGEMENT=dynamic
70-
depends_on:
71-
- rest
72-
- hive
73-
volumes:
74-
- ./trino/catalog:/etc/trino/catalog
7561
minio:
7662
image: minio/minio
7763
container_name: pyiceberg-minio

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ markers = [
154154
"s3: marks a test as requiring access to s3 compliant storage (use with --aws-access-key-id, --aws-secret-access-key, and --endpoint args)",
155155
"adls: marks a test as requiring access to adls compliant storage (use with --adls.account-name, --adls.account-key, and --adls.endpoint args)",
156156
"integration: marks integration tests against Apache Spark",
157-
"integration_trino: marks integration tests against Trino",
157+
"trino: marks integration tests against Trino",
158158
"gcs: marks a test as requiring access to gcs compliant storage (use with --gs.token, --gs.project, and --gs.endpoint)",
159159
"benchmark: collection of tests to validate read/write performance before and after a change",
160160
]

tests/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,13 +148,13 @@ def pytest_addoption(parser: pytest.Parser) -> None:
148148
"--trino.rest.endpoint",
149149
action="store",
150150
default="trino://test@localhost:8082/warehouse_rest",
151-
help="The Trino REST endpoint URL for tests marked as integration_trino",
151+
help="The Trino REST endpoint URL for tests marked as trino",
152152
)
153153
parser.addoption(
154154
"--trino.hive.endpoint",
155155
action="store",
156156
default="trino://test@localhost:8082/warehouse_hive",
157-
help="The Trino Hive endpoint URL for tests marked as integration_trino",
157+
help="The Trino Hive endpoint URL for tests marked as trino",
158158
)
159159

160160

tests/integration/test_rest_catalog.py

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -62,22 +62,3 @@ def test_create_namespace_if_already_existing(catalog: RestCatalog) -> None:
6262
catalog.create_namespace_if_not_exists(TEST_NAMESPACE_IDENTIFIER)
6363

6464
assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)
65-
66-
67-
@pytest.mark.integration
68-
@pytest.mark.integration_trino
69-
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog")])
70-
def test_schema_exists_in_trino(trino_rest_conn: Connection, catalog: RestCatalog) -> None:
71-
"""Verifies that an Iceberg namespace correctly appears as a schema in Trino.
72-
73-
This test ensures the synchronization between Iceberg's namespace concept and
74-
Trino's schema concept, confirming that after creating a namespace in the Iceberg
75-
catalog, it becomes visible as a schema in the Trino environment.
76-
"""
77-
78-
if catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER):
79-
catalog.drop_namespace(TEST_NAMESPACE_IDENTIFIER)
80-
catalog.create_namespace_if_not_exists(TEST_NAMESPACE_IDENTIFIER)
81-
82-
assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)
83-
assert TEST_NAMESPACE_IDENTIFIER.lower() in inspect(trino_rest_conn).get_schema_names()

tests/integration/test_trino.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
"""Integration tests for Trino engine."""
18+
19+
import uuid
20+
21+
import pyarrow as pa
22+
import pytest
23+
from sqlalchemy import Connection, inspect, text
24+
from sqlalchemy.engine import Engine
25+
26+
from pyiceberg.catalog import Catalog
27+
from pyiceberg.catalog.rest import RestCatalog
28+
from pyiceberg.exceptions import NoSuchTableError
29+
from pyiceberg.partitioning import PartitionField, PartitionSpec
30+
from pyiceberg.schema import Schema
31+
from pyiceberg.transforms import BucketTransform, IdentityTransform, Transform
32+
from pyiceberg.types import NestedField, UUIDType
33+
34+
TEST_NAMESPACE = "test_trino_namespace"
35+
TEST_NAMESPACE_IDENTIFIER = (TEST_NAMESPACE,)
36+
37+
38+
@pytest.mark.trino
39+
def test_schema_exists_in_trino(trino_rest_conn: Connection, catalog: RestCatalog) -> None:
40+
"""Verifies that an Iceberg namespace correctly appears as a schema in Trino.
41+
42+
This test ensures the synchronization between Iceberg's namespace concept and
43+
Trino's schema concept, confirming that after creating a namespace in the Iceberg
44+
catalog, it becomes visible as a schema in the Trino environment.
45+
"""
46+
if catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER):
47+
catalog.drop_namespace(TEST_NAMESPACE_IDENTIFIER)
48+
catalog.create_namespace_if_not_exists(TEST_NAMESPACE_IDENTIFIER)
49+
50+
assert catalog.namespace_exists(TEST_NAMESPACE_IDENTIFIER)
51+
assert TEST_NAMESPACE_IDENTIFIER[0].lower() in inspect(trino_rest_conn).get_schema_names()
52+
53+
54+
@pytest.mark.trino
55+
@pytest.mark.parametrize(
56+
"transform",
57+
[
58+
IdentityTransform(),
59+
BucketTransform(num_buckets=32),
60+
],
61+
)
62+
@pytest.mark.parametrize(
63+
"catalog,trino_conn",
64+
[
65+
(pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("trino_hive_conn")),
66+
(pytest.lazy_fixture("session_catalog"), pytest.lazy_fixture("trino_rest_conn")),
67+
],
68+
)
69+
def test_uuid_partitioning_with_trino(catalog: Catalog, trino_conn: Connection, transform: Transform) -> None: # type: ignore
70+
"""Test UUID partitioning using Trino engine.
71+
72+
This test verifies that UUID-partitioned tables created via PyIceberg can be
73+
correctly queried through Trino. It tests both Identity and Bucket transforms
74+
on UUID columns, which are not fully supported in Spark but work in Trino.
75+
"""
76+
identifier = f"default.test_uuid_partitioning_{str(transform).replace('[32]', '')}"
77+
78+
schema = Schema(NestedField(field_id=1, name="uuid", field_type=UUIDType(), required=True))
79+
80+
try:
81+
catalog.drop_table(identifier=identifier)
82+
except NoSuchTableError:
83+
pass
84+
85+
partition_spec = PartitionSpec(
86+
PartitionField(source_id=1, field_id=1000, transform=transform, name=f"uuid_{str(transform).replace('[32]', '')}")
87+
)
88+
89+
arr_table = pa.Table.from_pydict(
90+
{
91+
"uuid": [
92+
uuid.UUID("00000000-0000-0000-0000-000000000000").bytes,
93+
uuid.UUID("11111111-1111-1111-1111-111111111111").bytes,
94+
],
95+
},
96+
schema=pa.schema(
97+
[
98+
# Uuid not yet supported, so we have to stick with `binary(16)`
99+
# https://github.com/apache/arrow/issues/46468
100+
pa.field("uuid", pa.binary(16), nullable=False),
101+
]
102+
),
103+
)
104+
105+
tbl = catalog.create_table(
106+
identifier=identifier,
107+
schema=schema,
108+
partition_spec=partition_spec,
109+
)
110+
111+
tbl.append(arr_table)
112+
rows = trino_conn.execute(text(f"SELECT * FROM {identifier}")).fetchall()
113+
lhs = sorted([r[0] for r in rows])
114+
rhs = sorted([u.as_py() for u in tbl.scan().to_arrow()["uuid"].combine_chunks()])
115+
assert lhs == rhs

tests/integration/test_writes/test_writes.py

Lines changed: 0 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -2148,67 +2148,6 @@ def test_uuid_partitioning(session_catalog: Catalog, spark: SparkSession, transf
21482148
lhs = [r[0] for r in spark.table(identifier).collect()]
21492149
rhs = [str(u.as_py()) for u in tbl.scan().to_arrow()["uuid"].combine_chunks()]
21502150
assert lhs == rhs
2151-
2152-
2153-
@pytest.mark.integration_trino
2154-
@pytest.mark.integration
2155-
@pytest.mark.parametrize(
2156-
"transform",
2157-
[IdentityTransform(), BucketTransform(32)],
2158-
)
2159-
@pytest.mark.parametrize(
2160-
"catalog, trino_conn",
2161-
[
2162-
(pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("trino_hive_conn")),
2163-
(pytest.lazy_fixture("session_catalog"), pytest.lazy_fixture("trino_rest_conn")),
2164-
],
2165-
)
2166-
def test_uuid_partitioning_with_trino(catalog: Catalog, trino_conn: Connection, transform: Transform) -> None: # type: ignore
2167-
identifier = f"default.test_uuid_partitioning_{str(transform).replace('[32]', '')}"
2168-
2169-
schema = Schema(NestedField(field_id=1, name="uuid", field_type=UUIDType(), required=True))
2170-
2171-
try:
2172-
catalog.drop_table(identifier=identifier)
2173-
except NoSuchTableError:
2174-
pass
2175-
2176-
partition_spec = PartitionSpec(
2177-
PartitionField(source_id=1, field_id=1000, transform=transform, name=f"uuid_{str(transform).replace('[32]', '')}")
2178-
)
2179-
2180-
import pyarrow as pa
2181-
2182-
arr_table = pa.Table.from_pydict(
2183-
{
2184-
"uuid": [
2185-
uuid.UUID("00000000-0000-0000-0000-000000000000").bytes,
2186-
uuid.UUID("11111111-1111-1111-1111-111111111111").bytes,
2187-
],
2188-
},
2189-
schema=pa.schema(
2190-
[
2191-
# Uuid not yet supported, so we have to stick with `binary(16)`
2192-
# https://github.com/apache/arrow/issues/46468
2193-
pa.field("uuid", pa.binary(16), nullable=False),
2194-
]
2195-
),
2196-
)
2197-
2198-
tbl = catalog.create_table(
2199-
identifier=identifier,
2200-
schema=schema,
2201-
partition_spec=partition_spec,
2202-
)
2203-
2204-
tbl.append(arr_table)
2205-
rows = trino_conn.execute(text(f"SELECT * FROM {identifier}")).fetchall()
2206-
lhs = sorted([r[0] for r in rows])
2207-
rhs = sorted([u.as_py() for u in tbl.scan().to_arrow()["uuid"].combine_chunks()])
2208-
assert lhs == rhs
2209-
2210-
2211-
@pytest.mark.integration
22122151
def test_avro_compression_codecs(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
22132152
identifier = "default.test_avro_compression_codecs"
22142153
tbl = _create_table(session_catalog, identifier, schema=arrow_table_with_null.schema, data=[arrow_table_with_null])

0 commit comments

Comments (0)