Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions mkdocs/docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,25 @@ catalog.create_table(
)
```

To create a table using a pyarrow schema:

```python
import pyarrow as pa

schema = pa.schema(
[
pa.field("foo", pa.string(), nullable=True),
pa.field("bar", pa.int32(), nullable=False),
pa.field("baz", pa.bool_(), nullable=True),
]
)

catalog.create_table(
identifier="docs_example.bids",
schema=schema,
)
```

## Load a table

### Catalog table
Expand Down
22 changes: 21 additions & 1 deletion pyiceberg/catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from dataclasses import dataclass
from enum import Enum
from typing import (
TYPE_CHECKING,
Callable,
Dict,
List,
Expand Down Expand Up @@ -56,6 +57,9 @@
)
from pyiceberg.utils.config import Config, merge_config

if TYPE_CHECKING:
import pyarrow as pa

logger = logging.getLogger(__name__)

_ENV_CONFIG = Config()
Expand Down Expand Up @@ -288,7 +292,7 @@ def _load_file_io(self, properties: Properties = EMPTY_DICT, location: Optional[
def create_table(
self,
identifier: Union[str, Identifier],
schema: Schema,
schema: Union[Schema, "pa.Schema"],
location: Optional[str] = None,
partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
sort_order: SortOrder = UNSORTED_SORT_ORDER,
Expand Down Expand Up @@ -512,6 +516,22 @@ def _check_for_overlap(removals: Optional[Set[str]], updates: Properties) -> Non
if overlap:
raise ValueError(f"Updates and deletes have an overlap: {overlap}")

@staticmethod
def _convert_schema_if_needed(schema: Union[Schema, "pa.Schema"]) -> Schema:
    """Normalize the user-supplied schema to a pyiceberg ``Schema``.

    Args:
        schema: Either a ``pyiceberg.schema.Schema`` (returned unchanged) or a
            ``pyarrow.Schema`` (converted to an Iceberg schema with freshly
            assigned field IDs).

    Returns:
        Schema: The equivalent pyiceberg schema.

    Raises:
        ValueError: If ``schema`` is neither a pyiceberg ``Schema`` nor a
            pyarrow ``Schema`` — including the case where it might be a
            pyarrow schema but pyarrow is not installed.
    """
    if isinstance(schema, Schema):
        return schema
    try:
        # Imported lazily so that pyarrow remains an optional dependency;
        # a missing pyarrow simply falls through to the ValueError below.
        import pyarrow as pa

        from pyiceberg.io.pyarrow import _ConvertToIcebergWithFreshIds, pre_order_visit_pyarrow

        if isinstance(schema, pa.Schema):
            # Fresh field IDs are assigned during conversion since a pyarrow
            # schema carries no Iceberg field-ID metadata of its own.
            schema: Schema = pre_order_visit_pyarrow(schema, _ConvertToIcebergWithFreshIds())  # type: ignore
            return schema
    except ModuleNotFoundError:
        pass
    raise ValueError(f"{type(schema)=}, but it must be pyiceberg.schema.Schema or pyarrow.Schema")

def _resolve_table_location(self, location: Optional[str], database_name: str, table_name: str) -> str:
if not location:
return self._get_default_warehouse_location(database_name, table_name)
Expand Down
8 changes: 7 additions & 1 deletion pyiceberg/catalog/dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import uuid
from time import time
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
Expand Down Expand Up @@ -57,6 +58,9 @@
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
from pyiceberg.typedef import EMPTY_DICT

if TYPE_CHECKING:
import pyarrow as pa

DYNAMODB_CLIENT = "dynamodb"

DYNAMODB_COL_IDENTIFIER = "identifier"
Expand Down Expand Up @@ -127,7 +131,7 @@ def _dynamodb_table_exists(self) -> bool:
def create_table(
self,
identifier: Union[str, Identifier],
schema: Schema,
schema: Union[Schema, "pa.Schema"],
location: Optional[str] = None,
partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
sort_order: SortOrder = UNSORTED_SORT_ORDER,
Expand All @@ -152,6 +156,8 @@ def create_table(
ValueError: If the identifier is invalid, or no path is given to store metadata.

"""
schema: Schema = self._convert_schema_if_needed(schema) # type: ignore

database_name, table_name = self.identifier_to_database_and_table(identifier)

location = self._resolve_table_location(location, database_name, table_name)
Expand Down
8 changes: 7 additions & 1 deletion pyiceberg/catalog/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@


from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
Expand Down Expand Up @@ -88,6 +89,9 @@
UUIDType,
)

if TYPE_CHECKING:
import pyarrow as pa

# If Glue should skip archiving an old table version when creating a new version in a commit. By
# default, Glue archives all old table versions after an UpdateTable call, but Glue has a default
# max number of archived table versions (can be increased). So for streaming use case with lots
Expand Down Expand Up @@ -329,7 +333,7 @@ def _get_glue_table(self, database_name: str, table_name: str) -> TableTypeDef:
def create_table(
self,
identifier: Union[str, Identifier],
schema: Schema,
schema: Union[Schema, "pa.Schema"],
location: Optional[str] = None,
partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
sort_order: SortOrder = UNSORTED_SORT_ORDER,
Expand All @@ -354,6 +358,8 @@ def create_table(
ValueError: If the identifier is invalid, or no path is given to store metadata.

"""
schema: Schema = self._convert_schema_if_needed(schema) # type: ignore

database_name, table_name = self.identifier_to_database_and_table(identifier)

location = self._resolve_table_location(location, database_name, table_name)
Expand Down
9 changes: 8 additions & 1 deletion pyiceberg/catalog/hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import time
from types import TracebackType
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
Expand Down Expand Up @@ -91,6 +92,10 @@
UUIDType,
)

if TYPE_CHECKING:
import pyarrow as pa


# Replace by visitor
hive_types = {
BooleanType: "boolean",
Expand Down Expand Up @@ -250,7 +255,7 @@ def _convert_hive_into_iceberg(self, table: HiveTable, io: FileIO) -> Table:
def create_table(
self,
identifier: Union[str, Identifier],
schema: Schema,
schema: Union[Schema, "pa.Schema"],
location: Optional[str] = None,
partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
sort_order: SortOrder = UNSORTED_SORT_ORDER,
Expand All @@ -273,6 +278,8 @@ def create_table(
AlreadyExistsError: If a table with the name already exists.
ValueError: If the identifier is invalid.
"""
schema: Schema = self._convert_schema_if_needed(schema) # type: ignore

properties = {**DEFAULT_PROPERTIES, **properties}
database_name, table_name = self.identifier_to_database_and_table(identifier)
current_time_millis = int(time.time() * 1000)
Expand Down
6 changes: 5 additions & 1 deletion pyiceberg/catalog/noop.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.
from typing import (
TYPE_CHECKING,
List,
Optional,
Set,
Expand All @@ -33,12 +34,15 @@
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER
from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties

if TYPE_CHECKING:
import pyarrow as pa


class NoopCatalog(Catalog):
def create_table(
self,
identifier: Union[str, Identifier],
schema: Schema,
schema: Union[Schema, "pa.Schema"],
location: Optional[str] = None,
partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
sort_order: SortOrder = UNSORTED_SORT_ORDER,
Expand Down
8 changes: 7 additions & 1 deletion pyiceberg/catalog/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.
from json import JSONDecodeError
from typing import (
TYPE_CHECKING,
Any,
Dict,
List,
Expand Down Expand Up @@ -68,6 +69,9 @@
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
from pyiceberg.typedef import EMPTY_DICT, UTF8, IcebergBaseModel

if TYPE_CHECKING:
import pyarrow as pa

ICEBERG_REST_SPEC_VERSION = "0.14.1"


Expand Down Expand Up @@ -437,12 +441,14 @@ def _response_to_table(self, identifier_tuple: Tuple[str, ...], table_response:
def create_table(
self,
identifier: Union[str, Identifier],
schema: Schema,
schema: Union[Schema, "pa.Schema"],
location: Optional[str] = None,
partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
sort_order: SortOrder = UNSORTED_SORT_ORDER,
properties: Properties = EMPTY_DICT,
) -> Table:
schema: Schema = self._convert_schema_if_needed(schema) # type: ignore

namespace_and_table = self._split_identifier_for_path(identifier)
request = CreateTableRequest(
name=namespace_and_table["table"],
Expand Down
8 changes: 7 additions & 1 deletion pyiceberg/catalog/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.

from typing import (
TYPE_CHECKING,
List,
Optional,
Set,
Expand Down Expand Up @@ -65,6 +66,9 @@
from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
from pyiceberg.typedef import EMPTY_DICT

if TYPE_CHECKING:
import pyarrow as pa


class SqlCatalogBaseTable(MappedAsDataclass, DeclarativeBase):
pass
Expand Down Expand Up @@ -140,7 +144,7 @@ def _convert_orm_to_iceberg(self, orm_table: IcebergTables) -> Table:
def create_table(
self,
identifier: Union[str, Identifier],
schema: Schema,
schema: Union[Schema, "pa.Schema"],
location: Optional[str] = None,
partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
sort_order: SortOrder = UNSORTED_SORT_ORDER,
Expand All @@ -165,6 +169,8 @@ def create_table(
ValueError: If the identifier is invalid, or no path is given to store metadata.

"""
schema: Schema = self._convert_schema_if_needed(schema) # type: ignore

database_name, table_name = self.identifier_to_database_and_table(identifier)
if not self._namespace_exists(database_name):
raise NoSuchNamespaceError(f"Namespace does not exist: {database_name}")
Expand Down
Loading