Merged

Changes from 2 commits
6 changes: 5 additions & 1 deletion mkdocs/docs/configuration.md
@@ -299,4 +299,8 @@ PyIceberg uses multiple threads to parallelize operations. The number of workers

# Backward Compatibility

- Previous versions of Java (`<1.4.0`) implementations incorrectly assume the optional attribute `current-snapshot-id` to be a required attribute in TableMetadata. This means that if `current-snapshot-id` is missing in the metadata file (e.g. on table creation), the application will throw an exception without being able to load the table. This assumption has been corrected in more recent Iceberg versions. However, it is possible to force PyIceberg to create a table with a metadata file that will be compatible with previous versions. This can be configured by setting the `legacy-current-snapshot-id` entry as "True" in the configuration file, or by setting the `PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID` environment variable. Refer to the [PR discussion](https://github.com/apache/iceberg-python/pull/473) for more details on the issue
+ Previous versions of Java (`<1.4.0`) implementations incorrectly assume the optional attribute `current-snapshot-id` to be a required attribute in TableMetadata. This means that if `current-snapshot-id` is missing in the metadata file (e.g. on table creation), the application will throw an exception without being able to load the table. This assumption has been corrected in more recent Iceberg versions. However, it is possible to force PyIceberg to create a table with a metadata file that will be compatible with previous versions. This can be configured by setting the `legacy-current-snapshot-id` property as "True" in the configuration file, or by setting the `PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID` environment variable. Refer to the [PR discussion](https://github.com/apache/iceberg-python/pull/473) for more details on the issue
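Either setting can be sketched as follows (illustrative; by default PyIceberg reads its YAML configuration from `~/.pyiceberg.yaml`):

```python
import os

# Equivalent to `legacy-current-snapshot-id: "True"` in the configuration file;
# must be set before PyIceberg loads its configuration.
os.environ["PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID"] = "True"
```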

# Nanoseconds Support

PyIceberg currently only supports up to microsecond precision in its TimestampType. PyArrow timestamp types in 's' and 'ms' precision will be upcast automatically to 'us' precision timestamps on write. Timestamps in 'ns' precision can also be downcast automatically on write if desired. This can be configured by setting the `downcast-ns-timestamp-on-write` property as "True" in the configuration file, or by setting the `PYICEBERG_DOWNCAST_NS_TIMESTAMP_ON_WRITE` environment variable. Refer to the [nanoseconds timestamp proposal document](https://docs.google.com/document/d/1bE1DcEGNzZAMiVJSZ0X1wElKLNkT9kRkk0hDlfkXzvU/edit#heading=h.ibflcctc9i1d) for more details on the long term roadmap for nanoseconds support
24 changes: 19 additions & 5 deletions pyiceberg/io/pyarrow.py
@@ -154,6 +154,7 @@
UUIDType,
)
from pyiceberg.utils.concurrent import ExecutorFactory
from pyiceberg.utils.config import Config
from pyiceberg.utils.datetime import millis_to_datetime
from pyiceberg.utils.singleton import Singleton
from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string
@@ -918,11 +919,24 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType:
return TimeType()
elif pa.types.is_timestamp(primitive):
primitive = cast(pa.TimestampType, primitive)
if primitive.unit == "us":
if primitive.tz == "UTC" or primitive.tz == "+00:00":
return TimestamptzType()
elif primitive.tz is None:
return TimestampType()
if primitive.unit in ("s", "ms", "us"):
# Supported types, will be upcast automatically to 'us'
Contributor:
This is nice 👍

pass
elif primitive.unit == "ns":
if Config().get_bool("downcast-ns-timestamp-on-write"):
Contributor @HonahX (Jun 24, 2024):

How about making downcast_ns_timestamp a parameter of pyarrow_to_schema, and reading the Config from the yml when we use this API on write? pyarrow_to_schema itself seems to be a useful public API so it may be good to explicitly reveal the optional downcast. This will also help mitigate an edge case:

Since pyarrow_to_schema is used for both read/write, enabling this option also allows unit ns to pass the schema conversion when reading. For example, If users add a parquet file with ns timestamp and try to read the table as arrow, they will find the read process pass the pyarrow_to_schema check and stops at to_request_schema with

 pyarrow.lib.ArrowInvalid: Casting from timestamp[ns] to timestamp[us] would lose data:

Collaborator (Author):
Thank you for raising this @HonahX - I think this is an important bad case to consider

I actually don't think it'll stop when it reads through to_requested_schema: it will detect that the pyarrow types are different, but their IcebergTypes are the same, and it will cast on read, silently dropping the precision:

elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=False)) != values.type:

This logic was introduced to support casting small and large types interchangeably: different pyarrow types that can be mapped to the same IcebergType (string, large_string) can be cast and read through as the same PyArrow type

The only thing that blocks this write from succeeding currently is the pyarrow_to_schema call which fails to generate a corresponding IcebergSchema based on the provided pyarrow schema, which this PR seeks to fix.

I do think that the silent downcasting of data is problematic - but that isn't the only problematic aspect of the add_files API. add_files does not check for the validity of the schema, because we provide a list of files into the API. Currently, it is up to the user to ensure that the file they want to add is in the correct format, and own the risk of potentially introducing wrong data files into the table. We note that the API is only intended for expert users, which is similar to the warnings we have for the other existing Table migration procedures.

Do you think it would be helpful to decouple this concern to that of the idea of introducing an optional schema check for the add_files procedure?

Collaborator (Author):
On a tangent, I'd like to raise another point for discussion:

If we are aware that nanoseconds will be introduced as a separate IcebergType, would introducing a pa.timestamp(unit="ns") -> TimestampType introduce too much complexity, since we will have to maintain the logic for one to many mapping for pa.timestamp(unit="ns") -> TimestampType, TimestampNsType based on the format-version of the Iceberg table? Is introducing automated conversion of ns precision timestamp really worth the complexity we are introducing in the near future?

@corleyma (Jun 27, 2024):
I still think @HonahX raises a good point about the schema_to_pyarrow method being a useful public API, and it would be nice for its behavior to not be too tightly coupled to pyiceberg config. I.e., I agree that it's wiser to parameterize the behavior and determine the correct parameter to use via config where it's called.

Collaborator (Author) @sungwy (Jun 28, 2024):
Thank you for your input, @corleyma. Just clarifying here - what enables us to write ns into TimestampType in PyIceberg is this proposed change in ConvertToIceberg, which is not in schema_to_pyarrow. It is actually in pyarrow_to_schema which is used to check schema compatibility on write. Once the data file is written, we are making the assumption that TimestampType is all in 'us' precision, or that it is safe to cast to 'us' precision, because the writer has already made the decision to write into 'us' precision timestamps.

> If we are aware that nanoseconds will be introduced as a separate IcebergType, would introducing a pa.timestamp(unit="ns") -> TimestampType introduce too much complexity, since we will have to maintain the logic for one to many mapping for pa.timestamp(unit="ns") -> TimestampType, TimestampNsType based on the format-version of the Iceberg table? Is introducing automated conversion of ns precision timestamp really worth the complexity we are introducing in the near future?

@Fokko , @HonahX and @corleyma : I'd like to gather some feedback on this point before committing to introducing this flag. My worry is that since there's a new type that's being introduced in V3 Spec that will actually be in 'ns', enabling 'ns' casting on the existing 'us' precision TimestampType will complicate the type conversions, dooming us to have to check the type (TimestampType, TimestampNsType), downcast-to-ns boolean flag, and the format-version whenever we are casting timestamps. I'd like for us to weigh that trade off carefully and decide on whether supporting this conversion is worth the complexity we are introducing into the conversion functions.

Contributor:

Thanks @HonahX for giving the example, I just gave this a spin and ran into the following:

@pytest.mark.integration
def test_timestamp_tz(
    session_catalog: Catalog, format_version: int, mocker: MockerFixture
) -> None:
    nanoseconds_schema_iceberg = Schema(
        NestedField(1, "quux", TimestamptzType())
    )

    nanoseconds_schema = pa.schema([
        ("quux", pa.timestamp("ns", tz="UTC")),
    ])

    arrow_table = pa.Table.from_pylist(
        [
            {
                "quux": 1615967687249846175,  # 2021-03-17 07:54:47.249846175
            }
        ],
        schema=nanoseconds_schema,
    )
    mocker.patch.dict(os.environ, values={"PYICEBERG_DOWNCAST_NS_TIMESTAMP_ON_WRITE": "True"})

    identifier = f"default.abccccc{format_version}"

    try:
        session_catalog.drop_table(identifier=identifier)
    except NoSuchTableError:
        pass

    tbl = session_catalog.create_table(
        identifier=identifier,
        schema=nanoseconds_schema_iceberg,
        properties={"format-version": str(format_version)},
        partition_spec=PartitionSpec(),
    )

    file_paths = [f"s3://warehouse/default/test_timestamp_tz/v{format_version}/test-{i}.parquet" for i in range(5)]
    # write parquet files
    for file_path in file_paths:
        fo = tbl.io.new_output(file_path)
        with fo.create(overwrite=True) as fos:
            with pq.ParquetWriter(fos, schema=nanoseconds_schema) as writer:
                writer.write_table(arrow_table)

    # add the parquet files as data files
    tbl.add_files(file_paths=file_paths)

    print(tbl.scan().to_arrow())

I think we can force the cast to be unsafe:

return values.cast(target_type, safe=False)

We might want to check if we only apply this when doing the nanos to micros. I'm not sure what will happen when we do other lossy conversions.

Contributor @Fokko (Jul 5, 2024):

I also got some issues with the nanosecond timestamp when collecting statistics:

>   ???
E   ValueError: Nanosecond resolution temporal type 1615967687249846175 is not safely convertible to microseconds to convert to datetime.datetime. Install pandas to return as Timestamp with nanosecond support or access the .value attribute.

At the lines:

col_aggs[field_id].update_min(statistics.min)
col_aggs[field_id].update_max(statistics.max)

This got fixed after updating this to:

                    col_aggs[field_id].update_min(statistics.min_raw)
                    col_aggs[field_id].update_max(statistics.max_raw)

Collaborator (Author):

Hi folks - thank you all for the valuable feedback. So it sounds like what we want is for the behavior to be controlled by the configuration flag, but for that flag to be passed as a parameter to the pyarrow_to_schema API so that its behavior can be fully controlled by its input parameters.

I've made the following changes:

  1. Introduced downcast_ns_timestamp_to_us as a new input parameter to pyarrow_to_schema and to_requested_schema public APIs
  2. Now table and catalog level functions infer the flag from the Config on write. (e.g. _check_schema_compatible and _convert_schema_if_needed)
  3. Always downcast ns to us on read, if there is ns timestamp in the parquet file (we will want to revise this behavior when we introduce nanosecond support in V3 spec, but until then, I think it's a reasonable assumption that data files that are in Iceberg will only be read with microseconds precision). https://github.com/apache/iceberg-python/pull/848/files#diff-8d5e63f2a87ead8cebe2fd8ac5dcf2198d229f01e16bb9e06e21f7277c328abdR1030-R1033

Collaborator (Author):

> I also got some issues with the nanosecond timestamp when collecting statistics:
>
> >   ???
> E   ValueError: Nanosecond resolution temporal type 1615967687249846175 is not safely convertible to microseconds to convert to datetime.datetime. Install pandas to return as Timestamp with nanosecond support or access the .value attribute.
>
> At the lines:
>
> col_aggs[field_id].update_min(statistics.min)
> col_aggs[field_id].update_max(statistics.max)
>
> This got fixed after updating this to:
>
> col_aggs[field_id].update_min(statistics.min_raw)
> col_aggs[field_id].update_max(statistics.max_raw)

I tried making this change and realized that this causes our serialization to break because it introduces bytes values in our statistics, which cannot be serialized (since it already is). I will need to spend a bit more time to figure out the right change to StatsAggregator to support this change. I also failed to reproduce this issue in my environment (possibly because it has pandas installed) so I'm reverting this change for now.

Contributor @Fokko (Jul 6, 2024):
> I also failed to reproduce this issue in my environment (possibly because it has pandas installed) so I'm reverting this change for now.

Ah, of course. One of the few upsides of having a fresh Macbook.

logger.warning("Iceberg does not yet support 'ns' timestamp precision. Downcasting to 'us'.")
else:
raise TypeError(
"Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-on-write' configuration property to automatically downcast 'ns' to 'us' on write."
)
else:
raise TypeError(f"Unsupported precision for timestamp type: {primitive.unit}")

if primitive.tz == "UTC" or primitive.tz == "+00:00":
return TimestamptzType()
elif primitive.tz is None:
return TimestampType()

elif pa.types.is_binary(primitive) or pa.types.is_large_binary(primitive):
return BinaryType()
elif pa.types.is_fixed_size_binary(primitive):
69 changes: 68 additions & 1 deletion tests/integration/test_writes/test_writes.py
@@ -18,7 +18,7 @@
import math
import os
import time
- from datetime import date, datetime
+ from datetime import date, datetime, timezone
from pathlib import Path
from typing import Any, Dict
from urllib.parse import urlparse
@@ -925,3 +925,70 @@ def table_write_subset_of_schema(session_catalog: Catalog, arrow_table_with_null
tbl.append(arrow_table_without_some_columns)
# overwrite and then append should produce twice the data
assert len(tbl.scan().to_arrow()) == len(arrow_table_without_some_columns) * 2


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_write_all_timestamp_precision(mocker: MockerFixture, session_catalog: Catalog, format_version: int) -> None:
identifier = "default.table_all_timestamp_precision"
arrow_table_schema_with_all_timestamp_precisions = pa.schema([
("timestamp_s", pa.timestamp(unit="s")),
("timestamptz_s", pa.timestamp(unit="s", tz="UTC")),
("timestamp_ms", pa.timestamp(unit="ms")),
("timestamptz_ms", pa.timestamp(unit="ms", tz="UTC")),
("timestamp_us", pa.timestamp(unit="us")),
("timestamptz_us", pa.timestamp(unit="us", tz="UTC")),
("timestamp_ns", pa.timestamp(unit="ns")),
("timestamptz_ns", pa.timestamp(unit="ns", tz="UTC")),
])
TEST_DATA_WITH_NULL = {
"timestamp_s": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
"timestamptz_s": [
datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
None,
datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
],
"timestamp_ms": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
"timestamptz_ms": [
datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
None,
datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
],
"timestamp_us": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
"timestamptz_us": [
datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
None,
datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
],
"timestamp_ns": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
"timestamptz_ns": [
datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
None,
datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
],
}
input_arrow_table = pa.Table.from_pydict(TEST_DATA_WITH_NULL, schema=arrow_table_schema_with_all_timestamp_precisions)
mocker.patch.dict(os.environ, values={"PYICEBERG_DOWNCAST_NS_TIMESTAMP_ON_WRITE": "True"})

tbl = _create_table(
session_catalog,
identifier,
{"format-version": format_version},
data=[input_arrow_table],
schema=arrow_table_schema_with_all_timestamp_precisions,
)
tbl.overwrite(input_arrow_table)
written_arrow_table = tbl.scan().to_arrow()

expected_schema_in_all_us = pa.schema([
("timestamp_s", pa.timestamp(unit="us")),
("timestamptz_s", pa.timestamp(unit="us", tz="UTC")),
("timestamp_ms", pa.timestamp(unit="us")),
("timestamptz_ms", pa.timestamp(unit="us", tz="UTC")),
("timestamp_us", pa.timestamp(unit="us")),
("timestamptz_us", pa.timestamp(unit="us", tz="UTC")),
("timestamp_ns", pa.timestamp(unit="us")),
("timestamptz_ns", pa.timestamp(unit="us", tz="UTC")),
])
assert written_arrow_table.schema == expected_schema_in_all_us
assert written_arrow_table == input_arrow_table.cast(expected_schema_in_all_us)
38 changes: 21 additions & 17 deletions tests/io/test_pyarrow_visitor.py
@@ -15,10 +15,12 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=protected-access,unused-argument,redefined-outer-name
import os
import re

import pyarrow as pa
import pytest
from pytest_mock.plugin import MockerFixture

from pyiceberg.io.pyarrow import (
_ConvertToArrowSchema,
@@ -161,22 +163,25 @@ def test_pyarrow_time64_ns_to_iceberg() -> None:
visit_pyarrow(pyarrow_type, _ConvertToIceberg())


def test_pyarrow_timestamp_to_iceberg() -> None:
pyarrow_type = pa.timestamp(unit="us")
@pytest.mark.parametrize("precision", ["s", "ms", "us", "ns"])
def test_pyarrow_timestamp_to_iceberg(mocker: MockerFixture, precision: str) -> None:
mocker.patch.dict(os.environ, values={"PYICEBERG_DOWNCAST_NS_TIMESTAMP_ON_WRITE": "True"})

pyarrow_type = pa.timestamp(unit=precision)
converted_iceberg_type = visit_pyarrow(pyarrow_type, _ConvertToIceberg())
assert converted_iceberg_type == TimestampType()
assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pyarrow_type
# all timestamp types are converted to 'us' precision
assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pa.timestamp(unit="us")


def test_pyarrow_timestamp_invalid_units() -> None:
pyarrow_type = pa.timestamp(unit="ms")
with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[ms]")):
visit_pyarrow(pyarrow_type, _ConvertToIceberg())
pyarrow_type = pa.timestamp(unit="s")
with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[s]")):
visit_pyarrow(pyarrow_type, _ConvertToIceberg())
pyarrow_type = pa.timestamp(unit="ns")
with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[ns]")):
with pytest.raises(
TypeError,
match=re.escape(
"Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-on-write' configuration property to automatically downcast 'ns' to 'us' on write."
),
):
visit_pyarrow(pyarrow_type, _ConvertToIceberg())


@@ -192,14 +197,13 @@ def test_pyarrow_timestamp_tz_to_iceberg() -> None:


def test_pyarrow_timestamp_tz_invalid_units() -> None:
pyarrow_type = pa.timestamp(unit="ms", tz="UTC")
with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[ms, tz=UTC]")):
visit_pyarrow(pyarrow_type, _ConvertToIceberg())
pyarrow_type = pa.timestamp(unit="s", tz="UTC")
with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[s, tz=UTC]")):
visit_pyarrow(pyarrow_type, _ConvertToIceberg())
pyarrow_type = pa.timestamp(unit="ns", tz="UTC")
with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[ns, tz=UTC]")):
with pytest.raises(
TypeError,
match=re.escape(
"Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-on-write' configuration property to automatically downcast 'ns' to 'us' on write."
),
):
visit_pyarrow(pyarrow_type, _ConvertToIceberg())

