fix: preserve timestamp scale and handle non-null list item types

eddietejeda · eddietejeda · commit bddb9f2e3728 · 2026-05-26T14:18:02.000-07:00
Two bugs found by Codex review:

1. Timestamp scale was discarded: Arrow timestamp[ms] should map to
   dt.Timestamp(scale=3), not dt.Timestamp(). Add a unit-to-scale mapping
   (s=0, ms=3, us=6, ns=9), matching the PyArrow convention.

2. Non-nullable list items mis-parsed: PyArrow emits
   'list<item: int32 not null>' for non-nullable item fields. Strip the
   ' not null' suffix and pass nullable=False to the recursive call so
   the element type is correctly typed instead of falling back to Unknown.
diff --git a/src/ibis_hotdata/types.py b/src/ibis_hotdata/types.py
@@ -45,13 +45,23 @@
 _DECIMAL_RE = re.compile(r"^decimal128?\((\d+),\s*(\d+)\)$", re.IGNORECASE)
 _LIST_RE = re.compile(r"^(?:large_)?list<item:\s*(.+)>$", re.IGNORECASE)
 
-# Map Arrow time-unit strings to Ibis IntervalUnit strings.
+# Map Arrow time-unit strings to Ibis IntervalUnit strings and Timestamp scales.
+# Scales follow PyArrow's convention: s→0, ms→3, us→6, ns→9.
 _ARROW_UNIT_TO_IBIS: dict[str, str] = {
     "s": "s",
     "ms": "ms",
     "us": "us",
     "ns": "ns",
 }
+_ARROW_UNIT_TO_TIMESTAMP_SCALE: dict[str, int] = {
+    "s": 0,
+    "ms": 3,
+    "us": 6,
+    "ns": 9,
+}
+
+# Suffix appended by PyArrow when a list's item field is non-nullable.
+_NOT_NULL_SUFFIX_RE = re.compile(r"\s+not\s+null$", re.IGNORECASE)
 
 
 def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | None:
@@ -62,8 +72,10 @@ def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | N
     """
     m = _TIMESTAMP_RE.match(raw)
     if m:
+        unit = m.group(1).lower()
         tz: str | None = m.group(2).strip() if m.group(2) else None
-        return dt.Timestamp(timezone=tz, nullable=nullable)
+        scale: int | None = _ARROW_UNIT_TO_TIMESTAMP_SCALE.get(unit)
+        return dt.Timestamp(timezone=tz, scale=scale, nullable=nullable)
 
     m = _DURATION_RE.match(raw)
     if m:
@@ -76,7 +88,12 @@ def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | N
 
     m = _LIST_RE.match(raw)
     if m:
-        value_type = dtype_from_hotdata_sql_type(m.group(1).strip(), nullable=True)
+        item_raw = m.group(1).strip()
+        # PyArrow appends " not null" for non-nullable item fields; strip it and
+        # pass nullable=False so the element type is marked non-nullable.
+        item_not_null = bool(_NOT_NULL_SUFFIX_RE.search(item_raw))
+        item_str = _NOT_NULL_SUFFIX_RE.sub("", item_raw).strip()
+        value_type = dtype_from_hotdata_sql_type(item_str, nullable=not item_not_null)
         return dt.Array(value_type=value_type, nullable=nullable)
 
     return None
diff --git a/tests/test_hotdata_types.py b/tests/test_hotdata_types.py
@@ -77,21 +77,22 @@ def test_dtype_from_hotdata_arrow_type_names(sql_type, nullable, expected_cls):
 
 
 @pytest.mark.parametrize(
-    ("sql_type", "expected_tz"),
+    ("sql_type", "expected_tz", "expected_scale"),
     [
-        ("timestamp[s]", None),
-        ("timestamp[ms]", None),
-        ("timestamp[us]", None),
-        ("timestamp[ns]", None),
-        ("timestamp[us, tz=UTC]", "UTC"),
-        ("timestamp[us, tz=America/New_York]", "America/New_York"),
-        ("TIMESTAMP[US]", None),
+        ("timestamp[s]", None, 0),
+        ("timestamp[ms]", None, 3),
+        ("timestamp[us]", None, 6),
+        ("timestamp[ns]", None, 9),
+        ("timestamp[us, tz=UTC]", "UTC", 6),
+        ("timestamp[us, tz=America/New_York]", "America/New_York", 6),
+        ("TIMESTAMP[MS]", None, 3),
     ],
 )
-def test_dtype_from_hotdata_arrow_timestamp(sql_type, expected_tz):
+def test_dtype_from_hotdata_arrow_timestamp(sql_type, expected_tz, expected_scale):
     out = dtype_from_hotdata_sql_type(sql_type, nullable=True)
     assert isinstance(out, dt.Timestamp)
     assert out.timezone == expected_tz
+    assert out.scale == expected_scale
     assert out.nullable is True
 
 
@@ -130,17 +131,22 @@ def test_dtype_from_hotdata_arrow_decimal(sql_type, expected_precision, expected
 
 
 @pytest.mark.parametrize(
-    ("sql_type", "expected_value_cls"),
+    ("sql_type", "expected_value_cls", "expected_item_nullable"),
     [
-        ("list<item: int32>", dt.Int32),
-        ("list<item: utf8>", dt.String),
-        ("list<item: float64>", dt.Float64),
-        ("large_list<item: int64>", dt.Int64),
-        ("LIST<ITEM: UINT8>", dt.UInt8),
+        ("list<item: int32>", dt.Int32, True),
+        ("list<item: utf8>", dt.String, True),
+        ("list<item: float64>", dt.Float64, True),
+        ("large_list<item: int64>", dt.Int64, True),
+        ("LIST<ITEM: UINT8>", dt.UInt8, True),
+        # Non-nullable item fields — PyArrow appends " not null"
+        ("list<item: int32 not null>", dt.Int32, False),
+        ("list<item: utf8 not null>", dt.String, False),
+        ("large_list<item: float32 not null>", dt.Float32, False),
     ],
 )
-def test_dtype_from_hotdata_arrow_list(sql_type, expected_value_cls):
+def test_dtype_from_hotdata_arrow_list(sql_type, expected_value_cls, expected_item_nullable):
     out = dtype_from_hotdata_sql_type(sql_type, nullable=True)
     assert isinstance(out, dt.Array)
     assert isinstance(out.value_type, expected_value_cls)
+    assert out.value_type.nullable is expected_item_nullable
     assert out.nullable is True