refactor: replace manual Ibis type construction with PyArrow bridge

eddietejeda · eddietejeda · commit f25b43b5befd · 2026-05-26T14:30:45.000-07:00
Instead of constructing Ibis types directly (with hand-coded unit->scale
mappings etc.), resolve Arrow type strings to PyArrow types first and
convert via ibis.formats.pyarrow.PyArrowType.to_ibis().

- Replace _ARROW_TYPE_MAP (str -> Ibis class) with _PA_TYPE_MAP
  (str -> pa.DataType instance), covering all scalar types that can
  appear as list element types
- Merge _parse_parametric_arrow_type() into _pa_type_from_arrow_str()
  which returns a pa.DataType or None
- Remove _ARROW_UNIT_TO_IBIS and _ARROW_UNIT_TO_TIMESTAMP_SCALE —
  PyArrow encodes unit/scale semantics natively
- Fix decimal256: pa.decimal128 rejects precision > 38; fall back to
  pa.decimal256 for wider values
- dtype_from_hotdata_sql_type() simplified to: try Arrow path via
  PyArrow bridge, then Postgres parser, then String fallback
diff --git a/src/ibis_hotdata/types.py b/src/ibis_hotdata/types.py
@@ -4,127 +4,142 @@
 
 import re
 
+import pyarrow as pa
 import ibis.expr.datatypes as dt
 from ibis.backends.sql.datatypes import PostgresType
-
-# Arrow-style type names returned by Hotdata's information_schema when tables are
-# loaded from Parquet/Arrow sources.  PostgresType.from_string() treats these as
-# USERDEFINED unknowns, so we resolve them explicitly before falling through.
-_ARROW_TYPE_MAP: dict[str, type[dt.DataType]] = {
+from ibis.formats.pyarrow import PyArrowType
+
+# Simple Arrow type strings → PyArrow instances.  Covers non-parametric types
+# that the Postgres dialect parser does not know (Arrow-specific names, unsigned
+# ints) or mis-maps (Arrow "int8" = 8-bit; Postgres "int8" = 8-byte / int64).
+# All scalar types that can appear as list/map element types must be listed here
+# because element type strings are resolved via this map, not the Postgres parser.
+_PA_TYPE_MAP: dict[str, pa.DataType] = {
     # dates
-    "date32": dt.Date,
-    "date64": dt.Date,
+    "date32": pa.date32(),
+    "date64": pa.date64(),
     # floats — "halffloat" is PyArrow's str() name for float16
-    "float16": dt.Float16,
-    "float32": dt.Float32,
-    "float64": dt.Float64,
-    "halffloat": dt.Float16,
-    # signed ints — must override Postgres parser: Postgres "int8" means 8-byte (64-bit),
-    # but Arrow "int8" means 8-bit.  int16/32/64 parse correctly via Postgres.
-    "int8": dt.Int8,
-    # unsigned ints
-    "uint8": dt.UInt8,
-    "uint16": dt.UInt16,
-    "uint32": dt.UInt32,
-    "uint64": dt.UInt64,
-    # strings — "large_string" / "largeutf8" are PyArrow large-offset variants
-    "utf8": dt.String,
-    "largeutf8": dt.String,
-    "large_string": dt.String,
+    "float16": pa.float16(),
+    "float32": pa.float32(),
+    "float64": pa.float64(),
+    "halffloat": pa.float16(),
+    # signed ints — Arrow "int8" ≠ Postgres "int8" (8-byte/int64); all four
+    # listed here so they resolve correctly when used as list element types
+    "int8": pa.int8(),
+    "int16": pa.int16(),
+    "int32": pa.int32(),
+    "int64": pa.int64(),
+    # unsigned ints (Postgres parser returns Unknown for all of these)
+    "uint8": pa.uint8(),
+    "uint16": pa.uint16(),
+    "uint32": pa.uint32(),
+    "uint64": pa.uint64(),
+    # strings — large-offset variants not known to the Postgres parser
+    "utf8": pa.utf8(),
+    "largeutf8": pa.large_utf8(),
+    "large_string": pa.large_utf8(),
+    "string": pa.string(),
     # binary
-    "largebinary": dt.Binary,
-    # time
-    "time32": dt.Time,
-    "time64": dt.Time,
+    "binary": pa.binary(),
+    "largebinary": pa.large_binary(),
+    # boolean / null
+    "bool": pa.bool_(),
+    "boolean": pa.bool_(),
+    "null": pa.null(),
+    # time — unit is absent from these bare string forms; the unit does not
+    # affect the Ibis type (both time32 and time64 map to dt.Time)
+    "time32": pa.time32("ms"),
+    "time64": pa.time64("us"),
 }
 
-# Regex patterns for Arrow parametric types whose string representation includes
-# embedded parameters (unit, timezone, precision, value type, …).
+# Regex patterns for parametric Arrow types that embed parameters in the string.
 _TIMESTAMP_RE = re.compile(r"^timestamp\[(\w+)(?:,\s*tz=(.+))?\]$", re.IGNORECASE)
 _DURATION_RE = re.compile(r"^duration\[(\w+)\]$", re.IGNORECASE)
 _DECIMAL_RE = re.compile(r"^decimal(?:128|256)?\((\d+),\s*(\d+)\)$", re.IGNORECASE)
-_LIST_RE = re.compile(r"^(?:large_)?list<item:\s*(.+)>$", re.IGNORECASE)
-
-# Map Arrow time-unit strings to Ibis IntervalUnit strings and Timestamp scales.
-# Scales follow PyArrow's convention: s→0, ms→3, us→6, ns→9.
-_ARROW_UNIT_TO_IBIS: dict[str, str] = {
-    "s": "s",
-    "ms": "ms",
-    "us": "us",
-    "ns": "ns",
-}
-_ARROW_UNIT_TO_TIMESTAMP_SCALE: dict[str, int] = {
-    "s": 0,
-    "ms": 3,
-    "us": 6,
-    "ns": 9,
-}
-
-# Suffix appended by PyArrow when a list's item field is non-nullable.
+_LIST_RE = re.compile(r"^(large_)?list<item:\s*(.+)>$", re.IGNORECASE)
+# PyArrow appends " not null" when a list's item field is non-nullable.
 _NOT_NULL_SUFFIX_RE = re.compile(r"\s+not\s+null$", re.IGNORECASE)
 
 
-def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | None:
-    """Try to parse an Arrow parametric type string into an Ibis DataType.
+def _pa_type_from_arrow_str(raw: str) -> pa.DataType | None:
+    """Best-effort: Arrow type string → PyArrow DataType, or ``None`` if not recognised.
 
-    Returns ``None`` if ``raw`` does not match any known parametric pattern,
-    allowing the caller to fall through to the Postgres dialect parser.
+    Handles simple names (via ``_PA_TYPE_MAP``) and parametric forms
+    (timestamp, duration, decimal, list/large_list).  Returns ``None`` if the
+    string is not a known Arrow type, allowing the caller to fall through to the
+    Postgres dialect parser or String fallback.
     """
-    m = _TIMESTAMP_RE.match(raw)
+    s = raw.strip()
+
+    # Simple non-parametric types.
+    pa_type = _PA_TYPE_MAP.get(s.lower())
+    if pa_type is not None:
+        return pa_type
+
+    # timestamp[unit] or timestamp[unit, tz=…]
+    m = _TIMESTAMP_RE.match(s)
     if m:
         unit = m.group(1).lower()
         tz: str | None = m.group(2).strip() if m.group(2) else None
-        scale: int | None = _ARROW_UNIT_TO_TIMESTAMP_SCALE.get(unit)
-        return dt.Timestamp(timezone=tz, scale=scale, nullable=nullable)
+        try:
+            return pa.timestamp(unit, tz=tz)
+        except Exception:
+            return None
 
-    m = _DURATION_RE.match(raw)
+    # duration[unit] — unknown units return None so the caller falls through
+    m = _DURATION_RE.match(s)
     if m:
-        unit = _ARROW_UNIT_TO_IBIS.get(m.group(1).lower())
-        if unit is None:
-            return None  # unrecognised unit — fall through to Postgres parser / String fallback
-        return dt.Interval(unit=unit, nullable=nullable)
+        try:
+            return pa.duration(m.group(1).lower())
+        except Exception:
+            return None
 
-    m = _DECIMAL_RE.match(raw)
+    # decimal / decimal128 / decimal256
+    m = _DECIMAL_RE.match(s)
     if m:
-        return dt.Decimal(precision=int(m.group(1)), scale=int(m.group(2)), nullable=nullable)
-
-    m = _LIST_RE.match(raw)
+        precision, scale = int(m.group(1)), int(m.group(2))
+        try:
+            # decimal128 supports precision 1–38; fall back to decimal256 for wider values
+            return pa.decimal128(precision, scale) if precision <= 38 else pa.decimal256(precision, scale)
+        except Exception:
+            return None
+
+    # list<item: T> or large_list<item: T> (recursive for nested types)
+    m = _LIST_RE.match(s)
     if m:
-        item_raw = m.group(1).strip()
-        # PyArrow appends " not null" for non-nullable item fields; strip it and
-        # pass nullable=False so the element type is marked non-nullable.
+        is_large = bool(m.group(1))
+        item_raw = m.group(2).strip()
         item_not_null = bool(_NOT_NULL_SUFFIX_RE.search(item_raw))
         item_str = _NOT_NULL_SUFFIX_RE.sub("", item_raw).strip()
-        value_type = dtype_from_hotdata_sql_type(item_str, nullable=not item_not_null)
-        return dt.Array(value_type=value_type, nullable=nullable)
+        item_pa_type = _pa_type_from_arrow_str(item_str)
+        if item_pa_type is None:
+            return None
+        item_field = pa.field("item", item_pa_type, nullable=not item_not_null)
+        return pa.large_list(item_field) if is_large else pa.list_(item_field)
 
     return None
 
 
 def dtype_from_hotdata_sql_type(sql_type: str | None, *, nullable: bool) -> dt.DataType:
-    """Best-effort mapping from Hotdata `/information_schema` column `data_type` strings.
+    """Best-effort mapping from Hotdata ``/information_schema`` column ``data_type`` strings.
 
     Hotdata may return either SQL-style names (``INTEGER``, ``VARCHAR``, ``DOUBLE
     PRECISION``, …) or Arrow-style names (``Date32``, ``Float64``, ``Utf8``, …).
-    SQL-style names are delegated to the Postgres dialect parser; Arrow-style names
-    are resolved via an explicit lookup table or parametric pattern before falling
-    back to the parser.
+    Arrow-style names are resolved via PyArrow's type system and converted to Ibis
+    types using the Ibis–PyArrow bridge; SQL-style names fall through to the Postgres
+    dialect parser.
     """
     if not sql_type:
         return dt.String(nullable=nullable)
 
     raw = sql_type.strip()
 
-    # Arrow-style names (case-insensitive lookup).
-    arrow_cls = _ARROW_TYPE_MAP.get(raw.lower())
-    if arrow_cls is not None:
-        return arrow_cls(nullable=nullable)
-
-    # Arrow parametric types (timestamp[us], duration[ms], decimal128(p,s), list<…>).
-    parametric = _parse_parametric_arrow_type(raw, nullable=nullable)
-    if parametric is not None:
-        return parametric
+    # Try to parse as an Arrow type string (simple or parametric).
+    pa_type = _pa_type_from_arrow_str(raw)
+    if pa_type is not None:
+        return PyArrowType.to_ibis(pa_type).copy(nullable=nullable)
 
+    # Fall through to Postgres dialect parser for SQL-style type names.
     try:
         return PostgresType.from_string(raw, nullable=nullable)
     except Exception:  # ibis/sqlglot raise a variety of parse errors; fall back to String