Skip to content

Commit f25b43b

Browse files
committed
refactor: replace manual Ibis type construction with PyArrow bridge
Instead of constructing Ibis types directly (with hand-coded unit->scale mappings etc.), resolve Arrow type strings to PyArrow types first and convert them via ibis.formats.pyarrow.PyArrowType.to_ibis().

- Replace _ARROW_TYPE_MAP (str -> Ibis class) with _PA_TYPE_MAP (str -> pa.DataType instance), covering all scalar types that can appear as list element types.
- Merge _parse_parametric_arrow_type() into _pa_type_from_arrow_str(), which returns a pa.DataType or None.
- Remove _ARROW_UNIT_TO_IBIS and _ARROW_UNIT_TO_TIMESTAMP_SCALE — PyArrow encodes unit/scale semantics natively.
- Fix decimal256: pa.decimal128 rejects precision > 38; fall back to pa.decimal256 for wider values.
- Simplify dtype_from_hotdata_sql_type() to: try the Arrow path via the PyArrow bridge, then the Postgres parser, then the String fallback.
1 parent 1379aa0 commit f25b43b

1 file changed

Lines changed: 96 additions & 81 deletions

File tree

src/ibis_hotdata/types.py

Lines changed: 96 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -4,127 +4,142 @@
44

55
import re
66

7+
import pyarrow as pa
78
import ibis.expr.datatypes as dt
89
from ibis.backends.sql.datatypes import PostgresType
9-
10-
# Arrow-style type names returned by Hotdata's information_schema when tables are
11-
# loaded from Parquet/Arrow sources. PostgresType.from_string() treats these as
12-
# USERDEFINED unknowns, so we resolve them explicitly before falling through.
13-
_ARROW_TYPE_MAP: dict[str, type[dt.DataType]] = {
10+
from ibis.formats.pyarrow import PyArrowType
11+
12+
# Simple Arrow type strings → PyArrow instances. Covers non-parametric types
13+
# that the Postgres dialect parser does not know (Arrow-specific names, unsigned
14+
# ints) or mis-maps (Arrow "int8" = 8-bit; Postgres "int8" = 8-byte / int64).
15+
# All scalar types that can appear as list element types must be listed here
16+
# because element type strings are resolved via this map, not the Postgres parser.
17+
_PA_TYPE_MAP: dict[str, pa.DataType] = {
1418
# dates
15-
"date32": dt.Date,
16-
"date64": dt.Date,
19+
"date32": pa.date32(),
20+
"date64": pa.date64(),
1721
# floats — "halffloat" is PyArrow's str() name for float16
18-
"float16": dt.Float16,
19-
"float32": dt.Float32,
20-
"float64": dt.Float64,
21-
"halffloat": dt.Float16,
22-
# signed ints — must override Postgres parser: Postgres "int8" means 8-byte (64-bit),
23-
# but Arrow "int8" means 8-bit. int16/32/64 parse correctly via Postgres.
24-
"int8": dt.Int8,
25-
# unsigned ints
26-
"uint8": dt.UInt8,
27-
"uint16": dt.UInt16,
28-
"uint32": dt.UInt32,
29-
"uint64": dt.UInt64,
30-
# strings — "large_string" / "largeutf8" are PyArrow large-offset variants
31-
"utf8": dt.String,
32-
"largeutf8": dt.String,
33-
"large_string": dt.String,
22+
"float16": pa.float16(),
23+
"float32": pa.float32(),
24+
"float64": pa.float64(),
25+
"halffloat": pa.float16(),
26+
# signed ints — Arrow "int8" ≠ Postgres "int8" (8-byte/int64); all four
27+
# listed here so they resolve correctly when used as list element types
28+
"int8": pa.int8(),
29+
"int16": pa.int16(),
30+
"int32": pa.int32(),
31+
"int64": pa.int64(),
32+
# unsigned ints (Postgres parser returns Unknown for all of these)
33+
"uint8": pa.uint8(),
34+
"uint16": pa.uint16(),
35+
"uint32": pa.uint32(),
36+
"uint64": pa.uint64(),
37+
# strings — large-offset variants not known to the Postgres parser
38+
"utf8": pa.utf8(),
39+
"largeutf8": pa.large_utf8(),
40+
"large_string": pa.large_utf8(),
41+
"string": pa.string(),
3442
# binary
35-
"largebinary": dt.Binary,
36-
# time
37-
"time32": dt.Time,
38-
"time64": dt.Time,
43+
"binary": pa.binary(),
44+
"largebinary": pa.large_binary(),
45+
# boolean / null
46+
"bool": pa.bool_(),
47+
"boolean": pa.bool_(),
48+
"null": pa.null(),
49+
# time — unit is absent from these bare string forms; the unit does not
50+
# affect the Ibis type (both time32 and time64 map to dt.Time)
51+
"time32": pa.time32("ms"),
52+
"time64": pa.time64("us"),
3953
}
4054

41-
# Regex patterns for Arrow parametric types whose string representation includes
42-
# embedded parameters (unit, timezone, precision, value type, …).
55+
# Regex patterns for parametric Arrow types that embed parameters in the string.
4356
_TIMESTAMP_RE = re.compile(r"^timestamp\[(\w+)(?:,\s*tz=(.+))?\]$", re.IGNORECASE)
4457
_DURATION_RE = re.compile(r"^duration\[(\w+)\]$", re.IGNORECASE)
4558
_DECIMAL_RE = re.compile(r"^decimal(?:128|256)?\((\d+),\s*(\d+)\)$", re.IGNORECASE)
46-
_LIST_RE = re.compile(r"^(?:large_)?list<item:\s*(.+)>$", re.IGNORECASE)
47-
48-
# Map Arrow time-unit strings to Ibis IntervalUnit strings and Timestamp scales.
49-
# Scales follow PyArrow's convention: s→0, ms→3, us→6, ns→9.
50-
_ARROW_UNIT_TO_IBIS: dict[str, str] = {
51-
"s": "s",
52-
"ms": "ms",
53-
"us": "us",
54-
"ns": "ns",
55-
}
56-
_ARROW_UNIT_TO_TIMESTAMP_SCALE: dict[str, int] = {
57-
"s": 0,
58-
"ms": 3,
59-
"us": 6,
60-
"ns": 9,
61-
}
62-
63-
# Suffix appended by PyArrow when a list's item field is non-nullable.
59+
_LIST_RE = re.compile(r"^(large_)?list<item:\s*(.+)>$", re.IGNORECASE)
60+
# PyArrow appends " not null" when a list's item field is non-nullable.
6461
_NOT_NULL_SUFFIX_RE = re.compile(r"\s+not\s+null$", re.IGNORECASE)
6562

6663

67-
def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | None:
68-
"""Try to parse an Arrow parametric type string into an Ibis DataType.
64+
def _pa_type_from_arrow_str(raw: str) -> pa.DataType | None:
65+
"""Best-effort: Arrow type string → PyArrow DataType, or ``None`` if not recognised.
6966
70-
Returns ``None`` if ``raw`` does not match any known parametric pattern,
71-
allowing the caller to fall through to the Postgres dialect parser.
67+
Handles simple names (via ``_PA_TYPE_MAP``) and parametric forms
68+
(timestamp, duration, decimal, list/large_list). Returns ``None`` if the
69+
string is not a known Arrow type, allowing the caller to fall through to the
70+
Postgres dialect parser or String fallback.
7271
"""
73-
m = _TIMESTAMP_RE.match(raw)
72+
s = raw.strip()
73+
74+
# Simple non-parametric types.
75+
pa_type = _PA_TYPE_MAP.get(s.lower())
76+
if pa_type is not None:
77+
return pa_type
78+
79+
# timestamp[unit] or timestamp[unit, tz=…]
80+
m = _TIMESTAMP_RE.match(s)
7481
if m:
7582
unit = m.group(1).lower()
7683
tz: str | None = m.group(2).strip() if m.group(2) else None
77-
scale: int | None = _ARROW_UNIT_TO_TIMESTAMP_SCALE.get(unit)
78-
return dt.Timestamp(timezone=tz, scale=scale, nullable=nullable)
84+
try:
85+
return pa.timestamp(unit, tz=tz)
86+
except Exception:
87+
return None
7988

80-
m = _DURATION_RE.match(raw)
89+
# duration[unit] — unknown units return None so the caller falls through
90+
m = _DURATION_RE.match(s)
8191
if m:
82-
unit = _ARROW_UNIT_TO_IBIS.get(m.group(1).lower())
83-
if unit is None:
84-
return None # unrecognised unit — fall through to Postgres parser / String fallback
85-
return dt.Interval(unit=unit, nullable=nullable)
92+
try:
93+
return pa.duration(m.group(1).lower())
94+
except Exception:
95+
return None
8696

87-
m = _DECIMAL_RE.match(raw)
97+
# decimal / decimal128 / decimal256
98+
m = _DECIMAL_RE.match(s)
8899
if m:
89-
return dt.Decimal(precision=int(m.group(1)), scale=int(m.group(2)), nullable=nullable)
90-
91-
m = _LIST_RE.match(raw)
100+
precision, scale = int(m.group(1)), int(m.group(2))
101+
try:
102+
# decimal128 supports precision 1–38; fall back to decimal256 for wider values
103+
return pa.decimal128(precision, scale) if precision <= 38 else pa.decimal256(precision, scale)
104+
except Exception:
105+
return None
106+
107+
# list<item: T> or large_list<item: T> (recursive for nested types)
108+
m = _LIST_RE.match(s)
92109
if m:
93-
item_raw = m.group(1).strip()
94-
# PyArrow appends " not null" for non-nullable item fields; strip it and
95-
# pass nullable=False so the element type is marked non-nullable.
110+
is_large = bool(m.group(1))
111+
item_raw = m.group(2).strip()
96112
item_not_null = bool(_NOT_NULL_SUFFIX_RE.search(item_raw))
97113
item_str = _NOT_NULL_SUFFIX_RE.sub("", item_raw).strip()
98-
value_type = dtype_from_hotdata_sql_type(item_str, nullable=not item_not_null)
99-
return dt.Array(value_type=value_type, nullable=nullable)
114+
item_pa_type = _pa_type_from_arrow_str(item_str)
115+
if item_pa_type is None:
116+
return None
117+
item_field = pa.field("item", item_pa_type, nullable=not item_not_null)
118+
return pa.large_list(item_field) if is_large else pa.list_(item_field)
100119

101120
return None
102121

103122

104123
def dtype_from_hotdata_sql_type(sql_type: str | None, *, nullable: bool) -> dt.DataType:
105-
"""Best-effort mapping from Hotdata `/information_schema` column `data_type` strings.
124+
"""Best-effort mapping from Hotdata ``/information_schema`` column ``data_type`` strings.
106125
107126
Hotdata may return either SQL-style names (``INTEGER``, ``VARCHAR``, ``DOUBLE
108127
PRECISION``, …) or Arrow-style names (``Date32``, ``Float64``, ``Utf8``, …).
109-
SQL-style names are delegated to the Postgres dialect parser; Arrow-style names
110-
are resolved via an explicit lookup table or parametric pattern before falling
111-
back to the parser.
128+
Arrow-style names are resolved via PyArrow's type system and converted to Ibis
129+
types using the Ibis–PyArrow bridge; SQL-style names fall through to the Postgres
130+
dialect parser.
112131
"""
113132
if not sql_type:
114133
return dt.String(nullable=nullable)
115134

116135
raw = sql_type.strip()
117136

118-
# Arrow-style names (case-insensitive lookup).
119-
arrow_cls = _ARROW_TYPE_MAP.get(raw.lower())
120-
if arrow_cls is not None:
121-
return arrow_cls(nullable=nullable)
122-
123-
# Arrow parametric types (timestamp[us], duration[ms], decimal128(p,s), list<…>).
124-
parametric = _parse_parametric_arrow_type(raw, nullable=nullable)
125-
if parametric is not None:
126-
return parametric
137+
# Try to parse as an Arrow type string (simple or parametric).
138+
pa_type = _pa_type_from_arrow_str(raw)
139+
if pa_type is not None:
140+
return PyArrowType.to_ibis(pa_type).copy(nullable=nullable)
127141

142+
# Fall through to Postgres dialect parser for SQL-style type names.
128143
try:
129144
return PostgresType.from_string(raw, nullable=nullable)
130145
except Exception: # ibis/sqlglot raise a variety of parse errors; fall back to String

0 commit comments

Comments
 (0)