|
4 | 4 |
|
5 | 5 | import re |
6 | 6 |
|
| 7 | +import pyarrow as pa |
7 | 8 | import ibis.expr.datatypes as dt |
8 | 9 | from ibis.backends.sql.datatypes import PostgresType |
9 | | - |
10 | | -# Arrow-style type names returned by Hotdata's information_schema when tables are |
11 | | -# loaded from Parquet/Arrow sources. PostgresType.from_string() treats these as |
12 | | -# USERDEFINED unknowns, so we resolve them explicitly before falling through. |
13 | | -_ARROW_TYPE_MAP: dict[str, type[dt.DataType]] = { |
| 10 | +from ibis.formats.pyarrow import PyArrowType |
| 11 | + |
| 12 | +# Simple Arrow type strings → PyArrow instances. Covers non-parametric types |
| 13 | +# that the Postgres dialect parser does not know (Arrow-specific names, unsigned |
| 14 | +# ints) or mis-maps (Arrow "int8" = 8-bit; Postgres "int8" = 8-byte / int64). |
| 15 | +# All scalar types that can appear as list/map element types must be listed here |
| 16 | +# because element type strings are resolved via this map, not the Postgres parser. |
| 17 | +_PA_TYPE_MAP: dict[str, pa.DataType] = { |
14 | 18 | # dates |
15 | | - "date32": dt.Date, |
16 | | - "date64": dt.Date, |
| 19 | + "date32": pa.date32(), |
| 20 | + "date64": pa.date64(), |
17 | 21 | # floats — "halffloat" is PyArrow's str() name for float16 |
18 | | - "float16": dt.Float16, |
19 | | - "float32": dt.Float32, |
20 | | - "float64": dt.Float64, |
21 | | - "halffloat": dt.Float16, |
22 | | - # signed ints — must override Postgres parser: Postgres "int8" means 8-byte (64-bit), |
23 | | - # but Arrow "int8" means 8-bit. int16/32/64 parse correctly via Postgres. |
24 | | - "int8": dt.Int8, |
25 | | - # unsigned ints |
26 | | - "uint8": dt.UInt8, |
27 | | - "uint16": dt.UInt16, |
28 | | - "uint32": dt.UInt32, |
29 | | - "uint64": dt.UInt64, |
30 | | - # strings — "large_string" / "largeutf8" are PyArrow large-offset variants |
31 | | - "utf8": dt.String, |
32 | | - "largeutf8": dt.String, |
33 | | - "large_string": dt.String, |
| 22 | + "float16": pa.float16(), |
| 23 | + "float32": pa.float32(), |
| 24 | + "float64": pa.float64(), |
| 25 | + "halffloat": pa.float16(), |
| 26 | + # signed ints — Arrow "int8" ≠ Postgres "int8" (8-byte/int64); all four |
| 27 | + # listed here so they resolve correctly when used as list element types |
| 28 | + "int8": pa.int8(), |
| 29 | + "int16": pa.int16(), |
| 30 | + "int32": pa.int32(), |
| 31 | + "int64": pa.int64(), |
| 32 | + # unsigned ints (Postgres parser returns Unknown for all of these) |
| 33 | + "uint8": pa.uint8(), |
| 34 | + "uint16": pa.uint16(), |
| 35 | + "uint32": pa.uint32(), |
| 36 | + "uint64": pa.uint64(), |
| 37 | + # strings — large-offset variants not known to the Postgres parser |
| 38 | + "utf8": pa.utf8(), |
| 39 | + "largeutf8": pa.large_utf8(), |
| 40 | + "large_string": pa.large_utf8(), |
| 41 | + "string": pa.string(), |
34 | 42 | # binary |
35 | | - "largebinary": dt.Binary, |
36 | | - # time |
37 | | - "time32": dt.Time, |
38 | | - "time64": dt.Time, |
| 43 | + "binary": pa.binary(), |
| 44 | + "largebinary": pa.large_binary(), |
| 45 | + # boolean / null |
| 46 | + "bool": pa.bool_(), |
| 47 | + "boolean": pa.bool_(), |
| 48 | + "null": pa.null(), |
| 49 | + # time — unit is absent from these bare string forms; the unit does not |
| 50 | + # affect the Ibis type (both time32 and time64 map to dt.Time) |
| 51 | + "time32": pa.time32("ms"), |
| 52 | + "time64": pa.time64("us"), |
39 | 53 | } |
40 | 54 |
|
41 | | -# Regex patterns for Arrow parametric types whose string representation includes |
42 | | -# embedded parameters (unit, timezone, precision, value type, …). |
| 55 | +# Regex patterns for parametric Arrow types that embed parameters in the string. |
43 | 56 | _TIMESTAMP_RE = re.compile(r"^timestamp\[(\w+)(?:,\s*tz=(.+))?\]$", re.IGNORECASE) |
44 | 57 | _DURATION_RE = re.compile(r"^duration\[(\w+)\]$", re.IGNORECASE) |
45 | 58 | _DECIMAL_RE = re.compile(r"^decimal(?:128|256)?\((\d+),\s*(\d+)\)$", re.IGNORECASE) |
46 | | -_LIST_RE = re.compile(r"^(?:large_)?list<item:\s*(.+)>$", re.IGNORECASE) |
47 | | - |
48 | | -# Map Arrow time-unit strings to Ibis IntervalUnit strings and Timestamp scales. |
49 | | -# Scales follow PyArrow's convention: s→0, ms→3, us→6, ns→9. |
50 | | -_ARROW_UNIT_TO_IBIS: dict[str, str] = { |
51 | | - "s": "s", |
52 | | - "ms": "ms", |
53 | | - "us": "us", |
54 | | - "ns": "ns", |
55 | | -} |
56 | | -_ARROW_UNIT_TO_TIMESTAMP_SCALE: dict[str, int] = { |
57 | | - "s": 0, |
58 | | - "ms": 3, |
59 | | - "us": 6, |
60 | | - "ns": 9, |
61 | | -} |
62 | | - |
63 | | -# Suffix appended by PyArrow when a list's item field is non-nullable. |
| 59 | +_LIST_RE = re.compile(r"^(large_)?list<item:\s*(.+)>$", re.IGNORECASE) |
| 60 | +# PyArrow appends " not null" when a list's item field is non-nullable. |
64 | 61 | _NOT_NULL_SUFFIX_RE = re.compile(r"\s+not\s+null$", re.IGNORECASE) |
65 | 62 |
|
66 | 63 |
|
67 | | -def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | None: |
68 | | - """Try to parse an Arrow parametric type string into an Ibis DataType. |
| 64 | +def _pa_type_from_arrow_str(raw: str) -> pa.DataType | None: |
| 65 | + """Best-effort: Arrow type string → PyArrow DataType, or ``None`` if not recognised. |
69 | 66 |
|
70 | | - Returns ``None`` if ``raw`` does not match any known parametric pattern, |
71 | | - allowing the caller to fall through to the Postgres dialect parser. |
| 67 | + Handles simple names (via ``_PA_TYPE_MAP``) and parametric forms |
| 68 | + (timestamp, duration, decimal, list/large_list). Returns ``None`` if the |
| 69 | + string is not a known Arrow type, allowing the caller to fall through to the |
| 70 | + Postgres dialect parser or String fallback. |
72 | 71 | """ |
73 | | - m = _TIMESTAMP_RE.match(raw) |
| 72 | + s = raw.strip() |
| 73 | + |
| 74 | + # Simple non-parametric types. |
| 75 | + pa_type = _PA_TYPE_MAP.get(s.lower()) |
| 76 | + if pa_type is not None: |
| 77 | + return pa_type |
| 78 | + |
| 79 | + # timestamp[unit] or timestamp[unit, tz=…] |
| 80 | + m = _TIMESTAMP_RE.match(s) |
74 | 81 | if m: |
75 | 82 | unit = m.group(1).lower() |
76 | 83 | tz: str | None = m.group(2).strip() if m.group(2) else None |
77 | | - scale: int | None = _ARROW_UNIT_TO_TIMESTAMP_SCALE.get(unit) |
78 | | - return dt.Timestamp(timezone=tz, scale=scale, nullable=nullable) |
| 84 | + try: |
| 85 | + return pa.timestamp(unit, tz=tz) |
| 86 | + except Exception: |
| 87 | + return None |
79 | 88 |
|
80 | | - m = _DURATION_RE.match(raw) |
| 89 | + # duration[unit] — unknown units return None so the caller falls through |
| 90 | + m = _DURATION_RE.match(s) |
81 | 91 | if m: |
82 | | - unit = _ARROW_UNIT_TO_IBIS.get(m.group(1).lower()) |
83 | | - if unit is None: |
84 | | - return None # unrecognised unit — fall through to Postgres parser / String fallback |
85 | | - return dt.Interval(unit=unit, nullable=nullable) |
| 92 | + try: |
| 93 | + return pa.duration(m.group(1).lower()) |
| 94 | + except Exception: |
| 95 | + return None |
86 | 96 |
|
87 | | - m = _DECIMAL_RE.match(raw) |
| 97 | + # decimal / decimal128 / decimal256 |
| 98 | + m = _DECIMAL_RE.match(s) |
88 | 99 | if m: |
89 | | - return dt.Decimal(precision=int(m.group(1)), scale=int(m.group(2)), nullable=nullable) |
90 | | - |
91 | | - m = _LIST_RE.match(raw) |
| 100 | + precision, scale = int(m.group(1)), int(m.group(2)) |
| 101 | + try: |
| 102 | + # decimal128 supports precision 1–38; fall back to decimal256 for wider values |
| 103 | + return pa.decimal128(precision, scale) if precision <= 38 else pa.decimal256(precision, scale) |
| 104 | + except Exception: |
| 105 | + return None |
| 106 | + |
| 107 | + # list<item: T> or large_list<item: T> (recursive for nested types) |
| 108 | + m = _LIST_RE.match(s) |
92 | 109 | if m: |
93 | | - item_raw = m.group(1).strip() |
94 | | - # PyArrow appends " not null" for non-nullable item fields; strip it and |
95 | | - # pass nullable=False so the element type is marked non-nullable. |
| 110 | + is_large = bool(m.group(1)) |
| 111 | + item_raw = m.group(2).strip() |
96 | 112 | item_not_null = bool(_NOT_NULL_SUFFIX_RE.search(item_raw)) |
97 | 113 | item_str = _NOT_NULL_SUFFIX_RE.sub("", item_raw).strip() |
98 | | - value_type = dtype_from_hotdata_sql_type(item_str, nullable=not item_not_null) |
99 | | - return dt.Array(value_type=value_type, nullable=nullable) |
| 114 | + item_pa_type = _pa_type_from_arrow_str(item_str) |
| 115 | + if item_pa_type is None: |
| 116 | + return None |
| 117 | + item_field = pa.field("item", item_pa_type, nullable=not item_not_null) |
| 118 | + return pa.large_list(item_field) if is_large else pa.list_(item_field) |
100 | 119 |
|
101 | 120 | return None |
102 | 121 |
|
103 | 122 |
|
104 | 123 | def dtype_from_hotdata_sql_type(sql_type: str | None, *, nullable: bool) -> dt.DataType: |
105 | | - """Best-effort mapping from Hotdata `/information_schema` column `data_type` strings. |
| 124 | + """Best-effort mapping from Hotdata ``/information_schema`` column ``data_type`` strings. |
106 | 125 |
|
107 | 126 | Hotdata may return either SQL-style names (``INTEGER``, ``VARCHAR``, ``DOUBLE |
108 | 127 | PRECISION``, …) or Arrow-style names (``Date32``, ``Float64``, ``Utf8``, …). |
109 | | - SQL-style names are delegated to the Postgres dialect parser; Arrow-style names |
110 | | - are resolved via an explicit lookup table or parametric pattern before falling |
111 | | - back to the parser. |
| 128 | + Arrow-style names are resolved via PyArrow's type system and converted to Ibis |
| 129 | + types using the Ibis–PyArrow bridge; SQL-style names fall through to the Postgres |
| 130 | + dialect parser. |
112 | 131 | """ |
113 | 132 | if not sql_type: |
114 | 133 | return dt.String(nullable=nullable) |
115 | 134 |
|
116 | 135 | raw = sql_type.strip() |
117 | 136 |
|
118 | | - # Arrow-style names (case-insensitive lookup). |
119 | | - arrow_cls = _ARROW_TYPE_MAP.get(raw.lower()) |
120 | | - if arrow_cls is not None: |
121 | | - return arrow_cls(nullable=nullable) |
122 | | - |
123 | | - # Arrow parametric types (timestamp[us], duration[ms], decimal128(p,s), list<…>). |
124 | | - parametric = _parse_parametric_arrow_type(raw, nullable=nullable) |
125 | | - if parametric is not None: |
126 | | - return parametric |
| 137 | + # Try to parse as an Arrow type string (simple or parametric). |
| 138 | + pa_type = _pa_type_from_arrow_str(raw) |
| 139 | + if pa_type is not None: |
| 140 | + return PyArrowType.to_ibis(pa_type).copy(nullable=nullable) |
127 | 141 |
|
| 142 | + # Fall through to Postgres dialect parser for SQL-style type names. |
128 | 143 | try: |
129 | 144 | return PostgresType.from_string(raw, nullable=nullable) |
130 | 145 | except Exception: # ibis/sqlglot raise a variety of parse errors; fall back to String |
|
0 commit comments