From 55b951ef2a2aacf0d347689af088f94889195a3c Mon Sep 17 00:00:00 2001 From: Eddie A Tejeda <669988+eddietejeda@users.noreply.github.com> Date: Tue, 26 May 2026 14:12:46 -0700 Subject: [PATCH 1/7] feat: complete Arrow type mapping for parametric and missing types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for all common Arrow type strings returned by Hotdata's information_schema when tables are loaded from Parquet/Arrow sources. Simple additions to _ARROW_TYPE_MAP: - int8 → dt.Int8 (Postgres parser wrongly returns Int64 for "int8") - halffloat → dt.Float16 (PyArrow's str() name for float16 - large_string → dt.String (PyArrow large-offset string variant) New _parse_parametric_arrow_type() for types with embedded parameters: - timestamp[us] / timestamp[us, tz=UTC] → dt.Timestamp(timezone=...) - duration[ms] → dt.Interval(unit='ms') - decimal128(10, 3) / decimal(5, 2) → dt.Decimal(precision, scale) - list / large_list → dt.Array(T) (recursive) Adds 27 new test cases covering all new patterns including timezone variants, all 4 time units, case-insensitivity, and recursive list element type resolution. EOF ) --- src/ibis_hotdata/types.py | 68 ++++++++++++++++++++++++++++--- tests/test_hotdata_types.py | 79 +++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 5 deletions(-) diff --git a/src/ibis_hotdata/types.py b/src/ibis_hotdata/types.py index cb7920d..832eecd 100644 --- a/src/ibis_hotdata/types.py +++ b/src/ibis_hotdata/types.py @@ -2,6 +2,8 @@ from __future__ import annotations +import re + import ibis.expr.datatypes as dt from ibis.backends.sql.datatypes import PostgresType @@ -12,18 +14,23 @@ # dates "date32": dt.Date, "date64": dt.Date, - # floats + # floats — "halffloat" is PyArrow's str() name for float16 "float16": dt.Float16, "float32": dt.Float32, "float64": dt.Float64, + "halffloat": dt.Float16, + # signed ints — must override Postgres parser: Postgres "int8" means 8-byte (64-bit), + # but Arrow "int8" means 8-bit. int16/32/64 parse correctly via Postgres. + "int8": dt.Int8, # unsigned ints "uint8": dt.UInt8, "uint16": dt.UInt16, "uint32": dt.UInt32, "uint64": dt.UInt64, - # strings + # strings — "large_string" / "largeutf8" are PyArrow large-offset variants "utf8": dt.String, "largeutf8": dt.String, + "large_string": dt.String, # binary "largebinary": dt.Binary, # time @@ -31,6 +38,49 @@ "time64": dt.Time, } +# Regex patterns for Arrow parametric types whose string representation includes +# embedded parameters (unit, timezone, precision, value type, …). +_TIMESTAMP_RE = re.compile(r"^timestamp\[(\w+)(?:,\s*tz=(.+))?\]$", re.IGNORECASE) +_DURATION_RE = re.compile(r"^duration\[(\w+)\]$", re.IGNORECASE) +_DECIMAL_RE = re.compile(r"^decimal128?\((\d+),\s*(\d+)\)$", re.IGNORECASE) +_LIST_RE = re.compile(r"^(?:large_)?list$", re.IGNORECASE) + +# Map Arrow time-unit strings to Ibis IntervalUnit strings. +_ARROW_UNIT_TO_IBIS: dict[str, str] = { + "s": "s", + "ms": "ms", + "us": "us", + "ns": "ns", +} + + +def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | None: + """Try to parse an Arrow parametric type string into an Ibis DataType. + + Returns ``None`` if ``raw`` does not match any known parametric pattern, + allowing the caller to fall through to the Postgres dialect parser. + """ + m = _TIMESTAMP_RE.match(raw) + if m: + tz: str | None = m.group(2).strip() if m.group(2) else None + return dt.Timestamp(timezone=tz, nullable=nullable) + + m = _DURATION_RE.match(raw) + if m: + unit = _ARROW_UNIT_TO_IBIS.get(m.group(1).lower(), "s") + return dt.Interval(unit=unit, nullable=nullable) + + m = _DECIMAL_RE.match(raw) + if m: + return dt.Decimal(precision=int(m.group(1)), scale=int(m.group(2)), nullable=nullable) + + m = _LIST_RE.match(raw) + if m: + value_type = dtype_from_hotdata_sql_type(m.group(1).strip(), nullable=True) + return dt.Array(value_type=value_type, nullable=nullable) + + return None + def dtype_from_hotdata_sql_type(sql_type: str | None, *, nullable: bool) -> dt.DataType: """Best-effort mapping from Hotdata `/information_schema` column `data_type` strings. @@ -38,17 +88,25 @@ def dtype_from_hotdata_sql_type(sql_type: str | None, *, nullable: bool) -> dt.D Hotdata may return either SQL-style names (``INTEGER``, ``VARCHAR``, ``DOUBLE PRECISION``, …) or Arrow-style names (``Date32``, ``Float64``, ``Utf8``, …). SQL-style names are delegated to the Postgres dialect parser; Arrow-style names - are resolved via an explicit lookup table before falling back to the parser. + are resolved via an explicit lookup table or parametric pattern before falling + back to the parser. """ if not sql_type: return dt.String(nullable=nullable) + raw = sql_type.strip() + # Arrow-style names (case-insensitive lookup). - arrow_cls = _ARROW_TYPE_MAP.get(sql_type.strip().lower()) + arrow_cls = _ARROW_TYPE_MAP.get(raw.lower()) if arrow_cls is not None: return arrow_cls(nullable=nullable) + # Arrow parametric types (timestamp[us], duration[ms], decimal128(p,s), list<…>). + parametric = _parse_parametric_arrow_type(raw, nullable=nullable) + if parametric is not None: + return parametric + try: - return PostgresType.from_string(sql_type.strip(), nullable=nullable) + return PostgresType.from_string(raw, nullable=nullable) except Exception: # ibis/sqlglot raise a variety of parse errors; fall back to String return dt.String(nullable=nullable) diff --git a/tests/test_hotdata_types.py b/tests/test_hotdata_types.py index 0e743c2..5286da6 100644 --- a/tests/test_hotdata_types.py +++ b/tests/test_hotdata_types.py @@ -55,6 +55,15 @@ def test_dtype_from_hotdata_malformed_fallback_string(): ("LargeBinary", True, dt.Binary), ("Time32", True, dt.Time), ("Time64", False, dt.Time), + # Previously missing: signed int8 (Postgres "int8" means int64, not int8) + ("int8", True, dt.Int8), + ("Int8", False, dt.Int8), + # Previously missing: halffloat (PyArrow's str() name for float16) + ("halffloat", True, dt.Float16), + ("HALFFLOAT", False, dt.Float16), + # Previously missing: large_string (PyArrow large-offset string variant) + ("large_string", True, dt.String), + ("Large_String", False, dt.String), # Case-insensitive ("date32", True, dt.Date), ("FLOAT64", True, dt.Float64), @@ -65,3 +74,73 @@ def test_dtype_from_hotdata_arrow_type_names(sql_type, nullable, expected_cls): out = dtype_from_hotdata_sql_type(sql_type, nullable=nullable) assert out.nullable is nullable assert isinstance(out, expected_cls) + + +@pytest.mark.parametrize( + ("sql_type", "expected_tz"), + [ + ("timestamp[s]", None), + ("timestamp[ms]", None), + ("timestamp[us]", None), + ("timestamp[ns]", None), + ("timestamp[us, tz=UTC]", "UTC"), + ("timestamp[us, tz=America/New_York]", "America/New_York"), + ("TIMESTAMP[US]", None), + ], +) +def test_dtype_from_hotdata_arrow_timestamp(sql_type, expected_tz): + out = dtype_from_hotdata_sql_type(sql_type, nullable=True) + assert isinstance(out, dt.Timestamp) + assert out.timezone == expected_tz + assert out.nullable is True + + +@pytest.mark.parametrize( + ("sql_type", "expected_unit"), + [ + ("duration[s]", "s"), + ("duration[ms]", "ms"), + ("duration[us]", "us"), + ("duration[ns]", "ns"), + ("DURATION[MS]", "ms"), + ], +) +def test_dtype_from_hotdata_arrow_duration(sql_type, expected_unit): + out = dtype_from_hotdata_sql_type(sql_type, nullable=False) + assert isinstance(out, dt.Interval) + assert out.unit.value == expected_unit + assert out.nullable is False + + +@pytest.mark.parametrize( + ("sql_type", "expected_precision", "expected_scale"), + [ + ("decimal128(10, 3)", 10, 3), + ("decimal128(38, 0)", 38, 0), + ("decimal(5, 2)", 5, 2), + ("DECIMAL128(18, 6)", 18, 6), + ], +) +def test_dtype_from_hotdata_arrow_decimal(sql_type, expected_precision, expected_scale): + out = dtype_from_hotdata_sql_type(sql_type, nullable=True) + assert isinstance(out, dt.Decimal) + assert out.precision == expected_precision + assert out.scale == expected_scale + assert out.nullable is True + + +@pytest.mark.parametrize( + ("sql_type", "expected_value_cls"), + [ + ("list", dt.Int32), + ("list", dt.String), + ("list", dt.Float64), + ("large_list", dt.Int64), + ("LIST", dt.UInt8), + ], +) +def test_dtype_from_hotdata_arrow_list(sql_type, expected_value_cls): + out = dtype_from_hotdata_sql_type(sql_type, nullable=True) + assert isinstance(out, dt.Array) + assert isinstance(out.value_type, expected_value_cls) + assert out.nullable is True From bddb9f2e372841434b081aadf3331fd3e5bc2864 Mon Sep 17 00:00:00 2001 From: Eddie A Tejeda <669988+eddietejeda@users.noreply.github.com> Date: Tue, 26 May 2026 14:18:02 -0700 Subject: [PATCH 2/7] fix: preserve timestamp scale and handle non-null list item types Two bugs found by Codex review: 1. Timestamp scale was discarded: Arrow timestamp[ms] should map to dt.Timestamp(scale=3), not dt.Timestamp(). Add unit-to-scale mapping s=0, ms=3, us=6, ns=9, matching PyArrow convention. 2. Non-nullable list items mis-parsed: PyArrow emits 'list' for non-nullable item fields. Strip the ' not null' suffix and pass nullable=False to the recursive call so the element type is correctly typed instead of falling back to Unknown. --- src/ibis_hotdata/types.py | 23 +++++++++++++++++++--- tests/test_hotdata_types.py | 38 +++++++++++++++++++++---------------- 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/src/ibis_hotdata/types.py b/src/ibis_hotdata/types.py index 832eecd..bcfee64 100644 --- a/src/ibis_hotdata/types.py +++ b/src/ibis_hotdata/types.py @@ -45,13 +45,23 @@ _DECIMAL_RE = re.compile(r"^decimal128?\((\d+),\s*(\d+)\)$", re.IGNORECASE) _LIST_RE = re.compile(r"^(?:large_)?list$", re.IGNORECASE) -# Map Arrow time-unit strings to Ibis IntervalUnit strings. +# Map Arrow time-unit strings to Ibis IntervalUnit strings and Timestamp scales. +# Scales follow PyArrow's convention: s→0, ms→3, us→6, ns→9. _ARROW_UNIT_TO_IBIS: dict[str, str] = { "s": "s", "ms": "ms", "us": "us", "ns": "ns", } +_ARROW_UNIT_TO_TIMESTAMP_SCALE: dict[str, int] = { + "s": 0, + "ms": 3, + "us": 6, + "ns": 9, +} + +# Suffix appended by PyArrow when a list's item field is non-nullable. +_NOT_NULL_SUFFIX_RE = re.compile(r"\s+not\s+null$", re.IGNORECASE) def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | None: @@ -62,8 +72,10 @@ def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | N """ m = _TIMESTAMP_RE.match(raw) if m: + unit = m.group(1).lower() tz: str | None = m.group(2).strip() if m.group(2) else None - return dt.Timestamp(timezone=tz, nullable=nullable) + scale: int | None = _ARROW_UNIT_TO_TIMESTAMP_SCALE.get(unit) + return dt.Timestamp(timezone=tz, scale=scale, nullable=nullable) m = _DURATION_RE.match(raw) if m: @@ -76,7 +88,12 @@ def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | N m = _LIST_RE.match(raw) if m: - value_type = dtype_from_hotdata_sql_type(m.group(1).strip(), nullable=True) + item_raw = m.group(1).strip() + # PyArrow appends " not null" for non-nullable item fields; strip it and + # pass nullable=False so the element type is marked non-nullable. + item_not_null = bool(_NOT_NULL_SUFFIX_RE.search(item_raw)) + item_str = _NOT_NULL_SUFFIX_RE.sub("", item_raw).strip() + value_type = dtype_from_hotdata_sql_type(item_str, nullable=not item_not_null) return dt.Array(value_type=value_type, nullable=nullable) return None diff --git a/tests/test_hotdata_types.py b/tests/test_hotdata_types.py index 5286da6..699ce25 100644 --- a/tests/test_hotdata_types.py +++ b/tests/test_hotdata_types.py @@ -77,21 +77,22 @@ def test_dtype_from_hotdata_arrow_type_names(sql_type, nullable, expected_cls): @pytest.mark.parametrize( - ("sql_type", "expected_tz"), + ("sql_type", "expected_tz", "expected_scale"), [ - ("timestamp[s]", None), - ("timestamp[ms]", None), - ("timestamp[us]", None), - ("timestamp[ns]", None), - ("timestamp[us, tz=UTC]", "UTC"), - ("timestamp[us, tz=America/New_York]", "America/New_York"), - ("TIMESTAMP[US]", None), + ("timestamp[s]", None, 0), + ("timestamp[ms]", None, 3), + ("timestamp[us]", None, 6), + ("timestamp[ns]", None, 9), + ("timestamp[us, tz=UTC]", "UTC", 6), + ("timestamp[us, tz=America/New_York]", "America/New_York", 6), + ("TIMESTAMP[MS]", None, 3), ], ) -def test_dtype_from_hotdata_arrow_timestamp(sql_type, expected_tz): +def test_dtype_from_hotdata_arrow_timestamp(sql_type, expected_tz, expected_scale): out = dtype_from_hotdata_sql_type(sql_type, nullable=True) assert isinstance(out, dt.Timestamp) assert out.timezone == expected_tz + assert out.scale == expected_scale assert out.nullable is True @@ -130,17 +131,22 @@ def test_dtype_from_hotdata_arrow_decimal(sql_type, expected_precision, expected @pytest.mark.parametrize( - ("sql_type", "expected_value_cls"), + ("sql_type", "expected_value_cls", "expected_item_nullable"), [ - ("list", dt.Int32), - ("list", dt.String), - ("list", dt.Float64), - ("large_list", dt.Int64), - ("LIST", dt.UInt8), + ("list", dt.Int32, True), + ("list", dt.String, True), + ("list", dt.Float64, True), + ("large_list", dt.Int64, True), + ("LIST", dt.UInt8, True), + # Non-nullable item fields — PyArrow appends " not null" + ("list", dt.Int32, False), + ("list", dt.String, False), + ("large_list", dt.Float32, False), ], ) -def test_dtype_from_hotdata_arrow_list(sql_type, expected_value_cls): +def test_dtype_from_hotdata_arrow_list(sql_type, expected_value_cls, expected_item_nullable): out = dtype_from_hotdata_sql_type(sql_type, nullable=True) assert isinstance(out, dt.Array) assert isinstance(out.value_type, expected_value_cls) + assert out.value_type.nullable is expected_item_nullable assert out.nullable is True From 1379aa0d3eab0bc37d8d0922287f0289ed29508d Mon Sep 17 00:00:00 2001 From: Eddie A Tejeda <669988+eddietejeda@users.noreply.github.com> Date: Tue, 26 May 2026 14:20:44 -0700 Subject: [PATCH 3/7] fix: tighten decimal regex and fall through on unknown duration unit Two nits from PR review: - decimal regex 'decimal128?' made only the trailing 8 optional, so it matched the non-existent 'decimal12' form. Changed to 'decimal(?:128|256)?' to accept decimal, decimal128, or decimal256. - _ARROW_UNIT_TO_IBIS.get(..., 's') silently mapped any unrecognised duration unit (e.g. duration[foo]) to Interval(s). Now returns None so the caller falls through to the Postgres parser / String fallback. --- src/ibis_hotdata/types.py | 6 ++++-- tests/test_hotdata_types.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/ibis_hotdata/types.py b/src/ibis_hotdata/types.py index bcfee64..45783a1 100644 --- a/src/ibis_hotdata/types.py +++ b/src/ibis_hotdata/types.py @@ -42,7 +42,7 @@ # embedded parameters (unit, timezone, precision, value type, …). _TIMESTAMP_RE = re.compile(r"^timestamp\[(\w+)(?:,\s*tz=(.+))?\]$", re.IGNORECASE) _DURATION_RE = re.compile(r"^duration\[(\w+)\]$", re.IGNORECASE) -_DECIMAL_RE = re.compile(r"^decimal128?\((\d+),\s*(\d+)\)$", re.IGNORECASE) +_DECIMAL_RE = re.compile(r"^decimal(?:128|256)?\((\d+),\s*(\d+)\)$", re.IGNORECASE) _LIST_RE = re.compile(r"^(?:large_)?list$", re.IGNORECASE) # Map Arrow time-unit strings to Ibis IntervalUnit strings and Timestamp scales. @@ -79,7 +79,9 @@ def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | N m = _DURATION_RE.match(raw) if m: - unit = _ARROW_UNIT_TO_IBIS.get(m.group(1).lower(), "s") + unit = _ARROW_UNIT_TO_IBIS.get(m.group(1).lower()) + if unit is None: + return None # unrecognised unit — fall through to Postgres parser / String fallback return dt.Interval(unit=unit, nullable=nullable) m = _DECIMAL_RE.match(raw) diff --git a/tests/test_hotdata_types.py b/tests/test_hotdata_types.py index 699ce25..93e9cc8 100644 --- a/tests/test_hotdata_types.py +++ b/tests/test_hotdata_types.py @@ -113,13 +113,22 @@ def test_dtype_from_hotdata_arrow_duration(sql_type, expected_unit): assert out.nullable is False +def test_dtype_from_hotdata_arrow_duration_unknown_unit_falls_back(): + # An unrecognised duration unit should not silently map to seconds; + # it falls through to the Postgres parser (which returns Unknown) or String fallback. + out = dtype_from_hotdata_sql_type("duration[foo]", nullable=True) + assert not isinstance(out, dt.Interval) # must not produce a valid Interval + + @pytest.mark.parametrize( ("sql_type", "expected_precision", "expected_scale"), [ ("decimal128(10, 3)", 10, 3), ("decimal128(38, 0)", 38, 0), + ("decimal256(76, 38)", 76, 38), ("decimal(5, 2)", 5, 2), ("DECIMAL128(18, 6)", 18, 6), + # decimal12 is NOT a valid form — should not be matched by the decimal regex ], ) def test_dtype_from_hotdata_arrow_decimal(sql_type, expected_precision, expected_scale): @@ -150,3 +159,10 @@ def test_dtype_from_hotdata_arrow_list(sql_type, expected_value_cls, expected_it assert isinstance(out.value_type, expected_value_cls) assert out.value_type.nullable is expected_item_nullable assert out.nullable is True + + +def test_dtype_from_hotdata_arrow_decimal12_not_matched(): + # "decimal12" (only the trailing 8 made optional) must NOT match the decimal regex. + # The Postgres parser handles bare "decimal" forms; decimal12 is not a real type. + out = dtype_from_hotdata_sql_type("decimal12(10, 3)", nullable=True) + assert not isinstance(out, dt.Decimal) # falls through to Unknown or String From f25b43b5befdbf86c9d9ba6cd1dc077ad47b58fe Mon Sep 17 00:00:00 2001 From: Eddie A Tejeda <669988+eddietejeda@users.noreply.github.com> Date: Tue, 26 May 2026 14:30:45 -0700 Subject: [PATCH 4/7] refactor: replace manual Ibis type construction with PyArrow bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of constructing Ibis types directly (with hand-coded unit->scale mappings etc.), resolve Arrow type strings to PyArrow types first and convert via ibis.formats.pyarrow.PyArrowType.to_ibis(). - Replace _ARROW_TYPE_MAP (str -> Ibis class) with _PA_TYPE_MAP (str -> pa.DataType instance), covering all scalar types that can appear as list element types - Merge _parse_parametric_arrow_type() into _pa_type_from_arrow_str() which returns a pa.DataType or None - Remove _ARROW_UNIT_TO_IBIS and _ARROW_UNIT_TO_TIMESTAMP_SCALE — PyArrow encodes unit/scale semantics natively - Fix decimal256: pa.decimal128 rejects precision > 38; fall back to pa.decimal256 for wider values - dtype_from_hotdata_sql_type() simplified to: try Arrow path via PyArrow bridge, then Postgres parser, then String fallback --- src/ibis_hotdata/types.py | 177 +++++++++++++++++++++----------------- 1 file changed, 96 insertions(+), 81 deletions(-) diff --git a/src/ibis_hotdata/types.py b/src/ibis_hotdata/types.py index 45783a1..d64e99c 100644 --- a/src/ibis_hotdata/types.py +++ b/src/ibis_hotdata/types.py @@ -4,127 +4,142 @@ import re +import pyarrow as pa import ibis.expr.datatypes as dt from ibis.backends.sql.datatypes import PostgresType - -# Arrow-style type names returned by Hotdata's information_schema when tables are -# loaded from Parquet/Arrow sources. PostgresType.from_string() treats these as -# USERDEFINED unknowns, so we resolve them explicitly before falling through. -_ARROW_TYPE_MAP: dict[str, type[dt.DataType]] = { +from ibis.formats.pyarrow import PyArrowType + +# Simple Arrow type strings → PyArrow instances. Covers non-parametric types +# that the Postgres dialect parser does not know (Arrow-specific names, unsigned +# ints) or mis-maps (Arrow "int8" = 8-bit; Postgres "int8" = 8-byte / int64). +# All scalar types that can appear as list/map element types must be listed here +# because element type strings are resolved via this map, not the Postgres parser. +_PA_TYPE_MAP: dict[str, pa.DataType] = { # dates - "date32": dt.Date, - "date64": dt.Date, + "date32": pa.date32(), + "date64": pa.date64(), # floats — "halffloat" is PyArrow's str() name for float16 - "float16": dt.Float16, - "float32": dt.Float32, - "float64": dt.Float64, - "halffloat": dt.Float16, - # signed ints — must override Postgres parser: Postgres "int8" means 8-byte (64-bit), - # but Arrow "int8" means 8-bit. int16/32/64 parse correctly via Postgres. - "int8": dt.Int8, - # unsigned ints - "uint8": dt.UInt8, - "uint16": dt.UInt16, - "uint32": dt.UInt32, - "uint64": dt.UInt64, - # strings — "large_string" / "largeutf8" are PyArrow large-offset variants - "utf8": dt.String, - "largeutf8": dt.String, - "large_string": dt.String, + "float16": pa.float16(), + "float32": pa.float32(), + "float64": pa.float64(), + "halffloat": pa.float16(), + # signed ints — Arrow "int8" ≠ Postgres "int8" (8-byte/int64); all four + # listed here so they resolve correctly when used as list element types + "int8": pa.int8(), + "int16": pa.int16(), + "int32": pa.int32(), + "int64": pa.int64(), + # unsigned ints (Postgres parser returns Unknown for all of these) + "uint8": pa.uint8(), + "uint16": pa.uint16(), + "uint32": pa.uint32(), + "uint64": pa.uint64(), + # strings — large-offset variants not known to the Postgres parser + "utf8": pa.utf8(), + "largeutf8": pa.large_utf8(), + "large_string": pa.large_utf8(), + "string": pa.string(), # binary - "largebinary": dt.Binary, - # time - "time32": dt.Time, - "time64": dt.Time, + "binary": pa.binary(), + "largebinary": pa.large_binary(), + # boolean / null + "bool": pa.bool_(), + "boolean": pa.bool_(), + "null": pa.null(), + # time — unit is absent from these bare string forms; the unit does not + # affect the Ibis type (both time32 and time64 map to dt.Time) + "time32": pa.time32("ms"), + "time64": pa.time64("us"), } -# Regex patterns for Arrow parametric types whose string representation includes -# embedded parameters (unit, timezone, precision, value type, …). +# Regex patterns for parametric Arrow types that embed parameters in the string. _TIMESTAMP_RE = re.compile(r"^timestamp\[(\w+)(?:,\s*tz=(.+))?\]$", re.IGNORECASE) _DURATION_RE = re.compile(r"^duration\[(\w+)\]$", re.IGNORECASE) _DECIMAL_RE = re.compile(r"^decimal(?:128|256)?\((\d+),\s*(\d+)\)$", re.IGNORECASE) -_LIST_RE = re.compile(r"^(?:large_)?list$", re.IGNORECASE) - -# Map Arrow time-unit strings to Ibis IntervalUnit strings and Timestamp scales. -# Scales follow PyArrow's convention: s→0, ms→3, us→6, ns→9. -_ARROW_UNIT_TO_IBIS: dict[str, str] = { - "s": "s", - "ms": "ms", - "us": "us", - "ns": "ns", -} -_ARROW_UNIT_TO_TIMESTAMP_SCALE: dict[str, int] = { - "s": 0, - "ms": 3, - "us": 6, - "ns": 9, -} - -# Suffix appended by PyArrow when a list's item field is non-nullable. +_LIST_RE = re.compile(r"^(large_)?list$", re.IGNORECASE) +# PyArrow appends " not null" when a list's item field is non-nullable. _NOT_NULL_SUFFIX_RE = re.compile(r"\s+not\s+null$", re.IGNORECASE) -def _parse_parametric_arrow_type(raw: str, *, nullable: bool) -> dt.DataType | None: - """Try to parse an Arrow parametric type string into an Ibis DataType. +def _pa_type_from_arrow_str(raw: str) -> pa.DataType | None: + """Best-effort: Arrow type string → PyArrow DataType, or ``None`` if not recognised. - Returns ``None`` if ``raw`` does not match any known parametric pattern, - allowing the caller to fall through to the Postgres dialect parser. + Handles simple names (via ``_PA_TYPE_MAP``) and parametric forms + (timestamp, duration, decimal, list/large_list). Returns ``None`` if the + string is not a known Arrow type, allowing the caller to fall through to the + Postgres dialect parser or String fallback. """ - m = _TIMESTAMP_RE.match(raw) + s = raw.strip() + + # Simple non-parametric types. + pa_type = _PA_TYPE_MAP.get(s.lower()) + if pa_type is not None: + return pa_type + + # timestamp[unit] or timestamp[unit, tz=…] + m = _TIMESTAMP_RE.match(s) if m: unit = m.group(1).lower() tz: str | None = m.group(2).strip() if m.group(2) else None - scale: int | None = _ARROW_UNIT_TO_TIMESTAMP_SCALE.get(unit) - return dt.Timestamp(timezone=tz, scale=scale, nullable=nullable) + try: + return pa.timestamp(unit, tz=tz) + except Exception: + return None - m = _DURATION_RE.match(raw) + # duration[unit] — unknown units return None so the caller falls through + m = _DURATION_RE.match(s) if m: - unit = _ARROW_UNIT_TO_IBIS.get(m.group(1).lower()) - if unit is None: - return None # unrecognised unit — fall through to Postgres parser / String fallback - return dt.Interval(unit=unit, nullable=nullable) + try: + return pa.duration(m.group(1).lower()) + except Exception: + return None - m = _DECIMAL_RE.match(raw) + # decimal / decimal128 / decimal256 + m = _DECIMAL_RE.match(s) if m: - return dt.Decimal(precision=int(m.group(1)), scale=int(m.group(2)), nullable=nullable) - - m = _LIST_RE.match(raw) + precision, scale = int(m.group(1)), int(m.group(2)) + try: + # decimal128 supports precision 1–38; fall back to decimal256 for wider values + return pa.decimal128(precision, scale) if precision <= 38 else pa.decimal256(precision, scale) + except Exception: + return None + + # list or large_list (recursive for nested types) + m = _LIST_RE.match(s) if m: - item_raw = m.group(1).strip() - # PyArrow appends " not null" for non-nullable item fields; strip it and - # pass nullable=False so the element type is marked non-nullable. + is_large = bool(m.group(1)) + item_raw = m.group(2).strip() item_not_null = bool(_NOT_NULL_SUFFIX_RE.search(item_raw)) item_str = _NOT_NULL_SUFFIX_RE.sub("", item_raw).strip() - value_type = dtype_from_hotdata_sql_type(item_str, nullable=not item_not_null) - return dt.Array(value_type=value_type, nullable=nullable) + item_pa_type = _pa_type_from_arrow_str(item_str) + if item_pa_type is None: + return None + item_field = pa.field("item", item_pa_type, nullable=not item_not_null) + return pa.large_list(item_field) if is_large else pa.list_(item_field) return None def dtype_from_hotdata_sql_type(sql_type: str | None, *, nullable: bool) -> dt.DataType: - """Best-effort mapping from Hotdata `/information_schema` column `data_type` strings. + """Best-effort mapping from Hotdata ``/information_schema`` column ``data_type`` strings. Hotdata may return either SQL-style names (``INTEGER``, ``VARCHAR``, ``DOUBLE PRECISION``, …) or Arrow-style names (``Date32``, ``Float64``, ``Utf8``, …). - SQL-style names are delegated to the Postgres dialect parser; Arrow-style names - are resolved via an explicit lookup table or parametric pattern before falling - back to the parser. + Arrow-style names are resolved via PyArrow's type system and converted to Ibis + types using the Ibis–PyArrow bridge; SQL-style names fall through to the Postgres + dialect parser. """ if not sql_type: return dt.String(nullable=nullable) raw = sql_type.strip() - # Arrow-style names (case-insensitive lookup). - arrow_cls = _ARROW_TYPE_MAP.get(raw.lower()) - if arrow_cls is not None: - return arrow_cls(nullable=nullable) - - # Arrow parametric types (timestamp[us], duration[ms], decimal128(p,s), list<…>). - parametric = _parse_parametric_arrow_type(raw, nullable=nullable) - if parametric is not None: - return parametric + # Try to parse as an Arrow type string (simple or parametric). + pa_type = _pa_type_from_arrow_str(raw) + if pa_type is not None: + return PyArrowType.to_ibis(pa_type).copy(nullable=nullable) + # Fall through to Postgres dialect parser for SQL-style type names. try: return PostgresType.from_string(raw, nullable=nullable) except Exception: # ibis/sqlglot raise a variety of parse errors; fall back to String From 80c82ed5c711b85a377d0ad25406e4106bea8473 Mon Sep 17 00:00:00 2001 From: Eddie A Tejeda <669988+eddietejeda@users.noreply.github.com> Date: Tue, 26 May 2026 14:50:37 -0700 Subject: [PATCH 5/7] refactor: clean up code quality issues - backend.py: replace walrus-then-overwrite pattern with plain assignments in _to_catalog_db_tuple - backend.py: getattr(self, '_http', None) is not None -> hasattr(self, '_http') - backend.py: add explanatory comments to the silent pass in _resolve_database_connection_id and the no-op _register_in_memory_table - http.py: replace magic HTTP status integers with http.HTTPStatus constants - http.py: replace range(len(columns)) index loop with direct field iteration - types.py: cache m.group(2) to avoid calling it twice - Remove managed.py (single-constant module); inline 'public' directly at the two call sites in backend.py and http.py --- src/ibis_hotdata/backend.py | 15 ++++++++------- src/ibis_hotdata/http.py | 19 +++++++++---------- src/ibis_hotdata/managed.py | 5 ----- src/ibis_hotdata/types.py | 3 ++- 4 files changed, 19 insertions(+), 23 deletions(-) delete mode 100644 src/ibis_hotdata/managed.py diff --git a/src/ibis_hotdata/backend.py b/src/ibis_hotdata/backend.py index cccc6cf..c09b0ea 100644 --- a/src/ibis_hotdata/backend.py +++ b/src/ibis_hotdata/backend.py @@ -43,7 +43,6 @@ from ibis.backends.sql import SQLBackend from ibis_hotdata.http import HotdataAPIError, HotdataClient -from ibis_hotdata.managed import DEFAULT_SCHEMA from ibis_hotdata.types import dtype_from_hotdata_sql_type _INFORMATION_SCHEMA_PAGE_SIZE = 500 @@ -203,7 +202,7 @@ def do_connect( ) def disconnect(self) -> None: - if getattr(self, "_http", None) is not None: + if hasattr(self, "_http"): self._http.close() # --- hierarchy --------------------------------------------------------- @@ -253,10 +252,12 @@ def _to_catalog_db_tuple(self, table_loc: sge.Table): """Use the compiler SQL dialect when stringifying qualifiers (backend name is not a dialect).""" dialect = self.dialect - if (sg_cat := table_loc.args["catalog"]) is not None: + sg_cat = table_loc.args["catalog"] + if sg_cat is not None: sg_cat.args["quoted"] = False sg_cat = sg_cat.sql(dialect=dialect) - if (sg_db := table_loc.args["db"]) is not None: + sg_db = table_loc.args["db"] + if sg_db is not None: sg_db.args["quoted"] = False sg_db = sg_db.sql(dialect=dialect) @@ -429,7 +430,7 @@ def _resolve_database_connection_id(self) -> str | None: db = self._http.get_database(self._database_id) self._database_connection_id = db.get("default_connection_id") except HotdataAPIError: - pass + pass # best-effort: if the lookup fails, callers fall back to the catalog name return self._database_connection_id # --- schema / sql execution -------------------------------------------- @@ -575,7 +576,7 @@ def create_database( /, *, catalog: str | None = None, - schema: str = DEFAULT_SCHEMA, + schema: str = "public", tables: Sequence[str] | None = None, force: bool = False, ) -> None: @@ -722,7 +723,7 @@ def drop_table( raise _ibis_err_from_hotdata(exc) from exc def _register_in_memory_table(self, _op: ops.InMemoryTable) -> None: - return + pass # Hotdata has no local in-memory table concept; Ibis calls this hook before execute @cached_property def version(self) -> str: diff --git a/src/ibis_hotdata/http.py b/src/ibis_hotdata/http.py index 3ffec5a..a71b07b 100644 --- a/src/ibis_hotdata/http.py +++ b/src/ibis_hotdata/http.py @@ -2,6 +2,7 @@ from __future__ import annotations +import http import io import json import time @@ -30,8 +31,6 @@ from hotdata.models.database_default_table_decl import DatabaseDefaultTableDecl from hotdata.models.load_managed_table_request import LoadManagedTableRequest -from ibis_hotdata.managed import DEFAULT_SCHEMA - T = TypeVar("T") # Matches Hotdata / runtimedb ``GET /v1/results/{{id}}`` Arrow responses. @@ -197,7 +196,7 @@ def create_managed_database( self, description: str | None = None, *, - schema: str = DEFAULT_SCHEMA, + schema: str = "public", tables: Sequence[str] = (), ) -> dict[str, Any]: """POST ``/v1/databases`` — creates a managed database with an auto-provisioned default catalog.""" @@ -264,27 +263,27 @@ def _poll_result_arrow( status = raw.status ctype = (raw.headers.get("Content-Type") or "").split(";")[0].strip().lower() - if status == 200 and ctype == APPLICATION_ARROW_STREAM.lower(): + if status == http.HTTPStatus.OK and ctype == APPLICATION_ARROW_STREAM.lower(): table = _ipc_stream_bytes_to_table(body) return self._arrow_payload_from_table(table, result_id=result_id) - if status == 202: + if status == http.HTTPStatus.ACCEPTED: _sleep_until(deadline, poll_interval_s) continue - if status == 409: + if status == http.HTTPStatus.CONFLICT: d = _json_utf8(body) if body else {} raise HotdataAPIError( d.get("error_message") or "Result failed", - status_code=409, + status_code=http.HTTPStatus.CONFLICT, body=d, ) - if status == 404: + if status == http.HTTPStatus.NOT_FOUND: d = _json_utf8(body) if body else {} raise HotdataAPIError( d.get("detail") or f"Result {result_id!r} not found", - status_code=404, + status_code=http.HTTPStatus.NOT_FOUND, body=d, ) @@ -304,7 +303,7 @@ def _arrow_payload_from_table( ) -> dict[str, Any]: sch = table.schema columns = sch.names - nullable = [sch.field(i).nullable for i in range(len(columns))] + nullable = [field.nullable for field in sch] return { "format": "arrow", "pa_table": table, diff --git a/src/ibis_hotdata/managed.py b/src/ibis_hotdata/managed.py deleted file mode 100644 index 972c366..0000000 --- a/src/ibis_hotdata/managed.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Helpers for Hotdata managed databases.""" - -from __future__ import annotations - -DEFAULT_SCHEMA = "public" diff --git a/src/ibis_hotdata/types.py b/src/ibis_hotdata/types.py index d64e99c..7ddaa49 100644 --- a/src/ibis_hotdata/types.py +++ b/src/ibis_hotdata/types.py @@ -80,7 +80,8 @@ def _pa_type_from_arrow_str(raw: str) -> pa.DataType | None: m = _TIMESTAMP_RE.match(s) if m: unit = m.group(1).lower() - tz: str | None = m.group(2).strip() if m.group(2) else None + tz_raw = m.group(2) + tz: str | None = tz_raw.strip() if tz_raw else None try: return pa.timestamp(unit, tz=tz) except Exception: From 328185247c24a13bc8029d5d5dec2246f0cdfdf5 Mon Sep 17 00:00:00 2001 From: Eddie A Tejeda <669988+eddietejeda@users.noreply.github.com> Date: Tue, 26 May 2026 14:54:37 -0700 Subject: [PATCH 6/7] docs: update README to reflect current implementation - Correct ibis-framework version requirement to >=10,<11 - Document all connect() optional parameters with inline comments - Add URL query string example with optional parameters - Document schema-only create_table (empty table from schema) - Document force=True on drop operations - Note to_pyarrow_batches() downloads full result then splits locally - Add in-memory tables (unsupported) and Arrow type mapping to feature table --- README.md | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8b836e0..e8e1e5a 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Use [Ibis](https://ibis-project.org/) to create on-demand databases, upload data, and query with Python expressions — get pandas or Arrow results back without writing SQL. -**Requirements:** Python 3.10+, **ibis-framework** 10.x, **hotdata** ≥0.2.3. +**Requirements:** Python 3.10+, **ibis-framework** ≥10,<11, **hotdata** ≥0.2.3. ## Install @@ -60,13 +60,26 @@ con = ibis.hotdata.connect( api_url="https://api.hotdata.dev", token="YOUR_API_KEY", workspace_id="ws_...", + # optional + session_id=None, # sandbox id (X-Session-Id header) + timeout=120.0, # per-request HTTP timeout in seconds + verify_ssl=True, # False to skip TLS verification, or path to CA bundle + default_connection=None, # default catalog (connection id); auto-detected if only one exists + default_schema=None, # default schema; auto-detected if only one exists + database_id=None, # bind an existing managed database id at connect time + poll_interval_s=0.25, # polling interval for async queries + poll_timeout_s=600.0, # max time to wait for a query result ) ``` -URL-style also works: +URL-style also works, with the same parameters as query string keys: ```python -con = ibis.connect("hotdata://api.hotdata.dev/?token=...&workspace_id=ws_...") +con = ibis.connect( + "hotdata://api.hotdata.dev/" + "?token=...&workspace_id=ws_..." + "&default_connection=my_conn&default_schema=public" +) ``` ## Managed databases @@ -86,9 +99,17 @@ con.create_table("events", events_df, database=("analytics", "public"), overwrit import pyarrow as pa table = pa.table({"id": [1, 2], "name": ["alice", "bob"]}) con.create_table("users", table, database=("analytics", "public"), overwrite=True) + +# Schema-only (no data): creates an empty table with the declared schema +import ibis.expr.schema as sch +con.create_table( + "staging", + schema=sch.Schema({"id": "int64", "ts": "timestamp"}), + database=("analytics", "public"), +) ``` -Table names must be declared when the database is created — you cannot add new table names later without recreating the database. +Table names must be declared when the database is created — you cannot upload to a table name that was not listed in `tables=`. ### Query @@ -118,9 +139,14 @@ result = con.sql( ### Delete +Pass `force=True` to silently skip errors when the database or table does not exist: + ```python con.drop_table("events", database=("analytics", "public")) +con.drop_table("events", database=("analytics", "public"), force=True) # no-op if missing + con.drop_database("analytics") +con.drop_database("analytics", force=True) # no-op if missing ``` ### Addressing summary @@ -146,7 +172,7 @@ summary = ( ) ``` -`.execute()` returns a **pandas DataFrame**. Use `.to_pyarrow()` for an Arrow table or `.to_pyarrow_batches()` to stream batches without materializing the full result. +`.execute()` returns a **pandas DataFrame**. `.to_pyarrow()` returns an Arrow table. `.to_pyarrow_batches()` returns a `RecordBatchReader` — note that Hotdata returns a single Arrow IPC payload per query, so this method downloads the full result first and then splits it into local batches. ### Raw SQL @@ -189,17 +215,20 @@ con.list_tables(database=("my_postgres", "public")) # tables | Feature | Status | |---------|--------| | `create_database` / `drop_database` (managed) | ✅ | -| `create_table` / `drop_table` (DataFrame or Arrow upload) | ✅ | +| `create_table` from pandas / PyArrow / schema-only | ✅ | +| `drop_table` | ✅ | | `con.table(...)` with full schema metadata | ✅ | | Ibis expressions: filter, select, join, group\_by, agg, order\_by, limit | ✅ | | `con.sql(...)` raw SQL | ✅ | | `.execute()` → pandas, `.to_pyarrow()`, `.to_pyarrow_batches()` | ✅ | | `list_catalogs`, `list_databases`, `list_tables` | ✅ | +| Arrow / Parquet column types (timestamp, decimal, list, duration, …) | ✅ | | Temporary tables | ❌ | +| In-memory tables (`ibis.memtable(...)`) | ❌ | | Python UDFs | ❌ | | INSERT / UPDATE / DELETE on external connections | ❌ | -SQL compilation uses Ibis's Postgres dialect. Use `con.sql(...)` as a fallback for expressions that don't compile cleanly. +SQL compilation uses Ibis's Postgres dialect. Column types returned by Hotdata's information schema are resolved via PyArrow's type system, so Parquet-loaded tables with Arrow-native types (timestamps with time zones, decimals, lists, durations) are mapped correctly to Ibis types. ## Development From c76cc61a3d1ea051ecb4ae08f15eead4f18ce538 Mon Sep 17 00:00:00 2001 From: Eddie A Tejeda <669988+eddietejeda@users.noreply.github.com> Date: Tue, 26 May 2026 15:07:52 -0700 Subject: [PATCH 7/7] fix: align default schema and query run statuses with runtimedb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Change default schema public to main to match runtimedb DEFAULT_SCHEMA_NAME; runtimedb auto-inserts main into every managed database, so using public silently created a spurious empty main schema alongside the declared one - Trim _IN_FLIGHT to {running} — runtimedb QueryRunStatus only emits running, succeeded, and failed; queued and pending are result statuses - Update README examples to use main schema throughout --- README.md | 28 ++++++++++++++-------------- src/ibis_hotdata/backend.py | 2 +- src/ibis_hotdata/http.py | 3 ++- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index e8e1e5a..bc5760a 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ con = ibis.hotdata.connect( ) # 1. Create a database and declare the tables you'll load -con.create_database("sales", schema="public", tables=["orders"]) +con.create_database("sales", tables=["orders"]) # 2. Upload a pandas DataFrame (or PyArrow table) df = pd.DataFrame({ @@ -33,14 +33,14 @@ df = pd.DataFrame({ "amount": [9.99, 49.99, 5.00], "region": ["west", "east", "west"], }) -con.create_table("orders", df, database=("sales", "public"), overwrite=True) +con.create_table("orders", df, database=("sales", "main"), overwrite=True) # 3. Uploads are async — wait briefly before querying time.sleep(2) # 4. Query with Ibis expressions # Managed tables are always accessed with catalog "default" -t = con.table("orders", database=("default", "public")) +t = con.table("orders", database=("default", "main")) result = ( t.group_by("region") .agg(total=t.amount.sum()) @@ -49,7 +49,7 @@ result = ( ) # 5. Clean up -con.drop_table("orders", database=("sales", "public")) +con.drop_table("orders", database=("sales", "main")) con.drop_database("sales") ``` @@ -90,22 +90,22 @@ Managed databases are the primary way to bring data into Hotdata with Ibis. Decl ```python # Declare the database and all table names up front -con.create_database("analytics", schema="public", tables=["events", "users"]) +con.create_database("analytics", tables=["events", "users"]) # Upload from a pandas DataFrame -con.create_table("events", events_df, database=("analytics", "public"), overwrite=True) +con.create_table("events", events_df, database=("analytics", "main"), overwrite=True) # PyArrow tables also work import pyarrow as pa table = pa.table({"id": [1, 2], "name": ["alice", "bob"]}) -con.create_table("users", table, database=("analytics", "public"), overwrite=True) +con.create_table("users", table, database=("analytics", "main"), overwrite=True) # Schema-only (no data): creates an empty table with the declared schema import ibis.expr.schema as sch con.create_table( "staging", schema=sch.Schema({"id": "int64", "ts": "timestamp"}), - database=("analytics", "public"), + database=("analytics", "main"), ) ``` @@ -116,7 +116,7 @@ Table names must be declared when the database is created — you cannot upload When querying, use `"default"` as the catalog: ```python -t = con.table("events", database=("default", "public")) +t = con.table("events", database=("default", "main")) result = ( t.filter(t.event_type == "click") @@ -131,7 +131,7 @@ Or with raw SQL: ```python result = con.sql( 'SELECT user_id, COUNT(*) AS n ' - 'FROM "default"."public"."events" ' + 'FROM "default"."main"."events" ' 'WHERE event_type = \'click\' ' 'GROUP BY user_id' ).execute() @@ -142,8 +142,8 @@ result = con.sql( Pass `force=True` to silently skip errors when the database or table does not exist: ```python -con.drop_table("events", database=("analytics", "public")) -con.drop_table("events", database=("analytics", "public"), force=True) # no-op if missing +con.drop_table("events", database=("analytics", "main")) +con.drop_table("events", database=("analytics", "main"), force=True) # no-op if missing con.drop_database("analytics") con.drop_database("analytics", force=True) # no-op if missing @@ -161,7 +161,7 @@ con.drop_database("analytics", force=True) # no-op if missing ### Ibis expressions ```python -t = con.table("orders", database=("default", "public")) +t = con.table("orders", database=("default", "main")) summary = ( t.filter(t.amount > 10) @@ -178,7 +178,7 @@ summary = ( ```python base = con.sql( - 'SELECT * FROM "default"."public"."orders"', + 'SELECT * FROM "default"."main"."orders"', dialect="postgres", ) result = base.filter(base.amount > 10).execute() diff --git a/src/ibis_hotdata/backend.py b/src/ibis_hotdata/backend.py index c09b0ea..cfdb1d4 100644 --- a/src/ibis_hotdata/backend.py +++ b/src/ibis_hotdata/backend.py @@ -576,7 +576,7 @@ def create_database( /, *, catalog: str | None = None, - schema: str = "public", + schema: str = "main", tables: Sequence[str] | None = None, force: bool = False, ) -> None: diff --git a/src/ibis_hotdata/http.py b/src/ibis_hotdata/http.py index a71b07b..bd51cd9 100644 --- a/src/ibis_hotdata/http.py +++ b/src/ibis_hotdata/http.py @@ -37,7 +37,8 @@ APPLICATION_ARROW_STREAM = "application/vnd.apache.arrow.stream" # Statuses that mean the query run is still in progress. -_IN_FLIGHT = {"running", "queued", "pending"} +# runtimedb QueryRunStatus only emits "running", "succeeded", "failed". +_IN_FLIGHT = {"running"} def _sleep_until(deadline: float, interval: float) -> None: