Skip to content

Commit ba2eb84

Browse files
committed
[python] Reject column type casts the read path cannot execute
supports_cast only encodes the logical cast specification (mirroring Java DataTypeCasts), so casts with no runtime implementation -- e.g. TIMESTAMP -> DECIMAL, BOOLEAN -> DECIMAL, TIME -> TIMESTAMP -- were accepted at alter time and then failed at read with ArrowNotImplementedError. Java additionally checks CastExecutors.resolve(...) != null. Add can_execute_cast as the executable-cast counterpart: leaf casts defer to PyArrow's cast-kernel availability (probed once and cached), constructed -> string is rendered by the read path, and other constructed conversions are rejected. The update-column-type schema change now requires both supports_cast and can_execute_cast to pass.
1 parent 18da50c commit ba2eb84

3 files changed

Lines changed: 108 additions & 2 deletions

File tree

paimon-python/pypaimon/casting/data_type_casts.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,11 @@
2626
applies the conversion leniently.
2727
"""
2828

29+
import pyarrow as pa
30+
2931
from pypaimon.schema.data_types import (ArrayType, AtomicType, DataTypeParser,
30-
MapType, MultisetType, RowType,
32+
MapType, MultisetType,
33+
PyarrowFieldParser, RowType,
3134
VectorType)
3235

3336
# ---- Type roots --------------------------------------------------------------
@@ -187,3 +190,63 @@ def _equals_ignore_nullable(source_type, target_type) -> bool:
187190
source_copy.nullable = True
188191
target_copy.nullable = True
189192
return source_copy == target_copy
193+
194+
195+
# Caches the PyArrow cast-kernel probe per (source, target) pyarrow type so the
196+
# alter-time check stays cheap. Keyed by the pyarrow type strings.
197+
_EXECUTABLE_CAST_CACHE = {}
198+
199+
200+
def can_execute_cast(source_type, target_type) -> bool:
    """Tell whether the Python read path can turn stored ``source_type``
    values into ``target_type`` values for files written before a column
    type change.

    ``supports_cast`` describes only the *logical* cast rules (mirroring Java
    ``DataTypeCasts``); this function is the counterpart of Java's
    ``CastExecutors.resolve(...) != null`` guard. Some logically valid casts
    (e.g. ``TIMESTAMP -> DECIMAL``, ``BOOLEAN -> DECIMAL``, ``TIME ->
    TIMESTAMP``) have no PyArrow cast kernel, so without this check the alter
    would succeed and a later read would fail with
    ``ArrowNotImplementedError``.

    Returns ``True`` when the conversion is executable, ``False`` otherwise
    (including when either type root resolves to ``None``).
    """
    src_root = _root(source_type)
    dst_root = _root(target_type)

    # A root that could not be resolved cannot be cast.
    if src_root is None or dst_root is None:
        return False

    # Identical roots: either an identity conversion, or a same-shape
    # constructed type that the read path rebuilds through field-id alignment
    # instead of a value cast.
    if src_root == dst_root:
        return True

    # Constructed -> character string is not a cast: the read path renders it
    # with its custom ``_constructed_to_string_array`` (see
    # DataFileBatchReader).
    rendered_as_string = (src_root in STRING_RENDERABLE_CONSTRUCTED
                          and dst_root in CHARACTER_STRING)
    if rendered_as_string:
        return True

    # Every remaining conversion that involves a constructed type has no
    # runtime cast.
    involves_constructed = src_root in CONSTRUCTED or dst_root in CONSTRUCTED
    if involves_constructed:
        return False

    # Leaf-to-leaf: PyArrow's cast-kernel availability decides, because the
    # read path executes the cast via ``array.cast(target, safe=False)``.
    return _pyarrow_cast_supported(source_type, target_type)
231+
232+
233+
def _pyarrow_cast_supported(source_type, target_type) -> bool:
    """Probe whether PyArrow can cast between the two Paimon types.

    Both types are converted to PyArrow types with
    ``PyarrowFieldParser.from_paimon_type``. Equal PyArrow types need no cast
    and are reported as supported without touching the cache. Otherwise the
    verdict is looked up in (and, on a miss, stored into)
    ``_EXECUTABLE_CAST_CACHE`` under the pair of PyArrow type strings.
    """
    arrow_source = PyarrowFieldParser.from_paimon_type(source_type)
    arrow_target = PyarrowFieldParser.from_paimon_type(target_type)
    if arrow_source == arrow_target:
        return True

    key = (str(arrow_source), str(arrow_target))
    verdict = _EXECUTABLE_CAST_CACHE.get(key)
    if verdict is None:
        # Cache miss. Casting an empty array exercises only kernel resolution
        # (not per-value conversion), so the outcome says whether a cast
        # kernel exists for the pair without depending on any data.
        # ``safe=False`` matches the read path.
        try:
            pa.array([], type=arrow_source).cast(arrow_target, safe=False)
        except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid,
                pa.lib.ArrowTypeError):
            verdict = False
        else:
            verdict = True
        _EXECUTABLE_CAST_CACHE[key] = verdict
    return verdict

paimon-python/pypaimon/schema/schema_manager.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from pypaimon.schema.column_directive_utils import (
2727
apply_add_column_directive, apply_directives,
2828
remove_dropped_directive_options)
29-
from pypaimon.casting.data_type_casts import supports_cast
29+
from pypaimon.casting.data_type_casts import can_execute_cast, supports_cast
3030
from pypaimon.schema.data_types import (ArrayType, AtomicInteger, DataField,
3131
MapType, RowType, reassign_field_id)
3232
from pypaimon.schema.schema import Schema
@@ -240,6 +240,15 @@ def update_func(field: DataField, depth: int) -> DataField:
240240
"Column type {}[{}] cannot be converted to {} without losing information."
241241
.format(field.name, source_root, target_root)
242242
)
243+
# Logical cast support is not enough: the read path materializes the
244+
# change via PyArrow when reading old files, so reject casts it cannot
245+
# execute (mirrors Java's CastExecutors.resolve(...) != null check).
246+
if not can_execute_cast(source_root, target_root):
247+
raise ValueError(
248+
"Column type {}[{}] cannot be converted to {}: the read path "
249+
"has no executable cast for this conversion."
250+
.format(field.name, source_root, target_root)
251+
)
243252
new_type = _get_array_map_type_with_target_type_root(
244253
field.type, target_root, depth, max_depth)
245254
return DataField(

paimon-python/pypaimon/tests/filesystem_catalog_test.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,40 @@ def _make_table(name, options):
257257
table = catalog.get_table(allowed_id)
258258
self.assertFalse(table.fields[1].type.nullable)
259259

260+
def test_update_column_type_rejects_non_executable_cast(self):
    """An alter to a type the read path cannot cast to must fail, while an
    executable widening on the same table must still be accepted."""
    catalog = CatalogFactory.create({"warehouse": self.warehouse})
    catalog.create_database("test_db_cast", False)

    table_id = "test_db_cast.ts_table"
    columns = [
        DataField.from_dict({"id": 0, "name": "k", "type": "INT"}),
        DataField.from_dict({"id": 1, "name": "ts", "type": "TIMESTAMP(3)"}),
    ]
    table_schema = Schema(
        fields=columns,
        partition_keys=[],
        primary_keys=[],
        options={},
        comment="",
    )
    catalog.create_table(table_id, table_schema, False)

    # TIMESTAMP -> DECIMAL is logically allowed but has no PyArrow cast
    # kernel, so the read path could not materialize it. It must be rejected
    # at alter time (mirrors Java's CastExecutors.resolve(...) != null check)
    # rather than failing later at read with ArrowNotImplementedError.
    with self.assertRaises(RuntimeError) as ctx:
        catalog.alter_table(
            table_id,
            [SchemaChange.update_column_type(
                "ts", AtomicType("DECIMAL(10, 0)"))],
            False)
    self.assertIn("no executable cast", str(ctx.exception))

    # An executable widening (INT -> BIGINT) still succeeds.
    widen_k = [SchemaChange.update_column_type("k", AtomicType("BIGINT"))]
    catalog.alter_table(table_id, widen_k, False)
    reloaded = catalog.get_table(table_id)
    self.assertEqual(reloaded.fields[0].type.type, "BIGINT")
293+
260294
def test_add_column_before_partition(self):
261295
catalog = CatalogFactory.create({
262296
"warehouse": self.warehouse

0 commit comments

Comments
 (0)