apache
diff --git a/‎paimon-python/pypaimon/casting/__init__.py‎
Lines changed: 16 additions & 0 deletions b/‎paimon-python/pypaimon/casting/__init__.py‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎paimon-python/pypaimon/casting/data_type_casts.py‎
Lines changed: 167 additions & 0 deletions b/‎paimon-python/pypaimon/casting/data_type_casts.py‎
Lines changed: 167 additions & 0 deletions
diff --git a/‎paimon-python/pypaimon/read/reader/data_file_batch_reader.py‎
Lines changed: 79 additions & 33 deletions b/‎paimon-python/pypaimon/read/reader/data_file_batch_reader.py‎
Lines changed: 79 additions & 33 deletions
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
@@ -0,0 +1,167 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Type-cast support rules used to validate ``update column type`` schema
+changes.
+
+The rules mirror the engine-wide cast specification so a type change accepted
+here is one the read path can also materialize: an *implicit* cast is a safe
+widening (e.g. INT -> BIGINT, any numeric -> DECIMAL/DOUBLE), while an
+*explicit* cast covers the broader, possibly lossy conversions a user opts into
+(e.g. DOUBLE -> INT truncation, anything -> STRING). Read-time execution then
+applies the conversion leniently.
+"""
+
+from typing import Optional
+
+from pypaimon.schema.data_types import (ArrayType, AtomicType, MapType,
+                                        MultisetType, RowType, VectorType)
+
+# ---- Type roots --------------------------------------------------------------
+# One string constant per type root; each value is spelled like its own name.
+
+CHAR = "CHAR"
+VARCHAR = "VARCHAR"
+BOOLEAN = "BOOLEAN"
+BINARY = "BINARY"
+VARBINARY = "VARBINARY"
+DECIMAL = "DECIMAL"
+TINYINT = "TINYINT"
+SMALLINT = "SMALLINT"
+INTEGER = "INTEGER"
+BIGINT = "BIGINT"
+FLOAT = "FLOAT"
+DOUBLE = "DOUBLE"
+DATE = "DATE"
+TIME = "TIME"
+TIMESTAMP = "TIMESTAMP"
+TIMESTAMP_LTZ = "TIMESTAMP_LTZ"
+ARRAY = "ARRAY"
+MAP = "MAP"
+MULTISET = "MULTISET"
+ROW = "ROW"
+VECTOR = "VECTOR"
+VARIANT = "VARIANT"
+BLOB = "BLOB"
+
+# ---- Families ----------------------------------------------------------------
+# Sets of roots that the cast rules refer to as a group. Larger families are
+# assembled from the smaller ones so a root only has to be listed once.
+
+CHARACTER_STRING = {CHAR, VARCHAR}
+BINARY_STRING = {BINARY, VARBINARY}
+INTEGER_NUMERIC = {TINYINT, SMALLINT, INTEGER, BIGINT}
+NUMERIC = {FLOAT, DOUBLE, DECIMAL}.union(INTEGER_NUMERIC)
+TIMESTAMP_FAMILY = {TIMESTAMP, TIMESTAMP_LTZ}
+TIME_FAMILY = {TIME}
+DATETIME = {DATE} | TIME_FAMILY | TIMESTAMP_FAMILY
+# All atomic roots except VARIANT and BLOB.
+PREDEFINED = CHARACTER_STRING | BINARY_STRING | NUMERIC | DATETIME | {BOOLEAN}
+CONSTRUCTED = {ARRAY, MAP, MULTISET, ROW, VECTOR}
+
+
+def _root(data_type) -> Optional[str]:
+    """Return the type-root constant for *data_type*, or ``None`` if unknown.
+
+    Constructed types (ROW, ARRAY, MAP, MULTISET, VECTOR) are recognised by
+    their class. Atomic types are recognised by their upper-cased type string;
+    prefix matching is used wherever the string may carry a length or
+    precision suffix such as ``VARCHAR(10)`` or ``DECIMAL(10, 2)``.
+    """
+    if isinstance(data_type, RowType):
+        return ROW
+    if isinstance(data_type, ArrayType):
+        return ARRAY
+    if isinstance(data_type, MapType):
+        return MAP
+    if isinstance(data_type, MultisetType):
+        return MULTISET
+    if isinstance(data_type, VectorType):
+        return VECTOR
+    if not isinstance(data_type, AtomicType):
+        return None
+
+    t = data_type.type.upper()
+    # "DEC" is also a prefix of "DECIMAL", so it covers both spellings.
+    if t.startswith(("DEC", "NUMERIC")):
+        return DECIMAL
+    if t in ("INT", "INTEGER"):
+        return INTEGER
+    if t in (TINYINT, SMALLINT, BIGINT, FLOAT, DOUBLE, BOOLEAN, DATE):
+        return t
+    if t == "STRING" or t.startswith("VARCHAR"):
+        return VARCHAR
+    if t.startswith("CHAR"):
+        return CHAR
+    if t == "BYTES" or t.startswith("VARBINARY"):
+        return VARBINARY
+    if t.startswith("BINARY"):
+        return BINARY
+    if t == "BLOB":
+        return BLOB
+    if t.startswith("TIMESTAMP"):
+        # The zoned type has two spellings: the TIMESTAMP_LTZ shorthand and the
+        # SQL form "TIMESTAMP(p) WITH LOCAL TIME ZONE". Both start with
+        # "TIMESTAMP", so the zoned check has to run before the plain fallback.
+        if t.startswith("TIMESTAMP_LTZ") or "WITH LOCAL TIME ZONE" in t:
+            return TIMESTAMP_LTZ
+        return TIMESTAMP
+    # Must stay after the TIMESTAMP branch: "TIME" is a prefix of "TIMESTAMP".
+    if t.startswith("TIME"):
+        return TIME
+    if t == "VARIANT":
+        return VARIANT
+    return None
+
+
+def _build_rules():
+    """Build the implicit and explicit cast tables.
+
+    Returns ``(implicit, explicit)``: two dicts keyed by *target* root whose
+    values are the sets of *source* roots accepted for that target. Every root
+    implicitly accepts itself; a root with no row in the table below accepts
+    nothing else.
+    """
+    every_root = PREDEFINED | CONSTRUCTED | {VARIANT, BLOB}
+    implicit_sources = {root: {root} for root in every_root}
+    explicit_sources = {root: set() for root in every_root}
+
+    # Source groups shared by several rows.
+    any_standard = PREDEFINED | CONSTRUCTED
+    text_bool_ts = CHARACTER_STRING | {BOOLEAN} | TIMESTAMP_FAMILY
+    lossy_numeric = NUMERIC | text_bool_ts
+    to_timestamp = DATETIME | CHARACTER_STRING | NUMERIC
+
+    # (target, implicit sources, explicit sources)
+    table = (
+        (CHAR, {CHAR}, any_standard),
+        (VARCHAR, CHARACTER_STRING, any_standard),
+        (BOOLEAN, {BOOLEAN}, CHARACTER_STRING | INTEGER_NUMERIC),
+        (BINARY, {BINARY}, CHARACTER_STRING | {VARBINARY}),
+        (VARBINARY, BINARY_STRING, CHARACTER_STRING | {BINARY}),
+        (DECIMAL, NUMERIC, text_bool_ts),
+        (TINYINT, {TINYINT}, lossy_numeric),
+        (SMALLINT, {TINYINT, SMALLINT}, lossy_numeric),
+        (INTEGER, {TINYINT, SMALLINT, INTEGER}, lossy_numeric),
+        (BIGINT, INTEGER_NUMERIC, lossy_numeric),
+        (FLOAT, INTEGER_NUMERIC | {FLOAT, DECIMAL}, lossy_numeric),
+        (DOUBLE, NUMERIC, text_bool_ts),
+        (DATE, {DATE, TIMESTAMP}, TIMESTAMP_FAMILY | CHARACTER_STRING),
+        (TIME, {TIME, TIMESTAMP}, TIME_FAMILY | TIMESTAMP_FAMILY | CHARACTER_STRING),
+        (TIMESTAMP, TIMESTAMP_FAMILY, to_timestamp),
+        (TIMESTAMP_LTZ, TIMESTAMP_FAMILY, to_timestamp),
+    )
+    for target, widening, opt_in in table:
+        implicit_sources[target].update(widening)
+        explicit_sources[target].update(opt_in)
+    return implicit_sources, explicit_sources
+
+
+# Built once at import time; supports_cast only reads these tables.
+_IMPLICIT_RULES, _EXPLICIT_RULES = _build_rules()
+
+
+def supports_cast(source_type, target_type, allow_explicit: bool = True) -> bool:
+    """Tell whether a column of ``source_type`` may be changed to ``target_type``.
+
+    With ``allow_explicit`` false only the implicit (widening) rules apply and
+    a nullable source is rejected for a NOT NULL target; with it true the
+    explicit (possibly lossy) rules are accepted as well. Returns ``False``
+    when the root of either type cannot be determined.
+    """
+    src, dst = _root(source_type), _root(target_type)
+    if src is None or dst is None:
+        return False
+    # Dropping nullability is only tolerated as part of an explicit cast.
+    drops_nullability = source_type.nullable and not target_type.nullable
+    if drops_nullability and not allow_explicit:
+        return False
+    if src == dst or src in _IMPLICIT_RULES.get(dst, set()):
+        return True
+    return bool(allow_explicit) and src in _EXPLICIT_RULES.get(dst, set())
@@ -18,13 +18,15 @@
 from typing import List, Optional
 
 import pyarrow as pa
+import pyarrow.compute as pc
 from pyarrow import RecordBatch
 
 from pypaimon.common.file_io import FileIO
 from pypaimon.read.partition_info import PartitionInfo
 from pypaimon.read.reader.format_blob_reader import FormatBlobReader
 from pypaimon.read.reader.iface.record_batch_reader import RecordBatchReader
-from pypaimon.schema.data_types import DataField, PyarrowFieldParser
+from pypaimon.schema.data_types import (ArrayType, DataField, MapType,
+                                        PyarrowFieldParser, RowType)
 from pypaimon.table.special_fields import SpecialFields
 
 
@@ -57,55 +59,99 @@ def __init__(self, format_reader: RecordBatchReader, index_mapping: List[int], p
         self.file_io = file_io
         # Per-file field-id normalization: map the physically-read columns
         # (the file's own field order/names) onto the latest read target by
-        # field id, padding missing ids with NULL. ``None`` when there is no
-        # evolution to reconcile (identity) -- the common path stays zero-copy.
-        self._normalize_positions, self._normalize_names = \
-            self._build_normalize_plan(file_data_fields, target_data_fields)
+        # field id, padding missing ids with NULL and recursing into nested
+        # ROW / ARRAY<ROW> / MAP<.,ROW> sub-fields the same way. ``None`` when
+        # there is no evolution to reconcile -- the common path stays zero-copy.
+        self._normalize_plan = self._build_normalize_plan(file_data_fields, target_data_fields)
 
     @staticmethod
     def _build_normalize_plan(file_data_fields, target_data_fields):
         """Build a per-file field-id alignment plan.
 
-        Returns ``(positions, names)`` where ``positions[i]`` is the column
-        index in the physically-read batch carrying ``target_data_fields[i]``
-        (matched by field id), or -1 if the file does not contain that id (pad
-        NULL). ``names[i]`` is the latest target name. Returns ``(None, None)``
-        when the plan is the identity (no evolution), so the caller skips
-        normalization and stays zero-copy.
+        Returns a list of ``(pos, file_field, target_field)`` -- one per target
+        field, in target order -- where ``pos`` is the column index in the
+        physically-read batch carrying ``target_field`` (matched by field id),
+        or -1 if the file does not contain that id (pad NULL); ``file_field``
+        is the file's own field at ``pos``, or ``None`` when ``pos`` is -1.
+        Returns ``None`` when either field list is ``None`` or when the file
+        already matches the target exactly (no evolution), so the caller stays
+        zero-copy.
         """
         if file_data_fields is None or target_data_fields is None:
-            return None, None
+            return None
+        # Recursive equality covers nested sub-field changes too: any rename /
+        # add / drop / type change at any depth makes the file != target.
+        if file_data_fields == target_data_fields:
+            return None
         file_id_to_pos = {f.id: i for i, f in enumerate(file_data_fields)}
-        positions = []
-        names = []
-        # Identity only when every target maps to the same physical position
-        # AND already carries the same name -- a rename keeps the position but
-        # changes the name, which still requires a relabel pass.
-        identity = len(file_data_fields) == len(target_data_fields)
-        for i, target in enumerate(target_data_fields):
+        plan = []
+        for target in target_data_fields:
             pos = file_id_to_pos.get(target.id, -1)
-            positions.append(pos)
-            names.append(target.name)
-            if pos != i or (pos >= 0 and file_data_fields[pos].name != target.name):
-                identity = False
-        if identity:
-            return None, None
-        return positions, names
+            # Keep the file-side field next to the target so the batch
+            # normalizer can compare their types without another lookup.
+            file_field = file_data_fields[pos] if pos >= 0 else None
+            plan.append((pos, file_field, target))
+        return plan
 
     def _normalize_batch(self, record_batch: RecordBatch) -> RecordBatch:
         """Reorder/pad the physically-read batch onto the latest read target by
-        field id, and relabel columns to the latest names. Missing ids become
-        all-NULL columns; types are reconciled later by _align_batch_to_read_schema."""
-        if self._normalize_positions is None:
+        field id, relabel columns to the latest names, and align nested ROW
+        sub-fields by id. Missing ids become typed all-NULL columns.
+
+        Returns ``record_batch`` itself, untouched, when no normalize plan was
+        built for this file; otherwise a new batch with one column per plan
+        entry, in plan order, named after the target fields."""
+        if self._normalize_plan is None:
             return record_batch
         num_rows = record_batch.num_rows
         arrays = []
-        for pos in self._normalize_positions:
+        names = []
+        for pos, file_field, target_field in self._normalize_plan:
             if pos < 0:
-                arrays.append(pa.nulls(num_rows))
+                # Only a padded column needs the target Arrow type up front, so
+                # resolve it here instead of for every field on every batch.
+                null_type = PyarrowFieldParser.from_paimon_type(target_field.type)
+                arrays.append(pa.nulls(num_rows, type=null_type))
             else:
-                arrays.append(record_batch.column(pos))
-        return pa.RecordBatch.from_arrays(arrays, names=self._normalize_names)
+                arrays.append(self._align_array_by_id(
+                    record_batch.column(pos), file_field.type, target_field.type))
+            names.append(target_field.name)
+        return pa.RecordBatch.from_arrays(arrays, names=names)
+
+    def _align_array_by_id(self, array, file_type, target_type):
+        """Return *array* converted to *target_type*, matching ROW sub-fields by
+        field id (reorder, pad missing with NULL, follow renames, cast changed
+        types) recursively, transparently through ARRAY/MAP wrappers.
+
+        Args:
+            array: Arrow array holding the column (or nested child) as read
+                from the file.
+            file_type: Paimon type this array was written with.
+            target_type: Paimon type the result should have.
+
+        Returns:
+            An Arrow array laid out for ``target_type``. A leaf whose Arrow
+            type already equals the target's is returned unchanged.
+
+        When the two types are not the same kind of container (both ROW, both
+        ARRAY or both MAP) the array is handled as a leaf and cast directly.
+        """
+        if isinstance(target_type, RowType) and isinstance(file_type, RowType):
+            n = len(array)
+            file_id_to_pos = {f.id: i for i, f in enumerate(file_type.fields)}
+            children = []
+            pa_fields = []
+            for tsub in target_type.fields:
+                p = file_id_to_pos.get(tsub.id, -1)
+                if p < 0:
+                    child = pa.nulls(n, type=PyarrowFieldParser.from_paimon_type(tsub.type))
+                else:
+                    child = self._align_array_by_id(
+                        array.field(p), file_type.fields[p].type, tsub.type)
+                children.append(child)
+                pa_fields.append(pa.field(tsub.name, child.type, nullable=tsub.type.nullable))
+            # Preserve the struct's own null mask; child values under a null
+            # struct are irrelevant.
+            return pa.StructArray.from_arrays(
+                children, fields=pa_fields, mask=pc.is_null(array))
+        if isinstance(target_type, ArrayType) and isinstance(file_type, ArrayType):
+            aligned_values = self._align_array_by_id(
+                array.values, file_type.element, target_type.element)
+            return pa.ListArray.from_arrays(
+                array.offsets, aligned_values, mask=pc.is_null(array))
+        if isinstance(target_type, MapType) and isinstance(file_type, MapType):
+            # Only the value side is aligned; keys are carried over as read.
+            aligned_items = self._align_array_by_id(
+                array.items, file_type.value, target_type.value)
+            # MapArray.from_arrays cannot carry a null mask (a null map would
+            # collapse to an empty one), so rebuild from buffers, reusing the
+            # original validity/offset buffers and only swapping the value child.
+            target_pa = PyarrowFieldParser.from_paimon_type(target_type)
+            entries = pa.StructArray.from_arrays(
+                [array.keys, aligned_items],
+                fields=[target_pa.key_field, target_pa.item_field])
+            # The raw buffers are not rebased for a sliced array, so carry the
+            # array's own offset over; without it validity and offsets would be
+            # read from the start of the parent buffers.
+            return pa.Array.from_buffers(
+                target_pa, len(array), array.buffers()[:2],
+                offset=array.offset, children=[entries])
+        # Leaf / non-nested: cast to the target type when it differs.
+        target_pa_type = PyarrowFieldParser.from_paimon_type(target_type)
+        if array.type != target_pa_type:
+            # safe=False: lossy conversions are allowed rather than raising.
+            return array.cast(target_pa_type, safe=False)
+        return array
 
     def read_arrow_batch(self, start_idx=None, end_idx=None) -> Optional[RecordBatch]:
         if isinstance(self.format_reader, FormatBlobReader):