[python] Reject vector-to-string casts and validate nested wrapper tokens

TheR1sing3un · TheR1sing3un · commit 2ffdce30b6b6 · 2026-06-11T13:59:41.000+08:00
Two follow-ups on the nested schema-evolution path:

- update_column_type from VECTOR (or MULTISET) to STRING passed validation
  but old files failed on read: there is no string rendering for them.
  Narrow the cast rule so only ROW/ARRAY/MAP - the constructed types the
  read path can render - are accepted as string sources.

- The nested path walker consumed the ARRAY/MAP wrapper token by position
  without checking it, so an invalid path like ['arr', 'wrong', 'c'] was
  accepted and mutated the schema exactly like ['arr', 'element', 'c'].
  Require 'element' for arrays and 'value' for maps before descending.

Add tests for the rejected vector alter (the column still reads), the
narrowed cast rules, and wrong wrapper tokens on ARRAY<ROW> / MAP<.,ROW>.
diff --git a/paimon-python/pypaimon/casting/data_type_casts.py b/paimon-python/pypaimon/casting/data_type_casts.py
@@ -70,6 +70,11 @@
     DATE, TIME, TIMESTAMP, TIMESTAMP_LTZ,
 }
 CONSTRUCTED = {ARRAY, MAP, MULTISET, ROW, VECTOR}
+# Constructed types the read path can render as a character string
+# ('{v1, v2}' / '[e1, e2]' / '{k -> v}'). VECTOR and MULTISET have no string
+# rendering, so a type change from them to CHAR/VARCHAR is rejected here
+# rather than failing when an old file is read.
+STRING_RENDERABLE_CONSTRUCTED = {ARRAY, MAP, ROW}
 
 
 def _root(data_type) -> str:
@@ -124,8 +129,8 @@ def rule(target, implicit_from=None, explicit_from=None):
         implicit[target] |= set(implicit_from or set())
         explicit[target] |= set(explicit_from or set())
 
-    rule(CHAR, {CHAR}, PREDEFINED | CONSTRUCTED)
-    rule(VARCHAR, CHARACTER_STRING, PREDEFINED | CONSTRUCTED)
+    rule(CHAR, {CHAR}, PREDEFINED | STRING_RENDERABLE_CONSTRUCTED)
+    rule(VARCHAR, CHARACTER_STRING, PREDEFINED | STRING_RENDERABLE_CONSTRUCTED)
     rule(BOOLEAN, {BOOLEAN}, CHARACTER_STRING | INTEGER_NUMERIC)
     rule(BINARY, {BINARY}, CHARACTER_STRING | {VARBINARY})
     rule(VARBINARY, BINARY_STRING, CHARACTER_STRING | {BINARY})
diff --git a/paimon-python/pypaimon/schema/schema_manager.py b/paimon-python/pypaimon/schema/schema_manager.py
@@ -46,22 +46,36 @@ def _find_field_index(fields: List[DataField], field_name: str) -> Optional[int]
     return None
 
 
-def _extract_row_data_fields(data_type, out_fields: List[DataField]) -> int:
+def _extract_row_data_fields(data_type, out_fields: List[DataField],
+                             field_names: List[str], token_pos: int) -> int:
     """Collect the immediate sub-fields reachable from *data_type* into
     *out_fields* and return the path depth consumed. A ROW contributes its
     fields (depth 1); an ARRAY/MAP is transparent and descends into its
-    element/value (consuming the ``element``/``value`` path token); anything
-    else contributes nothing (depth 1)."""
+    element/value, consuming the ``element``/``value`` path token -- the
+    consumed token is validated so an unknown step cannot silently mutate
+    the schema; anything else contributes nothing (depth 1)."""
     if isinstance(data_type, RowType):
         out_fields.extend(data_type.fields)
         return 1
     if isinstance(data_type, ArrayType):
-        return _extract_row_data_fields(data_type.element, out_fields) + 1
+        _assert_wrapper_token(field_names, token_pos, 'element')
+        return _extract_row_data_fields(
+            data_type.element, out_fields, field_names, token_pos + 1) + 1
     if isinstance(data_type, MapType):
-        return _extract_row_data_fields(data_type.value, out_fields) + 1
+        _assert_wrapper_token(field_names, token_pos, 'value')
+        return _extract_row_data_fields(
+            data_type.value, out_fields, field_names, token_pos + 1) + 1
     return 1
 
 
+def _assert_wrapper_token(field_names: List[str], token_pos: int, expected: str):
+    # A path that ends inside the wrappers (token_pos out of range) is the
+    # update-the-wrapped-type-itself case, handled by the caller's overflow
+    # branch; only a present-but-wrong token is rejected.
+    if token_pos < len(field_names) and field_names[token_pos] != expected:
+        raise ColumnNotExistException('.'.join(field_names))
+
+
 def _wrap_new_row_type(data_type, nested_fields: List[DataField]):
     """Rebuild *data_type* substituting *nested_fields* at its innermost ROW,
     preserving any ARRAY/MAP wrappers."""
@@ -123,7 +137,8 @@ def _update_intermediate_column(new_fields, previous_fields, depth, prev_depth,
         if field.name != field_names[depth]:
             continue
         nested_fields: List[DataField] = []
-        new_depth = depth + _extract_row_data_fields(field.type, nested_fields)
+        new_depth = depth + _extract_row_data_fields(
+            field.type, nested_fields, field_names, depth + 1)
         _update_intermediate_column(
             nested_fields, new_fields, new_depth, depth, field_names, update_last_fn)
         field = new_fields[i]
diff --git a/paimon-python/pypaimon/tests/schema_evolution_nested_read_test.py b/paimon-python/pypaimon/tests/schema_evolution_nested_read_test.py
@@ -37,9 +37,10 @@
 
 from pypaimon import CatalogFactory, Schema
 from pypaimon.casting.data_type_casts import supports_cast
-from pypaimon.schema.data_types import (AtomicInteger, AtomicType, DataField,
+from pypaimon.schema.data_types import (ArrayType, AtomicInteger, AtomicType,
+                                        DataField, MapType, MultisetType,
                                         PyarrowFieldParser, RowType,
-                                        collect_field_ids,
+                                        VectorType, collect_field_ids,
                                         current_highest_field_id,
                                         reassign_field_id)
 from pypaimon.schema.schema_change import SchemaChange
@@ -457,6 +458,40 @@ def test_map_of_row_add_subfield(self):
         rows = self._read_sorted(table)
         self.assertEqual(rows[0]['m'], [('k', {'a': 1, 'b': 'x', 'c': None})])
 
+    def test_array_wrapper_token_validated(self):
+        # The token consumed when descending through an ARRAY must be
+        # 'element'; an unknown step must not silently mutate the schema.
+        elem = pa.struct([('a', pa.int64())])
+        s0 = pa.schema([('id', pa.int64()), ('arr', pa.list_(elem))])
+        self._create('ntok_arr', s0)
+        with self.assertRaises(RuntimeError) as cm:
+            self.catalog.alter_table(
+                'default.ntok_arr',
+                [SchemaChange.add_column(['arr', 'wrong', 'c'], AtomicType('INT'))],
+                False)
+        self.assertIn('arr.wrong.c', str(cm.exception))
+        # The canonical token still works.
+        self.catalog.alter_table(
+            'default.ntok_arr',
+            [SchemaChange.add_column(['arr', 'element', 'c'], AtomicType('INT'))],
+            False)
+
+    def test_map_wrapper_token_validated(self):
+        # The token consumed when descending through a MAP must be 'value'.
+        val = pa.struct([('a', pa.int64())])
+        s0 = pa.schema([('id', pa.int64()), ('m', pa.map_(pa.string(), val))])
+        self._create('ntok_map', s0)
+        with self.assertRaises(RuntimeError) as cm:
+            self.catalog.alter_table(
+                'default.ntok_map',
+                [SchemaChange.add_column(['m', 'wrong', 'c'], AtomicType('INT'))],
+                False)
+        self.assertIn('m.wrong.c', str(cm.exception))
+        self.catalog.alter_table(
+            'default.ntok_map',
+            [SchemaChange.add_column(['m', 'value', 'c'], AtomicType('INT'))],
+            False)
+
 
 class SchemaEvolutionConstructedToStringTest(_NestedBase):
     """update column type from ROW/ARRAY/MAP to STRING: old files must be
@@ -524,6 +559,24 @@ def test_row_to_string_null_semantics(self):
         self.assertIsNone(rows[0]['mv'])
         self.assertEqual(rows[1]['mv'], '{null, x}')
 
+    def test_vector_to_string_rejected(self):
+        # There is no read-time string rendering for vectors, so the type
+        # change must be rejected at alter time instead of failing on read.
+        s0 = pa.schema([('id', pa.int64()),
+                        ('embed', pa.list_(pa.float32(), 3))])
+        table = self._create('c2s_vec', s0)
+        self._write(table, pa.Table.from_pylist(
+            [{'id': 1, 'embed': [1.0, 2.0, 3.0]}], schema=s0))
+        with self.assertRaises(RuntimeError) as cm:
+            self.catalog.alter_table(
+                'default.c2s_vec',
+                [SchemaChange.update_column_type('embed', AtomicType('STRING'))],
+                False)
+        self.assertIn('cannot be converted', str(cm.exception))
+        # The vector column itself still reads fine.
+        rows = self._read_sorted(table)
+        self.assertEqual(rows[0]['embed'], [1.0, 2.0, 3.0])
+
     def test_nested_subfield_row_to_string(self):
         inner = pa.struct([('a', pa.int32())])
         s0 = pa.schema([('id', pa.int64()),
@@ -591,6 +644,19 @@ def test_unsupported_casts(self):
             self.assertFalse(supports_cast(AtomicType(src), AtomicType(dst)),
                              '{} -> {}'.format(src, dst))
 
+    def test_constructed_to_string(self):
+        # ROW/ARRAY/MAP have a read-time string rendering; vector and
+        # multiset do not, so their type change must be rejected.
+        row = RowType(True, [DataField(0, 'a', AtomicType('INT'))])
+        arr = ArrayType(True, AtomicType('INT'))
+        m = MapType(True, AtomicType('STRING'), AtomicType('INT'))
+        for src in (row, arr, m):
+            self.assertTrue(supports_cast(src, AtomicType('STRING')), str(src))
+        vec = VectorType(True, AtomicType('FLOAT'), 3)
+        ms = MultisetType(True, AtomicType('INT'))
+        for src in (vec, ms):
+            self.assertFalse(supports_cast(src, AtomicType('STRING')), str(src))
+
 
 if __name__ == '__main__':
     unittest.main()