Skip to content

Commit 2ffdce3

Browse files
committed
[python] Reject vector-to-string casts and validate nested wrapper tokens
Two follow-ups on the nested schema-evolution path: - update_column_type from VECTOR (or MULTISET) to STRING passed validation but old files failed on read: there is no string rendering for them. Narrow the cast rule so only ROW/ARRAY/MAP - the constructed types the read path can render - are accepted as string sources. - The nested path walker consumed the ARRAY/MAP wrapper token by position without checking it, so an invalid path like ['arr', 'wrong', 'c'] was accepted and mutated the schema exactly like ['arr', 'element', 'c']. Require 'element' for arrays and 'value' for maps before descending. Add tests for the rejected vector alter (the column still reads), the narrowed cast rules, and wrong wrapper tokens on ARRAY<ROW> / MAP<.,ROW>.
1 parent 824faae commit 2ffdce3

3 files changed

Lines changed: 96 additions & 10 deletions

File tree

paimon-python/pypaimon/casting/data_type_casts.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,11 @@
7070
DATE, TIME, TIMESTAMP, TIMESTAMP_LTZ,
7171
}
7272
CONSTRUCTED = {ARRAY, MAP, MULTISET, ROW, VECTOR}
73+
# Constructed types the read path can render as a character string
74+
# ('{v1, v2}' / '[e1, e2]' / '{k -> v}'). VECTOR and MULTISET have no string
75+
# rendering, so a type change from them to CHAR/VARCHAR is rejected here
76+
# rather than failing when an old file is read.
77+
STRING_RENDERABLE_CONSTRUCTED = {ARRAY, MAP, ROW}
7378

7479

7580
def _root(data_type) -> str:
@@ -124,8 +129,8 @@ def rule(target, implicit_from=None, explicit_from=None):
124129
implicit[target] |= set(implicit_from or set())
125130
explicit[target] |= set(explicit_from or set())
126131

127-
rule(CHAR, {CHAR}, PREDEFINED | CONSTRUCTED)
128-
rule(VARCHAR, CHARACTER_STRING, PREDEFINED | CONSTRUCTED)
132+
rule(CHAR, {CHAR}, PREDEFINED | STRING_RENDERABLE_CONSTRUCTED)
133+
rule(VARCHAR, CHARACTER_STRING, PREDEFINED | STRING_RENDERABLE_CONSTRUCTED)
129134
rule(BOOLEAN, {BOOLEAN}, CHARACTER_STRING | INTEGER_NUMERIC)
130135
rule(BINARY, {BINARY}, CHARACTER_STRING | {VARBINARY})
131136
rule(VARBINARY, BINARY_STRING, CHARACTER_STRING | {BINARY})

paimon-python/pypaimon/schema/schema_manager.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,22 +46,36 @@ def _find_field_index(fields: List[DataField], field_name: str) -> Optional[int]
4646
return None
4747

4848

49-
def _extract_row_data_fields(data_type, out_fields: List[DataField]) -> int:
49+
def _extract_row_data_fields(data_type, out_fields: List[DataField],
50+
field_names: List[str], token_pos: int) -> int:
5051
"""Collect the immediate sub-fields reachable from *data_type* into
5152
*out_fields* and return the path depth consumed. A ROW contributes its
5253
fields (depth 1); an ARRAY/MAP is transparent and descends into its
53-
element/value (consuming the ``element``/``value`` path token); anything
54-
else contributes nothing (depth 1)."""
54+
element/value, consuming the ``element``/``value`` path token -- the
55+
consumed token is validated so an unknown step cannot silently mutate
56+
the schema; anything else contributes nothing (depth 1)."""
5557
if isinstance(data_type, RowType):
5658
out_fields.extend(data_type.fields)
5759
return 1
5860
if isinstance(data_type, ArrayType):
59-
return _extract_row_data_fields(data_type.element, out_fields) + 1
61+
_assert_wrapper_token(field_names, token_pos, 'element')
62+
return _extract_row_data_fields(
63+
data_type.element, out_fields, field_names, token_pos + 1) + 1
6064
if isinstance(data_type, MapType):
61-
return _extract_row_data_fields(data_type.value, out_fields) + 1
65+
_assert_wrapper_token(field_names, token_pos, 'value')
66+
return _extract_row_data_fields(
67+
data_type.value, out_fields, field_names, token_pos + 1) + 1
6268
return 1
6369

6470

71+
def _assert_wrapper_token(field_names: List[str], token_pos: int, expected: str):
72+
# A path that ends inside the wrappers (token_pos out of range) is the
73+
# update-the-wrapped-type-itself case, handled by the caller's overflow
74+
# branch; only a present-but-wrong token is rejected.
75+
if token_pos < len(field_names) and field_names[token_pos] != expected:
76+
raise ColumnNotExistException('.'.join(field_names))
77+
78+
6579
def _wrap_new_row_type(data_type, nested_fields: List[DataField]):
6680
"""Rebuild *data_type* substituting *nested_fields* at its innermost ROW,
6781
preserving any ARRAY/MAP wrappers."""
@@ -123,7 +137,8 @@ def _update_intermediate_column(new_fields, previous_fields, depth, prev_depth,
123137
if field.name != field_names[depth]:
124138
continue
125139
nested_fields: List[DataField] = []
126-
new_depth = depth + _extract_row_data_fields(field.type, nested_fields)
140+
new_depth = depth + _extract_row_data_fields(
141+
field.type, nested_fields, field_names, depth + 1)
127142
_update_intermediate_column(
128143
nested_fields, new_fields, new_depth, depth, field_names, update_last_fn)
129144
field = new_fields[i]

paimon-python/pypaimon/tests/schema_evolution_nested_read_test.py

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,10 @@
3737

3838
from pypaimon import CatalogFactory, Schema
3939
from pypaimon.casting.data_type_casts import supports_cast
40-
from pypaimon.schema.data_types import (AtomicInteger, AtomicType, DataField,
40+
from pypaimon.schema.data_types import (ArrayType, AtomicInteger, AtomicType,
41+
DataField, MapType, MultisetType,
4142
PyarrowFieldParser, RowType,
42-
collect_field_ids,
43+
VectorType, collect_field_ids,
4344
current_highest_field_id,
4445
reassign_field_id)
4546
from pypaimon.schema.schema_change import SchemaChange
@@ -457,6 +458,40 @@ def test_map_of_row_add_subfield(self):
457458
rows = self._read_sorted(table)
458459
self.assertEqual(rows[0]['m'], [('k', {'a': 1, 'b': 'x', 'c': None})])
459460

461+
def test_array_wrapper_token_validated(self):
462+
# The token consumed when descending through an ARRAY must be
463+
# 'element'; an unknown step must not silently mutate the schema.
464+
elem = pa.struct([('a', pa.int64())])
465+
s0 = pa.schema([('id', pa.int64()), ('arr', pa.list_(elem))])
466+
self._create('ntok_arr', s0)
467+
with self.assertRaises(RuntimeError) as cm:
468+
self.catalog.alter_table(
469+
'default.ntok_arr',
470+
[SchemaChange.add_column(['arr', 'wrong', 'c'], AtomicType('INT'))],
471+
False)
472+
self.assertIn('arr.wrong.c', str(cm.exception))
473+
# The canonical token still works.
474+
self.catalog.alter_table(
475+
'default.ntok_arr',
476+
[SchemaChange.add_column(['arr', 'element', 'c'], AtomicType('INT'))],
477+
False)
478+
479+
def test_map_wrapper_token_validated(self):
480+
# The token consumed when descending through a MAP must be 'value'.
481+
val = pa.struct([('a', pa.int64())])
482+
s0 = pa.schema([('id', pa.int64()), ('m', pa.map_(pa.string(), val))])
483+
self._create('ntok_map', s0)
484+
with self.assertRaises(RuntimeError) as cm:
485+
self.catalog.alter_table(
486+
'default.ntok_map',
487+
[SchemaChange.add_column(['m', 'wrong', 'c'], AtomicType('INT'))],
488+
False)
489+
self.assertIn('m.wrong.c', str(cm.exception))
490+
self.catalog.alter_table(
491+
'default.ntok_map',
492+
[SchemaChange.add_column(['m', 'value', 'c'], AtomicType('INT'))],
493+
False)
494+
460495

461496
class SchemaEvolutionConstructedToStringTest(_NestedBase):
462497
"""update column type from ROW/ARRAY/MAP to STRING: old files must be
@@ -524,6 +559,24 @@ def test_row_to_string_null_semantics(self):
524559
self.assertIsNone(rows[0]['mv'])
525560
self.assertEqual(rows[1]['mv'], '{null, x}')
526561

562+
def test_vector_to_string_rejected(self):
563+
# There is no read-time string rendering for vectors, so the type
564+
# change must be rejected at alter time instead of failing on read.
565+
s0 = pa.schema([('id', pa.int64()),
566+
('embed', pa.list_(pa.float32(), 3))])
567+
table = self._create('c2s_vec', s0)
568+
self._write(table, pa.Table.from_pylist(
569+
[{'id': 1, 'embed': [1.0, 2.0, 3.0]}], schema=s0))
570+
with self.assertRaises(RuntimeError) as cm:
571+
self.catalog.alter_table(
572+
'default.c2s_vec',
573+
[SchemaChange.update_column_type('embed', AtomicType('STRING'))],
574+
False)
575+
self.assertIn('cannot be converted', str(cm.exception))
576+
# The vector column itself still reads fine.
577+
rows = self._read_sorted(table)
578+
self.assertEqual(rows[0]['embed'], [1.0, 2.0, 3.0])
579+
527580
def test_nested_subfield_row_to_string(self):
528581
inner = pa.struct([('a', pa.int32())])
529582
s0 = pa.schema([('id', pa.int64()),
@@ -591,6 +644,19 @@ def test_unsupported_casts(self):
591644
self.assertFalse(supports_cast(AtomicType(src), AtomicType(dst)),
592645
'{} -> {}'.format(src, dst))
593646

647+
def test_constructed_to_string(self):
648+
# ROW/ARRAY/MAP have a read-time string rendering; vector and
649+
# multiset do not, so their type change must be rejected.
650+
row = RowType(True, [DataField(0, 'a', AtomicType('INT'))])
651+
arr = ArrayType(True, AtomicType('INT'))
652+
m = MapType(True, AtomicType('STRING'), AtomicType('INT'))
653+
for src in (row, arr, m):
654+
self.assertTrue(supports_cast(src, AtomicType('STRING')), str(src))
655+
vec = VectorType(True, AtomicType('FLOAT'), 3)
656+
ms = MultisetType(True, AtomicType('INT'))
657+
for src in (vec, ms):
658+
self.assertFalse(supports_cast(src, AtomicType('STRING')), str(src))
659+
594660

595661
if __name__ == '__main__':
596662
unittest.main()

0 commit comments

Comments
 (0)