Skip to content

Commit b8cc8cd

Browse files
[python] Fix stats evolution index mapping to use field ID instead of field name (#7593)
SimpleStatsEvolutions._create_index_cast_mapping was using field.name to build the index mapping between the table schema and the data schema. This breaks after a column rename: names change while field IDs remain stable, so a renamed column no longer matched its data field and was mapped to -1, i.e. treated as missing from the data schema. The mapping is now keyed by field.id, aligning with Java's SchemaEvolutionUtil.createIndexMapping.
1 parent 0223348 commit b8cc8cd

2 files changed

Lines changed: 162 additions & 4 deletions

File tree

paimon-python/pypaimon/manifest/simple_stats_evolutions.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,15 @@ def _create_index_cast_mapping(table_fields: List[DataField],
5555
Create index and cast mapping between table fields and data fields.
5656
This is a simplified implementation.
5757
"""
58-
# Create a mapping from field names to indices in data_fields
59-
data_field_map = {field.name: i for i, field in enumerate(data_fields)}
58+
# Create a mapping from field IDs to indices in data_fields.
59+
# Field IDs are stable across schema evolution (including column renames),
60+
# while field names may change.
61+
data_field_map = {field.id: i for i, field in enumerate(data_fields)}
6062

6163
index_mapping = []
6264
for table_field in table_fields:
63-
if table_field.name in data_field_map:
64-
index_mapping.append(data_field_map[table_field.name])
65+
if table_field.id in data_field_map:
66+
index_mapping.append(data_field_map[table_field.id])
6567
else:
6668
index_mapping.append(-1) # Field not found in data schema
6769

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
################################################################################
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
################################################################################
18+
19+
import unittest
20+
21+
from pypaimon.manifest.simple_stats_evolutions import SimpleStatsEvolutions
22+
from pypaimon.schema.data_types import DataField, AtomicType
23+
from pypaimon.manifest.schema.simple_stats import SimpleStats
24+
from pypaimon.table.row.generic_row import GenericRow
25+
26+
27+
class SimpleStatsEvolutionsTest(unittest.TestCase):
    """Tests for the field-ID based index mapping of SimpleStatsEvolutions.

    Throughout these tests schema id 0 holds the fields of the data schema.
    When the table schema differs from it, the table schema is registered
    under schema id 1.
    """

    def _make_fields(self, field_defs):
        """Helper: field_defs is list of (id, name, type_str)."""
        return [DataField(fid, fname, AtomicType(ftype)) for fid, fname, ftype in field_defs]

    def _make_stats(self, fields, min_vals, max_vals, null_counts):
        """Helper: build SimpleStats from parallel lists.

        ``fields`` uses the same (id, name, type_str) tuples as _make_fields.
        """
        # Reuse the field helper so both helpers always build fields identically.
        df = self._make_fields(fields)
        return SimpleStats(GenericRow(min_vals, df), GenericRow(max_vals, df), null_counts)

    def _evolution_between(self, data_fields, table_fields):
        """Helper: evolution for data schema id 0 against table schema id 1."""
        schemas = {0: data_fields, 1: table_fields}
        evolutions = SimpleStatsEvolutions(lambda sid: schemas[sid], 1)
        return evolutions.get_or_create(0)

    def _assert_row(self, row, expected):
        """Helper: compare row.get_field(i) with expected[i] for every position."""
        for pos, value in enumerate(expected):
            actual = row.get_field(pos)
            if value is None:
                self.assertIsNone(actual)
            else:
                self.assertEqual(actual, value)

    def test_same_schema_no_mapping(self):
        """When data schema == table schema, index_mapping should be None."""
        fields = self._make_fields([(0, 'a', 'INT'), (1, 'b', 'STRING')])
        schemas = {0: fields}
        evolutions = SimpleStatsEvolutions(lambda sid: schemas[sid], 0)
        evolution = evolutions.get_or_create(0)
        self.assertIsNone(evolution.index_mapping)

    def test_added_column(self):
        """New column: mapping is -1, null_count = row_count."""
        data_defs = [(0, 'a', 'INT'), (1, 'b', 'INT')]
        data_fields = self._make_fields(data_defs)
        table_fields = self._make_fields(data_defs + [(2, 'c', 'INT')])
        evolution = self._evolution_between(data_fields, table_fields)

        self.assertEqual(evolution.index_mapping, [0, 1, -1])

        stats = self._make_stats(data_defs, [1, 2], [10, 20], [0, 5])
        evolved = evolution.evolution(stats, row_count=500, stats_fields=None)
        self.assertIsNone(evolved.min_values.get_field(2))
        self.assertIsNone(evolved.max_values.get_field(2))
        self.assertEqual(evolved.null_counts, [0, 5, 500])

    def test_dropped_column(self):
        """Dropped column is excluded from mapping."""
        data_fields = self._make_fields([(0, 'a', 'INT'), (1, 'b', 'STRING'), (2, 'c', 'BIGINT')])
        table_fields = self._make_fields([(0, 'a', 'INT'), (2, 'c', 'BIGINT')])
        evolution = self._evolution_between(data_fields, table_fields)
        # a->0, c->2 (b dropped)
        self.assertEqual(evolution.index_mapping, [0, 2])

    def test_renamed_column(self):
        """Column rename: matched by field ID, not name."""
        data_defs = [(0, 'old_name', 'INT'), (1, 'b', 'INT')]
        data_fields = self._make_fields(data_defs)
        table_fields = self._make_fields([(0, 'new_name', 'INT'), (1, 'b', 'INT')])
        evolution = self._evolution_between(data_fields, table_fields)

        self.assertEqual(evolution.index_mapping, [0, 1])
        self.assertNotEqual(evolution.index_mapping[0], -1)  # not missing

        stats = self._make_stats(data_defs, [10, 20], [100, 200], [0, 5])
        evolved = evolution.evolution(stats, row_count=1000, stats_fields=None)
        self._assert_row(evolved.min_values, [10, 20])
        self._assert_row(evolved.max_values, [100, 200])
        self.assertEqual(evolved.null_counts, [0, 5])

    def test_reordered_columns(self):
        """Column order differs between schemas, matched by field ID."""
        data_fields = self._make_fields([(0, 'a', 'INT'), (1, 'b', 'STRING'), (2, 'c', 'BIGINT')])
        table_fields = self._make_fields([(2, 'c', 'BIGINT'), (0, 'a', 'INT'), (1, 'b', 'STRING')])
        evolution = self._evolution_between(data_fields, table_fields)
        self.assertEqual(evolution.index_mapping, [2, 0, 1])

    def test_caching(self):
        """Evolution objects are cached by schema ID."""
        fields = self._make_fields([(0, 'a', 'INT')])
        schemas = {0: fields, 1: fields}
        evolutions = SimpleStatsEvolutions(lambda sid: schemas[sid], 1)
        e1 = evolutions.get_or_create(0)
        e2 = evolutions.get_or_create(0)
        self.assertIs(e1, e2)

    def test_complex_evolution(self):
        """Reorder + add + drop + rename combined.

        Data schema (id=0): a(0), b(1), c(2), d(3)
        Table schema (id=1): cc(2), a(0), d(3), e(4, new), bb(1)

        Verifies index_mapping, min/max projection, and null count evolution.
        """
        data_defs = [(0, 'a', 'INT'), (1, 'b', 'INT'), (2, 'c', 'INT'), (3, 'd', 'INT')]
        data_fields = self._make_fields(data_defs)
        table_fields = self._make_fields([
            (2, 'cc', 'INT'), (0, 'a', 'INT'), (3, 'd', 'INT'), (4, 'e', 'INT'), (1, 'bb', 'INT')
        ])
        evolution = self._evolution_between(data_fields, table_fields)

        # Mapping: cc(id=2)->2, a(id=0)->0, d(id=3)->3, e(id=4)->-1, bb(id=1)->1
        self.assertEqual(evolution.index_mapping, [2, 0, 3, -1, 1])

        stats = self._make_stats(
            data_defs, [10, 20, 30, 40], [100, 200, 300, 400], [1, 2, 3, 4])
        evolved = evolution.evolution(stats, row_count=1000, stats_fields=None)

        # min: cc->30, a->10, d->40, e->None, bb->20
        self._assert_row(evolved.min_values, [30, 10, 40, None, 20])

        # max: cc->300, a->100, d->400, e->None, bb->200
        self._assert_row(evolved.max_values, [300, 100, 400, None, 200])

        # null counts: cc->3, a->1, d->4, e->1000(row_count), bb->2
        self.assertEqual(evolved.null_counts, [3, 1, 4, 1000, 2])
153+
154+
155+
# Run the test cases in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)