Skip to content

Commit 15288b3

Browse files
authored
[FLINK-38450][iceberg] Fix duplicate records when schema change splits writes within a checkpoint (#4360)
1 parent a97a5bc commit 15288b3

6 files changed

Lines changed: 1202 additions & 48 deletions

File tree

flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-iceberg/src/main/java/org/apache/flink/cdc/connectors/iceberg/sink/v2/IcebergCommitter.java

Lines changed: 134 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
import java.util.List;
4848
import java.util.Map;
4949
import java.util.Optional;
50+
import java.util.TreeMap;
5051

5152
import static java.util.stream.Collectors.toList;
5253
import static org.apache.flink.runtime.checkpoint.CheckpointIDCounter.INITIAL_CHECKPOINT_ID;
@@ -62,6 +63,15 @@ public class IcebergCommitter implements Committer<WriteResultWrapper> {
6263

6364
public static final String TABLE_GROUP_KEY = "table";
6465

66+
// Use a flink-cdc. prefix so these don't clash with the flink. namespace reserved by the
67+
// Iceberg Flink connector.
68+
69+
/** Snapshot summary key for the batch index; used to resume partial commits on retry. */
70+
static final String FLINK_BATCH_INDEX = "flink-cdc.batch-index";
71+
72+
/** Snapshot summary key for the checkpoint ID on intermediate batch commits. */
73+
static final String FLINK_CHECKPOINT_ID_PROP = "flink-cdc.checkpoint-id";
74+
6575
private final Catalog catalog;
6676

6777
private final SinkCommitterMetricGroup metricGroup;
@@ -96,74 +106,140 @@ private void commit(List<WriteResultWrapper> writeResultWrappers) {
96106
if (writeResultWrappers.isEmpty()) {
97107
return;
98108
}
99-
// all commits a same checkpoint-id
100109
long checkpointId = writeResultWrappers.get(0).getCheckpointId();
101110
String newFlinkJobId = writeResultWrappers.get(0).getJobId();
102111
String operatorId = writeResultWrappers.get(0).getOperatorId();
103112

104-
Map<TableId, List<WriteResult>> tableMap = new HashMap<>();
105-
for (WriteResultWrapper writeResultWrapper : writeResultWrappers) {
106-
List<WriteResult> writeResult =
107-
tableMap.getOrDefault(writeResultWrapper.getTableId(), new ArrayList<>());
108-
writeResult.add(writeResultWrapper.getWriteResult());
109-
tableMap.put(writeResultWrapper.getTableId(), writeResult);
110-
LOGGER.info(writeResultWrapper.buildDescription());
113+
Map<TableId, List<WriteResultWrapper>> tableMap = new HashMap<>();
114+
for (WriteResultWrapper w : writeResultWrappers) {
115+
tableMap.computeIfAbsent(w.getTableId(), k -> new ArrayList<>()).add(w);
111116
}
112-
for (Map.Entry<TableId, List<WriteResult>> entry : tableMap.entrySet()) {
117+
118+
for (Map.Entry<TableId, List<WriteResultWrapper>> entry : tableMap.entrySet()) {
113119
TableId tableId = entry.getKey();
114120

121+
// Group by batchIndex so wrappers from different subtasks for the same batch
122+
// are merged into one snapshot, not committed separately.
123+
TreeMap<Integer, List<WriteResultWrapper>> batchGroups = new TreeMap<>();
124+
for (WriteResultWrapper w : entry.getValue()) {
125+
batchGroups.computeIfAbsent(w.getBatchIndex(), k -> new ArrayList<>()).add(w);
126+
LOGGER.info(w.buildDescription());
127+
}
128+
115129
Table table =
116130
catalog.loadTable(
117131
TableIdentifier.of(tableId.getSchemaName(), tableId.getTableName()));
118132

133+
int startBatchIndex = 0;
119134
Snapshot snapshot = table.currentSnapshot();
120135
if (snapshot != null) {
121136
Iterable<Snapshot> ancestors =
122137
SnapshotUtil.ancestorsOf(snapshot.snapshotId(), table::snapshot);
123-
long lastCheckpointId =
138+
long lastCommittedCheckpointId =
124139
getMaxCommittedCheckpointId(ancestors, newFlinkJobId, operatorId);
125-
if (lastCheckpointId == checkpointId) {
140+
if (lastCommittedCheckpointId >= checkpointId) {
126141
LOGGER.warn(
127142
"Checkpoint id {} has been committed to table {}, skipping",
128143
checkpointId,
129144
tableId.identifier());
130145
continue;
131146
}
147+
ancestors = SnapshotUtil.ancestorsOf(snapshot.snapshotId(), table::snapshot);
148+
startBatchIndex =
149+
getLastCommittedBatchIndex(
150+
ancestors, newFlinkJobId, operatorId, checkpointId)
151+
+ 1;
132152
}
133153

134154
Optional<TableMetric> tableMetric = getTableMetric(tableId);
135155
tableMetric.ifPresent(TableMetric::increaseCommitTimes);
136156

137-
List<WriteResult> results = entry.getValue();
138-
List<DataFile> dataFiles =
139-
results.stream()
140-
.filter(payload -> payload.dataFiles() != null)
141-
.flatMap(payload -> Arrays.stream(payload.dataFiles()))
142-
.filter(dataFile -> dataFile.recordCount() > 0)
143-
.collect(toList());
144-
List<DeleteFile> deleteFiles =
145-
results.stream()
146-
.filter(payload -> payload.deleteFiles() != null)
147-
.flatMap(payload -> Arrays.stream(payload.deleteFiles()))
148-
.filter(deleteFile -> deleteFile.recordCount() > 0)
149-
.collect(toList());
150-
if (dataFiles.isEmpty() && deleteFiles.isEmpty()) {
151-
LOGGER.info(String.format("Nothing to commit to table %s, skipping", table.name()));
152-
} else {
157+
int lastNonEmptyBatchIndex = -1;
158+
for (Map.Entry<Integer, List<WriteResultWrapper>> g : batchGroups.entrySet()) {
159+
List<DataFile> df = collectDataFilesFromGroup(g.getValue());
160+
List<DeleteFile> del = collectDeleteFilesFromGroup(g.getValue());
161+
if (!df.isEmpty() || !del.isEmpty()) {
162+
lastNonEmptyBatchIndex = g.getKey();
163+
}
164+
}
165+
166+
// Commit each batch as a separate snapshot so sequence numbers increase per batch.
167+
for (Map.Entry<Integer, List<WriteResultWrapper>> g : batchGroups.entrySet()) {
168+
int batchIdx = g.getKey();
169+
if (batchIdx < startBatchIndex) {
170+
LOGGER.info(
171+
"Batch {} for checkpoint {} of table {} already committed, skipping",
172+
batchIdx,
173+
checkpointId,
174+
tableId.identifier());
175+
continue;
176+
}
177+
178+
List<DataFile> dataFiles = collectDataFilesFromGroup(g.getValue());
179+
List<DeleteFile> deleteFiles = collectDeleteFilesFromGroup(g.getValue());
180+
181+
if (dataFiles.isEmpty() && deleteFiles.isEmpty()) {
182+
LOGGER.info(
183+
"Batch {} for checkpoint {} of table {} has nothing to commit, skipping",
184+
batchIdx,
185+
checkpointId,
186+
tableId.identifier());
187+
continue;
188+
}
189+
190+
SnapshotUpdate<?> operation;
153191
if (deleteFiles.isEmpty()) {
154192
AppendFiles append = table.newAppend();
155193
dataFiles.forEach(append::appendFile);
156-
commitOperation(append, newFlinkJobId, operatorId, checkpointId);
194+
operation = append;
157195
} else {
158196
RowDelta delta = table.newRowDelta();
159197
dataFiles.forEach(delta::addRows);
160198
deleteFiles.forEach(delta::addDeletes);
161-
commitOperation(delta, newFlinkJobId, operatorId, checkpointId);
199+
operation = delta;
162200
}
201+
202+
operation.set(SinkUtil.FLINK_JOB_ID, newFlinkJobId);
203+
operation.set(SinkUtil.OPERATOR_ID, operatorId);
204+
operation.set(FLINK_BATCH_INDEX, String.valueOf(batchIdx));
205+
operation.set(FLINK_CHECKPOINT_ID_PROP, String.valueOf(checkpointId));
206+
if (batchIdx == lastNonEmptyBatchIndex) {
207+
operation.set(
208+
SinkUtil.MAX_COMMITTED_CHECKPOINT_ID, String.valueOf(checkpointId));
209+
}
210+
operation.commit();
163211
}
164212
}
165213
}
166214

215+
private static List<DataFile> collectDataFilesFromGroup(List<WriteResultWrapper> group) {
216+
return group.stream()
217+
.flatMap(w -> collectDataFiles(w.getWriteResult()).stream())
218+
.collect(toList());
219+
}
220+
221+
private static List<DeleteFile> collectDeleteFilesFromGroup(List<WriteResultWrapper> group) {
222+
return group.stream()
223+
.flatMap(w -> collectDeleteFiles(w.getWriteResult()).stream())
224+
.collect(toList());
225+
}
226+
227+
private static List<DataFile> collectDataFiles(WriteResult result) {
228+
if (result.dataFiles() == null) {
229+
return new ArrayList<>();
230+
}
231+
return Arrays.stream(result.dataFiles()).filter(f -> f.recordCount() > 0).collect(toList());
232+
}
233+
234+
private static List<DeleteFile> collectDeleteFiles(WriteResult result) {
235+
if (result.deleteFiles() == null) {
236+
return new ArrayList<>();
237+
}
238+
return Arrays.stream(result.deleteFiles())
239+
.filter(f -> f.recordCount() > 0)
240+
.collect(toList());
241+
}
242+
167243
private static long getMaxCommittedCheckpointId(
168244
Iterable<Snapshot> ancestors, String flinkJobId, String operatorId) {
169245
long lastCommittedCheckpointId = INITIAL_CHECKPOINT_ID - 1;
@@ -185,15 +261,35 @@ private static long getMaxCommittedCheckpointId(
185261
return lastCommittedCheckpointId;
186262
}
187263

188-
private static void commitOperation(
189-
SnapshotUpdate<?> operation,
190-
String newFlinkJobId,
191-
String operatorId,
192-
long checkpointId) {
193-
operation.set(SinkUtil.MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId));
194-
operation.set(SinkUtil.FLINK_JOB_ID, newFlinkJobId);
195-
operation.set(SinkUtil.OPERATOR_ID, operatorId);
196-
operation.commit();
264+
/**
265+
* Returns the highest batch index already committed for the given checkpoint, or -1 if none.
266+
* Used to skip already-persisted batches on retry.
267+
*/
268+
private static int getLastCommittedBatchIndex(
269+
Iterable<Snapshot> ancestors, String flinkJobId, String operatorId, long checkpointId) {
270+
for (Snapshot ancestor : ancestors) {
271+
Map<String, String> summary = ancestor.summary();
272+
if (!flinkJobId.equals(summary.get(SinkUtil.FLINK_JOB_ID))) {
273+
continue;
274+
}
275+
String snapshotOperatorId = summary.get(SinkUtil.OPERATOR_ID);
276+
if (snapshotOperatorId != null && !snapshotOperatorId.equals(operatorId)) {
277+
continue;
278+
}
279+
// Stop once we pass a fully-committed earlier checkpoint; intermediate batch
280+
// snapshots for the current checkpoint lie between it and the current tip.
281+
String maxCommittedStr = summary.get(SinkUtil.MAX_COMMITTED_CHECKPOINT_ID);
282+
if (maxCommittedStr != null && Long.parseLong(maxCommittedStr) < checkpointId) {
283+
break;
284+
}
285+
String snapshotCheckpointId = summary.get(FLINK_CHECKPOINT_ID_PROP);
286+
if (snapshotCheckpointId != null
287+
&& Long.parseLong(snapshotCheckpointId) == checkpointId) {
288+
String batchIndexStr = summary.get(FLINK_BATCH_INDEX);
289+
return batchIndexStr != null ? Integer.parseInt(batchIndexStr) : 0;
290+
}
291+
}
292+
return -1;
197293
}
198294

199295
private Optional<TableMetric> getTableMetric(TableId tableId) {

flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-iceberg/src/main/java/org/apache/flink/cdc/connectors/iceberg/sink/v2/IcebergWriter.java

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@ public class IcebergWriter
7272

7373
private final List<WriteResultWrapper> temporaryWriteResult;
7474

75+
/**
76+
* Per-table batch index; incremented on each schema-change flush, even when no writer exists.
77+
*/
78+
private Map<TableId, Integer> tableBatchIndexMap;
79+
7580
private Catalog catalog;
7681

7782
private final int taskId;
@@ -102,6 +107,7 @@ public IcebergWriter(
102107
writerFactoryMap = new HashMap<>();
103108
writerMap = new HashMap<>();
104109
schemaMap = new HashMap<>();
110+
tableBatchIndexMap = new HashMap<>();
105111
temporaryWriteResult = new ArrayList<>();
106112
this.taskId = taskId;
107113
this.attemptId = attemptId;
@@ -129,6 +135,7 @@ public Collection<WriteResultWrapper> prepareCommit() throws IOException {
129135
list.addAll(temporaryWriteResult);
130136
list.addAll(getWriteResult());
131137
temporaryWriteResult.clear();
138+
tableBatchIndexMap.clear();
132139
lastCheckpointId++;
133140
return list;
134141
}
@@ -166,6 +173,11 @@ public void write(Event event, Context context) throws IOException {
166173
} else {
167174
SchemaChangeEvent schemaChangeEvent = (SchemaChangeEvent) event;
168175
TableId tableId = schemaChangeEvent.tableId();
176+
// Flush only when the table is already known; skip on initial CreateTableEvent since
177+
// no data has been written yet and there is nothing to split.
178+
if (schemaMap.containsKey(tableId)) {
179+
flushTableWriter(tableId);
180+
}
169181
TableSchemaWrapper tableSchemaWrapper = schemaMap.get(tableId);
170182

171183
Schema newSchema =
@@ -179,21 +191,46 @@ public void write(Event event, Context context) throws IOException {
179191

180192
@Override
181193
public void flush(boolean flush) throws IOException {
182-
// Notice: flush method may be called many times during one checkpoint.
183-
temporaryWriteResult.addAll(getWriteResult());
194+
// Clear the factory cache so the next write picks up the latest catalog schema.
195+
// Writers keep running; schema-change splits are handled in flushTableWriter.
196+
writerFactoryMap.clear();
197+
}
198+
199+
private void flushTableWriter(TableId tableId) throws IOException {
200+
TaskWriter<RowData> writer = writerMap.remove(tableId);
201+
// Advance even when no writer exists, to keep batchIndex in sync across subtasks.
202+
int batchIndex = tableBatchIndexMap.getOrDefault(tableId, 0);
203+
tableBatchIndexMap.put(tableId, batchIndex + 1);
204+
if (writer == null) {
205+
return;
206+
}
207+
WriteResultWrapper writeResultWrapper =
208+
new WriteResultWrapper(
209+
writer.complete(),
210+
tableId,
211+
lastCheckpointId + 1,
212+
jobId,
213+
operatorId,
214+
batchIndex);
215+
temporaryWriteResult.add(writeResultWrapper);
216+
LOGGER.info(writeResultWrapper.buildDescription());
217+
writerFactoryMap.remove(tableId);
184218
}
185219

186220
private List<WriteResultWrapper> getWriteResult() throws IOException {
187221
long currentCheckpointId = lastCheckpointId + 1;
188222
List<WriteResultWrapper> writeResults = new ArrayList<>();
189223
for (Map.Entry<TableId, TaskWriter<RowData>> entry : writerMap.entrySet()) {
224+
TableId tableId = entry.getKey();
225+
int batchIndex = tableBatchIndexMap.getOrDefault(tableId, 0);
190226
WriteResultWrapper writeResultWrapper =
191227
new WriteResultWrapper(
192228
entry.getValue().complete(),
193-
entry.getKey(),
229+
tableId,
194230
currentCheckpointId,
195231
jobId,
196-
operatorId);
232+
operatorId,
233+
batchIndex);
197234
writeResults.add(writeResultWrapper);
198235
LOGGER.info(writeResultWrapper.buildDescription());
199236
}
@@ -225,6 +262,11 @@ public void close() throws Exception {
225262
writerFactoryMap = null;
226263
}
227264

265+
if (tableBatchIndexMap != null) {
266+
tableBatchIndexMap.clear();
267+
tableBatchIndexMap = null;
268+
}
269+
228270
catalog = null;
229271
}
230272
}

flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-iceberg/src/main/java/org/apache/flink/cdc/connectors/iceberg/sink/v2/WriteResultWrapper.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,31 @@ public class WriteResultWrapper implements Serializable {
4040

4141
private final String operatorId;
4242

43+
/** Batch index within the checkpoint for this table; increments on each schema-change flush. */
44+
private final int batchIndex;
45+
4346
public WriteResultWrapper(
4447
WriteResult writeResult,
4548
TableId tableId,
4649
long checkpointId,
4750
String jobId,
48-
String operatorId) {
51+
String operatorId,
52+
int batchIndex) {
4953
this.writeResult = writeResult;
5054
this.tableId = tableId;
5155
this.checkpointId = checkpointId;
5256
this.jobId = jobId;
5357
this.operatorId = operatorId;
58+
this.batchIndex = batchIndex;
59+
}
60+
61+
public WriteResultWrapper(
62+
WriteResult writeResult,
63+
TableId tableId,
64+
long checkpointId,
65+
String jobId,
66+
String operatorId) {
67+
this(writeResult, tableId, checkpointId, jobId, operatorId, 0);
5468
}
5569

5670
public WriteResult getWriteResult() {
@@ -73,6 +87,10 @@ public String getOperatorId() {
7387
return operatorId;
7488
}
7589

90+
public int getBatchIndex() {
91+
return batchIndex;
92+
}
93+
7694
/** Build a simple description for the write result. */
7795
public String buildDescription() {
7896
long addCount = 0;
@@ -95,6 +113,8 @@ public String buildDescription() {
95113
+ jobId
96114
+ ", OperatorId: "
97115
+ operatorId
116+
+ ", BatchIndex: "
117+
+ batchIndex
98118
+ ", AddCount: "
99119
+ addCount
100120
+ ", DeleteCount: "

0 commit comments

Comments
 (0)