Skip to content

Commit 3ee1e2d

Browse files
author
Alekhya Parisha
committed
Add worker retry mechanism for failed batch processing in LeaderOnlyTokenCrawler
Signed-off-by: Alekhya Parisha <aparisha@amazon.com>
1 parent 2c3b8e0 commit 3ee1e2d

2 files changed

Lines changed: 44 additions & 25 deletions

File tree

data-prepper-plugins/saas-source-plugins/source-crawler/src/main/java/org/opensearch/dataprepper/plugins/source/source_crawler/base/LeaderOnlyTokenCrawler.java

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
import org.opensearch.dataprepper.model.record.Record;
1313
import org.opensearch.dataprepper.model.source.coordinator.enhanced.EnhancedSourceCoordinator;
1414
import org.opensearch.dataprepper.plugins.source.source_crawler.coordination.partition.LeaderPartition;
15+
import org.opensearch.dataprepper.plugins.source.source_crawler.coordination.partition.SaasSourcePartition;
16+
import org.opensearch.dataprepper.plugins.source.source_crawler.coordination.state.PaginationCrawlerWorkerProgressState;
1517
import org.opensearch.dataprepper.plugins.source.source_crawler.coordination.state.TokenPaginationCrawlerLeaderProgressState;
1618
import org.opensearch.dataprepper.plugins.source.source_crawler.model.ItemInfo;
1719
import org.slf4j.Logger;
@@ -25,6 +27,7 @@
2527
import java.util.List;
2628
import java.util.concurrent.TimeUnit;
2729
import java.util.concurrent.atomic.AtomicBoolean;
30+
import java.util.stream.Collectors;
2831

2932
@Named
3033
public class LeaderOnlyTokenCrawler implements Crawler<SaasWorkerProgressState> {
@@ -54,7 +57,6 @@ public class LeaderOnlyTokenCrawler implements Crawler<SaasWorkerProgressState>
5457
private final Timer bufferWriteTimer;
5558

5659
private String lastToken;
57-
private boolean shouldStopCrawl = false;
5860
private Duration noAckTimeout;
5961

6062
public LeaderOnlyTokenCrawler(
@@ -73,7 +75,6 @@ public LeaderOnlyTokenCrawler(
7375
@Override
7476
public Instant crawl(LeaderPartition leaderPartition,
7577
EnhancedSourceCoordinator coordinator) {
76-
shouldStopCrawl = false;
7778
long startTime = System.currentTimeMillis();
7879
Instant lastCheckpointTime = Instant.now();
7980
TokenPaginationCrawlerLeaderProgressState leaderProgressState =
@@ -84,7 +85,7 @@ public Instant crawl(LeaderPartition leaderPartition,
8485

8586
Iterator<ItemInfo> itemIterator = ((LeaderOnlyTokenCrawlerClient) client).listItems(lastToken);
8687

87-
while (itemIterator.hasNext() && !shouldStopCrawl) {
88+
while (itemIterator.hasNext()) {
8889
List<ItemInfo> batch = collectBatch(itemIterator);
8990
if (batch.isEmpty()) {
9091
continue;
@@ -148,12 +149,12 @@ private void processBatch(List<ItemInfo> batch,
148149
if (success) {
149150
// On success: record the successful acknowledgment (the checkpoint is no longer updated in this branch)
150151
acknowledgementSetSuccesses.increment();
151-
updateLeaderProgressState(leaderPartition, lastToken, coordinator);
152152
} else {
153-
// On failure: Stop the crawl
153+
// On failure: Create a retry partition
154154
acknowledgementSetFailures.increment();
155-
log.warn("Batch processing received negative acknowledgment for token: {}. Stopping current crawl.", lastToken);
156-
shouldStopCrawl = true;
155+
log.warn("Batch processing received negative acknowledgment for token: {}. Creating retry " +
156+
"partition.", lastToken);
157+
createRetryPartition(batch, coordinator);
157158
}
158159
},
159160
noAckTimeout
@@ -172,17 +173,19 @@ private void processBatch(List<ItemInfo> batch,
172173

173174
if (!ackWaitDuration.minus(noAckTimeout).isNegative()) {
174175
// No ack received within NO_ACK_TIME_OUT_SECONDS
175-
log.warn("Acknowledgment not received for batch with token {} past wait time. Stopping current crawl.", lastToken);
176-
shouldStopCrawl = true;
176+
log.warn("No acknowledgment received for batch with token: {}. Creating retry partition.", lastToken);
177+
createRetryPartition(batch, coordinator);
177178
break;
178179
}
179180
}
181+
updateLeaderProgressState(leaderPartition, lastToken, coordinator);
180182
} catch (InterruptedException e) {
181183
Thread.currentThread().interrupt();
182184
throw new RuntimeException("Interrupted while waiting for acknowledgment", e);
183185
} catch (Exception e) {
184186
log.error("Failed to process batch ending with token {}", lastToken, e);
185187
acknowledgementSet.complete();
188+
createRetryPartition(batch, coordinator);
186189
throw e;
187190
}
188191
});
@@ -201,6 +204,23 @@ private void processBatch(List<ItemInfo> batch,
201204
}
202205
}
203206

207+
/**
 * Registers a retry partition for a batch that was not processed successfully.
 *
 * <p>Builds a {@link PaginationCrawlerWorkerProgressState} holding the IDs of every item in the
 * batch, wraps it in a {@link SaasSourcePartition}, and hands it to the coordinator via
 * {@code createPartition}. Within this class it is invoked from {@code processBatch} when a
 * negative acknowledgment arrives, when no acknowledgment arrives within the timeout, and when
 * batch processing throws.
 *
 * <p>An empty batch is a no-op. The partition key and key attributes are taken from the first
 * item only; the remaining items contribute just their IDs.
 *
 * <p>NOTE(review): presumably a worker later acquires this partition and re-processes the listed
 * item IDs - confirm against the worker-side code.
 * NOTE(review): any result returned by {@code coordinator.createPartition} is not inspected here;
 * confirm whether a rejected or duplicate partition needs explicit handling.
 *
 * @param itemInfoList items of the failed batch; must not be {@code null} (it is dereferenced
 *                     without a null check)
 * @param coordinator  source coordinator that receives the new retry partition
 */
private void createRetryPartition(List<ItemInfo> itemInfoList, EnhancedSourceCoordinator coordinator) {
208+
// Nothing to retry for an empty batch.
if (itemInfoList.isEmpty()) {
209+
return;
210+
}
211+
// The first item supplies the partition key and key attributes for the whole batch.
ItemInfo itemInfo = itemInfoList.get(0);
212+
String partitionKey = itemInfo.getPartitionKey();
213+
// Collect the ID of every item in the batch, in batch order.
List<String> itemIds = itemInfoList.stream().map(ItemInfo::getId).collect(Collectors.toList());
214+
PaginationCrawlerWorkerProgressState state = new PaginationCrawlerWorkerProgressState();
215+
state.setKeyAttributes(itemInfo.getKeyAttributes());
216+
state.setItemIds(itemIds);
217+
// The export start time is the moment the retry partition is built, not the original crawl time.
state.setExportStartTime(Instant.now());
218+
state.setLoadedItems(itemInfoList.size());
219+
SaasSourcePartition sourcePartition = new SaasSourcePartition(state, partitionKey);
220+
coordinator.createPartition(sourcePartition);
221+
}
222+
223+
204224
private void updateLeaderProgressState(LeaderPartition leaderPartition,
205225
String updatedToken,
206226
EnhancedSourceCoordinator coordinator) {

data-prepper-plugins/saas-source-plugins/source-crawler/src/test/java/org/opensearch/dataprepper/plugins/source/source_crawler/base/LeaderOnlyTokenCrawlerTest.java

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import static org.mockito.Mockito.when;
3636
import static org.mockito.Mockito.doThrow;
3737
import static org.mockito.Mockito.never;
38+
import static org.mockito.Mockito.doAnswer;
3839
import static org.mockito.internal.verification.VerificationModeFactory.times;
3940

4041
@ExtendWith(MockitoExtension.class)
@@ -145,28 +146,29 @@ void testProgressStateUpdate() {
145146

146147
@Test
147148
void testNegativeAcknowledgment() {
148-
List<ItemInfo> items = createTestItems(BATCH_SIZE + 1);
149+
List<ItemInfo> items = createTestItems(1);
149150
when(client.listItems(INITIAL_TOKEN)).thenReturn(items.iterator());
150-
when(acknowledgementSetManager.create(any(), eq(TEST_TIMEOUT)))
151-
.thenReturn(acknowledgementSet);
152151

153-
ArgumentCaptor<Consumer<Boolean>> callbackCaptor = ArgumentCaptor.forClass(Consumer.class);
152+
// Setup immediate negative acknowledgment
153+
doAnswer(invocation -> {
154+
Consumer<Boolean> callback = invocation.getArgument(0);
155+
callback.accept(false); // Trigger negative ack immediately
156+
return acknowledgementSet;
157+
}).when(acknowledgementSetManager).create(any(), eq(TEST_TIMEOUT));
154158

155159
crawler.setAcknowledgementsEnabled(true);
156160
crawler.crawl(leaderPartition, coordinator);
157161

158-
verify(acknowledgementSetManager).create(callbackCaptor.capture(), eq(TEST_TIMEOUT));
159-
160-
// Simulate negative acknowledgment
161-
callbackCaptor.getValue().accept(false);
162-
162+
// Verify behavior
163163
verify(client, times(1)).writeBatchToBuffer(any(), any(), any());
164-
verify(coordinator, never()).saveProgressStateForPartition(eq(leaderPartition), any(Duration.class));
164+
verify(coordinator, times(1)).createPartition(any());
165+
verify(acknowledgementSet, times(1)).complete();
165166
}
166167

168+
167169
@Test
168170
void testAcknowledgmentTimeout() {
169-
List<ItemInfo> items = createTestItems(BATCH_SIZE + 1);
171+
List<ItemInfo> items = createTestItems( 1);
170172
when(client.listItems(INITIAL_TOKEN)).thenReturn(items.iterator());
171173
when(acknowledgementSetManager.create(any(), eq(TEST_TIMEOUT)))
172174
.thenReturn(acknowledgementSet);
@@ -178,13 +180,10 @@ void testAcknowledgmentTimeout() {
178180

179181
verify(acknowledgementSetManager).create(callbackCaptor.capture(), eq(TEST_TIMEOUT));
180182

181-
// Verify:
182-
// 1. Only first batch was processed
183+
// Verify timeout behavior
183184
verify(client, times(1)).writeBatchToBuffer(any(), any(), any());
184-
// 2. No checkpoint update happened
185-
verify(coordinator, never()).saveProgressStateForPartition(eq(leaderPartition), any(Duration.class));
186-
// 3. Acknowledgment set was completed
187185
verify(acknowledgementSet).complete();
186+
verify(coordinator).createPartition(any());
188187
}
189188

190189

0 commit comments

Comments
 (0)