Skip to content

Commit b5ce0e5

Browse files
authored
[FIX] Fix flaky BookieAutoRecoveryTest#testOpenLedgers timeout (#4743)
When the killed bookie happens to be the Auditor leader, the test must first wait for the ZK session timeout (default 10s) to expire before the bookie's ephemeral node disappears, and then wait for a new Auditor leader election, a metadata scan, and the publishing of the underreplicated ledger. In resource-constrained CI environments, this chain can exceed the 60-second await timeout.

Two changes:
- setZkTimeout(4000) — reduces the ZK session timeout so the ephemeral node disappears faster
- await 60s → 90s — provides more headroom for slow CI environments
1 parent ad2b6e9 commit b5ce0e5

1 file changed

Lines changed: 15 additions & 12 deletions

File tree

bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,9 @@ public BookieAutoRecoveryTest() throws IOException, KeeperException,
8888
"org.apache.bookkeeper.meta.HierarchicalLedgerManagerFactory");
8989
baseConf.setOpenLedgerRereplicationGracePeriod(openLedgerRereplicationGracePeriod);
9090
baseConf.setRwRereplicateBackoffMs(500);
91+
// Reduce ZK session timeout so killed bookie's ephemeral node disappears faster,
92+
// speeding up Auditor leader re-election when the killed bookie was the leader.
93+
baseConf.setZkTimeout(4000);
9194
baseClientConf.setLedgerManagerFactoryClassName(
9295
"org.apache.bookkeeper.meta.HierarchicalLedgerManagerFactory");
9396
this.digestType = DigestType.MAC;
@@ -168,7 +171,7 @@ public void testOpenLedgers() throws Exception {
168171

169172
// waiting to publish urLedger znode by Auditor
170173
assertTrue("Ledger should be marked as underreplicated",
171-
latch.await(60, TimeUnit.SECONDS));
174+
latch.await(90, TimeUnit.SECONDS));
172175
latch = new CountDownLatch(1);
173176
LOG.info("Watching on urLedgerPath:" + urLedgerZNode
174177
+ " to know the status of rereplication process");
@@ -186,7 +189,7 @@ public void testOpenLedgers() throws Exception {
186189
+ replicaToKillAddr);
187190
}
188191
assertTrue("Replication should complete",
189-
latch.await(60, TimeUnit.SECONDS));
192+
latch.await(90, TimeUnit.SECONDS));
190193

191194
// grace period to update the urledger metadata in zookeeper
192195
LOG.info("Waiting to update the urledger metadata in zookeeper");
@@ -223,7 +226,7 @@ public void testClosedLedgers() throws Exception {
223226

224227
// waiting to publish urLedger znode by Auditor
225228
assertTrue("Ledgers should be marked as underreplicated",
226-
latch.await(60, TimeUnit.SECONDS));
229+
latch.await(90, TimeUnit.SECONDS));
227230

228231
// Again watching the urLedger znode to know the replication status
229232
latch = new CountDownLatch(listOfLedgerHandle.size());
@@ -248,7 +251,7 @@ public void testClosedLedgers() throws Exception {
248251

249252
// waiting to finish replication
250253
assertTrue("Replication should complete",
251-
latch.await(60, TimeUnit.SECONDS));
254+
latch.await(90, TimeUnit.SECONDS));
252255

253256
// grace period to update the urledger metadata in zookeeper
254257
LOG.info("Waiting to update the urledger metadata in zookeeper");
@@ -296,7 +299,7 @@ public void testStopWhileReplicationInProgress() throws Exception {
296299

297300
// waiting to publish urLedger znode by Auditor
298301
assertTrue("Ledgers should be marked as underreplicated",
299-
latch.await(60, TimeUnit.SECONDS));
302+
latch.await(90, TimeUnit.SECONDS));
300303

301304
// Again watching the urLedger znode to know the replication status
302305
latch = new CountDownLatch(listOfLedgerHandle.size());
@@ -332,7 +335,7 @@ public void testStopWhileReplicationInProgress() throws Exception {
332335

333336
LOG.info("Waiting to finish rereplication processes");
334337
assertTrue("Replication should complete after restart",
335-
latch.await(60, TimeUnit.SECONDS));
338+
latch.await(90, TimeUnit.SECONDS));
336339

337340
// grace period to update the urledger metadata in zookeeper
338341
LOG.info("Waiting to update the urledger metadata in zookeeper");
@@ -369,7 +372,7 @@ public void testNoSuchLedgerExists() throws Exception {
369372
killBookie(replicaToKillAddr);
370373
// waiting to publish urLedger znode by Auditor
371374
assertTrue("Ledgers should be marked as underreplicated",
372-
latch.await(60, TimeUnit.SECONDS));
375+
latch.await(90, TimeUnit.SECONDS));
373376

374377
latch = new CountDownLatch(listOfLedgerHandle.size());
375378
for (LedgerHandle lh : listOfLedgerHandle) {
@@ -385,7 +388,7 @@ public void testNoSuchLedgerExists() throws Exception {
385388

386389
// waiting to delete published urledgers, since it doesn't exists
387390
assertTrue("UrLedgers should be cleaned up after deletion",
388-
latch.await(60, TimeUnit.SECONDS));
391+
latch.await(90, TimeUnit.SECONDS));
389392

390393
for (LedgerHandle lh : listOfLedgerHandle) {
391394
assertNull("UrLedger still exists after rereplication",
@@ -491,7 +494,7 @@ public void testLedgerMetadataContainsIpAddressAsBookieID()
491494

492495
// waiting to publish urLedger znode by Auditor
493496
assertTrue("Ledger should be marked as underreplicated",
494-
latch.await(60, TimeUnit.SECONDS));
497+
latch.await(90, TimeUnit.SECONDS));
495498
latch = new CountDownLatch(1);
496499
LOG.info("Watching on urLedgerPath:" + urLedgerZNode
497500
+ " to know the status of rereplication process");
@@ -512,7 +515,7 @@ public void testLedgerMetadataContainsIpAddressAsBookieID()
512515
+ replicaToKillAddr);
513516
}
514517
assertTrue("Replication should complete",
515-
latch.await(60, TimeUnit.SECONDS));
518+
latch.await(90, TimeUnit.SECONDS));
516519

517520
// grace period to update the urledger metadata in zookeeper
518521
LOG.info("Waiting to update the urledger metadata in zookeeper");
@@ -570,7 +573,7 @@ public void testLedgerMetadataContainsHostNameAsBookieID()
570573

571574
// waiting to publish urLedger znode by Auditor
572575
assertTrue("Ledger should be marked as underreplicated",
573-
latch.await(60, TimeUnit.SECONDS));
576+
latch.await(90, TimeUnit.SECONDS));
574577
latch = new CountDownLatch(1);
575578
LOG.info("Watching on urLedgerPath:" + urLedgerZNode
576579
+ " to know the status of rereplication process");
@@ -593,7 +596,7 @@ public void testLedgerMetadataContainsHostNameAsBookieID()
593596
+ replicaToKillAddr);
594597
}
595598
assertTrue("Replication should complete",
596-
latch.await(60, TimeUnit.SECONDS));
599+
latch.await(90, TimeUnit.SECONDS));
597600

598601
// grace period to update the urledger metadata in zookeeper
599602
LOG.info("Waiting to update the urledger metadata in zookeeper");

0 commit comments

Comments
 (0)