Commit 7c19adf

Merge pull request #29981 from vbotbuildovich/backport-pr-29980-v26.1.x-276
[v26.1.x] ct: Fix ctp_stm test
2 parents 17bb771 + c0aee4f

1 file changed: src/v/cloud_topics/level_zero/stm/tests/ctp_stm_test.cc (52 additions & 14 deletions)
@@ -818,18 +818,38 @@ TEST_F_CORO(
     // This is where the bug manifests: Node 0 has stale in-memory window [11,
     // 12]
     node0.raft()->unblock_new_leadership();
+
+    // Wait for all nodes to catch up before transferring leadership.
+    // The replication above may have achieved majority without the target
+    // node, and the transfer will fail if the target hasn't caught up.
+    co_await wait_for_committed_offset(node1.raft()->committed_offset(), 10s);
+
     vlog(
       ct::cd_log.info,
       "Transferring leadership back to Node {}",
       initial_leader_id);
-    co_await node1.raft()->transfer_leadership(
-      raft::transfer_leadership_request{
-        .group = node1.raft()->group(),
-        .target = initial_leader_id,
-        .timeout = 10s});
 
-    co_await wait_for_leader(10s);
-    auto final_leader_id = *get_leader();
+    // Retry the transfer since it can transiently fail if the target
+    // node's follower state hasn't been fully updated yet.
+    auto final_leader_id = model::node_id{};
+    for (int attempt = 0; attempt < 5; ++attempt) {
+        co_await node1.raft()->transfer_leadership(
+          raft::transfer_leadership_request{
+            .group = node1.raft()->group(),
+            .target = initial_leader_id,
+            .timeout = 10s});
+
+        co_await wait_for_leader(10s);
+        final_leader_id = *get_leader();
+        if (final_leader_id == initial_leader_id) {
+            break;
+        }
+        vlog(
+          ct::cd_log.info,
+          "Transfer attempt {} landed on node {}, retrying",
+          attempt,
+          final_leader_id);
+    }
     vlog(ct::cd_log.info, "Final leader: {}", final_leader_id);
     ASSERT_EQ_CORO(final_leader_id, initial_leader_id)
       << "Leadership should have transferred back to original leader";
@@ -931,14 +951,32 @@ TEST_F_CORO(
 
     // Step 6: Transfer leadership back to Node0.
     node0.raft()->unblock_new_leadership();
-    co_await node1.raft()->transfer_leadership(
-      raft::transfer_leadership_request{
-        .group = node1.raft()->group(),
-        .target = initial_leader_id,
-        .timeout = 10s});
 
-    co_await wait_for_leader(10s);
-    ASSERT_EQ_CORO(*get_leader(), initial_leader_id);
+    // Wait for all nodes to catch up before transferring leadership.
+    co_await wait_for_committed_offset(node1.raft()->committed_offset(), 10s);
+
+    // Retry the transfer since it can transiently fail if the target
+    // node's follower state hasn't been fully updated yet.
+    auto final_leader_id = model::node_id{};
+    for (int attempt = 0; attempt < 5; ++attempt) {
+        co_await node1.raft()->transfer_leadership(
+          raft::transfer_leadership_request{
+            .group = node1.raft()->group(),
+            .target = initial_leader_id,
+            .timeout = 10s});
+
+        co_await wait_for_leader(10s);
+        final_leader_id = *get_leader();
+        if (final_leader_id == initial_leader_id) {
+            break;
+        }
+        vlog(
+          ct::cd_log.info,
+          "Transfer attempt {} landed on node {}, retrying",
+          attempt,
+          final_leader_id);
+    }
+    ASSERT_EQ_CORO(final_leader_id, initial_leader_id);
 
     // Step 7: Try to fence epoch 5 on the returned leader.
     // Applied window is now [7, 8] so this must be rejected.

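The fix follows a common pattern for flaky leadership-transfer assertions: first wait until the transfer target has caught up (in a three-node group the leader plus one follower already form a majority, so an entry can commit while the third node still lags), then retry the transfer a bounded number of times because a single attempt can transiently fail. The sketch below isolates that control flow under stated assumptions: sim_cluster, transfer_to, and stale_rounds are hypothetical stand-ins for the raft calls in the test, not part of the Redpanda API.

    #include <cstdio>

    // Hypothetical stand-in for the raft group used in the test. It models
    // one failure mode only: a transfer attempt lands back on the current
    // leader until the target's follower state has caught up.
    struct sim_cluster {
        int leader = 1;       // node 1 holds leadership initially
        int stale_rounds = 2; // attempts that fail before the target catches up

        int transfer_to(int target) {
            if (stale_rounds > 0) {
                --stale_rounds; // target not caught up yet; transfer is refused
                return leader;
            }
            leader = target;
            return leader;
        }
    };

    int main() {
        sim_cluster cluster;
        const int target = 0; // the initial leader we want back
        int final_leader = -1;

        // Bounded retry loop mirroring the test: attempt, observe, break or retry.
        for (int attempt = 0; attempt < 5; ++attempt) {
            final_leader = cluster.transfer_to(target);
            if (final_leader == target) {
                break;
            }
            std::printf("attempt %d landed on node %d, retrying\n", attempt, final_leader);
        }
        std::printf("final leader: %d\n", final_leader);
        return final_leader == target ? 0 : 1;
    }

The cap of five attempts matches the choice in the test: enough to ride out transient follower lag without letting a genuine regression hang the run, since the final assertion still fails if no attempt succeeds.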