Skip to content

Commit b3fe753

Browse files
cjen1-msfteddyashtonachamayou
authored
Don't rollback uncommittable indices on become_follower (#7620)
Co-authored-by: Eddy Ashton <ashton.eddy@gmail.com> Co-authored-by: Amaury Chamayou <amchamay@microsoft.com> Co-authored-by: Amaury Chamayou <amaury@xargs.fr>
1 parent b4be36f commit b3fe753

10 files changed

Lines changed: 100 additions & 29 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
1313

1414
- `GET` and `HEAD` `/node/ledger-chunk?since={seqno}` and `/node/ledger-chunk/{chunk_name}` endpoints, gated by the `LedgerChunkDownload` RPC interface operator feature. See [documentation](https://microsoft.github.io/CCF/main/operations/ledger_snapshot.html#download-endpoints) for more detail.
1515

16+
### Fixed
17+
18+
- Only rollback uncommittable indices during become_leader (#7620)
19+
1620
## [7.0.0-dev9]
1721

1822
[7.0.0-dev9]: https://github.com/microsoft/CCF/releases/tag/ccf-7.0.0-dev9

src/consensus/aft/raft.h

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1161,7 +1161,7 @@ namespace aft
11611161

11621162
// Reply false if the log doesn't contain an entry at r.prev_idx
11631163
// whose term is r.prev_term. Rejects "future" entries.
1164-
if (prev_term == 0)
1164+
if (prev_term == ccf::VIEW_UNKNOWN)
11651165
{
11661166
RAFT_DEBUG_FMT(
11671167
"Recv {} to {} from {} but our log does not yet "
@@ -1212,10 +1212,10 @@ namespace aft
12121212
state->commit_idx);
12131213
return;
12141214
}
1215-
// Redundant with check on get_term_internal() at line 1149
1216-
// Which captures this case in every situation, except r.prev_term == 0.
1217-
// That only happens if r.prev_idx == 0 however, see line 1033,
1218-
// in which case this path should not be taken either.
1215+
// This block is redundant - the checks above cover this case, so the code
1216+
// inside this block should be unreachable. It is retained out of
1217+
// abundance of caution, in case future rewrites of the above conditions
1218+
// allow a fallthrough.
12191219
if (r.prev_idx > state->last_idx)
12201220
{
12211221
RAFT_FAIL_FMT(
@@ -1652,6 +1652,7 @@ namespace aft
16521652
std::min(this_match, node->second.sent_idx), node->second.match_idx);
16531653
return;
16541654
}
1655+
16551656
// max(...) because why would we ever want to go backwards on a success
16561657
// response?!
16571658
node->second.match_idx = std::max(node->second.match_idx, r.last_log_idx);
@@ -2244,11 +2245,6 @@ namespace aft
22442245
restart_election_timeout();
22452246
reset_last_ack_timeouts();
22462247

2247-
// Drop anything unsigned here, but retain all signed entries. Only do a
2248-
// more aggressive rollback, potentially including signatures, when
2249-
// receiving a conflicting AppendEntries
2250-
rollback(last_committable_index());
2251-
22522248
state->leadership_state = ccf::kv::LeadershipState::Follower;
22532249
RAFT_INFO_FMT(
22542250
"Becoming follower {}: {}.{}",

src/consensus/aft/test/driver.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,11 @@ int main(int argc, char** argv)
294294
assert(items.size() == 3);
295295
driver->assert_absent_config(items[1], items[2]);
296296
break;
297+
case shash("assert_last_txid"):
298+
assert(items.size() == 3);
299+
skip_invariants = true;
300+
driver->assert_last_txid(items[1], items[2]);
301+
break;
297302
case shash("replicate_new_configuration"):
298303
assert(items.size() >= 3);
299304
items.erase(items.begin());

src/consensus/aft/test/driver.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1443,4 +1443,20 @@ class RaftDriver
14431443
}
14441444
}
14451445
}
1446+
1447+
void assert_last_txid(
1448+
const std::string& node_id, const std::string& last_txid_s)
1449+
{
1450+
auto idx = _nodes.at(node_id).raft->get_last_idx();
1451+
auto view = _nodes.at(node_id).raft->get_view(idx);
1452+
auto last_txid = fmt::format("{}.{}", view, idx);
1453+
if (last_txid != last_txid_s)
1454+
{
1455+
throw std::runtime_error(fmt::format(
1456+
"Node {} lastTxID is not as expected: {} != {}",
1457+
node_id,
1458+
last_txid,
1459+
last_txid_s));
1460+
}
1461+
}
14461462
};
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# A repro of the failing scenario where a follower rolls back its ledger by becoming a pre-vote-candidate and then back to a follower (#7618)
2+
# If this follower had previously acked an append-entries, then the leader's matchIdx for it would be higher than its truncated ledger.
3+
# This was fixed by no longer rolling back uncommittable entries (no signature) when becoming a follower.
4+
5+
start_node,0
6+
emit_signature,2
7+
assert_commit_idx,0,2
8+
9+
swap_nodes,2,in,1,2
10+
emit_signature,2
11+
12+
loop_until_sync
13+
14+
assert_last_txid,0,2.5
15+
replicate,2,tx1
16+
assert_last_txid,0,2.6
17+
18+
# advance matchIdx
19+
periodic_one,0,10
20+
dispatch_single,0,2
21+
dispatch_single,2,0
22+
23+
# Partition 2
24+
disconnect_node,2
25+
26+
emit_signature,2
27+
assert_last_txid,0,2.7
28+
29+
periodic_one,0,10
30+
31+
# b2 time out
32+
periodic_one,2,100
33+
assert_detail,2,leadership_state,PreVoteCandidate
34+
assert_last_txid,2,2.6
35+
36+
dispatch_all
37+
38+
# Heal partition
39+
reconnect_node,2
40+
41+
# leader sends append entries [2.7,2.7) to b2
42+
# b2 nacks but does not truncate
43+
periodic_one,0,10
44+
dispatch_single,0,2
45+
dispatch_single,2,0
46+
assert_detail,2,leadership_state,Follower
47+
assert_last_txid,2,2.6
48+
49+
loop_until_sync

tests/raft_scenarios/soft_rollback

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,30 +51,31 @@ dispatch_all
5151
replicate,3,saluton mondo
5252
replicate,3,ah well nevertheless
5353
state_all
54+
assert_last_txid,1,3.10
5455

5556
# Node 2 calls an election, advancing to term 3
5657
periodic_one,2,110
5758

58-
# Node 2's RequestVote reaches Node 1, causing a soft-rollback
59+
# -- Node 2's RequestVote reaches Node 1, causing a soft-rollback --
60+
# Node 2's RequestVote reaches Node 1, causing it to advance term but not roll-back
5961
dispatch_one,2
6062

61-
# Node 1 should retain the committed state!
63+
# Node 1 should retain the whole state 3.10 it had previously
6264
state_all
65+
assert_detail,1,current_view,4
66+
assert_last_txid,1,3.10
67+
assert_detail,2,current_view,4
68+
assert_last_txid,2,2.6
6369

6470
# Node 1 should not vote for node 2!
6571
dispatch_one,1
6672
assert_!detail,2,leadership_state,Leader
6773

68-
# Because if they did, then node 2's AEs would break persistence!
69-
periodic_all,10
70-
dispatch_all
71-
state_all
72-
7374
# Now restore comms between 0 and 1, and disconnect 2
7475
reconnect_node,0
7576
disconnect_node,2
7677

77-
# Despite not being able to _commit_ the entries it holds, node 1
78+
# Despite not being able to currently _commit_ the entries it holds, node 1
7879
# should be able to win an election by advertising them!
7980
periodic_one,1,110
8081
dispatch_all

tla/consensus/MCccfraft.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,4 @@ INVARIANTS
9797
RetiredCommittedInv
9898
RetirementCompletedNotInConfigsInv
9999
RetirementCompletedAreRetirementCompletedInv
100+
MatchIndexBoundedByLogInv

tla/consensus/SIMccfraft.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ INVARIANTS
111111
RetiredCommittedInv
112112
RetirementCompletedNotInConfigsInv
113113
RetirementCompletedAreRetirementCompletedInv
114+
MatchIndexBoundedByLogInv
114115

115116
\* DebugInvLeaderCannotStepDown
116117
\* DebugInvAnyCommitted

tla/consensus/Traceccfraft.tla

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,9 @@ IsBecomeLeader ==
202202
/\ Range(logline.msg.state.committable_indices) \subseteq CommittableIndices(logline.msg.state.node_id)
203203
/\ commitIndex[logline.msg.state.node_id] = logline.msg.state.commit_idx
204204
/\ leadershipState'[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state]
205-
/\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state]
206-
/\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx
205+
\* The log is truncated during BecomeLeader to the last committable index, and so membership state may have also changed
206+
/\ membershipState'[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state]
207+
/\ Len(log'[logline.msg.state.node_id]) = logline.msg.state.last_idx
207208
/\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id])
208209

209210
IsClientRequest ==

tla/consensus/ccfraft.tla

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1222,16 +1222,9 @@ UpdateTerm(i, j, m) ==
12221222
![i] = IF @ \in {Leader, Candidate, PreVoteCandidate, None} THEN Follower ELSE @]
12231223
/\ isNewFollower' = [isNewFollower EXCEPT ![i] = TRUE]
12241224
/\ votedFor' = [votedFor EXCEPT ![i] = Nil]
1225-
\* See rollback(last_committable_index()) in raft::become_follower
1226-
/\ log' = [log EXCEPT ![i] = SubSeq(@, 1, LastCommittableIndex(i))]
1227-
\* Potentially also shorten the configurations if the removed txns contained reconfigurations
1228-
/\ configurations' = [configurations EXCEPT ![i] = ConfigurationsToIndex(i,Len(log'[i]))]
1229-
\* If the leader was in the RetirementOrdered state, then its retirement has
1230-
\* been rolled back as it was unsigned
1231-
/\ membershipState' = [membershipState EXCEPT ![i] =
1232-
IF @ = RetirementOrdered THEN Active ELSE @]
12331225
\* messages is unchanged so m can be processed further.
1234-
/\ UNCHANGED <<preVoteStatus, messageVars, candidateVars, leaderVars, commitIndex, hasJoined, retirementCompleted>>
1226+
/\ UNCHANGED <<preVoteStatus, messageVars, candidateVars, leaderVars, logVars, hasJoined, retirementCompleted, membershipState, configurations>>
1227+
12351228

12361229
\* Responses with stale terms are ignored.
12371230
DropStaleResponse(i, j, m) ==
@@ -1719,6 +1712,10 @@ NeverCommitEntryPrevTermsProp ==
17191712
\* points to equals the leader's term.
17201713
commitIndex'[i] > commitIndex[i] => log[i][commitIndex'[i]].term = currentTerm'[i] ]_vars
17211714

1715+
MatchIndexBoundedByLogInv ==
1716+
\A i,j \in Servers:
1717+
(leadershipState[i] = Leader /\ currentTerm[i] = currentTerm[j]) => matchIndex[i][j] <= Len(log[j])
1718+
17221719
------------------------------------------------------------------------------
17231720
\* Refinement of the more high-level specification abs.tla that abstracts the
17241721
\* asynchronous network and the message passing between nodes. Instead, any

0 commit comments

Comments
 (0)