Skip to content

Commit 188d2f4

Browse files
Fix signature rollback race (#7746)
Co-authored-by: Amaury Chamayou <amchamay@microsoft.com>
1 parent ad95f5b commit 188d2f4

4 files changed

Lines changed: 161 additions & 6 deletions

File tree

src/kv/committable_tx.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,14 @@ namespace ccf::kv
414414

415415
if (!success)
416416
{
417-
throw std::logic_error("Failed to commit reserved transaction");
417+
if (pimpl->store->check_rollback_count(rollback_count))
418+
{
419+
throw std::logic_error("Failed to commit reserved transaction");
420+
}
421+
422+
committed = true;
423+
return {
424+
CommitResult::FAIL_NO_REPLICATE, {}, ccf::empty_claims(), {}, {}};
418425
}
419426

420427
ccf::crypto::Sha256Hash commit_evidence_digest;

src/kv/store.h

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,9 @@ namespace ccf::kv
642642

643643
version = tx_id.seqno;
644644
last_replicated = tx_id.seqno;
645+
// In practice rollback is only called at signature seqnos, so
646+
// clamping here restores the latest committable entry
647+
last_committable = std::min(last_committable, tx_id.seqno);
645648
unset_flag_unsafe(StoreFlag::SNAPSHOT_AT_NEXT_SIGNATURE);
646649
rollback_count++;
647650
pending_txs.clear();
@@ -992,14 +995,23 @@ namespace ccf::kv
992995
auto hooks_shared =
993996
std::make_shared<ccf::kv::ConsensusHookPtrs>(std::move(hooks_));
994997

995-
// NB: this cannot happen currently. Regular Tx only make it here if
996-
// they did succeed, and signatures cannot conflict because they
997-
// execute in order with a read_version that's version - 1, so even
998-
// two contiguous signatures are fine
999-
if (success_ != CommitResult::SUCCESS)
998+
// A pending tx may fail here if rollback invalidated a reserved
999+
// signature tx after it was dequeued from pending_txs.
1000+
if (success_ == CommitResult::FAIL_NO_REPLICATE)
10001001
{
10011002
LOG_DEBUG_FMT(
10021003
"Failed Tx commit {}", previous_last_replicated + offset);
1004+
return success_;
1005+
}
1006+
// We should never fail from here, as normal txs have already succeeded
1007+
// and reserved txs only fail with FAIL_NO_REPLICATE
1008+
if (success_ != CommitResult::SUCCESS)
1009+
{
1010+
LOG_FAIL_FMT(
1011+
"Unexpected failure reason {} during commit of {}.{}",
1012+
static_cast<int>(success_),
1013+
txid.view,
1014+
txid.seqno);
10031015
}
10041016

10051017
if (h)

src/kv/test/stub_consensus.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "ccf/crypto/symmetric_key.h"
66
#include "consensus/aft/impl/state.h"
77
#include "kv/kv_types.h"
8+
#include "kv/store.h"
89

910
#include <algorithm>
1011
#include <iostream>

src/node/test/history.cpp

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@
1818
#include <doctest/doctest.h>
1919
#undef FAIL
2020

21+
#include <condition_variable>
22+
#include <exception>
23+
#include <mutex>
24+
#include <thread>
25+
2126
using MapT = ccf::kv::Map<size_t, size_t>;
2227

2328
constexpr size_t certificate_validity_period_days = 365;
@@ -310,6 +315,55 @@ class TestPendingTx : public ccf::kv::PendingTx
310315
}
311316
};
312317

318+
struct PausedSignatureCommit
319+
{
320+
std::mutex lock;
321+
std::condition_variable reserved_tx_created_cv;
322+
std::condition_variable resume_cv;
323+
bool reserved_tx_created = false;
324+
bool resume = false;
325+
};
326+
327+
class PausedReservedSignatureTx : public ccf::kv::PendingTx
328+
{
329+
ccf::TxID txid;
330+
ccf::kv::Store& store;
331+
ccf::Signatures signatures;
332+
ccf::SerialisedMerkleTree serialised_tree;
333+
PausedSignatureCommit& paused;
334+
335+
public:
336+
PausedReservedSignatureTx(
337+
ccf::TxID txid_, ccf::kv::Store& store_, PausedSignatureCommit& paused_) :
338+
txid(txid_),
339+
store(store_),
340+
signatures(ccf::Tables::SIGNATURES),
341+
serialised_tree(ccf::Tables::SERIALISED_MERKLE_TREE),
342+
paused(paused_)
343+
{}
344+
345+
ccf::kv::PendingTxInfo call() override
346+
{
347+
auto tx = store.create_reserved_tx(txid);
348+
auto sig = tx.rw(signatures);
349+
auto tree = tx.rw(serialised_tree);
350+
351+
sig->put(ccf::PrimarySignature(ccf::kv::test::PrimaryNodeId, txid.seqno));
352+
tree->put({});
353+
354+
{
355+
std::lock_guard<std::mutex> guard(paused.lock);
356+
paused.reserved_tx_created = true;
357+
}
358+
paused.reserved_tx_created_cv.notify_one();
359+
360+
std::unique_lock<std::mutex> guard(paused.lock);
361+
paused.resume_cv.wait(guard, [this]() { return paused.resume; });
362+
363+
return tx.commit_reserved();
364+
}
365+
};
366+
313367
TEST_CASE(
314368
"Batches containing but not ending on a committable transaction should not "
315369
"halt replication")
@@ -457,6 +511,87 @@ TEST_CASE(
457511
}
458512
}
459513

514+
TEST_CASE(
515+
"Reserved signature tx returns no-replicate if rolled back before "
516+
"commit_reserved")
517+
{
518+
ccf::kv::Store store;
519+
auto encryptor = std::make_shared<ccf::kv::NullTxEncryptor>();
520+
store.set_encryptor(encryptor);
521+
auto consensus = std::make_shared<ccf::kv::test::PrimaryStubConsensus>();
522+
store.set_consensus(consensus);
523+
constexpr auto store_term = 2;
524+
store.initialise_term(store_term);
525+
526+
MapT table("public:table");
527+
528+
INFO("Commit two normal transactions before emitting a signature");
529+
{
530+
auto tx = store.create_tx();
531+
auto* txv = tx.rw(table);
532+
txv->put(0, 1);
533+
REQUIRE(tx.commit() == ccf::kv::CommitResult::SUCCESS);
534+
}
535+
536+
{
537+
auto tx = store.create_tx();
538+
auto* txv = tx.rw(table);
539+
txv->put(0, 2);
540+
REQUIRE(tx.commit() == ccf::kv::CommitResult::SUCCESS);
541+
}
542+
543+
REQUIRE(store.current_version() == 2);
544+
545+
auto txid = store.next_txid();
546+
REQUIRE(txid == ccf::TxID(store_term, 3));
547+
548+
PausedSignatureCommit paused;
549+
std::optional<ccf::kv::CommitResult> worker_result;
550+
551+
std::thread worker([&]() {
552+
worker_result = store.commit(
553+
txid,
554+
std::make_unique<PausedReservedSignatureTx>(txid, store, paused),
555+
true);
556+
});
557+
558+
{
559+
std::unique_lock<std::mutex> guard(paused.lock);
560+
paused.reserved_tx_created_cv.wait(
561+
guard, [&paused]() { return paused.reserved_tx_created; });
562+
}
563+
564+
const auto new_term = store_term + 1;
565+
566+
INFO(
567+
"Rollback after create_reserved_tx but before commit_reserved in a new "
568+
"term");
569+
store.rollback({store_term, 1}, new_term);
570+
REQUIRE(store.commit_view() == new_term);
571+
REQUIRE(store.current_txid() == ccf::TxID(store_term, 1));
572+
573+
{
574+
std::lock_guard<std::mutex> guard(paused.lock);
575+
paused.resume = true;
576+
}
577+
paused.resume_cv.notify_one();
578+
worker.join();
579+
580+
REQUIRE(worker_result.has_value());
581+
REQUIRE(worker_result.value() == ccf::kv::CommitResult::FAIL_NO_REPLICATE);
582+
REQUIRE(store.current_txid() == ccf::TxID(store_term, 1));
583+
584+
INFO("A normal transaction can still commit after the failed signature path");
585+
{
586+
auto tx = store.create_tx();
587+
auto* txv = tx.rw(table);
588+
txv->put(0, 3);
589+
REQUIRE(tx.commit() == ccf::kv::CommitResult::SUCCESS);
590+
}
591+
592+
REQUIRE(store.current_txid() == ccf::TxID(new_term, 2));
593+
}
594+
460595
TEST_CASE(
461596
"Check that rollback during replicate does not cause replication halts")
462597
{

0 commit comments

Comments
 (0)