Skip to content

Commit 575b734

Browse files
Copilotcjen1-msfteddyashtonachamayou
authored
Reject snapshot-less join against primary holding a more recent snapshot (#7844)
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: cjen1-msft <190360281+cjen1-msft@users.noreply.github.com> Co-authored-by: eddyashton <6000239+eddyashton@users.noreply.github.com> Co-authored-by: cjen1-msft <chrisjensen@microsoft.com> Co-authored-by: Amaury Chamayou <amaury@xargs.fr> Co-authored-by: Amaury Chamayou <amchamay@microsoft.com>
1 parent 483049d commit 575b734

6 files changed

Lines changed: 125 additions & 6 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
99

1010
[7.0.3]: https://github.com/microsoft/CCF/releases/tag/ccf-7.0.3
1111

12+
### Fixed
13+
14+
- On a joiner's first attempt, the primary now requires the joiner's startup seqno to be at least as recent as the primary's latest committed snapshot on disk, preventing snapshot-less joiners from replaying the entire ledger (#7844).
15+
1216
### Changed
1317

1418
- Upgraded QuickJS from 2024-01-13 to 2025-09-13 (#7849).

src/node/node_state.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,9 @@ namespace ccf
476476
ccf::tasks::Task snapshot_fetch_task;
477477
ccf::tasks::Task backup_snapshot_fetch_task;
478478

479+
// Number of times we have fetched the latest snapshot from the primary
480+
size_t join_fetch_count = 0;
481+
479482
std::shared_ptr<ccf::kv::AbstractTxEncryptor> make_encryptor()
480483
{
481484
#ifdef USE_NULL_ENCRYPTOR
@@ -570,6 +573,13 @@ namespace ccf
570573
last_recovered_idx = startup_seqno;
571574
last_recovered_signed_idx = last_recovered_idx;
572575

576+
if (start_type == StartType::Join)
577+
{
578+
// after fetching a snapshot, subsequent requests should use the
579+
// required bound instead of the preferred bound
580+
join_fetch_count += 1;
581+
}
582+
573583
if (start_type == StartType::Recover)
574584
{
575585
const auto segments = separate_segments(startup_snapshot_info->raw);
@@ -1317,6 +1327,14 @@ namespace ccf
13171327
join_params.public_encryption_key = node_encrypt_kp->public_key_pem();
13181328
join_params.quote_info = quote_info;
13191329
join_params.startup_seqno = startup_seqno;
1330+
if (config.join.fetch_recent_snapshot)
1331+
{
1332+
join_params.join_fetch_count = join_fetch_count;
1333+
}
1334+
else
1335+
{
1336+
join_params.join_fetch_count = 1;
1337+
}
13201338
join_params.certificate_signing_request = node_sign_kp->create_csr(
13211339
config.node_certificate.subject_name, subject_alt_names);
13221340
join_params.node_data = config.node_data;

src/node/rpc/node_call_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ namespace ccf
9797
std::optional<std::vector<uint8_t>> code_transparent_statement =
9898
std::nullopt;
9999
std::optional<ccf::LedgerSignMode> ledger_sign_mode = std::nullopt;
100+
// Incremented by the joiner each time it retries a join request after
101+
// receiving a StartupSeqnoIsOld response.
102+
std::optional<uint32_t> join_fetch_count = std::nullopt;
100103
};
101104

102105
struct Out

src/node/rpc/node_frontend.h

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "service/tables/local_sealing.h"
3333
#include "service/tables/previous_service_identity.h"
3434
#include "service/tables/snapshot_status.h"
35+
#include "snapshots/filenames.h"
3536

3637
#include <llhttp/llhttp.h>
3738
#include <stdexcept>
@@ -573,23 +574,62 @@ namespace ccf
573574
}
574575

575576
// Joiner's snapshot too old => StartupSeqnoIsOld
576-
// (cause it to fetch a more recent snapshot)
577+
// (causes joiner to fetch a more recent snapshot)
578+
//
579+
// The joiner always wants to use the most recent snapshot.
580+
// However this will result in the joiner chasing the primary if
581+
// snapshot production period ~= snapshot fetching delay
582+
//
583+
// So we have hysteresis in the fetching constraint:
584+
// If the joiner has already fetched a snapshot: joiner seqno > startup
585+
// snapshot seqno Otherwise: joiner seqno > latest snapshot on disk
586+
// seqno
577587
auto this_startup_seqno =
578588
this->node_operation.get_startup_snapshot_seqno();
589+
ccf::kv::Version required_seqno = this_startup_seqno;
590+
// If the joiner does not enable fetching, or is a legacy node,
591+
// join_fetch_count is unset and we should use the required bound to
592+
// prevent it chasing the primary.
593+
// Otherwise if this is the first request, use the preferred bound
594+
bool using_preferred_bound =
595+
(in.join_fetch_count.has_value() && in.join_fetch_count.value() == 0);
596+
if (using_preferred_bound)
597+
{
598+
auto node_configuration_subsystem =
599+
this->context.get_subsystem<NodeConfigurationSubsystem>();
600+
if (node_configuration_subsystem != nullptr)
601+
{
602+
const auto& snapshots_config =
603+
node_configuration_subsystem->get().node_config.snapshots;
604+
const auto latest_committed_snapshot =
605+
snapshots::find_latest_committed_snapshot_in_directory(
606+
snapshots_config.directory);
607+
if (latest_committed_snapshot.has_value())
608+
{
609+
const auto latest_snapshot_seqno =
610+
snapshots::get_snapshot_idx_from_file_name(
611+
latest_committed_snapshot->filename().string());
612+
required_seqno = std::max(
613+
required_seqno,
614+
static_cast<ccf::kv::Version>(latest_snapshot_seqno));
615+
}
616+
}
617+
}
579618
if (
580619
in.startup_seqno.has_value() &&
581-
this_startup_seqno > in.startup_seqno.value())
620+
in.startup_seqno.value() < required_seqno)
582621
{
583622
// Make sure that the joiner's snapshot is more recent than this
584623
// node's snapshot. Otherwise, the joiner may not be given all the
585624
// ledger secrets required to replay historical transactions.
586625
const std::string payload = fmt::format(
587626
"Node requested to join from seqno {} which is older than this "
588-
"node startup seqno {}. A snapshot at least as recent as {} must "
627+
"node {} {}. A snapshot at least as recent as {} must "
589628
"be used instead.",
590629
in.startup_seqno.value(),
591-
this_startup_seqno,
592-
this_startup_seqno);
630+
using_preferred_bound ? "latest_on_disk_seqno" : "startup_seqno",
631+
required_seqno,
632+
required_seqno);
593633
LOG_INFO_FMT("Join request rejected: {}", payload);
594634
return make_error(
595635
HTTP_STATUS_BAD_REQUEST, ccf::errors::StartupSeqnoIsOld, payload);

src/node/rpc/serialization.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ namespace ccf
3838
node_data,
3939
sealing_recovery_data,
4040
code_transparent_statement,
41-
ledger_sign_mode);
41+
ledger_sign_mode,
42+
join_fetch_count);
4243

4344
DECLARE_JSON_TYPE(NetworkIdentity);
4445
DECLARE_JSON_REQUIRED_FIELDS(NetworkIdentity, cert, priv_key);

tests/reconfiguration.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -812,6 +812,7 @@ def run_all(args):
812812
test_ledger_invariants(network, args)
813813

814814
run_join_old_snapshot(args)
815+
run_join_no_snapshot_against_original_primary(args)
815816

816817

817818
def run_join_old_snapshot(const_args):
@@ -929,3 +930,55 @@ def run_join_old_snapshot(const_args):
929930
fetch_recent_snapshot=True,
930931
timeout=3,
931932
)
933+
934+
935+
def run_join_no_snapshot_against_original_primary(const_args):
936+
# Regression test.
937+
# Previously a node which should fetch a snapshot, would not as the lower limit for this was the startup snapshot of the node.
938+
# This test ensures that the startup seqno of a joining node is higher than the startup snapshot of the primary
939+
txs = app.LoggingTxs("user0")
940+
args = deepcopy(const_args)
941+
args.nodes = infra.e2e_args.nodes(args, 1)
942+
args.label += "_no_snapshot_against_original_primary"
943+
944+
with infra.network.network(
945+
args.nodes,
946+
args.binary_dir,
947+
args.debug_nodes,
948+
pdb=args.pdb,
949+
txs=txs,
950+
) as network:
951+
network.start_and_open(args)
952+
primary, _ = network.find_primary()
953+
954+
# The original primary started without a snapshot; sanity-check this.
955+
with primary.client() as c:
956+
body = c.get("/node/state").body.json()
957+
assert (
958+
body["startup_seqno"] == 0
959+
), f"Original primary should have startup_seqno == 0, got {body['startup_seqno']}"
960+
961+
# Issue enough transactions for the primary to generate and commit a
962+
# snapshot. Wait until that snapshot is on disk.
963+
txs.issue(network, number_txs=args.snapshot_tx_interval)
964+
committed_snapshots_dir = network.get_committed_snapshots(primary)
965+
assert os.listdir(
966+
committed_snapshots_dir
967+
), f"Expected committed snapshot in {committed_snapshots_dir}"
968+
969+
# Assert that fetch_recent_snapshot fetches a snapshot and starts from it
970+
new_node = network.create_node()
971+
network.join_node(
972+
new_node,
973+
args.package,
974+
args,
975+
from_snapshot=False,
976+
fetch_recent_snapshot=True,
977+
timeout=10,
978+
)
979+
network.trust_node(new_node, args)
980+
with new_node.client() as c:
981+
body = c.get("/node/state").body.json()
982+
assert (
983+
body["startup_seqno"] > 0
984+
), f"Joiner should have started from a fetched snapshot, got startup_seqno={body['startup_seqno']}"

0 commit comments

Comments
 (0)