Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
2d9df82
Initial plan
Copilot Apr 28, 2026
043687d
Implement primary-side join check against latest on-disk snapshot
Copilot Apr 28, 2026
ef73def
Shorten retry_count doc comment per review
Copilot Apr 28, 2026
8e9c582
CHANGELOG: rename Unreleased to 7.0.3, bump pyproject and add PR ref
Copilot Apr 28, 2026
47de9f8
CHANGELOG: shorten 7.0.3 entry
Copilot Apr 28, 2026
730bbb5
node_frontend: distinguish preferred vs required snapshot seqno in er…
Copilot Apr 28, 2026
f3d26b8
node_frontend: treat missing retry_count as legacy joiner (value_or(1))
Copilot Apr 28, 2026
c6ab243
Merge remote-tracking branch 'upstream/main' into copilot/implement-s…
cjen1-msft May 5, 2026
06cb72e
Fixup run
cjen1-msft May 5, 2026
13066f0
Fmt
cjen1-msft May 5, 2026
597f7a6
If fetching is disabled, set fetch_count = std::nullopt
cjen1-msft May 5, 2026
bf96c65
Merge branch 'main' into copilot/implement-snapshot-joining-behaviour
cjen1-msft May 5, 2026
be211e1
Apply suggestions from code review
cjen1-msft May 5, 2026
4a9e61b
Remove failing case, now that we use startup seqno for when the joine…
cjen1-msft May 5, 2026
13dff1d
snags
cjen1-msft May 5, 2026
b24d2a2
fmt
cjen1-msft May 5, 2026
ec5a1d0
Merge branch 'main' into copilot/implement-snapshot-joining-behaviour
achamayou May 5, 2026
8dfe2b7
Revert "Remove failing case, now that we use startup seqno for when t…
cjen1-msft May 6, 2026
e43c8b2
Fix test fix.
cjen1-msft May 6, 2026
aa6e35c
Merge branch 'main' into copilot/implement-snapshot-joining-behaviour
achamayou May 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

[7.0.3]: https://github.com/microsoft/CCF/releases/tag/ccf-7.0.3

### Fixed

- On a joiner's first attempt, the primary now requires the joiner's startup seqno to be at least as recent as the primary's latest committed snapshot on disk, preventing snapshot-less joiners from replaying the entire ledger (#7844).

### Changed

- Upgraded QuickJS from 2024-01-13 to 2025-09-13 (#7849).
Expand Down
18 changes: 18 additions & 0 deletions src/node/node_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,9 @@ namespace ccf
ccf::tasks::Task snapshot_fetch_task;
ccf::tasks::Task backup_snapshot_fetch_task;

// Number of times we have fetched the latest snapshot from the primary
size_t join_fetch_count = 0;

std::shared_ptr<ccf::kv::AbstractTxEncryptor> make_encryptor()
{
#ifdef USE_NULL_ENCRYPTOR
Expand Down Expand Up @@ -570,6 +573,13 @@ namespace ccf
last_recovered_idx = startup_seqno;
last_recovered_signed_idx = last_recovered_idx;

if (start_type == StartType::Join)
{
// after fetching a snapshot, subsequent requests should use the
// required bound instead of the preferred bound
join_fetch_count += 1;
}

if (start_type == StartType::Recover)
{
const auto segments = separate_segments(startup_snapshot_info->raw);
Expand Down Expand Up @@ -1317,6 +1327,14 @@ namespace ccf
join_params.public_encryption_key = node_encrypt_kp->public_key_pem();
join_params.quote_info = quote_info;
join_params.startup_seqno = startup_seqno;
if (config.join.fetch_recent_snapshot)
{
join_params.join_fetch_count = join_fetch_count;
}
else
{
join_params.join_fetch_count = 1;
}
join_params.certificate_signing_request = node_sign_kp->create_csr(
config.node_certificate.subject_name, subject_alt_names);
join_params.node_data = config.node_data;
Expand Down
3 changes: 3 additions & 0 deletions src/node/rpc/node_call_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ namespace ccf
std::optional<std::vector<uint8_t>> code_transparent_statement =
std::nullopt;
std::optional<ccf::LedgerSignMode> ledger_sign_mode = std::nullopt;
// Incremented by the joiner each time it retries a join request after
// receiving a StartupSeqnoIsOld response.
std::optional<uint32_t> join_fetch_count = std::nullopt;
};

struct Out
Expand Down
50 changes: 45 additions & 5 deletions src/node/rpc/node_frontend.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "service/tables/local_sealing.h"
#include "service/tables/previous_service_identity.h"
#include "service/tables/snapshot_status.h"
#include "snapshots/filenames.h"

#include <llhttp/llhttp.h>
#include <stdexcept>
Expand Down Expand Up @@ -573,23 +574,62 @@ namespace ccf
}

// Joiner's snapshot too old => StartupSeqnoIsOld
// (cause it to fetch a more recent snapshot)
// (causes joiner to fetch a more recent snapshot)
//
// The joiner always wants to use the most recent snapshot.
// However this will result in the joiner chasing the primary if
// snapshot production period ~= snapshot fetching delay
//
// So we have hysteresis in the fetching constraint:
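// - If the joiner has already fetched a snapshot (required bound):
//     joiner seqno >= this node's startup snapshot seqno
// - Otherwise, on the joiner's first request (preferred bound):
//     joiner seqno >= seqno of the latest committed snapshot on disk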
auto this_startup_seqno =
this->node_operation.get_startup_snapshot_seqno();
ccf::kv::Version required_seqno = this_startup_seqno;
// join_fetch_count is unset for legacy joiners, and non-zero for joiners
// that have fetching disabled or that have already fetched a snapshot.
// In all of those cases we use the required bound, to prevent the joiner
// chasing the primary.
// Otherwise (join_fetch_count == 0) this is the joiner's first request,
// so use the preferred bound.
bool using_preferred_bound =
(in.join_fetch_count.has_value() && in.join_fetch_count.value() == 0);
if (using_preferred_bound)
{
auto node_configuration_subsystem =
this->context.get_subsystem<NodeConfigurationSubsystem>();
if (node_configuration_subsystem != nullptr)
{
const auto& snapshots_config =
node_configuration_subsystem->get().node_config.snapshots;
const auto latest_committed_snapshot =
snapshots::find_latest_committed_snapshot_in_directory(
snapshots_config.directory);
if (latest_committed_snapshot.has_value())
{
const auto latest_snapshot_seqno =
snapshots::get_snapshot_idx_from_file_name(
latest_committed_snapshot->filename().string());
required_seqno = std::max(
required_seqno,
static_cast<ccf::kv::Version>(latest_snapshot_seqno));
}
}
}
if (
in.startup_seqno.has_value() &&
this_startup_seqno > in.startup_seqno.value())
in.startup_seqno.value() < required_seqno)
{
// Make sure that the joiner's snapshot is more recent than this
// node's snapshot. Otherwise, the joiner may not be given all the
// ledger secrets required to replay historical transactions.
const std::string payload = fmt::format(
"Node requested to join from seqno {} which is older than this "
"node startup seqno {}. A snapshot at least as recent as {} must "
"node {} {}. A snapshot at least as recent as {} must "
"be used instead.",
in.startup_seqno.value(),
this_startup_seqno,
this_startup_seqno);
using_preferred_bound ? "latest_on_disk_seqno" : "startup_seqno",
required_seqno,
required_seqno);
LOG_INFO_FMT("Join request rejected: {}", payload);
return make_error(
HTTP_STATUS_BAD_REQUEST, ccf::errors::StartupSeqnoIsOld, payload);
Expand Down
3 changes: 2 additions & 1 deletion src/node/rpc/serialization.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ namespace ccf
node_data,
sealing_recovery_data,
code_transparent_statement,
ledger_sign_mode);
ledger_sign_mode,
join_fetch_count);

DECLARE_JSON_TYPE(NetworkIdentity);
DECLARE_JSON_REQUIRED_FIELDS(NetworkIdentity, cert, priv_key);
Expand Down
53 changes: 53 additions & 0 deletions tests/reconfiguration.py
Original file line number Diff line number Diff line change
Expand Up @@ -812,6 +812,7 @@ def run_all(args):
test_ledger_invariants(network, args)

run_join_old_snapshot(args)
run_join_no_snapshot_against_original_primary(args)


def run_join_old_snapshot(const_args):
Expand Down Expand Up @@ -929,3 +930,55 @@ def run_join_old_snapshot(const_args):
fetch_recent_snapshot=True,
timeout=3,
)


def run_join_no_snapshot_against_original_primary(const_args):
    """
    Regression test for a node joining without a local snapshot.

    Previously, a joiner that should have fetched a snapshot would not do so,
    because the only lower bound applied to its startup seqno was the startup
    snapshot seqno of the node it was joining. This test starts a fresh
    network whose primary reports startup_seqno == 0, issues enough
    transactions for a committed snapshot to appear, and then checks that a
    node joining with fetch_recent_snapshot=True (and from_snapshot=False)
    reports a startup_seqno greater than 0.
    """
    txs = app.LoggingTxs("user0")
    args = deepcopy(const_args)
    args.nodes = infra.e2e_args.nodes(args, 1)
    args.label += "_no_snapshot_against_original_primary"

    with infra.network.network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        pdb=args.pdb,
        txs=txs,
    ) as network:
        network.start_and_open(args)
        primary, _ = network.find_primary()

        # Sanity check: the original primary must itself have started
        # without a snapshot, otherwise the scenario is not being exercised.
        with primary.client() as primary_client:
            primary_state = primary_client.get("/node/state").body.json()
            assert (
                primary_state["startup_seqno"] == 0
            ), f"Original primary should have startup_seqno == 0, got {primary_state['startup_seqno']}"

        # Drive one full snapshot interval of transactions through the
        # service, then confirm a committed snapshot is present on disk.
        txs.issue(network, number_txs=args.snapshot_tx_interval)
        snapshots_dir = network.get_committed_snapshots(primary)
        snapshot_files = os.listdir(snapshots_dir)
        assert snapshot_files, f"Expected committed snapshot in {snapshots_dir}"

        # The joiner is given no snapshot of its own; with
        # fetch_recent_snapshot enabled it is expected to fetch one and
        # start from it.
        join_options = {
            "from_snapshot": False,
            "fetch_recent_snapshot": True,
            "timeout": 10,
        }
        joiner = network.create_node()
        network.join_node(joiner, args.package, args, **join_options)
        network.trust_node(joiner, args)

        with joiner.client() as joiner_client:
            joiner_state = joiner_client.get("/node/state").body.json()
            assert (
                joiner_state["startup_seqno"] > 0
            ), f"Joiner should have started from a fetched snapshot, got startup_seqno={joiner_state['startup_seqno']}"