|
32 | 32 | #include "service/tables/local_sealing.h" |
33 | 33 | #include "service/tables/previous_service_identity.h" |
34 | 34 | #include "service/tables/snapshot_status.h" |
| 35 | +#include "snapshots/filenames.h" |
35 | 36 |
|
36 | 37 | #include <llhttp/llhttp.h> |
37 | 38 | #include <stdexcept> |
@@ -573,23 +574,62 @@ namespace ccf |
573 | 574 | } |
574 | 575 |
|
575 | 576 | // Joiner's snapshot too old => StartupSeqnoIsOld |
576 | | - // (cause it to fetch a more recent snapshot) |
| 577 | + // (causes joiner to fetch a more recent snapshot) |
| 578 | + // |
| 579 | + // The joiner always wants to use the most recent snapshot. |
| 580 | + // However this will result in the joiner chasing the primary if |
| 581 | + // snapshot production period ~= snapshot fetching delay |
| 582 | + // |
| 583 | + // So we have hysteresis in the fetching constraint: |
| 584 | + // - If the joiner has already fetched a snapshot: |
| 585 | + //   joiner seqno >= startup snapshot seqno |
| 586 | + // - Otherwise: joiner seqno >= latest snapshot on disk seqno |
577 | 587 | auto this_startup_seqno = |
578 | 588 | this->node_operation.get_startup_snapshot_seqno(); |
| 589 | + ccf::kv::Version required_seqno = this_startup_seqno; |
| 590 | + // If the joiner does not enable fetching, or is a legacy node, |
| 591 | + // join_fetch_count is unset and we should use the required bound to |
| 592 | + // prevent it chasing the primary. |
| 593 | + // Otherwise if this is the first request, use the preferred bound |
| 594 | + bool using_preferred_bound = |
| 595 | + (in.join_fetch_count.has_value() && in.join_fetch_count.value() == 0); |
| 596 | + if (using_preferred_bound) |
| 597 | + { |
| 598 | + auto node_configuration_subsystem = |
| 599 | + this->context.get_subsystem<NodeConfigurationSubsystem>(); |
| 600 | + if (node_configuration_subsystem != nullptr) |
| 601 | + { |
| 602 | + const auto& snapshots_config = |
| 603 | + node_configuration_subsystem->get().node_config.snapshots; |
| 604 | + const auto latest_committed_snapshot = |
| 605 | + snapshots::find_latest_committed_snapshot_in_directory( |
| 606 | + snapshots_config.directory); |
| 607 | + if (latest_committed_snapshot.has_value()) |
| 608 | + { |
| 609 | + const auto latest_snapshot_seqno = |
| 610 | + snapshots::get_snapshot_idx_from_file_name( |
| 611 | + latest_committed_snapshot->filename().string()); |
| 612 | + required_seqno = std::max( |
| 613 | + required_seqno, |
| 614 | + static_cast<ccf::kv::Version>(latest_snapshot_seqno)); |
| 615 | + } |
| 616 | + } |
| 617 | + } |
579 | 618 | if ( |
580 | 619 | in.startup_seqno.has_value() && |
581 | | - this_startup_seqno > in.startup_seqno.value()) |
| 620 | + in.startup_seqno.value() < required_seqno) |
582 | 621 | { |
583 | 622 | // Make sure that the joiner's snapshot is more recent than this |
584 | 623 | // node's snapshot. Otherwise, the joiner may not be given all the |
585 | 624 | // ledger secrets required to replay historical transactions. |
586 | 625 | const std::string payload = fmt::format( |
587 | 626 | "Node requested to join from seqno {} which is older than this " |
588 | | - "node startup seqno {}. A snapshot at least as recent as {} must " |
| 627 | + "node {} {}. A snapshot at least as recent as {} must " |
589 | 628 | "be used instead.", |
590 | 629 | in.startup_seqno.value(), |
591 | | - this_startup_seqno, |
592 | | - this_startup_seqno); |
| 630 | + using_preferred_bound ? "latest_on_disk_seqno" : "startup_seqno", |
| 631 | + required_seqno, |
| 632 | + required_seqno); |
593 | 633 | LOG_INFO_FMT("Join request rejected: {}", payload); |
594 | 634 | return make_error( |
595 | 635 | HTTP_STATUS_BAD_REQUEST, ccf::errors::StartupSeqnoIsOld, payload); |
|
0 commit comments