From 77c2097338839301ebfc6f6a0667208581e70477 Mon Sep 17 00:00:00 2001 From: ninsmiracle <110282526+ninsmiracle@users.noreply.github.com> Date: Thu, 28 Mar 2024 14:39:13 +0800 Subject: [PATCH] fix(duplication): prevent plog files from being removed by GC while they are being checked by duplication (#1597) https://github.com/apache/incubator-pegasus/issues/1596 Using an atomic member to prevent plog files from being removed by GC when the plog files are being checked by duplication. --- src/replica/duplication/load_from_private_log.cpp | 4 ++++ src/replica/duplication/load_from_private_log.h | 4 +++- src/replica/duplication/replica_duplicator.cpp | 5 +++++ src/replica/duplication/replica_duplicator.h | 4 +++- src/replica/replica.h | 8 ++++++++ src/replica/replica_chkpt.cpp | 6 ++++++ 6 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/replica/duplication/load_from_private_log.cpp b/src/replica/duplication/load_from_private_log.cpp index d3d88a075d..959cdaac71 100644 --- a/src/replica/duplication/load_from_private_log.cpp +++ b/src/replica/duplication/load_from_private_log.cpp @@ -29,6 +29,7 @@ #include "replica/replica.h" #include "replica_duplicator.h" #include "utils/autoref_ptr.h" +#include "utils/defer.h" #include "utils/error_code.h" #include "utils/errors.h" #include "utils/fail_point.h" @@ -123,6 +124,9 @@ void load_from_private_log::run() void load_from_private_log::find_log_file_to_start() { + _duplicator->set_duplication_plog_checking(true); + auto cleanup = dsn::defer([this]() { _duplicator->set_duplication_plog_checking(false); }); + // `file_map` has already excluded the useless log files during replica init. const auto &file_map = _private_log->get_log_file_map(); diff --git a/src/replica/duplication/load_from_private_log.h b/src/replica/duplication/load_from_private_log.h index 56651aabfa..5cf0d75867 100644 --- a/src/replica/duplication/load_from_private_log.h +++ b/src/replica/duplication/load_from_private_log.h @@ -60,7 +60,6 @@ class load_from_private_log final : public replica_base, /// Find the log file that contains `_start_decree`. void find_log_file_to_start(); - void find_log_file_to_start(const mutation_log::log_file_map_by_index &log_files); void replay_log_block(); @@ -78,6 +77,9 @@ class load_from_private_log final : public replica_base, static constexpr int MAX_ALLOWED_BLOCK_REPEATS{3}; static constexpr int MAX_ALLOWED_FILE_REPEATS{10}; +private: + void find_log_file_to_start(const mutation_log::log_file_map_by_index &log_files); + private: friend class load_from_private_log_test; friend class load_fail_mode_test; diff --git a/src/replica/duplication/replica_duplicator.cpp b/src/replica/duplication/replica_duplicator.cpp index a19947fe20..e7db4308b0 100644 --- a/src/replica/duplication/replica_duplicator.cpp +++ b/src/replica/duplication/replica_duplicator.cpp @@ -257,5 +257,10 @@ uint64_t replica_duplicator::get_pending_mutations_count() const return cnt > 0 ? static_cast(cnt) : 0; } +void replica_duplicator::set_duplication_plog_checking(bool checking) +{ + _replica->set_duplication_plog_checking(checking); +} + } // namespace replication } // namespace dsn diff --git a/src/replica/duplication/replica_duplicator.h b/src/replica/duplication/replica_duplicator.h index 1b2526d290..04fb89563c 100644 --- a/src/replica/duplication/replica_duplicator.h +++ b/src/replica/duplication/replica_duplicator.h @@ -138,7 +138,9 @@ class replica_duplicator : public replica_base, public pipeline::base // For metric "dup.pending_mutations_count" uint64_t get_pending_mutations_count() const; - duplication_status::type status() const { return _status; }; + duplication_status::type status() const { return _status; } + + void set_duplication_plog_checking(bool checking); private: friend class duplication_test_base; diff --git a/src/replica/replica.h b/src/replica/replica.h index 3933784da0..f9cfafdaca 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -254,6 +254,11 @@ class replica : public serverlet, public ref_counter, public replica_ba replica_duplicator_manager *get_duplication_manager() const { return _duplication_mgr.get(); } bool is_duplication_master() const { return _is_duplication_master; } bool is_duplication_follower() const { return _is_duplication_follower; } + bool is_duplication_plog_checking() const { return _is_duplication_plog_checking.load(); } + void set_duplication_plog_checking(bool checking) + { + _is_duplication_plog_checking.store(checking); + } // // Backup @@ -633,6 +638,9 @@ class replica : public serverlet, public ref_counter, public replica_ba bool _is_manual_emergency_checkpointing{false}; bool _is_duplication_master{false}; bool _is_duplication_follower{false}; + // Indicate whether the replica is during finding out some private logs to + // load for duplication. It useful to prevent plog GCed unexpectedly. + std::atomic _is_duplication_plog_checking{false}; // backup std::unique_ptr _backup_mgr; diff --git a/src/replica/replica_chkpt.cpp b/src/replica/replica_chkpt.cpp index b24f11157b..309422e75e 100644 --- a/src/replica/replica_chkpt.cpp +++ b/src/replica/replica_chkpt.cpp @@ -154,6 +154,12 @@ void replica::on_checkpoint_timer() return; } + if (is_duplication_plog_checking()) { + LOG_DEBUG_PREFIX("gc_private {}: skip gc because duplication is checking plog files", + enum_to_string(status())); + return; + } + tasking::enqueue(LPC_GARBAGE_COLLECT_LOGS_AND_REPLICAS, &_tracker, [this, plog, cleanable_decree, valid_start_offset] {