From e0d00b689c4b2e77245191fca2af68141eb0ff36 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Fri, 31 Oct 2025 18:16:05 +0000 Subject: [PATCH 01/66] Bumped zlib version fixed a race condition in deploy_local.sh --- WORKSPACE | 8 +++++--- scripts/deploy/script/deploy_local.sh | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index e05017f5c8..cfb3420dae 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -166,10 +166,12 @@ boost_deps() http_archive( name = "net_zlib_zlib", build_file = "@com_resdb_nexres//third_party:z.BUILD", - sha256 = "91844808532e5ce316b3c010929493c0244f3d37593afd6de04f71821d5136d9", - strip_prefix = "zlib-1.2.12", + sha256 = "9a93b2b7dfdac77ceba5a558a580e74667dd6fede4585b91eefb60f03b72df23", + strip_prefix = "zlib-1.3.1", urls = [ - "https://storage.googleapis.com/bazel-mirror/zlib.net/zlib-1.2.12.tar.gz", + "https://zlib.net/zlib-1.3.1.tar.gz", + "https://zlib.net/fossils/zlib-1.3.1.tar.gz", + "https://github.com/madler/zlib/releases/download/v1.3.1/zlib-1.3.1.tar.gz", ], ) diff --git a/scripts/deploy/script/deploy_local.sh b/scripts/deploy/script/deploy_local.sh index 7adb19185f..11145de839 100755 --- a/scripts/deploy/script/deploy_local.sh +++ b/scripts/deploy/script/deploy_local.sh @@ -112,7 +112,7 @@ fi idx=1 for ip in ${deploy_iplist[@]}; do - run_one_cmd "mkdir -p ${home_path}/${main_folder}/$idx" & + run_one_cmd "mkdir -p ${home_path}/${main_folder}/$idx" ((count++)) ((idx++)) done From 62f71af42e1fe01a87bf427bac2a70f2f8a48be1 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Tue, 4 Nov 2025 17:17:19 +0000 Subject: [PATCH 02/66] Added Raft files, currently copies of PoE files per Dakai's guide. 
--- benchmark/protocols/raft/BUILD | 43 ++++++++ .../protocols/raft/kv_server_performance.cpp | 83 ++++++++++++++++ benchmark/protocols/raft/kv_service_tools.cpp | 51 ++++++++++ .../consensus/ordering/raft/algorithm/BUILD | 33 +++++++ .../ordering/raft/algorithm/raft.cpp | 91 +++++++++++++++++ .../consensus/ordering/raft/algorithm/raft.h | 59 +++++++++++ .../consensus/ordering/raft/framework/BUILD | 33 +++++++ .../ordering/raft/framework/consensus.cpp | 98 +++++++++++++++++++ .../ordering/raft/framework/consensus.h | 53 ++++++++++ platform/consensus/ordering/raft/proto/BUILD | 34 +++++++ .../ordering/raft/proto/proposal.proto | 46 +++++++++ scripts/deploy/config/raft.config | 10 ++ .../deploy/performance/raft_performance.sh | 23 +++++ .../performance_local/pbft_performance.sh | 8 +- .../performance_local/poe_performance.sh | 7 +- .../performance_local/raft_performance.sh | 25 +++++ .../performance_local/run_performance.sh | 2 +- 17 files changed, 694 insertions(+), 5 deletions(-) create mode 100644 benchmark/protocols/raft/BUILD create mode 100644 benchmark/protocols/raft/kv_server_performance.cpp create mode 100644 benchmark/protocols/raft/kv_service_tools.cpp create mode 100644 platform/consensus/ordering/raft/algorithm/BUILD create mode 100644 platform/consensus/ordering/raft/algorithm/raft.cpp create mode 100644 platform/consensus/ordering/raft/algorithm/raft.h create mode 100644 platform/consensus/ordering/raft/framework/BUILD create mode 100644 platform/consensus/ordering/raft/framework/consensus.cpp create mode 100644 platform/consensus/ordering/raft/framework/consensus.h create mode 100644 platform/consensus/ordering/raft/proto/BUILD create mode 100644 platform/consensus/ordering/raft/proto/proposal.proto create mode 100644 scripts/deploy/config/raft.config create mode 100755 scripts/deploy/performance/raft_performance.sh create mode 100755 scripts/deploy/performance_local/raft_performance.sh diff --git a/benchmark/protocols/raft/BUILD 
b/benchmark/protocols/raft/BUILD new file mode 100644 index 0000000000..a65b722406 --- /dev/null +++ b/benchmark/protocols/raft/BUILD @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +package(default_visibility = ["//visibility:private"]) + +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") + +cc_binary( + name = "kv_server_performance", + srcs = ["kv_server_performance.cpp"], + deps = [ + "//chain/storage:memory_db", + "//executor/kv:kv_executor", + "//platform/config:resdb_config_utils", + "//platform/consensus/ordering/raft/framework:consensus", + "//service/utils:server_factory", + ], +) + +cc_binary( + name = "kv_service_tools", + srcs = ["kv_service_tools.cpp"], + deps = [ + "//common/proto:signature_info_cc_proto", + "//interface/kv:kv_client", + "//platform/config:resdb_config_utils", + ], +) diff --git a/benchmark/protocols/raft/kv_server_performance.cpp b/benchmark/protocols/raft/kv_server_performance.cpp new file mode 100644 index 0000000000..a74ef45375 --- /dev/null +++ b/benchmark/protocols/raft/kv_server_performance.cpp @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "chain/storage/memory_db.h" +#include "executor/kv/kv_executor.h" +#include "platform/config/resdb_config_utils.h" +#include "platform/consensus/ordering/raft/framework/consensus.h" +#include "platform/networkstrate/service_network.h" +#include "platform/statistic/stats.h" +#include "proto/kv/kv.pb.h" + +using namespace resdb; +using namespace resdb::raft; +using namespace resdb::storage; + +void ShowUsage() { + printf(" [logging_dir]\n"); +} + +std::string GetRandomKey() { + int num1 = rand() % 10; + int num2 = rand() % 10; + return std::to_string(num1) + std::to_string(num2); +} + +int main(int argc, char** argv) { + if (argc < 3) { + ShowUsage(); + exit(0); + } + + // google::InitGoogleLogging(argv[0]); + // FLAGS_minloglevel = google::GLOG_WARNING; + + char* config_file = argv[1]; + char* private_key_file = argv[2]; + char* cert_file = argv[3]; + + if (argc >= 5) { + auto monitor_port = Stats::GetGlobalStats(5); + monitor_port->SetPrometheus(argv[4]); + } + + std::unique_ptr config = + GenerateResDBConfig(config_file, private_key_file, cert_file); + + config->RunningPerformance(true); + ResConfigData config_data = config->GetConfigData(); + + auto performance_consens = std::make_unique( + *config, std::make_unique(std::make_unique())); + 
performance_consens->SetupPerformanceDataFunc([]() { + KVRequest request; + request.set_cmd(KVRequest::SET); + request.set_key(GetRandomKey()); + request.set_value("helloword"); + std::string request_data; + request.SerializeToString(&request_data); + return request_data; + }); + + auto server = + std::make_unique(*config, std::move(performance_consens)); + server->Run(); +} diff --git a/benchmark/protocols/raft/kv_service_tools.cpp b/benchmark/protocols/raft/kv_service_tools.cpp new file mode 100644 index 0000000000..43627b34f4 --- /dev/null +++ b/benchmark/protocols/raft/kv_service_tools.cpp @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include +#include + +#include + +#include "common/proto/signature_info.pb.h" +#include "interface/kv/kv_client.h" +#include "platform/config/resdb_config_utils.h" + +using resdb::GenerateReplicaInfo; +using resdb::GenerateResDBConfig; +using resdb::KVClient; +using resdb::ReplicaInfo; +using resdb::ResDBConfig; + +int main(int argc, char** argv) { + if (argc < 2) { + printf("\n"); + return 0; + } + std::string client_config_file = argv[1]; + ResDBConfig config = GenerateResDBConfig(client_config_file); + + config.SetClientTimeoutMs(100000); + + KVClient client(config); + + client.Set("start", "value"); + printf("start benchmark\n"); +} diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD new file mode 100644 index 0000000000..59903f16ad --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +package(default_visibility = ["//platform/consensus/ordering/raft:__subpackages__"]) + +cc_library( + name = "raft", + srcs = ["raft.cpp"], + hdrs = ["raft.h"], + deps = [ + "//common:comm", + "//common/crypto:signature_verifier", + "//platform/common/queue:lock_free_queue", + "//platform/consensus/ordering/common/algorithm:protocol_base", + "//platform/consensus/ordering/raft/proto:proposal_cc_proto", + "//platform/statistic:stats", + ], +) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp new file mode 100644 index 0000000000..cc92bb5624 --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "platform/consensus/ordering/raft/algorithm/raft.h" + +#include + +#include "common/crypto/signature_verifier.h" +#include "common/utils/utils.h" + +namespace resdb { +namespace raft { + +Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier) + : ProtocolBase(id, f, total_num), verifier_(verifier) { + LOG(ERROR) << "get proposal graph"; + id_ = id; + total_num_ = total_num; + f_ = f; + is_stop_ = false; + seq_ = 0; +} + +Raft::~Raft() { is_stop_ = true; } + +bool Raft::IsStop() { return is_stop_; } + +bool Raft::ReceiveTransaction(std::unique_ptr txn) { + // LOG(ERROR)<<"recv txn:"; + txn->set_create_time(GetCurrentTime()); + txn->set_seq(seq_++); + txn->set_proposer(id_); + + Broadcast(MessageType::Propose, *txn); + return true; +} + +bool Raft::ReceivePropose(std::unique_ptr txn) { + std::string hash = txn->hash(); + int64_t seq = txn->seq(); + int proposer = txn->proposer(); + { + std::unique_lock lk(mutex_); + data_[txn->hash()] = std::move(txn); + } + + Proposal proposal; + proposal.set_hash(hash); + proposal.set_seq(seq); + proposal.set_proposer(id_); + Broadcast(MessageType::Prepare, proposal); + return true; +} + +bool Raft::ReceivePrepare(std::unique_ptr proposal) { + std::unique_ptr txn = nullptr; + { + std::unique_lock lk(mutex_); + received_[proposal->hash()].insert(proposal->proposer()); + auto it = data_.find(proposal->hash()); + if (it != data_.end()) { + if (received_[proposal->hash()].size() >= 2 * f_ + 1) { + txn = std::move(it->second); + data_.erase(it); + } + } + } + if (txn != nullptr) { + commit_(*txn); + } + return true; +} + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h new file mode 100644 index 0000000000..7406feb774 --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "platform/common/queue/lock_free_queue.h" +#include "platform/consensus/ordering/common/algorithm/protocol_base.h" +#include "platform/consensus/ordering/raft/proto/proposal.pb.h" +#include "platform/statistic/stats.h" + +namespace resdb { +namespace raft { + +class Raft : public common::ProtocolBase { + public: + Raft(int id, int f, int total_num, SignatureVerifier* verifier); + ~Raft(); + + bool ReceiveTransaction(std::unique_ptr txn); + bool ReceivePropose(std::unique_ptr txn); + bool ReceivePrepare(std::unique_ptr proposal); + + private: + bool IsStop(); + + private: + std::mutex mutex_; + std::map > received_; + std::map > data_; + + int64_t seq_; + bool is_stop_; + SignatureVerifier* verifier_; + Stats* global_stats_; +}; + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/BUILD b/platform/consensus/ordering/raft/framework/BUILD new file mode 100644 index 0000000000..03137c0500 --- /dev/null +++ b/platform/consensus/ordering/raft/framework/BUILD @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +package(default_visibility = ["//visibility:private"]) + +cc_library( + name = "consensus", + srcs = ["consensus.cpp"], + hdrs = ["consensus.h"], + visibility = [ + "//visibility:public", + ], + deps = [ + "//common/utils", + "//platform/consensus/ordering/common/framework:consensus", + "//platform/consensus/ordering/raft/algorithm:raft", + ], +) diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp new file mode 100644 index 0000000000..1c6a6635ed --- /dev/null +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "platform/consensus/ordering/raft/framework/consensus.h" + +#include +#include + +#include "common/utils/utils.h" + +namespace resdb { +namespace raft { + +Consensus::Consensus(const ResDBConfig& config, + std::unique_ptr executor) + : common::Consensus(config, std::move(executor)) { + int total_replicas = config_.GetReplicaNum(); + int f = (total_replicas - 1) / 3; + + Init(); + + start_ = 0; + + if (config_.GetPublicKeyCertificateInfo() + .public_key() + .public_key_info() + .type() != CertificateKeyInfo::CLIENT) { + raft_ = std::make_unique(config_.GetSelfInfo().id(), f, total_replicas, + GetSignatureVerifier()); + InitProtocol(raft_.get()); + } +} + +int Consensus::ProcessCustomConsensus(std::unique_ptr request) { + if (request->user_type() == MessageType::Propose) { + std::unique_ptr txn = std::make_unique(); + if (!txn->ParseFromString(request->data())) { + assert(1 == 0); + LOG(ERROR) << "parse proposal fail"; + return -1; + } + raft_->ReceivePropose(std::move(txn)); + return 0; + } else if (request->user_type() == MessageType::Prepare) { + std::unique_ptr proposal = std::make_unique(); + if (!proposal->ParseFromString(request->data())) { + LOG(ERROR) << "parse proposal fail"; + assert(1 == 0); + return -1; + } + raft_->ReceivePrepare(std::move(proposal)); + return 0; + } + return 0; +} + +int Consensus::ProcessNewTransaction(std::unique_ptr request) { + std::unique_ptr txn = std::make_unique(); + txn->set_data(request->data()); + txn->set_hash(request->hash()); + txn->set_proxy_id(request->proxy_id()); + 
txn->set_uid(request->uid()); + return raft_->ReceiveTransaction(std::move(txn)); +} + +int Consensus::CommitMsg(const google::protobuf::Message& msg) { + return CommitMsgInternal(dynamic_cast(msg)); +} + +int Consensus::CommitMsgInternal(const Transaction& txn) { + std::unique_ptr request = std::make_unique(); + request->set_data(txn.data()); + request->set_seq(txn.seq()); + request->set_uid(txn.uid()); + request->set_proxy_id(txn.proxy_id()); + + transaction_executor_->Commit(std::move(request)); + return 0; +} + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/consensus.h b/platform/consensus/ordering/raft/framework/consensus.h new file mode 100644 index 0000000000..bfff56b4f5 --- /dev/null +++ b/platform/consensus/ordering/raft/framework/consensus.h @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include "executor/common/transaction_manager.h" +#include "platform/consensus/ordering/common/framework/consensus.h" +#include "platform/consensus/ordering/raft/algorithm/raft.h" +#include "platform/networkstrate/consensus_manager.h" + +namespace resdb { +namespace raft { + +class Consensus : public common::Consensus { + public: + Consensus(const ResDBConfig& config, + std::unique_ptr transaction_manager); + virtual ~Consensus() = default; + + private: + int ProcessCustomConsensus(std::unique_ptr request) override; + int ProcessNewTransaction(std::unique_ptr request) override; + int CommitMsg(const google::protobuf::Message& msg) override; + int CommitMsgInternal(const Transaction& txn); + + int Prepare(const Transaction& txn); + + protected: + std::unique_ptr raft_; + Stats* global_stats_; + int64_t start_; + std::mutex mutex_; + int send_num_[200]; +}; + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/proto/BUILD b/platform/consensus/ordering/raft/proto/BUILD new file mode 100644 index 0000000000..144a6751f3 --- /dev/null +++ b/platform/consensus/ordering/raft/proto/BUILD @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +package(default_visibility = ["//platform/consensus/ordering/raft:__subpackages__"]) + +load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("@rules_proto//proto:defs.bzl", "proto_library") +load("@rules_proto_grpc//python:defs.bzl", "python_proto_library") + +proto_library( + name = "proposal_proto", + srcs = ["proposal.proto"], + #visibility = ["//visibility:public"], +) + +cc_proto_library( + name = "proposal_cc_proto", + deps = [":proposal_proto"], +) diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto new file mode 100644 index 0000000000..70d8559812 --- /dev/null +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +syntax = "proto3"; + +package resdb.raft; + +message Transaction{ + int32 id = 1; + bytes data = 2; + bytes hash = 3; + int32 proxy_id = 4; + int32 proposer = 5; + int64 uid = 6; + int64 create_time = 7; + int64 seq = 9; +} + +message Proposal { + bytes hash = 1; + int32 proposer = 2; + int64 seq =3 ; +} + +enum MessageType { + None = 0; + Propose = 1; + Prepare = 2; +} + diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config new file mode 100644 index 0000000000..c5092a94c8 --- /dev/null +++ b/scripts/deploy/config/raft.config @@ -0,0 +1,10 @@ +{ + "clientBatchNum": 100, + "enable_viewchange": false, + "recovery_enabled": false, + "max_client_complaint_num":10, + "max_process_txn": 32, + "worker_num": 2, + "input_worker_num": 1, + "output_worker_num": 10 +} diff --git a/scripts/deploy/performance/raft_performance.sh b/scripts/deploy/performance/raft_performance.sh new file mode 100755 index 0000000000..9603df099f --- /dev/null +++ b/scripts/deploy/performance/raft_performance.sh @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +export server=//benchmark/protocols/raft:kv_server_performance +export TEMPLATE_PATH=$PWD/config/raft.config + +./performance/run_performance.sh $* diff --git a/scripts/deploy/performance_local/pbft_performance.sh b/scripts/deploy/performance_local/pbft_performance.sh index 003b9787f8..5b337fe318 100755 --- a/scripts/deploy/performance_local/pbft_performance.sh +++ b/scripts/deploy/performance_local/pbft_performance.sh @@ -17,8 +17,12 @@ # under the License. # -export server=//benchmark/protocols/pbft:kv_server_performance -#export TEMPLATE_PATH=$PWD/config/pbft.config +protocol=pbft +export server=//benchmark/protocols/$protocol:kv_server_performance +export service_tools=//benchmark/protocols/$protocol:kv_service_tools +export TEMPLATE_PATH=$PWD/config/$protocol.config export performance=true +#export TEMPLATE_PATH=$PWD/config/pbft.config + ./performance_local/run_performance.sh $* diff --git a/scripts/deploy/performance_local/poe_performance.sh b/scripts/deploy/performance_local/poe_performance.sh index fd23e077a1..2ef4398d9e 100755 --- a/scripts/deploy/performance_local/poe_performance.sh +++ b/scripts/deploy/performance_local/poe_performance.sh @@ -17,7 +17,10 @@ # under the License. # -export server=//benchmark/protocols/poe:kv_server_performance -export TEMPLATE_PATH=$PWD/config/poe.config +protocol=poe +export server=//benchmark/protocols/$protocol:kv_server_performance +export service_tools=//benchmark/protocols/$protocol:kv_service_tools +export TEMPLATE_PATH=$PWD/config/$protocol.config +export performance=true ./performance_local/run_performance.sh $* diff --git a/scripts/deploy/performance_local/raft_performance.sh b/scripts/deploy/performance_local/raft_performance.sh new file mode 100755 index 0000000000..def1aa30ef --- /dev/null +++ b/scripts/deploy/performance_local/raft_performance.sh @@ -0,0 +1,25 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +protocol=raft +export server=//benchmark/protocols/$protocol:kv_server_performance +export service_tools=//benchmark/protocols/$protocol:kv_service_tools +export TEMPLATE_PATH=$PWD/config/$protocol.config +export performance=true + +./performance_local/run_performance.sh $* diff --git a/scripts/deploy/performance_local/run_performance.sh b/scripts/deploy/performance_local/run_performance.sh index 25cab4cf4c..144e1c8572 100755 --- a/scripts/deploy/performance_local/run_performance.sh +++ b/scripts/deploy/performance_local/run_performance.sh @@ -26,7 +26,7 @@ home_path="./" server_name=`echo "$server" | awk -F':' '{print $NF}'` server_bin=${server_name} -bazel run //benchmark/protocols/pbft:kv_service_tools -- $PWD/config_out/client.config +bazel run $service_tools -- $PWD/config_out/client.config sleep 60 From 9aed7e73db453023b94b37928efa0d4742aafc45 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Tue, 4 Nov 2025 20:04:13 +0000 Subject: [PATCH 03/66] set PoE script to use pbft's kvtools --- scripts/deploy/performance_local/poe_performance.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/deploy/performance_local/poe_performance.sh b/scripts/deploy/performance_local/poe_performance.sh index 2ef4398d9e..bbb48846fe 100755 --- 
a/scripts/deploy/performance_local/poe_performance.sh +++ b/scripts/deploy/performance_local/poe_performance.sh @@ -11,7 +11,7 @@ # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANYß # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. @@ -19,7 +19,7 @@ protocol=poe export server=//benchmark/protocols/$protocol:kv_server_performance -export service_tools=//benchmark/protocols/$protocol:kv_service_tools +export service_tools=//benchmark/protocols/pbft:kv_service_tools export TEMPLATE_PATH=$PWD/config/$protocol.config export performance=true From ba5de1cb1eac0645630b150d9cdbce0b800b2227 Mon Sep 17 00:00:00 2001 From: nachiket Date: Tue, 11 Nov 2025 16:29:32 -0800 Subject: [PATCH 04/66] Replace view change flag with leader election flag --- scripts/deploy/config/raft.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index c5092a94c8..bd1098c531 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -1,6 +1,6 @@ { "clientBatchNum": 100, - "enable_viewchange": false, + "enable_leader_election": false, "recovery_enabled": false, "max_client_complaint_num":10, "max_process_txn": 32, From b8f6799df699e5484c6a9d3b061a5eea42a42167 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Tue, 11 Nov 2025 23:11:44 -0800 Subject: [PATCH 05/66] WIP add initial happy path raft implementation --- .../ordering/raft/algorithm/raft.cpp | 97 ++++++++++++++----- .../consensus/ordering/raft/algorithm/raft.h | 30 +++++- .../ordering/raft/framework/consensus.cpp | 18 ++-- .../ordering/raft/framework/consensus.h | 4 +- .../ordering/raft/proto/proposal.proto | 23 +++-- 5 files changed, 123 insertions(+), 
49 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index cc92bb5624..f6355e0873 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -35,55 +35,104 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier) f_ = f; is_stop_ = false; seq_ = 0; + currentTerm_ = 0; + votedFor_ = -1; + commitIndex_ = 0; + lastApplied_ = 0; + nextIndex_.assign(total_num_, 0); + matchIndex_.assign(total_num_, 0); } Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } -bool Raft::ReceiveTransaction(std::unique_ptr txn) { +bool Raft::ReceiveTransaction(std::unique_ptr txn) { // LOG(ERROR)<<"recv txn:"; + LOG(INFO) << "Received Transaction to primary id: " << id_; + LOG(INFO) << "seq: " << seq_; txn->set_create_time(GetCurrentTime()); txn->set_seq(seq_++); txn->set_proposer(id_); + + // For now just set this to currentTerm_, but is wrong if it just became leader + txn->set_prevlogterm(currentTerm_); - Broadcast(MessageType::Propose, *txn); + // leader sends out highest seq that is committed + txn->set_leadercommitindex(commitIndex_); + + // This should be a term for each entry, but assuming no failure at first + txn->set_term(currentTerm_); + + Broadcast(MessageType::AppendEntriesMsg, *txn); return true; } -bool Raft::ReceivePropose(std::unique_ptr txn) { +bool Raft::ReceivePropose(std::unique_ptr txn) { + auto leader_id = txn->id(); + AppendEntriesResponse appendEntriesResponse; + appendEntriesResponse.set_term(currentTerm_); + appendEntriesResponse.set_id(id_); + appendEntriesResponse.set_lastapplied(lastApplied_); + appendEntriesResponse.set_nextentry(data_.size()); + if (txn->term() < currentTerm_) { + appendEntriesResponse.set_success(false); + SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); + return true; + } + auto prevSeq = txn->seq() - 1; + // This 
should be the same as checking if it has an entry + // with this prevLogIndex and term + if (prevSeq != -1 && + (prevSeq >= static_cast(data_.size()) || + txn->prevlogterm() != data_[dataIndexMapping_[prevSeq]]->term())) { + appendEntriesResponse.set_success(false); + SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); + return true; + } + // Implement an entry existing but with a different term + // delete that entry and all after it + std::string hash = txn->hash(); int64_t seq = txn->seq(); - int proposer = txn->proposer(); { std::unique_lock lk(mutex_); data_[txn->hash()] = std::move(txn); + dataIndexMapping_.push_back(txn->hash()); } + auto leaderCommit = txn->leadercommitindex(); + while (leaderCommit > commitIndex_ && lastApplied_ + 1 <= static_cast(data_.size())) { + std::unique_ptr txnToCommit = nullptr; + txnToCommit = std::move(data_[dataIndexMapping_[lastApplied_]]); + commit_(*txnToCommit); + lastApplied_++; + } + // I don't quite know if this needs to be conditional, but that's how the paper says it + if (leaderCommit > commitIndex_) + // not 100% certain if this second variable should be seq + commitIndex_ = std::min(leaderCommit, seq); - Proposal proposal; - proposal.set_hash(hash); - proposal.set_seq(seq); - proposal.set_proposer(id_); - Broadcast(MessageType::Prepare, proposal); + appendEntriesResponse.set_lastapplied(lastApplied_); + appendEntriesResponse.set_nextentry(data_.size()); + appendEntriesResponse.set_success(true); + SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); return true; } -bool Raft::ReceivePrepare(std::unique_ptr proposal) { - std::unique_ptr txn = nullptr; - { - std::unique_lock lk(mutex_); - received_[proposal->hash()].insert(proposal->proposer()); - auto it = data_.find(proposal->hash()); - if (it != data_.end()) { - if (received_[proposal->hash()].size() >= 2 * f_ + 1) { - txn = std::move(it->second); - data_.erase(it); - } - } - } - if (txn != nullptr) 
{ - commit_(*txn); +bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr response) { + auto followerId = response->id(); + if (response->success()) { + nextIndex_[followerId] = response->nextentry(); + matchIndex_[followerId] = response->lastapplied(); + return true; } + + // handling for if leader is out of date and term is wrong + + // handling for if term is correct, but follower is just out of date + --nextIndex_[followerId]; + // send message + assert(false); return true; } diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 7406feb774..6b9982be49 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -37,17 +37,37 @@ class Raft : public common::ProtocolBase { Raft(int id, int f, int total_num, SignatureVerifier* verifier); ~Raft(); - bool ReceiveTransaction(std::unique_ptr txn); - bool ReceivePropose(std::unique_ptr txn); - bool ReceivePrepare(std::unique_ptr proposal); - + bool ReceiveTransaction(std::unique_ptr txn); + bool ReceivePropose(std::unique_ptr txn); + bool ReceiveAppendEntriesResponse(std::unique_ptr response); private: bool IsStop(); private: std::mutex mutex_; std::map > received_; - std::map > data_; + std::map > data_; // log[] + + std::vector dataIndexMapping_; + + // This is for everyone + // Most recent term it has seen + int currentTerm_; + // Id for vote in current Term + int votedFor_; + + // Volatile on all servers + // Index of highest log entry it knows to be committed + int64_t commitIndex_; + // Index of highest log entry executed + int64_t lastApplied_; + + // Only for leaders + // This keeps track of the next log entry to send to that replica + // Initialized to last log index + 1 + std::vector nextIndex_; + // This keeps track of the highest log entry it knows is executed on that replica + std::vector matchIndex_; int64_t seq_; bool is_stop_; diff --git 
a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 1c6a6635ed..cf52979d3f 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -48,8 +48,8 @@ Consensus::Consensus(const ResDBConfig& config, } int Consensus::ProcessCustomConsensus(std::unique_ptr request) { - if (request->user_type() == MessageType::Propose) { - std::unique_ptr txn = std::make_unique(); + if (request->user_type() == MessageType::AppendEntriesMsg) { + std::unique_ptr txn = std::make_unique(); if (!txn->ParseFromString(request->data())) { assert(1 == 0); LOG(ERROR) << "parse proposal fail"; @@ -57,21 +57,21 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { } raft_->ReceivePropose(std::move(txn)); return 0; - } else if (request->user_type() == MessageType::Prepare) { - std::unique_ptr proposal = std::make_unique(); - if (!proposal->ParseFromString(request->data())) { + } else if (request->user_type() == MessageType::AppendEntriesResponseMsg) { + std::unique_ptr AppendEntriesResponse = std::make_unique(); + if (!AppendEntriesResponse->ParseFromString(request->data())) { LOG(ERROR) << "parse proposal fail"; assert(1 == 0); return -1; } - raft_->ReceivePrepare(std::move(proposal)); + raft_->ReceiveAppendEntriesResponse(std::move(AppendEntriesResponse)); return 0; } return 0; } int Consensus::ProcessNewTransaction(std::unique_ptr request) { - std::unique_ptr txn = std::make_unique(); + std::unique_ptr txn = std::make_unique(); txn->set_data(request->data()); txn->set_hash(request->hash()); txn->set_proxy_id(request->proxy_id()); @@ -80,10 +80,10 @@ int Consensus::ProcessNewTransaction(std::unique_ptr request) { } int Consensus::CommitMsg(const google::protobuf::Message& msg) { - return CommitMsgInternal(dynamic_cast(msg)); + return CommitMsgInternal(dynamic_cast(msg)); } -int Consensus::CommitMsgInternal(const Transaction& txn) { +int 
Consensus::CommitMsgInternal(const AppendEntries& txn) { std::unique_ptr request = std::make_unique(); request->set_data(txn.data()); request->set_seq(txn.seq()); diff --git a/platform/consensus/ordering/raft/framework/consensus.h b/platform/consensus/ordering/raft/framework/consensus.h index bfff56b4f5..d5a9535b63 100644 --- a/platform/consensus/ordering/raft/framework/consensus.h +++ b/platform/consensus/ordering/raft/framework/consensus.h @@ -37,9 +37,7 @@ class Consensus : public common::Consensus { int ProcessCustomConsensus(std::unique_ptr request) override; int ProcessNewTransaction(std::unique_ptr request) override; int CommitMsg(const google::protobuf::Message& msg) override; - int CommitMsgInternal(const Transaction& txn); - - int Prepare(const Transaction& txn); + int CommitMsgInternal(const AppendEntries& txn); protected: std::unique_ptr raft_; diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index 70d8559812..f0c9723c2b 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -21,26 +21,33 @@ syntax = "proto3"; package resdb.raft; -message Transaction{ +message AppendEntries{ int32 id = 1; - bytes data = 2; + bytes data = 2; // this can maybe work as entries but maybe not? 
bytes hash = 3; int32 proxy_id = 4; int32 proposer = 5; int64 uid = 6; int64 create_time = 7; int64 seq = 9; + int32 prevLogTerm = 10; // term of the most recent log (term corresponding to seq) + int64 leaderCommitIndex = 11; // leader sends out highest seq that is committed + int32 term = 12; // This should be a term for each entry, but assuming no failure at first } -message Proposal { - bytes hash = 1; - int32 proposer = 2; - int64 seq =3 ; +message AppendEntriesResponse { + int32 term = 1; + bool success = 2; + int32 id = 3; + int32 lastApplied = 4; + int32 nextEntry = 5; } enum MessageType { None = 0; - Propose = 1; - Prepare = 2; + AppendEntriesMsg = 1; + AppendEntriesResponseMsg = 2; + RequestVote = 3; + GiveVote = 4; } From 6958cdc05bf5c566b4cc3d6ca5098227a5514920 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 12 Nov 2025 16:31:58 -0800 Subject: [PATCH 06/66] WIP some bug fixes --- platform/consensus/ordering/raft/algorithm/raft.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index f6355e0873..891943631d 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -83,7 +83,7 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { auto prevSeq = txn->seq() - 1; // This should be the same as checking if it has an entry // with this prevLogIndex and term - if (prevSeq != -1 && + if (prevSeq != 0 && (prevSeq >= static_cast(data_.size()) || txn->prevlogterm() != data_[dataIndexMapping_[prevSeq]]->term())) { appendEntriesResponse.set_success(false); @@ -97,8 +97,9 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { int64_t seq = txn->seq(); { std::unique_lock lk(mutex_); + std::string hash = txn->hash(); data_[txn->hash()] = std::move(txn); - dataIndexMapping_.push_back(txn->hash()); + dataIndexMapping_.push_back(hash); } auto leaderCommit = 
txn->leadercommitindex(); while (leaderCommit > commitIndex_ && lastApplied_ + 1 <= static_cast(data_.size())) { From 2940bcfe7a9908351573328c7fef6bfa432f0596 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 12 Nov 2025 19:29:42 -0800 Subject: [PATCH 07/66] WIP log statements to try and figure things out --- .../ordering/raft/algorithm/raft.cpp | 83 +++++++++++++++++-- .../ordering/raft/framework/consensus.cpp | 3 +- 2 files changed, 80 insertions(+), 6 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 891943631d..285f93d3df 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -27,9 +27,54 @@ namespace resdb { namespace raft { +static std::string ToHex(const std::string& input, size_t max_len = 16) { + std::ostringstream oss; + oss << std::hex << std::setfill('0'); + for (size_t i = 0; i < std::min(input.size(), max_len); ++i) { + oss << std::setw(2) << static_cast(static_cast(input[i])); + } + return oss.str(); +} + +static void printAppendEntries(const std::unique_ptr& txn) { + if (!txn) { + LOG(INFO) << "AppendEntries: nullptr"; + return; + } + + LOG(INFO) << "=== AppendEntries ==="; + LOG(INFO) << "id: " << txn->id(); + LOG(INFO) << "term: " << txn->term(); + LOG(INFO) << "seq: " << txn->seq(); + LOG(INFO) << "prevLogTerm: " << txn->prevlogterm(); + LOG(INFO) << "leaderCommitIndex: " << txn->leadercommitindex(); + LOG(INFO) << "proxy_id: " << txn->proxy_id(); + LOG(INFO) << "proposer: " << txn->proposer(); + LOG(INFO) << "uid: " << txn->uid(); + LOG(INFO) << "create_time: " << txn->create_time(); + + // bytes fields (print as hex or limited string to avoid binary garbage) + const std::string& data = txn->data(); + const std::string& hash = txn->hash(); + + LOG(INFO) << "data size: " << data.size(); + if (!data.empty()) { + LOG(INFO) << "data (first 32 bytes): " + << data.substr(0, std::min(32, 
data.size())); + } + + LOG(INFO) << "hash size: " << hash.size(); + if (!hash.empty()) { + LOG(INFO) << "hash (hex first 16 bytes): " + << ToHex(hash); + } + + LOG(INFO) << "====================="; +} + Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier) : ProtocolBase(id, f, total_num), verifier_(verifier) { - LOG(ERROR) << "get proposal graph"; + LOG(INFO) << "get proposal graph"; id_ = id; total_num_ = total_num; f_ = f; @@ -48,7 +93,8 @@ Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } bool Raft::ReceiveTransaction(std::unique_ptr txn) { - // LOG(ERROR)<<"recv txn:"; + // LOG(INFO)<<"recv txn:"; + LOG(INFO) << "Received Transaction to primary id: " << id_; LOG(INFO) << "seq: " << seq_; txn->set_create_time(GetCurrentTime()); @@ -63,19 +109,27 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { // This should be a term for each entry, but assuming no failure at first txn->set_term(currentTerm_); - + LOG(INFO) << "Before"; + printAppendEntries(txn); + LOG(INFO) << "After"; + Broadcast(MessageType::AppendEntriesMsg, *txn); return true; } bool Raft::ReceivePropose(std::unique_ptr txn) { auto leader_id = txn->id(); + auto leaderCommit = txn->leadercommitindex(); + LOG(INFO) << "Received AppendEntries to replica id: " << id_; + LOG(INFO) << "static_cast(data_.size()): " << static_cast(data_.size()); + printAppendEntries(txn); AppendEntriesResponse appendEntriesResponse; appendEntriesResponse.set_term(currentTerm_); appendEntriesResponse.set_id(id_); appendEntriesResponse.set_lastapplied(lastApplied_); appendEntriesResponse.set_nextentry(data_.size()); if (txn->term() < currentTerm_) { + LOG(INFO) << "AppendEntriesMsg Fail1"; appendEntriesResponse.set_success(false); SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); return true; @@ -86,41 +140,60 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { if (prevSeq != 0 && (prevSeq >= static_cast(data_.size()) || txn->prevlogterm() != 
data_[dataIndexMapping_[prevSeq]]->term())) { + LOG(INFO) << "AppendEntriesMsg Fail2"; + LOG(INFO) << "prevSeq: " << prevSeq << " data size: " << static_cast(data_.size()); + if (prevSeq < dataIndexMapping_.size()){ + LOG(INFO) << "txn->prevlogterm(): " << txn->prevlogterm() + << " last entry term: " << data_[dataIndexMapping_[prevSeq]]->term(); + } appendEntriesResponse.set_success(false); SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); return true; } // Implement an entry existing but with a different term // delete that entry and all after it - + LOG(INFO) << "Before AppendEntriesMsg Added to Log"; std::string hash = txn->hash(); int64_t seq = txn->seq(); { std::unique_lock lk(mutex_); std::string hash = txn->hash(); + LOG(INFO) << "Before adding to data"; data_[txn->hash()] = std::move(txn); + LOG(INFO) << "After adding to data"; dataIndexMapping_.push_back(hash); } - auto leaderCommit = txn->leadercommitindex(); + LOG(INFO) << "AppendEntriesMsg Added to Log"; + + LOG(INFO) << "leaderCommit: " << leaderCommit; + LOG(INFO) << "commitIndex_: " << commitIndex_; + LOG(INFO) << "lastApplied_: " << lastApplied_; + LOG(INFO) << "static_cast(data_.size()): " << static_cast(data_.size()); while (leaderCommit > commitIndex_ && lastApplied_ + 1 <= static_cast(data_.size())) { + // assert(false); + LOG(INFO) << "AppendEntriesMsg Committing"; std::unique_ptr txnToCommit = nullptr; txnToCommit = std::move(data_[dataIndexMapping_[lastApplied_]]); commit_(*txnToCommit); lastApplied_++; } + LOG(INFO) << "before commit index check"; // I don't quite know if this needs to be conditional, but that's how the paper says it if (leaderCommit > commitIndex_) // not 100% certain if this second variable should be seq commitIndex_ = std::min(leaderCommit, seq); + LOG(INFO) << "after commit index check"; appendEntriesResponse.set_lastapplied(lastApplied_); appendEntriesResponse.set_nextentry(data_.size()); appendEntriesResponse.set_success(true); + 
LOG(INFO) << "Leader_id: " << leader_id; SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); return true; } bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr response) { + LOG(INFO) << "ReceiveAppendEntriesResponse"; auto followerId = response->id(); if (response->success()) { nextIndex_[followerId] = response->nextentry(); diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index cf52979d3f..fe40c42e0a 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -49,10 +49,11 @@ Consensus::Consensus(const ResDBConfig& config, int Consensus::ProcessCustomConsensus(std::unique_ptr request) { if (request->user_type() == MessageType::AppendEntriesMsg) { + LOG(ERROR) << "Received AppendEntriesMsg"; std::unique_ptr txn = std::make_unique(); if (!txn->ParseFromString(request->data())) { - assert(1 == 0); LOG(ERROR) << "parse proposal fail"; + assert(1 == 0); return -1; } raft_->ReceivePropose(std::move(txn)); From f8453f34817563d8164ff52db05d26dceed5b869 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 12 Nov 2025 20:05:39 -0800 Subject: [PATCH 08/66] WIP fix using wrong value for leader id --- platform/consensus/ordering/raft/algorithm/raft.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 285f93d3df..ede85a399a 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -118,7 +118,7 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { } bool Raft::ReceivePropose(std::unique_ptr txn) { - auto leader_id = txn->id(); + auto leader_id = txn->proposer(); auto leaderCommit = txn->leadercommitindex(); LOG(INFO) << "Received AppendEntries to replica id: " << id_; LOG(INFO) << 
"static_cast(data_.size()): " << static_cast(data_.size()); @@ -142,7 +142,7 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { txn->prevlogterm() != data_[dataIndexMapping_[prevSeq]]->term())) { LOG(INFO) << "AppendEntriesMsg Fail2"; LOG(INFO) << "prevSeq: " << prevSeq << " data size: " << static_cast(data_.size()); - if (prevSeq < dataIndexMapping_.size()){ + if (prevSeq < static_cast(dataIndexMapping_.size())){ LOG(INFO) << "txn->prevlogterm(): " << txn->prevlogterm() << " last entry term: " << data_[dataIndexMapping_[prevSeq]]->term(); } From 38a3e578e28e02aa80e9fd3c2e1b45ca1860ca99 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 12 Nov 2025 20:49:15 -0800 Subject: [PATCH 09/66] WIP more print statements --- .../ordering/common/framework/consensus.cpp | 2 ++ .../ordering/raft/framework/consensus.cpp | 1 + .../networkstrate/async_replica_client.cpp | 3 +++ .../networkstrate/replica_communicator.cpp | 25 ++++++++++++++----- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/platform/consensus/ordering/common/framework/consensus.cpp b/platform/consensus/ordering/common/framework/consensus.cpp index 93e00cc848..b362c98c4a 100644 --- a/platform/consensus/ordering/common/framework/consensus.cpp +++ b/platform/consensus/ordering/common/framework/consensus.cpp @@ -98,6 +98,7 @@ int Consensus::Broadcast(int type, const google::protobuf::Message& msg) { Request request; msg.SerializeToString(request.mutable_data()); request.set_type(Request::TYPE_CUSTOM_CONSENSUS); + LOG(ERROR) << "Sending custom consensus Broadcast"; request.set_user_type(type); request.set_sender_id(config_.GetSelfInfo().id()); @@ -110,6 +111,7 @@ int Consensus::SendMsg(int type, const google::protobuf::Message& msg, Request request; msg.SerializeToString(request.mutable_data()); request.set_type(Request::TYPE_CUSTOM_CONSENSUS); + LOG(ERROR) << "Sending custom consensus message"; request.set_user_type(type); request.set_sender_id(config_.GetSelfInfo().id()); 
replica_communicator_->SendMessage(request, node_id); diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index fe40c42e0a..85c042f9e7 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -48,6 +48,7 @@ Consensus::Consensus(const ResDBConfig& config, } int Consensus::ProcessCustomConsensus(std::unique_ptr request) { + LOG(ERROR) << "Message type request->user_type(): " << request->user_type(); if (request->user_type() == MessageType::AppendEntriesMsg) { LOG(ERROR) << "Received AppendEntriesMsg"; std::unique_ptr txn = std::make_unique(); diff --git a/platform/networkstrate/async_replica_client.cpp b/platform/networkstrate/async_replica_client.cpp index af13b79965..f24fcf64d9 100644 --- a/platform/networkstrate/async_replica_client.cpp +++ b/platform/networkstrate/async_replica_client.cpp @@ -37,6 +37,7 @@ AsyncReplicaClient::~AsyncReplicaClient() {} int AsyncReplicaClient::SendMessage(const std::string& data) { queue_.Push(std::make_unique(data)); + LOG(ERROR) << "About to send"; if (!in_process_.load()) { bool old_value = false; if (in_process_.compare_exchange_strong(old_value, true, @@ -49,6 +50,7 @@ int AsyncReplicaClient::SendMessage(const std::string& data) { } void AsyncReplicaClient::OnSendNewMessage() { + LOG(ERROR) << "OnSendNewMessage()"; std::unique_ptr data = queue_.Pop(0); if (data == nullptr || data->empty()) { in_process_ = false; @@ -60,6 +62,7 @@ void AsyncReplicaClient::OnSendNewMessage() { } void AsyncReplicaClient::OnSendMessage() { + LOG(ERROR) << "OnSendMessage(), status: " << status_; if (status_ == 0) { data_size_ = pending_data_->size(); sending_data_size_ = sizeof(data_size_); diff --git a/platform/networkstrate/replica_communicator.cpp b/platform/networkstrate/replica_communicator.cpp index f1521acb49..63a2ca5fda 100644 --- a/platform/networkstrate/replica_communicator.cpp +++ 
b/platform/networkstrate/replica_communicator.cpp @@ -127,6 +127,7 @@ void ReplicaCommunicator::StartBroadcastInBackGround() { void ReplicaCommunicator::StartSingleInBackGround(const std::string& ip, int port) { single_bq_[std::make_pair(ip,port)] = std::make_unique>>("s_batch", tcp_batch_); + LOG(INFO) << "StartSingleInBackGround: "; ReplicaInfo replica_info; for (const auto& replica : replicas_) { if (replica.ip() == ip && replica.port() == port) { @@ -144,11 +145,12 @@ void ReplicaCommunicator::StartSingleInBackGround(const std::string& ip, int por } } - +LOG(INFO) << "before push back, IsRunning() " << IsRunning(); single_thread_.push_back(std::thread([&](BatchQueue> *bq, ReplicaInfo replica_info) { while (IsRunning()) { std::vector> batch_req = bq->Pop(50000); + LOG(INFO) << "batch_req.empty() " << batch_req.empty(); if (batch_req.empty()) { continue; } @@ -156,9 +158,9 @@ void ReplicaCommunicator::StartSingleInBackGround(const std::string& ip, int por for (auto& queue_item : batch_req) { broadcast_data.add_data()->swap(queue_item->data); } - + LOG(INFO) << "Before SendBroadCastMsg: "; global_stats_->SendBroadCastMsg(broadcast_data.data_size()); - //LOG(ERROR)<<" send to ip:"<BroadCastMsg(); + LOG(INFO) << "is_use_long_conn_: " << is_use_long_conn_; if (is_use_long_conn_) { auto item = std::make_unique(); item->data = NetChannel::GetRawMessageString(message, verifier_); @@ -188,6 +191,7 @@ const ReplicaInfo& replica_info) { single_bq_[std::make_pair(ip, port)]->Push(std::move(item)); return 0; } else { + LOG(INFO) << "Branch 2, calling SendMessageInternal: "; return SendMessageInternal(message, replicas_); } } @@ -208,12 +212,15 @@ int ReplicaCommunicator::SendMessage(const google::protobuf::Message& message, const ReplicaInfo& replica_info) { return SendSingleMessage(message, replica_info); + LOG(INFO) << "is_use_long_conn_: " << is_use_long_conn_; if (is_use_long_conn_) { + LOG(INFO) << "path 1"; std::string data = NetChannel::GetRawMessageString(message, 
verifier_); BroadcastData broadcast_data; broadcast_data.add_data()->swap(data); return SendMessageFromPool(broadcast_data, {replica_info}); } else { + LOG(INFO) << "path 2"; return SendMessageInternal(message, {replica_info}); } } @@ -240,6 +247,7 @@ int ReplicaCommunicator::SendBatchMessage( int ReplicaCommunicator::SendMessageFromPool( const google::protobuf::Message& message, const std::vector& replicas) { + LOG(ERROR) << "SendMessageFromPool():"; int ret = 0; std::string data; message.SerializeToString(&data); @@ -247,16 +255,17 @@ int ReplicaCommunicator::SendMessageFromPool( std::lock_guard lk(mutex_); for (const auto& replica : replicas) { auto client = GetClientFromPool(replica.ip(), replica.port()); + LOG(ERROR) << "Try client"; if (client == nullptr) { continue; } - //LOG(ERROR) << "send to:" << replica.ip(); + LOG(ERROR) << "send to:" << replica.ip(); if (client->SendMessage(data) == 0) { ret++; } else { LOG(ERROR) << "send to:" << replica.ip() << " fail"; } - //LOG(ERROR) << "send to:" << replica.ip()<<" done"; + LOG(ERROR) << "send to:" << replica.ip()<<" done"; } return ret; } @@ -273,7 +282,9 @@ int ReplicaCommunicator::SendMessageInternal( if (verifier_ != nullptr) { client->SetSignatureVerifier(verifier_); } + LOG(ERROR) << "Before Message sent"; if (client->SendRawMessage(message) == 0) { + LOG(ERROR) << "Message sent"; ret++; } } @@ -306,7 +317,9 @@ void ReplicaCommunicator::BroadCast(const google::protobuf::Message& message) { void ReplicaCommunicator::SendMessage(const google::protobuf::Message& message, int64_t node_id) { ReplicaInfo target_replica; + LOG(INFO) << "node_id: " << node_id; for (const auto& replica : replicas_) { + LOG(INFO) << "replica.id(): " << replica.id(); if (replica.id() == node_id) { target_replica = replica; break; From 03e430463335f73af22fd83f2e67cf4685e6e0b3 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Thu, 13 Nov 2025 18:14:30 +0000 Subject: [PATCH 10/66] Fixed errors in build files to support Intellisense --- 
WORKSPACE | 12 ++++++++---- platform/consensus/ordering/geo_pbft/BUILD | 2 +- platform/consensus/ordering/pbft/BUILD | 4 ++-- scripts/deploy/performance_local/run_performance.sh | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index cfb3420dae..3ef8ed863a 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -203,10 +203,14 @@ bind( http_archive( name = "com_zlib", - build_file = "@com_resdb_nexres//third_party:zlib.BUILD", - sha256 = "629380c90a77b964d896ed37163f5c3a34f6e6d897311f1df2a7016355c45eff", - strip_prefix = "zlib-1.2.11", - url = "https://github.com/madler/zlib/archive/v1.2.11.tar.gz", + build_file = "@com_resdb_nexres//third_party:z.BUILD", + sha256 = "9a93b2b7dfdac77ceba5a558a580e74667dd6fede4585b91eefb60f03b72df23", + strip_prefix = "zlib-1.3.1", + urls = [ + "https://zlib.net/zlib-1.3.1.tar.gz", + "https://zlib.net/fossils/zlib-1.3.1.tar.gz", + "https://github.com/madler/zlib/releases/download/v1.3.1/zlib-1.3.1.tar.gz", + ], ) http_archive( diff --git a/platform/consensus/ordering/geo_pbft/BUILD b/platform/consensus/ordering/geo_pbft/BUILD index 4bc568dd5b..a083eb3e86 100644 --- a/platform/consensus/ordering/geo_pbft/BUILD +++ b/platform/consensus/ordering/geo_pbft/BUILD @@ -62,7 +62,7 @@ cc_test( "//common/test:test_main", "//platform/config:resdb_config_utils", "//platform/consensus/execution:mock_geo_global_executor", - "//platform/consensus/execution:mock_transaction_executor_impl", + #"//platform/consensus/execution:mock_transaction_executor_impl", "//platform/networkstrate:mock_replica_communicator", ], ) diff --git a/platform/consensus/ordering/pbft/BUILD b/platform/consensus/ordering/pbft/BUILD index 6f4fdff709..6d03244418 100644 --- a/platform/consensus/ordering/pbft/BUILD +++ b/platform/consensus/ordering/pbft/BUILD @@ -256,8 +256,8 @@ cc_library( ) cc_library( - name = "pre_very_consensus_manager_pbft", - hdrs = ["pre_very_consensus_manager_pbft.h"], + name = "pre_very_consensus_service_pbft", + hdrs = 
["pre_very_consensus_service_pbft.h"], visibility = [ "//platform:__subpackages__", "//service:__subpackages__", diff --git a/scripts/deploy/performance_local/run_performance.sh b/scripts/deploy/performance_local/run_performance.sh index 144e1c8572..558c2282c1 100755 --- a/scripts/deploy/performance_local/run_performance.sh +++ b/scripts/deploy/performance_local/run_performance.sh @@ -55,6 +55,6 @@ done python3 performance/calculate_result.py `ls result_*_log` > results.log -rm -rf result_*_log +#rm -rf result_*_log echo "save result to results.log" cat results.log From cf06093915a61a96dd5a4ae1c528005b8b15d3b1 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Thu, 13 Nov 2025 15:53:22 -0800 Subject: [PATCH 11/66] WIP temp change to broadcast --- platform/consensus/ordering/raft/algorithm/raft.cpp | 10 +++++++--- scripts/deploy/config/raft.config | 2 +- scripts/deploy/performance/calculate_result.py | 6 +++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index ede85a399a..25b19cdeef 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -137,8 +137,8 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { auto prevSeq = txn->seq() - 1; // This should be the same as checking if it has an entry // with this prevLogIndex and term - if (prevSeq != 0 && - (prevSeq >= static_cast(data_.size()) || + if (prevSeq != 0 && prevSeq > static_cast(dataIndexMapping_.size()) && + (prevSeq >= static_cast(dataIndexMapping_.size()) || txn->prevlogterm() != data_[dataIndexMapping_[prevSeq]]->term())) { LOG(INFO) << "AppendEntriesMsg Fail2"; LOG(INFO) << "prevSeq: " << prevSeq << " data size: " << static_cast(data_.size()); @@ -188,13 +188,17 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { appendEntriesResponse.set_nextentry(data_.size()); appendEntriesResponse.set_success(true); LOG(INFO) << 
"Leader_id: " << leader_id; - SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); + // SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); + Broadcast(MessageType::AppendEntriesResponseMsg, appendEntriesResponse); return true; } bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr response) { + if (id_ != 1) + return true; LOG(INFO) << "ReceiveAppendEntriesResponse"; auto followerId = response->id(); + LOG(INFO) << "followerId: " << followerId; if (response->success()) { nextIndex_[followerId] = response->nextentry(); matchIndex_[followerId] = response->lastapplied(); diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index c5092a94c8..9753a60cb1 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -4,7 +4,7 @@ "recovery_enabled": false, "max_client_complaint_num":10, "max_process_txn": 32, - "worker_num": 2, + "worker_num": 1, "input_worker_num": 1, "output_worker_num": 10 } diff --git a/scripts/deploy/performance/calculate_result.py b/scripts/deploy/performance/calculate_result.py index f6892d2685..5852c3d472 100644 --- a/scripts/deploy/performance/calculate_result.py +++ b/scripts/deploy/performance/calculate_result.py @@ -53,7 +53,11 @@ def cal_lat(lat): lat_sum.append(v) print("max latency:",lat_max) - print("average latency:",sum(lat_sum)/len(lat_sum)) + if not len(lat_sum): + average_latency = 0 + else: + average_latency = sum(lat_sum)/len(lat_sum) + print("average latency:", average_latency) if __name__ == '__main__': files = sys.argv[1:] From 71829da64083c801c01cb742d372279b879d510a Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Thu, 13 Nov 2025 21:37:23 -0800 Subject: [PATCH 12/66] WIP add more prints, leader commits and executes --- .../ordering/raft/algorithm/raft.cpp | 41 +++++++++++++++++-- .../consensus/ordering/raft/algorithm/raft.h | 1 + .../ordering/raft/proto/proposal.proto | 2 + 
scripts/deploy/config/raft.config | 2 +- 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 25b19cdeef..695805057b 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -84,14 +84,23 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier) votedFor_ = -1; commitIndex_ = 0; lastApplied_ = 0; - nextIndex_.assign(total_num_, 0); - matchIndex_.assign(total_num_, 0); + nextIndex_.assign(total_num_ + 1, 0); + matchIndex_.assign(total_num_ + 1, 0); } Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } +void Raft::Dump() { + LOG(INFO) << "=== Replica Dump ==="; + LOG(INFO) << "id_: " << id_; + LOG(INFO) << "currentTerm_: " << currentTerm_; + LOG(INFO) << "votedFor_: " << votedFor_; + LOG(INFO) << "commitIndex_: " << commitIndex_; + LOG(INFO) << "lastApplied_: " << lastApplied_; +} + bool Raft::ReceiveTransaction(std::unique_ptr txn) { // LOG(INFO)<<"recv txn:"; @@ -118,6 +127,7 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { } bool Raft::ReceivePropose(std::unique_ptr txn) { + Dump(); auto leader_id = txn->proposer(); auto leaderCommit = txn->leadercommitindex(); LOG(INFO) << "Received AppendEntries to replica id: " << id_; @@ -169,7 +179,9 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { LOG(INFO) << "commitIndex_: " << commitIndex_; LOG(INFO) << "lastApplied_: " << lastApplied_; LOG(INFO) << "static_cast(data_.size()): " << static_cast(data_.size()); - while (leaderCommit > commitIndex_ && lastApplied_ + 1 <= static_cast(data_.size())) { + LOG(INFO) << "leaderCommit > commitIndex_: " << (leaderCommit > commitIndex_ ? "true" : "false"); + LOG(INFO) << "lealastApplied_ + 1 <= static_cast(data_.size()) " << ((lastApplied_ + 1 <= static_cast(data_.size())) ? 
"true" : "false"); + while ((leaderCommit != 0) && leaderCommit > commitIndex_ && lastApplied_ + 1 <= static_cast(data_.size())) { // assert(false); LOG(INFO) << "AppendEntriesMsg Committing"; std::unique_ptr txnToCommit = nullptr; @@ -187,6 +199,8 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { appendEntriesResponse.set_lastapplied(lastApplied_); appendEntriesResponse.set_nextentry(data_.size()); appendEntriesResponse.set_success(true); + appendEntriesResponse.set_hash(hash); + appendEntriesResponse.set_seq(seq); LOG(INFO) << "Leader_id: " << leader_id; // SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); Broadcast(MessageType::AppendEntriesResponseMsg, appendEntriesResponse); @@ -200,6 +214,27 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr r auto followerId = response->id(); LOG(INFO) << "followerId: " << followerId; if (response->success()) { + { + std::unique_lock lk(mutex_); + received_[response->hash()].insert(response->id()); + auto it = data_.find(response->hash()); + if (it != data_.end()) { + LOG(INFO) << "Transaction: " << response->seq() << " has gotten " << received_[response->hash()].size() << " responses"; + if (static_cast(received_[response->hash()].size()) >= f_ + 1) { + commitIndex_ = response->seq(); + + // pretty sure this should always be in order with no gaps + while (lastApplied_ + 1 <= static_cast(data_.size()) && + lastApplied_ <= commitIndex_) { + LOG(INFO) << "Leader Committing"; + std::unique_ptr txnToCommit = nullptr; + txnToCommit = std::move(data_[dataIndexMapping_[lastApplied_]]); + commit_(*txnToCommit); + lastApplied_++; + } + } + } + } nextIndex_[followerId] = response->nextentry(); matchIndex_[followerId] = response->lastapplied(); return true; diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 6b9982be49..8aa1905164 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ 
b/platform/consensus/ordering/raft/algorithm/raft.h @@ -42,6 +42,7 @@ class Raft : public common::ProtocolBase { bool ReceiveAppendEntriesResponse(std::unique_ptr response); private: bool IsStop(); + void Dump(); private: std::mutex mutex_; diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index f0c9723c2b..687672d59f 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -41,6 +41,8 @@ message AppendEntriesResponse { int32 id = 3; int32 lastApplied = 4; int32 nextEntry = 5; + bytes hash = 6; + int64 seq = 7; } enum MessageType { diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index 9753a60cb1..5b5a1c2bc0 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -3,7 +3,7 @@ "enable_viewchange": false, "recovery_enabled": false, "max_client_complaint_num":10, - "max_process_txn": 32, + "max_process_txn": 10000, "worker_num": 1, "input_worker_num": 1, "output_worker_num": 10 From 0b7ab59349213c20a9edd091215d565ed850e6fe Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Thu, 13 Nov 2025 21:40:33 -0800 Subject: [PATCH 13/66] WIP switch back to SendMessage --- platform/consensus/ordering/raft/algorithm/raft.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 695805057b..f246bab5fc 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -202,8 +202,8 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { appendEntriesResponse.set_hash(hash); appendEntriesResponse.set_seq(seq); LOG(INFO) << "Leader_id: " << leader_id; - // SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); - Broadcast(MessageType::AppendEntriesResponseMsg, 
appendEntriesResponse); + SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); + // Broadcast(MessageType::AppendEntriesResponseMsg, appendEntriesResponse); return true; } From 258c7dbb0dcfd3bcecf016585fa6b347892e4d6a Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Thu, 13 Nov 2025 21:53:54 -0800 Subject: [PATCH 14/66] WIP fixed committing condition --- platform/consensus/ordering/raft/algorithm/raft.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index f246bab5fc..22f6923808 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -181,7 +181,7 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { LOG(INFO) << "static_cast(data_.size()): " << static_cast(data_.size()); LOG(INFO) << "leaderCommit > commitIndex_: " << (leaderCommit > commitIndex_ ? "true" : "false"); LOG(INFO) << "lealastApplied_ + 1 <= static_cast(data_.size()) " << ((lastApplied_ + 1 <= static_cast(data_.size())) ? 
"true" : "false"); - while ((leaderCommit != 0) && leaderCommit > commitIndex_ && lastApplied_ + 1 <= static_cast(data_.size())) { + while ((leaderCommit != 0) && leaderCommit > lastApplied_ && lastApplied_ + 1 <= static_cast(data_.size())) { // assert(false); LOG(INFO) << "AppendEntriesMsg Committing"; std::unique_ptr txnToCommit = nullptr; From 7d3c979bcedfb7a24a0e223070cf600c3b870d8c Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Thu, 20 Nov 2025 17:33:56 +0000 Subject: [PATCH 15/66] Added Heartbeat and Elections triggered by timeout --- .../execution/transaction_executor.cpp | 3 +- .../ordering/common/framework/consensus.cpp | 2 - .../common/framework/performance_manager.cpp | 1 - .../consensus/ordering/raft/algorithm/BUILD | 14 +- .../raft/algorithm/leaderelection_manager.cpp | 191 ++++++++++++++++++ .../raft/algorithm/leaderelection_manager.h | 80 ++++++++ .../ordering/raft/algorithm/raft.cpp | 104 +++++++++- .../consensus/ordering/raft/algorithm/raft.h | 28 ++- .../ordering/raft/framework/consensus.cpp | 11 +- .../ordering/raft/framework/consensus.h | 2 + .../ordering/raft/proto/proposal.proto | 40 ++++ platform/networkstrate/consensus_manager.cpp | 2 +- scripts/deploy/config/raft.config | 2 +- 13 files changed, 455 insertions(+), 25 deletions(-) create mode 100644 platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp create mode 100644 platform/consensus/ordering/raft/algorithm/leaderelection_manager.h diff --git a/platform/consensus/execution/transaction_executor.cpp b/platform/consensus/execution/transaction_executor.cpp index a62e55f590..a5489da8ec 100644 --- a/platform/consensus/execution/transaction_executor.cpp +++ b/platform/consensus/execution/transaction_executor.cpp @@ -178,7 +178,6 @@ void TransactionExecutor::OrderMessage() { // << " next seq:" << next_execute_seq_; continue; } - AddNewData(std::move(message)); } @@ -325,7 +324,7 @@ void TransactionExecutor::Execute(std::unique_ptr request, } // LOG(ERROR)<<" CF = :"<<(cf==1)<<" 
uid:"<AddExecuted(batch_request_p->hash(), batch_request_p->seq()); } diff --git a/platform/consensus/ordering/common/framework/consensus.cpp b/platform/consensus/ordering/common/framework/consensus.cpp index 93e00cc848..568a00ef06 100644 --- a/platform/consensus/ordering/common/framework/consensus.cpp +++ b/platform/consensus/ordering/common/framework/consensus.cpp @@ -38,8 +38,6 @@ Consensus::Consensus(const ResDBConfig& config, ResponseMsg(*resp_msg); }, nullptr, std::move(executor))) { - LOG(INFO) << "is running is performance mode:" - << config_.IsPerformanceRunning(); is_stop_ = false; global_stats_ = Stats::GetGlobalStats(); } diff --git a/platform/consensus/ordering/common/framework/performance_manager.cpp b/platform/consensus/ordering/common/framework/performance_manager.cpp index c07088f1f9..101b00af06 100644 --- a/platform/consensus/ordering/common/framework/performance_manager.cpp +++ b/platform/consensus/ordering/common/framework/performance_manager.cpp @@ -93,7 +93,6 @@ int PerformanceManager::StartEval() { eval_ready_promise_.set_value(true); } } - LOG(WARNING) << "start eval done"; return 0; } diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD index 59903f16ad..d59c03bf2e 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -20,8 +20,14 @@ package(default_visibility = ["//platform/consensus/ordering/raft:__subpackages_ cc_library( name = "raft", - srcs = ["raft.cpp"], - hdrs = ["raft.h"], + srcs = [ + "raft.cpp", + "leaderelection_manager.cpp", + ], + hdrs = [ + "raft.h", + "leaderelection_manager.h", + ], deps = [ "//common:comm", "//common/crypto:signature_verifier", @@ -29,5 +35,9 @@ cc_library( "//platform/consensus/ordering/common/algorithm:protocol_base", "//platform/consensus/ordering/raft/proto:proposal_cc_proto", "//platform/statistic:stats", + "//platform/config:resdb_config", + 
"//platform/consensus/execution:system_info", + "//platform/networkstrate:replica_communicator", + "//platform/proto:viewchange_message_cc_proto", ], ) diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp new file mode 100644 index 0000000000..1cac636c03 --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" +#include "platform/consensus/ordering/raft/algorithm/raft.h" +#include + +#include "common/utils/utils.h" +#include "platform/proto/viewchange_message.pb.h" + +namespace resdb { +namespace raft { + +// A manager to address View change process. +// All stuff here will be addressed in sequential by using mutex +// to make things simplier. 
+LeaderElectionManager::LeaderElectionManager(const ResDBConfig& config) + : config_(config), + raft_(nullptr), + started_(false), + stop_(false), + timeout_min_ms(150), + timeout_max_ms(300), + heartbeat_timer_(50), + heartbeat_count_(0), + role_epoch_(0), + known_role_epoch_(0) { + global_stats_ = Stats::GetGlobalStats(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager constructor"; +} + +LeaderElectionManager::~LeaderElectionManager() { + stop_.store(true); + cv_.notify_all(); + + if (server_checking_timeout_thread_.joinable()) { + server_checking_timeout_thread_.join(); + } +} + +void LeaderElectionManager::MayStart() { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager MayStart"; + bool expected = false; + if (!started_.compare_exchange_strong(expected, true)) { + return; + } + + if (config_.GetPublicKeyCertificateInfo() + .public_key() + .public_key_info() + .type() == CertificateKeyInfo::CLIENT) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager MayStart, Client conditional"; + LOG(ERROR) << "client type not process view change"; + return; + } + + if (config_.GetConfigData().enable_viewchange()) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager MayStart, viewchange is enabled"; + server_checking_timeout_thread_ = + std::thread(&LeaderElectionManager::MonitoringElectionTimeout, this); + } +} + +void LeaderElectionManager::SetRaft(raft::Raft* raft) { + raft_ = raft; +} + +void LeaderElectionManager::OnHeartBeat() { + { + LOG(INFO) << "JIM -> " << __FUNCTION__; + std::lock_guard lk(cv_mutex_); + heartbeat_count_++; + } + cv_.notify_all(); +} + +void LeaderElectionManager::OnRoleChange() { + { + LOG(INFO) << "JIM -> " << __FUNCTION__; + std::lock_guard lk(cv_mutex_); + role_epoch_++; + } + cv_.notify_all(); +} + +uint64_t LeaderElectionManager::RandomInt(uint64_t min, uint64_t max) { + static thread_local std::mt19937_64 gen(std::random_device{}()); + 
std::uniform_int_distribution dist(min, max); + return dist(gen); +} + +Waited LeaderElectionManager::LeaderWait() { + LOG(INFO) << "JIM -> " << __FUNCTION__; + std::unique_lock lk(cv_mutex_); + if (known_role_epoch_ != role_epoch_) { + known_role_epoch_ = role_epoch_; + return Waited::ROLE_CHANGE; + } + cv_.wait_for(lk, std::chrono::milliseconds(heartbeat_timer_), + [this] { + return (stop_.load() == true + || (known_role_epoch_ != role_epoch_)); + }); + if (stop_.load() == true) { return Waited::STOPPED; } + else if (known_role_epoch_ != role_epoch_) { + known_role_epoch_ = role_epoch_; + return Waited::ROLE_CHANGE; + } + else { return Waited::TIMEOUT; } +} + +Waited LeaderElectionManager::Wait() { + LOG(INFO) << "JIM -> " << __FUNCTION__; + const uint64_t timeout_ms = RandomInt(timeout_min_ms, timeout_max_ms); + timeout_ms_ = timeout_ms; + std::unique_lock lk(cv_mutex_); + const uint64_t heartbeat_snapshot = heartbeat_count_; + if (known_role_epoch_ != role_epoch_) { + known_role_epoch_ = role_epoch_; + return Waited::ROLE_CHANGE; + } + cv_.wait_for(lk, std::chrono::milliseconds(timeout_ms), + [this, heartbeat_snapshot] { + return (stop_.load() == true + || (heartbeat_snapshot != heartbeat_count_) + || (known_role_epoch_ != role_epoch_)); + }); + if (stop_.load() == true) { return Waited::STOPPED; } + else if (known_role_epoch_ != role_epoch_) { + known_role_epoch_ = role_epoch_; + return Waited::ROLE_CHANGE; + } + else if (heartbeat_snapshot != heartbeat_count_) { return Waited::HEARTBEAT; } + else { return Waited::TIMEOUT; } +} + +// Function that is run in server_checking_timeout_thread started in MayStart(). +// Causes leaders to Heartbeat. +// Causes followers and candidates to start an election if no heartbeat received. 
+void LeaderElectionManager::MonitoringElectionTimeout() { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": thread entered the function"; + while (!stop_.load()) { + raft::Role role = raft_->GetRoleSnapshot(); + Waited res; + if (role == raft::Role::LEADER) { res = LeaderWait(); } + else { res = Wait(); } + + if (res == Waited::STOPPED) { break; } + else if (res == Waited::ROLE_CHANGE) { + LOG(INFO) << __FUNCTION__ << ": Role change detected"; + continue; + } + else if (res == Waited::HEARTBEAT) { + LOG(INFO) << __FUNCTION__ << ": Heartbeat received within " << timeout_ms_ << " ms"; + if (raft_->GetRoleSnapshot() == raft::Role::LEADER) { + // A leader receiving a heartbeat would be unusual but not impossible. + LOG(WARNING) << __FUNCTION__ << " Received Heartbeat as LEADER"; + } + continue; + } + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in timeout section"; + // Only gets here if timeout expired. + // Leaders send a new heartbeat. + if (raft_->GetRoleSnapshot() == raft::Role::LEADER) { + raft_->SendHeartBeat(); + } + // Followers and Candidates start an election. + else { + raft_->StartElection(); + } + } +} + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h new file mode 100644 index 0000000000..7acd5b9dda --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include "platform/config/resdb_config.h" +#include "platform/consensus/execution/system_info.h" +#include "platform/proto/viewchange_message.pb.h" +#include "platform/statistic/stats.h" + +namespace resdb { +namespace raft { + +class Raft; // forward declaration + +enum class Waited { + HEARTBEAT, + STOPPED, + TIMEOUT, + ROLE_CHANGE +}; + +class LeaderElectionManager { + public: + LeaderElectionManager(const ResDBConfig& config); + virtual ~LeaderElectionManager(); + + // If the monitor is not running, start to monitor. + void MayStart(); + void SetRaft(raft::Raft*); + void OnHeartBeat(); + void OnRoleChange(); + + private: + Waited LeaderWait(); + Waited Wait(); + void MonitoringElectionTimeout(); + uint64_t RandomInt(uint64_t min, uint64_t max); + + + protected: + ResDBConfig config_; + Stats* global_stats_; + raft::Raft* raft_; + std::map> viewchange_request_; + std::atomic started_; + std::atomic stop_; + std::thread server_checking_timeout_thread_; + uint64_t timeout_ms_; + uint64_t timeout_min_ms; + uint64_t timeout_max_ms; + uint64_t heartbeat_timer_; + uint64_t heartbeat_count_; // Protected by cv_mutex_ + uint64_t role_epoch_; // Protected by cv_mutex_ + uint64_t known_role_epoch_; // Protected by cv_mutex_ + std::mutex cv_mutex_; + std::condition_variable cv_; +}; + +} // namespace raft +} // namespace resdb + diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index cc92bb5624..bcf7c46cf8 100644 --- 
a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -23,12 +23,17 @@ #include "common/crypto/signature_verifier.h" #include "common/utils/utils.h" +#include "platform/proto/resdb.pb.h" namespace resdb { namespace raft { -Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier) - : ProtocolBase(id, f, total_num), verifier_(verifier) { +Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, + LeaderElectionManager* leaderelection_manager) + : ProtocolBase(id, f, total_num), + verifier_(verifier), + leader_election_manager_(leaderelection_manager), + role_(raft::Role::FOLLOWER) { LOG(ERROR) << "get proposal graph"; id_ = id; total_num_ = total_num; @@ -56,8 +61,8 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { int64_t seq = txn->seq(); int proposer = txn->proposer(); { - std::unique_lock lk(mutex_); - data_[txn->hash()] = std::move(txn); + std::unique_lock lk(raft_mutex_); + log_[txn->hash()] = std::move(txn); } Proposal proposal; @@ -71,21 +76,104 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { bool Raft::ReceivePrepare(std::unique_ptr proposal) { std::unique_ptr txn = nullptr; { - std::unique_lock lk(mutex_); + std::unique_lock lk(raft_mutex_); received_[proposal->hash()].insert(proposal->proposer()); - auto it = data_.find(proposal->hash()); - if (it != data_.end()) { + auto it = log_.find(proposal->hash()); + if (it != log_.end()) { if (received_[proposal->hash()].size() >= 2 * f_ + 1) { txn = std::move(it->second); - data_.erase(it); + log_.erase(it); } } } if (txn != nullptr) { commit_(*txn); } + else { + } return true; } +raft::Role Raft::GetRoleSnapshot() const { + std::lock_guard lk(raft_mutex_); + return role_; +} + +// TODO SET LASTLOGINDEX AND LASTLOGTERM UPON MERGE +// Called from LeaderElectionManager::StartElection when timeout +void Raft::StartElection() { + LOG(INFO) << "JIM -> " << __FUNCTION__; + uint64_t currentTerm; + int candidateId; + uint64_t 
lastLogIndex; + uint64_t lastLogTerm; + bool roleChanged = false; + + { + std::lock_guard lk(raft_mutex_); + if (role_ == raft::Role::LEADER) { + LOG(WARNING) << __FUNCTION__ << ": Leader tried to start election"; + return; + } + if (role_ == raft::Role::FOLLOWER) { + LOG(INFO) << __FUNCTION__ << ": FOLLOWER->CANDIDATE"; + role_ = raft::Role::CANDIDATE; + roleChanged = true; + } + currentTerm_++; + votedFor_ = id_; + + currentTerm = currentTerm_; + candidateId = id_; + + // TODO + lastLogIndex = 0; + lastLogTerm = 0; + } + if (roleChanged) { + leader_election_manager_->OnRoleChange(); + } + + RequestVote requestVote; + requestVote.set_term(currentTerm); + requestVote.set_candidateid(candidateId); + requestVote.set_lastlogindex(lastLogIndex); + requestVote.set_lastlogterm(lastLogTerm); + Broadcast(MessageType::RequestVoteMsg, requestVote); +} + +// TODO +// ON MERGE FIX VALUES +void Raft::SendHeartBeat() { + LOG(INFO) << "JIM -> " << __FUNCTION__; + uint64_t currentTerm; + int leaderId = id_; + uint64_t prevLogIndex; + uint64_t prevLogTerm; + std::string entries; + uint64_t leaderCommit; + { + std::lock_guard lk(raft_mutex_); + if (role_ != raft::Role::LEADER) { + LOG(WARNING) << __FUNCTION__ << ": Non-Leader tried to start HeartBeat"; + return; + } + currentTerm = currentTerm_; + prevLogIndex = 0; + prevLogTerm = 0; + entries = ""; + leaderCommit = 0; + } + AppendEntries appendEntries; + appendEntries.set_term(currentTerm); + appendEntries.set_proposer(leaderId); + appendEntries.set_leadercommitindex(prevLogIndex); // wrong function + appendEntries.set_prevlogterm(prevLogTerm); + appendEntries.set_data(entries); + appendEntries.set_leadercommitindex(leaderCommit); + // TODO Need to make sure leader no-ops their own heartbeats + Broadcast(MessageType::AppendEntriesMsg, appendEntries); +} + } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 
7406feb774..882492cfd2 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -28,31 +28,49 @@ #include "platform/consensus/ordering/common/algorithm/protocol_base.h" #include "platform/consensus/ordering/raft/proto/proposal.pb.h" #include "platform/statistic/stats.h" +#include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" namespace resdb { namespace raft { +enum class Role { FOLLOWER, CANDIDATE, LEADER }; + class Raft : public common::ProtocolBase { public: - Raft(int id, int f, int total_num, SignatureVerifier* verifier); + Raft(int id, int f, int total_num, + SignatureVerifier* verifier, + LeaderElectionManager* leaderelection_manager); ~Raft(); bool ReceiveTransaction(std::unique_ptr txn); bool ReceivePropose(std::unique_ptr txn); bool ReceivePrepare(std::unique_ptr proposal); + raft::Role GetRoleSnapshot() const; + void StartElection(); + void SendHeartBeat(); + private: bool IsStop(); private: - std::mutex mutex_; - std::map > received_; - std::map > data_; - int64_t seq_; bool is_stop_; SignatureVerifier* verifier_; + LeaderElectionManager* leader_election_manager_; Stats* global_stats_; + std::map > received_; + + mutable std::mutex raft_mutex_; + Role role_; // Protected by raft_mutex_ + uint64_t currentTerm_; // Protected by raft_mutex_ + int votedFor_; // Protected by raft_mutex_ + std::map> log_; // Protected by raft_mutex_ + uint64_t commit_index_; // Protected by raft_mutex_ + uint64_t last_applied_; // Protected by raft_mutex_ + std::map next_index_; // Protected by raft_mutex_ + std::map match_index_; // Protected by raft_mutex_ + int LeaderId; // Protected by raft_mutex_ }; } // namespace raft diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 1c6a6635ed..4a9fa69c86 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ 
b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -29,7 +29,9 @@ namespace raft { Consensus::Consensus(const ResDBConfig& config, std::unique_ptr executor) - : common::Consensus(config, std::move(executor)) { + : common::Consensus(config, std::move(executor)), + leader_election_manager_(std::make_unique(config_)) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": In consensus constructor"; int total_replicas = config_.GetReplicaNum(); int f = (total_replicas - 1) / 3; @@ -42,7 +44,11 @@ Consensus::Consensus(const ResDBConfig& config, .public_key_info() .type() != CertificateKeyInfo::CLIENT) { raft_ = std::make_unique(config_.GetSelfInfo().id(), f, total_replicas, - GetSignatureVerifier()); + GetSignatureVerifier(), leader_election_manager_.get()); + + leader_election_manager_->SetRaft(raft_.get()); + leader_election_manager_->MayStart(); + InitProtocol(raft_.get()); } } @@ -89,7 +95,6 @@ int Consensus::CommitMsgInternal(const Transaction& txn) { request->set_seq(txn.seq()); request->set_uid(txn.uid()); request->set_proxy_id(txn.proxy_id()); - transaction_executor_->Commit(std::move(request)); return 0; } diff --git a/platform/consensus/ordering/raft/framework/consensus.h b/platform/consensus/ordering/raft/framework/consensus.h index bfff56b4f5..d6216c067c 100644 --- a/platform/consensus/ordering/raft/framework/consensus.h +++ b/platform/consensus/ordering/raft/framework/consensus.h @@ -22,6 +22,7 @@ #include "executor/common/transaction_manager.h" #include "platform/consensus/ordering/common/framework/consensus.h" #include "platform/consensus/ordering/raft/algorithm/raft.h" +#include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" #include "platform/networkstrate/consensus_manager.h" namespace resdb { @@ -43,6 +44,7 @@ class Consensus : public common::Consensus { protected: std::unique_ptr raft_; + std::unique_ptr leader_election_manager_; Stats* global_stats_; int64_t start_; std::mutex mutex_; diff --git 
a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index 70d8559812..09dedb8523 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -38,9 +38,49 @@ message Proposal { int64 seq =3 ; } +message AppendEntries{ + int32 id = 1; + bytes data = 2; // this can maybe work as entries but maybe not? + bytes hash = 3; + int32 proxy_id = 4; + int32 proposer = 5; + int64 uid = 6; + int64 create_time = 7; + int64 seq = 9; + int32 prevLogTerm = 10; // term of the most recent log (term corresponding to seq) + int64 leaderCommitIndex = 11; // leader sends out highest seq that is committed + int32 term = 12; // This should be a term for each entry, but assuming no failure at first +} + +message AppendEntriesResponse { + int32 term = 1; + bool success = 2; + int32 id = 3; + int32 lastApplied = 4; + int32 nextEntry = 5; + bytes hash = 6; + int64 seq = 7; +} + +message RequestVote { + int64 term = 1; + int32 candidateId = 2; + int64 lastLogIndex = 3; + int64 lastLogTerm = 4; +} + +message RequestVoteResponse { + int64 term = 1; + bool voteGranted = 2; +} + enum MessageType { None = 0; Propose = 1; Prepare = 2; + AppendEntriesMsg = 3; + AppendEntriesResponseMsg = 4; + RequestVoteMsg = 5; + RequestVoteResponseMsg = 6; } diff --git a/platform/networkstrate/consensus_manager.cpp b/platform/networkstrate/consensus_manager.cpp index b3fb106253..7ee0eef637 100644 --- a/platform/networkstrate/consensus_manager.cpp +++ b/platform/networkstrate/consensus_manager.cpp @@ -22,6 +22,7 @@ #include #include +#include "glog/logging.h" #include "platform/proto/broadcast.pb.h" namespace resdb { @@ -85,7 +86,6 @@ void ConsensusManager::Start() { // Keep Boardcast the public keys to others. 
void ConsensusManager::HeartBeat() { - LOG(INFO) << "heart beat start"; int sleep_time = 1; std::mutex mutex; std::condition_variable cv; diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index c5092a94c8..e11b11efb2 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -1,6 +1,6 @@ { "clientBatchNum": 100, - "enable_viewchange": false, + "enable_viewchange": true, "recovery_enabled": false, "max_client_complaint_num":10, "max_process_txn": 32, From df51c82458761840384f660d2e338eed867417b8 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Thu, 20 Nov 2025 18:54:51 -0800 Subject: [PATCH 16/66] Update variable names to match the Raft paper --- .../ordering/raft/algorithm/raft.cpp | 87 +++++++++---------- .../consensus/ordering/raft/algorithm/raft.h | 6 +- .../ordering/raft/framework/consensus.cpp | 6 +- .../ordering/raft/proto/proposal.proto | 23 +++-- 4 files changed, 60 insertions(+), 62 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index c039dc7e0b..9f171647ba 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -44,24 +44,23 @@ static void printAppendEntries(const std::unique_ptr& txn) { } LOG(INFO) << "=== AppendEntries ==="; - LOG(INFO) << "id: " << txn->id(); LOG(INFO) << "term: " << txn->term(); - LOG(INFO) << "seq: " << txn->seq(); + LOG(INFO) << "prevLogIndex: " << txn->prevlogindex(); LOG(INFO) << "prevLogTerm: " << txn->prevlogterm(); LOG(INFO) << "leaderCommitIndex: " << txn->leadercommitindex(); LOG(INFO) << "proxy_id: " << txn->proxy_id(); - LOG(INFO) << "proposer: " << txn->proposer(); + LOG(INFO) << "leaderId: " << txn->leaderid(); LOG(INFO) << "uid: " << txn->uid(); LOG(INFO) << "create_time: " << txn->create_time(); // bytes fields (print as hex or limited string to avoid binary garbage) - const std::string& data = 
txn->data(); + const std::string& entries = txn->entries(); const std::string& hash = txn->hash(); - LOG(INFO) << "data size: " << data.size(); - if (!data.empty()) { - LOG(INFO) << "data (first 32 bytes): " - << data.substr(0, std::min(32, data.size())); + LOG(INFO) << "entries size: " << entries.size(); + if (!entries.empty()) { + LOG(INFO) << "entries (first 32 bytes): " + << entries.substr(0, std::min(32, entries.size())); } LOG(INFO) << "hash size: " << hash.size(); @@ -84,7 +83,7 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, total_num_ = total_num; f_ = f; is_stop_ = false; - seq_ = 0; + prevLogIndex_ = 0; currentTerm_ = 0; votedFor_ = -1; commitIndex_ = 0; @@ -110,15 +109,15 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { // LOG(INFO)<<"recv txn:"; LOG(INFO) << "Received Transaction to primary id: " << id_; - LOG(INFO) << "seq: " << seq_; + LOG(INFO) << "prevLogIndex: " << prevLogIndex_; txn->set_create_time(GetCurrentTime()); - txn->set_seq(seq_++); - txn->set_proposer(id_); + txn->set_prevlogindex(prevLogIndex_++); + txn->set_leaderid(id_); // For now just set this to currentTerm_, but is wrong if it just became leader txn->set_prevlogterm(currentTerm_); - // leader sends out highest seq that is committed + // leader sends out highest prevLogIndex that is committed txn->set_leadercommitindex(commitIndex_); // This should be a term for each entry, but assuming no failure at first @@ -133,33 +132,33 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { bool Raft::ReceivePropose(std::unique_ptr txn) { Dump(); - auto leader_id = txn->proposer(); + auto leader_id = txn->leaderid(); auto leaderCommit = txn->leadercommitindex(); LOG(INFO) << "Received AppendEntries to replica id: " << id_; - LOG(INFO) << "static_cast(data_.size()): " << static_cast(data_.size()); + LOG(INFO) << "static_cast(log_.size()): " << static_cast(log_.size()); printAppendEntries(txn); AppendEntriesResponse appendEntriesResponse; 
appendEntriesResponse.set_term(currentTerm_); appendEntriesResponse.set_id(id_); appendEntriesResponse.set_lastapplied(lastApplied_); - appendEntriesResponse.set_nextentry(data_.size()); + appendEntriesResponse.set_nextentry(log_.size()); if (txn->term() < currentTerm_) { LOG(INFO) << "AppendEntriesMsg Fail1"; appendEntriesResponse.set_success(false); SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); return true; } - auto prevSeq = txn->seq() - 1; + auto prevprevLogIndex = txn->prevlogindex() - 1; // This should be the same as checking if it has an entry // with this prevLogIndex and term - if (prevSeq != 0 && prevSeq > static_cast(dataIndexMapping_.size()) && - (prevSeq >= static_cast(dataIndexMapping_.size()) || - txn->prevlogterm() != data_[dataIndexMapping_[prevSeq]]->term())) { + if (prevprevLogIndex != 0 && prevprevLogIndex > static_cast(logIndexMapping_.size()) && + (prevprevLogIndex >= static_cast(logIndexMapping_.size()) || + txn->prevlogterm() != log_[logIndexMapping_[prevprevLogIndex]]->term())) { LOG(INFO) << "AppendEntriesMsg Fail2"; - LOG(INFO) << "prevSeq: " << prevSeq << " data size: " << static_cast(data_.size()); - if (prevSeq < static_cast(dataIndexMapping_.size())){ + LOG(INFO) << "prevprevLogIndex: " << prevprevLogIndex << " entries size: " << static_cast(log_.size()); + if (prevprevLogIndex < static_cast(logIndexMapping_.size())){ LOG(INFO) << "txn->prevlogterm(): " << txn->prevlogterm() - << " last entry term: " << data_[dataIndexMapping_[prevSeq]]->term(); + << " last entry term: " << log_[logIndexMapping_[prevprevLogIndex]]->term(); } appendEntriesResponse.set_success(false); SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); @@ -169,43 +168,43 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { // delete that entry and all after it LOG(INFO) << "Before AppendEntriesMsg Added to Log"; std::string hash = txn->hash(); - int64_t seq = txn->seq(); + int64_t prevLogIndex = 
txn->prevlogindex(); { std::unique_lock lk(mutex_); std::string hash = txn->hash(); - LOG(INFO) << "Before adding to data"; - data_[txn->hash()] = std::move(txn); - LOG(INFO) << "After adding to data"; - dataIndexMapping_.push_back(hash); + LOG(INFO) << "Before adding to entries"; + log_[txn->hash()] = std::move(txn); + LOG(INFO) << "After adding to entries"; + logIndexMapping_.push_back(hash); } LOG(INFO) << "AppendEntriesMsg Added to Log"; LOG(INFO) << "leaderCommit: " << leaderCommit; LOG(INFO) << "commitIndex_: " << commitIndex_; LOG(INFO) << "lastApplied_: " << lastApplied_; - LOG(INFO) << "static_cast(data_.size()): " << static_cast(data_.size()); + LOG(INFO) << "static_cast(log_.size()): " << static_cast(log_.size()); LOG(INFO) << "leaderCommit > commitIndex_: " << (leaderCommit > commitIndex_ ? "true" : "false"); - LOG(INFO) << "lealastApplied_ + 1 <= static_cast(data_.size()) " << ((lastApplied_ + 1 <= static_cast(data_.size())) ? "true" : "false"); - while ((leaderCommit != 0) && leaderCommit > lastApplied_ && lastApplied_ + 1 <= static_cast(data_.size())) { + LOG(INFO) << "lastApplied_ + 1 <= static_cast(log_.size()) " << ((lastApplied_ + 1 <= static_cast(log_.size())) ? 
"true" : "false"); + while (leaderCommit > lastApplied_ && lastApplied_ + 1 <= static_cast(log_.size())) { // assert(false); LOG(INFO) << "AppendEntriesMsg Committing"; std::unique_ptr txnToCommit = nullptr; - txnToCommit = std::move(data_[dataIndexMapping_[lastApplied_]]); + txnToCommit = std::move(log_[logIndexMapping_[lastApplied_]]); commit_(*txnToCommit); lastApplied_++; } LOG(INFO) << "before commit index check"; // I don't quite know if this needs to be conditional, but that's how the paper says it if (leaderCommit > commitIndex_) - // not 100% certain if this second variable should be seq - commitIndex_ = std::min(leaderCommit, seq); + // not 100% certain if this second variable should be prevLogIndex + commitIndex_ = std::min(leaderCommit, prevLogIndex); LOG(INFO) << "after commit index check"; appendEntriesResponse.set_lastapplied(lastApplied_); - appendEntriesResponse.set_nextentry(data_.size()); + appendEntriesResponse.set_nextentry(log_.size()); appendEntriesResponse.set_success(true); appendEntriesResponse.set_hash(hash); - appendEntriesResponse.set_seq(seq); + appendEntriesResponse.set_prevlogindex(prevLogIndex); LOG(INFO) << "Leader_id: " << leader_id; SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); // Broadcast(MessageType::AppendEntriesResponseMsg, appendEntriesResponse); @@ -222,18 +221,18 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr r { std::unique_lock lk(mutex_); received_[response->hash()].insert(response->id()); - auto it = data_.find(response->hash()); - if (it != data_.end()) { - LOG(INFO) << "Transaction: " << response->seq() << " has gotten " << received_[response->hash()].size() << " responses"; + auto it = log_.find(response->hash()); + if (it != log_.end()) { + LOG(INFO) << "Transaction: " << response->prevlogindex() << " has gotten " << received_[response->hash()].size() << " responses"; if (static_cast(received_[response->hash()].size()) >= f_ + 1) { - commitIndex_ = 
response->seq(); + commitIndex_ = response->prevlogindex(); // pretty sure this should always be in order with no gaps - while (lastApplied_ + 1 <= static_cast(data_.size()) && + while (lastApplied_ + 1 <= static_cast(log_.size()) && lastApplied_ <= commitIndex_) { LOG(INFO) << "Leader Committing"; std::unique_ptr txnToCommit = nullptr; - txnToCommit = std::move(data_[dataIndexMapping_[lastApplied_]]); + txnToCommit = std::move(log_[logIndexMapping_[lastApplied_]]); commit_(*txnToCommit); lastApplied_++; } @@ -326,10 +325,10 @@ void Raft::SendHeartBeat() { } AppendEntries appendEntries; appendEntries.set_term(currentTerm); - appendEntries.set_proposer(leaderId); + appendEntries.set_leaderid(leaderId); appendEntries.set_leadercommitindex(prevLogIndex); // wrong function appendEntries.set_prevlogterm(prevLogTerm); - appendEntries.set_data(entries); + appendEntries.set_entries(entries); appendEntries.set_leadercommitindex(leaderCommit); // TODO Need to make sure leader no-ops their own heartbeats Broadcast(MessageType::AppendEntriesMsg, appendEntries); diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 5e067b5f79..c97a2ee212 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -57,9 +57,9 @@ class Raft : public common::ProtocolBase { private: mutable std::mutex mutex_; std::map > received_; - std::map > data_; // log[] + std::map > log_; // log[] - std::vector dataIndexMapping_; + std::vector logIndexMapping_; // This is for everyone // Most recent term it has seen @@ -82,7 +82,7 @@ class Raft : public common::ProtocolBase { Role role_; // Protected by raft_mutex_ int LeaderId; // Protected by raft_mutex_ - int64_t seq_; + int64_t prevLogIndex_; bool is_stop_; SignatureVerifier* verifier_; LeaderElectionManager* leader_election_manager_; diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp 
b/platform/consensus/ordering/raft/framework/consensus.cpp index 4b43de3835..9853f92a9e 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -80,7 +80,7 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { int Consensus::ProcessNewTransaction(std::unique_ptr request) { std::unique_ptr txn = std::make_unique(); - txn->set_data(request->data()); + txn->set_entries(request->data()); txn->set_hash(request->hash()); txn->set_proxy_id(request->proxy_id()); txn->set_uid(request->uid()); @@ -93,8 +93,8 @@ int Consensus::CommitMsg(const google::protobuf::Message& msg) { int Consensus::CommitMsgInternal(const AppendEntries& txn) { std::unique_ptr request = std::make_unique(); - request->set_data(txn.data()); - request->set_seq(txn.seq()); + request->set_data(txn.entries()); + request->set_seq(txn.prevlogindex()); request->set_uid(txn.uid()); request->set_proxy_id(txn.proxy_id()); transaction_executor_->Commit(std::move(request)); diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index 0c3097b060..724c30511f 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -22,17 +22,16 @@ syntax = "proto3"; package resdb.raft; message AppendEntries{ - int32 id = 1; - bytes data = 2; // this can maybe work as entries but maybe not? - bytes hash = 3; - int32 proxy_id = 4; - int32 proposer = 5; - int64 uid = 6; - int64 create_time = 7; - int64 seq = 9; - int32 prevLogTerm = 10; // term of the most recent log (term corresponding to seq) - int64 leaderCommitIndex = 11; // leader sends out highest seq that is committed - int32 term = 12; // This should be a term for each entry, but assuming no failure at first + bytes entries = 1; // this can maybe work as entries but maybe not? 
+ bytes hash = 2; + int32 proxy_id = 3; + int32 leaderId = 4; + int64 uid = 5; + int64 create_time = 6; + int64 prevLogIndex = 7; + int32 prevLogTerm = 8; // term of the most recent log (term corresponding to seq) + int64 leaderCommitIndex = 9; // leader sends out highest seq that is committed + int32 term = 10; // This should be a term for each entry, but assuming no failure at first } message AppendEntriesResponse { @@ -42,7 +41,7 @@ message AppendEntriesResponse { int32 lastApplied = 4; int32 nextEntry = 5; bytes hash = 6; - int64 seq = 7; + int64 prevLogIndex = 7; } message RequestVote { From 3f15848d2c4eee3d326fb91200069688755bcad2 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Fri, 21 Nov 2025 02:57:58 +0000 Subject: [PATCH 17/66] in progress requestVote and requestVoteResponse handlers --- .../ordering/raft/algorithm/raft.cpp | 80 ++++++++++++++++++- .../consensus/ordering/raft/algorithm/raft.h | 15 ++-- 2 files changed, 87 insertions(+), 8 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index c039dc7e0b..b557466b3f 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -23,6 +23,7 @@ #include "common/crypto/signature_verifier.h" #include "common/utils/utils.h" +#include "platform/consensus/ordering/raft/proto/proposal.pb.h" #include "platform/proto/resdb.pb.h" namespace resdb { @@ -76,9 +77,9 @@ static void printAppendEntries(const std::unique_ptr& txn) { Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, LeaderElectionManager* leaderelection_manager) : ProtocolBase(id, f, total_num), + role_(raft::Role::FOLLOWER), verifier_(verifier), - leader_election_manager_(leaderelection_manager), - role_(raft::Role::FOLLOWER) { + leader_election_manager_(leaderelection_manager) { LOG(ERROR) << "get proposal graph"; id_ = id; total_num_ = total_num; @@ -89,6 +90,10 @@ Raft::Raft(int id, int f, 
int total_num, SignatureVerifier* verifier, votedFor_ = -1; commitIndex_ = 0; lastApplied_ = 0; + + AppendEntries ae = + + dataIndexMapping_[0] nextIndex_.assign(total_num_ + 1, 0); matchIndex_.assign(total_num_ + 1, 0); } @@ -132,6 +137,10 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { } bool Raft::ReceivePropose(std::unique_ptr txn) { + if (txn->proposer() == id_) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": discarding message from self"; + return false; + } Dump(); auto leader_id = txn->proposer(); auto leaderCommit = txn->leadercommitindex(); @@ -207,6 +216,7 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { appendEntriesResponse.set_hash(hash); appendEntriesResponse.set_seq(seq); LOG(INFO) << "Leader_id: " << leader_id; + leader_election_manager_->OnHeartBeat(); SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); // Broadcast(MessageType::AppendEntriesResponseMsg, appendEntriesResponse); return true; @@ -254,6 +264,72 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr r return true; } + +void Raft::ReceiveRequestVote(std::unique_ptr rv) { + int rvSender = rv->candidateid(); + uint64_t rvTerm = rv->term(); + + uint64_t term; + bool voteGranted = false; + bool roleChanged = false; + + if (rvSender == id_) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": discarding message from self"; + return; + } + + [&]() { + std::lock_guard lk(mutex_); + // If their term is higher than ours, we accept new term, reset votedFor + // and convert to follower + if (rvTerm > currentTerm_) { + currentTerm_ = rvTerm; + votedFor_ = -1; + if (role_ != raft::Role::FOLLOWER) { + role_ = raft::Role::FOLLOWER; + roleChanged = true; + } + } +/* +Raft determines which of two logs is more up-to-date +by comparing the index and term of the last entries in the +logs. If the logs have last entries with different terms, then +the log with the later term is more up-to-date. 
If the logs +end with the same term, then whichever log is longer is +more up-to-date +*/ + // Then we continue voting process + term = currentTerm_; + uint64_t lastLogTerm = data_[dataIndexMapping_.back()]->term(); + if (rvTerm < currentTerm_) { return; } + if (rv->lastlogterm() < lastLogTerm) { return; } + if (rv->lastlogterm() == lastLogTerm + && rv->lastlogindex() < dataIndexMapping_.size() - 1) { return; } + if (votedFor_ == -1 || votedFor_ == rvSender) { + votedFor_ = rvSender; + voteGranted = true; + } + }(); + + if (roleChanged) { + leader_election_manager_->OnRoleChange(); + } + + if (voteGranted) { + leader_election_manager_->OnHeartBeat(); + } + + RequestVoteResponse rvr; + rvr.set_term(term); + rvr.set_votegranted(voteGranted); + SendMessage(MessageType::RequestVoteResponseMsg, rvr, rvSender); +} + +void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) { + + +} + raft::Role Raft::GetRoleSnapshot() const { std::lock_guard lk(mutex_); return role_; diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 5e067b5f79..7d36950eb0 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -45,6 +45,9 @@ class Raft : public common::ProtocolBase { bool ReceiveTransaction(std::unique_ptr txn); bool ReceivePropose(std::unique_ptr txn); bool ReceiveAppendEntriesResponse(std::unique_ptr response); + void ReceiveRequestVote(std::unique_ptr rv); + void ReceiveRequestVoteResponse(std::unique_ptr rvr); + raft::Role GetRoleSnapshot() const; void StartElection(); @@ -63,22 +66,22 @@ class Raft : public common::ProtocolBase { // This is for everyone // Most recent term it has seen - int currentTerm_; + int currentTerm_; // Protected by raft_mutex_ // Id for vote in current Term - int votedFor_; + int votedFor_; // Protected by raft_mutex_ // Volatile on all servers // Index of highest log entry it knows to be committed - int64_t commitIndex_; 
+ int64_t commitIndex_; // Protected by raft_mutex_ // Index of highest log entry executed - int64_t lastApplied_; + int64_t lastApplied_; // Protected by raft_mutex_ // Only for leaders // This keeps track of the next log entry to send to that replica // Initialized to last log index + 1 - std::vector nextIndex_; + std::vector nextIndex_; // Protected by raft_mutex_ // This keeps track of the highest log entry it knows is executed on that replica - std::vector matchIndex_; + std::vector matchIndex_; // Protected by raft_mutex_ Role role_; // Protected by raft_mutex_ int LeaderId; // Protected by raft_mutex_ From 8ae8d03856f470291145955519a676e92e7bda5c Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Fri, 21 Nov 2025 08:44:58 +0000 Subject: [PATCH 18/66] finished adding leader election logic, it seems buggy though. --- .../raft/algorithm/leaderelection_manager.cpp | 3 +- .../raft/algorithm/leaderelection_manager.h | 2 +- .../ordering/raft/algorithm/raft.cpp | 143 +++++++++++++----- .../consensus/ordering/raft/algorithm/raft.h | 6 + .../ordering/raft/framework/consensus.cpp | 21 +++ .../ordering/raft/proto/proposal.proto | 3 +- 6 files changed, 134 insertions(+), 44 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp index 1cac636c03..bb41a646e6 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp @@ -167,7 +167,7 @@ void LeaderElectionManager::MonitoringElectionTimeout() { continue; } else if (res == Waited::HEARTBEAT) { - LOG(INFO) << __FUNCTION__ << ": Heartbeat received within " << timeout_ms_ << " ms"; + LOG(INFO) << __FUNCTION__ << ": Heartbeat received within " << timeout_ms_.load() << " ms"; if (raft_->GetRoleSnapshot() == raft::Role::LEADER) { // A leader receiving a heartbeat would be unusual but not impossible. 
LOG(WARNING) << __FUNCTION__ << " Received Heartbeat as LEADER"; @@ -175,6 +175,7 @@ void LeaderElectionManager::MonitoringElectionTimeout() { continue; } LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in timeout section"; + LOG(INFO) << __FUNCTION__ << ": Heartbeat timed out after " << timeout_ms_.load() << " ms"; // Only gets here if timeout expired. // Leaders send a new heartbeat. if (raft_->GetRoleSnapshot() == raft::Role::LEADER) { diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h index 7acd5b9dda..b08c68e335 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h @@ -64,7 +64,7 @@ class LeaderElectionManager { std::atomic started_; std::atomic stop_; std::thread server_checking_timeout_thread_; - uint64_t timeout_ms_; + std::atomic timeout_ms_; uint64_t timeout_min_ms; uint64_t timeout_max_ms; uint64_t heartbeat_timer_; diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index da619bd4a1..e7a962191a 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -20,6 +20,9 @@ #include "platform/consensus/ordering/raft/algorithm/raft.h" #include +#include +#include +#include #include "common/crypto/signature_verifier.h" #include "common/utils/utils.h" @@ -90,9 +93,16 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, commitIndex_ = 0; lastApplied_ = 0; - AppendEntries ae = + AppendEntries ae; + ae.set_leaderid(0); + ae.set_prevlogindex(0); + ae.set_prevlogterm(0); + ae.set_leadercommitindex(0); + ae.set_term(0); + const std::string key = "7622832959"; + logIndexMapping_.push_back(key); + log_[key] = std::make_unique(ae); - dataIndexMapping_[0] nextIndex_.assign(total_num_ + 1, 0); matchIndex_.assign(total_num_ + 1, 
0); } @@ -136,7 +146,7 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { } bool Raft::ReceivePropose(std::unique_ptr txn) { - if (txn->proposer() == id_) { + if (txn->leaderid() == id_) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": discarding message from self"; return false; } @@ -265,6 +275,7 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr r void Raft::ReceiveRequestVote(std::unique_ptr rv) { + LOG(INFO) << "JIM -> " << __FUNCTION__; int rvSender = rv->candidateid(); uint64_t rvTerm = rv->term(); @@ -277,56 +288,72 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { return; } + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": before lock"; [&]() { std::lock_guard lk(mutex_); // If their term is higher than ours, we accept new term, reset votedFor // and convert to follower - if (rvTerm > currentTerm_) { - currentTerm_ = rvTerm; - votedFor_ = -1; - if (role_ != raft::Role::FOLLOWER) { - role_ = raft::Role::FOLLOWER; - roleChanged = true; - } + TermRelation tr = TermCheckLocked(rvTerm); + if (tr == TermRelation::STALE) { + term = currentTerm_; + return; } -/* -Raft determines which of two logs is more up-to-date -by comparing the index and term of the last entries in the -logs. If the logs have last entries with different terms, then -the log with the later term is more up-to-date. 
If the logs -end with the same term, then whichever log is longer is -more up-to-date -*/ + else if (tr == TermRelation::NEW) { roleChanged = DemoteSelfLocked(rvTerm); } // Then we continue voting process term = currentTerm_; - uint64_t lastLogTerm = data_[dataIndexMapping_.back()]->term(); - if (rvTerm < currentTerm_) { return; } + uint64_t lastLogTerm = getPrevLogTermLocked(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": prev terms at least equal"; if (rv->lastlogterm() < lastLogTerm) { return; } + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": prev log terms at least equal"; if (rv->lastlogterm() == lastLogTerm - && rv->lastlogindex() < dataIndexMapping_.size() - 1) { return; } + && rv->lastlogindex() < getPrevLogIndexLocked()) { return; } + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": candidate is valid"; if (votedFor_ == -1 || votedFor_ == rvSender) { votedFor_ = rvSender; voteGranted = true; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": voted for " << rvSender; } }(); - - if (roleChanged) { - leader_election_manager_->OnRoleChange(); - } - - if (voteGranted) { - leader_election_manager_->OnHeartBeat(); - } + if (roleChanged) { leader_election_manager_->OnRoleChange(); } + if (voteGranted) { leader_election_manager_->OnHeartBeat(); } RequestVoteResponse rvr; rvr.set_term(term); + rvr.set_voterid(id_); rvr.set_votegranted(voteGranted); SendMessage(MessageType::RequestVoteResponseMsg, rvr, rvSender); } void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) { + uint64_t term = rvr->term(); + int voterId = rvr->voterid(); + bool votedYes = rvr->votegranted(); + bool demoted = false; + bool elected = false; + int votesNeeded = total_num_ - f_; - + [&]() { + std::lock_guard lk(mutex_); + TermRelation tr = TermCheckLocked(term); + if (tr == TermRelation::STALE) { return; } + else if (tr == TermRelation::NEW) { + demoted = DemoteSelfLocked(term); + return; + } + if (role_ != Role::CANDIDATE) { return; } + if (!votedYes) { return; } + bool dupe = 
(std::find(votes_.begin(), votes_.end(), voterId) != votes_.end()); + if (dupe) { return; } + votes_.push_back(voterId); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Replica " << voterId << " voted for me. Votes: " << votes_.size() << "/" << votesNeeded; + if (votes_.size() >= votesNeeded) { + elected = true; + role_ = Role::LEADER; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": CANDIDATE -> LEADER"; + } + }(); + if (demoted || elected) { leader_election_manager_->OnRoleChange(); } + if (elected) { SendHeartBeat(); } } raft::Role Raft::GetRoleSnapshot() const { @@ -334,7 +361,6 @@ raft::Role Raft::GetRoleSnapshot() const { return role_; } -// TODO SET LASTLOGINDEX AND LASTLOGTERM UPON MERGE // Called from LeaderElectionManager::StartElection when timeout void Raft::StartElection() { LOG(INFO) << "JIM -> " << __FUNCTION__; @@ -357,13 +383,13 @@ void Raft::StartElection() { } currentTerm_++; votedFor_ = id_; + votes_.clear(); + votes_.push_back(id_); currentTerm = currentTerm_; candidateId = id_; - - // TODO - lastLogIndex = 0; - lastLogTerm = 0; + lastLogIndex = getPrevLogIndexLocked(); + lastLogTerm = getPrevLogTermLocked(); } if (roleChanged) { leader_election_manager_->OnRoleChange(); @@ -394,21 +420,56 @@ void Raft::SendHeartBeat() { return; } currentTerm = currentTerm_; - prevLogIndex = 0; - prevLogTerm = 0; + prevLogIndex = getPrevLogIndexLocked(); + prevLogTerm = getPrevLogTermLocked(); entries = ""; - leaderCommit = 0; + leaderCommit = 0; // TODO } AppendEntries appendEntries; appendEntries.set_term(currentTerm); appendEntries.set_leaderid(leaderId); - appendEntries.set_leadercommitindex(prevLogIndex); // wrong function + appendEntries.set_prevlogindex(prevLogIndex); appendEntries.set_prevlogterm(prevLogTerm); appendEntries.set_entries(entries); - appendEntries.set_leadercommitindex(leaderCommit); - // TODO Need to make sure leader no-ops their own heartbeats + appendEntries.set_leadercommitindex(leaderCommit); // TODO 
Broadcast(MessageType::AppendEntriesMsg, appendEntries); } +// requires raft mutex to be held +// returns true if demoted +bool Raft::DemoteSelfLocked(uint64_t term) { + currentTerm_ = term; + votedFor_ = -1; + if (role_ != raft::Role::FOLLOWER) { + role_ = raft::Role::FOLLOWER; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted to FOLLOWER"; + return true; + } + return false; +} + +// requires raft mutex to be held +TermRelation Raft::TermCheckLocked(uint64_t term) const { + if (term < currentTerm_) { return TermRelation::STALE; } + else if (term == currentTerm_) { return TermRelation::CURRENT; } + else { return TermRelation::NEW; } +} + +// requires raft mutex to be held +uint64_t Raft::getPrevLogIndexLocked() const { + return logIndexMapping_.size() - 1; +} + +// requires raft mutex to be held +uint64_t Raft::getPrevLogTermLocked() const { + if (logIndexMapping_.empty()) { return 0; } + const std::string& key = logIndexMapping_.back(); + auto it = log_.find(key); + if (it == log_.end() || !it->second) { + LOG(FATAL) << __FUNCTION__ << ": inconsistency found between log vector and log map"; + } + return it->second->term(); +} + } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 4e39ad2853..a73ab82a40 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -34,6 +34,7 @@ namespace resdb { namespace raft { enum class Role { FOLLOWER, CANDIDATE, LEADER }; +enum class TermRelation { STALE, CURRENT, NEW }; class Raft : public common::ProtocolBase { public: @@ -54,6 +55,10 @@ class Raft : public common::ProtocolBase { void SendHeartBeat(); private: + TermRelation TermCheckLocked(uint64_t term) const; // Must be called under mutex + bool DemoteSelfLocked(uint64_t term); // Must be called under mutex + uint64_t getPrevLogIndexLocked() const; // Must be called under mutex + uint64_t 
getPrevLogTermLocked() const; // Must be called under mutex bool IsStop(); void Dump(); @@ -84,6 +89,7 @@ class Raft : public common::ProtocolBase { std::vector matchIndex_; // Protected by raft_mutex_ Role role_; // Protected by raft_mutex_ int LeaderId; // Protected by raft_mutex_ + std::vector votes_; // Protected by raft_mutex_ int64_t prevLogIndex_; bool is_stop_; diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 9853f92a9e..9f28f6fe91 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -23,6 +23,7 @@ #include #include "common/utils/utils.h" +#include "platform/consensus/ordering/raft/proto/proposal.pb.h" namespace resdb { namespace raft { @@ -75,6 +76,26 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { raft_->ReceiveAppendEntriesResponse(std::move(AppendEntriesResponse)); return 0; } + else if (request->user_type() == MessageType::RequestVoteMsg) { + std::unique_ptr rv = std::make_unique(); + if (!rv->ParseFromString(request->data())) { + LOG(ERROR) << "parse proposal fail"; + assert(1 == 0); + return -1; + } + raft_->ReceiveRequestVote(std::move(rv)); + return 0; + } + else if (request->user_type() == MessageType::RequestVoteResponseMsg) { + std::unique_ptr rvr = std::make_unique(); + if (!rvr->ParseFromString(request->data())) { + LOG(ERROR) << "parse proposal fail"; + assert(1 == 0); + return -1; + } + raft_->ReceiveRequestVoteResponse(std::move(rvr)); + return 0; + } return 0; } diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index 724c30511f..7b6a7da466 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -53,7 +53,8 @@ message RequestVote { message RequestVoteResponse { int64 term = 1; - bool voteGranted = 2; + int32 
voterId = 2; + bool voteGranted = 3; } enum MessageType { From 707f5ea559d335604369a75de33b3a861c556f3d Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Fri, 21 Nov 2025 09:53:44 +0000 Subject: [PATCH 19/66] blocked input from client, set appendEntries handler to always be happy raised timeout window and heartbeat interval --- .../raft/algorithm/leaderelection_manager.cpp | 9 +++-- .../ordering/raft/algorithm/raft.cpp | 4 +- .../networkstrate/async_replica_client.cpp | 6 +-- .../networkstrate/replica_communicator.cpp | 38 +++++++++---------- 4 files changed, 30 insertions(+), 27 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp index bb41a646e6..be9ed4b3bb 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp @@ -35,9 +35,9 @@ LeaderElectionManager::LeaderElectionManager(const ResDBConfig& config) raft_(nullptr), started_(false), stop_(false), - timeout_min_ms(150), - timeout_max_ms(300), - heartbeat_timer_(50), + timeout_min_ms(1000), + timeout_max_ms(2000), + heartbeat_timer_(100), heartbeat_count_(0), role_epoch_(0), known_role_epoch_(0) { @@ -175,7 +175,7 @@ void LeaderElectionManager::MonitoringElectionTimeout() { continue; } LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in timeout section"; - LOG(INFO) << __FUNCTION__ << ": Heartbeat timed out after " << timeout_ms_.load() << " ms"; + // Only gets here if timeout expired. // Leaders send a new heartbeat. if (raft_->GetRoleSnapshot() == raft::Role::LEADER) { @@ -183,6 +183,7 @@ void LeaderElectionManager::MonitoringElectionTimeout() { } // Followers and Candidates start an election. 
else { + LOG(INFO) << __FUNCTION__ << ": Heartbeat timed out after " << timeout_ms_.load() << " ms"; raft_->StartElection(); } } diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index e7a962191a..117b33b4fe 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -122,7 +122,7 @@ void Raft::Dump() { bool Raft::ReceiveTransaction(std::unique_ptr txn) { // LOG(INFO)<<"recv txn:"; - + return false; LOG(INFO) << "Received Transaction to primary id: " << id_; LOG(INFO) << "prevLogIndex: " << prevLogIndex_; txn->set_create_time(GetCurrentTime()); @@ -150,6 +150,8 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": discarding message from self"; return false; } + leader_election_manager_->OnHeartBeat(); + return false; Dump(); auto leader_id = txn->leaderid(); auto leaderCommit = txn->leadercommitindex(); diff --git a/platform/networkstrate/async_replica_client.cpp b/platform/networkstrate/async_replica_client.cpp index f24fcf64d9..624cbcadd9 100644 --- a/platform/networkstrate/async_replica_client.cpp +++ b/platform/networkstrate/async_replica_client.cpp @@ -37,7 +37,7 @@ AsyncReplicaClient::~AsyncReplicaClient() {} int AsyncReplicaClient::SendMessage(const std::string& data) { queue_.Push(std::make_unique(data)); - LOG(ERROR) << "About to send"; + if (!in_process_.load()) { bool old_value = false; if (in_process_.compare_exchange_strong(old_value, true, @@ -50,7 +50,7 @@ int AsyncReplicaClient::SendMessage(const std::string& data) { } void AsyncReplicaClient::OnSendNewMessage() { - LOG(ERROR) << "OnSendNewMessage()"; + std::unique_ptr data = queue_.Pop(0); if (data == nullptr || data->empty()) { in_process_ = false; @@ -62,7 +62,7 @@ void AsyncReplicaClient::OnSendNewMessage() { } void AsyncReplicaClient::OnSendMessage() { - LOG(ERROR) << "OnSendMessage(), status: " << status_; + if 
(status_ == 0) { data_size_ = pending_data_->size(); sending_data_size_ = sizeof(data_size_); diff --git a/platform/networkstrate/replica_communicator.cpp b/platform/networkstrate/replica_communicator.cpp index 63a2ca5fda..12c6c8b7cc 100644 --- a/platform/networkstrate/replica_communicator.cpp +++ b/platform/networkstrate/replica_communicator.cpp @@ -127,7 +127,7 @@ void ReplicaCommunicator::StartBroadcastInBackGround() { void ReplicaCommunicator::StartSingleInBackGround(const std::string& ip, int port) { single_bq_[std::make_pair(ip,port)] = std::make_unique>>("s_batch", tcp_batch_); - LOG(INFO) << "StartSingleInBackGround: "; + ReplicaInfo replica_info; for (const auto& replica : replicas_) { if (replica.ip() == ip && replica.port() == port) { @@ -145,12 +145,12 @@ void ReplicaCommunicator::StartSingleInBackGround(const std::string& ip, int por } } -LOG(INFO) << "before push back, IsRunning() " << IsRunning(); + single_thread_.push_back(std::thread([&](BatchQueue> *bq, ReplicaInfo replica_info) { while (IsRunning()) { std::vector> batch_req = bq->Pop(50000); - LOG(INFO) << "batch_req.empty() " << batch_req.empty(); + if (batch_req.empty()) { continue; } @@ -158,9 +158,9 @@ LOG(INFO) << "before push back, IsRunning() " << IsRunning(); for (auto& queue_item : batch_req) { broadcast_data.add_data()->swap(queue_item->data); } - LOG(INFO) << "Before SendBroadCastMsg: "; + global_stats_->SendBroadCastMsg(broadcast_data.data_size()); - LOG(ERROR)<<" send to ip:"<BroadCastMsg(); - LOG(INFO) << "is_use_long_conn_: " << is_use_long_conn_; + if (is_use_long_conn_) { auto item = std::make_unique(); item->data = NetChannel::GetRawMessageString(message, verifier_); @@ -191,7 +191,7 @@ const ReplicaInfo& replica_info) { single_bq_[std::make_pair(ip, port)]->Push(std::move(item)); return 0; } else { - LOG(INFO) << "Branch 2, calling SendMessageInternal: "; + return SendMessageInternal(message, replicas_); } } @@ -212,15 +212,15 @@ int ReplicaCommunicator::SendMessage(const 
google::protobuf::Message& message, const ReplicaInfo& replica_info) { return SendSingleMessage(message, replica_info); - LOG(INFO) << "is_use_long_conn_: " << is_use_long_conn_; + if (is_use_long_conn_) { - LOG(INFO) << "path 1"; + std::string data = NetChannel::GetRawMessageString(message, verifier_); BroadcastData broadcast_data; broadcast_data.add_data()->swap(data); return SendMessageFromPool(broadcast_data, {replica_info}); } else { - LOG(INFO) << "path 2"; + return SendMessageInternal(message, {replica_info}); } } @@ -247,7 +247,7 @@ int ReplicaCommunicator::SendBatchMessage( int ReplicaCommunicator::SendMessageFromPool( const google::protobuf::Message& message, const std::vector& replicas) { - LOG(ERROR) << "SendMessageFromPool():"; + int ret = 0; std::string data; message.SerializeToString(&data); @@ -255,17 +255,17 @@ int ReplicaCommunicator::SendMessageFromPool( std::lock_guard lk(mutex_); for (const auto& replica : replicas) { auto client = GetClientFromPool(replica.ip(), replica.port()); - LOG(ERROR) << "Try client"; + if (client == nullptr) { continue; } - LOG(ERROR) << "send to:" << replica.ip(); + if (client->SendMessage(data) == 0) { ret++; } else { LOG(ERROR) << "send to:" << replica.ip() << " fail"; } - LOG(ERROR) << "send to:" << replica.ip()<<" done"; + } return ret; } @@ -282,9 +282,9 @@ int ReplicaCommunicator::SendMessageInternal( if (verifier_ != nullptr) { client->SetSignatureVerifier(verifier_); } - LOG(ERROR) << "Before Message sent"; + if (client->SendRawMessage(message) == 0) { - LOG(ERROR) << "Message sent"; + ret++; } } @@ -317,9 +317,9 @@ void ReplicaCommunicator::BroadCast(const google::protobuf::Message& message) { void ReplicaCommunicator::SendMessage(const google::protobuf::Message& message, int64_t node_id) { ReplicaInfo target_replica; - LOG(INFO) << "node_id: " << node_id; + for (const auto& replica : replicas_) { - LOG(INFO) << "replica.id(): " << replica.id(); + if (replica.id() == node_id) { target_replica = replica; 
break; From 81c91061df2dbd25b951a6a8d00905f988f55180 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Fri, 21 Nov 2025 21:02:07 +0000 Subject: [PATCH 20/66] testing --- .../raft/algorithm/leaderelection_manager.cpp | 7 +-- .../ordering/raft/algorithm/raft.cpp | 60 +++++++++++++++---- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp index be9ed4b3bb..57338ba9ca 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp @@ -35,9 +35,9 @@ LeaderElectionManager::LeaderElectionManager(const ResDBConfig& config) raft_(nullptr), started_(false), stop_(false), - timeout_min_ms(1000), - timeout_max_ms(2000), - heartbeat_timer_(100), + timeout_min_ms(150), + timeout_max_ms(300), + heartbeat_timer_(50), heartbeat_count_(0), role_epoch_(0), known_role_epoch_(0) { @@ -174,7 +174,6 @@ void LeaderElectionManager::MonitoringElectionTimeout() { } continue; } - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in timeout section"; // Only gets here if timeout expired. // Leaders send a new heartbeat. 
diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 117b33b4fe..b7a8392b56 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -150,8 +150,33 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": discarding message from self"; return false; } - leader_election_manager_->OnHeartBeat(); + uint64_t term; + bool success = false; + bool demoted = false; + TermRelation tr; + { + std::lock_guard lk(mutex_); + tr = TermCheckLocked(txn->term()); + if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(txn->term()); } + if (tr != TermRelation::STALE) { + uint64_t i = txn->prevlogindex(); + if (i < logIndexMapping_.size()) { + const std::string& key = logIndexMapping_[i]; + if (txn->prevlogterm() == log_[key]->term()) { success = true; } + } + } + term = currentTerm_; + } + if (demoted) { leader_election_manager_->OnRoleChange(); } + if (tr != TermRelation::STALE) { leader_election_manager_->OnHeartBeat(); } + AppendEntriesResponse aer; + aer.set_term(term); + aer.set_success(success); + if (success) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; } + else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } + SendMessage(MessageType::AppendEntriesResponseMsg, aer, txn->leaderid()); return false; + Dump(); auto leader_id = txn->leaderid(); auto leaderCommit = txn->leadercommitindex(); @@ -284,13 +309,13 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { uint64_t term; bool voteGranted = false; bool roleChanged = false; + int votedFor; if (rvSender == id_) { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": discarding message from self"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": discarding message from self"; return; } - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": before lock"; [&]() { std::lock_guard lk(mutex_); // If their term is higher than 
ours, we accept new term, reset votedFor @@ -303,21 +328,25 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { else if (tr == TermRelation::NEW) { roleChanged = DemoteSelfLocked(rvTerm); } // Then we continue voting process term = currentTerm_; + votedFor = votedFor_; uint64_t lastLogTerm = getPrevLogTermLocked(); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": prev terms at least equal"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": prev terms at least equal"; if (rv->lastlogterm() < lastLogTerm) { return; } - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": prev log terms at least equal"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": prev log terms at least equal"; if (rv->lastlogterm() == lastLogTerm && rv->lastlogindex() < getPrevLogIndexLocked()) { return; } - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": candidate is valid"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": candidate is valid"; if (votedFor_ == -1 || votedFor_ == rvSender) { votedFor_ = rvSender; voteGranted = true; - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": voted for " << rvSender; } }(); if (roleChanged) { leader_election_manager_->OnRoleChange(); } - if (voteGranted) { leader_election_manager_->OnHeartBeat(); } + if (voteGranted) { + leader_election_manager_->OnHeartBeat(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": voted for " << rvSender<< " on term " << term; + } + else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": did not vote for " << rvSender<< " on term " << term << ". I already voted for " << votedFor; } RequestVoteResponse rvr; rvr.set_term(term); @@ -347,11 +376,12 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) bool dupe = (std::find(votes_.begin(), votes_.end(), voterId) != votes_.end()); if (dupe) { return; } votes_.push_back(voterId); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Replica " << voterId << " voted for me. 
Votes: " << votes_.size() << "/" << votesNeeded; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Replica " << voterId << " voted for me. Votes: " + << votes_.size() << "/" << votesNeeded << "in term " << currentTerm_; if (votes_.size() >= votesNeeded) { elected = true; role_ = Role::LEADER; - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": CANDIDATE -> LEADER"; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": CANDIDATE -> LEADER on term " << currentTerm_; } }(); if (demoted || elected) { leader_election_manager_->OnRoleChange(); } @@ -365,7 +395,6 @@ raft::Role Raft::GetRoleSnapshot() const { // Called from LeaderElectionManager::StartElection when timeout void Raft::StartElection() { - LOG(INFO) << "JIM -> " << __FUNCTION__; uint64_t currentTerm; int candidateId; uint64_t lastLogIndex; @@ -383,10 +412,13 @@ void Raft::StartElection() { role_ = raft::Role::CANDIDATE; roleChanged = true; } + heartBeatsSentThisTerm_ 0; currentTerm_++; votedFor_ = id_; votes_.clear(); votes_.push_back(id_); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": I voted for myself. 
Votes: " + << votes_.size() << "/" << (total_num_ - f_) << "in term " << currentTerm_; currentTerm = currentTerm_; candidateId = id_; @@ -408,19 +440,21 @@ void Raft::StartElection() { // TODO // ON MERGE FIX VALUES void Raft::SendHeartBeat() { - LOG(INFO) << "JIM -> " << __FUNCTION__; uint64_t currentTerm; int leaderId = id_; uint64_t prevLogIndex; uint64_t prevLogTerm; std::string entries; uint64_t leaderCommit; + uint64_t heartBeatNum; { std::lock_guard lk(mutex_); if (role_ != raft::Role::LEADER) { LOG(WARNING) << __FUNCTION__ << ": Non-Leader tried to start HeartBeat"; return; } + heartBeatsSentThisTerm_++; + heartBeatNum = heartBeatsSentThisTerm_; currentTerm = currentTerm_; prevLogIndex = getPrevLogIndexLocked(); prevLogTerm = getPrevLogTermLocked(); @@ -435,6 +469,8 @@ void Raft::SendHeartBeat() { appendEntries.set_entries(entries); appendEntries.set_leadercommitindex(leaderCommit); // TODO Broadcast(MessageType::AppendEntriesMsg, appendEntries); + + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": "; } // requires raft mutex to be held From 880287af1032c7ab234868b30ba0203ceb45ed2e Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Mon, 24 Nov 2025 19:51:13 +0000 Subject: [PATCH 21/66] testing leader election --- .../ordering/common/framework/consensus.cpp | 4 +- .../raft/algorithm/leaderelection_manager.cpp | 43 ++- .../raft/algorithm/leaderelection_manager.h | 3 +- .../ordering/raft/algorithm/raft.cpp | 277 +++++------------- .../consensus/ordering/raft/algorithm/raft.h | 5 +- .../ordering/raft/framework/consensus.cpp | 5 +- .../config/kv_performance_server_local.conf | 1 - 7 files changed, 117 insertions(+), 221 deletions(-) diff --git a/platform/consensus/ordering/common/framework/consensus.cpp b/platform/consensus/ordering/common/framework/consensus.cpp index 33c1265ebb..d375f93269 100644 --- a/platform/consensus/ordering/common/framework/consensus.cpp +++ b/platform/consensus/ordering/common/framework/consensus.cpp @@ -96,7 +96,7 @@ int 
Consensus::Broadcast(int type, const google::protobuf::Message& msg) { Request request; msg.SerializeToString(request.mutable_data()); request.set_type(Request::TYPE_CUSTOM_CONSENSUS); - LOG(ERROR) << "Sending custom consensus Broadcast"; + //LOG(ERROR) << "Sending custom consensus Broadcast"; request.set_user_type(type); request.set_sender_id(config_.GetSelfInfo().id()); @@ -109,7 +109,7 @@ int Consensus::SendMsg(int type, const google::protobuf::Message& msg, Request request; msg.SerializeToString(request.mutable_data()); request.set_type(Request::TYPE_CUSTOM_CONSENSUS); - LOG(ERROR) << "Sending custom consensus message"; + //LOG(ERROR) << "Sending custom consensus message"; request.set_user_type(type); request.set_sender_id(config_.GetSelfInfo().id()); replica_communicator_->SendMessage(request, node_id); diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp index 57338ba9ca..892ac28f9e 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp @@ -20,6 +20,7 @@ #include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" #include "platform/consensus/ordering/raft/algorithm/raft.h" #include +#include #include "common/utils/utils.h" #include "platform/proto/viewchange_message.pb.h" @@ -39,10 +40,11 @@ LeaderElectionManager::LeaderElectionManager(const ResDBConfig& config) timeout_max_ms(300), heartbeat_timer_(50), heartbeat_count_(0), + //last_heartbeat_time_(std::chrono::steady_clock::now()), role_epoch_(0), known_role_epoch_(0) { global_stats_ = Stats::GetGlobalStats(); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager constructor"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager constructor"; } LeaderElectionManager::~LeaderElectionManager() { @@ -55,7 +57,7 @@ 
LeaderElectionManager::~LeaderElectionManager() { } void LeaderElectionManager::MayStart() { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager MayStart"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager MayStart"; bool expected = false; if (!started_.compare_exchange_strong(expected, true)) { return; @@ -65,13 +67,13 @@ void LeaderElectionManager::MayStart() { .public_key() .public_key_info() .type() == CertificateKeyInfo::CLIENT) { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager MayStart, Client conditional"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager MayStart, Client conditional"; LOG(ERROR) << "client type not process view change"; return; } if (config_.GetConfigData().enable_viewchange()) { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager MayStart, viewchange is enabled"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": in LeaderElectionManager MayStart, viewchange is enabled"; server_checking_timeout_thread_ = std::thread(&LeaderElectionManager::MonitoringElectionTimeout, this); } @@ -82,17 +84,22 @@ void LeaderElectionManager::SetRaft(raft::Raft* raft) { } void LeaderElectionManager::OnHeartBeat() { + //auto now = std::chrono::steady_clock::now(); + //std::chrono::steady_clock::duration delta; { - LOG(INFO) << "JIM -> " << __FUNCTION__; std::lock_guard lk(cv_mutex_); heartbeat_count_++; + //delta = now - last_heartbeat_time_; + //last_heartbeat_time_ = now; } cv_.notify_all(); + //auto ms = std::chrono::duration_cast(delta).count(); + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat received after " << ms << "ms"; } void LeaderElectionManager::OnRoleChange() { { - LOG(INFO) << "JIM -> " << __FUNCTION__; + //LOG(INFO) << "JIM -> " << __FUNCTION__; std::lock_guard lk(cv_mutex_); role_epoch_++; } @@ -106,7 +113,7 @@ uint64_t LeaderElectionManager::RandomInt(uint64_t min, uint64_t max) { } Waited LeaderElectionManager::LeaderWait() 
{ - LOG(INFO) << "JIM -> " << __FUNCTION__; + //LOG(INFO) << "JIM -> " << __FUNCTION__; std::unique_lock lk(cv_mutex_); if (known_role_epoch_ != role_epoch_) { known_role_epoch_ = role_epoch_; @@ -126,7 +133,7 @@ Waited LeaderElectionManager::LeaderWait() { } Waited LeaderElectionManager::Wait() { - LOG(INFO) << "JIM -> " << __FUNCTION__; + //LOG(INFO) << "JIM -> " << __FUNCTION__; const uint64_t timeout_ms = RandomInt(timeout_min_ms, timeout_max_ms); timeout_ms_ = timeout_ms; std::unique_lock lk(cv_mutex_); @@ -154,20 +161,30 @@ Waited LeaderElectionManager::Wait() { // Causes leaders to Heartbeat. // Causes followers and candidates to start an election if no heartbeat received. void LeaderElectionManager::MonitoringElectionTimeout() { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": thread entered the function"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": thread entered the function"; while (!stop_.load()) { raft::Role role = raft_->GetRoleSnapshot(); Waited res; - if (role == raft::Role::LEADER) { res = LeaderWait(); } - else { res = Wait(); } - + std::chrono::steady_clock::time_point wait_start_time_ = std::chrono::steady_clock::now(); + bool leader = false; + if (role == raft::Role::LEADER) { + res = LeaderWait(); + leader = true; + } + else { + res = Wait(); + } + std::chrono::steady_clock::time_point wait_end_time_ = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration delta = wait_end_time_ - wait_start_time_; + auto ms = std::chrono::duration_cast(delta).count(); + LOG(INFO) << __FUNCTION__ << ": " << (leader ? 
"Leader" : "") << "Wait " << ms << "ms"; if (res == Waited::STOPPED) { break; } else if (res == Waited::ROLE_CHANGE) { LOG(INFO) << __FUNCTION__ << ": Role change detected"; continue; } else if (res == Waited::HEARTBEAT) { - LOG(INFO) << __FUNCTION__ << ": Heartbeat received within " << timeout_ms_.load() << " ms"; + LOG(INFO) << __FUNCTION__ << ": Heartbeat received within window"; if (raft_->GetRoleSnapshot() == raft::Role::LEADER) { // A leader receiving a heartbeat would be unusual but not impossible. LOG(WARNING) << __FUNCTION__ << " Received Heartbeat as LEADER"; diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h index b08c68e335..984e89107d 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h @@ -19,7 +19,7 @@ #pragma once -#include +#include #include "platform/config/resdb_config.h" #include "platform/consensus/execution/system_info.h" @@ -69,6 +69,7 @@ class LeaderElectionManager { uint64_t timeout_max_ms; uint64_t heartbeat_timer_; uint64_t heartbeat_count_; // Protected by cv_mutex_ + //std::chrono::steady_clock::time_point last_heartbeat_time_; uint64_t role_epoch_; // Protected by cv_mutex_ uint64_t known_role_epoch_; // Protected by cv_mutex_ std::mutex cv_mutex_; diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index b7a8392b56..6dfc3dc946 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -32,66 +32,23 @@ namespace resdb { namespace raft { -static std::string ToHex(const std::string& input, size_t max_len = 16) { - std::ostringstream oss; - oss << std::hex << std::setfill('0'); - for (size_t i = 0; i < std::min(input.size(), max_len); ++i) { - oss << std::setw(2) << static_cast(static_cast(input[i])); - } - 
return oss.str(); -} - -static void printAppendEntries(const std::unique_ptr& txn) { - if (!txn) { - LOG(INFO) << "AppendEntries: nullptr"; - return; - } - - LOG(INFO) << "=== AppendEntries ==="; - LOG(INFO) << "term: " << txn->term(); - LOG(INFO) << "prevLogIndex: " << txn->prevlogindex(); - LOG(INFO) << "prevLogTerm: " << txn->prevlogterm(); - LOG(INFO) << "leaderCommitIndex: " << txn->leadercommitindex(); - LOG(INFO) << "proxy_id: " << txn->proxy_id(); - LOG(INFO) << "leaderId: " << txn->leaderid(); - LOG(INFO) << "uid: " << txn->uid(); - LOG(INFO) << "create_time: " << txn->create_time(); - - // bytes fields (print as hex or limited string to avoid binary garbage) - const std::string& entries = txn->entries(); - const std::string& hash = txn->hash(); - - LOG(INFO) << "entries size: " << entries.size(); - if (!entries.empty()) { - LOG(INFO) << "entries (first 32 bytes): " - << entries.substr(0, std::min(32, entries.size())); - } - - LOG(INFO) << "hash size: " << hash.size(); - if (!hash.empty()) { - LOG(INFO) << "hash (hex first 16 bytes): " - << ToHex(hash); - } - - LOG(INFO) << "====================="; -} - Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, LeaderElectionManager* leaderelection_manager) : ProtocolBase(id, f, total_num), role_(raft::Role::FOLLOWER), verifier_(verifier), leader_election_manager_(leaderelection_manager) { - LOG(ERROR) << "get proposal graph"; id_ = id; total_num_ = total_num; - f_ = f; + f_ = (total_num-1)/2; is_stop_ = false; prevLogIndex_ = 0; currentTerm_ = 0; votedFor_ = -1; commitIndex_ = 0; lastApplied_ = 0; + last_ae_time_ = std::chrono::steady_clock::now(); + last_heartbeat_time_ = std::chrono::steady_clock::now(); AppendEntries ae; ae.set_leaderid(0); @@ -111,15 +68,6 @@ Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } -void Raft::Dump() { - LOG(INFO) << "=== Replica Dump ==="; - LOG(INFO) << "id_: " << id_; - LOG(INFO) << "currentTerm_: " << currentTerm_; - LOG(INFO) << 
"votedFor_: " << votedFor_; - LOG(INFO) << "commitIndex_: " << commitIndex_; - LOG(INFO) << "lastApplied_: " << lastApplied_; -} - bool Raft::ReceiveTransaction(std::unique_ptr txn) { // LOG(INFO)<<"recv txn:"; return false; @@ -137,27 +85,31 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { // This should be a term for each entry, but assuming no failure at first txn->set_term(currentTerm_); - LOG(INFO) << "Before"; - printAppendEntries(txn); - LOG(INFO) << "After"; Broadcast(MessageType::AppendEntriesMsg, *txn); return true; } bool Raft::ReceivePropose(std::unique_ptr txn) { - if (txn->leaderid() == id_) { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": discarding message from self"; - return false; - } + if (txn->leaderid() == id_) { return false; } uint64_t term; bool success = false; bool demoted = false; TermRelation tr; + Role initialRole; + + auto now = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration delta; + delta = now - last_ae_time_; + last_ae_time_ = now; + auto ms = std::chrono::duration_cast(delta).count(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": AE received after " << ms << "ms"; { + initialRole = role_; std::lock_guard lk(mutex_); tr = TermCheckLocked(txn->term()); if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(txn->term()); } + else if (role_ == Role::CANDIDATE && tr == TermRelation::CURRENT) { demoted = DemoteSelfLocked(txn->term()); } if (tr != TermRelation::STALE) { uint64_t i = txn->prevlogindex(); if (i < logIndexMapping_.size()) { @@ -167,8 +119,12 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { } term = currentTerm_; } - if (demoted) { leader_election_manager_->OnRoleChange(); } - if (tr != TermRelation::STALE) { leader_election_manager_->OnHeartBeat(); } + if (demoted) { + leader_election_manager_->OnRoleChange(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted from " + << (initialRole == Role::LEADER ? 
"LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; + } + if (success) { leader_election_manager_->OnHeartBeat(); } AppendEntriesResponse aer; aer.set_term(term); aer.set_success(success); @@ -176,148 +132,45 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } SendMessage(MessageType::AppendEntriesResponseMsg, aer, txn->leaderid()); return false; - - Dump(); - auto leader_id = txn->leaderid(); - auto leaderCommit = txn->leadercommitindex(); - LOG(INFO) << "Received AppendEntries to replica id: " << id_; - LOG(INFO) << "static_cast(log_.size()): " << static_cast(log_.size()); - printAppendEntries(txn); - AppendEntriesResponse appendEntriesResponse; - appendEntriesResponse.set_term(currentTerm_); - appendEntriesResponse.set_id(id_); - appendEntriesResponse.set_lastapplied(lastApplied_); - appendEntriesResponse.set_nextentry(log_.size()); - if (txn->term() < currentTerm_) { - LOG(INFO) << "AppendEntriesMsg Fail1"; - appendEntriesResponse.set_success(false); - SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); - return true; - } - auto prevprevLogIndex = txn->prevlogindex() - 1; - // This should be the same as checking if it has an entry - // with this prevLogIndex and term - if (prevprevLogIndex != 0 && prevprevLogIndex > static_cast(logIndexMapping_.size()) && - (prevprevLogIndex >= static_cast(logIndexMapping_.size()) || - txn->prevlogterm() != log_[logIndexMapping_[prevprevLogIndex]]->term())) { - LOG(INFO) << "AppendEntriesMsg Fail2"; - LOG(INFO) << "prevprevLogIndex: " << prevprevLogIndex << " entries size: " << static_cast(log_.size()); - if (prevprevLogIndex < static_cast(logIndexMapping_.size())){ - LOG(INFO) << "txn->prevlogterm(): " << txn->prevlogterm() - << " last entry term: " << log_[logIndexMapping_[prevprevLogIndex]]->term(); - } - appendEntriesResponse.set_success(false); - SendMessage(MessageType::AppendEntriesResponseMsg, 
appendEntriesResponse, leader_id); - return true; - } - // Implement an entry existing but with a different term - // delete that entry and all after it - LOG(INFO) << "Before AppendEntriesMsg Added to Log"; - std::string hash = txn->hash(); - int64_t prevLogIndex = txn->prevlogindex(); - { - std::unique_lock lk(mutex_); - std::string hash = txn->hash(); - LOG(INFO) << "Before adding to entries"; - log_[txn->hash()] = std::move(txn); - LOG(INFO) << "After adding to entries"; - logIndexMapping_.push_back(hash); - } - LOG(INFO) << "AppendEntriesMsg Added to Log"; - - LOG(INFO) << "leaderCommit: " << leaderCommit; - LOG(INFO) << "commitIndex_: " << commitIndex_; - LOG(INFO) << "lastApplied_: " << lastApplied_; - LOG(INFO) << "static_cast(log_.size()): " << static_cast(log_.size()); - LOG(INFO) << "leaderCommit > commitIndex_: " << (leaderCommit > commitIndex_ ? "true" : "false"); - LOG(INFO) << "lastApplied_ + 1 <= static_cast(log_.size()) " << ((lastApplied_ + 1 <= static_cast(log_.size())) ? 
"true" : "false"); - while (leaderCommit > lastApplied_ && lastApplied_ + 1 <= static_cast(log_.size())) { - // assert(false); - LOG(INFO) << "AppendEntriesMsg Committing"; - std::unique_ptr txnToCommit = nullptr; - txnToCommit = std::move(log_[logIndexMapping_[lastApplied_]]); - commit_(*txnToCommit); - lastApplied_++; - } - LOG(INFO) << "before commit index check"; - // I don't quite know if this needs to be conditional, but that's how the paper says it - if (leaderCommit > commitIndex_) - // not 100% certain if this second variable should be prevLogIndex - commitIndex_ = std::min(leaderCommit, prevLogIndex); - - LOG(INFO) << "after commit index check"; - appendEntriesResponse.set_lastapplied(lastApplied_); - appendEntriesResponse.set_nextentry(log_.size()); - appendEntriesResponse.set_success(true); - appendEntriesResponse.set_hash(hash); - appendEntriesResponse.set_prevlogindex(prevLogIndex); - LOG(INFO) << "Leader_id: " << leader_id; - leader_election_manager_->OnHeartBeat(); - SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); - // Broadcast(MessageType::AppendEntriesResponseMsg, appendEntriesResponse); - return true; } bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr response) { - if (id_ != 1) - return true; - LOG(INFO) << "ReceiveAppendEntriesResponse"; - auto followerId = response->id(); - LOG(INFO) << "followerId: " << followerId; - if (response->success()) { - { - std::unique_lock lk(mutex_); - received_[response->hash()].insert(response->id()); - auto it = log_.find(response->hash()); - if (it != log_.end()) { - LOG(INFO) << "Transaction: " << response->prevlogindex() << " has gotten " << received_[response->hash()].size() << " responses"; - if (static_cast(received_[response->hash()].size()) >= f_ + 1) { - commitIndex_ = response->prevlogindex(); - - // pretty sure this should always be in order with no gaps - while (lastApplied_ + 1 <= static_cast(log_.size()) && - lastApplied_ <= commitIndex_) { - LOG(INFO) 
<< "Leader Committing"; - std::unique_ptr txnToCommit = nullptr; - txnToCommit = std::move(log_[logIndexMapping_[lastApplied_]]); - commit_(*txnToCommit); - lastApplied_++; - } - } - } - } - nextIndex_[followerId] = response->nextentry(); - matchIndex_[followerId] = response->lastapplied(); - return true; +uint64_t term; +bool demoted = false; +TermRelation tr; +Role initialRole; +{ + initialRole = role_; + std::lock_guard lk(mutex_); + tr = TermCheckLocked(response->term()); + if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(response->term()); } + term = currentTerm_; +} + if (demoted) { + leader_election_manager_->OnRoleChange(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted from " + << (initialRole == Role::LEADER ? "LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; } - - // handling for if leader is out of date and term is wrong - - // handling for if term is correct, but follower is just out of date - --nextIndex_[followerId]; - // send message - assert(false); - return true; + return false; } void Raft::ReceiveRequestVote(std::unique_ptr rv) { - LOG(INFO) << "JIM -> " << __FUNCTION__; int rvSender = rv->candidateid(); uint64_t rvTerm = rv->term(); uint64_t term; bool voteGranted = false; - bool roleChanged = false; - int votedFor; + bool demoted = false; + bool validCandidate = false; + int votedFor = -1; + Role initialRole; - if (rvSender == id_) { - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": discarding message from self"; - return; - } + if (rvSender == id_) { return; } [&]() { std::lock_guard lk(mutex_); + initialRole = role_; // If their term is higher than ours, we accept new term, reset votedFor // and convert to follower TermRelation tr = TermCheckLocked(rvTerm); @@ -325,7 +178,7 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { term = currentTerm_; return; } - else if (tr == TermRelation::NEW) { roleChanged = DemoteSelfLocked(rvTerm); } + else if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(rvTerm); } // 
Then we continue voting process term = currentTerm_; votedFor = votedFor_; @@ -335,18 +188,27 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": prev log terms at least equal"; if (rv->lastlogterm() == lastLogTerm && rv->lastlogindex() < getPrevLogIndexLocked()) { return; } + validCandidate = true; //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": candidate is valid"; if (votedFor_ == -1 || votedFor_ == rvSender) { votedFor_ = rvSender; voteGranted = true; } }(); - if (roleChanged) { leader_election_manager_->OnRoleChange(); } - if (voteGranted) { + if (demoted) { + leader_election_manager_->OnRoleChange(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted from " + << (initialRole == Role::LEADER ? "LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; + } + if (voteGranted) { leader_election_manager_->OnHeartBeat(); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": voted for " << rvSender<< " on term " << term; } - else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": did not vote for " << rvSender<< " on term " << term << ". I already voted for " << votedFor; } + else if (validCandidate) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": did not vote for " + << rvSender<< " on term " << term << ". I already voted for " << votedFor + << ((votedFor == id_) ? " (myself)" : ""); + } RequestVoteResponse rvr; rvr.set_term(term); @@ -362,9 +224,11 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) bool demoted = false; bool elected = false; int votesNeeded = total_num_ - f_; + Role initialRole; [&]() { std::lock_guard lk(mutex_); + initialRole = role_; TermRelation tr = TermCheckLocked(term); if (tr == TermRelation::STALE) { return; } else if (tr == TermRelation::NEW) { @@ -377,14 +241,18 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) if (dupe) { return; } votes_.push_back(voterId); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Replica " << voterId << " voted for me. 
Votes: " - << votes_.size() << "/" << votesNeeded << "in term " << currentTerm_; + << votes_.size() << "/" << votesNeeded << " in term " << currentTerm_; if (votes_.size() >= votesNeeded) { elected = true; role_ = Role::LEADER; - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": CANDIDATE -> LEADER on term " << currentTerm_; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": CANDIDATE->LEADER on term " << currentTerm_; } }(); if (demoted || elected) { leader_election_manager_->OnRoleChange(); } + if (demoted) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted from " + << (initialRole == Role::LEADER ? "LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; + } if (elected) { SendHeartBeat(); } } @@ -412,13 +280,13 @@ void Raft::StartElection() { role_ = raft::Role::CANDIDATE; roleChanged = true; } - heartBeatsSentThisTerm_ 0; + heartBeatsSentThisTerm_ = 0; currentTerm_++; votedFor_ = id_; votes_.clear(); votes_.push_back(id_); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": I voted for myself. 
Votes: " - << votes_.size() << "/" << (total_num_ - f_) << "in term " << currentTerm_; + << votes_.size() << "/" << (total_num_ - f_) << " in term " << currentTerm_; currentTerm = currentTerm_; candidateId = id_; @@ -437,7 +305,7 @@ void Raft::StartElection() { Broadcast(MessageType::RequestVoteMsg, requestVote); } -// TODO +// TODOjim // ON MERGE FIX VALUES void Raft::SendHeartBeat() { uint64_t currentTerm; @@ -447,6 +315,10 @@ void Raft::SendHeartBeat() { std::string entries; uint64_t leaderCommit; uint64_t heartBeatNum; + + auto now = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration delta; + { std::lock_guard lk(mutex_); if (role_ != raft::Role::LEADER) { @@ -460,7 +332,12 @@ void Raft::SendHeartBeat() { prevLogTerm = getPrevLogTermLocked(); entries = ""; leaderCommit = 0; // TODO + + delta = now - last_heartbeat_time_; + last_heartbeat_time_ = now; } + auto ms = std::chrono::duration_cast(delta).count(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat sent after " << ms << "ms"; AppendEntries appendEntries; appendEntries.set_term(currentTerm); appendEntries.set_leaderid(leaderId); @@ -470,7 +347,7 @@ void Raft::SendHeartBeat() { appendEntries.set_leadercommitindex(leaderCommit); // TODO Broadcast(MessageType::AppendEntriesMsg, appendEntries); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": "; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat " << heartBeatNum << " for term " << currentTerm; } // requires raft mutex to be held @@ -480,7 +357,7 @@ bool Raft::DemoteSelfLocked(uint64_t term) { votedFor_ = -1; if (role_ != raft::Role::FOLLOWER) { role_ = raft::Role::FOLLOWER; - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted to FOLLOWER"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted to FOLLOWER"; return true; } return false; diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index a73ab82a40..bbf06dce2a 100644 --- 
a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "platform/common/queue/lock_free_queue.h" #include "platform/consensus/ordering/common/algorithm/protocol_base.h" @@ -60,7 +61,6 @@ class Raft : public common::ProtocolBase { uint64_t getPrevLogIndexLocked() const; // Must be called under mutex uint64_t getPrevLogTermLocked() const; // Must be called under mutex bool IsStop(); - void Dump(); private: mutable std::mutex mutex_; @@ -90,6 +90,9 @@ class Raft : public common::ProtocolBase { Role role_; // Protected by raft_mutex_ int LeaderId; // Protected by raft_mutex_ std::vector votes_; // Protected by raft_mutex_ + uint64_t heartBeatsSentThisTerm_; // Protected by raft_mutex_ + std::chrono::steady_clock::time_point last_ae_time_; + std::chrono::steady_clock::time_point last_heartbeat_time_; // Protected by raft_mutex_ int64_t prevLogIndex_; bool is_stop_; diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 9f28f6fe91..cef9d58009 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -32,7 +32,7 @@ Consensus::Consensus(const ResDBConfig& config, std::unique_ptr executor) : common::Consensus(config, std::move(executor)), leader_election_manager_(std::make_unique(config_)) { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": In consensus constructor"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": In consensus constructor"; int total_replicas = config_.GetReplicaNum(); int f = (total_replicas - 1) / 3; @@ -55,9 +55,8 @@ Consensus::Consensus(const ResDBConfig& config, } int Consensus::ProcessCustomConsensus(std::unique_ptr request) { - LOG(ERROR) << "Message type request->user_type(): " << request->user_type(); if (request->user_type() == MessageType::AppendEntriesMsg) { - LOG(ERROR) << 
"Received AppendEntriesMsg"; + //LOG(ERROR) << "Received AppendEntriesMsg"; std::unique_ptr txn = std::make_unique(); if (!txn->ParseFromString(request->data())) { LOG(ERROR) << "parse proposal fail"; diff --git a/scripts/deploy/config/kv_performance_server_local.conf b/scripts/deploy/config/kv_performance_server_local.conf index 16dc042f28..fb1e910a85 100644 --- a/scripts/deploy/config/kv_performance_server_local.conf +++ b/scripts/deploy/config/kv_performance_server_local.conf @@ -22,7 +22,6 @@ iplist=( 127.0.0.1 127.0.0.1 127.0.0.1 -127.0.0.1 ) client_num=1 From 210806428f33f8ea42365a6dda861b2172ca90fa Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Mon, 1 Dec 2025 20:02:31 +0000 Subject: [PATCH 22/66] staging for merge into raft branch --- .../ordering/raft/algorithm/leaderelection_manager.cpp | 10 +++++----- platform/consensus/ordering/raft/algorithm/raft.cpp | 8 ++++---- scripts/deploy/config/raft.config | 3 ++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp index 892ac28f9e..9ecad97e6b 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp @@ -21,6 +21,7 @@ #include "platform/consensus/ordering/raft/algorithm/raft.h" #include #include +#include #include "common/utils/utils.h" #include "platform/proto/viewchange_message.pb.h" @@ -36,8 +37,8 @@ LeaderElectionManager::LeaderElectionManager(const ResDBConfig& config) raft_(nullptr), started_(false), stop_(false), - timeout_min_ms(150), - timeout_max_ms(300), + timeout_min_ms(800), + timeout_max_ms(1600), heartbeat_timer_(50), heartbeat_count_(0), //last_heartbeat_time_(std::chrono::steady_clock::now()), @@ -73,7 +74,7 @@ void LeaderElectionManager::MayStart() { } if (config_.GetConfigData().enable_viewchange()) { - //LOG(INFO) << "JIM -> " << 
__FUNCTION__ << ": in LeaderElectionManager MayStart, viewchange is enabled"; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Starting MonitoringElectionTimeout thread."; server_checking_timeout_thread_ = std::thread(&LeaderElectionManager::MonitoringElectionTimeout, this); } @@ -161,7 +162,6 @@ Waited LeaderElectionManager::Wait() { // Causes leaders to Heartbeat. // Causes followers and candidates to start an election if no heartbeat received. void LeaderElectionManager::MonitoringElectionTimeout() { - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": thread entered the function"; while (!stop_.load()) { raft::Role role = raft_->GetRoleSnapshot(); Waited res; @@ -180,7 +180,7 @@ void LeaderElectionManager::MonitoringElectionTimeout() { LOG(INFO) << __FUNCTION__ << ": " << (leader ? "Leader" : "") << "Wait " << ms << "ms"; if (res == Waited::STOPPED) { break; } else if (res == Waited::ROLE_CHANGE) { - LOG(INFO) << __FUNCTION__ << ": Role change detected"; + //LOG(INFO) << __FUNCTION__ << ": Role change detected"; continue; } else if (res == Waited::HEARTBEAT) { diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 6dfc3dc946..d8c0434555 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -124,7 +124,7 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted from " << (initialRole == Role::LEADER ? 
"LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; } - if (success) { leader_election_manager_->OnHeartBeat(); } + if (tr != TermRelation::STALE) { leader_election_manager_->OnHeartBeat(); } AppendEntriesResponse aer; aer.set_term(term); aer.set_success(success); @@ -202,7 +202,7 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { } if (voteGranted) { leader_election_manager_->OnHeartBeat(); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": voted for " << rvSender<< " on term " << term; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": voted for " << rvSender<< " in term " << term; } else if (validCandidate) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": did not vote for " @@ -245,7 +245,7 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) if (votes_.size() >= votesNeeded) { elected = true; role_ = Role::LEADER; - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": CANDIDATE->LEADER on term " << currentTerm_; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": CANDIDATE->LEADER in term " << currentTerm_; } }(); if (demoted || elected) { leader_election_manager_->OnRoleChange(); } @@ -276,7 +276,6 @@ void Raft::StartElection() { return; } if (role_ == raft::Role::FOLLOWER) { - LOG(INFO) << __FUNCTION__ << ": FOLLOWER->CANDIDATE"; role_ = raft::Role::CANDIDATE; roleChanged = true; } @@ -295,6 +294,7 @@ void Raft::StartElection() { } if (roleChanged) { leader_election_manager_->OnRoleChange(); + LOG(INFO) << __FUNCTION__ << ": FOLLOWER->CANDIDATE in term " << currentTerm; } RequestVote requestVote; diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index ef8980fd29..16ecb0fdf9 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -6,5 +6,6 @@ "max_process_txn": 10000, "worker_num": 1, "input_worker_num": 1, - "output_worker_num": 10 + "output_worker_num": 10, + "not_need_signature": true } From 331f3a3e79372979cc17db3689e325b735c9d117 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Tue, 2 Dec 2025 
00:07:05 +0000 Subject: [PATCH 23/66] Added logic to redirect client proxy to current leader --- .../common/framework/performance_manager.cpp | 4 + .../common/framework/performance_manager.h | 1 + .../ordering/raft/algorithm/raft.cpp | 141 +++++++++++++++--- .../consensus/ordering/raft/algorithm/raft.h | 7 +- .../ordering/raft/framework/consensus.cpp | 14 +- .../ordering/raft/proto/proposal.proto | 6 + scripts/deploy/config/raft.config | 3 +- 7 files changed, 152 insertions(+), 24 deletions(-) diff --git a/platform/consensus/ordering/common/framework/performance_manager.cpp b/platform/consensus/ordering/common/framework/performance_manager.cpp index 101b00af06..844196ad5c 100644 --- a/platform/consensus/ordering/common/framework/performance_manager.cpp +++ b/platform/consensus/ordering/common/framework/performance_manager.cpp @@ -68,6 +68,10 @@ PerformanceManager::~PerformanceManager() { } int PerformanceManager::GetPrimary() { return primary_; } +void PerformanceManager::SetPrimary(int id) { + primary_ = id; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": primary updated to " << primary_; +} std::unique_ptr PerformanceManager::GenerateUserRequest() { std::unique_ptr request = std::make_unique(); diff --git a/platform/consensus/ordering/common/framework/performance_manager.h b/platform/consensus/ordering/common/framework/performance_manager.h index a34f05739f..5e2f453173 100644 --- a/platform/consensus/ordering/common/framework/performance_manager.h +++ b/platform/consensus/ordering/common/framework/performance_manager.h @@ -39,6 +39,7 @@ class PerformanceManager { virtual ~PerformanceManager(); int StartEval(); + void SetPrimary(int id); int ProcessResponseMsg(std::unique_ptr context, std::unique_ptr request); diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index d8c0434555..66fcc3b89c 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ 
b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -33,11 +33,12 @@ namespace resdb { namespace raft { Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, - LeaderElectionManager* leaderelection_manager) + LeaderElectionManager* leaderelection_manager, ReplicaCommunicator* replica_communicator) : ProtocolBase(id, f, total_num), role_(raft::Role::FOLLOWER), verifier_(verifier), - leader_election_manager_(leaderelection_manager) { + leader_election_manager_(leaderelection_manager), + replica_communicator_(replica_communicator) { id_ = id; total_num_ = total_num; f_ = (total_num-1)/2; @@ -69,8 +70,17 @@ Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } bool Raft::ReceiveTransaction(std::unique_ptr txn) { + + LOG(INFO) << "JIM -> " << __FUNCTION__; + { + std::lock_guard lk(mutex_); + if (role_ != Role::LEADER) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Replica is not leader, returning early"; + return false; } + } + + LOG(INFO) << "JIM -> " << __FUNCTION__; // LOG(INFO)<<"recv txn:"; - return false; LOG(INFO) << "Received Transaction to primary id: " << id_; LOG(INFO) << "prevLogIndex: " << prevLogIndex_; txn->set_create_time(GetCurrentTime()); @@ -125,35 +135,116 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { << (initialRole == Role::LEADER ? 
"LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; } if (tr != TermRelation::STALE) { leader_election_manager_->OnHeartBeat(); } - AppendEntriesResponse aer; - aer.set_term(term); - aer.set_success(success); - if (success) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; } - else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } - SendMessage(MessageType::AppendEntriesResponseMsg, aer, txn->leaderid()); - return false; + AppendEntriesResponse aer; + aer.set_term(term); + aer.set_success(success); + aer.set_id(id_); + if (success) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; } + else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } + SendMessage(MessageType::AppendEntriesResponseMsg, aer, txn->leaderid()); + //return false; + + // Implement an entry existing but with a different term + // delete that entry and all after it + LOG(INFO) << "Before AppendEntriesMsg Added to Log"; + std::string hash = txn->hash(); + int64_t prevLogIndex = txn->prevlogindex(); + { + std::unique_lock lk(mutex_); + std::string hash = txn->hash(); + LOG(INFO) << "Before adding to entries"; + log_[txn->hash()] = std::move(txn); + LOG(INFO) << "After adding to entries"; + logIndexMapping_.push_back(hash); + lastApplied_++; + } + auto leaderCommit = txn->leadercommitindex(); + LOG(INFO) << "AppendEntriesMsg Added to Log"; + LOG(INFO) << "leaderCommit: " << leaderCommit; + LOG(INFO) << "commitIndex_: " << commitIndex_; + LOG(INFO) << "lastApplied_: " << lastApplied_; + LOG(INFO) << "static_cast(log_.size()): " << static_cast(log_.size()); + LOG(INFO) << "leaderCommit > commitIndex_: " << (leaderCommit > commitIndex_ ? "true" : "false"); + LOG(INFO) << "lastApplied_ + 1 <= static_cast(log_.size()) " << ((lastApplied_ + 1 <= static_cast(log_.size())) ? 
"true" : "false"); + while (leaderCommit > commitIndex_ && lastApplied_ + 1 <= static_cast(log_.size())) { + // assert(false); + LOG(INFO) << "AppendEntriesMsg Committing"; + std::unique_ptr txnToCommit = nullptr; + txnToCommit = std::move(log_[logIndexMapping_[lastApplied_]]); + commit_(*txnToCommit); + lastApplied_++; + } + LOG(INFO) << "before commit index check"; + // I don't quite know if this needs to be conditional, but that's how the paper says it + if (leaderCommit > commitIndex_) + // not 100% certain if this second variable should be prevLogIndex + commitIndex_ = std::min(leaderCommit, prevLogIndex); + + LOG(INFO) << "after commit index check"; + aer.set_lastapplied(lastApplied_); + aer.set_nextentry(log_.size()); + aer.set_success(true); + aer.set_hash(hash); + aer.set_prevlogindex(prevLogIndex); + LOG(INFO) << "Leader_id: " << txn->leaderid(); + leader_election_manager_->OnHeartBeat(); + SendMessage(MessageType::AppendEntriesResponseMsg, aer, txn->leaderid()); + return true; } -bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr response) { +bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr aer) { uint64_t term; bool demoted = false; TermRelation tr; Role initialRole; -{ - initialRole = role_; - std::lock_guard lk(mutex_); - tr = TermCheckLocked(response->term()); - if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(response->term()); } - term = currentTerm_; -} + { + std::lock_guard lk(mutex_); + initialRole = role_; + tr = TermCheckLocked(aer->term()); + if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(aer->term()); } + term = currentTerm_; + } if (demoted) { leader_election_manager_->OnRoleChange(); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted from " << (initialRole == Role::LEADER ? 
"LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; + return false; + } + else if (aer->success()) { + { + std::unique_lock lk(mutex_); + received_[aer->hash()].insert(aer->id()); + auto it = log_.find(aer->hash()); + if (it != log_.end()) { + LOG(INFO) << "Transaction: " << aer->prevlogindex() << " has gotten " << received_[aer->hash()].size() << " responses"; + if (static_cast(received_[aer->hash()].size()) >= f_ + 1) { + commitIndex_ = aer->prevlogindex(); + + // pretty sure this should always be in order with no gaps + while (lastApplied_ + 1 <= static_cast(log_.size()) && + lastApplied_ <= commitIndex_) { + LOG(INFO) << "Leader Committing"; + std::unique_ptr txnToCommit = nullptr; + txnToCommit = std::move(log_[logIndexMapping_[lastApplied_]]); + commit_(*txnToCommit); + lastApplied_++; + } + } + } + } + nextIndex_[aer->id()] = aer->nextentry(); + matchIndex_[aer->id()] = aer->lastapplied(); + return true; } - return false; -} + // handling for if leader is out of date and term is wrong + + // handling for if term is correct, but follower is just out of date + --nextIndex_[aer->id()]; + // send message + assert(false); + return true; +} void Raft::ReceiveRequestVote(std::unique_ptr rv) { int rvSender = rv->candidateid(); @@ -348,6 +439,16 @@ void Raft::SendHeartBeat() { Broadcast(MessageType::AppendEntriesMsg, appendEntries); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat " << heartBeatNum << " for term " << currentTerm; + + // Also ping client proxies that this is the leader + DirectToLeader dtl; + dtl.set_term(currentTerm); + dtl.set_leaderid(id_); + for (const auto& client : replica_communicator_->GetClientReplicas()) { + int id = client.id(); + SendMessage(DirectToLeaderMsg, dtl, id); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": DirectToLeader sent to " << id; + } } // requires raft mutex to be held diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index bbf06dce2a..a7a180a094 
100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -30,6 +30,7 @@ #include "platform/consensus/ordering/raft/proto/proposal.pb.h" #include "platform/statistic/stats.h" #include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" +#include "platform/networkstrate/replica_communicator.h" namespace resdb { namespace raft { @@ -41,7 +42,9 @@ class Raft : public common::ProtocolBase { public: Raft(int id, int f, int total_num, SignatureVerifier* verifier, - LeaderElectionManager* leaderelection_manager); + LeaderElectionManager* leaderelection_manager, + ReplicaCommunicator* replica_communicator + ); ~Raft(); bool ReceiveTransaction(std::unique_ptr txn); @@ -99,6 +102,8 @@ class Raft : public common::ProtocolBase { SignatureVerifier* verifier_; LeaderElectionManager* leader_election_manager_; Stats* global_stats_; + + ReplicaCommunicator* replica_communicator_; }; diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index cef9d58009..211a82dc36 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -45,7 +45,8 @@ Consensus::Consensus(const ResDBConfig& config, .public_key_info() .type() != CertificateKeyInfo::CLIENT) { raft_ = std::make_unique(config_.GetSelfInfo().id(), f, total_replicas, - GetSignatureVerifier(), leader_election_manager_.get()); + GetSignatureVerifier(), leader_election_manager_.get(), + replica_communicator_); leader_election_manager_->SetRaft(raft_.get()); leader_election_manager_->MayStart(); @@ -95,6 +96,17 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { raft_->ReceiveRequestVoteResponse(std::move(rvr)); return 0; } + else if (request->user_type() == MessageType::DirectToLeaderMsg) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": In DirectToLeader"; + std::unique_ptr dtl = 
std::make_unique(); + if (!dtl->ParseFromString(request->data())) { + LOG(ERROR) << "parse proposal fail"; + assert(1 == 0); + return -1; + } + performance_manager_->SetPrimary(dtl->leaderid()); + return 0; + } return 0; } diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index 7b6a7da466..ecdae113ec 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -57,11 +57,17 @@ message RequestVoteResponse { bool voteGranted = 3; } +message DirectToLeader { + int64 term = 1; + int32 leaderId = 2; +} + enum MessageType { None = 0; AppendEntriesMsg = 1; AppendEntriesResponseMsg = 2; RequestVoteMsg = 3; RequestVoteResponseMsg = 4; + DirectToLeaderMsg = 5; } diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index 16ecb0fdf9..ef8980fd29 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -6,6 +6,5 @@ "max_process_txn": 10000, "worker_num": 1, "input_worker_num": 1, - "output_worker_num": 10, - "not_need_signature": true + "output_worker_num": 10 } From 2a808dcbbc0fe7df74e5e562ad4e79e3b9fc3703 Mon Sep 17 00:00:00 2001 From: yhuan331 Date: Tue, 2 Dec 2025 18:02:57 +0000 Subject: [PATCH 24/66] Fix AppendEntries failure path to back off nextIndex and retry --- .../ordering/raft/algorithm/raft.cpp | 138 +++++++++++++++++- 1 file changed, 130 insertions(+), 8 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index e7a962191a..ccb3c8dc73 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -183,6 +183,13 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); return true; } +// testing purpose (will remove later ) + if (id_ == 2) { + 
LOG(INFO) << "TEST: Forcing AppendEntries Failure on follower " << id_; + appendEntriesResponse.set_success(false); + SendMessage(MessageType::AppendEntriesResponseMsg, appendEntriesResponse, leader_id); + return true; + } // Implement an entry existing but with a different term // delete that entry and all after it LOG(INFO) << "Before AppendEntriesMsg Added to Log"; @@ -231,25 +238,75 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { return true; } + +//===================================================== +// bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr response) { +// if (id_ != 1) +// return true; +// LOG(INFO) << "ReceiveAppendEntriesResponse"; +// auto followerId = response->id(); +// LOG(INFO) << "followerId: " << followerId; +// if (response->success()) { +// { +// std::unique_lock lk(mutex_); +// received_[response->hash()].insert(response->id()); +// auto it = log_.find(response->hash()); +// if (it != log_.end()) { +// LOG(INFO) << "Transaction: " << response->prevlogindex() << " has gotten " << received_[response->hash()].size() << " responses"; +// if (static_cast(received_[response->hash()].size()) >= f_ + 1) { +// commitIndex_ = response->prevlogindex(); + +// // pretty sure this should always be in order with no gaps +// while (lastApplied_ + 1 <= static_cast(log_.size()) && +// lastApplied_ <= commitIndex_) { +// LOG(INFO) << "Leader Committing"; +// std::unique_ptr txnToCommit = nullptr; +// txnToCommit = std::move(log_[logIndexMapping_[lastApplied_]]); +// commit_(*txnToCommit); +// lastApplied_++; +// } +// } +// } +// } +// nextIndex_[followerId] = response->nextentry(); +// matchIndex_[followerId] = response->lastapplied(); +// return true; +// } + +// // handling for if leader is out of date and term is wrong + +// // handling for if term is correct, but follower is just out of date +// --nextIndex_[followerId]; +// // send message +// assert(false); +// return true; +// } 
+//=====================UNTOUCHED================================ + bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr response) { if (id_ != 1) return true; + LOG(INFO) << "ReceiveAppendEntriesResponse"; auto followerId = response->id(); LOG(INFO) << "followerId: " << followerId; + + // ===================== SUCCESS CASE ===================== if (response->success()) { { std::unique_lock lk(mutex_); received_[response->hash()].insert(response->id()); auto it = log_.find(response->hash()); if (it != log_.end()) { - LOG(INFO) << "Transaction: " << response->prevlogindex() << " has gotten " << received_[response->hash()].size() << " responses"; + LOG(INFO) << "Transaction: " << response->prevlogindex() + << " has gotten " << received_[response->hash()].size() + << " responses"; if (static_cast(received_[response->hash()].size()) >= f_ + 1) { commitIndex_ = response->prevlogindex(); // pretty sure this should always be in order with no gaps while (lastApplied_ + 1 <= static_cast(log_.size()) && - lastApplied_ <= commitIndex_) { + lastApplied_ <= commitIndex_) { LOG(INFO) << "Leader Committing"; std::unique_ptr txnToCommit = nullptr; txnToCommit = std::move(log_[logIndexMapping_[lastApplied_]]); @@ -263,17 +320,82 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr r matchIndex_[followerId] = response->lastapplied(); return true; } - - // handling for if leader is out of date and term is wrong - // handling for if term is correct, but follower is just out of date - --nextIndex_[followerId]; - // send message - assert(false); + // ===================== FAILURE CASE ===================== + // (term-mismatch leader demotion is TODO; here we handle follower being out-of-date) + + LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << followerId; + + { + std::unique_lock lk(mutex_); + + // Move nextIndex one step back, but don't go below 1 + if (nextIndex_[followerId] > 1) { + --nextIndex_[followerId]; + } else { + nextIndex_[followerId] = 1; + 
} + + int resendIndex = nextIndex_[followerId]; + LOG(INFO) << "Updated nextIndex_ for follower " << followerId + << " to " << resendIndex; + + // Check that we actually have an entry at this index + if (resendIndex < 0 || + resendIndex >= static_cast(logIndexMapping_.size())) { + LOG(INFO) << "No log entry at index " << resendIndex + << " to resend; logIndexMapping_.size() = " + << logIndexMapping_.size(); + return true; + } + + const std::string& key = logIndexMapping_[resendIndex]; + auto it = log_.find(key); + if (it == log_.end() || !it->second) { + LOG(WARNING) << "Log entry missing in map for key at index " + << resendIndex; + return true; + } + + // Build a new AppendEntries message based on the stored log entry + AppendEntries resend; + resend.CopyFrom(*(it->second)); // copies hash, entries, uid, proxy_id, etc. + + // Make sure RAFT fields are consistent with our current state + resend.set_term(currentTerm_); + resend.set_leaderid(id_); + + // prevLogIndex = index immediately before resendIndex + int64_t prevIdx = (resendIndex == 0 ? 
0 : resendIndex - 1); + resend.set_prevlogindex(prevIdx); + + // prevLogTerm = term of the entry at prevIdx (or 0 if none) + uint64_t prevTerm = 0; + if (prevIdx >= 0 && prevIdx < static_cast(logIndexMapping_.size())) { + const std::string& prevKey = logIndexMapping_[prevIdx]; + auto itPrev = log_.find(prevKey); + if (itPrev != log_.end() && itPrev->second) { + prevTerm = itPrev->second->term(); + } + } + resend.set_prevlogterm(prevTerm); + + // leaderCommitIndex + resend.set_leadercommitindex(commitIndex_); + + LOG(INFO) << "Resending AppendEntries for index " << resendIndex + << " (prevIdx=" << prevIdx + << ", prevTerm=" << prevTerm + << ") to follower " << followerId; + + SendMessage(MessageType::AppendEntriesMsg, resend, followerId); + } + return true; } + void Raft::ReceiveRequestVote(std::unique_ptr rv) { LOG(INFO) << "JIM -> " << __FUNCTION__; int rvSender = rv->candidateid(); From 6f626ede42f84ee51f8668c15d806e86cae86aaf Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Wed, 3 Dec 2025 00:06:53 +0000 Subject: [PATCH 25/66] redirect to leader working even on later terms --- .../common/framework/performance_manager.cpp | 35 +++++++++------ .../common/framework/performance_manager.h | 2 +- .../raft/algorithm/leaderelection_manager.cpp | 22 +++++----- .../ordering/raft/algorithm/raft.cpp | 44 +++++++++---------- .../ordering/raft/framework/consensus.cpp | 6 +-- 5 files changed, 58 insertions(+), 51 deletions(-) diff --git a/platform/consensus/ordering/common/framework/performance_manager.cpp b/platform/consensus/ordering/common/framework/performance_manager.cpp index 844196ad5c..160c18fc3a 100644 --- a/platform/consensus/ordering/common/framework/performance_manager.cpp +++ b/platform/consensus/ordering/common/framework/performance_manager.cpp @@ -52,8 +52,8 @@ PerformanceManager::PerformanceManager( total_num_ = 0; replica_num_ = config_.GetReplicaNum(); id_ = config_.GetSelfInfo().id(); - primary_ = id_ % replica_num_; - if (primary_ == 0) primary_ = 
replica_num_; + primary_.store(id_ % replica_num_); + if (primary_ == 0) primary_.store(replica_num_); local_id_ = 1; sum_ = 0; } @@ -67,10 +67,16 @@ PerformanceManager::~PerformanceManager() { } } -int PerformanceManager::GetPrimary() { return primary_; } +int PerformanceManager::GetPrimary() { return primary_.load(); } + void PerformanceManager::SetPrimary(int id) { - primary_ = id; - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": primary updated to " << primary_; + int curr_primary = primary_.load(); + while (id != curr_primary) { + if (primary_.compare_exchange_strong(curr_primary, id)) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": primary updated to " << id; + return; + } + } } std::unique_ptr PerformanceManager::GenerateUserRequest() { @@ -88,15 +94,17 @@ int PerformanceManager::StartEval() { return 0; } eval_started_ = true; - for (int i = 0; i < 100000000; ++i) { - std::unique_ptr queue_item = std::make_unique(); - queue_item->context = nullptr; - queue_item->user_request = GenerateUserRequest(); - batch_queue_.Push(std::move(queue_item)); - if (i == 2000000) { - eval_ready_promise_.set_value(true); + std::thread([&](){ + for (int i = 0; i < 100000000; ++i) { + std::unique_ptr queue_item = std::make_unique(); + queue_item->context = nullptr; + queue_item->user_request = GenerateUserRequest(); + batch_queue_.Push(std::move(queue_item)); + if (i == 2000000) { + eval_ready_promise_.set_value(true); + } } - } +}).detach(); return 0; } @@ -273,6 +281,7 @@ int PerformanceManager::DoBatch( void PerformanceManager::SendMessage(const Request& request) { replica_communicator_->SendMessage(request, GetPrimary()); + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Sent to replica " << GetPrimary(); } } // namespace common diff --git a/platform/consensus/ordering/common/framework/performance_manager.h b/platform/consensus/ordering/common/framework/performance_manager.h index 5e2f453173..990fd6f742 100644 --- 
a/platform/consensus/ordering/common/framework/performance_manager.h +++ b/platform/consensus/ordering/common/framework/performance_manager.h @@ -89,7 +89,7 @@ class PerformanceManager { std::mutex response_lock_[response_set_size_]; int replica_num_; int id_; - int primary_; + std::atomic primary_; std::atomic local_id_; std::atomic sum_; }; diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp index 9ecad97e6b..ddf3f42ee8 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp @@ -37,9 +37,9 @@ LeaderElectionManager::LeaderElectionManager(const ResDBConfig& config) raft_(nullptr), started_(false), stop_(false), - timeout_min_ms(800), - timeout_max_ms(1600), - heartbeat_timer_(50), + timeout_min_ms(1200), + timeout_max_ms(2400), + heartbeat_timer_(100), heartbeat_count_(0), //last_heartbeat_time_(std::chrono::steady_clock::now()), role_epoch_(0), @@ -74,7 +74,7 @@ void LeaderElectionManager::MayStart() { } if (config_.GetConfigData().enable_viewchange()) { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Starting MonitoringElectionTimeout thread."; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Starting MonitoringElectionTimeout thread."; server_checking_timeout_thread_ = std::thread(&LeaderElectionManager::MonitoringElectionTimeout, this); } @@ -165,19 +165,19 @@ void LeaderElectionManager::MonitoringElectionTimeout() { while (!stop_.load()) { raft::Role role = raft_->GetRoleSnapshot(); Waited res; - std::chrono::steady_clock::time_point wait_start_time_ = std::chrono::steady_clock::now(); - bool leader = false; + //std::chrono::steady_clock::time_point wait_start_time_ = std::chrono::steady_clock::now(); + //bool leader = false; if (role == raft::Role::LEADER) { res = LeaderWait(); - leader = true; + //leader = true; } else { res = Wait(); } - 
std::chrono::steady_clock::time_point wait_end_time_ = std::chrono::steady_clock::now(); - std::chrono::steady_clock::duration delta = wait_end_time_ - wait_start_time_; - auto ms = std::chrono::duration_cast(delta).count(); - LOG(INFO) << __FUNCTION__ << ": " << (leader ? "Leader" : "") << "Wait " << ms << "ms"; + //std::chrono::steady_clock::time_point wait_end_time_ = std::chrono::steady_clock::now(); + //std::chrono::steady_clock::duration delta = wait_end_time_ - wait_start_time_; + //auto ms = std::chrono::duration_cast(delta).count(); + //LOG(INFO) << __FUNCTION__ << ": " << (leader ? "Leader" : "") << "Wait " << ms << "ms"; if (res == Waited::STOPPED) { break; } else if (res == Waited::ROLE_CHANGE) { //LOG(INFO) << __FUNCTION__ << ": Role change detected"; diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 66fcc3b89c..ac5511eb76 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -70,8 +70,6 @@ Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } bool Raft::ReceiveTransaction(std::unique_ptr txn) { - - LOG(INFO) << "JIM -> " << __FUNCTION__; { std::lock_guard lk(mutex_); if (role_ != Role::LEADER) { @@ -79,7 +77,6 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { return false; } } - LOG(INFO) << "JIM -> " << __FUNCTION__; // LOG(INFO)<<"recv txn:"; LOG(INFO) << "Received Transaction to primary id: " << id_; LOG(INFO) << "prevLogIndex: " << prevLogIndex_; @@ -108,12 +105,12 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { TermRelation tr; Role initialRole; - auto now = std::chrono::steady_clock::now(); - std::chrono::steady_clock::duration delta; - delta = now - last_ae_time_; - last_ae_time_ = now; - auto ms = std::chrono::duration_cast(delta).count(); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": AE received after " << ms << "ms"; + //auto now = std::chrono::steady_clock::now(); 
+ //std::chrono::steady_clock::duration delta; + //delta = now - last_ae_time_; + //last_ae_time_ = now; + //auto ms = std::chrono::duration_cast(delta).count(); + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": AE received after " << ms << "ms"; { initialRole = role_; std::lock_guard lk(mutex_); @@ -135,20 +132,14 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { << (initialRole == Role::LEADER ? "LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; } if (tr != TermRelation::STALE) { leader_election_manager_->OnHeartBeat(); } - AppendEntriesResponse aer; - aer.set_term(term); - aer.set_success(success); - aer.set_id(id_); - if (success) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; } - else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } - SendMessage(MessageType::AppendEntriesResponseMsg, aer, txn->leaderid()); - //return false; // Implement an entry existing but with a different term // delete that entry and all after it LOG(INFO) << "Before AppendEntriesMsg Added to Log"; std::string hash = txn->hash(); int64_t prevLogIndex = txn->prevlogindex(); + auto leaderCommit = txn->leadercommitindex(); + auto leaderId = txn->leaderid(); { std::unique_lock lk(mutex_); std::string hash = txn->hash(); @@ -158,7 +149,7 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { logIndexMapping_.push_back(hash); lastApplied_++; } - auto leaderCommit = txn->leadercommitindex(); + LOG(INFO) << "AppendEntriesMsg Added to Log"; LOG(INFO) << "leaderCommit: " << leaderCommit; LOG(INFO) << "commitIndex_: " << commitIndex_; @@ -180,15 +171,22 @@ bool Raft::ReceivePropose(std::unique_ptr txn) { // not 100% certain if this second variable should be prevLogIndex commitIndex_ = std::min(leaderCommit, prevLogIndex); + LOG(INFO) << "after commit index check"; + AppendEntriesResponse aer; + aer.set_term(term); + aer.set_success(success); + aer.set_id(id_); aer.set_lastapplied(lastApplied_); aer.set_nextentry(log_.size()); aer.set_success(true); 
aer.set_hash(hash); aer.set_prevlogindex(prevLogIndex); - LOG(INFO) << "Leader_id: " << txn->leaderid(); + LOG(INFO) << "Leader_id: " << leaderId; leader_election_manager_->OnHeartBeat(); - SendMessage(MessageType::AppendEntriesResponseMsg, aer, txn->leaderid()); + SendMessage(MessageType::AppendEntriesResponseMsg, aer, leaderId); + if (success) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; } + else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } return true; } @@ -428,7 +426,7 @@ void Raft::SendHeartBeat() { last_heartbeat_time_ = now; } auto ms = std::chrono::duration_cast(delta).count(); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat sent after " << ms << "ms"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat sent after " << ms << "ms"; AppendEntries appendEntries; appendEntries.set_term(currentTerm); appendEntries.set_leaderid(leaderId); @@ -438,7 +436,7 @@ void Raft::SendHeartBeat() { appendEntries.set_leadercommitindex(leaderCommit); // TODO Broadcast(MessageType::AppendEntriesMsg, appendEntries); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat " << heartBeatNum << " for term " << currentTerm; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat " << heartBeatNum << " for term " << currentTerm; // Also ping client proxies that this is the leader DirectToLeader dtl; @@ -447,7 +445,7 @@ void Raft::SendHeartBeat() { for (const auto& client : replica_communicator_->GetClientReplicas()) { int id = client.id(); SendMessage(DirectToLeaderMsg, dtl, id); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": DirectToLeader sent to " << id; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": DirectToLeader " << id_ << " sent to proxy " << id; } } diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 211a82dc36..c42d82e053 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ 
b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -97,7 +97,7 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { return 0; } else if (request->user_type() == MessageType::DirectToLeaderMsg) { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": In DirectToLeader"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": In DirectToLeader"; std::unique_ptr dtl = std::make_unique(); if (!dtl->ParseFromString(request->data())) { LOG(ERROR) << "parse proposal fail"; @@ -116,8 +116,8 @@ int Consensus::ProcessNewTransaction(std::unique_ptr request) { txn->set_hash(request->hash()); txn->set_proxy_id(request->proxy_id()); txn->set_uid(request->uid()); - return raft_->ReceiveTransaction(std::move(txn)); -} + return raft_->ReceiveTransaction(std::move(txn)); + } int Consensus::CommitMsg(const google::protobuf::Message& msg) { return CommitMsgInternal(dynamic_cast(msg)); From d8bb72de61142f53a659f93ea5cbf8366e02c080 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Thu, 4 Dec 2025 09:41:17 +0000 Subject: [PATCH 26/66] seems to kind of work. worried about duplicate entries? 
--- .../raft/algorithm/leaderelection_manager.cpp | 2 +- .../ordering/raft/algorithm/raft.cpp | 285 ++++++++---------- .../consensus/ordering/raft/algorithm/raft.h | 58 ++-- .../ordering/raft/proto/proposal.proto | 4 +- scripts/deploy/config/raft.config | 2 +- 5 files changed, 151 insertions(+), 200 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp index ddf3f42ee8..eedbb6e738 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp @@ -184,7 +184,7 @@ void LeaderElectionManager::MonitoringElectionTimeout() { continue; } else if (res == Waited::HEARTBEAT) { - LOG(INFO) << __FUNCTION__ << ": Heartbeat received within window"; + //LOG(INFO) << __FUNCTION__ << ": Heartbeat received within window"; if (raft_->GetRoleSnapshot() == raft::Role::LEADER) { // A leader receiving a heartbeat would be unusual but not impossible. 
LOG(WARNING) << __FUNCTION__ << " Received Heartbeat as LEADER"; diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index dc05f09676..282e40ddb0 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -34,35 +34,32 @@ namespace raft { Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, LeaderElectionManager* leaderelection_manager, ReplicaCommunicator* replica_communicator) - : ProtocolBase(id, f, total_num), + : ProtocolBase(id, f, total_num), + currentTerm_(0), + votedFor_(-1), + commitIndex_(0), + lastApplied_(0), role_(raft::Role::FOLLOWER), + lastLogIndex_(0), + is_stop_(false), + quorum_((total_num/2) + 1), verifier_(verifier), leader_election_manager_(leaderelection_manager), replica_communicator_(replica_communicator) { + id_ = id; total_num_ = total_num; f_ = (total_num-1)/2; - is_stop_ = false; - prevLogIndex_ = 0; - currentTerm_ = 0; - votedFor_ = -1; - commitIndex_ = 0; - lastApplied_ = 0; last_ae_time_ = std::chrono::steady_clock::now(); last_heartbeat_time_ = std::chrono::steady_clock::now(); - AppendEntries ae; - ae.set_leaderid(0); - ae.set_prevlogindex(0); - ae.set_prevlogterm(0); - ae.set_leadercommitindex(0); - ae.set_term(0); - const std::string key = "7622832959"; - logIndexMapping_.push_back(key); - log_[key] = std::make_unique(ae); - - nextIndex_.assign(total_num_ + 1, logIndexMapping_.size()); - matchIndex_.assign(total_num_ + 1, 0); + auto sentinel = std::make_unique(); + sentinel->set_term(0); + sentinel->set_entries("COMMON_PREFIX"); + log_.push_back(std::move(sentinel)); + + nextIndex_.assign(total_num_ + 1, lastLogIndex_ + 1); + matchIndex_.assign(total_num_ + 1, lastLogIndex_); } Raft::~Raft() { is_stop_ = true; } @@ -81,30 +78,31 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { // Inform client proxy of new leader? // Redirect transaction to a known leader? 
LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Replica is not leader, returning early"; - return false; + return false; } // prepare fields for appendEntries message term = currentTerm_; - prevLogIndex = getPrevLogIndexLocked(); - prevLogTerm = getPrevLogTermLocked(); + prevLogIndex = lastLogIndex_; + prevLogTerm = getLastLogTermLocked(); leaderCommit = commitIndex_; // append new transaction to log txn->set_term(term); ae.CopyFrom(*txn); - logIndexMapping_.push_back(txn->hash()); - log_[txn->hash()] = std::move(txn); - prevLogIndex_++; // ?? + log_.push_back(std::move(txn)); + // TODO // durably store the new entry somehow // otherwise it is a safety violation to treat it as "appended" // should not be sending AEs before durable. + + lastLogIndex_++; + nextIndex_[id_] = lastLogIndex_ + 1; + matchIndex_[id_] = lastLogIndex_; } - LOG(INFO) << "Received Transaction to primary id: " << id_; - LOG(INFO) << "prevLogIndex: " << prevLogIndex; + //LOG(INFO) << "Received Transaction to primary id: " << id_; ae.set_create_time(GetCurrentTime()); - // This should be a term for each entry, but assuming no failure at first ae.set_leaderid(id_); ae.set_prevlogindex(prevLogIndex); ae.set_prevlogterm(prevLogTerm); @@ -123,7 +121,7 @@ bool Raft::ReceivePropose(std::unique_ptr ae) { bool demoted = false; TermRelation tr; Role initialRole; - uint64_t prevLogIndex = ae->prevlogindex(); + uint64_t lastLogIndex; auto leaderCommit = ae->leadercommitindex(); auto leaderId = ae->leaderid(); std::string hash = ae->hash(); @@ -132,15 +130,14 @@ bool Raft::ReceivePropose(std::unique_ptr ae) { [&]() { std::lock_guard lk(mutex_); initialRole = role_; + lastLogIndex = lastLogIndex_; tr = TermCheckLocked(ae->term()); if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(ae->term()); } - else if (role_ == Role::CANDIDATE && tr == TermRelation::CURRENT) { demoted = DemoteSelfLocked(ae->term()); } + else if (role_ != Role::FOLLOWER && tr == TermRelation::CURRENT) { demoted = 
DemoteSelfLocked(ae->term()); } + if (tr != TermRelation::STALE && role_ == Role::FOLLOWER) { uint64_t i = ae->prevlogindex(); - if (i < logIndexMapping_.size()) { - const std::string& key = logIndexMapping_[i]; - if (ae->prevlogterm() == log_[key]->term()) { success = true; } - } + if (i < static_cast(log_.size()) && ae->prevlogterm() == log_[i]->term()) { success = true; } } term = currentTerm_; if (!success) { return; } @@ -152,29 +149,30 @@ bool Raft::ReceivePropose(std::unique_ptr ae) { if (ae->entries() != "") { // TODO Implement an entry existing but with a different term // delete that entry and all after it - logIndexMapping_.push_back(hash); - log_[hash] = std::move(ae); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Appended to log at index" << getPrevLogIndexLocked(); + log_.push_back(std::move(ae)); + lastLogIndex_++; + lastLogIndex = lastLogIndex_; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Appended to log at index" << lastLogIndex_; // have to actually store the entry durably before it can be considered "appended" } // heartbeat case else { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": This is a heartbeat, should not append"; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": This is a heartbeat, should not append"; } // common case uint64_t prevCommitIndex = commitIndex_; if (leaderCommit > commitIndex_) { - commitIndex_ = std::min(leaderCommit, getPrevLogIndexLocked()); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Raised commitIndex_ from " - << prevCommitIndex << " to " << commitIndex_; + commitIndex_ = std::min(leaderCommit, lastLogIndex_); + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Raised commitIndex_ from " + // << prevCommitIndex << " to " << commitIndex_; } // apply any newly committed entries to state machine while (commitIndex_ > lastApplied_) { lastApplied_++; - eToApply.push_back(log_[logIndexMapping_[lastApplied_]].get()); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " << lastApplied_; + 
eToApply.push_back(log_[lastApplied_].get()); + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " << lastApplied_; } }(); @@ -211,69 +209,60 @@ bool Raft::ReceivePropose(std::unique_ptr ae) { aer.set_term(term); aer.set_success(success); aer.set_id(id_); - //aer.set_nextentry(log_.size()); - aer.set_hash(hash); // this seems odd - aer.set_prevlogindex(prevLogIndex); + aer.set_lastlogindex(lastLogIndex); SendMessage(MessageType::AppendEntriesResponseMsg, aer, leaderId); - if (success) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; } - else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } + //if (success) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; } + //else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } return true; } bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr aer) { -uint64_t term; -bool demoted = false; -TermRelation tr; -Role initialRole; - { + uint64_t term; + bool demoted = false; + bool resending = false; + TermRelation tr; + Role initialRole; + std::vector eToApply; + AppendEntries resend; + + [&]() { std::lock_guard lk(mutex_); initialRole = role_; tr = TermCheckLocked(aer->term()); if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(aer->term()); } term = currentTerm_; - } - if (demoted) { - leader_election_manager_->OnRoleChange(); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted from " - << (initialRole == Role::LEADER ? 
"LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; - return false; - } - // ===================== SUCCESS CASE ===================== - else if (aer->success()) { - { - std::unique_lock lk(mutex_); - received_[aer->hash()].insert(aer->id()); - auto it = log_.find(aer->hash()); - if (it != log_.end()) { - LOG(INFO) << "Transaction: " << aer->prevlogindex() - << " has gotten " << received_[aer->hash()].size() - << " responses"; - if (static_cast(received_[aer->hash()].size()) >= static_cast(total_num_ - f_)) { - commitIndex_ = aer->prevlogindex(); - - // pretty sure this should always be in order with no gaps - while (lastApplied_ + 1 <= static_cast(log_.size()) && - lastApplied_ <= commitIndex_) { - LOG(INFO) << "Leader Committing"; - std::unique_ptr txnToCommit = nullptr; - txnToCommit = std::move(log_[logIndexMapping_[lastApplied_]]); // suspicious? - commit_(*txnToCommit); - lastApplied_++; - } - } - } - } - nextIndex_[aer->id()] = aer->nextentry(); - matchIndex_[aer->id()] = aer->lastapplied(); - return true; - } - // ===================== FAILURE CASE ===================== - else if (!aer->success()) { - LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << aer->id(); - { - std::unique_lock lk(mutex_); + if (role_ != Role::LEADER || tr == TermRelation::STALE) { return; } + + // ===================== SUCCESS CASE ===================== + if (aer->success()) { + nextIndex_[aer->id()] = aer->lastlogindex() + 1; + + // need to ensure matchIndex never decreases even if followers lastLogIndex decreases + matchIndex_[aer->id()] = std::max(matchIndex_[aer->id()], aer->lastlogindex()); + + // use updated matchIndex to find new entries eligible for commit + std::vector sorted = matchIndex_; + std::sort(sorted.begin(), sorted.end(), std::greater()); + uint64_t lastReplicatedIndex = sorted[quorum_ - 1]; + + // Need to check the lastReplicatedIndex contains entry from current term + if (lastReplicatedIndex > commitIndex_ && 
log_[lastReplicatedIndex]->term() == currentTerm_) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Raised commitIndex_ from " + << commitIndex_ << " to " << lastReplicatedIndex; + commitIndex_ = lastReplicatedIndex; + } + // apply any newly committed entries to state machine + while (commitIndex_ > lastApplied_) { + lastApplied_++; + eToApply.push_back(log_[lastApplied_].get()); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " << lastApplied_; + } + } + // ===================== FAILURE CASE ===================== + else { + LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << aer->id(); // Move nextIndex one step back, but don't go below 1 if (nextIndex_[aer->id()] > 1) { --nextIndex_[aer->id()]; @@ -286,59 +275,46 @@ Role initialRole; << " to " << resendIndex; // Check that we actually have an entry at this index - if (resendIndex < 0 || - resendIndex >= static_cast(logIndexMapping_.size())) { // check for off by 1 + if (resendIndex == 0 || resendIndex > lastLogIndex_) { LOG(INFO) << "No log entry at index " << resendIndex - << " to resend; logIndexMapping_.size() = " - << logIndexMapping_.size(); - return false; - } - - const std::string& key = logIndexMapping_[resendIndex]; - auto it = log_.find(key); - if (it == log_.end() || !it->second) { - LOG(WARNING) << "Log entry missing in map for key at index " - << resendIndex; - return false; + << " to resend; lastLogIndex_ = " + << lastLogIndex_; + return; } // Build a new AppendEntries message based on the stored log entry - AppendEntries resend; - resend.CopyFrom(*(it->second)); // copies hash, entries, uid, proxy_id, etc. - + resend.CopyFrom(*log_[resendIndex]); // copies hash, entries, uid, proxy_id, etc. 
// Make sure RAFT fields are consistent with our current state - //resend.set_term(currentTerm_); + resend.set_term(currentTerm_); // TODO LEAKY ABSTRACTION ON APPENDENTRY ABUSE resend.set_leaderid(id_); - // prevLogIndex = index immediately before resendIndex - uint64_t prevIdx = (resendIndex == 0 ? 0 : resendIndex - 1); + uint64_t prevIdx = resendIndex - 1; resend.set_prevlogindex(prevIdx); - // prevLogTerm = term of the entry at prevIdx (or 0 if none) - uint64_t prevTerm = 0; - if (prevIdx >= 0 && prevIdx < static_cast(logIndexMapping_.size())) { // off by 1 check - const std::string& prevKey = logIndexMapping_[prevIdx]; - auto itPrev = log_.find(prevKey); - if (itPrev != log_.end() && itPrev->second) { - prevTerm = itPrev->second->term(); - } - } + uint64_t prevTerm = log_[prevIdx]->term(); resend.set_prevlogterm(prevTerm); - // leaderCommitIndex resend.set_leadercommitindex(commitIndex_); + resending = true; LOG(INFO) << "Resending AppendEntries for index " << resendIndex << " (prevIdx=" << prevIdx << ", prevTerm=" << prevTerm << ") to follower " << aer->id(); - - SendMessage(MessageType::AppendEntriesMsg, resend, aer->id()); } + }(); + if (demoted) { + leader_election_manager_->OnRoleChange(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted from " + << (initialRole == Role::LEADER ? 
"LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; + return false; + } + if (resending) { SendMessage(MessageType::AppendEntriesMsg, resend, aer->id()); } - return true; + for (auto e : eToApply) { + commit_(*e); } - return false; + return true; } void Raft::ReceiveRequestVote(std::unique_ptr rv) { @@ -368,14 +344,11 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { // Then we continue voting process term = currentTerm_; votedFor = votedFor_; - uint64_t lastLogTerm = getPrevLogTermLocked(); - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": prev terms at least equal"; + uint64_t lastLogTerm = getLastLogTermLocked(); if (rv->lastlogterm() < lastLogTerm) { return; } - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": prev log terms at least equal"; if (rv->lastlogterm() == lastLogTerm - && rv->lastlogindex() < getPrevLogIndexLocked()) { return; } + && rv->lastlogindex() < lastLogIndex_) { return; } validCandidate = true; - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": candidate is valid"; if (votedFor_ == -1 || votedFor_ == rvSender) { votedFor_ = rvSender; voteGranted = true; @@ -409,7 +382,6 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) bool votedYes = rvr->votegranted(); bool demoted = false; bool elected = false; - uint64_t votesNeeded = static_cast(total_num_ - f_); Role initialRole; [&]() { @@ -427,12 +399,15 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) if (dupe) { return; } votes_.push_back(voterId); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Replica " << voterId << " voted for me. 
Votes: " - << votes_.size() << "/" << votesNeeded << " in term " << currentTerm_; - if (votes_.size() >= votesNeeded) { + << votes_.size() << "/" << quorum_ << " in term " << currentTerm_; + if (votes_.size() >= quorum_) { elected = true; role_ = Role::LEADER; - nextIndex_.assign(total_num_ + 1, logIndexMapping_.size()); + nextIndex_.assign(total_num_ + 1, lastLogIndex_ + 1); + + // make sure to set leaders own matchIndex entry to lastLogIndex matchIndex_.assign(total_num_ + 1, 0); + matchIndex_[id_] = lastLogIndex_; LOG(INFO) << "JIM -> " << __FUNCTION__ << ": CANDIDATE->LEADER in term " << currentTerm_; } }(); @@ -473,12 +448,12 @@ void Raft::StartElection() { votes_.clear(); votes_.push_back(id_); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": I voted for myself. Votes: " - << votes_.size() << "/" << (total_num_ - f_) << " in term " << currentTerm_; + << votes_.size() << "/" << quorum_ << " in term " << currentTerm_; currentTerm = currentTerm_; candidateId = id_; - lastLogIndex = getPrevLogIndexLocked(); - lastLogTerm = getPrevLogTermLocked(); + lastLogIndex = lastLogIndex_; + lastLogTerm = getLastLogTermLocked(); } if (roleChanged) { leader_election_manager_->OnRoleChange(); @@ -493,8 +468,6 @@ void Raft::StartElection() { Broadcast(MessageType::RequestVoteMsg, requestVote); } -// TODOjim -// ON MERGE FIX VALUES void Raft::SendHeartBeat() { uint64_t currentTerm; int leaderId = id_; @@ -516,10 +489,10 @@ void Raft::SendHeartBeat() { //heartBeatsSentThisTerm_++; //heartBeatNum = heartBeatsSentThisTerm_; currentTerm = currentTerm_; - prevLogIndex = getPrevLogIndexLocked(); - prevLogTerm = getPrevLogTermLocked(); + prevLogIndex = lastLogIndex_; + prevLogTerm = getLastLogTermLocked(); entries = ""; - leaderCommit = 0; // TODO + leaderCommit = commitIndex_; // delta = now - last_heartbeat_time_; // last_heartbeat_time_ = now; @@ -529,7 +502,7 @@ void Raft::SendHeartBeat() { AppendEntries appendEntries; appendEntries.set_term(currentTerm); 
appendEntries.set_leaderid(leaderId); - appendEntries.set_prevlogindex(prevLogIndex); + appendEntries.set_prevlogindex(prevLogIndex); // TODO appendEntries.set_prevlogterm(prevLogTerm); appendEntries.set_entries(entries); appendEntries.set_leadercommitindex(leaderCommit); @@ -551,8 +524,10 @@ void Raft::SendHeartBeat() { // requires raft mutex to be held // returns true if demoted bool Raft::DemoteSelfLocked(uint64_t term) { - currentTerm_ = term; - votedFor_ = -1; + if (term > currentTerm_) { + currentTerm_ = term; + votedFor_ = -1; + } if (role_ != raft::Role::FOLLOWER) { role_ = raft::Role::FOLLOWER; //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted to FOLLOWER"; @@ -569,20 +544,8 @@ TermRelation Raft::TermCheckLocked(uint64_t term) const { } // requires raft mutex to be held -// TODO Reuse for other cases? -uint64_t Raft::getPrevLogIndexLocked() const { - return logIndexMapping_.size() - 1; -} - -// requires raft mutex to be held -uint64_t Raft::getPrevLogTermLocked() const { - if (logIndexMapping_.empty()) { return 0; } - const std::string& key = logIndexMapping_.back(); - auto it = log_.find(key); - if (it == log_.end() || !it->second) { - LOG(FATAL) << __FUNCTION__ << ": inconsistency found between log vector and log map"; - } - return it->second->term(); +uint64_t Raft::getLastLogTermLocked() const { + return log_[lastLogIndex_]->term(); } } // namespace raft diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 82b809da9c..87070ef9c5 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -19,8 +19,11 @@ #pragma once +#include +#include #include #include +#include #include #include #include @@ -61,51 +64,38 @@ class Raft : public common::ProtocolBase { private: TermRelation TermCheckLocked(uint64_t term) const; // Must be called under mutex bool DemoteSelfLocked(uint64_t term); // Must be called under mutex - uint64_t 
getPrevLogIndexLocked() const; // Must be called under mutex - uint64_t getPrevLogTermLocked() const; // Must be called under mutex + uint64_t getLastLogTermLocked() const; // Must be called under mutex bool IsStop(); private: mutable std::mutex mutex_; - std::map > received_; - std::map > log_; // log[] - - std::vector logIndexMapping_; - - // This is for everyone - // Most recent term it has seen - uint64_t currentTerm_; // Protected by raft_mutex_ - // Id for vote in current Term - int votedFor_; // Protected by raft_mutex_ - - // Volatile on all servers - // Index of highest log entry it knows to be committed - uint64_t commitIndex_; // Protected by raft_mutex_ - // Index of highest log entry executed - uint64_t lastApplied_; // Protected by raft_mutex_ - - // Only for leaders - // This keeps track of the next log entry to send to that replica - // Initialized to last log index + 1 - std::vector nextIndex_; // Protected by raft_mutex_ - // This keeps track of the highest log entry it knows is executed on that replica - std::vector matchIndex_; // Protected by raft_mutex_ - Role role_; // Protected by raft_mutex_ - int LeaderId; // Protected by raft_mutex_ - std::vector votes_; // Protected by raft_mutex_ - uint64_t heartBeatsSentThisTerm_; // Protected by raft_mutex_ + + // Persistent state on all servers: + uint64_t currentTerm_; // Protected by mutex_ + int votedFor_; // Protected by mutex_ + std::vector> log_; // Protected by mutex_ + + // Volatile state on leaders: + std::vector nextIndex_; // Protected by mutex_ + std::vector matchIndex_; // Protected by mutex_ + uint64_t heartBeatsSentThisTerm_; // Protected by mutex_ + + // Volatile state on all servers: + uint64_t commitIndex_; // Protected by mutex_ + uint64_t lastApplied_; // Protected by mutex_ + Role role_; // Protected by mutex_ + int LeaderId; // Protected by mutex_ + std::vector votes_; // Protected by mutex_ std::chrono::steady_clock::time_point last_ae_time_; - 
std::chrono::steady_clock::time_point last_heartbeat_time_; // Protected by raft_mutex_ + std::chrono::steady_clock::time_point last_heartbeat_time_; // Protected by mutex_ - uint64_t prevLogIndex_; + uint64_t lastLogIndex_; bool is_stop_; + const uint64_t quorum_; SignatureVerifier* verifier_; LeaderElectionManager* leader_election_manager_; Stats* global_stats_; - ReplicaCommunicator* replica_communicator_; - - }; } // namespace raft diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index d276ccfe3c..5702aaf167 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -39,9 +39,7 @@ message AppendEntriesResponse { bool success = 2; int32 id = 3; uint64 lastApplied = 4; - uint64 nextEntry = 5; - bytes hash = 6; - uint64 prevLogIndex = 7; + uint64 lastLogIndex = 5; } message RequestVote { diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index ef8980fd29..d1440610ac 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -1,5 +1,5 @@ { - "clientBatchNum": 100, + "clientBatchNum": 1, "enable_viewchange": true, "recovery_enabled": false, "max_client_complaint_num":10, From cbed2129621de06d586a79ddd5d058fc9adbfa97 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Thu, 4 Dec 2025 21:23:12 +0000 Subject: [PATCH 27/66] modified some logging, changed config --- scripts/deploy/config/raft.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index d1440610ac..888f4ba813 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -3,7 +3,7 @@ "enable_viewchange": true, "recovery_enabled": false, "max_client_complaint_num":10, - "max_process_txn": 10000, + "max_process_txn": 256, "worker_num": 1, "input_worker_num": 1, "output_worker_num": 10 From 
eb9692ffcdbd4bac7153a528482574105e8316b7 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Fri, 5 Dec 2025 00:20:21 +0000 Subject: [PATCH 28/66] moved some variables around in header, changed receivePropose to ReceiveAppendEntries --- platform/consensus/ordering/raft/algorithm/raft.cpp | 8 ++++---- platform/consensus/ordering/raft/algorithm/raft.h | 11 +++++------ .../consensus/ordering/raft/framework/consensus.cpp | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 282e40ddb0..46561acdf3 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -37,10 +37,10 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, : ProtocolBase(id, f, total_num), currentTerm_(0), votedFor_(-1), + lastLogIndex_(0), commitIndex_(0), lastApplied_(0), role_(raft::Role::FOLLOWER), - lastLogIndex_(0), is_stop_(false), quorum_((total_num/2) + 1), verifier_(verifier), @@ -102,7 +102,7 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { } //LOG(INFO) << "Received Transaction to primary id: " << id_; - ae.set_create_time(GetCurrentTime()); + ae.set_create_time(GetCurrentTime()); // TODO: figure this out ae.set_leaderid(id_); ae.set_prevlogindex(prevLogIndex); ae.set_prevlogterm(prevLogTerm); @@ -114,7 +114,7 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { return true; } -bool Raft::ReceivePropose(std::unique_ptr ae) { +bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { if (ae->leaderid() == id_) { return false; } uint64_t term; bool success = false; @@ -161,7 +161,7 @@ bool Raft::ReceivePropose(std::unique_ptr ae) { //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": This is a heartbeat, should not append"; } // common case - uint64_t prevCommitIndex = commitIndex_; + //uint64_t prevCommitIndex = commitIndex_; if (leaderCommit > commitIndex_) { commitIndex_ = 
std::min(leaderCommit, lastLogIndex_); //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Raised commitIndex_ from " diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 87070ef9c5..ab1e686988 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -51,25 +51,24 @@ class Raft : public common::ProtocolBase { ~Raft(); bool ReceiveTransaction(std::unique_ptr txn); - bool ReceivePropose(std::unique_ptr txn); + bool ReceiveAppendEntries(std::unique_ptr txn); bool ReceiveAppendEntriesResponse(std::unique_ptr response); void ReceiveRequestVote(std::unique_ptr rv); void ReceiveRequestVoteResponse(std::unique_ptr rvr); - raft::Role GetRoleSnapshot() const; void StartElection(); void SendHeartBeat(); private: + mutable std::mutex mutex_; + TermRelation TermCheckLocked(uint64_t term) const; // Must be called under mutex bool DemoteSelfLocked(uint64_t term); // Must be called under mutex uint64_t getLastLogTermLocked() const; // Must be called under mutex bool IsStop(); + bool IsDuplicateLogEntry(const std::string& hash) const; // Must be called under mutex - private: - mutable std::mutex mutex_; - // Persistent state on all servers: uint64_t currentTerm_; // Protected by mutex_ int votedFor_; // Protected by mutex_ @@ -79,6 +78,7 @@ class Raft : public common::ProtocolBase { std::vector nextIndex_; // Protected by mutex_ std::vector matchIndex_; // Protected by mutex_ uint64_t heartBeatsSentThisTerm_; // Protected by mutex_ + uint64_t lastLogIndex_; // Protected by mutex_ // Volatile state on all servers: uint64_t commitIndex_; // Protected by mutex_ @@ -89,7 +89,6 @@ class Raft : public common::ProtocolBase { std::chrono::steady_clock::time_point last_ae_time_; std::chrono::steady_clock::time_point last_heartbeat_time_; // Protected by mutex_ - uint64_t lastLogIndex_; bool is_stop_; const uint64_t quorum_; SignatureVerifier* verifier_; diff --git 
a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index c42d82e053..3e3c116929 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -64,7 +64,7 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { assert(1 == 0); return -1; } - raft_->ReceivePropose(std::move(txn)); + raft_->ReceiveAppendEntries(std::move(txn)); return 0; } else if (request->user_type() == MessageType::AppendEntriesResponseMsg) { std::unique_ptr AppendEntriesResponse = std::make_unique(); From aa03e2aaa3a921f1a78ebcd6b32d5974f5fd9599 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Fri, 5 Dec 2025 06:31:54 +0000 Subject: [PATCH 29/66] Swapped to requests instead of AppendEntries outside RPC logic --- .../ordering/common/framework/consensus.cpp | 2 + .../ordering/raft/algorithm/raft.cpp | 137 +++++++++--------- .../consensus/ordering/raft/algorithm/raft.h | 20 ++- .../ordering/raft/framework/consensus.cpp | 42 +++--- .../ordering/raft/proto/proposal.proto | 37 +++-- 5 files changed, 139 insertions(+), 99 deletions(-) diff --git a/platform/consensus/ordering/common/framework/consensus.cpp b/platform/consensus/ordering/common/framework/consensus.cpp index d375f93269..91e59b27b8 100644 --- a/platform/consensus/ordering/common/framework/consensus.cpp +++ b/platform/consensus/ordering/common/framework/consensus.cpp @@ -130,11 +130,13 @@ int Consensus::ConsensusCommit(std::unique_ptr context, if (config_.IsPerformanceRunning()) { return performance_manager_->StartEval(); } + break; case Request::TYPE_RESPONSE: if (config_.IsPerformanceRunning()) { return performance_manager_->ProcessResponseMsg(std::move(context), std::move(request)); } + break; case Request::TYPE_NEW_TXNS: { return ProcessNewTransaction(std::move(request)); } diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp 
index 46561acdf3..65a4981550 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -53,9 +53,9 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, last_ae_time_ = std::chrono::steady_clock::now(); last_heartbeat_time_ = std::chrono::steady_clock::now(); - auto sentinel = std::make_unique(); - sentinel->set_term(0); - sentinel->set_entries("COMMON_PREFIX"); + auto sentinel = std::make_unique(); + sentinel->term = 0; + sentinel->command = "COMMON_PREFIX"; log_.push_back(std::move(sentinel)); nextIndex_.assign(total_num_ + 1, lastLogIndex_ + 1); @@ -66,11 +66,12 @@ Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } -bool Raft::ReceiveTransaction(std::unique_ptr txn) { +bool Raft::ReceiveTransaction(std::unique_ptr req) { uint64_t term; uint64_t prevLogIndex; uint64_t prevLogTerm; uint64_t leaderCommit; + std::string cmd; AppendEntries ae; { std::lock_guard lk(mutex_); @@ -87,9 +88,14 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { leaderCommit = commitIndex_; // append new transaction to log - txn->set_term(term); - ae.CopyFrom(*txn); - log_.push_back(std::move(txn)); + auto entry = std::make_unique(); + entry->term = currentTerm_; + if (!req->SerializeToString(&entry->command)) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": req could not be serialized"; + return false; + } + cmd = entry->command; + log_.push_back(std::move(entry)); // TODO // durably store the new entry somehow @@ -102,10 +108,13 @@ bool Raft::ReceiveTransaction(std::unique_ptr txn) { } //LOG(INFO) << "Received Transaction to primary id: " << id_; - ae.set_create_time(GetCurrentTime()); // TODO: figure this out + ae.set_term(term); ae.set_leaderid(id_); ae.set_prevlogindex(prevLogIndex); ae.set_prevlogterm(prevLogTerm); + auto* e = ae.add_entries(); + e->set_term(term); + e->set_command(cmd); ae.set_leadercommitindex(leaderCommit); // Broadcast probably shouldnt be happening 
here. @@ -124,8 +133,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { uint64_t lastLogIndex; auto leaderCommit = ae->leadercommitindex(); auto leaderId = ae->leaderid(); - std::string hash = ae->hash(); - std::vector eToApply; + std::vector> eToApply; [&]() { std::lock_guard lk(mutex_); @@ -137,30 +145,24 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { if (tr != TermRelation::STALE && role_ == Role::FOLLOWER) { uint64_t i = ae->prevlogindex(); - if (i < static_cast(log_.size()) && ae->prevlogterm() == log_[i]->term()) { success = true; } + if (i < static_cast(log_.size()) && ae->prevlogterm() == log_[i]->term) { success = true; } } term = currentTerm_; if (!success) { return; } - - // Only append entries to log if "entries" field is non-empty. - // heartbeats contain a entry of "" (empty string), they should not be appended - - // Non heartbeat case - if (ae->entries() != "") { // TODO Implement an entry existing but with a different term // delete that entry and all after it - log_.push_back(std::move(ae)); + + for (const auto& e : ae->entries()) { + auto entry = std::make_unique(); + entry->term = e.term(); + entry->command = e.command(); + log_.push_back(std::move(entry)); lastLogIndex_++; - lastLogIndex = lastLogIndex_; //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Appended to log at index" << lastLogIndex_; - - // have to actually store the entry durably before it can be considered "appended" - } - // heartbeat case - else { - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": This is a heartbeat, should not append"; + // TODO: have to actually store the entry durably before it can be considered "appended" } - // common case + lastLogIndex = lastLogIndex_; + //uint64_t prevCommitIndex = commitIndex_; if (leaderCommit > commitIndex_) { commitIndex_ = std::min(leaderCommit, lastLogIndex_); @@ -169,11 +171,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { } // apply any newly committed entries to state machine - while (commitIndex_ > 
lastApplied_) { - lastApplied_++; - eToApply.push_back(log_[lastApplied_].get()); - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " << lastApplied_; - } + eToApply = PrepareCommitLocked(); }(); //auto now = std::chrono::steady_clock::now(); @@ -191,7 +189,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { if (tr != TermRelation::STALE) { leader_election_manager_->OnHeartBeat(); } - for (auto e : eToApply) { + for (auto& e : eToApply) { commit_(*e); } @@ -222,7 +220,7 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a bool resending = false; TermRelation tr; Role initialRole; - std::vector eToApply; + std::vector> eToApply; AppendEntries resend; [&]() { @@ -247,18 +245,14 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a uint64_t lastReplicatedIndex = sorted[quorum_ - 1]; // Need to check the lastReplicatedIndex contains entry from current term - if (lastReplicatedIndex > commitIndex_ && log_[lastReplicatedIndex]->term() == currentTerm_) { + if (lastReplicatedIndex > commitIndex_ && log_[lastReplicatedIndex]->term == currentTerm_) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Raised commitIndex_ from " << commitIndex_ << " to " << lastReplicatedIndex; commitIndex_ = lastReplicatedIndex; } // apply any newly committed entries to state machine - while (commitIndex_ > lastApplied_) { - lastApplied_++; - eToApply.push_back(log_[lastApplied_].get()); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " << lastApplied_; - } + eToApply = PrepareCommitLocked(); } // ===================== FAILURE CASE ===================== else { @@ -281,19 +275,16 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a << lastLogIndex_; return; } - - // Build a new AppendEntries message based on the stored log entry - resend.CopyFrom(*log_[resendIndex]); // copies hash, entries, uid, proxy_id, etc. 
- // Make sure RAFT fields are consistent with our current state - resend.set_term(currentTerm_); // TODO LEAKY ABSTRACTION ON APPENDENTRY ABUSE - resend.set_leaderid(id_); - // prevLogIndex = index immediately before resendIndex uint64_t prevIdx = resendIndex - 1; + uint64_t prevTerm = log_[prevIdx]->term; + const LogEntry& resendEntry = *log_[resendIndex]; + resend.set_term(currentTerm_); + resend.set_leaderid(id_); resend.set_prevlogindex(prevIdx); - // prevLogTerm = term of the entry at prevIdx (or 0 if none) - uint64_t prevTerm = log_[prevIdx]->term(); resend.set_prevlogterm(prevTerm); - // leaderCommitIndex + auto* e = resend.add_entries(); + e->set_term(resendEntry.term); + e->set_command(resendEntry.command); resend.set_leadercommitindex(commitIndex_); resending = true; @@ -311,7 +302,7 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a } if (resending) { SendMessage(MessageType::AppendEntriesMsg, resend, aer->id()); } - for (auto e : eToApply) { + for (auto& e : eToApply) { commit_(*e); } return true; @@ -460,12 +451,12 @@ void Raft::StartElection() { LOG(INFO) << __FUNCTION__ << ": FOLLOWER->CANDIDATE in term " << currentTerm; } - RequestVote requestVote; - requestVote.set_term(currentTerm); - requestVote.set_candidateid(candidateId); - requestVote.set_lastlogindex(lastLogIndex); - requestVote.set_lastlogterm(lastLogTerm); - Broadcast(MessageType::RequestVoteMsg, requestVote); + RequestVote rv; + rv.set_term(currentTerm); + rv.set_candidateid(candidateId); + rv.set_lastlogindex(lastLogIndex); + rv.set_lastlogterm(lastLogTerm); + Broadcast(MessageType::RequestVoteMsg, rv); } void Raft::SendHeartBeat() { @@ -499,14 +490,13 @@ void Raft::SendHeartBeat() { } //auto ms = std::chrono::duration_cast(delta).count(); //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat sent after " << ms << "ms"; - AppendEntries appendEntries; - appendEntries.set_term(currentTerm); - appendEntries.set_leaderid(leaderId); - 
appendEntries.set_prevlogindex(prevLogIndex); // TODO - appendEntries.set_prevlogterm(prevLogTerm); - appendEntries.set_entries(entries); - appendEntries.set_leadercommitindex(leaderCommit); - Broadcast(MessageType::AppendEntriesMsg, appendEntries); + AppendEntries ae; + ae.set_term(currentTerm); + ae.set_leaderid(leaderId); + ae.set_prevlogindex(prevLogIndex); // TODO + ae.set_prevlogterm(prevLogTerm); + ae.set_leadercommitindex(leaderCommit); + Broadcast(MessageType::AppendEntriesMsg, ae); //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat " << heartBeatNum << " for term " << currentTerm; @@ -545,8 +535,25 @@ TermRelation Raft::TermCheckLocked(uint64_t term) const { // requires raft mutex to be held uint64_t Raft::getLastLogTermLocked() const { - return log_[lastLogIndex_]->term(); + return log_[lastLogIndex_]->term; +} + +std::vector> Raft::PrepareCommitLocked() { + std::vector> v; + while (commitIndex_ > lastApplied_) { + lastApplied_++; + auto command = std::make_unique(); + if (!command->ParseFromString(log_[lastApplied_]->command)) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Failed to parse command"; + continue; + } + v.push_back(std::move(command)); + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " << lastApplied_; + } + return v; } + + } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index ab1e686988..e3e33892bb 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -31,6 +31,7 @@ #include "platform/common/queue/lock_free_queue.h" #include "platform/consensus/ordering/common/algorithm/protocol_base.h" #include "platform/consensus/ordering/raft/proto/proposal.pb.h" +#include "platform/proto/resdb.pb.h" #include "platform/statistic/stats.h" #include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" #include 
"platform/networkstrate/replica_communicator.h" @@ -41,6 +42,11 @@ namespace raft { enum class Role { FOLLOWER, CANDIDATE, LEADER }; enum class TermRelation { STALE, CURRENT, NEW }; +struct LogEntry { + uint64_t term; + std::string command; +}; + class Raft : public common::ProtocolBase { public: Raft(int id, int f, int total_num, @@ -50,9 +56,9 @@ class Raft : public common::ProtocolBase { ); ~Raft(); - bool ReceiveTransaction(std::unique_ptr txn); - bool ReceiveAppendEntries(std::unique_ptr txn); - bool ReceiveAppendEntriesResponse(std::unique_ptr response); + bool ReceiveTransaction(std::unique_ptr req); + bool ReceiveAppendEntries(std::unique_ptr ae); + bool ReceiveAppendEntriesResponse(std::unique_ptr aer); void ReceiveRequestVote(std::unique_ptr rv); void ReceiveRequestVoteResponse(std::unique_ptr rvr); @@ -68,11 +74,12 @@ class Raft : public common::ProtocolBase { uint64_t getLastLogTermLocked() const; // Must be called under mutex bool IsStop(); bool IsDuplicateLogEntry(const std::string& hash) const; // Must be called under mutex + std::vector> PrepareCommitLocked(); // Persistent state on all servers: uint64_t currentTerm_; // Protected by mutex_ int votedFor_; // Protected by mutex_ - std::vector> log_; // Protected by mutex_ + std::vector> log_; // Protected by mutex_ // Volatile state on leaders: std::vector nextIndex_; // Protected by mutex_ @@ -91,6 +98,11 @@ class Raft : public common::ProtocolBase { bool is_stop_; const uint64_t quorum_; + + // for limiting AppendEntries batch sizing + static constexpr size_t maxBytes = 64 * 1024; + static constexpr size_t maxEntries = 16; + SignatureVerifier* verifier_; LeaderElectionManager* leader_election_manager_; Stats* global_stats_; diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 3e3c116929..6dfd3de0b1 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ 
b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -24,6 +24,7 @@ #include "common/utils/utils.h" #include "platform/consensus/ordering/raft/proto/proposal.pb.h" +#include "platform/proto/resdb.pb.h" namespace resdb { namespace raft { @@ -109,27 +110,34 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { } return 0; } - +/* +message BatchUserRequest { + message UserRequest { + Request request = 1; + SignatureInfo signature = 2; + int32 id = 3; + }; + repeated UserRequest user_requests = 1; + uint64 createtime = 2; + uint64 local_id = 3; + uint64 seq = 4; + Certs committed_certs= 5; + bytes hash = 6; + int32 proxy_id = 7; +} +*/ int Consensus::ProcessNewTransaction(std::unique_ptr request) { - std::unique_ptr txn = std::make_unique(); - txn->set_entries(request->data()); - txn->set_hash(request->hash()); - txn->set_proxy_id(request->proxy_id()); - txn->set_uid(request->uid()); - return raft_->ReceiveTransaction(std::move(txn)); + return raft_->ReceiveTransaction(std::move(request)); } int Consensus::CommitMsg(const google::protobuf::Message& msg) { - return CommitMsgInternal(dynamic_cast(msg)); -} - -int Consensus::CommitMsgInternal(const AppendEntries& txn) { - std::unique_ptr request = std::make_unique(); - request->set_data(txn.entries()); - request->set_seq(txn.prevlogindex()); - request->set_uid(txn.uid()); - request->set_proxy_id(txn.proxy_id()); - transaction_executor_->Commit(std::move(request)); + auto* req = dynamic_cast(&msg); + if (!req) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Failed to cast Message to Request"; + return -1; + } + auto execReq = std::make_unique(*req); + transaction_executor_->Commit(std::move(execReq)); return 0; } diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index 5702aaf167..fd88bca361 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -21,24 
+21,35 @@ syntax = "proto3"; package resdb.raft; +message Transaction{ + int32 id = 1; + bytes data = 2; + bytes hash = 3; + int32 proxy_id = 4; + int32 proposer = 5; + int64 uid = 6; + int64 create_time = 7; + int64 seq = 9; +} + +message Entry { + uint64 term = 1; + bytes command = 2; +} + message AppendEntries{ - bytes entries = 1; // this can maybe work as entries but maybe not? - bytes hash = 2; - int32 proxy_id = 3; - int32 leaderId = 4; - int64 uid = 5; - int64 create_time = 6; - uint64 prevLogIndex = 7; - uint64 prevLogTerm = 8; // term of the most recent log (term corresponding to seq) - uint64 leaderCommitIndex = 9; // leader sends out highest seq that is committed - uint64 term = 10; // This should be a term for each entry, but assuming no failure at first + uint64 term = 1; + int32 leaderId = 2; + uint64 prevLogIndex = 3; + uint64 prevLogTerm = 4; + repeated Entry entries = 5; + uint64 leaderCommitIndex = 6; } message AppendEntriesResponse { uint64 term = 1; bool success = 2; int32 id = 3; - uint64 lastApplied = 4; uint64 lastLogIndex = 5; } @@ -51,8 +62,8 @@ message RequestVote { message RequestVoteResponse { uint64 term = 1; - int32 voterId = 2; - bool voteGranted = 3; + bool voteGranted = 2; + int32 voterId = 3; } message DirectToLeader { From ddd5e975f7e3b9bc7f3d0c723b7f9816043c02b1 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Fri, 5 Dec 2025 09:29:27 +0000 Subject: [PATCH 30/66] implemented LogEntry struct instead of using AppendEntries for everything --- executor/kv/kv_executor.cpp | 3 +- .../ordering/common/framework/consensus.cpp | 2 -- .../common/framework/performance_manager.cpp | 7 +++-- .../ordering/raft/algorithm/raft.cpp | 29 +++++++------------ .../ordering/raft/proto/proposal.proto | 11 ------- raft_performance.sh | 8 +++++ scripts/deploy/config/raft.config | 4 +-- scripts/deploy/performance/run_performance.sh | 2 +- 8 files changed, 28 insertions(+), 38 deletions(-) create mode 100755 raft_performance.sh diff --git 
a/executor/kv/kv_executor.cpp b/executor/kv/kv_executor.cpp index c587f592ce..fa5ff48894 100644 --- a/executor/kv/kv_executor.cpp +++ b/executor/kv/kv_executor.cpp @@ -135,7 +135,8 @@ std::unique_ptr KVExecutor::ExecuteData( } void KVExecutor::Set(const std::string& key, const std::string& value) { - LOG(ERROR)<<" set key:"<SetValue(key, value); } diff --git a/platform/consensus/ordering/common/framework/consensus.cpp b/platform/consensus/ordering/common/framework/consensus.cpp index 91e59b27b8..d375f93269 100644 --- a/platform/consensus/ordering/common/framework/consensus.cpp +++ b/platform/consensus/ordering/common/framework/consensus.cpp @@ -130,13 +130,11 @@ int Consensus::ConsensusCommit(std::unique_ptr context, if (config_.IsPerformanceRunning()) { return performance_manager_->StartEval(); } - break; case Request::TYPE_RESPONSE: if (config_.IsPerformanceRunning()) { return performance_manager_->ProcessResponseMsg(std::move(context), std::move(request)); } - break; case Request::TYPE_NEW_TXNS: { return ProcessNewTransaction(std::move(request)); } diff --git a/platform/consensus/ordering/common/framework/performance_manager.cpp b/platform/consensus/ordering/common/framework/performance_manager.cpp index 160c18fc3a..605a49a566 100644 --- a/platform/consensus/ordering/common/framework/performance_manager.cpp +++ b/platform/consensus/ordering/common/framework/performance_manager.cpp @@ -182,9 +182,10 @@ void PerformanceManager::SendResponseToClient( uint64_t create_time = batch_response.createtime(); if (create_time > 0) { uint64_t run_time = GetCurrentTime() - create_time; - LOG(ERROR) << "receive current:" << GetCurrentTime() - << " create time:" << create_time << " run time:" << run_time - << " local id:" << batch_response.local_id(); + // JIM + //LOG(ERROR) << "receive current:" << GetCurrentTime() + // << " create time:" << create_time << " run time:" << run_time + // << " local id:" << batch_response.local_id(); global_stats_->AddLatency(run_time); } 
send_num_--; diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 65a4981550..989943c672 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -87,6 +87,9 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { prevLogTerm = getLastLogTermLocked(); leaderCommit = commitIndex_; + // assign seq number as log index for the request or executing transactions fails. + req->set_seq(lastLogIndex_ + 1); + // append new transaction to log auto entry = std::make_unique(); entry->term = currentTerm_; @@ -193,16 +196,6 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { commit_(*e); } - /* - LOG(INFO) << "AppendEntriesMsg Added to Log"; - LOG(INFO) << "leaderCommit: " << leaderCommit; - LOG(INFO) << "commitIndex_: " << commitIndex_; - LOG(INFO) << "lastApplied_: " << lastApplied_; - LOG(INFO) << "static_cast(log_.size()): " << static_cast(log_.size()); - LOG(INFO) << "leaderCommit > commitIndex_: " << (leaderCommit > commitIndex_ ? "true" : "false"); - LOG(INFO) << "lastApplied_ + 1 <= static_cast(log_.size()) " << ((lastApplied_ + 1 <= static_cast(log_.size())) ? 
"true" : "false"); - */ - AppendEntriesResponse aer; aer.set_term(term); aer.set_success(success); @@ -246,8 +239,8 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a // Need to check the lastReplicatedIndex contains entry from current term if (lastReplicatedIndex > commitIndex_ && log_[lastReplicatedIndex]->term == currentTerm_) { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Raised commitIndex_ from " - << commitIndex_ << " to " << lastReplicatedIndex; + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Raised commitIndex_ from " + // << commitIndex_ << " to " << lastReplicatedIndex; commitIndex_ = lastReplicatedIndex; } @@ -265,8 +258,8 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a } uint64_t resendIndex = nextIndex_[aer->id()]; - LOG(INFO) << "Updated nextIndex_ for follower " << aer->id() - << " to " << resendIndex; + //LOG(INFO) << "Updated nextIndex_ for follower " << aer->id() + // << " to " << resendIndex; // Check that we actually have an entry at this index if (resendIndex == 0 || resendIndex > lastLogIndex_) { @@ -288,10 +281,10 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a resend.set_leadercommitindex(commitIndex_); resending = true; - LOG(INFO) << "Resending AppendEntries for index " << resendIndex - << " (prevIdx=" << prevIdx - << ", prevTerm=" << prevTerm - << ") to follower " << aer->id(); + // LOG(INFO) << "Resending AppendEntries for index " << resendIndex + // << " (prevIdx=" << prevIdx + // << ", prevTerm=" << prevTerm + // << ") to follower " << aer->id(); } }(); if (demoted) { diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index fd88bca361..1f69aabe9a 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -21,17 +21,6 @@ syntax = "proto3"; package resdb.raft; -message Transaction{ - int32 id = 1; - bytes data = 2; - bytes hash = 3; - int32 proxy_id = 4; - 
int32 proposer = 5; - int64 uid = 6; - int64 create_time = 7; - int64 seq = 9; -} - message Entry { uint64 term = 1; bytes command = 2; diff --git a/raft_performance.sh b/raft_performance.sh new file mode 100755 index 0000000000..c688e75e3d --- /dev/null +++ b/raft_performance.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Move into the deploy directory +cd ~/incubator-resilientdb/scripts/deploy + +# Run the performance script with the config file +./performance_local/raft_performance.sh config/kv_performance_server_local.conf >out_raft.txt 2>&1 diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index 888f4ba813..68732f59bf 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -1,9 +1,9 @@ { - "clientBatchNum": 1, + "clientBatchNum": 200, "enable_viewchange": true, "recovery_enabled": false, "max_client_complaint_num":10, - "max_process_txn": 256, + "max_process_txn": 100000, "worker_num": 1, "input_worker_num": 1, "output_worker_num": 10 diff --git a/scripts/deploy/performance/run_performance.sh b/scripts/deploy/performance/run_performance.sh index 7ef2b8a798..ca3f49a132 100755 --- a/scripts/deploy/performance/run_performance.sh +++ b/scripts/deploy/performance/run_performance.sh @@ -54,6 +54,6 @@ done python3 performance/calculate_result.py `ls result_*_log` > results.log -rm -rf result_*_log +#rm -rf result_*_log echo "save result to results.log" cat results.log From 5278d19834895798e651925a7b8e3817d05546f0 Mon Sep 17 00:00:00 2001 From: nachiket Date: Fri, 5 Dec 2025 15:28:33 -0800 Subject: [PATCH 31/66] Create and send individual AppendEntries to each follower from ReceiveTransaction --- .../ordering/raft/algorithm/raft.cpp | 31 ++++++++++++------- .../consensus/ordering/raft/algorithm/raft.h | 2 ++ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 
989943c672..d1bd5bf1e4 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -66,13 +66,28 @@ Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } +void Raft::CreateAndSendAppendEntries(std::vector nextIndexCopy, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, std::string cmd) { + for (int i = 1; i <= total_num_; ++i) { + AppendEntries ae; + ae.set_term(term); + ae.set_leaderid(id_); + ae.set_prevlogindex(nextIndexCopy[i] - 1); + ae.set_prevlogterm(prevLogTerm); + auto* e = ae.add_entries(); + e->set_term(term); + e->set_command(cmd); + ae.set_leadercommitindex(leaderCommit); + SendMessage(MessageType::AppendEntriesMsg, ae, i); + } +} + bool Raft::ReceiveTransaction(std::unique_ptr req) { uint64_t term; uint64_t prevLogIndex; uint64_t prevLogTerm; uint64_t leaderCommit; std::string cmd; - AppendEntries ae; + std::vector nextIndexCopy = nextIndex_; { std::lock_guard lk(mutex_); if (role_ != Role::LEADER) { @@ -108,21 +123,14 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { lastLogIndex_++; nextIndex_[id_] = lastLogIndex_ + 1; matchIndex_[id_] = lastLogIndex_; + nextIndexCopy[id_] = nextIndex_[id_]; } //LOG(INFO) << "Received Transaction to primary id: " << id_; - ae.set_term(term); - ae.set_leaderid(id_); - ae.set_prevlogindex(prevLogIndex); - ae.set_prevlogterm(prevLogTerm); - auto* e = ae.add_entries(); - e->set_term(term); - e->set_command(cmd); - ae.set_leadercommitindex(leaderCommit); - // Broadcast probably shouldnt be happening here. 
// Ideally a loop is sending AEs to followers based on index feedback - Broadcast(MessageType::AppendEntriesMsg, ae); + // Broadcast(MessageType::AppendEntriesMsg, ae); + CreateAndSendAppendEntries(nextIndexCopy, term, prevLogTerm, leaderCommit, cmd); return true; } @@ -256,7 +264,6 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a } else { nextIndex_[aer->id()] = 1; } - uint64_t resendIndex = nextIndex_[aer->id()]; //LOG(INFO) << "Updated nextIndex_ for follower " << aer->id() // << " to " << resendIndex; diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index e3e33892bb..90160fd0bb 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -65,6 +65,8 @@ class Raft : public common::ProtocolBase { raft::Role GetRoleSnapshot() const; void StartElection(); void SendHeartBeat(); + //nextIndexCopy is a copy of nextIndex_ to prevent updating it outside a mutex lock + void CreateAndSendAppendEntries(std::vector nextIndexCopy, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, std::string cmd); private: mutable std::mutex mutex_; From 0f9843d2196176b706b8e9f740786e07ed629ca2 Mon Sep 17 00:00:00 2001 From: nachiket Date: Sun, 7 Dec 2025 08:38:03 -0800 Subject: [PATCH 32/66] Setting nextIndex to lastlogindex() - 1 in ReceiveAEResponse fail case --- platform/consensus/ordering/raft/algorithm/raft.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index d1bd5bf1e4..a1140af681 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -223,7 +223,7 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a Role initialRole; std::vector> eToApply; AppendEntries resend; - + std::vector nextIndexCopy = nextIndex_; [&]() { 
std::lock_guard lk(mutex_); initialRole = role_; @@ -259,9 +259,8 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a else { LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << aer->id(); // Move nextIndex one step back, but don't go below 1 - if (nextIndex_[aer->id()] > 1) { - --nextIndex_[aer->id()]; - } else { + nextIndex_[aer->id()] = aer->lastlogindex() - 1; + if (nextIndex_[aer->id()] < 1) { nextIndex_[aer->id()] = 1; } uint64_t resendIndex = nextIndex_[aer->id()]; From c5c0ac6a1a2e77c6a2a5a992a5b0e013ef58cd88 Mon Sep 17 00:00:00 2001 From: nachiket Date: Sun, 7 Dec 2025 09:04:26 -0800 Subject: [PATCH 33/66] Modified CreateAndSendAppendEntries to CreateAndSendAppendEntryMsg and used it in ReceiveAppendEntriesResponse --- .../ordering/raft/algorithm/raft.cpp | 55 +++++++++++-------- .../consensus/ordering/raft/algorithm/raft.h | 3 +- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index a1140af681..6f5996967a 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -66,19 +66,18 @@ Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } -void Raft::CreateAndSendAppendEntries(std::vector nextIndexCopy, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, std::string cmd) { - for (int i = 1; i <= total_num_; ++i) { - AppendEntries ae; - ae.set_term(term); - ae.set_leaderid(id_); - ae.set_prevlogindex(nextIndexCopy[i] - 1); - ae.set_prevlogterm(prevLogTerm); - auto* e = ae.add_entries(); - e->set_term(term); - e->set_command(cmd); - ae.set_leadercommitindex(leaderCommit); - SendMessage(MessageType::AppendEntriesMsg, ae, i); - } +void Raft::CreateAndSendAppendEntryMsg(uint64_t replica_id, std::vector nextIndexCopy, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, + std::string cmd, uint64_t entry_term) { + 
AppendEntries ae; + ae.set_term(term); + ae.set_leaderid(id_); + ae.set_prevlogindex(nextIndexCopy[i] - 1); + ae.set_prevlogterm(prevLogTerm); + auto* e = ae.add_entries(); + e->set_term(entry_term); + e->set_command(cmd); + ae.set_leadercommitindex(leaderCommit); + SendMessage(MessageType::AppendEntriesMsg, ae, replica_id); } bool Raft::ReceiveTransaction(std::unique_ptr req) { @@ -130,7 +129,9 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { // Broadcast probably shouldnt be happening here. // Ideally a loop is sending AEs to followers based on index feedback // Broadcast(MessageType::AppendEntriesMsg, ae); - CreateAndSendAppendEntries(nextIndexCopy, term, prevLogTerm, leaderCommit, cmd); + for (int i = 1; i <= total_num_; ++i) { + CreateAndSendAppendEntryMsg(i, nextIndexCopy, term, prevLogTerm, leaderCommit, cmd, term); + } return true; } @@ -217,6 +218,11 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr aer) { uint64_t term; + uint64_t prevLogIndex; + uint64_t prevLogTerm; + uint64_t leaderCommit; + uint64_t resendEntryTerm; + std::string cmd; bool demoted = false; bool resending = false; TermRelation tr; @@ -275,16 +281,14 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a return; } uint64_t prevIdx = resendIndex - 1; - uint64_t prevTerm = log_[prevIdx]->term; const LogEntry& resendEntry = *log_[resendIndex]; - resend.set_term(currentTerm_); - resend.set_leaderid(id_); - resend.set_prevlogindex(prevIdx); - resend.set_prevlogterm(prevTerm); - auto* e = resend.add_entries(); - e->set_term(resendEntry.term); - e->set_command(resendEntry.command); - resend.set_leadercommitindex(commitIndex_); + // prepare fields for appendEntries message + term = currentTerm_; + prevLogIndex = lastLogIndex_; + prevLogTerm = log_[prevIdx]->term; + leaderCommit = commitIndex_; + cmd = resendEntry.command + resendEntryTerm = resendEntry.term resending = true; // LOG(INFO) << "Resending AppendEntries for 
index " << resendIndex @@ -299,7 +303,10 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a << (initialRole == Role::LEADER ? "LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; return false; } - if (resending) { SendMessage(MessageType::AppendEntriesMsg, resend, aer->id()); } + if (resending) { + CreateAndSendAppendEntryMsg(aer->id(), nextIndexCopy, term, prevLogTerm, leaderCommit, cmd, resendEntryTerm); + // SendMessage(MessageType::AppendEntriesMsg, resend, aer->id()); + } for (auto& e : eToApply) { commit_(*e); diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 90160fd0bb..6f3a17fcc3 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -66,7 +66,8 @@ class Raft : public common::ProtocolBase { void StartElection(); void SendHeartBeat(); //nextIndexCopy is a copy of nextIndex_ to prevent updating it outside a mutex lock - void CreateAndSendAppendEntries(std::vector nextIndexCopy, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, std::string cmd); + void CreateAndSendAppendEntryMsg(uint64_t replica_id, std::vector nextIndexCopy, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, + std::string cmd, uint64_t entry_term); private: mutable std::mutex mutex_; From 1e9e1f2934ce614d872a1bda2bcdf29541e051a6 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Sun, 7 Dec 2025 11:52:56 -0800 Subject: [PATCH 34/66] Fix issues from the previous commit --- platform/consensus/ordering/raft/algorithm/raft.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 6f5996967a..9b7db5e055 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -71,7 +71,7 @@ void Raft::CreateAndSendAppendEntryMsg(uint64_t 
replica_id, std::vectorset_term(entry_term); @@ -126,11 +126,8 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { } //LOG(INFO) << "Received Transaction to primary id: " << id_; - // Broadcast probably shouldnt be happening here. - // Ideally a loop is sending AEs to followers based on index feedback - // Broadcast(MessageType::AppendEntriesMsg, ae); - for (int i = 1; i <= total_num_; ++i) { - CreateAndSendAppendEntryMsg(i, nextIndexCopy, term, prevLogTerm, leaderCommit, cmd, term); + for (int replica_id = 1; replica_id <= total_num_; ++replica_id) { + if (replica_id != id_) {CreateAndSendAppendEntryMsg(replica_id, nextIndexCopy, term, prevLogTerm, leaderCommit, cmd, term);} } return true; } @@ -287,8 +284,8 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a prevLogIndex = lastLogIndex_; prevLogTerm = log_[prevIdx]->term; leaderCommit = commitIndex_; - cmd = resendEntry.command - resendEntryTerm = resendEntry.term + cmd = resendEntry.command; + resendEntryTerm = resendEntry.term; resending = true; // LOG(INFO) << "Resending AppendEntries for index " << resendIndex From 1a16d10cf0a10bb437addfdc0ef9ca8195d56aa5 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Sun, 7 Dec 2025 17:04:38 -0800 Subject: [PATCH 35/66] WIP catch up transactions to leader's most current log entry --- .../ordering/raft/algorithm/raft.cpp | 34 +++++++++++++------ .../consensus/ordering/raft/algorithm/raft.h | 2 +- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 9b7db5e055..04a17cd53e 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -66,12 +66,12 @@ Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } -void Raft::CreateAndSendAppendEntryMsg(uint64_t replica_id, std::vector nextIndexCopy, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, +void 
Raft::CreateAndSendAppendEntryMsg(uint64_t replica_id, uint64_t nextIndex, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, std::string cmd, uint64_t entry_term) { AppendEntries ae; ae.set_term(term); ae.set_leaderid(id_); - ae.set_prevlogindex(nextIndexCopy[replica_id] - 1); + ae.set_prevlogindex(nextIndex); ae.set_prevlogterm(prevLogTerm); auto* e = ae.add_entries(); e->set_term(entry_term); @@ -127,7 +127,8 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { //LOG(INFO) << "Received Transaction to primary id: " << id_; for (int replica_id = 1; replica_id <= total_num_; ++replica_id) { - if (replica_id != id_) {CreateAndSendAppendEntryMsg(replica_id, nextIndexCopy, term, prevLogTerm, leaderCommit, cmd, term);} + auto prevLogIndexForReplica = nextIndexCopy[replica_id] - 1; + if (replica_id != id_) {CreateAndSendAppendEntryMsg(replica_id, prevLogIndexForReplica, term, prevLogTerm, leaderCommit, cmd, term);} } return true; } @@ -227,6 +228,8 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a std::vector> eToApply; AppendEntries resend; std::vector nextIndexCopy = nextIndex_; + std::vector sendList; + std::vector sendListTerm; [&]() { std::lock_guard lk(mutex_); initialRole = role_; @@ -278,16 +281,24 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a return; } uint64_t prevIdx = resendIndex - 1; - const LogEntry& resendEntry = *log_[resendIndex]; // prepare fields for appendEntries message term = currentTerm_; - prevLogIndex = lastLogIndex_; prevLogTerm = log_[prevIdx]->term; leaderCommit = commitIndex_; - cmd = resendEntry.command; - resendEntryTerm = resendEntry.term; resending = true; + while (resendIndex <= lastLogIndex_) { + LOG(INFO) << "Resending AppendEntries for index " << resendIndex << " to " << aer->id(); + const LogEntry& resendEntry = *log_[resendIndex]; + cmd = resendEntry.command; + resendEntryTerm = resendEntry.term; + prevLogTerm = log_[prevIdx]->term; + sendList.push_back(resendEntry); + 
sendListTerm.push_back(prevLogTerm); + resendIndex++; + prevIdx++; + } + // LOG(INFO) << "Resending AppendEntries for index " << resendIndex // << " (prevIdx=" << prevIdx // << ", prevTerm=" << prevTerm @@ -300,9 +311,12 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a << (initialRole == Role::LEADER ? "LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; return false; } - if (resending) { - CreateAndSendAppendEntryMsg(aer->id(), nextIndexCopy, term, prevLogTerm, leaderCommit, cmd, resendEntryTerm); - // SendMessage(MessageType::AppendEntriesMsg, resend, aer->id()); + if (resending) { + auto prevLogIndexForReplica = nextIndexCopy[aer->id()] - 1; + for (int i = 0; i < sendList.size(); i++) { + LOG(INFO) << "Sending Command: " << sendList[i].command << " Term: " << sendList[i].term; + CreateAndSendAppendEntryMsg(aer->id(), prevLogIndexForReplica + i, term, sendListTerm[i], leaderCommit, sendList[i].command, sendList[i].term); + } } for (auto& e : eToApply) { diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 6f3a17fcc3..d6f79c2f26 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -66,7 +66,7 @@ class Raft : public common::ProtocolBase { void StartElection(); void SendHeartBeat(); //nextIndexCopy is a copy of nextIndex_ to prevent updating it outside a mutex lock - void CreateAndSendAppendEntryMsg(uint64_t replica_id, std::vector nextIndexCopy, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, + void CreateAndSendAppendEntryMsg(uint64_t replica_id, uint64_t nextIndex, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, std::string cmd, uint64_t entry_term); private: From 5aca792f06b2b247838bd9e1090a9d644ca19382 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Mon, 8 Dec 2025 05:36:33 +0000 Subject: [PATCH 36/66] staging for benchmarks --- .../ordering/common/framework/performance_manager.cpp | 6 
+++--- platform/consensus/ordering/raft/algorithm/raft.cpp | 4 +++- scripts/deploy/config/kv_performance_server_local.conf | 1 + scripts/deploy/config/pbft.config | 10 +++++----- scripts/deploy/config/raft.config | 4 ++-- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/platform/consensus/ordering/common/framework/performance_manager.cpp b/platform/consensus/ordering/common/framework/performance_manager.cpp index 605a49a566..e20f3a9e23 100644 --- a/platform/consensus/ordering/common/framework/performance_manager.cpp +++ b/platform/consensus/ordering/common/framework/performance_manager.cpp @@ -193,9 +193,9 @@ void PerformanceManager::SendResponseToClient( // =================== request ======================== int PerformanceManager::BatchProposeMsg() { - LOG(WARNING) << "batch wait time:" << config_.ClientBatchWaitTimeMS() - << " batch num:" << config_.ClientBatchNum() - << " max txn:" << config_.GetMaxProcessTxn(); + //LOG(WARNING) << "batch wait time:" << config_.ClientBatchWaitTimeMS() + // << " batch num:" << config_.ClientBatchNum() + // << " max txn:" << config_.GetMaxProcessTxn(); std::vector> batch_req; eval_ready_future_.get(); bool start = false; diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 9b7db5e055..6b5faf1a7d 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -102,7 +102,7 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { leaderCommit = commitIndex_; // assign seq number as log index for the request or executing transactions fails. 
- req->set_seq(lastLogIndex_ + 1); + //req->set_seq(lastLogIndex_ + 1); // append new transaction to log auto entry = std::make_unique(); @@ -550,6 +550,8 @@ std::vector> Raft::PrepareCommitLocked() { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Failed to parse command"; continue; } + // assign seq number as log index for the request or executing transactions fails. + command->set_seq(lastApplied_); v.push_back(std::move(command)); //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " << lastApplied_; } diff --git a/scripts/deploy/config/kv_performance_server_local.conf b/scripts/deploy/config/kv_performance_server_local.conf index fb1e910a85..16dc042f28 100644 --- a/scripts/deploy/config/kv_performance_server_local.conf +++ b/scripts/deploy/config/kv_performance_server_local.conf @@ -22,6 +22,7 @@ iplist=( 127.0.0.1 127.0.0.1 127.0.0.1 +127.0.0.1 ) client_num=1 diff --git a/scripts/deploy/config/pbft.config b/scripts/deploy/config/pbft.config index e4fe496036..f70ffd488c 100644 --- a/scripts/deploy/config/pbft.config +++ b/scripts/deploy/config/pbft.config @@ -18,12 +18,12 @@ # { - "clientBatchNum": 100, + "clientBatchNum": 30, "enable_viewchange": true, - "recovery_enabled": true, + "recovery_enabled": false, "max_client_complaint_num":10, - "max_process_txn": 2048, - "worker_num": 2, + "max_process_txn": 512, + "worker_num": 1, "input_worker_num": 1, "output_worker_num": 10 -} +} \ No newline at end of file diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index 68732f59bf..33c52621e8 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -1,9 +1,9 @@ { - "clientBatchNum": 200, + "clientBatchNum": 30, "enable_viewchange": true, "recovery_enabled": false, "max_client_complaint_num":10, - "max_process_txn": 100000, + "max_process_txn": 512, "worker_num": 1, "input_worker_num": 1, "output_worker_num": 10 From 1a2bbc114f3f14f523d67ff7c0ec2cf7e9753d7b Mon Sep 17 00:00:00 2001 From: 
Jim Brower Date: Mon, 15 Dec 2025 19:36:12 +0000 Subject: [PATCH 37/66] Added support for multiple entries in AppendEntries RPC Heartbeat is skipped if leader recently broadcasted AppendEntries Added logic for followers to truncate their log when leader sends conflicting entries --- .../ordering/common/framework/consensus.cpp | 2 - .../raft/algorithm/leaderelection_manager.cpp | 51 ++- .../raft/algorithm/leaderelection_manager.h | 5 +- .../ordering/raft/algorithm/raft.cpp | 380 ++++++++++-------- .../consensus/ordering/raft/algorithm/raft.h | 34 +- .../ordering/raft/framework/consensus.cpp | 2 - .../ordering/raft/framework/consensus.h | 4 - .../networkstrate/replica_communicator.cpp | 19 +- scripts/deploy/config/raft.config | 5 +- 9 files changed, 297 insertions(+), 205 deletions(-) diff --git a/platform/consensus/ordering/common/framework/consensus.cpp b/platform/consensus/ordering/common/framework/consensus.cpp index d375f93269..568a00ef06 100644 --- a/platform/consensus/ordering/common/framework/consensus.cpp +++ b/platform/consensus/ordering/common/framework/consensus.cpp @@ -96,7 +96,6 @@ int Consensus::Broadcast(int type, const google::protobuf::Message& msg) { Request request; msg.SerializeToString(request.mutable_data()); request.set_type(Request::TYPE_CUSTOM_CONSENSUS); - //LOG(ERROR) << "Sending custom consensus Broadcast"; request.set_user_type(type); request.set_sender_id(config_.GetSelfInfo().id()); @@ -109,7 +108,6 @@ int Consensus::SendMsg(int type, const google::protobuf::Message& msg, Request request; msg.SerializeToString(request.mutable_data()); request.set_type(Request::TYPE_CUSTOM_CONSENSUS); - //LOG(ERROR) << "Sending custom consensus message"; request.set_user_type(type); request.set_sender_id(config_.GetSelfInfo().id()); replica_communicator_->SendMessage(request, node_id); diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp index 
eedbb6e738..923a6a02bb 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp @@ -42,6 +42,7 @@ LeaderElectionManager::LeaderElectionManager(const ResDBConfig& config) heartbeat_timer_(100), heartbeat_count_(0), //last_heartbeat_time_(std::chrono::steady_clock::now()), + broadcast_count_(0), role_epoch_(0), known_role_epoch_(0) { global_stats_ = Stats::GetGlobalStats(); @@ -100,13 +101,22 @@ void LeaderElectionManager::OnHeartBeat() { void LeaderElectionManager::OnRoleChange() { { - //LOG(INFO) << "JIM -> " << __FUNCTION__; + LOG(INFO) << "JIM -> " << __FUNCTION__; std::lock_guard lk(cv_mutex_); role_epoch_++; } cv_.notify_all(); } +void LeaderElectionManager::OnAeBroadcast() { + { + LOG(INFO) << "JIM -> " << __FUNCTION__; + std::lock_guard lk(cv_mutex_); + broadcast_count_++; + } + cv_.notify_all(); +} + uint64_t LeaderElectionManager::RandomInt(uint64_t min, uint64_t max) { static thread_local std::mt19937_64 gen(std::random_device{}()); std::uniform_int_distribution dist(min, max); @@ -116,20 +126,23 @@ uint64_t LeaderElectionManager::RandomInt(uint64_t min, uint64_t max) { Waited LeaderElectionManager::LeaderWait() { //LOG(INFO) << "JIM -> " << __FUNCTION__; std::unique_lock lk(cv_mutex_); + const uint64_t broadcast_snapshot = broadcast_count_; if (known_role_epoch_ != role_epoch_) { known_role_epoch_ = role_epoch_; return Waited::ROLE_CHANGE; } cv_.wait_for(lk, std::chrono::milliseconds(heartbeat_timer_), - [this] { + [this, broadcast_snapshot] { return (stop_.load() == true - || (known_role_epoch_ != role_epoch_)); + || (known_role_epoch_ != role_epoch_) + || (broadcast_snapshot != broadcast_count_)); }); if (stop_.load() == true) { return Waited::STOPPED; } else if (known_role_epoch_ != role_epoch_) { known_role_epoch_ = role_epoch_; return Waited::ROLE_CHANGE; } + else if (broadcast_snapshot != broadcast_count_) { return Waited::BROADCASTED; } else { 
return Waited::TIMEOUT; } } @@ -163,38 +176,46 @@ Waited LeaderElectionManager::Wait() { // Causes followers and candidates to start an election if no heartbeat received. void LeaderElectionManager::MonitoringElectionTimeout() { while (!stop_.load()) { - raft::Role role = raft_->GetRoleSnapshot(); + Role role = raft_->GetRoleSnapshot(); Waited res; - //std::chrono::steady_clock::time_point wait_start_time_ = std::chrono::steady_clock::now(); - //bool leader = false; - if (role == raft::Role::LEADER) { + std::chrono::steady_clock::time_point wait_start_time_ = std::chrono::steady_clock::now(); + bool leader = false; + if (role == Role::LEADER) { res = LeaderWait(); - //leader = true; + leader = true; } else { res = Wait(); } - //std::chrono::steady_clock::time_point wait_end_time_ = std::chrono::steady_clock::now(); - //std::chrono::steady_clock::duration delta = wait_end_time_ - wait_start_time_; - //auto ms = std::chrono::duration_cast(delta).count(); - //LOG(INFO) << __FUNCTION__ << ": " << (leader ? "Leader" : "") << "Wait " << ms << "ms"; + std::chrono::steady_clock::time_point wait_end_time_ = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration delta = wait_end_time_ - wait_start_time_; + auto ms = std::chrono::duration_cast(delta).count(); + if (raft_->livenessLoggingFlag_) { + LOG(INFO) << __FUNCTION__ << ": " << (leader ? "Leader" : "") << "Wait " << ms << "ms"; + } if (res == Waited::STOPPED) { break; } else if (res == Waited::ROLE_CHANGE) { - //LOG(INFO) << __FUNCTION__ << ": Role change detected"; + LOG(INFO) << __FUNCTION__ << ": Role change detected"; continue; } else if (res == Waited::HEARTBEAT) { //LOG(INFO) << __FUNCTION__ << ": Heartbeat received within window"; - if (raft_->GetRoleSnapshot() == raft::Role::LEADER) { + if (raft_->GetRoleSnapshot() == Role::LEADER) { // A leader receiving a heartbeat would be unusual but not impossible. 
LOG(WARNING) << __FUNCTION__ << " Received Heartbeat as LEADER"; } continue; } + else if (res == Waited::BROADCASTED) { + if (raft_->livenessLoggingFlag_) { + LOG(INFO) << __FUNCTION__ << ": AE Broadcast reset leader heartbeat timer"; + } + continue; + } // Only gets here if timeout expired. // Leaders send a new heartbeat. - if (raft_->GetRoleSnapshot() == raft::Role::LEADER) { + if (raft_->GetRoleSnapshot() == Role::LEADER) { raft_->SendHeartBeat(); } // Followers and Candidates start an election. diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h index 984e89107d..ed52234629 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h @@ -35,7 +35,8 @@ enum class Waited { HEARTBEAT, STOPPED, TIMEOUT, - ROLE_CHANGE + ROLE_CHANGE, + BROADCASTED }; class LeaderElectionManager { @@ -48,6 +49,7 @@ class LeaderElectionManager { void SetRaft(raft::Raft*); void OnHeartBeat(); void OnRoleChange(); + void OnAeBroadcast(); private: Waited LeaderWait(); @@ -69,6 +71,7 @@ class LeaderElectionManager { uint64_t timeout_max_ms; uint64_t heartbeat_timer_; uint64_t heartbeat_count_; // Protected by cv_mutex_ + uint64_t broadcast_count_; // Protected by cv_mutex_ //std::chrono::steady_clock::time_point last_heartbeat_time_; uint64_t role_epoch_; // Protected by cv_mutex_ uint64_t known_role_epoch_; // Protected by cv_mutex_ diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 04a17cd53e..537ce6e7d8 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -40,7 +40,7 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, lastLogIndex_(0), commitIndex_(0), lastApplied_(0), - role_(raft::Role::FOLLOWER), + role_(Role::FOLLOWER), 
is_stop_(false), quorum_((total_num/2) + 1), verifier_(verifier), @@ -58,6 +58,7 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, sentinel->command = "COMMON_PREFIX"; log_.push_back(std::move(sentinel)); + inflight_.assign(total_num_ + 1, 0); nextIndex_.assign(total_num_ + 1, lastLogIndex_ + 1); matchIndex_.assign(total_num_ + 1, lastLogIndex_); } @@ -66,27 +67,8 @@ Raft::~Raft() { is_stop_ = true; } bool Raft::IsStop() { return is_stop_; } -void Raft::CreateAndSendAppendEntryMsg(uint64_t replica_id, uint64_t nextIndex, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, - std::string cmd, uint64_t entry_term) { - AppendEntries ae; - ae.set_term(term); - ae.set_leaderid(id_); - ae.set_prevlogindex(nextIndex); - ae.set_prevlogterm(prevLogTerm); - auto* e = ae.add_entries(); - e->set_term(entry_term); - e->set_command(cmd); - ae.set_leadercommitindex(leaderCommit); - SendMessage(MessageType::AppendEntriesMsg, ae, replica_id); -} - bool Raft::ReceiveTransaction(std::unique_ptr req) { - uint64_t term; - uint64_t prevLogIndex; - uint64_t prevLogTerm; - uint64_t leaderCommit; - std::string cmd; - std::vector nextIndexCopy = nextIndex_; + std::vector> messages; { std::lock_guard lk(mutex_); if (role_ != Role::LEADER) { @@ -95,15 +77,6 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Replica is not leader, returning early"; return false; } - // prepare fields for appendEntries message - term = currentTerm_; - prevLogIndex = lastLogIndex_; - prevLogTerm = getLastLogTermLocked(); - leaderCommit = commitIndex_; - - // assign seq number as log index for the request or executing transactions fails. 
- req->set_seq(lastLogIndex_ + 1); - // append new transaction to log auto entry = std::make_unique(); entry->term = currentTerm_; @@ -111,25 +84,30 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": req could not be serialized"; return false; } - cmd = entry->command; log_.push_back(std::move(entry)); + + // TODO // durably store the new entry somehow // otherwise it is a safety violation to treat it as "appended" - // should not be sending AEs before durable. + // should not be responding to RPCs before durable. lastLogIndex_++; nextIndex_[id_] = lastLogIndex_ + 1; matchIndex_[id_] = lastLogIndex_; - nextIndexCopy[id_] = nextIndex_[id_]; - } - //LOG(INFO) << "Received Transaction to primary id: " << id_; - for (int replica_id = 1; replica_id <= total_num_; ++replica_id) { - auto prevLogIndexForReplica = nextIndexCopy[replica_id] - 1; - if (replica_id != id_) {CreateAndSendAppendEntryMsg(replica_id, prevLogIndexForReplica, term, prevLogTerm, leaderCommit, cmd, term);} + if (replicationLoggingFlag_) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Leader appended entry at index " << lastLogIndex_; + } + + // prepare fields for appendEntries message + messages = GatherAeFieldsForBroadcastLocked(); } + for (const auto& [followerId, fields] : messages) { + CreateAndSendAppendEntryMsg(followerId, fields); + } + leader_election_manager_->OnAeBroadcast(); return true; } @@ -145,6 +123,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { auto leaderId = ae->leaderid(); std::vector> eToApply; + const char* parent_fn = __FUNCTION__; [&]() { std::lock_guard lk(mutex_); initialRole = role_; @@ -159,37 +138,65 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { } term = currentTerm_; if (!success) { return; } - // TODO Implement an entry existing but with a different term - // delete that entry and all after it + uint64_t idx = ae->prevlogindex() + 1; for (const auto& e : ae->entries()) { auto entry = 
std::make_unique(); entry->term = e.term(); entry->command = e.command(); - log_.push_back(std::move(entry)); - lastLogIndex_++; - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Appended to log at index" << lastLogIndex_; + + // entry is at new position + if (idx > lastLogIndex_) { + log_.push_back(std::move(entry)); + lastLogIndex_ = idx; + + if (replicationLoggingFlag_) { + LOG(INFO) << "JIM -> " << parent_fn << ": follower appended new entry at index " << lastLogIndex_; + } + + } + // entry is at an existing position && new term doesnt match old term + else if (entry->term != log_[idx]->term) { + auto first = log_.begin() + idx; + auto last = log_.begin() + lastLogIndex_ + 1; + log_.erase(first, last); + log_.push_back(std::move(entry)); + lastLogIndex_ = idx; + + if (replicationLoggingFlag_) { + LOG(INFO) << "JIM -> " << parent_fn << ": follower saw term mismatch at index " << lastLogIndex_ << ". Later entries erased"; + } + + } + ++idx; // TODO: have to actually store the entry durably before it can be considered "appended" } lastLogIndex = lastLogIndex_; - //uint64_t prevCommitIndex = commitIndex_; + uint64_t prevCommitIndex = commitIndex_; if (leaderCommit > commitIndex_) { commitIndex_ = std::min(leaderCommit, lastLogIndex_); - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Raised commitIndex_ from " - // << prevCommitIndex << " to " << commitIndex_; + + if (replicationLoggingFlag_ && commitIndex_ > prevCommitIndex) { + LOG(INFO) << "JIM -> " << parent_fn << ": Raised commitIndex_ from " + << prevCommitIndex << " to " << commitIndex_; + } + } // apply any newly committed entries to state machine eToApply = PrepareCommitLocked(); }(); - //auto now = std::chrono::steady_clock::now(); - //std::chrono::steady_clock::duration delta; - //delta = now - last_ae_time_; - //last_ae_time_ = now; - //auto ms = std::chrono::duration_cast(delta).count(); - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": AE received after " << ms << "ms"; + auto now = 
std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration delta; + delta = now - last_ae_time_; + last_ae_time_ = now; + auto ms = std::chrono::duration_cast(delta).count(); + + if (replicationLoggingFlag_) { + //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": AE received after " << ms << "ms"; + } if (demoted) { leader_election_manager_->OnRoleChange(); @@ -199,9 +206,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { if (tr != TermRelation::STALE) { leader_election_manager_->OnHeartBeat(); } - for (auto& e : eToApply) { - commit_(*e); - } + for (auto& e : eToApply) { commit_(*e); } AppendEntriesResponse aer; aer.set_term(term); @@ -209,27 +214,24 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { aer.set_id(id_); aer.set_lastlogindex(lastLogIndex); SendMessage(MessageType::AppendEntriesResponseMsg, aer, leaderId); - //if (success) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; } - //else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } + + if (replicationLoggingFlag_) { + //if (success) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; } + //else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } + } return true; } bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr aer) { uint64_t term; - uint64_t prevLogIndex; - uint64_t prevLogTerm; - uint64_t leaderCommit; - uint64_t resendEntryTerm; - std::string cmd; bool demoted = false; bool resending = false; TermRelation tr; Role initialRole; std::vector> eToApply; - AppendEntries resend; - std::vector nextIndexCopy = nextIndex_; - std::vector sendList; - std::vector sendListTerm; + AeFields fields; + int followerId = aer->id(); + const char* parent_fn = __FUNCTION__; [&]() { std::lock_guard lk(mutex_); initialRole = role_; @@ -239,70 +241,30 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a if (role_ != Role::LEADER || tr == TermRelation::STALE) { return; } - // ===================== SUCCESS CASE 
===================== - if (aer->success()) { - nextIndex_[aer->id()] = aer->lastlogindex() + 1; + nextIndex_[followerId] = aer->lastlogindex() + 1; + // if successful, update matchIndex and try to commit more entries + if (aer->success()) { // need to ensure matchIndex never decreases even if followers lastLogIndex decreases - matchIndex_[aer->id()] = std::max(matchIndex_[aer->id()], aer->lastlogindex()); - + matchIndex_[followerId] = std::max(matchIndex_[followerId], aer->lastlogindex()); // use updated matchIndex to find new entries eligible for commit std::vector sorted = matchIndex_; std::sort(sorted.begin(), sorted.end(), std::greater()); uint64_t lastReplicatedIndex = sorted[quorum_ - 1]; - // Need to check the lastReplicatedIndex contains entry from current term if (lastReplicatedIndex > commitIndex_ && log_[lastReplicatedIndex]->term == currentTerm_) { - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Raised commitIndex_ from " - // << commitIndex_ << " to " << lastReplicatedIndex; + LOG(INFO) << "JIM -> " << parent_fn << ": Raised commitIndex_ from " + << commitIndex_ << " to " << lastReplicatedIndex; commitIndex_ = lastReplicatedIndex; } - // apply any newly committed entries to state machine eToApply = PrepareCommitLocked(); } - // ===================== FAILURE CASE ===================== - else { - LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << aer->id(); - // Move nextIndex one step back, but don't go below 1 - nextIndex_[aer->id()] = aer->lastlogindex() - 1; - if (nextIndex_[aer->id()] < 1) { - nextIndex_[aer->id()] = 1; - } - uint64_t resendIndex = nextIndex_[aer->id()]; - //LOG(INFO) << "Updated nextIndex_ for follower " << aer->id() - // << " to " << resendIndex; - - // Check that we actually have an entry at this index - if (resendIndex == 0 || resendIndex > lastLogIndex_) { - LOG(INFO) << "No log entry at index " << resendIndex - << " to resend; lastLogIndex_ = " - << lastLogIndex_; - return; - } - uint64_t prevIdx = 
resendIndex - 1; - // prepare fields for appendEntries message - term = currentTerm_; - prevLogTerm = log_[prevIdx]->term; - leaderCommit = commitIndex_; + // if failure, or if nextIndex[i] < lastLogIndex + 1 (follower isnt caught up) + if (!aer->success() || (nextIndex_[followerId] < lastLogIndex_ + 1)) { + if (!aer->success()) { LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << followerId; } + fields = GatherAeFieldsLocked(followerId); resending = true; - - while (resendIndex <= lastLogIndex_) { - LOG(INFO) << "Resending AppendEntries for index " << resendIndex << " to " << aer->id(); - const LogEntry& resendEntry = *log_[resendIndex]; - cmd = resendEntry.command; - resendEntryTerm = resendEntry.term; - prevLogTerm = log_[prevIdx]->term; - sendList.push_back(resendEntry); - sendListTerm.push_back(prevLogTerm); - resendIndex++; - prevIdx++; - } - - // LOG(INFO) << "Resending AppendEntries for index " << resendIndex - // << " (prevIdx=" << prevIdx - // << ", prevTerm=" << prevTerm - // << ") to follower " << aer->id(); } }(); if (demoted) { @@ -312,11 +274,7 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a return false; } if (resending) { - auto prevLogIndexForReplica = nextIndexCopy[aer->id()] - 1; - for (int i = 0; i < sendList.size(); i++) { - LOG(INFO) << "Sending Command: " << sendList[i].command << " Term: " << sendList[i].term; - CreateAndSendAppendEntryMsg(aer->id(), prevLogIndexForReplica + i, term, sendListTerm[i], leaderCommit, sendList[i].command, sendList[i].term); - } + CreateAndSendAppendEntryMsg(followerId, fields); } for (auto& e : eToApply) { @@ -338,6 +296,7 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { if (rvSender == id_) { return; } + const char* parent_fn = __FUNCTION__; [&]() { std::lock_guard lk(mutex_); initialRole = role_; @@ -392,6 +351,7 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) bool elected = false; Role initialRole; + const char* parent_fn = __FUNCTION__; [&]() { 
std::lock_guard lk(mutex_); initialRole = role_; @@ -406,17 +366,18 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) bool dupe = (std::find(votes_.begin(), votes_.end(), voterId) != votes_.end()); if (dupe) { return; } votes_.push_back(voterId); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Replica " << voterId << " voted for me. Votes: " + LOG(INFO) << "JIM -> " << parent_fn << ": Replica " << voterId << " voted for me. Votes: " << votes_.size() << "/" << quorum_ << " in term " << currentTerm_; if (votes_.size() >= quorum_) { elected = true; role_ = Role::LEADER; + inflight_.assign(total_num_ + 1, 0); nextIndex_.assign(total_num_ + 1, lastLogIndex_ + 1); // make sure to set leaders own matchIndex entry to lastLogIndex matchIndex_.assign(total_num_ + 1, 0); matchIndex_[id_] = lastLogIndex_; - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": CANDIDATE->LEADER in term " << currentTerm_; + LOG(INFO) << "JIM -> " << parent_fn << ": CANDIDATE->LEADER in term " << currentTerm_; } }(); if (demoted || elected) { leader_election_manager_->OnRoleChange(); } @@ -427,7 +388,7 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) if (elected) { SendHeartBeat(); } } -raft::Role Raft::GetRoleSnapshot() const { +Role Raft::GetRoleSnapshot() const { std::lock_guard lk(mutex_); return role_; } @@ -442,12 +403,12 @@ void Raft::StartElection() { { std::lock_guard lk(mutex_); - if (role_ == raft::Role::LEADER) { + if (role_ == Role::LEADER) { LOG(WARNING) << __FUNCTION__ << ": Leader tried to start election"; return; } - if (role_ == raft::Role::FOLLOWER) { - role_ = raft::Role::CANDIDATE; + if (role_ == Role::FOLLOWER) { + role_ = Role::CANDIDATE; roleChanged = true; } heartBeatsSentThisTerm_ = 0; @@ -477,46 +438,45 @@ void Raft::StartElection() { } void Raft::SendHeartBeat() { - uint64_t currentTerm; - int leaderId = id_; - uint64_t prevLogIndex; - uint64_t prevLogTerm; - std::string entries; - uint64_t leaderCommit; - //uint64_t heartBeatNum; - - //auto now = 
std::chrono::steady_clock::now(); - //std::chrono::steady_clock::duration delta; + auto functionStart = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration functionDelta; + std::vector> messages; + uint64_t currentTerm; + uint64_t heartBeatNum; { std::lock_guard lk(mutex_); - if (role_ != raft::Role::LEADER) { + if (role_ != Role::LEADER) { LOG(WARNING) << __FUNCTION__ << ": Non-Leader tried to start HeartBeat"; return; } - //heartBeatsSentThisTerm_++; - //heartBeatNum = heartBeatsSentThisTerm_; currentTerm = currentTerm_; - prevLogIndex = lastLogIndex_; - prevLogTerm = getLastLogTermLocked(); - entries = ""; - leaderCommit = commitIndex_; - - // delta = now - last_heartbeat_time_; - // last_heartbeat_time_ = now; + + heartBeatsSentThisTerm_++; + heartBeatNum = heartBeatsSentThisTerm_; + messages = GatherAeFieldsForBroadcastLocked(true); } - //auto ms = std::chrono::duration_cast(delta).count(); - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat sent after " << ms << "ms"; - AppendEntries ae; - ae.set_term(currentTerm); - ae.set_leaderid(leaderId); - ae.set_prevlogindex(prevLogIndex); // TODO - ae.set_prevlogterm(prevLogTerm); - ae.set_leadercommitindex(leaderCommit); - Broadcast(MessageType::AppendEntriesMsg, ae); - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat " << heartBeatNum << " for term " << currentTerm; + auto msgStart = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration msgDelta; + + for (const auto& [followerId, fields] : messages) { + CreateAndSendAppendEntryMsg(followerId, fields); + } + + auto msgEnd = std::chrono::steady_clock::now(); + msgDelta = msgEnd - msgStart; + auto msgMs = std::chrono::duration_cast(msgDelta).count(); + if (livenessLoggingFlag_) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": " << msgMs << " ms elapsed in CreateAndSend loop"; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat " << heartBeatNum << " for term " << currentTerm; + } + + + auto redirectStart = 
std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration redirectDelta; + // Also ping client proxies that this is the leader DirectToLeader dtl; dtl.set_term(currentTerm); @@ -526,6 +486,20 @@ void Raft::SendHeartBeat() { SendMessage(DirectToLeaderMsg, dtl, id); //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": DirectToLeader " << id_ << " sent to proxy " << id; } + + auto redirectEnd = std::chrono::steady_clock::now(); + redirectDelta = redirectEnd - redirectStart; + auto redirectMs = std::chrono::duration_cast(redirectDelta).count(); + + + auto functionEnd = std::chrono::steady_clock::now(); + functionDelta = functionEnd - functionStart; + auto functionMs = std::chrono::duration_cast(functionDelta).count(); + + if (livenessLoggingFlag_) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": " << redirectMs << " ms elapsed in redirect loop"; + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": " << functionMs << " ms elapsed in function"; + } } // requires raft mutex to be held @@ -535,8 +509,8 @@ bool Raft::DemoteSelfLocked(uint64_t term) { currentTerm_ = term; votedFor_ = -1; } - if (role_ != raft::Role::FOLLOWER) { - role_ = raft::Role::FOLLOWER; + if (role_ != Role::FOLLOWER) { + role_ = Role::FOLLOWER; //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted to FOLLOWER"; return true; } @@ -555,22 +529,92 @@ uint64_t Raft::getLastLogTermLocked() const { return log_[lastLogIndex_]->term; } +// requires raft mutex to be held std::vector> Raft::PrepareCommitLocked() { std::vector> v; - while (commitIndex_ > lastApplied_) { - lastApplied_++; - auto command = std::make_unique(); - if (!command->ParseFromString(log_[lastApplied_]->command)) { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Failed to parse command"; - continue; - } - v.push_back(std::move(command)); - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " << lastApplied_; + uint64_t begin = lastApplied_ + 1; + bool applying = false; + while (lastApplied_ < commitIndex_) { + 
++lastApplied_; + auto command = std::make_unique(); + if (!command->ParseFromString(log_[lastApplied_]->command)) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Failed to parse command"; + continue; } + // assign seq number as log index for the request or executing transactions fails. + command->set_seq(lastApplied_); + v.push_back(std::move(command)); + applying = true; + } + + if (applying && replicationLoggingFlag_) { + if (lastApplied_ > begin) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entries " << begin << " to " << lastApplied_; + } + else { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " << lastApplied_; + } + } + return v; } +AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { + AeFields f{}; + f.term = currentTerm_; + f.leaderId = id_; + f.leaderCommit = commitIndex_; + f.prevLogIndex = nextIndex_[followerId] - 1; + f.prevLogTerm = log_[f.prevLogIndex]->term; + if (heartBeat) { return f; } + const uint64_t firstNew = nextIndex_[followerId]; + const uint64_t limit = std::min(lastLogIndex_, (firstNew + maxEntries) - 1); + for (uint64_t i = firstNew; i <= limit; ++i) { + LogEntry e; + e.term = log_[i]->term; + e.command = log_[i]->command; + f.entries.push_back(e); + } + return f; +} + +// returns vector of tuples +// If heartBeat == true, entries[] will be empty for all messages +// else entries will each contain at most maxEntries amount of entries +std::vector> Raft::GatherAeFieldsForBroadcastLocked(bool heartBeat) const { + std::vector> v; + v.reserve(total_num_ - 1); + for (int i = 1; i <= total_num_; ++i) { + if (i == id_) { continue; } + AeFields f = GatherAeFieldsLocked(i, heartBeat); + v.emplace_back(i, f); + } + return v; +} +void Raft::CreateAndSendAppendEntryMsg(int followerId, const AeFields& f) { + AppendEntries ae; + ae.set_term(f.term); + ae.set_leaderid(f.leaderId); + ae.set_prevlogindex(f.prevLogIndex); + ae.set_prevlogterm(f.prevLogTerm); + 
ae.set_leadercommitindex(f.leaderCommit); + uint64_t entryCount = 0; + for (const auto& entry : f.entries) { + auto* e = ae.add_entries(); + e->set_term(entry.term); + e->set_command(entry.command); + if (entryCount > 0 && ae.ByteSizeLong() > maxBytes) { + ae.mutable_entries()->RemoveLast(); + break; + } + entryCount++; + } + SendMessage(MessageType::AppendEntriesMsg, ae, followerId); + if (replicationLoggingFlag_) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Sent AE with " << entryCount << (entryCount == 1 ? " entry" : " entries"); + } +} } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index d6f79c2f26..e2d7ac4602 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -47,6 +47,15 @@ struct LogEntry { std::string command; }; +struct AeFields { + uint64_t term = 0; + int leaderId = -1; + uint64_t prevLogIndex = 0; + uint64_t prevLogTerm = 0; + std::vector entries{}; + uint64_t leaderCommit = 0; +}; + class Raft : public common::ProtocolBase { public: Raft(int id, int f, int total_num, @@ -56,18 +65,17 @@ class Raft : public common::ProtocolBase { ); ~Raft(); + const bool replicationLoggingFlag_ = true; + const bool livenessLoggingFlag_ = false; + bool ReceiveTransaction(std::unique_ptr req); bool ReceiveAppendEntries(std::unique_ptr ae); bool ReceiveAppendEntriesResponse(std::unique_ptr aer); void ReceiveRequestVote(std::unique_ptr rv); void ReceiveRequestVoteResponse(std::unique_ptr rvr); - - raft::Role GetRoleSnapshot() const; void StartElection(); void SendHeartBeat(); - //nextIndexCopy is a copy of nextIndex_ to prevent updating it outside a mutex lock - void CreateAndSendAppendEntryMsg(uint64_t replica_id, uint64_t nextIndex, uint64_t term, uint64_t prevLogTerm, uint64_t leaderCommit, - std::string cmd, uint64_t entry_term); + Role GetRoleSnapshot() const; private: mutable std::mutex 
mutex_; @@ -76,8 +84,12 @@ class Raft : public common::ProtocolBase { bool DemoteSelfLocked(uint64_t term); // Must be called under mutex uint64_t getLastLogTermLocked() const; // Must be called under mutex bool IsStop(); - bool IsDuplicateLogEntry(const std::string& hash) const; // Must be called under mutex - std::vector> PrepareCommitLocked(); + //bool IsDuplicateLogEntry(const std::string& hash) const; // Must be called under mutex + std::vector> PrepareCommitLocked(); // Must be called under mutex + AeFields GatherAeFieldsLocked(int followerId, bool heartBeat = false) const; // Must be called under mutex + std::vector> GatherAeFieldsForBroadcastLocked(bool heartBeat = false) const; // Must be called under mutex + + void CreateAndSendAppendEntryMsg(int followerId, const AeFields& f); // Persistent state on all servers: uint64_t currentTerm_; // Protected by mutex_ @@ -94,8 +106,9 @@ class Raft : public common::ProtocolBase { uint64_t commitIndex_; // Protected by mutex_ uint64_t lastApplied_; // Protected by mutex_ Role role_; // Protected by mutex_ - int LeaderId; // Protected by mutex_ + //int leaderId_; // Protected by mutex_ std::vector votes_; // Protected by mutex_ + std::vector inflight_; // Protected by mutex_ std::chrono::steady_clock::time_point last_ae_time_; std::chrono::steady_clock::time_point last_heartbeat_time_; // Protected by mutex_ @@ -104,11 +117,12 @@ class Raft : public common::ProtocolBase { // for limiting AppendEntries batch sizing static constexpr size_t maxBytes = 64 * 1024; - static constexpr size_t maxEntries = 16; + static constexpr size_t maxEntries = 8; + static constexpr size_t maxInFlightAE = 3; SignatureVerifier* verifier_; LeaderElectionManager* leader_election_manager_; - Stats* global_stats_; + //Stats* global_stats_; ReplicaCommunicator* replica_communicator_; }; diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 6dfd3de0b1..95a72ef8a2 
100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -39,8 +39,6 @@ Consensus::Consensus(const ResDBConfig& config, Init(); - start_ = 0; - if (config_.GetPublicKeyCertificateInfo() .public_key() .public_key_info() diff --git a/platform/consensus/ordering/raft/framework/consensus.h b/platform/consensus/ordering/raft/framework/consensus.h index 04cae13778..0b197e9751 100644 --- a/platform/consensus/ordering/raft/framework/consensus.h +++ b/platform/consensus/ordering/raft/framework/consensus.h @@ -43,10 +43,6 @@ class Consensus : public common::Consensus { protected: std::unique_ptr raft_; std::unique_ptr leader_election_manager_; - Stats* global_stats_; - int64_t start_; - std::mutex mutex_; - int send_num_[200]; }; } // namespace raft diff --git a/platform/networkstrate/replica_communicator.cpp b/platform/networkstrate/replica_communicator.cpp index 12c6c8b7cc..872c4f5786 100644 --- a/platform/networkstrate/replica_communicator.cpp +++ b/platform/networkstrate/replica_communicator.cpp @@ -181,17 +181,34 @@ const ReplicaInfo& replica_info) { global_stats_->BroadCastMsg(); if (is_use_long_conn_) { + /* + auto msgStart = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration msgDelta; + auto pushStart = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration pushDelta; + */ auto item = std::make_unique(); item->data = NetChannel::GetRawMessageString(message, verifier_); + /* + auto pushEnd = std::chrono::steady_clock::now(); + pushDelta = pushEnd - pushStart; + auto pushMs = std::chrono::duration_cast(pushDelta).count(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": " << pushMs << " ms elapsed getting raw msg string"; + */ std::lock_guard lk(smutex_); if(single_bq_.find(std::make_pair(ip, port)) == single_bq_.end()){ StartSingleInBackGround(ip, port); } assert(single_bq_[std::make_pair(ip, port)] != nullptr); single_bq_[std::make_pair(ip, 
port)]->Push(std::move(item)); + /* + auto msgEnd = std::chrono::steady_clock::now(); + msgDelta = msgEnd - msgStart; + auto msgMs = std::chrono::duration_cast(msgDelta).count(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": " << msgMs << " ms elapsed in is_use_long_conn_ conditional"; + */ return 0; } else { - return SendMessageInternal(message, replicas_); } } diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index 68732f59bf..64de9bf238 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -1,9 +1,10 @@ { - "clientBatchNum": 200, + "clientBatchNum": 1, "enable_viewchange": true, "recovery_enabled": false, + "not_need_signature": true, "max_client_complaint_num":10, - "max_process_txn": 100000, + "max_process_txn": 10, "worker_num": 1, "input_worker_num": 1, "output_worker_num": 10 From cbff3c70a1dbd269a7609cfe21cc448b3271d87e Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Mon, 15 Dec 2025 22:45:36 +0000 Subject: [PATCH 38/66] added followerId field to AeFields struct to simplify code changed variable names for semanatic value removed one line conditionals for readability --- .../raft/algorithm/leaderelection_manager.cpp | 28 +- .../ordering/raft/algorithm/raft.cpp | 239 ++++++++++++------ .../consensus/ordering/raft/algorithm/raft.h | 5 +- .../ordering/raft/framework/consensus.cpp | 22 +- 4 files changed, 186 insertions(+), 108 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp index 923a6a02bb..d2e4f1bf03 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.cpp @@ -137,13 +137,19 @@ Waited LeaderElectionManager::LeaderWait() { || (known_role_epoch_ != role_epoch_) || (broadcast_snapshot != broadcast_count_)); }); - if (stop_.load() == true) { return Waited::STOPPED; } + if 
(stop_.load() == true) { + return Waited::STOPPED; + } else if (known_role_epoch_ != role_epoch_) { known_role_epoch_ = role_epoch_; return Waited::ROLE_CHANGE; } - else if (broadcast_snapshot != broadcast_count_) { return Waited::BROADCASTED; } - else { return Waited::TIMEOUT; } + else if (broadcast_snapshot != broadcast_count_) { + return Waited::BROADCASTED; + } + else { + return Waited::TIMEOUT; + } } Waited LeaderElectionManager::Wait() { @@ -162,13 +168,19 @@ Waited LeaderElectionManager::Wait() { || (heartbeat_snapshot != heartbeat_count_) || (known_role_epoch_ != role_epoch_)); }); - if (stop_.load() == true) { return Waited::STOPPED; } + if (stop_.load() == true) { + return Waited::STOPPED; + } else if (known_role_epoch_ != role_epoch_) { known_role_epoch_ = role_epoch_; return Waited::ROLE_CHANGE; } - else if (heartbeat_snapshot != heartbeat_count_) { return Waited::HEARTBEAT; } - else { return Waited::TIMEOUT; } + else if (heartbeat_snapshot != heartbeat_count_) { + return Waited::HEARTBEAT; + } + else { + return Waited::TIMEOUT; + } } // Function that is run in server_checking_timeout_thread started in MayStart(). @@ -193,7 +205,9 @@ void LeaderElectionManager::MonitoringElectionTimeout() { if (raft_->livenessLoggingFlag_) { LOG(INFO) << __FUNCTION__ << ": " << (leader ? 
"Leader" : "") << "Wait " << ms << "ms"; } - if (res == Waited::STOPPED) { break; } + if (res == Waited::STOPPED) { + break; + } else if (res == Waited::ROLE_CHANGE) { LOG(INFO) << __FUNCTION__ << ": Role change detected"; continue; diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 537ce6e7d8..8614d8ee2d 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -63,12 +63,16 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, matchIndex_.assign(total_num_ + 1, lastLogIndex_); } -Raft::~Raft() { is_stop_ = true; } +Raft::~Raft() { + is_stop_ = true; +} -bool Raft::IsStop() { return is_stop_; } +bool Raft::IsStop() { + return is_stop_; +} bool Raft::ReceiveTransaction(std::unique_ptr req) { - std::vector> messages; + std::vector messages; { std::lock_guard lk(mutex_); if (role_ != Role::LEADER) { @@ -104,15 +108,17 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { // prepare fields for appendEntries message messages = GatherAeFieldsForBroadcastLocked(); } - for (const auto& [followerId, fields] : messages) { - CreateAndSendAppendEntryMsg(followerId, fields); + for (const auto& msg : messages) { + CreateAndSendAppendEntryMsg(msg); } leader_election_manager_->OnAeBroadcast(); return true; } bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { - if (ae->leaderid() == id_) { return false; } + if (ae->leaderid() == id_) { + return false; + } uint64_t term; bool success = false; bool demoted = false; @@ -129,25 +135,45 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { initialRole = role_; lastLogIndex = lastLogIndex_; tr = TermCheckLocked(ae->term()); - if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(ae->term()); } - else if (role_ != Role::FOLLOWER && tr == TermRelation::CURRENT) { demoted = DemoteSelfLocked(ae->term()); } + if (tr == TermRelation::NEW) { + demoted = 
DemoteSelfLocked(ae->term()); + } + else if (role_ != Role::FOLLOWER && tr == TermRelation::CURRENT) { + demoted = DemoteSelfLocked(ae->term()); + } if (tr != TermRelation::STALE && role_ == Role::FOLLOWER) { uint64_t i = ae->prevlogindex(); - if (i < static_cast(log_.size()) && ae->prevlogterm() == log_[i]->term) { success = true; } + if (i < static_cast(log_.size()) && ae->prevlogterm() == log_[i]->term) { + success = true; + } } term = currentTerm_; - if (!success) { return; } + if (!success) { + return; + } + + + /* + new logic concept: + rather than checking idx > lastLogIndex_, check idx == lastLogIndex_ + 1 (should be equivalent, has semantic value) + + First, loop over entries before lastLogIndex + 1 and look for conflicts. + If conflict occurs, wipe suffix and set LastLogIndex = idx + If idx already = lastLogIndex_ + 1, this loop is skipped + + Second, batch append all remaining entries to log + */ uint64_t idx = ae->prevlogindex() + 1; - for (const auto& e : ae->entries()) { - auto entry = std::make_unique(); - entry->term = e.term(); - entry->command = e.command(); + for (const auto& entry : ae->entries()) { + auto newEntry = std::make_unique(); + newEntry->term = entry.term(); + newEntry->command = entry.command(); // entry is at new position if (idx > lastLogIndex_) { - log_.push_back(std::move(entry)); + log_.push_back(std::move(newEntry)); lastLogIndex_ = idx; if (replicationLoggingFlag_) { @@ -156,11 +182,11 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { } // entry is at an existing position && new term doesnt match old term - else if (entry->term != log_[idx]->term) { + else if (newEntry->term != log_[idx]->term) { auto first = log_.begin() + idx; auto last = log_.begin() + lastLogIndex_ + 1; log_.erase(first, last); - log_.push_back(std::move(entry)); + log_.push_back(std::move(newEntry)); lastLogIndex_ = idx; if (replicationLoggingFlag_) { @@ -181,7 +207,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { LOG(INFO) << "JIM -> " 
<< parent_fn << ": Raised commitIndex_ from " << prevCommitIndex << " to " << commitIndex_; } - + } // apply any newly committed entries to state machine @@ -192,10 +218,13 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { std::chrono::steady_clock::duration delta; delta = now - last_ae_time_; last_ae_time_ = now; - auto ms = std::chrono::duration_cast(delta).count(); + if (replicationLoggingFlag_) { - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": AE received after " << ms << "ms"; + /* + auto ms = std::chrono::duration_cast(delta).count(); + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": AE received after " << ms << "ms"; + */ } if (demoted) { @@ -204,9 +233,13 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { << (initialRole == Role::LEADER ? "LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; } - if (tr != TermRelation::STALE) { leader_election_manager_->OnHeartBeat(); } + if (tr != TermRelation::STALE) { + leader_election_manager_->OnHeartBeat(); + } - for (auto& e : eToApply) { commit_(*e); } + for (auto& entry : eToApply) { + commit_(*entry); + } AppendEntriesResponse aer; aer.set_term(term); @@ -216,8 +249,14 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { SendMessage(MessageType::AppendEntriesResponseMsg, aer, leaderId); if (replicationLoggingFlag_) { - //if (success) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; } - //else { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; } + /* + if (success) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded success"; + } + else { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": responded failure"; + } + */ } return true; } @@ -236,10 +275,14 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a std::lock_guard lk(mutex_); initialRole = role_; tr = TermCheckLocked(aer->term()); - if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(aer->term()); } + if (tr == TermRelation::NEW) { + demoted = DemoteSelfLocked(aer->term()); + } term = 
currentTerm_; - if (role_ != Role::LEADER || tr == TermRelation::STALE) { return; } + if (role_ != Role::LEADER || tr == TermRelation::STALE) { + return; + } nextIndex_[followerId] = aer->lastlogindex() + 1; @@ -262,7 +305,9 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a } // if failure, or if nextIndex[i] < lastLogIndex + 1 (follower isnt caught up) if (!aer->success() || (nextIndex_[followerId] < lastLogIndex_ + 1)) { - if (!aer->success()) { LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << followerId; } + if (!aer->success()) { + LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << followerId; + } fields = GatherAeFieldsLocked(followerId); resending = true; } @@ -274,11 +319,11 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a return false; } if (resending) { - CreateAndSendAppendEntryMsg(followerId, fields); + CreateAndSendAppendEntryMsg(fields); } - for (auto& e : eToApply) { - commit_(*e); + for (auto& entry : eToApply) { + commit_(*entry); } return true; } @@ -294,9 +339,11 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { int votedFor = -1; Role initialRole; - if (rvSender == id_) { return; } + if (rvSender == id_) { + return; + } - const char* parent_fn = __FUNCTION__; + //const char* parent_fn = __FUNCTION__; [&]() { std::lock_guard lk(mutex_); initialRole = role_; @@ -307,14 +354,19 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { term = currentTerm_; return; } - else if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(rvTerm); } + else if (tr == TermRelation::NEW) { + demoted = DemoteSelfLocked(rvTerm); + } // Then we continue voting process term = currentTerm_; votedFor = votedFor_; uint64_t lastLogTerm = getLastLogTermLocked(); - if (rv->lastlogterm() < lastLogTerm) { return; } - if (rv->lastlogterm() == lastLogTerm - && rv->lastlogindex() < lastLogIndex_) { return; } + if (rv->lastlogterm() < lastLogTerm) { + return; + } + if (rv->lastlogterm() == lastLogTerm && 
rv->lastlogindex() < lastLogIndex_) { + return; + } validCandidate = true; if (votedFor_ == -1 || votedFor_ == rvSender) { votedFor_ = rvSender; @@ -356,15 +408,23 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) std::lock_guard lk(mutex_); initialRole = role_; TermRelation tr = TermCheckLocked(term); - if (tr == TermRelation::STALE) { return; } + if (tr == TermRelation::STALE) { + return; + } else if (tr == TermRelation::NEW) { demoted = DemoteSelfLocked(term); return; } - if (role_ != Role::CANDIDATE) { return; } - if (!votedYes) { return; } + if (role_ != Role::CANDIDATE) { + return; + } + if (!votedYes) { + return; + } bool dupe = (std::find(votes_.begin(), votes_.end(), voterId) != votes_.end()); - if (dupe) { return; } + if (dupe) { + return; + } votes_.push_back(voterId); LOG(INFO) << "JIM -> " << parent_fn << ": Replica " << voterId << " voted for me. Votes: " << votes_.size() << "/" << quorum_ << " in term " << currentTerm_; @@ -380,12 +440,16 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) LOG(INFO) << "JIM -> " << parent_fn << ": CANDIDATE->LEADER in term " << currentTerm_; } }(); - if (demoted || elected) { leader_election_manager_->OnRoleChange(); } + if (demoted || elected) { + leader_election_manager_->OnRoleChange(); + } if (demoted) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted from " << (initialRole == Role::LEADER ? 
"LEADER" : "CANDIDATE") << "->FOLLOWER in term " << term; } - if (elected) { SendHeartBeat(); } + if (elected) { + SendHeartBeat(); + } } Role Raft::GetRoleSnapshot() const { @@ -441,7 +505,7 @@ void Raft::SendHeartBeat() { auto functionStart = std::chrono::steady_clock::now(); std::chrono::steady_clock::duration functionDelta; - std::vector> messages; + std::vector messages; uint64_t currentTerm; uint64_t heartBeatNum; { @@ -454,14 +518,15 @@ void Raft::SendHeartBeat() { heartBeatsSentThisTerm_++; heartBeatNum = heartBeatsSentThisTerm_; - messages = GatherAeFieldsForBroadcastLocked(true); + bool heartbeat = true; + messages = GatherAeFieldsForBroadcastLocked(heartbeat); } auto msgStart = std::chrono::steady_clock::now(); std::chrono::steady_clock::duration msgDelta; - for (const auto& [followerId, fields] : messages) { - CreateAndSendAppendEntryMsg(followerId, fields); + for (const auto& msg : messages) { + CreateAndSendAppendEntryMsg(msg); } auto msgEnd = std::chrono::steady_clock::now(); @@ -519,9 +584,15 @@ bool Raft::DemoteSelfLocked(uint64_t term) { // requires raft mutex to be held TermRelation Raft::TermCheckLocked(uint64_t term) const { - if (term < currentTerm_) { return TermRelation::STALE; } - else if (term == currentTerm_) { return TermRelation::CURRENT; } - else { return TermRelation::NEW; } + if (term < currentTerm_) { + return TermRelation::STALE; + } + else if (term == currentTerm_) { + return TermRelation::CURRENT; + } + else { + return TermRelation::NEW; + } } // requires raft mutex to be held @@ -531,7 +602,7 @@ uint64_t Raft::getLastLogTermLocked() const { // requires raft mutex to be held std::vector> Raft::PrepareCommitLocked() { - std::vector> v; + std::vector> commitVec; uint64_t begin = lastApplied_ + 1; bool applying = false; while (lastApplied_ < commitIndex_) { @@ -543,7 +614,7 @@ std::vector> Raft::PrepareCommitLocked() { } // assign seq number as log index for the request or executing transactions fails. 
command->set_seq(lastApplied_); - v.push_back(std::move(command)); + commitVec.push_back(std::move(command)); applying = true; } @@ -556,54 +627,60 @@ std::vector> Raft::PrepareCommitLocked() { } } - return v; + return commitVec; } AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { - AeFields f{}; - f.term = currentTerm_; - f.leaderId = id_; - f.leaderCommit = commitIndex_; - f.prevLogIndex = nextIndex_[followerId] - 1; - f.prevLogTerm = log_[f.prevLogIndex]->term; - if (heartBeat) { return f; } + AeFields fields{}; + fields.term = currentTerm_; + fields.leaderId = id_; + fields.leaderCommit = commitIndex_; + fields.prevLogIndex = nextIndex_[followerId] - 1; + fields.prevLogTerm = log_[fields.prevLogIndex]->term; + fields.followerId = followerId; + if (heartBeat) { + return fields; + } const uint64_t firstNew = nextIndex_[followerId]; const uint64_t limit = std::min(lastLogIndex_, (firstNew + maxEntries) - 1); for (uint64_t i = firstNew; i <= limit; ++i) { - LogEntry e; - e.term = log_[i]->term; - e.command = log_[i]->command; - f.entries.push_back(e); + LogEntry entry; + entry.term = log_[i]->term; + entry.command = log_[i]->command; + fields.entries.push_back(entry); } - return f; + return fields; } // returns vector of tuples // If heartBeat == true, entries[] will be empty for all messages // else entries will each contain at most maxEntries amount of entries -std::vector> Raft::GatherAeFieldsForBroadcastLocked(bool heartBeat) const { - std::vector> v; - v.reserve(total_num_ - 1); +std::vector Raft::GatherAeFieldsForBroadcastLocked(bool heartBeat) const { + std::vector fieldsVec; + fieldsVec.reserve(total_num_ - 1); for (int i = 1; i <= total_num_; ++i) { - if (i == id_) { continue; } - AeFields f = GatherAeFieldsLocked(i, heartBeat); - v.emplace_back(i, f); + if (i == id_) { + continue; + } + AeFields fields = GatherAeFieldsLocked(i, heartBeat); + fieldsVec.push_back(fields); } - return v; + return fieldsVec; } -void 
Raft::CreateAndSendAppendEntryMsg(int followerId, const AeFields& f) { +void Raft::CreateAndSendAppendEntryMsg(const AeFields& fields) { + int followerId = fields.followerId; AppendEntries ae; - ae.set_term(f.term); - ae.set_leaderid(f.leaderId); - ae.set_prevlogindex(f.prevLogIndex); - ae.set_prevlogterm(f.prevLogTerm); - ae.set_leadercommitindex(f.leaderCommit); + ae.set_term(fields.term); + ae.set_leaderid(fields.leaderId); + ae.set_prevlogindex(fields.prevLogIndex); + ae.set_prevlogterm(fields.prevLogTerm); + ae.set_leadercommitindex(fields.leaderCommit); uint64_t entryCount = 0; - for (const auto& entry : f.entries) { - auto* e = ae.add_entries(); - e->set_term(entry.term); - e->set_command(entry.command); + for (const auto& entry : fields.entries) { + auto* newEntry = ae.add_entries(); + newEntry->set_term(entry.term); + newEntry->set_command(entry.command); if (entryCount > 0 && ae.ByteSizeLong() > maxBytes) { ae.mutable_entries()->RemoveLast(); break; diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index e2d7ac4602..346f3209d8 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -54,6 +54,7 @@ struct AeFields { uint64_t prevLogTerm = 0; std::vector entries{}; uint64_t leaderCommit = 0; + int followerId = -1; // not part of AE message itself, but needed to determine recipient }; class Raft : public common::ProtocolBase { @@ -87,9 +88,9 @@ class Raft : public common::ProtocolBase { //bool IsDuplicateLogEntry(const std::string& hash) const; // Must be called under mutex std::vector> PrepareCommitLocked(); // Must be called under mutex AeFields GatherAeFieldsLocked(int followerId, bool heartBeat = false) const; // Must be called under mutex - std::vector> GatherAeFieldsForBroadcastLocked(bool heartBeat = false) const; // Must be called under mutex + std::vector GatherAeFieldsForBroadcastLocked(bool heartBeat = false) const; // 
Must be called under mutex - void CreateAndSendAppendEntryMsg(int followerId, const AeFields& f); + void CreateAndSendAppendEntryMsg(const AeFields& f); // Persistent state on all servers: uint64_t currentTerm_; // Protected by mutex_ diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 95a72ef8a2..4ac40d49f8 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -65,7 +65,8 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { } raft_->ReceiveAppendEntries(std::move(txn)); return 0; - } else if (request->user_type() == MessageType::AppendEntriesResponseMsg) { + } + else if (request->user_type() == MessageType::AppendEntriesResponseMsg) { std::unique_ptr AppendEntriesResponse = std::make_unique(); if (!AppendEntriesResponse->ParseFromString(request->data())) { LOG(ERROR) << "parse proposal fail"; @@ -108,25 +109,10 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { } return 0; } -/* -message BatchUserRequest { - message UserRequest { - Request request = 1; - SignatureInfo signature = 2; - int32 id = 3; - }; - repeated UserRequest user_requests = 1; - uint64 createtime = 2; - uint64 local_id = 3; - uint64 seq = 4; - Certs committed_certs= 5; - bytes hash = 6; - int32 proxy_id = 7; -} -*/ + int Consensus::ProcessNewTransaction(std::unique_ptr request) { return raft_->ReceiveTransaction(std::move(request)); - } +} int Consensus::CommitMsg(const google::protobuf::Message& msg) { auto* req = dynamic_cast(&msg); From a48f29aae86373b3e6951258419201a536ff4ffa Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Tue, 16 Dec 2025 01:43:36 +0000 Subject: [PATCH 39/66] revamped follower truncate/append logic --- .../ordering/raft/algorithm/raft.cpp | 91 ++++++++++--------- .../consensus/ordering/raft/algorithm/raft.h | 7 +- 2 files changed, 54 insertions(+), 44 deletions(-) diff --git 
a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 8614d8ee2d..aab3da17e7 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -50,8 +50,8 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, id_ = id; total_num_ = total_num; f_ = (total_num-1)/2; - last_ae_time_ = std::chrono::steady_clock::now(); - last_heartbeat_time_ = std::chrono::steady_clock::now(); + //last_ae_time_ = std::chrono::steady_clock::now(); + //last_heartbeat_time_ = std::chrono::steady_clock::now(); auto sentinel = std::make_unique(); sentinel->term = 0; @@ -153,52 +153,52 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { return; } - - /* - new logic concept: - rather than checking idx > lastLogIndex_, check idx == lastLogIndex_ + 1 (should be equivalent, has semantic value) - - First, loop over entries before lastLogIndex + 1 and look for conflicts. - If conflict occurs, wipe suffix and set LastLogIndex = idx - If idx already = lastLogIndex_ + 1, this loop is skipped - - Second, batch append all remaining entries to log - */ - - uint64_t idx = ae->prevlogindex() + 1; - for (const auto& entry : ae->entries()) { - auto newEntry = std::make_unique(); - newEntry->term = entry.term(); - newEntry->command = entry.command(); - - // entry is at new position - if (idx > lastLogIndex_) { - log_.push_back(std::move(newEntry)); - lastLogIndex_ = idx; - - if (replicationLoggingFlag_) { - LOG(INFO) << "JIM -> " << parent_fn << ": follower appended new entry at index " << lastLogIndex_; - } - - } - // entry is at an existing position && new term doesnt match old term - else if (newEntry->term != log_[idx]->term) { - auto first = log_.begin() + idx; + // Try to append entries to the log + uint64_t logIdx = ae->prevlogindex() + 1; + uint64_t entriesIdx = 0; + uint64_t entriesSize = static_cast(ae->entries_size()); + // check for conflicting entry 
terms in existing indices + // if conflict, delete suffix and short circuit out of loop + while (logIdx < log_.size() && entriesIdx < entriesSize) { + uint64_t term = ae->entries(entriesIdx).term(); + if (term != log_[logIdx]->term) { + auto first = log_.begin() + logIdx; auto last = log_.begin() + lastLogIndex_ + 1; log_.erase(first, last); - log_.push_back(std::move(newEntry)); - lastLogIndex_ = idx; + lastLogIndex_ = log_.size() - 1; if (replicationLoggingFlag_) { - LOG(INFO) << "JIM -> " << parent_fn << ": follower saw term mismatch at index " << lastLogIndex_ << ". Later entries erased"; + LOG(INFO) << "JIM -> " << parent_fn << ": follower saw term mismatch at index " << logIdx << ". Suffix erased from log"; } + break; } - ++idx; - // TODO: have to actually store the entry durably before it can be considered "appended" + ++entriesIdx; + ++logIdx; + } + + // append remaining entries + const auto appendSize = entriesSize - entriesIdx; + log_.reserve(log_.size() + appendSize); + for (uint64_t i = entriesIdx; i < entriesSize; ++i) { + log_.emplace_back(std::make_unique(CreateLogEntry(ae->entries(i)))); } + // update lastLogIndex after appends + uint64_t firstAppendIdx = lastLogIndex_ + 1; + lastLogIndex_ = log_.size() - 1; + // TODO: have to actually store the entry durably before follower can respond to RPC lastLogIndex = lastLogIndex_; - + + if (replicationLoggingFlag_ && appendSize > 0) { + if (appendSize > 1) { + LOG(INFO) << "JIM -> " << parent_fn << ": follower appended entries at indices " << firstAppendIdx << " to " << lastLogIndex_; + } + else { + LOG(INFO) << "JIM -> " << parent_fn << ": follower appended entry at index " << lastLogIndex_; + } + } + + // Try to raise commitIndex and commit entries uint64_t prevCommitIndex = commitIndex_; if (leaderCommit > commitIndex_) { commitIndex_ = std::min(leaderCommit, lastLogIndex_); @@ -214,6 +214,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { eToApply = PrepareCommitLocked(); }(); + /* auto now = 
std::chrono::steady_clock::now(); std::chrono::steady_clock::duration delta; delta = now - last_ae_time_; @@ -221,11 +222,12 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { if (replicationLoggingFlag_) { - /* + auto ms = std::chrono::duration_cast(delta).count(); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": AE received after " << ms << "ms"; - */ + } + */ if (demoted) { leader_election_manager_->OnRoleChange(); @@ -693,5 +695,12 @@ void Raft::CreateAndSendAppendEntryMsg(const AeFields& fields) { } } +LogEntry Raft::CreateLogEntry(const Entry& entry) const { + LogEntry newEntry; + newEntry.term = entry.term(); + newEntry.command = entry.command(); + return newEntry; +} + } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 346f3209d8..43e940bbcc 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -90,7 +90,8 @@ class Raft : public common::ProtocolBase { AeFields GatherAeFieldsLocked(int followerId, bool heartBeat = false) const; // Must be called under mutex std::vector GatherAeFieldsForBroadcastLocked(bool heartBeat = false) const; // Must be called under mutex - void CreateAndSendAppendEntryMsg(const AeFields& f); + void CreateAndSendAppendEntryMsg(const AeFields& fields); + LogEntry CreateLogEntry(const Entry& entry) const; // Persistent state on all servers: uint64_t currentTerm_; // Protected by mutex_ @@ -110,8 +111,8 @@ class Raft : public common::ProtocolBase { //int leaderId_; // Protected by mutex_ std::vector votes_; // Protected by mutex_ std::vector inflight_; // Protected by mutex_ - std::chrono::steady_clock::time_point last_ae_time_; - std::chrono::steady_clock::time_point last_heartbeat_time_; // Protected by mutex_ + //std::chrono::steady_clock::time_point last_ae_time_; + //std::chrono::steady_clock::time_point last_heartbeat_time_; // Protected by mutex_ bool 
is_stop_; const uint64_t quorum_; From b2a7a598732a91a156e5663dc49fe5c645266030 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Tue, 16 Dec 2025 02:02:24 +0000 Subject: [PATCH 40/66] added some comments to Raft::ReceiveAppendEntries method --- platform/consensus/ordering/raft/algorithm/raft.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index aab3da17e7..e6e7997ef8 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -132,6 +132,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { const char* parent_fn = __FUNCTION__; [&]() { std::lock_guard lk(mutex_); + // ---------- Checking term, role, prevlogindex, prevlogterm ---------- initialRole = role_; lastLogIndex = lastLogIndex_; tr = TermCheckLocked(ae->term()); @@ -149,11 +150,12 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { } } term = currentTerm_; + // Early return if we should not append if (!success) { return; } - // Try to append entries to the log + // ---------- Appending entries ---------- uint64_t logIdx = ae->prevlogindex() + 1; uint64_t entriesIdx = 0; uint64_t entriesSize = static_cast(ae->entries_size()); @@ -198,7 +200,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { } } - // Try to raise commitIndex and commit entries + // ---------- Try to raise commitIndex and commit entries ---------- uint64_t prevCommitIndex = commitIndex_; if (leaderCommit > commitIndex_) { commitIndex_ = std::min(leaderCommit, lastLogIndex_); @@ -210,7 +212,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { } - // apply any newly committed entries to state machine + // build vector to apply committed entries outside mutex eToApply = PrepareCommitLocked(); }(); @@ -229,6 +231,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { } */ + // ---------- Outside mutex: inform 
leader_election_manager, apply committed entries, send response ---------- if (demoted) { leader_election_manager_->OnRoleChange(); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted from " From d70639671e6dc9c54623f22adadfcf7d39a4536d Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Mon, 26 Jan 2026 20:23:33 +0000 Subject: [PATCH 41/66] update inflight limit code --- .../ordering/raft/algorithm/raft.cpp | 141 ++++++++++++++++-- .../consensus/ordering/raft/algorithm/raft.h | 30 +++- scripts/deploy/config/raft.config | 2 +- scripts/deploy/script/deploy_local.sh | 3 + 4 files changed, 157 insertions(+), 19 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index e6e7997ef8..b9476fc3d7 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -32,6 +33,21 @@ namespace resdb { namespace raft { + +uint32_t LogEntry::GetSerializedSize() { + if (serializedSize == 0) { + serializedSize = ComputeSerializedEntrySize(); + } + return serializedSize; +} + +uint32_t LogEntry::ComputeSerializedEntrySize() const { + Entry entry; + entry.set_term(term); + entry.set_command(command); + return entry.ByteSizeLong(); +} + Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, LeaderElectionManager* leaderelection_manager, ReplicaCommunicator* replica_communicator) : ProtocolBase(id, f, total_num), @@ -58,7 +74,10 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, sentinel->command = "COMMON_PREFIX"; log_.push_back(std::move(sentinel)); - inflight_.assign(total_num_ + 1, 0); + inflightVecs_.resize(total_num_ + 1); + for (auto& vec : inflightVecs_) { + vec.reserve(maxInFlightPerFollower); + } nextIndex_.assign(total_num_ + 1, lastLogIndex_ + 1); matchIndex_.assign(total_num_ + 1, lastLogIndex_); } @@ -88,6 +107,7 @@ bool 
Raft::ReceiveTransaction(std::unique_ptr req) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": req could not be serialized"; return false; } + entry->GetSerializedSize(); log_.push_back(std::move(entry)); @@ -106,7 +126,12 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { } // prepare fields for appendEntries message + PruneExpiredInFlightMsgsLocked(); messages = GatherAeFieldsForBroadcastLocked(); + auto now = std::chrono::steady_clock::now(); + for (const auto& msg : messages) { + RecordNewInFlightMsgLocked(msg, now); + } } for (const auto& msg : messages) { CreateAndSendAppendEntryMsg(msg); @@ -288,7 +313,8 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a if (role_ != Role::LEADER || tr == TermRelation::STALE) { return; } - + PruneExpiredInFlightMsgsLocked(); + PruneRedundantInFlightMsgsLocked(followerId, aer->lastlogindex()); nextIndex_[followerId] = aer->lastlogindex() + 1; // if successful, update matchIndex and try to commit more entries @@ -313,8 +339,12 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a if (!aer->success()) { LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << followerId; } - fields = GatherAeFieldsLocked(followerId); - resending = true; + if (!InFlightPerFollowerLimitReachedLocked(followerId)) { + fields = GatherAeFieldsLocked(followerId); + resending = true; + auto now = std::chrono::steady_clock::now(); + RecordNewInFlightMsgLocked(fields, now); + } } }(); if (demoted) { @@ -436,7 +466,7 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) if (votes_.size() >= quorum_) { elected = true; role_ = Role::LEADER; - inflight_.assign(total_num_ + 1, 0); + ClearInFlightsLocked(); nextIndex_.assign(total_num_ + 1, lastLogIndex_ + 1); // make sure to set leaders own matchIndex entry to lastLogIndex @@ -543,7 +573,6 @@ void Raft::SendHeartBeat() { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Heartbeat " << heartBeatNum << " for term " << currentTerm; } - auto redirectStart = 
std::chrono::steady_clock::now(); std::chrono::steady_clock::duration redirectDelta; @@ -646,9 +675,15 @@ AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { if (heartBeat) { return fields; } + uint32_t msgBytes = maxHeaderBytes; const uint64_t firstNew = nextIndex_[followerId]; const uint64_t limit = std::min(lastLogIndex_, (firstNew + maxEntries) - 1); for (uint64_t i = firstNew; i <= limit; ++i) { + msgBytes += log_[i]->GetSerializedSize(); + // Always include at least 1 entry, after that limit by maxBytes. + if (i != firstNew && msgBytes >= maxBytes) { + break; + } LogEntry entry; entry.term = log_[i]->term; entry.command = log_[i]->command; @@ -660,13 +695,18 @@ AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { // returns vector of tuples // If heartBeat == true, entries[] will be empty for all messages // else entries will each contain at most maxEntries amount of entries +// Followers will be excluded from the broadcast if they are at inflight max unless this is a heartbeat std::vector Raft::GatherAeFieldsForBroadcastLocked(bool heartBeat) const { + assert(role_ == Role::LEADER); std::vector fieldsVec; fieldsVec.reserve(total_num_ - 1); - for (int i = 1; i <= total_num_; ++i) { + for (size_t i = 1; i <= total_num_; ++i) { if (i == id_) { continue; } + if (!heartBeat && InFlightPerFollowerLimitReachedLocked(i)) { + continue; + } AeFields fields = GatherAeFieldsLocked(i, heartBeat); fieldsVec.push_back(fields); } @@ -681,19 +721,14 @@ void Raft::CreateAndSendAppendEntryMsg(const AeFields& fields) { ae.set_prevlogindex(fields.prevLogIndex); ae.set_prevlogterm(fields.prevLogTerm); ae.set_leadercommitindex(fields.leaderCommit); - uint64_t entryCount = 0; for (const auto& entry : fields.entries) { auto* newEntry = ae.add_entries(); newEntry->set_term(entry.term); newEntry->set_command(entry.command); - if (entryCount > 0 && ae.ByteSizeLong() > maxBytes) { - ae.mutable_entries()->RemoveLast(); - break; - } - 
entryCount++; } SendMessage(MessageType::AppendEntriesMsg, ae, followerId); if (replicationLoggingFlag_) { + uint64_t entryCount = fields.entries.size(); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Sent AE with " << entryCount << (entryCount == 1 ? " entry" : " entries"); } } @@ -705,5 +740,85 @@ LogEntry Raft::CreateLogEntry(const Entry& entry) const { return newEntry; } +void Raft::ClearInFlightsLocked() { + assert(role_ == Role::LEADER); + for (auto& vec : inflightVecs_) { + vec.clear(); + } +} + +void Raft::PruneExpiredInFlightMsgsLocked() { + assert(role_ == Role::LEADER); + auto now = std::chrono::steady_clock::now(); + for (size_t i = 1; i < inflightVecs_.size(); ++i) { + if (i == id_) { + continue; + } + auto& vec = inflightVecs_[i]; + if (vec.empty()) { + continue; + } + auto it = vec.begin(); + while(it != vec.end()) { + auto timeElapsed = now - it->timeSent; + if (timeElapsed >= AEResponseDeadline) { + it = vec.erase(it); + if (replicationLoggingFlag_) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Pruned expired inflight AE for follower " << i; + } + } + else { + ++it; + } + } + } +} + +void Raft::PruneRedundantInFlightMsgsLocked(int followerId, uint64_t followerLastLogIndex) { + assert(role_ == Role::LEADER); + assert(followerId > 0); + assert(static_cast(followerId) < inflightVecs_.size()); + assert(followerId != id_); + + auto& msgVec = inflightVecs_[followerId]; + if (msgVec.empty()) { + return; + } + auto it = msgVec.begin(); + while(it != msgVec.end()) { + if (it->prevLogIndexSent > followerLastLogIndex || it->lastIndexOfSegmentSent <= followerLastLogIndex) { + it = msgVec.erase(it); + if (replicationLoggingFlag_) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Pruned redundant inflight AE for follower " << followerId; + } + } + else { + ++it; + } + } +} + +void Raft::RecordNewInFlightMsgLocked(const AeFields& msg, std::chrono::steady_clock::time_point timestamp) { + if (msg.entries.empty()) { + return; + } + InFlightMsg inFlight; + 
inFlight.timeSent = timestamp; + inFlight.prevLogIndexSent = msg.prevLogIndex; + inFlight.lastIndexOfSegmentSent = msg.prevLogIndex + msg.entries.size(); + inflightVecs_[msg.followerId].push_back(inFlight); +} + +bool Raft::InFlightPerFollowerLimitReachedLocked(int followerId) const { + assert(role_ == Role::LEADER); + assert(followerId > 0); + assert(static_cast(followerId) < inflightVecs_.size()); + assert(followerId != id_); + + auto size = inflightVecs_[followerId].size(); + assert(size <= maxInFlightPerFollower); + return size == maxInFlightPerFollower; +} + } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 43e940bbcc..aba15ba372 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -42,9 +42,16 @@ namespace raft { enum class Role { FOLLOWER, CANDIDATE, LEADER }; enum class TermRelation { STALE, CURRENT, NEW }; -struct LogEntry { +class LogEntry { + public: uint64_t term; std::string command; + + uint32_t GetSerializedSize(); + uint32_t ComputeSerializedEntrySize() const; + + private: + uint32_t serializedSize = 0; }; struct AeFields { @@ -57,6 +64,12 @@ struct AeFields { int followerId = -1; // not part of AE message itself, but needed to determine recipient }; +struct InFlightMsg { + std::chrono::steady_clock::time_point timeSent; + uint64_t prevLogIndexSent; + uint64_t lastIndexOfSegmentSent; +}; + class Raft : public common::ProtocolBase { public: Raft(int id, int f, int total_num, @@ -89,10 +102,15 @@ class Raft : public common::ProtocolBase { std::vector> PrepareCommitLocked(); // Must be called under mutex AeFields GatherAeFieldsLocked(int followerId, bool heartBeat = false) const; // Must be called under mutex std::vector GatherAeFieldsForBroadcastLocked(bool heartBeat = false) const; // Must be called under mutex - void CreateAndSendAppendEntryMsg(const AeFields& fields); 
LogEntry CreateLogEntry(const Entry& entry) const; + void ClearInFlightsLocked(); + void PruneExpiredInFlightMsgsLocked(); + void PruneRedundantInFlightMsgsLocked(int followerId, uint64_t followerLastLogIndex); + void RecordNewInFlightMsgLocked(const AeFields& msg, std::chrono::steady_clock::time_point timestamp); + bool InFlightPerFollowerLimitReachedLocked(int followerId) const; + // Persistent state on all servers: uint64_t currentTerm_; // Protected by mutex_ int votedFor_; // Protected by mutex_ @@ -110,7 +128,7 @@ class Raft : public common::ProtocolBase { Role role_; // Protected by mutex_ //int leaderId_; // Protected by mutex_ std::vector votes_; // Protected by mutex_ - std::vector inflight_; // Protected by mutex_ + std::vector> inflightVecs_; // Protected by mutex_ //std::chrono::steady_clock::time_point last_ae_time_; //std::chrono::steady_clock::time_point last_heartbeat_time_; // Protected by mutex_ @@ -118,9 +136,11 @@ class Raft : public common::ProtocolBase { const uint64_t quorum_; // for limiting AppendEntries batch sizing + static constexpr size_t maxHeaderBytes = 64; static constexpr size_t maxBytes = 64 * 1024; - static constexpr size_t maxEntries = 8; - static constexpr size_t maxInFlightAE = 3; + static constexpr size_t maxEntries = 16; + static constexpr size_t maxInFlightPerFollower = 4; + static constexpr std::chrono::milliseconds AEResponseDeadline{300}; // in milliseconds SignatureVerifier* verifier_; LeaderElectionManager* leader_election_manager_; diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index 64de9bf238..8994035a07 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -4,7 +4,7 @@ "recovery_enabled": false, "not_need_signature": true, "max_client_complaint_num":10, - "max_process_txn": 10, + "max_process_txn": 100, "worker_num": 1, "input_worker_num": 1, "output_worker_num": 10 diff --git a/scripts/deploy/script/deploy_local.sh 
b/scripts/deploy/script/deploy_local.sh index 11145de839..778ce744a5 100755 --- a/scripts/deploy/script/deploy_local.sh +++ b/scripts/deploy/script/deploy_local.sh @@ -72,6 +72,8 @@ deploy/script/generate_config.sh ${BAZEL_WORKSPACE_PATH} ${output_key_path} ${ou # build kv server bazel build ${server} +# JIM opts for debug +#bazel build -c opt --copt=-g --strip=never ${server} if [ $? != 0 ] then @@ -144,6 +146,7 @@ do private_key="cert/node_"${idx}".key.pri" cert="cert/cert_"${idx}".cert" cd ${home_path}/${main_folder}/$idx; nohup ./${server_bin} server.config ${private_key} ${cert} ${grafna_port} > ${server_bin}.log 2>&1 & + echo "cd ${home_path}/${main_folder}/$idx; nohup ./${server_bin} server.config ${private_key} ${cert} ${grafna_port} > ${server_bin}.log 2>&1 &" ((count++)) ((idx++)) ((grafna_port++)) From da77534f1e2c539255737aeb4c47430541cd6626 Mon Sep 17 00:00:00 2001 From: Jim Brower Date: Mon, 26 Jan 2026 21:37:01 +0000 Subject: [PATCH 42/66] modified config generation script to accomodate changes to config reading in upstream code --- scripts/deploy/script/generate_config.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/deploy/script/generate_config.sh b/scripts/deploy/script/generate_config.sh index aa3d77d71a..4fce410e0d 100755 --- a/scripts/deploy/script/generate_config.sh +++ b/scripts/deploy/script/generate_config.sh @@ -78,5 +78,27 @@ do idx=$(($idx+1)) done +#python3 ${CONFIG_TOOLS_BIN} ./client.config ./client.config.json ${TEMPLATE_PATH} +#mv client.config.json client.config + +# Rewrite client.config into RegionInfo JSON for ReadConfig() +python3 - <<'PY' +import json + +path = "client.config" +replicas = [] +with open(path) as f: + for line in f: + line = line.strip() + if not line: + continue + i, ip, port = line.split() + replicas.append({"id": int(i), "ip": ip, "port": int(port)}) + +with open(path, "w") as out: + json.dump({"replicaInfo": replicas}, out) + out.write("\n") +PY + python3 
${CONFIG_TOOLS_BIN} ./server.config ./server.config.json ${TEMPLATE_PATH} mv server.config.json server.config From 6555213f57d242a1f14906796a918f13549873e0 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 4 Feb 2026 19:32:04 -0800 Subject: [PATCH 43/66] Add a function for use in later commit --- platform/consensus/ordering/raft/algorithm/raft.cpp | 10 +++++++--- platform/consensus/ordering/raft/algorithm/raft.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index b9476fc3d7..c1fc9da853 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -90,6 +90,10 @@ bool Raft::IsStop() { return is_stop_; } +void Raft::SetRole(Role role) { + role_ = role; +} + bool Raft::ReceiveTransaction(std::unique_ptr req) { std::vector messages; { @@ -465,7 +469,7 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) << votes_.size() << "/" << quorum_ << " in term " << currentTerm_; if (votes_.size() >= quorum_) { elected = true; - role_ = Role::LEADER; + SetRole(Role::LEADER); ClearInFlightsLocked(); nextIndex_.assign(total_num_ + 1, lastLogIndex_ + 1); @@ -507,7 +511,7 @@ void Raft::StartElection() { return; } if (role_ == Role::FOLLOWER) { - role_ = Role::CANDIDATE; + SetRole(Role::CANDIDATE); roleChanged = true; } heartBeatsSentThisTerm_ = 0; @@ -609,7 +613,7 @@ bool Raft::DemoteSelfLocked(uint64_t term) { votedFor_ = -1; } if (role_ != Role::FOLLOWER) { - role_ = Role::FOLLOWER; + SetRole(Role::FOLLOWER); //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted to FOLLOWER"; return true; } diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index aba15ba372..14668215cf 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -90,6 +90,7 @@ class Raft : 
public common::ProtocolBase { void StartElection(); void SendHeartBeat(); Role GetRoleSnapshot() const; + void SetRole(Role role); private: mutable std::mutex mutex_; From 9e7cfc5655848c3a4bd6b7a89236203766197ba2 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 4 Feb 2026 19:34:53 -0800 Subject: [PATCH 44/66] Add tests related to timeouts and heartbeats for Raft --- .../ordering/common/algorithm/protocol_base.h | 4 +- .../consensus/ordering/raft/algorithm/BUILD | 20 ++ .../leader_election_manager_test.cpp | 182 ++++++++++++++++++ .../raft/algorithm/leaderelection_manager.h | 2 +- .../ordering/raft/algorithm/mock_raft.h | 46 +++++ .../consensus/ordering/raft/algorithm/raft.h | 4 +- 6 files changed, 253 insertions(+), 5 deletions(-) create mode 100644 platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp create mode 100644 platform/consensus/ordering/raft/algorithm/mock_raft.h diff --git a/platform/consensus/ordering/common/algorithm/protocol_base.h b/platform/consensus/ordering/common/algorithm/protocol_base.h index f8e47052a2..d180746bda 100644 --- a/platform/consensus/ordering/common/algorithm/protocol_base.h +++ b/platform/consensus/ordering/common/algorithm/protocol_base.h @@ -63,9 +63,9 @@ class ProtocolBase { } protected: - int SendMessage(int msg_type, const google::protobuf::Message& msg, + virtual int SendMessage(int msg_type, const google::protobuf::Message& msg, int node_id); - int Broadcast(int msg_type, const google::protobuf::Message& msg); + virtual int Broadcast(int msg_type, const google::protobuf::Message& msg); int Commit(const google::protobuf::Message& msg); bool IsStop(); diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD index d59c03bf2e..dba24bb308 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -41,3 +41,23 @@ cc_library( "//platform/proto:viewchange_message_cc_proto", ], ) + 
+cc_library( + name = "mock_raft", + hdrs = ["mock_raft.h"], + deps = [ + ":raft", + ], +) + +cc_test( + name = "leader_election_test", + srcs = ["leader_election_manager_test.cpp"], + deps = [ + ":raft", + ":mock_raft", + "//platform/config:resdb_config_utils", + "//common/test:test_main" + ], + size="small" +) \ No newline at end of file diff --git a/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp b/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp new file mode 100644 index 0000000000..a88cbf1172 --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp @@ -0,0 +1,182 @@ +#include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" +#include "platform/consensus/ordering/raft/algorithm/mock_raft.h" +#include "platform/config/resdb_config_utils.h" + +#include + +#include +#include +#include + +namespace resdb { +namespace raft { + +using ::testing::Invoke; + +ResDBConfig GenerateConfig() { + ResConfigData data; + data.set_duplicate_check_frequency_useconds(100000); + data.set_enable_viewchange(true); + return ResDBConfig({GenerateReplicaInfo(1, "127.0.0.1", 1234), + GenerateReplicaInfo(2, "127.0.0.1", 1235), + GenerateReplicaInfo(3, "127.0.0.1", 1236), + GenerateReplicaInfo(4, "127.0.0.1", 1237)}, + GenerateReplicaInfo(1, "127.0.0.1", 1234), data); +} + +class TestLeaderElectionManager : public LeaderElectionManager { +public: + TestLeaderElectionManager(const ResDBConfig& config) : LeaderElectionManager(config) {} + uint64_t GetHeartbeatCount() { + std::lock_guard lk(cv_mutex_); + return heartbeat_count_; + } + uint64_t GetBroadcastCount() { + std::lock_guard lk(cv_mutex_); + return broadcast_count_; + } +private: + // Overriding this is used to set the timeout timer to start an election to 50 ms. 
+ uint64_t RandomInt(uint64_t min, uint64_t max) { return 50; } +}; + +class LeaderElectionManagerTest : public ::testing::Test { + protected: + LeaderElectionManagerTest() : config_(GenerateConfig()) {} + + void SetUp() override { + verifier_ = nullptr; + replica_communicator_ = nullptr; + leader_election_manager_ = std::make_unique(config_); + mock_raft_ = std::make_unique(1, 1, 3, verifier_.get(), leader_election_manager_.get(), replica_communicator_.get()); + } + + void TearDown() override { + if (leader_election_manager_) { + leader_election_manager_.reset(); + } + if (mock_raft_) { + mock_raft_.reset(); + } + } + + ResDBConfig config_; + std::unique_ptr verifier_; + std::unique_ptr replica_communicator_; + std::unique_ptr leader_election_manager_; + std::unique_ptr mock_raft_; +}; + +// Test 1: Follower timeout should trigger election +TEST_F(LeaderElectionManagerTest, FollowerTimeoutTriggersElection) { + mock_raft_->SetRole(Role::FOLLOWER); + + std::promise election_started; + std::future election_started_future = election_started.get_future(); + + leader_election_manager_->SetRaft(mock_raft_.get()); + leader_election_manager_->MayStart(); + + EXPECT_CALL(*mock_raft_, StartElection) + .WillOnce(Invoke([&]() { + election_started.set_value(true); + })); + + auto status = election_started_future.wait_for(std::chrono::milliseconds(100)); + ASSERT_EQ(status, std::future_status::ready); +} + +// Test 2: Follower should not start election before timing out +TEST_F(LeaderElectionManagerTest, FollowerShouldNotStartElectionEarly) { + mock_raft_->SetRole(Role::FOLLOWER); + + std::promise election_started; + std::future election_started_future = election_started.get_future(); + + EXPECT_CALL(*mock_raft_, StartElection()).Times(0); + + leader_election_manager_->SetRaft(mock_raft_.get()); + leader_election_manager_->MayStart(); + + std::this_thread::sleep_for(std::chrono::milliseconds(45)); + // Since the timeout timer is set to 50 ms, StartElection should never be called 
+} + +// Test 3: Follower receiving heartbeat should NOT trigger election +TEST_F(LeaderElectionManagerTest, FollowerReceivingHeartbeatDoesNotStartElection) { + mock_raft_->SetRole(Role::FOLLOWER); + + std::promise election_started; + std::future election_started_future = election_started.get_future(); + + EXPECT_CALL(*mock_raft_, StartElection()).Times(0); + + leader_election_manager_->SetRaft(mock_raft_.get()); + leader_election_manager_->MayStart(); + + std::this_thread::sleep_for(std::chrono::milliseconds(45)); + leader_election_manager_->OnHeartBeat(); + + std::this_thread::sleep_for(std::chrono::milliseconds(45)); + ASSERT_EQ(leader_election_manager_->GetHeartbeatCount(), 1); + // Since the timeout timer is set to 50 ms, StartElection should never be called +} + +// Test 4: Leader timeout should send heartbeat +TEST_F(LeaderElectionManagerTest, LeaderTimeoutSendsHeartbeat) { + mock_raft_->SetRole(Role::LEADER); + + std::promise heartbeat_sent; + std::future heartbeat_sent_future = heartbeat_sent.get_future(); + + leader_election_manager_->SetRaft(mock_raft_.get()); + leader_election_manager_->MayStart(); + + EXPECT_CALL(*mock_raft_, SendHeartBeat) + .WillOnce(Invoke([&]() { + heartbeat_sent.set_value(true); + })); + + auto status = heartbeat_sent_future.wait_for(std::chrono::milliseconds(105)); + ASSERT_EQ(status, std::future_status::ready); +} + +// Test 5: Leader should not send heartbeat before timing out +TEST_F(LeaderElectionManagerTest, LeaderShouldNotSendHeartbeatEarly) { + mock_raft_->SetRole(Role::LEADER); + + std::promise heartbeat_sent; + std::future heartbeat_sent_future = heartbeat_sent.get_future(); + + EXPECT_CALL(*mock_raft_, SendHeartBeat()).Times(0); + + leader_election_manager_->SetRaft(mock_raft_.get()); + leader_election_manager_->MayStart(); + + std::this_thread::sleep_for(std::chrono::milliseconds(95)); + // Since the heartbeat timer is set to 100 ms, SendHeartBeat should never be called +} + +// Test 6: Leader sending some broadcast 
should not be sending heartbeats +TEST_F(LeaderElectionManagerTest, LeaderWithBroadcastDoesNotSendHeartbeat) { + mock_raft_->SetRole(Role::LEADER); + + std::promise heartbeat_sent; + std::future heartbeat_sent_future = heartbeat_sent.get_future(); + + EXPECT_CALL(*mock_raft_, SendHeartBeat()).Times(0); + leader_election_manager_->SetRaft(mock_raft_.get()); + leader_election_manager_->MayStart(); + + // Send broadcasts to reset the timer + for (int i = 0; i < 3; i++) { + std::this_thread::sleep_for(std::chrono::milliseconds(95)); + leader_election_manager_->OnAeBroadcast(); + } + + ASSERT_EQ(leader_election_manager_->GetBroadcastCount(), 3); +} + +} // namespace raft +} // namespace resdb + diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h index ed52234629..f9ea1b32e9 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h @@ -55,7 +55,7 @@ class LeaderElectionManager { Waited LeaderWait(); Waited Wait(); void MonitoringElectionTimeout(); - uint64_t RandomInt(uint64_t min, uint64_t max); + virtual uint64_t RandomInt(uint64_t min, uint64_t max); protected: diff --git a/platform/consensus/ordering/raft/algorithm/mock_raft.h b/platform/consensus/ordering/raft/algorithm/mock_raft.h new file mode 100644 index 0000000000..4eebe085d9 --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/mock_raft.h @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include "platform/consensus/ordering/raft/algorithm/raft.h" + +namespace resdb { +namespace raft { + +// Mock Raft class to test LeaderElectionManager interactions +class MockRaft : public Raft { + public: + MockRaft(int id, int f, int total_num, SignatureVerifier* verifier, + LeaderElectionManager* leaderelection_manager, + ReplicaCommunicator* replica_communicator) + : Raft(id, f, total_num, verifier, leaderelection_manager, replica_communicator){} + + MOCK_METHOD(void, SendHeartBeat, (), ()); + MOCK_METHOD(void, StartElection, (), ()); + MOCK_METHOD(int, Broadcast, (int msg_type, const google::protobuf::Message& msg), (override)); + MOCK_METHOD(int, SendMessage, (int msg_type, + const google::protobuf::Message& msg, + int node_id), (override)); +}; + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 14668215cf..f517df1d28 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -87,8 +87,8 @@ class Raft : public common::ProtocolBase { bool ReceiveAppendEntriesResponse(std::unique_ptr aer); void ReceiveRequestVote(std::unique_ptr rv); void ReceiveRequestVoteResponse(std::unique_ptr rvr); - void StartElection(); - void SendHeartBeat(); + virtual void StartElection(); + virtual void SendHeartBeat(); Role GetRoleSnapshot() const; void SetRole(Role role); From b67e584b6106a9c85b7e0686bff0d9ee5e3af4fb Mon Sep 17 00:00:00 2001 From: Josh 
Hutton Date: Mon, 16 Feb 2026 12:42:26 -0800 Subject: [PATCH 45/66] Add initial Raft tests for sending and receiving AppendEntries --- .../consensus/ordering/raft/algorithm/BUILD | 24 + .../leader_election_manager_test.cpp | 1 - .../raft/algorithm/leaderelection_manager.h | 6 +- .../algorithm/mock_leader_election_manager.h | 40 + .../ordering/raft/algorithm/raft.cpp | 56 ++ .../consensus/ordering/raft/algorithm/raft.h | 168 ++++- .../ordering/raft/algorithm/raft_test.cpp | 696 ++++++++++++++++++ 7 files changed, 967 insertions(+), 24 deletions(-) create mode 100644 platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h create mode 100644 platform/consensus/ordering/raft/algorithm/raft_test.cpp diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD index dba24bb308..a4e41deed3 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -50,6 +50,14 @@ cc_library( ], ) +cc_library( + name = "mock_leader_election_manager", + hdrs = ["mock_leader_election_manager.h"], + deps = [ + ":raft", + ], +) + cc_test( name = "leader_election_test", srcs = ["leader_election_manager_test.cpp"], @@ -60,4 +68,20 @@ cc_test( "//common/test:test_main" ], size="small" +) + +cc_test( + name = "raft_test", + srcs = ["raft_test.cpp"], + copts = ["-DRAFT_TEST_MODE"], + deps = [ + ":raft", + ":mock_leader_election_manager", + "//platform/networkstrate:mock_replica_communicator", + "//common/crypto:mock_signature_verifier", + "//platform/config:resdb_config_utils", + "//common/test:test_main", + "//platform/proto:client_test_cc_proto", + ], + size="small" ) \ No newline at end of file diff --git a/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp b/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp index a88cbf1172..5a89355073 100644 --- 
a/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp +++ b/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp @@ -179,4 +179,3 @@ TEST_F(LeaderElectionManagerTest, LeaderWithBroadcastDoesNotSendHeartbeat) { } // namespace raft } // namespace resdb - diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h index f9ea1b32e9..03e0242ae3 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h @@ -47,9 +47,9 @@ class LeaderElectionManager { // If the monitor is not running, start to monitor. void MayStart(); void SetRaft(raft::Raft*); - void OnHeartBeat(); - void OnRoleChange(); - void OnAeBroadcast(); + virtual void OnHeartBeat(); + virtual void OnRoleChange(); + virtual void OnAeBroadcast(); private: Waited LeaderWait(); diff --git a/platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h b/platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h new file mode 100644 index 0000000000..1f08d4a461 --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" + +namespace resdb { +namespace raft { + +// Mock MockLeaderElectionManager class to test Raft interactions +class MockLeaderElectionManager : public LeaderElectionManager { + public: + MockLeaderElectionManager(const ResDBConfig& config) + : LeaderElectionManager(config) {} + MOCK_METHOD(void, OnRoleChange, (), (override)); + MOCK_METHOD(void, OnHeartBeat, (), (override)); + MOCK_METHOD(void, OnAeBroadcast, (), (override)); +}; + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index c1fc9da853..bcde0949a3 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -33,6 +33,15 @@ namespace resdb { namespace raft { +std::ostream &operator << (std::ostream& stream, Role role) { + const char* nameRole[] = { "FOLLOWER", "CANDIDATE", "LEADER"}; + return stream << nameRole[static_cast(role)]; +} + +std::ostream &operator << (std::ostream& stream, TermRelation tr) { + const char* nameTR[] = { "STALE", "CURRENT", "NEW"}; + return stream << nameTR[static_cast(tr)]; +} uint32_t LogEntry::GetSerializedSize() { if (serializedSize == 0) { @@ -824,5 +833,52 @@ bool Raft::InFlightPerFollowerLimitReachedLocked(int followerId) const { return size == maxInFlightPerFollower; } + +void Raft::PrintDebugState() const { + std::lock_guard lk(mutex_); + + LOG(INFO) << "---- Raft 
Debug State ----\n"; + LOG(INFO) << "currentTerm_: " << currentTerm_ << "\n"; + LOG(INFO) << "votedFor_: " << votedFor_ << "\n"; + + LOG(INFO) << "log_ (size " << log_.size() << "): ["; + for (size_t i = 0; i < log_.size(); ++i) { + LOG(INFO) << "{term: " << log_[i]->term + << ", cmd_size: " << log_[i]->command.size() << "}"; + if (i + 1 != log_.size()) LOG(INFO) << ", "; + } + LOG(INFO) << "]\n"; + + LOG(INFO) << "nextIndex_: ["; + for (size_t i = 0; i < nextIndex_.size(); ++i) { + LOG(INFO) << nextIndex_[i]; + if (i + 1 != nextIndex_.size()) LOG(INFO) << ", "; + } + LOG(INFO) << "]\n"; + + LOG(INFO) << "matchIndex_: ["; + for (size_t i = 0; i < matchIndex_.size(); ++i) { + LOG(INFO) << matchIndex_[i]; + if (i + 1 != matchIndex_.size()) LOG(INFO) << ", "; + } + LOG(INFO) << "]\n"; + + LOG(INFO) << "heartBeatsSentThisTerm_: " << heartBeatsSentThisTerm_ << "\n"; + LOG(INFO) << "lastLogIndex_: " << lastLogIndex_ << "\n"; + LOG(INFO) << "commitIndex_: " << commitIndex_ << "\n"; + LOG(INFO) << "lastApplied_: " << lastApplied_ << "\n"; + LOG(INFO) << "role_: " << static_cast(role_) << "\n"; + + LOG(INFO) << "votes_: ["; + for (size_t i = 0; i < votes_.size(); ++i) { + LOG(INFO) << votes_[i]; + if (i + 1 != votes_.size()) LOG(INFO) << ", "; + } + LOG(INFO) << "]\n"; + + LOG(INFO) << "--------------------------\n"; +} + + } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index f517df1d28..c84d0cddfa 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -27,6 +27,9 @@ #include #include #include +#ifdef RAFT_TEST_MODE +#include +#endif #include "platform/common/queue/lock_free_queue.h" #include "platform/consensus/ordering/common/algorithm/protocol_base.h" @@ -70,6 +73,20 @@ struct InFlightMsg { uint64_t lastIndexOfSegmentSent; }; +#ifdef RAFT_TEST_MODE +struct RaftStatePatch { + std::optional 
currentTerm; + std::optional votedFor; + std::optional commitIndex; + std::optional lastApplied; + std::optional role; + + std::optional>> log; + std::optional> nextIndex; + std::optional> matchIndex; +}; +#endif + class Raft : public common::ProtocolBase { public: Raft(int id, int f, int total_num, @@ -82,34 +99,35 @@ class Raft : public common::ProtocolBase { const bool replicationLoggingFlag_ = true; const bool livenessLoggingFlag_ = false; - bool ReceiveTransaction(std::unique_ptr req); - bool ReceiveAppendEntries(std::unique_ptr ae); - bool ReceiveAppendEntriesResponse(std::unique_ptr aer); - void ReceiveRequestVote(std::unique_ptr rv); - void ReceiveRequestVoteResponse(std::unique_ptr rvr); + virtual bool ReceiveTransaction(std::unique_ptr req); + virtual bool ReceiveAppendEntries(std::unique_ptr ae); + virtual bool ReceiveAppendEntriesResponse(std::unique_ptr aer); + virtual void ReceiveRequestVote(std::unique_ptr rv); + virtual void ReceiveRequestVoteResponse(std::unique_ptr rvr); virtual void StartElection(); virtual void SendHeartBeat(); - Role GetRoleSnapshot() const; - void SetRole(Role role); + virtual Role GetRoleSnapshot() const; + virtual void SetRole(Role role); + virtual void PrintDebugState() const; private: mutable std::mutex mutex_; - TermRelation TermCheckLocked(uint64_t term) const; // Must be called under mutex - bool DemoteSelfLocked(uint64_t term); // Must be called under mutex - uint64_t getLastLogTermLocked() const; // Must be called under mutex - bool IsStop(); + virtual TermRelation TermCheckLocked(uint64_t term) const; // Must be called under mutex + virtual bool DemoteSelfLocked(uint64_t term); // Must be called under mutex + virtual uint64_t getLastLogTermLocked() const; // Must be called under mutex + virtual bool IsStop(); //bool IsDuplicateLogEntry(const std::string& hash) const; // Must be called under mutex - std::vector> PrepareCommitLocked(); // Must be called under mutex - AeFields GatherAeFieldsLocked(int followerId, bool 
heartBeat = false) const; // Must be called under mutex + virtual std::vector> PrepareCommitLocked(); // Must be called under mutex + virtual AeFields GatherAeFieldsLocked(int followerId, bool heartBeat = false) const; // Must be called under mutex std::vector GatherAeFieldsForBroadcastLocked(bool heartBeat = false) const; // Must be called under mutex - void CreateAndSendAppendEntryMsg(const AeFields& fields); - LogEntry CreateLogEntry(const Entry& entry) const; - void ClearInFlightsLocked(); - void PruneExpiredInFlightMsgsLocked(); - void PruneRedundantInFlightMsgsLocked(int followerId, uint64_t followerLastLogIndex); - void RecordNewInFlightMsgLocked(const AeFields& msg, std::chrono::steady_clock::time_point timestamp); - bool InFlightPerFollowerLimitReachedLocked(int followerId) const; + virtual void CreateAndSendAppendEntryMsg(const AeFields& fields); + virtual LogEntry CreateLogEntry(const Entry& entry) const; + virtual void ClearInFlightsLocked(); + virtual void PruneExpiredInFlightMsgsLocked(); + virtual void PruneRedundantInFlightMsgsLocked(int followerId, uint64_t followerLastLogIndex); + virtual void RecordNewInFlightMsgLocked(const AeFields& msg, std::chrono::steady_clock::time_point timestamp); + virtual bool InFlightPerFollowerLimitReachedLocked(int followerId) const; // Persistent state on all servers: @@ -147,6 +165,116 @@ class Raft : public common::ProtocolBase { LeaderElectionManager* leader_election_manager_; //Stats* global_stats_; ReplicaCommunicator* replica_communicator_; + +#ifdef RAFT_TEST_MODE + public: + void SetStateForTest(RaftStatePatch patch) { + std::lock_guard lk(mutex_); + + if (patch.currentTerm) currentTerm_ = *patch.currentTerm; + if (patch.votedFor) votedFor_ = *patch.votedFor; + if (patch.commitIndex) commitIndex_ = *patch.commitIndex; + if (patch.lastApplied) lastApplied_ = *patch.lastApplied; + if (patch.role) role_ = *patch.role; + + if (patch.log) { + log_ = std::move(*patch.log); + lastLogIndex_ = log_.empty() ? 
0 : log_.size() - 1; + } + + if (patch.nextIndex) nextIndex_ = *patch.nextIndex; + if (patch.matchIndex) matchIndex_ = *patch.matchIndex; + } + + uint64_t GetCurrentTerm() const { + std::lock_guard lock(mutex_); + return currentTerm_; + } + + int GetVotedFor() const { + std::lock_guard lock(mutex_); + return votedFor_; + } + + const std::vector>& GetLog() const { + std::lock_guard lock(mutex_); + return log_; + } + + void PrintLog(std::ostream& os) const { + os << "Log entries (count = " << log_.size() << "):\n"; + + for (size_t i = 0; i < log_.size(); ++i) { + const auto& entry = log_[i]; + if (!entry) { + os << " [" << i << "] \n"; + continue; + } + + os << " [" << i << "] " + << "term=" << entry->term + << ", command=\"" << entry->command << "\"" + << ", serializedSize=" << entry->GetSerializedSize() + << "\n"; + } + } + + size_t GetLogSize() const { + std::lock_guard lock(mutex_); + return log_.size(); + } + + uint64_t GetLastLogIndexFromLog() const { + std::lock_guard lock(mutex_); + return log_.empty() ? 
0 : log_.size() - 1; + } + + std::vector GetNextIndex() const { + std::lock_guard lock(mutex_); + return nextIndex_; + } + + std::vector GetMatchIndex() const { + std::lock_guard lock(mutex_); + return matchIndex_; + } + + uint64_t GetHeartBeatsSentThisTerm() const { + std::lock_guard lock(mutex_); + return heartBeatsSentThisTerm_; + } + + uint64_t GetLastLogIndex() const { + std::lock_guard lock(mutex_); + return lastLogIndex_; + } + + uint64_t GetCommitIndex() const { + std::lock_guard lock(mutex_); + return commitIndex_; + } + + uint64_t GetLastApplied() const { + std::lock_guard lock(mutex_); + return lastApplied_; + } + + Role GetRole() const { + std::lock_guard lock(mutex_); + return role_; + } + + std::vector GetVotes() const { + std::lock_guard lock(mutex_); + return votes_; + } + + std::vector> GetInFlightVecs() const { + std::lock_guard lock(mutex_); + return inflightVecs_; + } + +#endif }; } // namespace raft diff --git a/platform/consensus/ordering/raft/algorithm/raft_test.cpp b/platform/consensus/ordering/raft/algorithm/raft_test.cpp new file mode 100644 index 0000000000..b3326bc133 --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/raft_test.cpp @@ -0,0 +1,696 @@ +#include + +#include "platform/config/resdb_config_utils.h" +#include "common/crypto/mock_signature_verifier.h" +#include "platform/networkstrate/mock_replica_communicator.h" +#include "platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h" +#include "platform/consensus/ordering/raft/algorithm/raft.h" +#include "platform/proto/client_test.pb.h" + +namespace resdb { +namespace raft { +using ::testing::Invoke; +using ::testing::_; +using ::testing::Matcher; +using ::testing::AnyNumber; + +ResDBConfig GenerateConfig() { + ResConfigData data; + data.set_duplicate_check_frequency_useconds(100000); + data.set_enable_viewchange(true); + return ResDBConfig({GenerateReplicaInfo(1, "127.0.0.1", 1234), + GenerateReplicaInfo(2, "127.0.0.1", 1235), + GenerateReplicaInfo(3, 
"127.0.0.1", 1236), + GenerateReplicaInfo(4, "127.0.0.1", 1237)}, + GenerateReplicaInfo(1, "127.0.0.1", 1234), data); +} + +class RaftTest : public ::testing::Test { + private: + class MockSendMessageFunction { + public: + MOCK_METHOD(int, Call, (int, const google::protobuf::Message&, int)); + }; + class MockBroadcastFunction { + public: + MOCK_METHOD(int, Broadcast, (int, const google::protobuf::Message&)); + }; + class MockCommitFunction { + public: + MOCK_METHOD(int, Commit, (const google::protobuf::Message&)); + }; + + protected: + void SetUp() override { + verifier_ = std::make_unique(); + leader_election_manager_ = std::make_unique(GenerateConfig()); + replica_communicator_ = std::make_unique(); + raft_ = std::make_unique( + /*id=*/1, + /*f=*/1, + /*total=*/4, + verifier_.get(), + leader_election_manager_.get(), + replica_communicator_.get()); + + raft_->SetSingleCallFunc( + [&](int type, const google::protobuf::Message& msg, int node_id) { + return mock_call.Call(type, msg, node_id); + }); + + raft_->SetBroadcastCallFunc( + [&](int type, const google::protobuf::Message& msg) { + return mock_broadcast.Broadcast(type, msg); + }); + + raft_->SetCommitFunc( + [&](const google::protobuf::Message& msg) { + return mock_commit.Commit(msg); + }); + } + + AeFields CreateAeFields(uint64_t term, int leaderId, uint64_t prevLogIndex, uint64_t prevLogTerm, const std::vector>& entries, uint64_t leaderCommit, int followerId) { + AeFields fields{}; + fields.term = term; + fields.leaderId = leaderId; + fields.leaderCommit = leaderCommit; + fields.prevLogIndex = prevLogIndex; + fields.prevLogTerm = prevLogTerm; + fields.followerId = followerId; + + for (const auto& logEntry : entries) { + LogEntry entry; + entry.term = logEntry->term; + entry.command = logEntry->command; + fields.entries.push_back(std::move(entry)); + } + + return fields; + }; + + // Helper to create a single log entry + std::unique_ptr CreateLogEntry(uint64_t term, const std::string& command_data) { + auto 
entry = std::make_unique(); + entry->term = term; + entry->command = command_data; + return entry; + } + + // Helper to create a vector of log entries for testing + std::vector> CreateLogEntries(const std::vector>& term_and_cmds, bool usedForLogPatch = false) { + std::vector> entries; + + if (usedForLogPatch) { + std::unique_ptr first_entry = std::make_unique(); + first_entry->term = 0; + first_entry->command = "COMMON_PREFIX"; + entries.push_back(std::move(first_entry)); + } + + for (const auto& [term, cmd] : term_and_cmds) { + std::unique_ptr entry = std::make_unique(); + entry->term = term; + // entry->command = cmd; + + ClientTestRequest req; + req.set_value(cmd); + std::string serialized; + req.SerializeToString(&serialized); + entry->command = serialized; + entries.push_back(std::move(entry)); + } + return entries; + } + + AppendEntries CreateAeMessage(const AeFields& fields) { + AppendEntries ae; + ae.set_term(fields.term); + ae.set_leaderid(fields.leaderId); + ae.set_prevlogindex(fields.prevLogIndex); + ae.set_prevlogterm(fields.prevLogTerm); + ae.set_leadercommitindex(fields.leaderCommit); + for (const auto& entry : fields.entries) { + auto* newEntry = ae.add_entries(); + newEntry->set_term(entry.term); + newEntry->set_command(entry.command); + } + + return ae; + } + + std::unique_ptr verifier_; + std::unique_ptr leader_election_manager_; + std::unique_ptr replica_communicator_; + std::unique_ptr raft_; + MockSendMessageFunction mock_call; + MockBroadcastFunction mock_broadcast; + MockCommitFunction mock_commit; +}; + +// Test 1: A follower receiving a client transaction should reject it +TEST_F(RaftTest, FollowerRejectsClientTransaction) { + EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); + EXPECT_CALL(mock_broadcast, Broadcast(_, _)).Times(0); + + auto req = std::make_unique(); + req->set_seq(1); + raft_->SetRole(Role::FOLLOWER); + + bool success = raft_->ReceiveTransaction(std::move(req)); + EXPECT_FALSE(success); +} + +// Test 2: A leader receiving a 
client transaction should send an AppendEntries to all other replicas +TEST_F(RaftTest, LeaderSendsAppendEntriesUponClientTransaction) { + EXPECT_CALL(mock_call, Call(_, _, _)).Times(3); + + auto req = std::make_unique(); + req->set_seq(1); + raft_->SetRole(Role::LEADER); + + bool success = raft_->ReceiveTransaction(std::move(req)); + EXPECT_TRUE(success); +} + +// Test 3: Sent AppendEntries should be based on the follower's nextIndex +TEST_F(RaftTest, LeaderSendsAppendEntriesBasedOnNextIndex) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& ae = dynamic_cast(msg); + EXPECT_EQ(node_id, 2); + EXPECT_EQ(ae.prevlogindex(), 2); + EXPECT_EQ(ae.entries().size(), 3); + return 0; + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& ae = dynamic_cast(msg); + EXPECT_EQ(node_id, 3); + EXPECT_EQ(ae.prevlogindex(), 1); + EXPECT_EQ(ae.entries().size(), 4); + return 0; + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& ae = dynamic_cast(msg); + EXPECT_EQ(node_id, 4); + EXPECT_EQ(ae.prevlogindex(), 0); + EXPECT_EQ(ae.entries().size(), 5); + return 0; + })); + + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::LEADER, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + {0, "Term 0 Transaction 2"}, + {0, "Term 0 Transaction 3"}, + {0, "Term 0 Transaction 4"}, + }, true), + .nextIndex = std::vector{0, 4, 3, 2, 1} + }); + + auto req = std::make_unique(); + req->set_seq(5); + + bool success = raft_->ReceiveTransaction(std::move(req)); + EXPECT_TRUE(success); +} + +// Test 4: A follower receiving 1 AppendEntries with multiple entries that it can accept +TEST_F(RaftTest, FollowerAddsAppendEntriesWithMultipleEntries) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const 
google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_TRUE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 3); + return 0; + })); + + auto aefields = CreateAeFields( + /*term=*/ 0, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 0, + /*prevLogTerm=*/ 0, + /*entries=*/ CreateLogEntries({ + {0, "Transaction 1"}, + {0, "Transaction 2"}, + {0, "Transaction 3"}, + }), + /*leaderCommit=*/ 0, + /*followerId=*/ 1 + ); + + auto aemessage = CreateAeMessage(aefields); + raft_->SetRole(Role::FOLLOWER); + + bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); +} + +// Test 5: A follower receiving multiple AppendEntries that it can accept +TEST_F(RaftTest, FollowerAddsMultipleAppendEntries) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_TRUE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 1); + return 0; + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_TRUE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 2); + return 0; + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_TRUE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 3); + return 0; + })); + + auto aefields1 = CreateAeFields( + /*term=*/ 0, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 0, + /*prevLogTerm=*/ 0, + /*entries=*/ CreateLogEntries({ + {0, "Transaction 1"}, + }), + /*leaderCommit=*/ 0, + /*followerId=*/ 1 + ); + + auto aefields2 = CreateAeFields( + /*term=*/ 0, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 1, + /*prevLogTerm=*/ 0, + /*entries=*/ CreateLogEntries({ + {0, "Transaction 2"}, + }), + /*leaderCommit=*/ 0, + /*followerId=*/ 1 + ); + + auto aefields3 = CreateAeFields( + /*term=*/ 0, + 
/*leaderId=*/ 2, + /*prevLogIndex=*/ 2, + /*prevLogTerm=*/ 0, + /*entries=*/ CreateLogEntries({ + {0, "Transaction 3"}, + }), + /*leaderCommit=*/ 0, + /*followerId=*/ 1 + ); + + auto aemessage1 = CreateAeMessage(aefields1); + auto aemessage2 = CreateAeMessage(aefields2); + auto aemessage3 = CreateAeMessage(aefields3); + raft_->SetRole(Role::FOLLOWER); + + bool success1 = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage1))); + EXPECT_TRUE(success1); + + bool success2 = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage2))); + EXPECT_TRUE(success2); + + bool success3 = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage3))); + EXPECT_TRUE(success3); +} + +// Test 6: A follower rejects Append Entries because its own entry at prevLogIndex does not have the same term. +TEST_F(RaftTest, FollowerRejectsMismatchedTermAtPrevLogIndex) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_FALSE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 1); + return 0; + })); + + auto aefields = CreateAeFields( + /*term=*/ 0, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 1, + /*prevLogTerm=*/ 2, + /*entries=*/ CreateLogEntries({ + {2, "Term 2 Transaction 1"}, + }), + /*leaderCommit=*/ 0, + /*followerId=*/ 1 + ); + + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::FOLLOWER, + .log = CreateLogEntries({ + {1, "Term 1 Transaction 1"}, + }, true), + }); + + auto aemessage = CreateAeMessage(aefields); + raft_->SetRole(Role::FOLLOWER); + + bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); +} + +// Test 7: A follower rejects Append Entries because it does not have a term at prevLogIndex +TEST_F(RaftTest, FollowerRejectsMissingIndex) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, 
int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_FALSE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 0); + return 0; + })); + + auto aefields = CreateAeFields( + /*term=*/ 0, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 1, + /*prevLogTerm=*/ 0, + /*entries=*/ CreateLogEntries({ + {0, "Transaction 2"}, + }), + /*leaderCommit=*/ 0, + /*followerId=*/ 1 + ); + + auto aemessage = CreateAeMessage(aefields); + raft_->SetRole(Role::FOLLOWER); + + bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); +} + +// Test 8: A follower receiving 1 AppendEntries with multiple entries and needing to truncate part of its log +TEST_F(RaftTest, FollowerAddsAppendEntriesAndTruncatesLog) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_TRUE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 3); + return 0; + })); + + auto aefields = CreateAeFields( + /*term=*/ 1, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 1, + /*prevLogTerm=*/ 0, + /*entries=*/ CreateLogEntries({ + {1, "Term 1 Transaction 1"}, + {1, "Term 1 Transaction 2"}, + }), + /*leaderCommit=*/ 0, + /*followerId=*/ 1 + ); + auto aemessage = CreateAeMessage(aefields); + + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::FOLLOWER, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, // index 1 + {0, "Term 0 Transaction 2"}, // mismatched entry will be removed + }, true), + }); + + bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + + const auto& raft_log = raft_->GetLog(); + EXPECT_EQ(raft_log[0]->term, 0); + EXPECT_EQ(raft_log[0]->command, "COMMON_PREFIX"); + EXPECT_EQ(raft_log[1]->term, 0); + // TODO: Use serialized string instead of manually doing it + EXPECT_EQ(raft_log[1]->command, "\n\x14Term 0 Transaction 1"); + EXPECT_EQ(raft_log[2]->term, 1); + EXPECT_EQ(raft_log[2]->command, 
"\n\x14Term 1 Transaction 1"); + EXPECT_EQ(raft_log[3]->term, 1); + EXPECT_EQ(raft_log[3]->command, "\n\x14Term 1 Transaction 2"); + EXPECT_TRUE(success); +} + +// Test 9: A follower increases its commitIndex +TEST_F(RaftTest, FollowerIncreasesCommitIndex) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_TRUE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 5); + return 0; + })); + EXPECT_CALL(mock_commit, Commit(_)) + .Times(2); + + auto aefields = CreateAeFields( + /*term=*/ 1, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 5, + /*prevLogTerm=*/ 1, + /*entries=*/ CreateLogEntries({}), + /*leaderCommit=*/ 3, + /*followerId=*/ 1 + ); + auto aemessage = CreateAeMessage(aefields); + + raft_->SetStateForTest({ + .currentTerm = 1, + .commitIndex = 1, + .lastApplied = 1, + .role = Role::FOLLOWER, + .log = CreateLogEntries({ + {1, "Term 1 Transaction 1"}, + {1, "Term 1 Transaction 2"}, + {1, "Term 1 Transaction 3"}, + {1, "Term 1 Transaction 4"}, + {1, "Term 1 Transaction 5"}, + }, true), + }); + + bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + + EXPECT_TRUE(success); + EXPECT_EQ(raft_->GetCommitIndex(), 3); +} + +// Test 10: A follower increases its commitIndex, but not past its own log size +TEST_F(RaftTest, FollowerIncreasesCommitIndexCappedAtLogSize) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_TRUE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 5); + return 0; + })); + EXPECT_CALL(mock_commit, Commit(_)) + .Times(4); + + auto aefields = CreateAeFields( + /*term=*/ 1, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 5, + /*prevLogTerm=*/ 1, + /*entries=*/ CreateLogEntries({}), + /*leaderCommit=*/ 7, + /*followerId=*/ 1 + ); + auto aemessage = 
CreateAeMessage(aefields); + + raft_->SetStateForTest({ + .currentTerm = 1, + .commitIndex = 1, + .lastApplied = 1, + .role = Role::FOLLOWER, + .log = CreateLogEntries({ + {1, "Term 1 Transaction 1"}, + {1, "Term 1 Transaction 2"}, + {1, "Term 1 Transaction 3"}, + {1, "Term 1 Transaction 4"}, + {1, "Term 1 Transaction 5"}, + }, true), + }); + + bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + + EXPECT_TRUE(success); + EXPECT_EQ(raft_->GetCommitIndex(), 5); +} + +// Test 11: A candidate rejecting an AppendEntries from an outdated term and staying candidate +TEST_F(RaftTest, CandidateRejectsAppendEntriesFromOutdatedTerm) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_FALSE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 0); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + + auto aefields = CreateAeFields( + /*term=*/ 1, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 0, + /*prevLogTerm=*/ 0, + /*entries=*/ CreateLogEntries({ + {1, "Transaction 1"}, + {1, "Transaction 2"}, + {1, "Transaction 3"}, + }), + /*leaderCommit=*/ 0, + /*followerId=*/ 1 + ); + auto aemessage = CreateAeMessage(aefields); + + raft_->SetStateForTest({ + .currentTerm = 2, + .role = Role::CANDIDATE, + .log = CreateLogEntries({ + }, true), + }); + + + bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); +} + +// Test 12: A candidate rejecting an AppendEntries because their log is further behind, but it is in the same term so they still demote. 
+TEST_F(RaftTest, CandidateRejectsAppendEntriesFromSameTerm) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_FALSE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 1); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + + auto aefields = CreateAeFields( + /*term=*/ 2, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 2, + /*prevLogTerm=*/ 0, + /*entries=*/ CreateLogEntries({ + {2, "Transaction 1"}, + {2, "Transaction 2"}, + {2, "Transaction 3"}, + }), + /*leaderCommit=*/ 0, + /*followerId=*/ 1 + ); + auto aemessage = CreateAeMessage(aefields); + + raft_->SetStateForTest({ + .currentTerm = 2, + .role = Role::CANDIDATE, + .log = CreateLogEntries({ + {1, "Old Transaction 1"} + }, true), + }); + + + bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); +} + +// Test 13: A candidate receiving an AppendEntries with multiple entries that it can accept from a newer term. 
+TEST_F(RaftTest, CandidateReceivesNewerTermWithAppendEntriesItCanAccept) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_TRUE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 3); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + + auto aefields = CreateAeFields( + /*term=*/ 2, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 2, + /*prevLogTerm=*/ 0, + /*entries=*/ CreateLogEntries({ + {2, "Transaction 1"}, + }), + /*leaderCommit=*/ 2, + /*followerId=*/ 1 + ); + auto aemessage = CreateAeMessage(aefields); + + raft_->SetStateForTest({ + .currentTerm = 1, + .commitIndex = 2, + .role = Role::CANDIDATE, + .log = CreateLogEntries({ + {0, "old-1"}, + {0, "old-2"}, + }, true), + }); + + bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); +} + +// Test 14: A candidate receiving an AppendEntries with multiple entries that it can accept from a the same term but further along. 
+TEST_F(RaftTest, CandidateReceivesSameTermWithAppendEntriesItCanAccept) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_TRUE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 3); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnRoleChange()) + .Times(1); + + auto aefields = CreateAeFields( + /*term=*/ 1, + /*leaderId=*/ 2, + /*prevLogIndex=*/ 2, + /*prevLogTerm=*/ 0, + /*entries=*/ CreateLogEntries({ + {2, "Transaction 1"}, + }), + /*leaderCommit=*/ 2, + /*followerId=*/ 1 + ); + auto aemessage = CreateAeMessage(aefields); + + raft_->SetStateForTest({ + .currentTerm = 1, + .commitIndex = 2, + .role = Role::CANDIDATE, + .log = CreateLogEntries({ + {0, "old-1"}, + {0, "old-2"}, + }, true), + }); + + bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); +} + +} // namespace raft +} // namespace resdb From 26db304ac886ded52c83c46658625b07c46ba185 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Mon, 16 Feb 2026 15:37:48 -0800 Subject: [PATCH 46/66] Update tests, restructure raft tests to support multiple raft test files --- .../consensus/ordering/raft/algorithm/BUILD | 9 +- ..._test.cpp => raft_test_append_entries.cpp} | 168 +++--------------- .../ordering/raft/algorithm/raft_tests.h | 152 ++++++++++++++++ 3 files changed, 178 insertions(+), 151 deletions(-) rename platform/consensus/ordering/raft/algorithm/{raft_test.cpp => raft_test_append_entries.cpp} (76%) create mode 100644 platform/consensus/ordering/raft/algorithm/raft_tests.h diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD index a4e41deed3..e3f8fa1da8 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -71,8 +71,11 @@ 
cc_test( ) cc_test( - name = "raft_test", - srcs = ["raft_test.cpp"], + name = "raft_test_append_entries", + srcs = [ + "raft_test_append_entries.cpp", + "raft_tests.h", + ], copts = ["-DRAFT_TEST_MODE"], deps = [ ":raft", @@ -84,4 +87,4 @@ cc_test( "//platform/proto:client_test_cc_proto", ], size="small" -) \ No newline at end of file +) diff --git a/platform/consensus/ordering/raft/algorithm/raft_test.cpp b/platform/consensus/ordering/raft/algorithm/raft_test_append_entries.cpp similarity index 76% rename from platform/consensus/ordering/raft/algorithm/raft_test.cpp rename to platform/consensus/ordering/raft/algorithm/raft_test_append_entries.cpp index b3326bc133..ab94d3ded1 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_test.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_test_append_entries.cpp @@ -1,11 +1,4 @@ -#include - -#include "platform/config/resdb_config_utils.h" -#include "common/crypto/mock_signature_verifier.h" -#include "platform/networkstrate/mock_replica_communicator.h" -#include "platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h" -#include "platform/consensus/ordering/raft/algorithm/raft.h" -#include "platform/proto/client_test.pb.h" +#include "platform/consensus/ordering/raft/algorithm/raft_tests.h" namespace resdb { namespace raft { @@ -14,139 +7,6 @@ using ::testing::_; using ::testing::Matcher; using ::testing::AnyNumber; -ResDBConfig GenerateConfig() { - ResConfigData data; - data.set_duplicate_check_frequency_useconds(100000); - data.set_enable_viewchange(true); - return ResDBConfig({GenerateReplicaInfo(1, "127.0.0.1", 1234), - GenerateReplicaInfo(2, "127.0.0.1", 1235), - GenerateReplicaInfo(3, "127.0.0.1", 1236), - GenerateReplicaInfo(4, "127.0.0.1", 1237)}, - GenerateReplicaInfo(1, "127.0.0.1", 1234), data); -} - -class RaftTest : public ::testing::Test { - private: - class MockSendMessageFunction { - public: - MOCK_METHOD(int, Call, (int, const google::protobuf::Message&, int)); - }; - 
class MockBroadcastFunction { - public: - MOCK_METHOD(int, Broadcast, (int, const google::protobuf::Message&)); - }; - class MockCommitFunction { - public: - MOCK_METHOD(int, Commit, (const google::protobuf::Message&)); - }; - - protected: - void SetUp() override { - verifier_ = std::make_unique(); - leader_election_manager_ = std::make_unique(GenerateConfig()); - replica_communicator_ = std::make_unique(); - raft_ = std::make_unique( - /*id=*/1, - /*f=*/1, - /*total=*/4, - verifier_.get(), - leader_election_manager_.get(), - replica_communicator_.get()); - - raft_->SetSingleCallFunc( - [&](int type, const google::protobuf::Message& msg, int node_id) { - return mock_call.Call(type, msg, node_id); - }); - - raft_->SetBroadcastCallFunc( - [&](int type, const google::protobuf::Message& msg) { - return mock_broadcast.Broadcast(type, msg); - }); - - raft_->SetCommitFunc( - [&](const google::protobuf::Message& msg) { - return mock_commit.Commit(msg); - }); - } - - AeFields CreateAeFields(uint64_t term, int leaderId, uint64_t prevLogIndex, uint64_t prevLogTerm, const std::vector>& entries, uint64_t leaderCommit, int followerId) { - AeFields fields{}; - fields.term = term; - fields.leaderId = leaderId; - fields.leaderCommit = leaderCommit; - fields.prevLogIndex = prevLogIndex; - fields.prevLogTerm = prevLogTerm; - fields.followerId = followerId; - - for (const auto& logEntry : entries) { - LogEntry entry; - entry.term = logEntry->term; - entry.command = logEntry->command; - fields.entries.push_back(std::move(entry)); - } - - return fields; - }; - - // Helper to create a single log entry - std::unique_ptr CreateLogEntry(uint64_t term, const std::string& command_data) { - auto entry = std::make_unique(); - entry->term = term; - entry->command = command_data; - return entry; - } - - // Helper to create a vector of log entries for testing - std::vector> CreateLogEntries(const std::vector>& term_and_cmds, bool usedForLogPatch = false) { - std::vector> entries; - - if 
(usedForLogPatch) { - std::unique_ptr first_entry = std::make_unique(); - first_entry->term = 0; - first_entry->command = "COMMON_PREFIX"; - entries.push_back(std::move(first_entry)); - } - - for (const auto& [term, cmd] : term_and_cmds) { - std::unique_ptr entry = std::make_unique(); - entry->term = term; - // entry->command = cmd; - - ClientTestRequest req; - req.set_value(cmd); - std::string serialized; - req.SerializeToString(&serialized); - entry->command = serialized; - entries.push_back(std::move(entry)); - } - return entries; - } - - AppendEntries CreateAeMessage(const AeFields& fields) { - AppendEntries ae; - ae.set_term(fields.term); - ae.set_leaderid(fields.leaderId); - ae.set_prevlogindex(fields.prevLogIndex); - ae.set_prevlogterm(fields.prevLogTerm); - ae.set_leadercommitindex(fields.leaderCommit); - for (const auto& entry : fields.entries) { - auto* newEntry = ae.add_entries(); - newEntry->set_term(entry.term); - newEntry->set_command(entry.command); - } - - return ae; - } - - std::unique_ptr verifier_; - std::unique_ptr leader_election_manager_; - std::unique_ptr replica_communicator_; - std::unique_ptr raft_; - MockSendMessageFunction mock_call; - MockBroadcastFunction mock_broadcast; - MockCommitFunction mock_commit; -}; - // Test 1: A follower receiving a client transaction should reject it TEST_F(RaftTest, FollowerRejectsClientTransaction) { EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); @@ -163,6 +23,7 @@ TEST_F(RaftTest, FollowerRejectsClientTransaction) { // Test 2: A leader receiving a client transaction should send an AppendEntries to all other replicas TEST_F(RaftTest, LeaderSendsAppendEntriesUponClientTransaction) { EXPECT_CALL(mock_call, Call(_, _, _)).Times(3); + EXPECT_CALL(*leader_election_manager_, OnAeBroadcast()).Times(1); auto req = std::make_unique(); req->set_seq(1); @@ -199,6 +60,7 @@ TEST_F(RaftTest, LeaderSendsAppendEntriesBasedOnNextIndex) { EXPECT_EQ(ae.entries().size(), 5); return 0; })); + 
EXPECT_CALL(*leader_election_manager_, OnAeBroadcast()).Times(1); raft_->SetStateForTest({ .currentTerm = 0, @@ -229,6 +91,7 @@ TEST_F(RaftTest, FollowerAddsAppendEntriesWithMultipleEntries) { EXPECT_EQ(aer.lastlogindex(), 3); return 0; })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( /*term=*/ 0, @@ -275,6 +138,7 @@ TEST_F(RaftTest, FollowerAddsMultipleAppendEntries) { EXPECT_EQ(aer.lastlogindex(), 3); return 0; })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(3); auto aefields1 = CreateAeFields( /*term=*/ 0, @@ -337,6 +201,7 @@ TEST_F(RaftTest, FollowerRejectsMismatchedTermAtPrevLogIndex) { EXPECT_EQ(aer.lastlogindex(), 1); return 0; })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( /*term=*/ 0, @@ -375,6 +240,7 @@ TEST_F(RaftTest, FollowerRejectsMissingIndex) { EXPECT_EQ(aer.lastlogindex(), 0); return 0; })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( /*term=*/ 0, @@ -405,6 +271,7 @@ TEST_F(RaftTest, FollowerAddsAppendEntriesAndTruncatesLog) { EXPECT_EQ(aer.lastlogindex(), 3); return 0; })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( /*term=*/ 1, @@ -456,6 +323,7 @@ TEST_F(RaftTest, FollowerIncreasesCommitIndex) { })); EXPECT_CALL(mock_commit, Commit(_)) .Times(2); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( /*term=*/ 1, @@ -500,6 +368,7 @@ TEST_F(RaftTest, FollowerIncreasesCommitIndexCappedAtLogSize) { })); EXPECT_CALL(mock_commit, Commit(_)) .Times(4); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( /*term=*/ 1, @@ -543,7 +412,8 @@ TEST_F(RaftTest, CandidateRejectsAppendEntriesFromOutdatedTerm) { return 0; })); EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); - + EXPECT_CALL(*leader_election_manager_, 
OnHeartBeat()).Times(0); + auto aefields = CreateAeFields( /*term=*/ 1, /*leaderId=*/ 2, @@ -582,6 +452,7 @@ TEST_F(RaftTest, CandidateRejectsAppendEntriesFromSameTerm) { return 0; })); EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( /*term=*/ 2, @@ -611,7 +482,7 @@ TEST_F(RaftTest, CandidateRejectsAppendEntriesFromSameTerm) { EXPECT_TRUE(success); } -// Test 13: A candidate receiving an AppendEntries with multiple entries that it can accept from a newer term. +// Test 13: A candidate receiving an AppendEntries it can accept from a newer term. TEST_F(RaftTest, CandidateReceivesNewerTermWithAppendEntriesItCanAccept) { EXPECT_CALL(mock_call, Call(_, _, _)) .WillOnce(::testing::Invoke( @@ -622,6 +493,7 @@ TEST_F(RaftTest, CandidateReceivesNewerTermWithAppendEntriesItCanAccept) { return 0; })); EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( /*term=*/ 2, @@ -638,7 +510,7 @@ TEST_F(RaftTest, CandidateReceivesNewerTermWithAppendEntriesItCanAccept) { raft_->SetStateForTest({ .currentTerm = 1, - .commitIndex = 2, + .lastApplied = 2, .role = Role::CANDIDATE, .log = CreateLogEntries({ {0, "old-1"}, @@ -651,7 +523,7 @@ TEST_F(RaftTest, CandidateReceivesNewerTermWithAppendEntriesItCanAccept) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); } -// Test 14: A candidate receiving an AppendEntries with multiple entries that it can accept from a the same term but further along. +// Test 14: A candidate receiving an AppendEntries that it can accept from a the same term but further along. 
TEST_F(RaftTest, CandidateReceivesSameTermWithAppendEntriesItCanAccept) { EXPECT_CALL(mock_call, Call(_, _, _)) .WillOnce(::testing::Invoke( @@ -661,8 +533,8 @@ TEST_F(RaftTest, CandidateReceivesSameTermWithAppendEntriesItCanAccept) { EXPECT_EQ(aer.lastlogindex(), 3); return 0; })); - EXPECT_CALL(*leader_election_manager_, OnRoleChange()) - .Times(1); + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( /*term=*/ 1, @@ -679,7 +551,7 @@ TEST_F(RaftTest, CandidateReceivesSameTermWithAppendEntriesItCanAccept) { raft_->SetStateForTest({ .currentTerm = 1, - .commitIndex = 2, + .lastApplied = 2, .role = Role::CANDIDATE, .log = CreateLogEntries({ {0, "old-1"}, diff --git a/platform/consensus/ordering/raft/algorithm/raft_tests.h b/platform/consensus/ordering/raft/algorithm/raft_tests.h new file mode 100644 index 0000000000..e6ad975b5b --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/raft_tests.h @@ -0,0 +1,152 @@ +#include + +#include "platform/config/resdb_config_utils.h" +#include "common/crypto/mock_signature_verifier.h" +#include "platform/networkstrate/mock_replica_communicator.h" +#include "platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h" +#include "platform/consensus/ordering/raft/algorithm/raft.h" +#include "platform/proto/client_test.pb.h" + +namespace resdb { +namespace raft { +using ::testing::Invoke; +using ::testing::_; +using ::testing::Matcher; +using ::testing::AnyNumber; + +ResDBConfig GenerateConfig() { + ResConfigData data; + data.set_duplicate_check_frequency_useconds(100000); + data.set_enable_viewchange(true); + return ResDBConfig({GenerateReplicaInfo(1, "127.0.0.1", 1234), + GenerateReplicaInfo(2, "127.0.0.1", 1235), + GenerateReplicaInfo(3, "127.0.0.1", 1236), + GenerateReplicaInfo(4, "127.0.0.1", 1237)}, + GenerateReplicaInfo(1, "127.0.0.1", 1234), data); +} + +class RaftTest : public 
::testing::Test { + private: + class MockSendMessageFunction { + public: + MOCK_METHOD(int, Call, (int, const google::protobuf::Message&, int)); + }; + class MockBroadcastFunction { + public: + MOCK_METHOD(int, Broadcast, (int, const google::protobuf::Message&)); + }; + class MockCommitFunction { + public: + MOCK_METHOD(int, Commit, (const google::protobuf::Message&)); + }; + + protected: + void SetUp() override { + verifier_ = std::make_unique(); + leader_election_manager_ = std::make_unique(GenerateConfig()); + replica_communicator_ = std::make_unique(); + raft_ = std::make_unique( + /*id=*/1, + /*f=*/1, + /*total=*/4, + verifier_.get(), + leader_election_manager_.get(), + replica_communicator_.get()); + + raft_->SetSingleCallFunc( + [&](int type, const google::protobuf::Message& msg, int node_id) { + return mock_call.Call(type, msg, node_id); + }); + + raft_->SetBroadcastCallFunc( + [&](int type, const google::protobuf::Message& msg) { + return mock_broadcast.Broadcast(type, msg); + }); + + raft_->SetCommitFunc( + [&](const google::protobuf::Message& msg) { + return mock_commit.Commit(msg); + }); + } + + AeFields CreateAeFields(uint64_t term, int leaderId, uint64_t prevLogIndex, uint64_t prevLogTerm, const std::vector>& entries, uint64_t leaderCommit, int followerId) { + AeFields fields{}; + fields.term = term; + fields.leaderId = leaderId; + fields.leaderCommit = leaderCommit; + fields.prevLogIndex = prevLogIndex; + fields.prevLogTerm = prevLogTerm; + fields.followerId = followerId; + + for (const auto& logEntry : entries) { + LogEntry entry; + entry.term = logEntry->term; + entry.command = logEntry->command; + fields.entries.push_back(std::move(entry)); + } + + return fields; + }; + + // Helper to create a single log entry + std::unique_ptr CreateLogEntry(uint64_t term, const std::string& command_data) { + auto entry = std::make_unique(); + entry->term = term; + entry->command = command_data; + return entry; + } + + // Helper to create a vector of log entries 
for testing + std::vector> CreateLogEntries(const std::vector>& term_and_cmds, bool usedForLogPatch = false) { + std::vector> entries; + + if (usedForLogPatch) { + std::unique_ptr first_entry = std::make_unique(); + first_entry->term = 0; + first_entry->command = "COMMON_PREFIX"; + entries.push_back(std::move(first_entry)); + } + + for (const auto& [term, cmd] : term_and_cmds) { + std::unique_ptr entry = std::make_unique(); + entry->term = term; + // entry->command = cmd; + + ClientTestRequest req; + req.set_value(cmd); + std::string serialized; + req.SerializeToString(&serialized); + entry->command = serialized; + entries.push_back(std::move(entry)); + } + return entries; + } + + AppendEntries CreateAeMessage(const AeFields& fields) { + AppendEntries ae; + ae.set_term(fields.term); + ae.set_leaderid(fields.leaderId); + ae.set_prevlogindex(fields.prevLogIndex); + ae.set_prevlogterm(fields.prevLogTerm); + ae.set_leadercommitindex(fields.leaderCommit); + for (const auto& entry : fields.entries) { + auto* newEntry = ae.add_entries(); + newEntry->set_term(entry.term); + newEntry->set_command(entry.command); + } + + return ae; + } + + std::unique_ptr verifier_; + std::unique_ptr leader_election_manager_; + std::unique_ptr replica_communicator_; + std::unique_ptr raft_; + MockSendMessageFunction mock_call; + MockBroadcastFunction mock_broadcast; + MockCommitFunction mock_commit; +}; + + +} // namespace raft +} // namespace resdb From 15c50c88b4eefec48c4d89d5fa65cc3e0277811a Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Mon, 16 Feb 2026 15:56:46 -0800 Subject: [PATCH 47/66] Add tests for ReceiveAppendEntriesResponse() --- .../consensus/ordering/raft/algorithm/BUILD | 19 ++ .../raft_test_append_entries_response.cpp | 190 ++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 platform/consensus/ordering/raft/algorithm/raft_test_append_entries_response.cpp diff --git a/platform/consensus/ordering/raft/algorithm/BUILD 
b/platform/consensus/ordering/raft/algorithm/BUILD index e3f8fa1da8..7694902e8c 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -88,3 +88,22 @@ cc_test( ], size="small" ) + +cc_test( + name = "raft_test_append_entries_response", + srcs = [ + "raft_test_append_entries_response.cpp", + "raft_tests.h", + ], + copts = ["-DRAFT_TEST_MODE"], + deps = [ + ":raft", + ":mock_leader_election_manager", + "//platform/networkstrate:mock_replica_communicator", + "//common/crypto:mock_signature_verifier", + "//platform/config:resdb_config_utils", + "//common/test:test_main", + "//platform/proto:client_test_cc_proto", + ], + size="small" +) diff --git a/platform/consensus/ordering/raft/algorithm/raft_test_append_entries_response.cpp b/platform/consensus/ordering/raft/algorithm/raft_test_append_entries_response.cpp new file mode 100644 index 0000000000..4d23afbd3b --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/raft_test_append_entries_response.cpp @@ -0,0 +1,190 @@ +#include "platform/consensus/ordering/raft/algorithm/raft_tests.h" + +namespace resdb { +namespace raft { +using ::testing::Invoke; +using ::testing::_; +using ::testing::Matcher; +using ::testing::AnyNumber; + +// Test 1: A leader receiving an AppendEntriesResponse success and updating the follower's matchIndex. 
+TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseSuccess) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + + AppendEntriesResponse aeResponse; + aeResponse.set_success(true); + aeResponse.set_term(1); + aeResponse.set_id(2); + aeResponse.set_lastlogindex(2); + + raft_->SetStateForTest({ + .currentTerm = 1, + .commitIndex = 0, + .role = Role::LEADER, + .log = CreateLogEntries({ + {0, "Transaction 1"}, + {0, "Transaction 2"}, + }, true), + .matchIndex = std::vector{0, 2, 0, 0, 0} + }); + + bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + EXPECT_TRUE(success); + EXPECT_THAT(raft_->GetMatchIndex(), ::testing::ElementsAre(0, 2, 2, 0, 0)); +} + +// Test 2: A leader receiving an AppendEntriesResponse from a follower that in a newer term. +TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseFromNewerTerm) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + + raft_->SetStateForTest({ + .currentTerm = 1, + .role = Role::LEADER, + }); + + AppendEntriesResponse aeResponse; + aeResponse.set_success(false); + aeResponse.set_term(2); + + bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + EXPECT_FALSE(success); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); +} + +// Test 3: A leader receiving an AppendEntriesResponse success, updating the follower's matchIndex, and committing a new entry +TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseSuccessAndCommits) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_commit, Commit(_)).Times(1); + + AppendEntriesResponse aeResponse; + aeResponse.set_success(true); + aeResponse.set_term(1); + aeResponse.set_id(2); + aeResponse.set_lastlogindex(2); + + raft_->SetStateForTest({ + .currentTerm = 1, + .commitIndex = 0, + .lastApplied = 0, + .role = Role::LEADER, + .log = CreateLogEntries({ + {1, "Transaction 1"}, + {1, "Transaction 2"}, + }, true), + .nextIndex = std::vector{0, 2, 2, 2, 
2}, + .matchIndex = std::vector{0, 2, 0, 1, 0} + }); + + bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + EXPECT_TRUE(success); + EXPECT_THAT(raft_->GetMatchIndex(), ::testing::ElementsAre(0, 2, 2, 1, 0)); + EXPECT_EQ(raft_->GetCommitIndex(), 1); +} + +// Test 4: A leader receiving an AppendEntriesResponse success and catching up a follower that is behind +TEST_F(RaftTest, LeaderCatchesUpFollowerThatIsBehind) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& ae = dynamic_cast(msg); + EXPECT_EQ(ae.entries_size(), 1); + // TODO: Use serialized string instead of manually doing it + EXPECT_EQ(ae.entries(0).command(), "\n\rTransaction 2"); + EXPECT_EQ(node_id, 2); + return 0; + })); + + AppendEntriesResponse aeResponse; + aeResponse.set_success(true); + aeResponse.set_term(1); + aeResponse.set_id(2); + aeResponse.set_lastlogindex(1); + + raft_->SetStateForTest({ + .currentTerm = 1, + .commitIndex = 0, + .lastApplied = 0, + .role = Role::LEADER, + .log = CreateLogEntries({ + {1, "Transaction 1"}, + {1, "Transaction 2"}, + }, true), + }); + + bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + EXPECT_TRUE(success); +} + +// Test 5: A leader receiving an AppendEntriesResponse Failure and catching up a follower that is behind +TEST_F(RaftTest, LeaderCatchesUpFollowerThatIsBehindFailure) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& ae = dynamic_cast(msg); + // TODO: Use serialized string instead of manually doing it + EXPECT_EQ(ae.entries(0).command(), "\n\rTransaction 1"); + EXPECT_EQ(ae.entries(1).command(), "\n\rTransaction 2"); + EXPECT_EQ(ae.entries_size(), 2); 
+ EXPECT_EQ(node_id, 2); + return 0; + })); + + AppendEntriesResponse aeResponse; + aeResponse.set_success(false); + aeResponse.set_term(1); + aeResponse.set_id(2); + aeResponse.set_lastlogindex(0); + + raft_->SetStateForTest({ + .currentTerm = 1, + .commitIndex = 0, + .lastApplied = 0, + .role = Role::LEADER, + .log = CreateLogEntries({ + {1, "Transaction 1"}, + {1, "Transaction 2"}, + }, true), + }); + + bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + EXPECT_TRUE(success); +} + +// Test 6: A follower ignores an AppendEntriesResponse. +TEST_F(RaftTest, FollowerIgnoresAppendEntriesResponse) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); + + AppendEntriesResponse aeResponse; + aeResponse.set_term(1); + + raft_->SetStateForTest({ + .currentTerm = 1, + .role = Role::FOLLOWER, + }); + + bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + EXPECT_TRUE(success); +} + +// Test 7: A leader ignores an AppendEntriesResponse from an outdated term. 
+TEST_F(RaftTest, LeaderIgnoresAppendEntriesResponseFromOutdatedTerm) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); + + AppendEntriesResponse aeResponse; + aeResponse.set_term(1); + + raft_->SetStateForTest({ + .currentTerm = 2, + .role = Role::LEADER, + }); + + bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + EXPECT_TRUE(success); +} + +} // namespace raft +} // namespace resdb From 7d547c4a0b8051e53675122b17fba4a506d01656 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Tue, 17 Feb 2026 14:56:38 -0800 Subject: [PATCH 48/66] Add tests related to voting --- .../consensus/ordering/raft/algorithm/BUILD | 38 +++ .../consensus/ordering/raft/algorithm/raft.h | 2 + .../raft/algorithm/raft_test_request_vote.cpp | 220 ++++++++++++++++++ .../raft_test_request_vote_response.cpp | 195 ++++++++++++++++ 4 files changed, 455 insertions(+) create mode 100644 platform/consensus/ordering/raft/algorithm/raft_test_request_vote.cpp create mode 100644 platform/consensus/ordering/raft/algorithm/raft_test_request_vote_response.cpp diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD index 7694902e8c..9a2f97735d 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -107,3 +107,41 @@ cc_test( ], size="small" ) + +cc_test( + name = "raft_test_request_vote", + srcs = [ + "raft_test_request_vote.cpp", + "raft_tests.h", + ], + copts = ["-DRAFT_TEST_MODE"], + deps = [ + ":raft", + ":mock_leader_election_manager", + "//platform/networkstrate:mock_replica_communicator", + "//common/crypto:mock_signature_verifier", + "//platform/config:resdb_config_utils", + "//common/test:test_main", + "//platform/proto:client_test_cc_proto", + ], + size="small" +) + +cc_test( + name = "raft_test_request_vote_response", + srcs = [ + "raft_test_request_vote_response.cpp", + 
"raft_tests.h", + ], + copts = ["-DRAFT_TEST_MODE"], + deps = [ + ":raft", + ":mock_leader_election_manager", + "//platform/networkstrate:mock_replica_communicator", + "//common/crypto:mock_signature_verifier", + "//platform/config:resdb_config_utils", + "//common/test:test_main", + "//platform/proto:client_test_cc_proto", + ], + size="small" +) \ No newline at end of file diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index c84d0cddfa..7b93c5e8ec 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -84,6 +84,7 @@ struct RaftStatePatch { std::optional>> log; std::optional> nextIndex; std::optional> matchIndex; + std::optional> votes; }; #endif @@ -184,6 +185,7 @@ class Raft : public common::ProtocolBase { if (patch.nextIndex) nextIndex_ = *patch.nextIndex; if (patch.matchIndex) matchIndex_ = *patch.matchIndex; + if (patch.votes) votes_ = *patch.votes; } uint64_t GetCurrentTerm() const { diff --git a/platform/consensus/ordering/raft/algorithm/raft_test_request_vote.cpp b/platform/consensus/ordering/raft/algorithm/raft_test_request_vote.cpp new file mode 100644 index 0000000000..6efca4d9e0 --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/raft_test_request_vote.cpp @@ -0,0 +1,220 @@ +#include "platform/consensus/ordering/raft/algorithm/raft_tests.h" + +namespace resdb { +namespace raft { +using ::testing::Invoke; +using ::testing::_; +using ::testing::Matcher; +using ::testing::AnyNumber; + +// Test 1: A follower times out, transitions to candidate, and starts an election +TEST_F(RaftTest, FollowerTransitionsToCandidateAndStartsElection) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + EXPECT_CALL(mock_broadcast, Broadcast(_, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg) { + const auto& requestVote = dynamic_cast(msg); + EXPECT_EQ(requestVote.term(), 1); + 
EXPECT_EQ(requestVote.candidateid(), 1); + EXPECT_EQ(requestVote.lastlogindex(), 1); + EXPECT_EQ(requestVote.lastlogterm(), 0); + return 0; + })); + + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::FOLLOWER, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + }, true), + }); + + raft_->StartElection(); + EXPECT_EQ(raft_->GetVotedFor(), 1); + EXPECT_EQ(raft_->GetCurrentTerm(), 1); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::CANDIDATE); +} + +// Test 2: A leader receives a RequestVote from a candidate in a newer term and demotes. +TEST_F(RaftTest, LeaderReceivesRequestVoteFromNewTermAndDemotes) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse = dynamic_cast(msg); + EXPECT_EQ(node_id, 2); + EXPECT_EQ(requestVoteResponse.term(), 1); + EXPECT_EQ(requestVoteResponse.voterid(), 1); + EXPECT_TRUE(requestVoteResponse.votegranted()); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); + + RequestVote rv; + rv.set_term(1); + rv.set_candidateid(2); + rv.set_lastlogindex(1); + rv.set_lastlogterm(0); + + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::LEADER, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + }, true), + }); + + raft_->ReceiveRequestVote(std::make_unique(rv)); + + EXPECT_EQ(raft_->GetVotedFor(), 2); + EXPECT_EQ(raft_->GetCurrentTerm(), 1); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); +} + +// Test 3: A leader receives a RequestVote from a candidate whose lastLogTerm is fewer and does not vote +TEST_F(RaftTest, LeaderReceivesRequestVoteFromOldTerm) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse 
= dynamic_cast(msg); + EXPECT_EQ(node_id, 2); + EXPECT_EQ(requestVoteResponse.term(), 1); + EXPECT_EQ(requestVoteResponse.voterid(), 1); + EXPECT_FALSE(requestVoteResponse.votegranted()); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + RequestVote rv; + rv.set_term(1); + rv.set_candidateid(2); + rv.set_lastlogindex(0); + rv.set_lastlogterm(0); + + raft_->SetStateForTest({ + .currentTerm = 1, + .role = Role::LEADER, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + }, true), + }); + + raft_->ReceiveRequestVote(std::make_unique(rv)); + + EXPECT_EQ(raft_->GetVotedFor(), -1); + EXPECT_EQ(raft_->GetCurrentTerm(), 1); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::LEADER); +} + +// Test 4: A leader receives a RequestVote from a candidate whose lastLogTerm is less recent +TEST_F(RaftTest, LeaderReceivesRequestVoteFromOlderLastLogTerm) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse = dynamic_cast(msg); + EXPECT_EQ(node_id, 2); + EXPECT_EQ(requestVoteResponse.term(), 1); + EXPECT_EQ(requestVoteResponse.voterid(), 1); + EXPECT_FALSE(requestVoteResponse.votegranted()); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + RequestVote rv; + rv.set_term(1); + rv.set_candidateid(2); + rv.set_lastlogindex(0); + rv.set_lastlogterm(0); + + raft_->SetStateForTest({ + .currentTerm = 1, + .role = Role::LEADER, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + }, true), + }); + + raft_->ReceiveRequestVote(std::make_unique(rv)); + + EXPECT_EQ(raft_->GetVotedFor(), -1); + EXPECT_EQ(raft_->GetCurrentTerm(), 1); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::LEADER); +} + +// Test 5: A leader receives a RequestVote from a candidate whose lastLogTerm is the same, but whose lastLogIndex is further behind 
+TEST_F(RaftTest, LeaderReceivesRequestVoteFromFurtherBehindLog) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse = dynamic_cast(msg); + EXPECT_EQ(node_id, 2); + EXPECT_EQ(requestVoteResponse.term(), 2); + EXPECT_EQ(requestVoteResponse.voterid(), 1); + EXPECT_FALSE(requestVoteResponse.votegranted()); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + RequestVote rv; + rv.set_term(2); + rv.set_candidateid(2); + rv.set_lastlogindex(0); + rv.set_lastlogterm(0); + + raft_->SetStateForTest({ + .currentTerm = 1, + .role = Role::LEADER, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + }, true), + }); + + raft_->ReceiveRequestVote(std::make_unique(rv)); + + EXPECT_EQ(raft_->GetVotedFor(), -1); + EXPECT_EQ(raft_->GetCurrentTerm(), 2); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); +} + +// Test 6: A follower receives a RequestVote from a candidate who it would vote for, if it had not already voted for someone else. 
+TEST_F(RaftTest, FollowerRejectsRequestVoteBecauseAlreadyVoted) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse = dynamic_cast(msg); + EXPECT_EQ(node_id, 2); + EXPECT_EQ(requestVoteResponse.term(), 2); + EXPECT_EQ(requestVoteResponse.voterid(), 1); + EXPECT_FALSE(requestVoteResponse.votegranted()); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + RequestVote rv; + rv.set_term(2); + rv.set_candidateid(2); + rv.set_lastlogindex(2); + rv.set_lastlogterm(1); + + raft_->SetStateForTest({ + .currentTerm = 2, + .votedFor = 3, + .role = Role::FOLLOWER, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + }, true), + }); + + raft_->ReceiveRequestVote(std::make_unique(rv)); + + EXPECT_EQ(raft_->GetVotedFor(), 3); + EXPECT_EQ(raft_->GetCurrentTerm(), 2); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); +} + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft_test_request_vote_response.cpp b/platform/consensus/ordering/raft/algorithm/raft_test_request_vote_response.cpp new file mode 100644 index 0000000000..8723c489bd --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/raft_test_request_vote_response.cpp @@ -0,0 +1,195 @@ +#include "platform/consensus/ordering/raft/algorithm/raft_tests.h" + +namespace resdb { +namespace raft { +using ::testing::Invoke; +using ::testing::_; +using ::testing::Matcher; +using ::testing::AnyNumber; + +// Test 1: A candidate gets elected +TEST_F(RaftTest, CandidateGetsElected) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& AppendEntriesMessage = dynamic_cast(msg); + 
EXPECT_EQ(node_id, 2); + EXPECT_EQ(AppendEntriesMessage.entries_size(), 0); + EXPECT_EQ(AppendEntriesMessage.prevlogterm(), 1); + EXPECT_EQ(AppendEntriesMessage.prevlogindex(), 2); + EXPECT_EQ(AppendEntriesMessage.leaderid(), 1); + EXPECT_EQ(AppendEntriesMessage.leadercommitindex(), 1); + return 0; + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& AppendEntriesMessage = dynamic_cast(msg); + EXPECT_EQ(node_id, 3); + EXPECT_EQ(AppendEntriesMessage.entries_size(), 0); + EXPECT_EQ(AppendEntriesMessage.prevlogterm(), 1); + EXPECT_EQ(AppendEntriesMessage.prevlogindex(), 2); + EXPECT_EQ(AppendEntriesMessage.leaderid(), 1); + EXPECT_EQ(AppendEntriesMessage.leadercommitindex(), 1); + return 0; + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& AppendEntriesMessage = dynamic_cast(msg); + EXPECT_EQ(node_id, 4); + EXPECT_EQ(AppendEntriesMessage.entries_size(), 0); + EXPECT_EQ(AppendEntriesMessage.prevlogterm(), 1); + EXPECT_EQ(AppendEntriesMessage.prevlogindex(), 2); + EXPECT_EQ(AppendEntriesMessage.leaderid(), 1); + EXPECT_EQ(AppendEntriesMessage.leadercommitindex(), 1); + return 0; + })); + + raft_->SetStateForTest({ + .currentTerm = 2, + .commitIndex = 1, + .lastApplied = 1, + .role = Role::CANDIDATE, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + {1, "Term 1 Transaction 1"}, + }, true), + .votes = std::vector{1, 3} + }); + + RequestVoteResponse rvr; + rvr.set_term(2); + rvr.set_voterid(2); + rvr.set_votegranted(true); + raft_->ReceiveRequestVoteResponse(std::make_unique(rvr)); + + EXPECT_EQ(raft_->GetCurrentTerm(), 2); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::LEADER); + EXPECT_EQ(raft_->GetLastLogIndexFromLog(), 2); + EXPECT_THAT(raft_->GetNextIndex(), ::testing::ElementsAre(3, 3, 3, 3, 3)); + EXPECT_THAT(raft_->GetMatchIndex(), ::testing::ElementsAre(0, 2, 0, 0, 0)); +} + +// Test 2: A candidate receives a 
RequestVoteResponse from an older term and ignores it. +TEST_F(RaftTest, CandidateIgnoresResponseFromOldTerm) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + raft_->SetStateForTest({ + .currentTerm = 2, + .role = Role::CANDIDATE, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + }, true), + }); + + RequestVoteResponse rvr; + rvr.set_term(1); + rvr.set_voterid(2); + rvr.set_votegranted(true); + raft_->ReceiveRequestVoteResponse(std::make_unique(rvr)); + + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::CANDIDATE); +} + +// Test 3: A candidate receives a RequestVoteResponse from an newer term and demotes. +TEST_F(RaftTest, CandidateDemotesAfterRequestVoteResponseFromNewerTerm) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + raft_->SetStateForTest({ + .currentTerm = 2, + .role = Role::CANDIDATE, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + }, true), + }); + + RequestVoteResponse rvr; + rvr.set_term(3); + rvr.set_voterid(2); + rvr.set_votegranted(false); + raft_->ReceiveRequestVoteResponse(std::make_unique(rvr)); + + EXPECT_EQ(raft_->GetVotedFor(), -1); + EXPECT_EQ(raft_->GetCurrentTerm(), 3); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); +} + +// Test 4: A follower ignores a RequestVoteResponse. 
+TEST_F(RaftTest, FollowerIgnoresRequestVoteResponse) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + raft_->SetStateForTest({ + .currentTerm = 2, + .role = Role::FOLLOWER, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + }, true), + }); + + RequestVoteResponse rvr; + rvr.set_term(2); + rvr.set_voterid(2); + rvr.set_votegranted(true); + raft_->ReceiveRequestVoteResponse(std::make_unique(rvr)); + + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); +} + +// Test 5: A candidate ignores a no vote in a RequestVoteResponse. +TEST_F(RaftTest, CandidateIgnoresNoVote) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + raft_->SetStateForTest({ + .currentTerm = 2, + .role = Role::CANDIDATE, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + }, true), + }); + + RequestVoteResponse rvr; + rvr.set_term(2); + rvr.set_voterid(2); + rvr.set_votegranted(false); + raft_->ReceiveRequestVoteResponse(std::make_unique(rvr)); + + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::CANDIDATE); +} + +// Test 6: A candidate ignores a duplicate vote. 
+TEST_F(RaftTest, CandidateIgnoresDuplicateVote) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + raft_->SetStateForTest({ + .currentTerm = 2, + .commitIndex = 1, + .lastApplied = 1, + .role = Role::CANDIDATE, + .log = CreateLogEntries({ + {0, "Term 0 Transaction 1"}, + {1, "Term 1 Transaction 1"}, + }, true), + .votes = std::vector{1, 2} + }); + + RequestVoteResponse rvr; + rvr.set_term(2); + rvr.set_voterid(2); + rvr.set_votegranted(true); + raft_->ReceiveRequestVoteResponse(std::make_unique(rvr)); + + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::CANDIDATE); +} + +} // namespace raft +} // namespace resdb From a0c3514b29e9b7edbf6a3d57f6891e4b3d1ce0b9 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Tue, 17 Feb 2026 15:12:03 -0800 Subject: [PATCH 49/66] Run clang format and other style changes --- .../consensus/ordering/raft/algorithm/BUILD | 16 +- .../leader_election_manager_test.cpp | 107 ++-- .../raft/algorithm/leaderelection_manager.h | 1 - .../algorithm/mock_leader_election_manager.h | 1 - .../ordering/raft/algorithm/mock_raft.h | 15 +- .../ordering/raft/algorithm/raft.cpp | 32 +- .../consensus/ordering/raft/algorithm/raft.h | 36 +- ... 
=> raft_append_entries_response_test.cpp} | 128 ++-- ...tries.cpp => raft_append_entries_test.cpp} | 561 +++++++++--------- ...pp => raft_request_vote_response_test.cpp} | 121 ++-- ...st_vote.cpp => raft_request_vote_test.cpp} | 137 +++-- .../ordering/raft/algorithm/raft_tests.h | 86 +-- 12 files changed, 664 insertions(+), 577 deletions(-) rename platform/consensus/ordering/raft/algorithm/{raft_test_append_entries_response.cpp => raft_append_entries_response_test.cpp} (61%) rename platform/consensus/ordering/raft/algorithm/{raft_test_append_entries.cpp => raft_append_entries_test.cpp} (53%) rename platform/consensus/ordering/raft/algorithm/{raft_test_request_vote_response.cpp => raft_request_vote_response_test.cpp} (68%) rename platform/consensus/ordering/raft/algorithm/{raft_test_request_vote.cpp => raft_request_vote_test.cpp} (70%) diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD index 9a2f97735d..4a980c9c54 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -71,9 +71,9 @@ cc_test( ) cc_test( - name = "raft_test_append_entries", + name = "raft_append_entries_test", srcs = [ - "raft_test_append_entries.cpp", + "raft_append_entries_test.cpp", "raft_tests.h", ], copts = ["-DRAFT_TEST_MODE"], @@ -90,9 +90,9 @@ cc_test( ) cc_test( - name = "raft_test_append_entries_response", + name = "raft_append_entries_response_test", srcs = [ - "raft_test_append_entries_response.cpp", + "raft_append_entries_response_test.cpp", "raft_tests.h", ], copts = ["-DRAFT_TEST_MODE"], @@ -109,9 +109,9 @@ cc_test( ) cc_test( - name = "raft_test_request_vote", + name = "raft_request_vote_test", srcs = [ - "raft_test_request_vote.cpp", + "raft_request_vote_test.cpp", "raft_tests.h", ], copts = ["-DRAFT_TEST_MODE"], @@ -128,9 +128,9 @@ cc_test( ) cc_test( - name = "raft_test_request_vote_response", + name = "raft_request_vote_response_test", srcs = [ - 
"raft_test_request_vote_response.cpp", + "raft_request_vote_response_test.cpp", "raft_tests.h", ], copts = ["-DRAFT_TEST_MODE"], diff --git a/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp b/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp index 5a89355073..1ebd251a62 100644 --- a/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp +++ b/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp @@ -1,12 +1,12 @@ -#include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" -#include "platform/consensus/ordering/raft/algorithm/mock_raft.h" -#include "platform/config/resdb_config_utils.h" - #include #include -#include #include +#include + +#include "platform/config/resdb_config_utils.h" +#include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" +#include "platform/consensus/ordering/raft/algorithm/mock_raft.h" namespace resdb { namespace raft { @@ -25,8 +25,9 @@ ResDBConfig GenerateConfig() { } class TestLeaderElectionManager : public LeaderElectionManager { -public: - TestLeaderElectionManager(const ResDBConfig& config) : LeaderElectionManager(config) {} + public: + TestLeaderElectionManager(const ResDBConfig& config) + : LeaderElectionManager(config) {} uint64_t GetHeartbeatCount() { std::lock_guard lk(cv_mutex_); return heartbeat_count_; @@ -35,20 +36,25 @@ class TestLeaderElectionManager : public LeaderElectionManager { std::lock_guard lk(cv_mutex_); return broadcast_count_; } -private: - // Overriding this is used to set the timeout timer to start an election to 50 ms. + + private: + // Overriding this is used to set the timeout timer to start an election to 50 + // ms. 
uint64_t RandomInt(uint64_t min, uint64_t max) { return 50; } }; class LeaderElectionManagerTest : public ::testing::Test { protected: - LeaderElectionManagerTest() : config_(GenerateConfig()) {} + LeaderElectionManagerTest() : config_(GenerateConfig()) {} void SetUp() override { verifier_ = nullptr; replica_communicator_ = nullptr; - leader_election_manager_ = std::make_unique(config_); - mock_raft_ = std::make_unique(1, 1, 3, verifier_.get(), leader_election_manager_.get(), replica_communicator_.get()); + leader_election_manager_ = + std::make_unique(config_); + mock_raft_ = std::make_unique(1, 1, 3, verifier_.get(), + leader_election_manager_.get(), + replica_communicator_.get()); } void TearDown() override { @@ -67,29 +73,29 @@ class LeaderElectionManagerTest : public ::testing::Test { std::unique_ptr mock_raft_; }; -// Test 1: Follower timeout should trigger election +// Test 1: Follower timeout should trigger election. TEST_F(LeaderElectionManagerTest, FollowerTimeoutTriggersElection) { mock_raft_->SetRole(Role::FOLLOWER); - + std::promise election_started; std::future election_started_future = election_started.get_future(); leader_election_manager_->SetRaft(mock_raft_.get()); leader_election_manager_->MayStart(); - EXPECT_CALL(*mock_raft_, StartElection) - .WillOnce(Invoke([&]() { - election_started.set_value(true); - })); - - auto status = election_started_future.wait_for(std::chrono::milliseconds(100)); + EXPECT_CALL(*mock_raft_, StartElection).WillOnce(Invoke([&]() { + election_started.set_value(true); + })); + + auto status = + election_started_future.wait_for(std::chrono::milliseconds(100)); ASSERT_EQ(status, std::future_status::ready); } -// Test 2: Follower should not start election before timing out +// Test 2: Follower should not start election before timing out. 
TEST_F(LeaderElectionManagerTest, FollowerShouldNotStartElectionEarly) { mock_raft_->SetRole(Role::FOLLOWER); - + std::promise election_started; std::future election_started_future = election_started.get_future(); @@ -97,15 +103,17 @@ TEST_F(LeaderElectionManagerTest, FollowerShouldNotStartElectionEarly) { leader_election_manager_->SetRaft(mock_raft_.get()); leader_election_manager_->MayStart(); - + std::this_thread::sleep_for(std::chrono::milliseconds(45)); - // Since the timeout timer is set to 50 ms, StartElection should never be called + // Since the timeout timer is set to 50 ms, StartElection should never be + // called. } -// Test 3: Follower receiving heartbeat should NOT trigger election -TEST_F(LeaderElectionManagerTest, FollowerReceivingHeartbeatDoesNotStartElection) { +// Test 3: Follower receiving heartbeat should NOT trigger election. +TEST_F(LeaderElectionManagerTest, + FollowerReceivingHeartbeatDoesNotStartElection) { mock_raft_->SetRole(Role::FOLLOWER); - + std::promise election_started; std::future election_started_future = election_started.get_future(); @@ -113,38 +121,38 @@ TEST_F(LeaderElectionManagerTest, FollowerReceivingHeartbeatDoesNotStartElection leader_election_manager_->SetRaft(mock_raft_.get()); leader_election_manager_->MayStart(); - + std::this_thread::sleep_for(std::chrono::milliseconds(45)); leader_election_manager_->OnHeartBeat(); std::this_thread::sleep_for(std::chrono::milliseconds(45)); ASSERT_EQ(leader_election_manager_->GetHeartbeatCount(), 1); - // Since the timeout timer is set to 50 ms, StartElection should never be called + // Since the timeout timer is set to 50 ms, StartElection should never be + // called. } -// Test 4: Leader timeout should send heartbeat +// Test 4: Leader timeout should send heartbeat. 
TEST_F(LeaderElectionManagerTest, LeaderTimeoutSendsHeartbeat) { mock_raft_->SetRole(Role::LEADER); - + std::promise heartbeat_sent; std::future heartbeat_sent_future = heartbeat_sent.get_future(); leader_election_manager_->SetRaft(mock_raft_.get()); leader_election_manager_->MayStart(); - - EXPECT_CALL(*mock_raft_, SendHeartBeat) - .WillOnce(Invoke([&]() { - heartbeat_sent.set_value(true); - })); - + + EXPECT_CALL(*mock_raft_, SendHeartBeat).WillOnce(Invoke([&]() { + heartbeat_sent.set_value(true); + })); + auto status = heartbeat_sent_future.wait_for(std::chrono::milliseconds(105)); ASSERT_EQ(status, std::future_status::ready); } -// Test 5: Leader should not send heartbeat before timing out +// Test 5: Leader should not send heartbeat before timing out. TEST_F(LeaderElectionManagerTest, LeaderShouldNotSendHeartbeatEarly) { mock_raft_->SetRole(Role::LEADER); - + std::promise heartbeat_sent; std::future heartbeat_sent_future = heartbeat_sent.get_future(); @@ -152,30 +160,31 @@ TEST_F(LeaderElectionManagerTest, LeaderShouldNotSendHeartbeatEarly) { leader_election_manager_->SetRaft(mock_raft_.get()); leader_election_manager_->MayStart(); - + std::this_thread::sleep_for(std::chrono::milliseconds(95)); - // Since the heartbeat timer is set to 100 ms, SendHeartBeat should never be called + // Since the heartbeat timer is set to 100 ms, SendHeartBeat should never be + // called. } -// Test 6: Leader sending some broadcast should not be sending heartbeats +// Test 6: Leader sending some broadcast should not be sending heartbeats. TEST_F(LeaderElectionManagerTest, LeaderWithBroadcastDoesNotSendHeartbeat) { mock_raft_->SetRole(Role::LEADER); - + std::promise heartbeat_sent; std::future heartbeat_sent_future = heartbeat_sent.get_future(); - + EXPECT_CALL(*mock_raft_, SendHeartBeat()).Times(0); leader_election_manager_->SetRaft(mock_raft_.get()); leader_election_manager_->MayStart(); - - // Send broadcasts to reset the timer + + // Send broadcasts to reset the timer. 
for (int i = 0; i < 3; i++) { std::this_thread::sleep_for(std::chrono::milliseconds(95)); leader_election_manager_->OnAeBroadcast(); } - + ASSERT_EQ(leader_election_manager_->GetBroadcastCount(), 3); } -} // namespace raft -} // namespace resdb +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h index 03e0242ae3..638aa42350 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h @@ -56,7 +56,6 @@ class LeaderElectionManager { Waited Wait(); void MonitoringElectionTimeout(); virtual uint64_t RandomInt(uint64_t min, uint64_t max); - protected: ResDBConfig config_; diff --git a/platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h b/platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h index 1f08d4a461..42b4e4502e 100644 --- a/platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h +++ b/platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h @@ -26,7 +26,6 @@ namespace resdb { namespace raft { -// Mock MockLeaderElectionManager class to test Raft interactions class MockLeaderElectionManager : public LeaderElectionManager { public: MockLeaderElectionManager(const ResDBConfig& config) diff --git a/platform/consensus/ordering/raft/algorithm/mock_raft.h b/platform/consensus/ordering/raft/algorithm/mock_raft.h index 4eebe085d9..12a4d5d027 100644 --- a/platform/consensus/ordering/raft/algorithm/mock_raft.h +++ b/platform/consensus/ordering/raft/algorithm/mock_raft.h @@ -26,20 +26,21 @@ namespace resdb { namespace raft { -// Mock Raft class to test LeaderElectionManager interactions class MockRaft : public Raft { public: MockRaft(int id, int f, int total_num, SignatureVerifier* verifier, - LeaderElectionManager* leaderelection_manager, + LeaderElectionManager* 
leaderelection_manager, ReplicaCommunicator* replica_communicator) - : Raft(id, f, total_num, verifier, leaderelection_manager, replica_communicator){} + : Raft(id, f, total_num, verifier, leaderelection_manager, + replica_communicator) {} MOCK_METHOD(void, SendHeartBeat, (), ()); MOCK_METHOD(void, StartElection, (), ()); - MOCK_METHOD(int, Broadcast, (int msg_type, const google::protobuf::Message& msg), (override)); - MOCK_METHOD(int, SendMessage, (int msg_type, - const google::protobuf::Message& msg, - int node_id), (override)); + MOCK_METHOD(int, Broadcast, + (int msg_type, const google::protobuf::Message& msg), (override)); + MOCK_METHOD(int, SendMessage, + (int msg_type, const google::protobuf::Message& msg, int node_id), + (override)); }; } // namespace raft diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index bcde0949a3..24f17866c9 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -33,13 +33,13 @@ namespace resdb { namespace raft { -std::ostream &operator << (std::ostream& stream, Role role) { - const char* nameRole[] = { "FOLLOWER", "CANDIDATE", "LEADER"}; +std::ostream& operator<<(std::ostream& stream, Role role) { + const char* nameRole[] = {"FOLLOWER", "CANDIDATE", "LEADER"}; return stream << nameRole[static_cast(role)]; } -std::ostream &operator << (std::ostream& stream, TermRelation tr) { - const char* nameTR[] = { "STALE", "CURRENT", "NEW"}; +std::ostream& operator<<(std::ostream& stream, TermRelation tr) { + const char* nameTR[] = {"STALE", "CURRENT", "NEW"}; return stream << nameTR[static_cast(tr)]; } @@ -99,9 +99,7 @@ bool Raft::IsStop() { return is_stop_; } -void Raft::SetRole(Role role) { - role_ = role; -} +void Raft::SetRole(Role role) { role_ = role; } bool Raft::ReceiveTransaction(std::unique_ptr req) { std::vector messages; @@ -833,7 +831,6 @@ bool Raft::InFlightPerFollowerLimitReachedLocked(int 
followerId) const { return size == maxInFlightPerFollower; } - void Raft::PrintDebugState() const { std::lock_guard lk(mutex_); @@ -843,23 +840,23 @@ void Raft::PrintDebugState() const { LOG(INFO) << "log_ (size " << log_.size() << "): ["; for (size_t i = 0; i < log_.size(); ++i) { - LOG(INFO) << "{term: " << log_[i]->term - << ", cmd_size: " << log_[i]->command.size() << "}"; - if (i + 1 != log_.size()) LOG(INFO) << ", "; + LOG(INFO) << "{term: " << log_[i]->term + << ", cmd_size: " << log_[i]->command.size() << "}"; + if (i + 1 != log_.size()) LOG(INFO) << ", "; } LOG(INFO) << "]\n"; LOG(INFO) << "nextIndex_: ["; for (size_t i = 0; i < nextIndex_.size(); ++i) { - LOG(INFO) << nextIndex_[i]; - if (i + 1 != nextIndex_.size()) LOG(INFO) << ", "; + LOG(INFO) << nextIndex_[i]; + if (i + 1 != nextIndex_.size()) LOG(INFO) << ", "; } LOG(INFO) << "]\n"; LOG(INFO) << "matchIndex_: ["; for (size_t i = 0; i < matchIndex_.size(); ++i) { - LOG(INFO) << matchIndex_[i]; - if (i + 1 != matchIndex_.size()) LOG(INFO) << ", "; + LOG(INFO) << matchIndex_[i]; + if (i + 1 != matchIndex_.size()) LOG(INFO) << ", "; } LOG(INFO) << "]\n"; @@ -871,14 +868,13 @@ void Raft::PrintDebugState() const { LOG(INFO) << "votes_: ["; for (size_t i = 0; i < votes_.size(); ++i) { - LOG(INFO) << votes_[i]; - if (i + 1 != votes_.size()) LOG(INFO) << ", "; + LOG(INFO) << votes_[i]; + if (i + 1 != votes_.size()) LOG(INFO) << ", "; } LOG(INFO) << "]\n"; LOG(INFO) << "--------------------------\n"; } - } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 7b93c5e8ec..65e22e5df1 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -102,9 +102,11 @@ class Raft : public common::ProtocolBase { virtual bool ReceiveTransaction(std::unique_ptr req); virtual bool ReceiveAppendEntries(std::unique_ptr ae); - virtual bool 
ReceiveAppendEntriesResponse(std::unique_ptr aer); + virtual bool ReceiveAppendEntriesResponse( + std::unique_ptr aer); virtual void ReceiveRequestVote(std::unique_ptr rv); - virtual void ReceiveRequestVoteResponse(std::unique_ptr rvr); + virtual void ReceiveRequestVoteResponse( + std::unique_ptr rvr); virtual void StartElection(); virtual void SendHeartBeat(); virtual Role GetRoleSnapshot() const; @@ -114,23 +116,27 @@ class Raft : public common::ProtocolBase { private: mutable std::mutex mutex_; - virtual TermRelation TermCheckLocked(uint64_t term) const; // Must be called under mutex - virtual bool DemoteSelfLocked(uint64_t term); // Must be called under mutex - virtual uint64_t getLastLogTermLocked() const; // Must be called under mutex + virtual TermRelation TermCheckLocked( + uint64_t term) const; // Must be called under mutex + virtual bool DemoteSelfLocked(uint64_t term); // Must be called under mutex + virtual uint64_t getLastLogTermLocked() const; // Must be called under mutex virtual bool IsStop(); //bool IsDuplicateLogEntry(const std::string& hash) const; // Must be called under mutex - virtual std::vector> PrepareCommitLocked(); // Must be called under mutex - virtual AeFields GatherAeFieldsLocked(int followerId, bool heartBeat = false) const; // Must be called under mutex + virtual std::vector> + PrepareCommitLocked(); // Must be called under mutex + virtual AeFields GatherAeFieldsLocked(int followerId, bool heartBeat = false) + const; // Must be called under mutex std::vector GatherAeFieldsForBroadcastLocked(bool heartBeat = false) const; // Must be called under mutex virtual void CreateAndSendAppendEntryMsg(const AeFields& fields); virtual LogEntry CreateLogEntry(const Entry& entry) const; virtual void ClearInFlightsLocked(); virtual void PruneExpiredInFlightMsgsLocked(); - virtual void PruneRedundantInFlightMsgsLocked(int followerId, uint64_t followerLastLogIndex); - virtual void RecordNewInFlightMsgLocked(const AeFields& msg, 
std::chrono::steady_clock::time_point timestamp); + virtual void PruneRedundantInFlightMsgsLocked(int followerId, + uint64_t followerLastLogIndex); + virtual void RecordNewInFlightMsgLocked( + const AeFields& msg, std::chrono::steady_clock::time_point timestamp); virtual bool InFlightPerFollowerLimitReachedLocked(int followerId) const; - // Persistent state on all servers: uint64_t currentTerm_; // Protected by mutex_ int votedFor_; // Protected by mutex_ @@ -198,11 +204,11 @@ class Raft : public common::ProtocolBase { return votedFor_; } - const std::vector>& GetLog() const { + const std::vector>& GetLog() const { std::lock_guard lock(mutex_); return log_; } - + void PrintLog(std::ostream& os) const { os << "Log entries (count = " << log_.size() << "):\n"; @@ -214,10 +220,8 @@ class Raft : public common::ProtocolBase { } os << " [" << i << "] " - << "term=" << entry->term - << ", command=\"" << entry->command << "\"" - << ", serializedSize=" << entry->GetSerializedSize() - << "\n"; + << "term=" << entry->term << ", command=\"" << entry->command << "\"" + << ", serializedSize=" << entry->GetSerializedSize() << "\n"; } } diff --git a/platform/consensus/ordering/raft/algorithm/raft_test_append_entries_response.cpp b/platform/consensus/ordering/raft/algorithm/raft_append_entries_response_test.cpp similarity index 61% rename from platform/consensus/ordering/raft/algorithm/raft_test_append_entries_response.cpp rename to platform/consensus/ordering/raft/algorithm/raft_append_entries_response_test.cpp index 4d23afbd3b..cc30602963 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_test_append_entries_response.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_append_entries_response_test.cpp @@ -2,12 +2,13 @@ namespace resdb { namespace raft { -using ::testing::Invoke; using ::testing::_; -using ::testing::Matcher; using ::testing::AnyNumber; +using ::testing::Invoke; +using ::testing::Matcher; -// Test 1: A leader receiving an AppendEntriesResponse 
success and updating the follower's matchIndex. +// Test 1: A leader receiving an AppendEntriesResponse success and updating the +// follower's matchIndex. TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseSuccess) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); @@ -17,23 +18,25 @@ TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseSuccess) { aeResponse.set_id(2); aeResponse.set_lastlogindex(2); - raft_->SetStateForTest({ - .currentTerm = 1, - .commitIndex = 0, - .role = Role::LEADER, - .log = CreateLogEntries({ - {0, "Transaction 1"}, - {0, "Transaction 2"}, - }, true), - .matchIndex = std::vector{0, 2, 0, 0, 0} - }); - - bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + raft_->SetStateForTest({.currentTerm = 1, + .commitIndex = 0, + .role = Role::LEADER, + .log = CreateLogEntries( + { + {0, "Transaction 1"}, + {0, "Transaction 2"}, + }, + true), + .matchIndex = std::vector{0, 2, 0, 0, 0}}); + + bool success = raft_->ReceiveAppendEntriesResponse( + std::make_unique(aeResponse)); EXPECT_TRUE(success); EXPECT_THAT(raft_->GetMatchIndex(), ::testing::ElementsAre(0, 2, 2, 0, 0)); } -// Test 2: A leader receiving an AppendEntriesResponse from a follower that in a newer term. +// Test 2: A leader receiving an AppendEntriesResponse from a follower that in a +// newer term. 
TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseFromNewerTerm) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); @@ -46,12 +49,14 @@ TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseFromNewerTerm) { aeResponse.set_success(false); aeResponse.set_term(2); - bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + bool success = raft_->ReceiveAppendEntriesResponse( + std::make_unique(aeResponse)); EXPECT_FALSE(success); EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); } -// Test 3: A leader receiving an AppendEntriesResponse success, updating the follower's matchIndex, and committing a new entry +// Test 3: A leader receiving an AppendEntriesResponse success, updating the +// follower's matchIndex, and committing a new entry. TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseSuccessAndCommits) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); EXPECT_CALL(mock_commit, Commit(_)).Times(1); @@ -62,38 +67,40 @@ TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseSuccessAndCommits) { aeResponse.set_id(2); aeResponse.set_lastlogindex(2); - raft_->SetStateForTest({ - .currentTerm = 1, - .commitIndex = 0, - .lastApplied = 0, - .role = Role::LEADER, - .log = CreateLogEntries({ - {1, "Transaction 1"}, - {1, "Transaction 2"}, - }, true), - .nextIndex = std::vector{0, 2, 2, 2, 2}, - .matchIndex = std::vector{0, 2, 0, 1, 0} - }); - - bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + raft_->SetStateForTest({.currentTerm = 1, + .commitIndex = 0, + .lastApplied = 0, + .role = Role::LEADER, + .log = CreateLogEntries( + { + {1, "Transaction 1"}, + {1, "Transaction 2"}, + }, + true), + .nextIndex = std::vector{0, 2, 2, 2, 2}, + .matchIndex = std::vector{0, 2, 0, 1, 0}}); + + bool success = raft_->ReceiveAppendEntriesResponse( + std::make_unique(aeResponse)); EXPECT_TRUE(success); EXPECT_THAT(raft_->GetMatchIndex(), ::testing::ElementsAre(0, 2, 2, 1, 0)); 
EXPECT_EQ(raft_->GetCommitIndex(), 1); } -// Test 4: A leader receiving an AppendEntriesResponse success and catching up a follower that is behind +// Test 4: A leader receiving an AppendEntriesResponse success and catching up a +// follower that is behind. TEST_F(RaftTest, LeaderCatchesUpFollowerThatIsBehind) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& ae = dynamic_cast(msg); EXPECT_EQ(ae.entries_size(), 1); - // TODO: Use serialized string instead of manually doing it + // TODO: Use serialized string instead of manually doing it. EXPECT_EQ(ae.entries(0).command(), "\n\rTransaction 2"); EXPECT_EQ(node_id, 2); return 0; - })); + })); AppendEntriesResponse aeResponse; aeResponse.set_success(true); @@ -106,30 +113,34 @@ TEST_F(RaftTest, LeaderCatchesUpFollowerThatIsBehind) { .commitIndex = 0, .lastApplied = 0, .role = Role::LEADER, - .log = CreateLogEntries({ - {1, "Transaction 1"}, - {1, "Transaction 2"}, - }, true), + .log = CreateLogEntries( + { + {1, "Transaction 1"}, + {1, "Transaction 2"}, + }, + true), }); - bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + bool success = raft_->ReceiveAppendEntriesResponse( + std::make_unique(aeResponse)); EXPECT_TRUE(success); } -// Test 5: A leader receiving an AppendEntriesResponse Failure and catching up a follower that is behind +// Test 5: A leader receiving an AppendEntriesResponse Failure and catching up a +// follower that is behind. 
TEST_F(RaftTest, LeaderCatchesUpFollowerThatIsBehindFailure) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& ae = dynamic_cast(msg); - // TODO: Use serialized string instead of manually doing it + // TODO: Use serialized string instead of manually doing it. EXPECT_EQ(ae.entries(0).command(), "\n\rTransaction 1"); EXPECT_EQ(ae.entries(1).command(), "\n\rTransaction 2"); EXPECT_EQ(ae.entries_size(), 2); EXPECT_EQ(node_id, 2); return 0; - })); + })); AppendEntriesResponse aeResponse; aeResponse.set_success(false); @@ -142,13 +153,16 @@ TEST_F(RaftTest, LeaderCatchesUpFollowerThatIsBehindFailure) { .commitIndex = 0, .lastApplied = 0, .role = Role::LEADER, - .log = CreateLogEntries({ - {1, "Transaction 1"}, - {1, "Transaction 2"}, - }, true), + .log = CreateLogEntries( + { + {1, "Transaction 1"}, + {1, "Transaction 2"}, + }, + true), }); - bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + bool success = raft_->ReceiveAppendEntriesResponse( + std::make_unique(aeResponse)); EXPECT_TRUE(success); } @@ -165,7 +179,8 @@ TEST_F(RaftTest, FollowerIgnoresAppendEntriesResponse) { .role = Role::FOLLOWER, }); - bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + bool success = raft_->ReceiveAppendEntriesResponse( + std::make_unique(aeResponse)); EXPECT_TRUE(success); } @@ -182,9 +197,10 @@ TEST_F(RaftTest, LeaderIgnoresAppendEntriesResponseFromOutdatedTerm) { .role = Role::LEADER, }); - bool success = raft_->ReceiveAppendEntriesResponse(std::make_unique(aeResponse)); + bool success = raft_->ReceiveAppendEntriesResponse( + std::make_unique(aeResponse)); EXPECT_TRUE(success); } -} // namespace raft -} // namespace resdb +} // namespace raft +} // namespace 
resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft_test_append_entries.cpp b/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp similarity index 53% rename from platform/consensus/ordering/raft/algorithm/raft_test_append_entries.cpp rename to platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp index ab94d3ded1..d95fed0ebd 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_test_append_entries.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp @@ -2,12 +2,12 @@ namespace resdb { namespace raft { -using ::testing::Invoke; using ::testing::_; -using ::testing::Matcher; using ::testing::AnyNumber; +using ::testing::Invoke; +using ::testing::Matcher; -// Test 1: A follower receiving a client transaction should reject it +// Test 1: A follower receiving a client transaction should reject it. TEST_F(RaftTest, FollowerRejectsClientTransaction) { EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); EXPECT_CALL(mock_broadcast, Broadcast(_, _)).Times(0); @@ -20,7 +20,8 @@ TEST_F(RaftTest, FollowerRejectsClientTransaction) { EXPECT_FALSE(success); } -// Test 2: A leader receiving a client transaction should send an AppendEntries to all other replicas +// Test 2: A leader receiving a client transaction should send an AppendEntries +// to all other replicas. TEST_F(RaftTest, LeaderSendsAppendEntriesUponClientTransaction) { EXPECT_CALL(mock_call, Call(_, _, _)).Times(3); EXPECT_CALL(*leader_election_manager_, OnAeBroadcast()).Times(1); @@ -33,46 +34,46 @@ TEST_F(RaftTest, LeaderSendsAppendEntriesUponClientTransaction) { EXPECT_TRUE(success); } -// Test 3: Sent AppendEntries should be based on the follower's nextIndex +// Test 3: Sent AppendEntries should be based on the follower's nextIndex. 
TEST_F(RaftTest, LeaderSendsAppendEntriesBasedOnNextIndex) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& ae = dynamic_cast(msg); EXPECT_EQ(node_id, 2); EXPECT_EQ(ae.prevlogindex(), 2); EXPECT_EQ(ae.entries().size(), 3); return 0; - })) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& ae = dynamic_cast(msg); EXPECT_EQ(node_id, 3); EXPECT_EQ(ae.prevlogindex(), 1); EXPECT_EQ(ae.entries().size(), 4); return 0; - })) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& ae = dynamic_cast(msg); EXPECT_EQ(node_id, 4); EXPECT_EQ(ae.prevlogindex(), 0); EXPECT_EQ(ae.entries().size(), 5); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnAeBroadcast()).Times(1); - - raft_->SetStateForTest({ - .currentTerm = 0, - .role = Role::LEADER, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - {0, "Term 0 Transaction 2"}, - {0, "Term 0 Transaction 3"}, - {0, "Term 0 Transaction 4"}, - }, true), - .nextIndex = std::vector{0, 4, 3, 2, 1} - }); + + raft_->SetStateForTest({.currentTerm = 0, + .role = Role::LEADER, + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + {0, "Term 0 Transaction 2"}, + {0, "Term 0 Transaction 3"}, + {0, "Term 0 Transaction 4"}, + }, + true), + .nextIndex = std::vector{0, 4, 3, 2, 1}}); auto req = std::make_unique(); req->set_seq(5); @@ -81,228 +82,243 @@ TEST_F(RaftTest, LeaderSendsAppendEntriesBasedOnNextIndex) { EXPECT_TRUE(success); } -// Test 4: A follower receiving 1 AppendEntries with multiple 
entries that it can accept +// Test 4: A follower receiving 1 AppendEntries with multiple entries that it +// can accept. TEST_F(RaftTest, FollowerAddsAppendEntriesWithMultipleEntries) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_TRUE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 3); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( - /*term=*/ 0, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 0, - /*prevLogTerm=*/ 0, - /*entries=*/ CreateLogEntries({ - {0, "Transaction 1"}, - {0, "Transaction 2"}, - {0, "Transaction 3"}, - }), - /*leaderCommit=*/ 0, - /*followerId=*/ 1 - ); + /*term=*/0, + /*leaderId=*/2, + /*prevLogIndex=*/0, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {0, "Transaction 1"}, + {0, "Transaction 2"}, + {0, "Transaction 3"}, + }), + /*leaderCommit=*/0, + /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); raft_->SetRole(Role::FOLLOWER); - bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); EXPECT_TRUE(success); } -// Test 5: A follower receiving multiple AppendEntries that it can accept +// Test 5: A follower receiving multiple AppendEntries that it can accept. 
TEST_F(RaftTest, FollowerAddsMultipleAppendEntries) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_TRUE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 1); return 0; - })) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_TRUE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 2); return 0; - })) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_TRUE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 3); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(3); auto aefields1 = CreateAeFields( - /*term=*/ 0, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 0, - /*prevLogTerm=*/ 0, - /*entries=*/ CreateLogEntries({ - {0, "Transaction 1"}, - }), - /*leaderCommit=*/ 0, - /*followerId=*/ 1 - ); + /*term=*/0, + /*leaderId=*/2, + /*prevLogIndex=*/0, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {0, "Transaction 1"}, + }), + /*leaderCommit=*/0, + /*followerId=*/1); auto aefields2 = CreateAeFields( - /*term=*/ 0, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 1, - /*prevLogTerm=*/ 0, - /*entries=*/ CreateLogEntries({ - {0, "Transaction 2"}, - }), - /*leaderCommit=*/ 0, - /*followerId=*/ 1 - ); + /*term=*/0, + /*leaderId=*/2, + /*prevLogIndex=*/1, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {0, "Transaction 2"}, + }), + /*leaderCommit=*/0, + /*followerId=*/1); auto aefields3 = CreateAeFields( - /*term=*/ 0, - /*leaderId=*/ 2, - 
/*prevLogIndex=*/ 2, - /*prevLogTerm=*/ 0, - /*entries=*/ CreateLogEntries({ - {0, "Transaction 3"}, - }), - /*leaderCommit=*/ 0, - /*followerId=*/ 1 - ); + /*term=*/0, + /*leaderId=*/2, + /*prevLogIndex=*/2, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {0, "Transaction 3"}, + }), + /*leaderCommit=*/0, + /*followerId=*/1); auto aemessage1 = CreateAeMessage(aefields1); auto aemessage2 = CreateAeMessage(aefields2); auto aemessage3 = CreateAeMessage(aefields3); raft_->SetRole(Role::FOLLOWER); - bool success1 = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage1))); + bool success1 = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage1))); EXPECT_TRUE(success1); - bool success2 = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage2))); + bool success2 = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage2))); EXPECT_TRUE(success2); - bool success3 = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage3))); + bool success3 = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage3))); EXPECT_TRUE(success3); } -// Test 6: A follower rejects Append Entries because its own entry at prevLogIndex does not have the same term. +// Test 6: A follower rejects Append Entries because its own entry at +// prevLogIndex does not have the same term. 
TEST_F(RaftTest, FollowerRejectsMismatchedTermAtPrevLogIndex) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_FALSE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 1); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( - /*term=*/ 0, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 1, - /*prevLogTerm=*/ 2, - /*entries=*/ CreateLogEntries({ - {2, "Term 2 Transaction 1"}, - }), - /*leaderCommit=*/ 0, - /*followerId=*/ 1 - ); + /*term=*/0, + /*leaderId=*/2, + /*prevLogIndex=*/1, + /*prevLogTerm=*/2, + /*entries=*/ + CreateLogEntries({ + {2, "Term 2 Transaction 1"}, + }), + /*leaderCommit=*/0, + /*followerId=*/1); raft_->SetStateForTest({ .currentTerm = 0, .role = Role::FOLLOWER, - .log = CreateLogEntries({ - {1, "Term 1 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {1, "Term 1 Transaction 1"}, + }, + true), }); auto aemessage = CreateAeMessage(aefields); raft_->SetRole(Role::FOLLOWER); - bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); EXPECT_TRUE(success); } -// Test 7: A follower rejects Append Entries because it does not have a term at prevLogIndex +// Test 7: A follower rejects Append Entries because it does not have a term at +// prevLogIndex. 
TEST_F(RaftTest, FollowerRejectsMissingIndex) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_FALSE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 0); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( - /*term=*/ 0, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 1, - /*prevLogTerm=*/ 0, - /*entries=*/ CreateLogEntries({ - {0, "Transaction 2"}, - }), - /*leaderCommit=*/ 0, - /*followerId=*/ 1 - ); + /*term=*/0, + /*leaderId=*/2, + /*prevLogIndex=*/1, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {0, "Transaction 2"}, + }), + /*leaderCommit=*/0, + /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); raft_->SetRole(Role::FOLLOWER); - bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); EXPECT_TRUE(success); } -// Test 8: A follower receiving 1 AppendEntries with multiple entries and needing to truncate part of its log +// Test 8: A follower receiving 1 AppendEntries with multiple entries and +// needing to truncate part of its log. 
TEST_F(RaftTest, FollowerAddsAppendEntriesAndTruncatesLog) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_TRUE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 3); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( - /*term=*/ 1, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 1, - /*prevLogTerm=*/ 0, - /*entries=*/ CreateLogEntries({ - {1, "Term 1 Transaction 1"}, - {1, "Term 1 Transaction 2"}, - }), - /*leaderCommit=*/ 0, - /*followerId=*/ 1 - ); + /*term=*/1, + /*leaderId=*/2, + /*prevLogIndex=*/1, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {1, "Term 1 Transaction 1"}, + {1, "Term 1 Transaction 2"}, + }), + /*leaderCommit=*/0, + /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); raft_->SetStateForTest({ .currentTerm = 0, .role = Role::FOLLOWER, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, // index 1 - {0, "Term 0 Transaction 2"}, // mismatched entry will be removed - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, // index 1 + {0, "Term 0 Transaction 2"}, // mismatched entry will be removed + }, + true), }); - bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); const auto& raft_log = raft_->GetLog(); EXPECT_EQ(raft_log[0]->term, 0); EXPECT_EQ(raft_log[0]->command, "COMMON_PREFIX"); EXPECT_EQ(raft_log[1]->term, 0); - // TODO: Use serialized string instead of manually doing it + // TODO: Use serialized string instead of manually doing it. 
EXPECT_EQ(raft_log[1]->command, "\n\x14Term 0 Transaction 1"); EXPECT_EQ(raft_log[2]->term, 1); EXPECT_EQ(raft_log[2]->command, "\n\x14Term 1 Transaction 1"); @@ -311,29 +327,27 @@ TEST_F(RaftTest, FollowerAddsAppendEntriesAndTruncatesLog) { EXPECT_TRUE(success); } -// Test 9: A follower increases its commitIndex +// Test 9: A follower increases its commitIndex. TEST_F(RaftTest, FollowerIncreasesCommitIndex) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_TRUE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 5); return 0; - })); - EXPECT_CALL(mock_commit, Commit(_)) - .Times(2); + })); + EXPECT_CALL(mock_commit, Commit(_)).Times(2); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( - /*term=*/ 1, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 5, - /*prevLogTerm=*/ 1, - /*entries=*/ CreateLogEntries({}), - /*leaderCommit=*/ 3, - /*followerId=*/ 1 - ); + /*term=*/1, + /*leaderId=*/2, + /*prevLogIndex=*/5, + /*prevLogTerm=*/1, + /*entries=*/CreateLogEntries({}), + /*leaderCommit=*/3, + /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); raft_->SetStateForTest({ @@ -341,44 +355,45 @@ TEST_F(RaftTest, FollowerIncreasesCommitIndex) { .commitIndex = 1, .lastApplied = 1, .role = Role::FOLLOWER, - .log = CreateLogEntries({ - {1, "Term 1 Transaction 1"}, - {1, "Term 1 Transaction 2"}, - {1, "Term 1 Transaction 3"}, - {1, "Term 1 Transaction 4"}, - {1, "Term 1 Transaction 5"}, - }, true), + .log = CreateLogEntries( + { + {1, "Term 1 Transaction 1"}, + {1, "Term 1 Transaction 2"}, + {1, "Term 1 Transaction 3"}, + {1, "Term 1 Transaction 4"}, + {1, "Term 1 Transaction 5"}, + }, + true), }); - bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + bool success = 
raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); EXPECT_TRUE(success); EXPECT_EQ(raft_->GetCommitIndex(), 3); } -// Test 10: A follower increases its commitIndex, but not past its own log size +// Test 10: A follower increases its commitIndex, but not past its own log size. TEST_F(RaftTest, FollowerIncreasesCommitIndexCappedAtLogSize) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_TRUE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 5); return 0; - })); - EXPECT_CALL(mock_commit, Commit(_)) - .Times(4); + })); + EXPECT_CALL(mock_commit, Commit(_)).Times(4); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( - /*term=*/ 1, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 5, - /*prevLogTerm=*/ 1, - /*entries=*/ CreateLogEntries({}), - /*leaderCommit=*/ 7, - /*followerId=*/ 1 - ); + /*term=*/1, + /*leaderId=*/2, + /*prevLogIndex=*/5, + /*prevLogTerm=*/1, + /*entries=*/CreateLogEntries({}), + /*leaderCommit=*/7, + /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); raft_->SetStateForTest({ @@ -386,183 +401,193 @@ TEST_F(RaftTest, FollowerIncreasesCommitIndexCappedAtLogSize) { .commitIndex = 1, .lastApplied = 1, .role = Role::FOLLOWER, - .log = CreateLogEntries({ - {1, "Term 1 Transaction 1"}, - {1, "Term 1 Transaction 2"}, - {1, "Term 1 Transaction 3"}, - {1, "Term 1 Transaction 4"}, - {1, "Term 1 Transaction 5"}, - }, true), + .log = CreateLogEntries( + { + {1, "Term 1 Transaction 1"}, + {1, "Term 1 Transaction 2"}, + {1, "Term 1 Transaction 3"}, + {1, "Term 1 Transaction 4"}, + {1, "Term 1 Transaction 5"}, + }, + true), }); - bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + bool success = raft_->ReceiveAppendEntries( + 
std::make_unique(std::move(aemessage))); EXPECT_TRUE(success); EXPECT_EQ(raft_->GetCommitIndex(), 5); } -// Test 11: A candidate rejecting an AppendEntries from an outdated term and staying candidate +// Test 11: A candidate rejecting an AppendEntries from an outdated term and +// staying candidate. TEST_F(RaftTest, CandidateRejectsAppendEntriesFromOutdatedTerm) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_FALSE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 0); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); auto aefields = CreateAeFields( - /*term=*/ 1, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 0, - /*prevLogTerm=*/ 0, - /*entries=*/ CreateLogEntries({ - {1, "Transaction 1"}, - {1, "Transaction 2"}, - {1, "Transaction 3"}, - }), - /*leaderCommit=*/ 0, - /*followerId=*/ 1 - ); + /*term=*/1, + /*leaderId=*/2, + /*prevLogIndex=*/0, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {1, "Transaction 1"}, + {1, "Transaction 2"}, + {1, "Transaction 3"}, + }), + /*leaderCommit=*/0, + /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); raft_->SetStateForTest({ .currentTerm = 2, .role = Role::CANDIDATE, - .log = CreateLogEntries({ - }, true), + .log = CreateLogEntries({}, true), }); - - bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); EXPECT_TRUE(success); } -// Test 12: A candidate rejecting an AppendEntries because their log is further behind, but it is in the same term so they still demote. 
+// Test 12: A candidate rejecting an AppendEntries because their log is further +// behind, but it is in the same term so they still demote. TEST_F(RaftTest, CandidateRejectsAppendEntriesFromSameTerm) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_FALSE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 1); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); - + auto aefields = CreateAeFields( - /*term=*/ 2, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 2, - /*prevLogTerm=*/ 0, - /*entries=*/ CreateLogEntries({ - {2, "Transaction 1"}, - {2, "Transaction 2"}, - {2, "Transaction 3"}, - }), - /*leaderCommit=*/ 0, - /*followerId=*/ 1 - ); + /*term=*/2, + /*leaderId=*/2, + /*prevLogIndex=*/2, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {2, "Transaction 1"}, + {2, "Transaction 2"}, + {2, "Transaction 3"}, + }), + /*leaderCommit=*/0, + /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); raft_->SetStateForTest({ .currentTerm = 2, .role = Role::CANDIDATE, - .log = CreateLogEntries({ - {1, "Old Transaction 1"} - }, true), + .log = CreateLogEntries({{1, "Old Transaction 1"}}, true), }); - - bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); EXPECT_TRUE(success); } -// Test 13: A candidate receiving an AppendEntries it can accept from a newer term. +// Test 13: A candidate receiving an AppendEntries it can accept from a newer +// term. 
TEST_F(RaftTest, CandidateReceivesNewerTermWithAppendEntriesItCanAccept) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_TRUE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 3); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( - /*term=*/ 2, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 2, - /*prevLogTerm=*/ 0, - /*entries=*/ CreateLogEntries({ - {2, "Transaction 1"}, - }), - /*leaderCommit=*/ 2, - /*followerId=*/ 1 - ); + /*term=*/2, + /*leaderId=*/2, + /*prevLogIndex=*/2, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {2, "Transaction 1"}, + }), + /*leaderCommit=*/2, + /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); raft_->SetStateForTest({ .currentTerm = 1, .lastApplied = 2, .role = Role::CANDIDATE, - .log = CreateLogEntries({ - {0, "old-1"}, - {0, "old-2"}, - }, true), + .log = CreateLogEntries( + { + {0, "old-1"}, + {0, "old-2"}, + }, + true), }); - - bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); EXPECT_TRUE(success); EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); } -// Test 14: A candidate receiving an AppendEntries that it can accept from a the same term but further along. +// Test 14: A candidate receiving an AppendEntries that it can accept from a the +// same term but further along. 
TEST_F(RaftTest, CandidateReceivesSameTermWithAppendEntriesItCanAccept) { EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { const auto& aer = dynamic_cast(msg); EXPECT_TRUE(aer.success()); EXPECT_EQ(aer.lastlogindex(), 3); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( - /*term=*/ 1, - /*leaderId=*/ 2, - /*prevLogIndex=*/ 2, - /*prevLogTerm=*/ 0, - /*entries=*/ CreateLogEntries({ - {2, "Transaction 1"}, - }), - /*leaderCommit=*/ 2, - /*followerId=*/ 1 - ); + /*term=*/1, + /*leaderId=*/2, + /*prevLogIndex=*/2, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {2, "Transaction 1"}, + }), + /*leaderCommit=*/2, + /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); raft_->SetStateForTest({ .currentTerm = 1, .lastApplied = 2, .role = Role::CANDIDATE, - .log = CreateLogEntries({ - {0, "old-1"}, - {0, "old-2"}, - }, true), + .log = CreateLogEntries( + { + {0, "old-1"}, + {0, "old-2"}, + }, + true), }); - - bool success = raft_->ReceiveAppendEntries(std::make_unique(std::move(aemessage))); + + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); EXPECT_TRUE(success); EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); } -} // namespace raft -} // namespace resdb +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft_test_request_vote_response.cpp b/platform/consensus/ordering/raft/algorithm/raft_request_vote_response_test.cpp similarity index 68% rename from platform/consensus/ordering/raft/algorithm/raft_test_request_vote_response.cpp rename to platform/consensus/ordering/raft/algorithm/raft_request_vote_response_test.cpp index 8723c489bd..b45c2c032f 
100644 --- a/platform/consensus/ordering/raft/algorithm/raft_test_request_vote_response.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_request_vote_response_test.cpp @@ -2,18 +2,19 @@ namespace resdb { namespace raft { -using ::testing::Invoke; using ::testing::_; -using ::testing::Matcher; using ::testing::AnyNumber; +using ::testing::Invoke; +using ::testing::Matcher; -// Test 1: A candidate gets elected +// Test 1: A candidate gets elected. TEST_F(RaftTest, CandidateGetsElected) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { - const auto& AppendEntriesMessage = dynamic_cast(msg); + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& AppendEntriesMessage = + dynamic_cast(msg); EXPECT_EQ(node_id, 2); EXPECT_EQ(AppendEntriesMessage.entries_size(), 0); EXPECT_EQ(AppendEntriesMessage.prevlogterm(), 1); @@ -21,10 +22,11 @@ TEST_F(RaftTest, CandidateGetsElected) { EXPECT_EQ(AppendEntriesMessage.leaderid(), 1); EXPECT_EQ(AppendEntriesMessage.leadercommitindex(), 1); return 0; - })) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { - const auto& AppendEntriesMessage = dynamic_cast(msg); + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& AppendEntriesMessage = + dynamic_cast(msg); EXPECT_EQ(node_id, 3); EXPECT_EQ(AppendEntriesMessage.entries_size(), 0); EXPECT_EQ(AppendEntriesMessage.prevlogterm(), 1); @@ -32,10 +34,11 @@ TEST_F(RaftTest, CandidateGetsElected) { EXPECT_EQ(AppendEntriesMessage.leaderid(), 1); EXPECT_EQ(AppendEntriesMessage.leadercommitindex(), 1); return 0; - })) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { - const auto& AppendEntriesMessage = dynamic_cast(msg); + 
})) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& AppendEntriesMessage = + dynamic_cast(msg); EXPECT_EQ(node_id, 4); EXPECT_EQ(AppendEntriesMessage.entries_size(), 0); EXPECT_EQ(AppendEntriesMessage.prevlogterm(), 1); @@ -43,19 +46,19 @@ TEST_F(RaftTest, CandidateGetsElected) { EXPECT_EQ(AppendEntriesMessage.leaderid(), 1); EXPECT_EQ(AppendEntriesMessage.leadercommitindex(), 1); return 0; - })); - - raft_->SetStateForTest({ - .currentTerm = 2, - .commitIndex = 1, - .lastApplied = 1, - .role = Role::CANDIDATE, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - {1, "Term 1 Transaction 1"}, - }, true), - .votes = std::vector{1, 3} - }); + })); + + raft_->SetStateForTest({.currentTerm = 2, + .commitIndex = 1, + .lastApplied = 1, + .role = Role::CANDIDATE, + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + {1, "Term 1 Transaction 1"}, + }, + true), + .votes = std::vector{1, 3}}); RequestVoteResponse rvr; rvr.set_term(2); @@ -70,7 +73,8 @@ TEST_F(RaftTest, CandidateGetsElected) { EXPECT_THAT(raft_->GetMatchIndex(), ::testing::ElementsAre(0, 2, 0, 0, 0)); } -// Test 2: A candidate receives a RequestVoteResponse from an older term and ignores it. +// Test 2: A candidate receives a RequestVoteResponse from an older term and +// ignores it. 
TEST_F(RaftTest, CandidateIgnoresResponseFromOldTerm) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); @@ -79,9 +83,11 @@ TEST_F(RaftTest, CandidateIgnoresResponseFromOldTerm) { raft_->SetStateForTest({ .currentTerm = 2, .role = Role::CANDIDATE, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), }); RequestVoteResponse rvr; @@ -93,7 +99,8 @@ TEST_F(RaftTest, CandidateIgnoresResponseFromOldTerm) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::CANDIDATE); } -// Test 3: A candidate receives a RequestVoteResponse from an newer term and demotes. +// Test 3: A candidate receives a RequestVoteResponse from an newer term and +// demotes. TEST_F(RaftTest, CandidateDemotesAfterRequestVoteResponseFromNewerTerm) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); @@ -102,9 +109,11 @@ TEST_F(RaftTest, CandidateDemotesAfterRequestVoteResponseFromNewerTerm) { raft_->SetStateForTest({ .currentTerm = 2, .role = Role::CANDIDATE, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), }); RequestVoteResponse rvr; @@ -127,9 +136,11 @@ TEST_F(RaftTest, FollowerIgnoresRequestVoteResponse) { raft_->SetStateForTest({ .currentTerm = 2, .role = Role::FOLLOWER, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), }); RequestVoteResponse rvr; @@ -150,9 +161,11 @@ TEST_F(RaftTest, CandidateIgnoresNoVote) { raft_->SetStateForTest({ .currentTerm = 2, .role = Role::CANDIDATE, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), }); RequestVoteResponse rvr; @@ -170,17 +183,17 @@ TEST_F(RaftTest, 
CandidateIgnoresDuplicateVote) { EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); - raft_->SetStateForTest({ - .currentTerm = 2, - .commitIndex = 1, - .lastApplied = 1, - .role = Role::CANDIDATE, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - {1, "Term 1 Transaction 1"}, - }, true), - .votes = std::vector{1, 2} - }); + raft_->SetStateForTest({.currentTerm = 2, + .commitIndex = 1, + .lastApplied = 1, + .role = Role::CANDIDATE, + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + {1, "Term 1 Transaction 1"}, + }, + true), + .votes = std::vector{1, 2}}); RequestVoteResponse rvr; rvr.set_term(2); @@ -191,5 +204,5 @@ TEST_F(RaftTest, CandidateIgnoresDuplicateVote) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::CANDIDATE); } -} // namespace raft -} // namespace resdb +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft_test_request_vote.cpp b/platform/consensus/ordering/raft/algorithm/raft_request_vote_test.cpp similarity index 70% rename from platform/consensus/ordering/raft/algorithm/raft_test_request_vote.cpp rename to platform/consensus/ordering/raft/algorithm/raft_request_vote_test.cpp index 6efca4d9e0..8601234ed3 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_test_request_vote.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_request_vote_test.cpp @@ -2,31 +2,34 @@ namespace resdb { namespace raft { -using ::testing::Invoke; using ::testing::_; -using ::testing::Matcher; using ::testing::AnyNumber; +using ::testing::Invoke; +using ::testing::Matcher; -// Test 1: A follower times out, transitions to candidate, and starts an election +// Test 1: A follower times out, transitions to candidate, and starts an +// election. 
TEST_F(RaftTest, FollowerTransitionsToCandidateAndStartsElection) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); EXPECT_CALL(mock_broadcast, Broadcast(_, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg) { + .WillOnce( + ::testing::Invoke([](int type, const google::protobuf::Message& msg) { const auto& requestVote = dynamic_cast(msg); EXPECT_EQ(requestVote.term(), 1); EXPECT_EQ(requestVote.candidateid(), 1); EXPECT_EQ(requestVote.lastlogindex(), 1); EXPECT_EQ(requestVote.lastlogterm(), 0); return 0; - })); + })); raft_->SetStateForTest({ .currentTerm = 0, .role = Role::FOLLOWER, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), }); raft_->StartElection(); @@ -35,19 +38,21 @@ TEST_F(RaftTest, FollowerTransitionsToCandidateAndStartsElection) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::CANDIDATE); } -// Test 2: A leader receives a RequestVote from a candidate in a newer term and demotes. +// Test 2: A leader receives a RequestVote from a candidate in a newer term and +// demotes. 
TEST_F(RaftTest, LeaderReceivesRequestVoteFromNewTermAndDemotes) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { - const auto& requestVoteResponse = dynamic_cast(msg); + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse = + dynamic_cast(msg); EXPECT_EQ(node_id, 2); EXPECT_EQ(requestVoteResponse.term(), 1); EXPECT_EQ(requestVoteResponse.voterid(), 1); EXPECT_TRUE(requestVoteResponse.votegranted()); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); RequestVote rv; @@ -59,11 +64,13 @@ TEST_F(RaftTest, LeaderReceivesRequestVoteFromNewTermAndDemotes) { raft_->SetStateForTest({ .currentTerm = 0, .role = Role::LEADER, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), }); - + raft_->ReceiveRequestVote(std::make_unique(rv)); EXPECT_EQ(raft_->GetVotedFor(), 2); @@ -71,19 +78,21 @@ TEST_F(RaftTest, LeaderReceivesRequestVoteFromNewTermAndDemotes) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); } -// Test 3: A leader receives a RequestVote from a candidate whose lastLogTerm is fewer and does not vote +// Test 3: A leader receives a RequestVote from a candidate whose lastLogTerm is +// fewer and does not vote. 
TEST_F(RaftTest, LeaderReceivesRequestVoteFromOldTerm) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { - const auto& requestVoteResponse = dynamic_cast(msg); + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse = + dynamic_cast(msg); EXPECT_EQ(node_id, 2); EXPECT_EQ(requestVoteResponse.term(), 1); EXPECT_EQ(requestVoteResponse.voterid(), 1); EXPECT_FALSE(requestVoteResponse.votegranted()); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); RequestVote rv; @@ -95,11 +104,13 @@ TEST_F(RaftTest, LeaderReceivesRequestVoteFromOldTerm) { raft_->SetStateForTest({ .currentTerm = 1, .role = Role::LEADER, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), }); - + raft_->ReceiveRequestVote(std::make_unique(rv)); EXPECT_EQ(raft_->GetVotedFor(), -1); @@ -107,20 +118,22 @@ TEST_F(RaftTest, LeaderReceivesRequestVoteFromOldTerm) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::LEADER); } -// Test 4: A leader receives a RequestVote from a candidate whose lastLogTerm is less recent +// Test 4: A leader receives a RequestVote from a candidate whose lastLogTerm is +// less recent. 
TEST_F(RaftTest, LeaderReceivesRequestVoteFromOlderLastLogTerm) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { - const auto& requestVoteResponse = dynamic_cast(msg); + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse = + dynamic_cast(msg); EXPECT_EQ(node_id, 2); EXPECT_EQ(requestVoteResponse.term(), 1); EXPECT_EQ(requestVoteResponse.voterid(), 1); EXPECT_FALSE(requestVoteResponse.votegranted()); return 0; - })); - EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); RequestVote rv; rv.set_term(1); @@ -131,11 +144,13 @@ TEST_F(RaftTest, LeaderReceivesRequestVoteFromOlderLastLogTerm) { raft_->SetStateForTest({ .currentTerm = 1, .role = Role::LEADER, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), }); - + raft_->ReceiveRequestVote(std::make_unique(rv)); EXPECT_EQ(raft_->GetVotedFor(), -1); @@ -143,19 +158,21 @@ TEST_F(RaftTest, LeaderReceivesRequestVoteFromOlderLastLogTerm) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::LEADER); } -// Test 5: A leader receives a RequestVote from a candidate whose lastLogTerm is the same, but whose lastLogIndex is further behind +// Test 5: A leader receives a RequestVote from a candidate whose lastLogTerm is +// the same, but whose lastLogIndex is further behind. 
TEST_F(RaftTest, LeaderReceivesRequestVoteFromFurtherBehindLog) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { - const auto& requestVoteResponse = dynamic_cast(msg); + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse = + dynamic_cast(msg); EXPECT_EQ(node_id, 2); EXPECT_EQ(requestVoteResponse.term(), 2); EXPECT_EQ(requestVoteResponse.voterid(), 1); EXPECT_FALSE(requestVoteResponse.votegranted()); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); RequestVote rv; @@ -167,11 +184,13 @@ TEST_F(RaftTest, LeaderReceivesRequestVoteFromFurtherBehindLog) { raft_->SetStateForTest({ .currentTerm = 1, .role = Role::LEADER, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), }); - + raft_->ReceiveRequestVote(std::make_unique(rv)); EXPECT_EQ(raft_->GetVotedFor(), -1); @@ -179,19 +198,21 @@ TEST_F(RaftTest, LeaderReceivesRequestVoteFromFurtherBehindLog) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); } -// Test 6: A follower receives a RequestVote from a candidate who it would vote for, if it had not already voted for someone else. +// Test 6: A follower receives a RequestVote from a candidate who it would vote +// for, if it had not already voted for someone else. 
TEST_F(RaftTest, FollowerRejectsRequestVoteBecauseAlreadyVoted) { EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); EXPECT_CALL(mock_call, Call(_, _, _)) - .WillOnce(::testing::Invoke( - [](int type, const google::protobuf::Message& msg, int node_id) { - const auto& requestVoteResponse = dynamic_cast(msg); + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse = + dynamic_cast(msg); EXPECT_EQ(node_id, 2); EXPECT_EQ(requestVoteResponse.term(), 2); EXPECT_EQ(requestVoteResponse.voterid(), 1); EXPECT_FALSE(requestVoteResponse.votegranted()); return 0; - })); + })); EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); RequestVote rv; @@ -204,11 +225,13 @@ TEST_F(RaftTest, FollowerRejectsRequestVoteBecauseAlreadyVoted) { .currentTerm = 2, .votedFor = 3, .role = Role::FOLLOWER, - .log = CreateLogEntries({ - {0, "Term 0 Transaction 1"}, - }, true), + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), }); - + raft_->ReceiveRequestVote(std::make_unique(rv)); EXPECT_EQ(raft_->GetVotedFor(), 3); @@ -216,5 +239,5 @@ TEST_F(RaftTest, FollowerRejectsRequestVoteBecauseAlreadyVoted) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); } -} // namespace raft -} // namespace resdb +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft_tests.h b/platform/consensus/ordering/raft/algorithm/raft_tests.h index e6ad975b5b..6e5c6879c0 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_tests.h +++ b/platform/consensus/ordering/raft/algorithm/raft_tests.h @@ -1,18 +1,18 @@ #include -#include "platform/config/resdb_config_utils.h" #include "common/crypto/mock_signature_verifier.h" -#include "platform/networkstrate/mock_replica_communicator.h" +#include "platform/config/resdb_config_utils.h" #include "platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h" #include 
"platform/consensus/ordering/raft/algorithm/raft.h" +#include "platform/networkstrate/mock_replica_communicator.h" #include "platform/proto/client_test.pb.h" namespace resdb { namespace raft { -using ::testing::Invoke; using ::testing::_; -using ::testing::Matcher; using ::testing::AnyNumber; +using ::testing::Invoke; +using ::testing::Matcher; ResDBConfig GenerateConfig() { ResConfigData data; @@ -28,48 +28,49 @@ ResDBConfig GenerateConfig() { class RaftTest : public ::testing::Test { private: class MockSendMessageFunction { - public: - MOCK_METHOD(int, Call, (int, const google::protobuf::Message&, int)); + public: + MOCK_METHOD(int, Call, (int, const google::protobuf::Message&, int)); }; class MockBroadcastFunction { - public: - MOCK_METHOD(int, Broadcast, (int, const google::protobuf::Message&)); + public: + MOCK_METHOD(int, Broadcast, (int, const google::protobuf::Message&)); }; class MockCommitFunction { - public: - MOCK_METHOD(int, Commit, (const google::protobuf::Message&)); + public: + MOCK_METHOD(int, Commit, (const google::protobuf::Message&)); }; protected: void SetUp() override { verifier_ = std::make_unique(); - leader_election_manager_ = std::make_unique(GenerateConfig()); + leader_election_manager_ = + std::make_unique(GenerateConfig()); replica_communicator_ = std::make_unique(); raft_ = std::make_unique( /*id=*/1, /*f=*/1, - /*total=*/4, - verifier_.get(), - leader_election_manager_.get(), + /*total=*/4, verifier_.get(), leader_election_manager_.get(), replica_communicator_.get()); raft_->SetSingleCallFunc( - [&](int type, const google::protobuf::Message& msg, int node_id) { - return mock_call.Call(type, msg, node_id); - }); + [&](int type, const google::protobuf::Message& msg, int node_id) { + return mock_call.Call(type, msg, node_id); + }); raft_->SetBroadcastCallFunc( - [&](int type, const google::protobuf::Message& msg) { - return mock_broadcast.Broadcast(type, msg); - }); + [&](int type, const google::protobuf::Message& msg) { + return 
mock_broadcast.Broadcast(type, msg); + }); - raft_->SetCommitFunc( - [&](const google::protobuf::Message& msg) { - return mock_commit.Commit(msg); + raft_->SetCommitFunc([&](const google::protobuf::Message& msg) { + return mock_commit.Commit(msg); }); } - AeFields CreateAeFields(uint64_t term, int leaderId, uint64_t prevLogIndex, uint64_t prevLogTerm, const std::vector>& entries, uint64_t leaderCommit, int followerId) { + AeFields CreateAeFields(uint64_t term, int leaderId, uint64_t prevLogIndex, + uint64_t prevLogTerm, + const std::vector>& entries, + uint64_t leaderCommit, int followerId) { AeFields fields{}; fields.term = term; fields.leaderId = leaderId; @@ -88,16 +89,19 @@ class RaftTest : public ::testing::Test { return fields; }; - // Helper to create a single log entry - std::unique_ptr CreateLogEntry(uint64_t term, const std::string& command_data) { + // Helper to create a single log entry. + std::unique_ptr CreateLogEntry(uint64_t term, + const std::string& command_data) { auto entry = std::make_unique(); entry->term = term; entry->command = command_data; return entry; } - // Helper to create a vector of log entries for testing - std::vector> CreateLogEntries(const std::vector>& term_and_cmds, bool usedForLogPatch = false) { + // Helper to create a vector of log entries for testing. 
+ std::vector> CreateLogEntries( + const std::vector>& term_and_cmds, + bool usedForLogPatch = false) { std::vector> entries; if (usedForLogPatch) { @@ -106,18 +110,17 @@ class RaftTest : public ::testing::Test { first_entry->command = "COMMON_PREFIX"; entries.push_back(std::move(first_entry)); } - + for (const auto& [term, cmd] : term_and_cmds) { - std::unique_ptr entry = std::make_unique(); - entry->term = term; - // entry->command = cmd; - - ClientTestRequest req; - req.set_value(cmd); - std::string serialized; - req.SerializeToString(&serialized); - entry->command = serialized; - entries.push_back(std::move(entry)); + std::unique_ptr entry = std::make_unique(); + entry->term = term; + + ClientTestRequest req; + req.set_value(cmd); + std::string serialized; + req.SerializeToString(&serialized); + entry->command = serialized; + entries.push_back(std::move(entry)); } return entries; } @@ -147,6 +150,5 @@ class RaftTest : public ::testing::Test { MockCommitFunction mock_commit; }; - -} // namespace raft -} // namespace resdb +} // namespace raft +} // namespace resdb From ba33b746f2fcb174f1e09ee868debf433d7f1489 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Tue, 17 Feb 2026 15:27:39 -0800 Subject: [PATCH 50/66] Undo some changes unrelated to Raft --- .../execution/transaction_executor.cpp | 1 + .../ordering/common/framework/consensus.cpp | 2 ++ .../common/framework/performance_manager.cpp | 14 ++++----- platform/consensus/ordering/geo_pbft/BUILD | 2 +- platform/consensus/ordering/pbft/BUILD | 4 +-- .../networkstrate/async_replica_client.cpp | 3 -- platform/networkstrate/consensus_manager.cpp | 2 +- .../networkstrate/replica_communicator.cpp | 29 ------------------- scripts/deploy/config/pbft.config | 10 +++---- 9 files changed, 18 insertions(+), 49 deletions(-) diff --git a/platform/consensus/execution/transaction_executor.cpp b/platform/consensus/execution/transaction_executor.cpp index 3a39a67ec9..0a7401bc06 100644 --- 
a/platform/consensus/execution/transaction_executor.cpp +++ b/platform/consensus/execution/transaction_executor.cpp @@ -185,6 +185,7 @@ void TransactionExecutor::OrderMessage() { << " next seq:" << next_execute_seq_; continue; } + AddNewData(std::move(message)); } diff --git a/platform/consensus/ordering/common/framework/consensus.cpp b/platform/consensus/ordering/common/framework/consensus.cpp index 568a00ef06..93e00cc848 100644 --- a/platform/consensus/ordering/common/framework/consensus.cpp +++ b/platform/consensus/ordering/common/framework/consensus.cpp @@ -38,6 +38,8 @@ Consensus::Consensus(const ResDBConfig& config, ResponseMsg(*resp_msg); }, nullptr, std::move(executor))) { + LOG(INFO) << "is running is performance mode:" + << config_.IsPerformanceRunning(); is_stop_ = false; global_stats_ = Stats::GetGlobalStats(); } diff --git a/platform/consensus/ordering/common/framework/performance_manager.cpp b/platform/consensus/ordering/common/framework/performance_manager.cpp index bdee9c5b03..ed77c36460 100644 --- a/platform/consensus/ordering/common/framework/performance_manager.cpp +++ b/platform/consensus/ordering/common/framework/performance_manager.cpp @@ -186,10 +186,9 @@ void PerformanceManager::SendResponseToClient( uint64_t create_time = batch_response.createtime(); if (create_time > 0) { uint64_t run_time = GetCurrentTime() - create_time; - // JIM - //LOG(ERROR) << "receive current:" << GetCurrentTime() - // << " create time:" << create_time << " run time:" << run_time - // << " local id:" << batch_response.local_id(); + LOG(ERROR) << "receive current:" << GetCurrentTime() + << " create time:" << create_time << " run time:" << run_time + << " local id:" << batch_response.local_id(); global_stats_->AddLatency(run_time); } send_num_--; @@ -197,9 +196,9 @@ void PerformanceManager::SendResponseToClient( // =================== request ======================== int PerformanceManager::BatchProposeMsg() { - //LOG(WARNING) << "batch wait time:" << 
config_.ClientBatchWaitTimeMS() - // << " batch num:" << config_.ClientBatchNum() - // << " max txn:" << config_.GetMaxProcessTxn(); + LOG(WARNING) << "batch wait time:" << config_.ClientBatchWaitTimeMS() + << " batch num:" << config_.ClientBatchNum() + << " max txn:" << config_.GetMaxProcessTxn(); std::vector> batch_req; eval_ready_future_.get(); bool start = false; @@ -286,7 +285,6 @@ int PerformanceManager::DoBatch( void PerformanceManager::SendMessage(const Request& request) { replica_communicator_->SendMessage(request, GetPrimary()); - //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Sent to replica " << GetPrimary(); } } // namespace common diff --git a/platform/consensus/ordering/geo_pbft/BUILD b/platform/consensus/ordering/geo_pbft/BUILD index a083eb3e86..4bc568dd5b 100644 --- a/platform/consensus/ordering/geo_pbft/BUILD +++ b/platform/consensus/ordering/geo_pbft/BUILD @@ -62,7 +62,7 @@ cc_test( "//common/test:test_main", "//platform/config:resdb_config_utils", "//platform/consensus/execution:mock_geo_global_executor", - #"//platform/consensus/execution:mock_transaction_executor_impl", + "//platform/consensus/execution:mock_transaction_executor_impl", "//platform/networkstrate:mock_replica_communicator", ], ) diff --git a/platform/consensus/ordering/pbft/BUILD b/platform/consensus/ordering/pbft/BUILD index 4010865b22..9afd123818 100644 --- a/platform/consensus/ordering/pbft/BUILD +++ b/platform/consensus/ordering/pbft/BUILD @@ -256,8 +256,8 @@ cc_library( ) cc_library( - name = "pre_very_consensus_service_pbft", - hdrs = ["pre_very_consensus_service_pbft.h"], + name = "pre_very_consensus_manager_pbft", + hdrs = ["pre_very_consensus_manager_pbft.h"], visibility = [ "//platform:__subpackages__", "//service:__subpackages__", diff --git a/platform/networkstrate/async_replica_client.cpp b/platform/networkstrate/async_replica_client.cpp index 624cbcadd9..af13b79965 100644 --- a/platform/networkstrate/async_replica_client.cpp +++ 
b/platform/networkstrate/async_replica_client.cpp @@ -37,7 +37,6 @@ AsyncReplicaClient::~AsyncReplicaClient() {} int AsyncReplicaClient::SendMessage(const std::string& data) { queue_.Push(std::make_unique(data)); - if (!in_process_.load()) { bool old_value = false; if (in_process_.compare_exchange_strong(old_value, true, @@ -50,7 +49,6 @@ int AsyncReplicaClient::SendMessage(const std::string& data) { } void AsyncReplicaClient::OnSendNewMessage() { - std::unique_ptr data = queue_.Pop(0); if (data == nullptr || data->empty()) { in_process_ = false; @@ -62,7 +60,6 @@ void AsyncReplicaClient::OnSendNewMessage() { } void AsyncReplicaClient::OnSendMessage() { - if (status_ == 0) { data_size_ = pending_data_->size(); sending_data_size_ = sizeof(data_size_); diff --git a/platform/networkstrate/consensus_manager.cpp b/platform/networkstrate/consensus_manager.cpp index 2c3224c1e6..006bf96e71 100644 --- a/platform/networkstrate/consensus_manager.cpp +++ b/platform/networkstrate/consensus_manager.cpp @@ -22,7 +22,6 @@ #include #include -#include "glog/logging.h" #include "platform/proto/broadcast.pb.h" namespace resdb { @@ -86,6 +85,7 @@ void ConsensusManager::Start() { // Keep Boardcast the public keys to others. 
void ConsensusManager::HeartBeat() { + LOG(INFO) << "heart beat start"; int sleep_time = 1; std::mutex mutex; std::condition_variable cv; diff --git a/platform/networkstrate/replica_communicator.cpp b/platform/networkstrate/replica_communicator.cpp index f47e42c547..9bea6e3b41 100644 --- a/platform/networkstrate/replica_communicator.cpp +++ b/platform/networkstrate/replica_communicator.cpp @@ -129,7 +129,6 @@ void ReplicaCommunicator::StartSingleInBackGround(const std::string& ip, std::make_unique>>("s_batch", tcp_batch_); - ReplicaInfo replica_info; for (const auto& replica : replicas_) { if (replica.ip() == ip && replica.port() == port) { @@ -180,34 +179,15 @@ int ReplicaCommunicator::SendSingleMessage( // LOG(ERROR)<<" send msg ip:"<BroadCastMsg(); - if (is_use_long_conn_) { - /* - auto msgStart = std::chrono::steady_clock::now(); - std::chrono::steady_clock::duration msgDelta; - auto pushStart = std::chrono::steady_clock::now(); - std::chrono::steady_clock::duration pushDelta; - */ auto item = std::make_unique(); item->data = NetChannel::GetRawMessageString(message, verifier_); - /* - auto pushEnd = std::chrono::steady_clock::now(); - pushDelta = pushEnd - pushStart; - auto pushMs = std::chrono::duration_cast(pushDelta).count(); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": " << pushMs << " ms elapsed getting raw msg string"; - */ std::lock_guard lk(smutex_); if (single_bq_.find(std::make_pair(ip, port)) == single_bq_.end()) { StartSingleInBackGround(ip, port); } assert(single_bq_[std::make_pair(ip, port)] != nullptr); single_bq_[std::make_pair(ip, port)]->Push(std::move(item)); - /* - auto msgEnd = std::chrono::steady_clock::now(); - msgDelta = msgEnd - msgStart; - auto msgMs = std::chrono::duration_cast(msgDelta).count(); - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": " << msgMs << " ms elapsed in is_use_long_conn_ conditional"; - */ return 0; } else { return SendMessageInternal(message, replicas_); @@ -230,15 +210,12 @@ int 
ReplicaCommunicator::SendMessage(const google::protobuf::Message& message, const ReplicaInfo& replica_info) { return SendSingleMessage(message, replica_info); - if (is_use_long_conn_) { - std::string data = NetChannel::GetRawMessageString(message, verifier_); BroadcastData broadcast_data; broadcast_data.add_data()->swap(data); return SendMessageFromPool(broadcast_data, {replica_info}); } else { - return SendMessageInternal(message, {replica_info}); } } @@ -265,7 +242,6 @@ int ReplicaCommunicator::SendBatchMessage( int ReplicaCommunicator::SendMessageFromPool( const google::protobuf::Message& message, const std::vector& replicas) { - int ret = 0; std::string data; message.SerializeToString(&data); @@ -273,7 +249,6 @@ int ReplicaCommunicator::SendMessageFromPool( std::lock_guard lk(mutex_); for (const auto& replica : replicas) { auto client = GetClientFromPool(replica.ip(), replica.port()); - if (client == nullptr) { continue; } @@ -300,9 +275,7 @@ int ReplicaCommunicator::SendMessageInternal( if (verifier_ != nullptr) { client->SetSignatureVerifier(verifier_); } - if (client->SendRawMessage(message) == 0) { - ret++; } } @@ -335,9 +308,7 @@ void ReplicaCommunicator::BroadCast(const google::protobuf::Message& message) { void ReplicaCommunicator::SendMessage(const google::protobuf::Message& message, int64_t node_id) { ReplicaInfo target_replica; - for (const auto& replica : replicas_) { - if (replica.id() == node_id) { target_replica = replica; break; diff --git a/scripts/deploy/config/pbft.config b/scripts/deploy/config/pbft.config index 4f47dd163f..1de013abf3 100644 --- a/scripts/deploy/config/pbft.config +++ b/scripts/deploy/config/pbft.config @@ -18,12 +18,12 @@ // { - "clientBatchNum": 30, + "clientBatchNum": 100, "enable_viewchange": true, - "recovery_enabled": false, + "recovery_enabled": true, "max_client_complaint_num":10, - "max_process_txn": 512, - "worker_num": 1, + "max_process_txn": 2048, + "worker_num": 2, "input_worker_num": 1, "output_worker_num": 10 
-} \ No newline at end of file +} From f863e12383fda4f811a4f934d5cf439aed6cda33 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Tue, 3 Mar 2026 14:18:59 -0800 Subject: [PATCH 51/66] Change log_ to vector. Add helper functions for future use --- .../ordering/raft/algorithm/raft.cpp | 74 ++++++++++++------- .../consensus/ordering/raft/algorithm/raft.h | 26 ++++--- .../algorithm/raft_append_entries_test.cpp | 16 ++-- .../ordering/raft/algorithm/raft_tests.h | 37 +++++----- 4 files changed, 88 insertions(+), 65 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 24f17866c9..7883f88336 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -43,7 +43,7 @@ std::ostream& operator<<(std::ostream& stream, TermRelation tr) { return stream << nameTR[static_cast(tr)]; } -uint32_t LogEntry::GetSerializedSize() { +uint32_t LogEntry::GetSerializedSize() const { if (serializedSize == 0) { serializedSize = ComputeSerializedEntrySize(); } @@ -78,10 +78,10 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, //last_ae_time_ = std::chrono::steady_clock::now(); //last_heartbeat_time_ = std::chrono::steady_clock::now(); - auto sentinel = std::make_unique(); - sentinel->term = 0; - sentinel->command = "COMMON_PREFIX"; - log_.push_back(std::move(sentinel)); + LogEntry sentinel; + sentinel.term = 0; + sentinel.command = "COMMON_PREFIX"; + AddToLog(sentinel); inflightVecs_.resize(total_num_ + 1); for (auto& vec : inflightVecs_) { @@ -112,14 +112,14 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { return false; } // append new transaction to log - auto entry = std::make_unique(); - entry->term = currentTerm_; - if (!req->SerializeToString(&entry->command)) { + LogEntry entry; + entry.term = currentTerm_; + if (!req->SerializeToString(&entry.command)) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": req could not 
be serialized"; return false; } - entry->GetSerializedSize(); - log_.push_back(std::move(entry)); + entry.GetSerializedSize(); + AddToLog(std::move(entry)); @@ -181,7 +181,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { if (tr != TermRelation::STALE && role_ == Role::FOLLOWER) { uint64_t i = ae->prevlogindex(); - if (i < static_cast(log_.size()) && ae->prevlogterm() == log_[i]->term) { + if (i < static_cast(log_.size()) && ae->prevlogterm() == log_[i].term) { success = true; } } @@ -199,10 +199,10 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { // if conflict, delete suffix and short circuit out of loop while (logIdx < log_.size() && entriesIdx < entriesSize) { uint64_t term = ae->entries(entriesIdx).term(); - if (term != log_[logIdx]->term) { + if (term != log_[logIdx].term) { auto first = log_.begin() + logIdx; auto last = log_.begin() + lastLogIndex_ + 1; - log_.erase(first, last); + TruncateLog(first, last); lastLogIndex_ = log_.size() - 1; if (replicationLoggingFlag_) { @@ -219,7 +219,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { const auto appendSize = entriesSize - entriesIdx; log_.reserve(log_.size() + appendSize); for (uint64_t i = entriesIdx; i < entriesSize; ++i) { - log_.emplace_back(std::make_unique(CreateLogEntry(ae->entries(i)))); + AddToLog(CreateLogEntry(ae->entries(i))); } // update lastLogIndex after appends uint64_t firstAppendIdx = lastLogIndex_ + 1; @@ -337,7 +337,7 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a std::sort(sorted.begin(), sorted.end(), std::greater()); uint64_t lastReplicatedIndex = sorted[quorum_ - 1]; // Need to check the lastReplicatedIndex contains entry from current term - if (lastReplicatedIndex > commitIndex_ && log_[lastReplicatedIndex]->term == currentTerm_) { + if (lastReplicatedIndex > commitIndex_ && log_[lastReplicatedIndex].term == currentTerm_) { LOG(INFO) << "JIM -> " << parent_fn << ": Raised commitIndex_ from " << commitIndex_ << " to " << lastReplicatedIndex; 
commitIndex_ = lastReplicatedIndex; @@ -415,7 +415,7 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { } validCandidate = true; if (votedFor_ == -1 || votedFor_ == rvSender) { - votedFor_ = rvSender; + SetVotedFor(rvSender); voteGranted = true; } }(); @@ -522,8 +522,8 @@ void Raft::StartElection() { roleChanged = true; } heartBeatsSentThisTerm_ = 0; - currentTerm_++; - votedFor_ = id_; + SetCurrentTerm(currentTerm_ + 1, false); + SetVotedFor(id_); votes_.clear(); votes_.push_back(id_); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": I voted for myself. Votes: " @@ -616,8 +616,8 @@ void Raft::SendHeartBeat() { // returns true if demoted bool Raft::DemoteSelfLocked(uint64_t term) { if (term > currentTerm_) { - currentTerm_ = term; - votedFor_ = -1; + SetCurrentTerm(term, false); + SetVotedFor(-1); } if (role_ != Role::FOLLOWER) { SetRole(Role::FOLLOWER); @@ -642,7 +642,7 @@ TermRelation Raft::TermCheckLocked(uint64_t term) const { // requires raft mutex to be held uint64_t Raft::getLastLogTermLocked() const { - return log_[lastLogIndex_]->term; + return log_[lastLogIndex_].term; } // requires raft mutex to be held @@ -653,7 +653,7 @@ std::vector> Raft::PrepareCommitLocked() { while (lastApplied_ < commitIndex_) { ++lastApplied_; auto command = std::make_unique(); - if (!command->ParseFromString(log_[lastApplied_]->command)) { + if (!command->ParseFromString(log_[lastApplied_].command)) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Failed to parse command"; continue; } @@ -681,7 +681,7 @@ AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { fields.leaderId = id_; fields.leaderCommit = commitIndex_; fields.prevLogIndex = nextIndex_[followerId] - 1; - fields.prevLogTerm = log_[fields.prevLogIndex]->term; + fields.prevLogTerm = log_[fields.prevLogIndex].term; fields.followerId = followerId; if (heartBeat) { return fields; @@ -690,14 +690,14 @@ AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { const uint64_t firstNew = 
nextIndex_[followerId]; const uint64_t limit = std::min(lastLogIndex_, (firstNew + maxEntries) - 1); for (uint64_t i = firstNew; i <= limit; ++i) { - msgBytes += log_[i]->GetSerializedSize(); + msgBytes += log_[i].GetSerializedSize(); // Always include at least 1 entry, after that limit by maxBytes. if (i != firstNew && msgBytes >= maxBytes) { break; } LogEntry entry; - entry.term = log_[i]->term; - entry.command = log_[i]->command; + entry.term = log_[i].term; + entry.command = log_[i].command; fields.entries.push_back(entry); } return fields; @@ -831,6 +831,24 @@ bool Raft::InFlightPerFollowerLimitReachedLocked(int followerId) const { return size == maxInFlightPerFollower; } +void Raft::SetCurrentTerm(uint64_t currentTerm, bool writeMetadata) { + currentTerm_ = currentTerm; +} + +void Raft::SetVotedFor(int votedFor, bool writeMetadata) { + votedFor_ = votedFor; +} + +void Raft::AddToLog(LogEntry logEntryToAdd, bool writeMetadata) { + log_.push_back(logEntryToAdd); +} + +void Raft::TruncateLog(std::vector::iterator first, + std::vector::iterator last, + bool writeMetadata) { + log_.erase(first, last); +} + void Raft::PrintDebugState() const { std::lock_guard lk(mutex_); @@ -840,8 +858,8 @@ void Raft::PrintDebugState() const { LOG(INFO) << "log_ (size " << log_.size() << "): ["; for (size_t i = 0; i < log_.size(); ++i) { - LOG(INFO) << "{term: " << log_[i]->term - << ", cmd_size: " << log_[i]->command.size() << "}"; + LOG(INFO) << "{term: " << log_[i].term + << ", cmd_size: " << log_[i].command.size() << "}"; if (i + 1 != log_.size()) LOG(INFO) << ", "; } LOG(INFO) << "]\n"; diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 65e22e5df1..22b016e649 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -50,11 +50,11 @@ class LogEntry { uint64_t term; std::string command; - uint32_t GetSerializedSize(); + uint32_t 
GetSerializedSize() const; uint32_t ComputeSerializedEntrySize() const; private: - uint32_t serializedSize = 0; + mutable uint32_t serializedSize = 0; }; struct AeFields { @@ -81,7 +81,7 @@ struct RaftStatePatch { std::optional lastApplied; std::optional role; - std::optional>> log; + std::optional> log; std::optional> nextIndex; std::optional> matchIndex; std::optional> votes; @@ -112,6 +112,12 @@ class Raft : public common::ProtocolBase { virtual Role GetRoleSnapshot() const; virtual void SetRole(Role role); virtual void PrintDebugState() const; + virtual void SetCurrentTerm(uint64_t currentTerm, bool writeMetadata = true); + virtual void SetVotedFor(int votedFor, bool writeMetadata = true); + void AddToLog(LogEntry logEntry, bool writeMetadata = true); + void TruncateLog(std::vector::iterator first, + std::vector::iterator last, + bool writeMetadata = true); private: mutable std::mutex mutex_; @@ -140,7 +146,7 @@ class Raft : public common::ProtocolBase { // Persistent state on all servers: uint64_t currentTerm_; // Protected by mutex_ int votedFor_; // Protected by mutex_ - std::vector> log_; // Protected by mutex_ + std::vector log_; // Protected by mutex_ // Volatile state on leaders: std::vector nextIndex_; // Protected by mutex_ @@ -185,7 +191,7 @@ class Raft : public common::ProtocolBase { if (patch.role) role_ = *patch.role; if (patch.log) { - log_ = std::move(*patch.log); + log_ = *patch.log; lastLogIndex_ = log_.empty() ? 
0 : log_.size() - 1; } @@ -204,7 +210,7 @@ class Raft : public common::ProtocolBase { return votedFor_; } - const std::vector>& GetLog() const { + const std::vector& GetLog() const { std::lock_guard lock(mutex_); return log_; } @@ -214,14 +220,10 @@ class Raft : public common::ProtocolBase { for (size_t i = 0; i < log_.size(); ++i) { const auto& entry = log_[i]; - if (!entry) { - os << " [" << i << "] \n"; - continue; - } os << " [" << i << "] " - << "term=" << entry->term << ", command=\"" << entry->command << "\"" - << ", serializedSize=" << entry->GetSerializedSize() << "\n"; + << "term=" << entry.term << ", command=\"" << entry.command << "\"" + << ", serializedSize=" << entry.GetSerializedSize() << "\n"; } } diff --git a/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp b/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp index d95fed0ebd..e195fab9f4 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp @@ -315,15 +315,15 @@ TEST_F(RaftTest, FollowerAddsAppendEntriesAndTruncatesLog) { std::make_unique(std::move(aemessage))); const auto& raft_log = raft_->GetLog(); - EXPECT_EQ(raft_log[0]->term, 0); - EXPECT_EQ(raft_log[0]->command, "COMMON_PREFIX"); - EXPECT_EQ(raft_log[1]->term, 0); + EXPECT_EQ(raft_log[0].term, 0); + EXPECT_EQ(raft_log[0].command, "COMMON_PREFIX"); + EXPECT_EQ(raft_log[1].term, 0); // TODO: Use serialized string instead of manually doing it. 
- EXPECT_EQ(raft_log[1]->command, "\n\x14Term 0 Transaction 1"); - EXPECT_EQ(raft_log[2]->term, 1); - EXPECT_EQ(raft_log[2]->command, "\n\x14Term 1 Transaction 1"); - EXPECT_EQ(raft_log[3]->term, 1); - EXPECT_EQ(raft_log[3]->command, "\n\x14Term 1 Transaction 2"); + EXPECT_EQ(raft_log[1].command, "\n\x14Term 0 Transaction 1"); + EXPECT_EQ(raft_log[2].term, 1); + EXPECT_EQ(raft_log[2].command, "\n\x14Term 1 Transaction 1"); + EXPECT_EQ(raft_log[3].term, 1); + EXPECT_EQ(raft_log[3].command, "\n\x14Term 1 Transaction 2"); EXPECT_TRUE(success); } diff --git a/platform/consensus/ordering/raft/algorithm/raft_tests.h b/platform/consensus/ordering/raft/algorithm/raft_tests.h index 6e5c6879c0..49cd2e499d 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_tests.h +++ b/platform/consensus/ordering/raft/algorithm/raft_tests.h @@ -69,7 +69,7 @@ class RaftTest : public ::testing::Test { AeFields CreateAeFields(uint64_t term, int leaderId, uint64_t prevLogIndex, uint64_t prevLogTerm, - const std::vector>& entries, + const std::vector& entries, uint64_t leaderCommit, int followerId) { AeFields fields{}; fields.term = term; @@ -81,8 +81,8 @@ class RaftTest : public ::testing::Test { for (const auto& logEntry : entries) { LogEntry entry; - entry.term = logEntry->term; - entry.command = logEntry->command; + entry.term = logEntry.term; + entry.command = logEntry.command; fields.entries.push_back(std::move(entry)); } @@ -90,38 +90,41 @@ class RaftTest : public ::testing::Test { }; // Helper to create a single log entry. - std::unique_ptr CreateLogEntry(uint64_t term, + LogEntry CreateLogEntry(uint64_t term, const std::string& command_data) { - auto entry = std::make_unique(); - entry->term = term; - entry->command = command_data; + LogEntry entry; + entry.term = term; + entry.command = command_data; return entry; } // Helper to create a vector of log entries for testing. 
- std::vector> CreateLogEntries( + std::vector CreateLogEntries( const std::vector>& term_and_cmds, bool usedForLogPatch = false) { - std::vector> entries; + std::vector entries; if (usedForLogPatch) { - std::unique_ptr first_entry = std::make_unique(); - first_entry->term = 0; - first_entry->command = "COMMON_PREFIX"; - entries.push_back(std::move(first_entry)); + LogEntry first_entry; + first_entry.term = 0; + first_entry.command = "COMMON_PREFIX"; + entries.push_back(first_entry); } for (const auto& [term, cmd] : term_and_cmds) { - std::unique_ptr entry = std::make_unique(); - entry->term = term; + LogEntry entry; + entry.term = term; ClientTestRequest req; req.set_value(cmd); + std::string serialized; req.SerializeToString(&serialized); - entry->command = serialized; - entries.push_back(std::move(entry)); + entry.command = serialized; + + entries.push_back(entry); } + return entries; } From 830fe9b2bd7a773857888d77e6357d4ff7b94a9e Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 4 Mar 2026 15:05:36 -0800 Subject: [PATCH 52/66] Change LogEntry to store Entry directly --- .../ordering/raft/algorithm/raft.cpp | 48 +++++++++---------- .../consensus/ordering/raft/algorithm/raft.h | 3 +- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 7883f88336..4da9bd8278 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -51,9 +51,6 @@ uint32_t LogEntry::GetSerializedSize() const { } uint32_t LogEntry::ComputeSerializedEntrySize() const { - Entry entry; - entry.set_term(term); - entry.set_command(command); return entry.ByteSizeLong(); } @@ -79,8 +76,8 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, //last_heartbeat_time_ = std::chrono::steady_clock::now(); LogEntry sentinel; - sentinel.term = 0; - sentinel.command = "COMMON_PREFIX"; + 
sentinel.entry.set_term(0); + sentinel.entry.set_command("COMMON_PREFIX"); AddToLog(sentinel); inflightVecs_.resize(total_num_ + 1); @@ -112,14 +109,18 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { return false; } // append new transaction to log - LogEntry entry; - entry.term = currentTerm_; - if (!req->SerializeToString(&entry.command)) { + LogEntry logEntry; + logEntry.entry.set_term(currentTerm_); + + std::string serialized; + if (!req->SerializeToString(&serialized)) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": req could not be serialized"; return false; } - entry.GetSerializedSize(); - AddToLog(std::move(entry)); + + logEntry.entry.set_command(std::move(serialized)); + logEntry.GetSerializedSize(); + AddToLog(logEntry); @@ -181,7 +182,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { if (tr != TermRelation::STALE && role_ == Role::FOLLOWER) { uint64_t i = ae->prevlogindex(); - if (i < static_cast(log_.size()) && ae->prevlogterm() == log_[i].term) { + if (i < static_cast(log_.size()) && ae->prevlogterm() == log_[i].entry.term()) { success = true; } } @@ -199,7 +200,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { // if conflict, delete suffix and short circuit out of loop while (logIdx < log_.size() && entriesIdx < entriesSize) { uint64_t term = ae->entries(entriesIdx).term(); - if (term != log_[logIdx].term) { + if (term != log_[logIdx].entry.term()) { auto first = log_.begin() + logIdx; auto last = log_.begin() + lastLogIndex_ + 1; TruncateLog(first, last); @@ -337,7 +338,7 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a std::sort(sorted.begin(), sorted.end(), std::greater()); uint64_t lastReplicatedIndex = sorted[quorum_ - 1]; // Need to check the lastReplicatedIndex contains entry from current term - if (lastReplicatedIndex > commitIndex_ && log_[lastReplicatedIndex].term == currentTerm_) { + if (lastReplicatedIndex > commitIndex_ && log_[lastReplicatedIndex].entry.term() == currentTerm_) { LOG(INFO) << "JIM -> " 
<< parent_fn << ": Raised commitIndex_ from " << commitIndex_ << " to " << lastReplicatedIndex; commitIndex_ = lastReplicatedIndex; @@ -642,7 +643,7 @@ TermRelation Raft::TermCheckLocked(uint64_t term) const { // requires raft mutex to be held uint64_t Raft::getLastLogTermLocked() const { - return log_[lastLogIndex_].term; + return log_[lastLogIndex_].entry.term(); } // requires raft mutex to be held @@ -653,7 +654,7 @@ std::vector> Raft::PrepareCommitLocked() { while (lastApplied_ < commitIndex_) { ++lastApplied_; auto command = std::make_unique(); - if (!command->ParseFromString(log_[lastApplied_].command)) { + if (!command->ParseFromString(log_[lastApplied_].entry.command())) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Failed to parse command"; continue; } @@ -681,7 +682,7 @@ AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { fields.leaderId = id_; fields.leaderCommit = commitIndex_; fields.prevLogIndex = nextIndex_[followerId] - 1; - fields.prevLogTerm = log_[fields.prevLogIndex].term; + fields.prevLogTerm = log_[fields.prevLogIndex].entry.term(); fields.followerId = followerId; if (heartBeat) { return fields; @@ -696,8 +697,7 @@ AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { break; } LogEntry entry; - entry.term = log_[i].term; - entry.command = log_[i].command; + entry.entry = log_[i].entry; fields.entries.push_back(entry); } return fields; @@ -733,9 +733,9 @@ void Raft::CreateAndSendAppendEntryMsg(const AeFields& fields) { ae.set_prevlogterm(fields.prevLogTerm); ae.set_leadercommitindex(fields.leaderCommit); for (const auto& entry : fields.entries) { - auto* newEntry = ae.add_entries(); - newEntry->set_term(entry.term); - newEntry->set_command(entry.command); + Entry* newEntry = ae.add_entries(); + newEntry->set_term(entry.entry.term()); + newEntry->set_command(entry.entry.command()); } SendMessage(MessageType::AppendEntriesMsg, ae, followerId); if (replicationLoggingFlag_) { @@ -746,8 +746,7 @@ void 
Raft::CreateAndSendAppendEntryMsg(const AeFields& fields) { LogEntry Raft::CreateLogEntry(const Entry& entry) const { LogEntry newEntry; - newEntry.term = entry.term(); - newEntry.command = entry.command(); + newEntry.entry = entry; return newEntry; } @@ -858,8 +857,7 @@ void Raft::PrintDebugState() const { LOG(INFO) << "log_ (size " << log_.size() << "): ["; for (size_t i = 0; i < log_.size(); ++i) { - LOG(INFO) << "{term: " << log_[i].term - << ", cmd_size: " << log_[i].command.size() << "}"; + LOG(INFO) << "{term: " << log_[i].entry.term(); if (i + 1 != log_.size()) LOG(INFO) << ", "; } LOG(INFO) << "]\n"; diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 22b016e649..15d5611b07 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -47,8 +47,7 @@ enum class TermRelation { STALE, CURRENT, NEW }; class LogEntry { public: - uint64_t term; - std::string command; + Entry entry; uint32_t GetSerializedSize() const; uint32_t ComputeSerializedEntrySize() const; From 723ef64a2cbbc7e7f68f4130c62bc7a09853b4e5 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Thu, 5 Mar 2026 11:49:42 -0800 Subject: [PATCH 53/66] WIP Add RaftRecovery class --- .../consensus/ordering/raft/algorithm/BUILD | 1 + .../ordering/raft/algorithm/raft.cpp | 25 +- .../consensus/ordering/raft/algorithm/raft.h | 7 +- .../consensus/ordering/raft/framework/BUILD | 49 +- .../raft/framework/checkpoint_manager.cpp | 558 ++++++++++++++++++ .../raft/framework/checkpoint_manager.h | 149 +++++ .../ordering/raft/framework/consensus.cpp | 29 +- .../ordering/raft/framework/consensus.h | 7 + .../ordering/raft/framework/raft_recovery.cpp | 323 ++++++++++ .../ordering/raft/framework/raft_recovery.h | 69 +++ .../raft/framework/transaction_utils.cpp | 41 ++ .../raft/framework/transaction_utils.h | 38 ++ platform/consensus/recovery/recovery.h | 23 +- 13 files changed, 1304 insertions(+), 
15 deletions(-) create mode 100644 platform/consensus/ordering/raft/framework/checkpoint_manager.cpp create mode 100644 platform/consensus/ordering/raft/framework/checkpoint_manager.h create mode 100644 platform/consensus/ordering/raft/framework/raft_recovery.cpp create mode 100644 platform/consensus/ordering/raft/framework/raft_recovery.h create mode 100644 platform/consensus/ordering/raft/framework/transaction_utils.cpp create mode 100644 platform/consensus/ordering/raft/framework/transaction_utils.h diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD index 4a980c9c54..ff1a7ac3f5 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -39,6 +39,7 @@ cc_library( "//platform/consensus/execution:system_info", "//platform/networkstrate:replica_communicator", "//platform/proto:viewchange_message_cc_proto", + "//platform/consensus/ordering/raft/framework:raft_recovery" ], ) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 4da9bd8278..f3769c6b2f 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -55,7 +55,7 @@ uint32_t LogEntry::ComputeSerializedEntrySize() const { } Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, - LeaderElectionManager* leaderelection_manager, ReplicaCommunicator* replica_communicator) + LeaderElectionManager* leaderelection_manager, ReplicaCommunicator* replica_communicator, RaftRecovery* recovery) : ProtocolBase(id, f, total_num), currentTerm_(0), votedFor_(-1), @@ -63,11 +63,13 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, commitIndex_(0), lastApplied_(0), role_(Role::FOLLOWER), + seqAfterCheckpoint_(0), is_stop_(false), quorum_((total_num/2) + 1), verifier_(verifier), leader_election_manager_(leaderelection_manager), - 
replica_communicator_(replica_communicator) { + replica_communicator_(replica_communicator), + recovery_(recovery) { id_ = id; total_num_ = total_num; @@ -523,6 +525,7 @@ void Raft::StartElection() { roleChanged = true; } heartBeatsSentThisTerm_ = 0; + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; SetCurrentTerm(currentTerm_ + 1, false); SetVotedFor(id_); votes_.clear(); @@ -617,6 +620,7 @@ void Raft::SendHeartBeat() { // returns true if demoted bool Raft::DemoteSelfLocked(uint64_t term) { if (term > currentTerm_) { + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; SetCurrentTerm(term, false); SetVotedFor(-1); } @@ -831,14 +835,31 @@ bool Raft::InFlightPerFollowerLimitReachedLocked(int followerId) const { } void Raft::SetCurrentTerm(uint64_t currentTerm, bool writeMetadata) { + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; currentTerm_ = currentTerm; + // if (writeMetadata) { + // recovery_->WriteMetadata(currentTerm_, votedFor_); + // } } void Raft::SetVotedFor(int votedFor, bool writeMetadata) { + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; votedFor_ = votedFor; + // if (writeMetadata) { + // recovery_->WriteMetadata(currentTerm_, votedFor_); + // } +} + +void Raft::SetSeqIndexCoveredBySnapshot(int seq) { + seqAfterCheckpoint_ = seq; } void Raft::AddToLog(LogEntry logEntryToAdd, bool writeMetadata) { + // Entry* entry; + // entry = &logEntryToAdd.entry; + // if (writeMetadata) { + // recovery_->AddLogEntry(entry); + // } log_.push_back(logEntryToAdd); } diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 15d5611b07..58971071ae 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -38,6 +38,7 @@ #include "platform/statistic/stats.h" 
#include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" #include "platform/networkstrate/replica_communicator.h" +#include "platform/consensus/ordering/raft/framework/raft_recovery.h" namespace resdb { namespace raft { @@ -92,7 +93,8 @@ class Raft : public common::ProtocolBase { Raft(int id, int f, int total_num, SignatureVerifier* verifier, LeaderElectionManager* leaderelection_manager, - ReplicaCommunicator* replica_communicator + ReplicaCommunicator* replica_communicator, + RaftRecovery* recovery ); ~Raft(); @@ -113,6 +115,7 @@ class Raft : public common::ProtocolBase { virtual void PrintDebugState() const; virtual void SetCurrentTerm(uint64_t currentTerm, bool writeMetadata = true); virtual void SetVotedFor(int votedFor, bool writeMetadata = true); + virtual void SetSeqIndexCoveredBySnapshot(int seq); void AddToLog(LogEntry logEntry, bool writeMetadata = true); void TruncateLog(std::vector::iterator first, std::vector::iterator last, @@ -162,6 +165,7 @@ class Raft : public common::ProtocolBase { std::vector> inflightVecs_; // Protected by mutex_ //std::chrono::steady_clock::time_point last_ae_time_; //std::chrono::steady_clock::time_point last_heartbeat_time_; // Protected by mutex_ + int seqAfterCheckpoint_; bool is_stop_; const uint64_t quorum_; @@ -177,6 +181,7 @@ class Raft : public common::ProtocolBase { LeaderElectionManager* leader_election_manager_; //Stats* global_stats_; ReplicaCommunicator* replica_communicator_; + RaftRecovery* recovery_; #ifdef RAFT_TEST_MODE public: diff --git a/platform/consensus/ordering/raft/framework/BUILD b/platform/consensus/ordering/raft/framework/BUILD index 03137c0500..0b6103b498 100644 --- a/platform/consensus/ordering/raft/framework/BUILD +++ b/platform/consensus/ordering/raft/framework/BUILD @@ -16,7 +16,52 @@ # under the License. 
# -package(default_visibility = ["//visibility:private"]) +package(default_visibility = ["//platform/consensus/ordering/raft:__subpackages__"]) + +cc_library( + name = "transaction_utils", + srcs = ["transaction_utils.cpp"], + hdrs = ["transaction_utils.h"], + deps = [ + "//platform/proto:resdb_cc_proto", + ], +) + +cc_library( + name = "checkpoint_manager", + srcs = ["checkpoint_manager.cpp"], + hdrs = ["checkpoint_manager.h"], + deps = [ + ":transaction_utils", + "//chain/state:chain_state", + "//common/crypto:signature_verifier", + "//interface/common:resdb_txn_accessor", + "//platform/config:resdb_config", + "//platform/consensus/checkpoint", + "//platform/consensus/execution:transaction_executor", + "//platform/networkstrate:replica_communicator", + "//platform/networkstrate:server_comm", + "//platform/proto:checkpoint_info_cc_proto", + ], +) + +cc_library( + name = "raft_recovery", + srcs = ["raft_recovery.cpp"], + hdrs = ["raft_recovery.h"], + deps = [ + "//chain/storage", + "//common/utils", + "//platform/consensus/ordering/raft/proto:proposal_cc_proto", + "//platform/config:resdb_config", + "//platform/consensus/checkpoint", + "//platform/consensus/execution:system_info", + "//platform/networkstrate:server_comm", + "//platform/proto:resdb_cc_proto", + "//platform/proto:system_info_data_cc_proto", + "//platform/consensus/recovery:recovery" + ], +) cc_library( name = "consensus", @@ -26,6 +71,8 @@ cc_library( "//visibility:public", ], deps = [ + ":checkpoint_manager", + ":raft_recovery", "//common/utils", "//platform/consensus/ordering/common/framework:consensus", "//platform/consensus/ordering/raft/algorithm:raft", diff --git a/platform/consensus/ordering/raft/framework/checkpoint_manager.cpp b/platform/consensus/ordering/raft/framework/checkpoint_manager.cpp new file mode 100644 index 0000000000..ca9bddeca7 --- /dev/null +++ b/platform/consensus/ordering/raft/framework/checkpoint_manager.cpp @@ -0,0 +1,558 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "platform/consensus/ordering/raft/framework/checkpoint_manager.h" + +#include + +#include "platform/consensus/ordering/raft/framework/transaction_utils.h" +#include "platform/proto/checkpoint_info.pb.h" + +namespace resdb { + +CheckPointManager::CheckPointManager(const ResDBConfig& config, + ReplicaCommunicator* replica_communicator, + SignatureVerifier* verifier, + SystemInfo* sys_info) + : config_(config), + replica_communicator_(replica_communicator), + verifier_(verifier), + stop_(false), + txn_accessor_(config), + highest_prepared_seq_(0), + sys_info_(sys_info) { + current_stable_seq_ = 0; + if (config_.GetConfigData().enable_viewchange()) { + config_.EnableCheckPoint(true); + } + if (config_.IsCheckPointEnabled()) { + stable_checkpoint_thread_ = + std::thread(&CheckPointManager::UpdateStableCheckPointStatus, this); + checkpoint_thread_ = + std::thread(&CheckPointManager::UpdateCheckPointStatus, this); + status_thread_ = std::thread(&CheckPointManager::SyncStatus, this); + } + sem_init(&committable_seq_signal_, 0, 0); +} + +CheckPointManager::~CheckPointManager() { Stop(); } + +void CheckPointManager::Stop() { + stop_ = true; + if (checkpoint_thread_.joinable()) { + 
checkpoint_thread_.join(); + } + if (stable_checkpoint_thread_.joinable()) { + stable_checkpoint_thread_.join(); + } + if (status_thread_.joinable()) { + status_thread_.join(); + } +} + +void CheckPointManager::SetResetExecute( + std::function func) { + reset_execute_func_ = func; +} + +std::string GetHash(const std::string& h1, const std::string& h2) { + return SignatureVerifier::CalculateHash(h1 + h2); +} + +uint64_t CheckPointManager::GetStableCheckpoint() { + std::lock_guard lk(mutex_); + return current_stable_seq_; +} + +StableCheckPoint CheckPointManager::GetStableCheckpointWithVotes() { + std::lock_guard lk(mutex_); + return stable_ckpt_; +} + +void CheckPointManager::AddCommitData(std::unique_ptr request) { + if (config_.IsCheckPointEnabled()) { + data_queue_.Push(std::move(request)); + } +} + +// check whether there are 2f+1 valid checkpoint proof. +bool CheckPointManager::IsValidCheckpointProof( + const StableCheckPoint& stable_ckpt) { + std::string hash = stable_ckpt_.hash(); + std::set senders; + for (const auto& signature : stable_ckpt_.signatures()) { + if (!verifier_->VerifyMessage(hash, signature)) { + return false; + } + senders.insert(signature.node_id()); + } + + return (static_cast(senders.size()) >= config_.GetMinDataReceiveNum()) || + (stable_ckpt.seq() == 0 && senders.size() == 0); +} + +int CheckPointManager::ProcessCheckPoint(std::unique_ptr context, + std::unique_ptr request) { + CheckPointData checkpoint_data; + if (!checkpoint_data.ParseFromString(request->data())) { + LOG(ERROR) << "parse checkpont data fail:"; + return -2; + } + uint64_t checkpoint_seq = checkpoint_data.seq(); + uint32_t sender_id = request->sender_id(); + LOG(ERROR) << " receive ckpt:" << checkpoint_seq << " from:" << sender_id; + int water_mark = config_.GetCheckPointWaterMark(); + if (checkpoint_seq % water_mark) { + LOG(ERROR) << "checkpoint seq not invalid:" << checkpoint_seq; + return -2; + } + + if (verifier_) { + // check signatures + bool valid = 
verifier_->VerifyMessage(checkpoint_data.hash(), + checkpoint_data.hash_signature()); + if (!valid) { + LOG(ERROR) << "request is not valid:" + << checkpoint_data.hash_signature().DebugString(); + return -2; + } + } + + { + std::lock_guard lk(mutex_); + auto res = + sender_ckpt_[std::make_pair(checkpoint_seq, checkpoint_data.hash())] + .insert(sender_id); + if (res.second) { + sign_ckpt_[std::make_pair(checkpoint_seq, checkpoint_data.hash())] + .push_back(checkpoint_data.hash_signature()); + new_data_++; + } + if (sender_ckpt_[std::make_pair(checkpoint_seq, checkpoint_data.hash())] + .size() == 1) { + for (auto& hash_ : checkpoint_data.hashs()) { + hash_ckpt_[std::make_pair(checkpoint_seq, checkpoint_data.hash())] + .push_back(hash_); + } + } + Notify(); + } + return 0; +} + +void CheckPointManager::Notify() { + std::lock_guard lk(cv_mutex_); + cv_.notify_all(); +} + +bool CheckPointManager::Wait() { + int timeout_ms = 1000; + std::unique_lock lk(cv_mutex_); + return cv_.wait_for(lk, std::chrono::milliseconds(timeout_ms), + [&] { return new_data_ > 0; }); +} + +void CheckPointManager::CheckHealthy() { + uint32_t current_time = time(nullptr); + + std::map seqs; + + for (int i = 1; i <= config_.GetReplicaNum(); ++i) { + if (last_update_time_.find(i) == last_update_time_.end() || + last_update_time_[i] == 0) { + continue; + } + LOG(ERROR) << " check healthy, replica:" << i + << " current time:" << current_time + << " last time:" << last_update_time_[i] + << " timeout:" << replica_timeout_ + << " pass:" << current_time - last_update_time_[i]; + if (current_time - last_update_time_[i] > replica_timeout_) { + TimeoutHandler(i); + } + seqs[status_[i]]++; + } + + uint64_t unstable_check_ckpt = 0; + for (auto it : seqs) { + int num = 0; + for (auto sit : seqs) { + if (sit.first < it.first) { + continue; + } + num += sit.second; + } + if (num >= config_.GetMinDataReceiveNum()) { + unstable_check_ckpt = std::max(unstable_check_ckpt, it.first); + } + } + 
SetUnstableCkpt(unstable_check_ckpt); +} + +void CheckPointManager::UpdateStableCheckPointStatus() { + uint64_t last_committable_seq = 0; + while (!stop_) { + if (!Wait()) { + continue; + } + uint64_t stable_seq = 0; + std::string stable_hash; + { + std::lock_guard lk(mutex_); + for (auto it : sender_ckpt_) { + if (it.second.size() >= + static_cast(config_.GetMinCheckpointReceiveNum())) { + committable_seq_ = it.first.first; + committable_hash_ = it.first.second; + std::set senders_ = + sender_ckpt_[std::make_pair(committable_seq_, committable_hash_)]; + sem_post(&committable_seq_signal_); + } + if (it.second.size() >= + static_cast(config_.GetMinDataReceiveNum())) { + stable_seq = it.first.first; + stable_hash = it.first.second; + } + } + new_data_ = 0; + } + + LOG(ERROR) << "current stable seq:" << current_stable_seq_ + << " stable seq:" << stable_seq; + if (stable_seq == 0) { + continue; + } + std::vector votes; + if (current_stable_seq_ < stable_seq) { + std::lock_guard lk(mutex_); + votes = sign_ckpt_[std::make_pair(stable_seq, stable_hash)]; + std::set senders_ = + sender_ckpt_[std::make_pair(stable_seq, stable_hash)]; + + auto it = sender_ckpt_.begin(); + while (it != sender_ckpt_.end()) { + if (it->first.first <= stable_seq) { + sign_ckpt_.erase(sign_ckpt_.find(it->first)); + auto tmp = it++; + sender_ckpt_.erase(tmp); + } else { + it++; + } + } + stable_ckpt_.set_seq(stable_seq); + stable_ckpt_.set_hash(stable_hash); + stable_ckpt_.mutable_signatures()->Clear(); + for (auto vote : votes) { + *stable_ckpt_.add_signatures() = vote; + } + current_stable_seq_ = stable_seq; + } + UpdateStableCheckPointCallback(current_stable_seq_); + } +} + +void CheckPointManager::SetTimeoutHandler( + std::function timeout_handler) { + timeout_handler_ = timeout_handler; +} + +void CheckPointManager::TimeoutHandler() { + if (timeout_handler_) { + timeout_handler_(0); + } +} + +void CheckPointManager::TimeoutHandler(uint32_t replica) { + if (timeout_handler_) { + 
timeout_handler_(replica); + } +} + +void CheckPointManager::SetLastCommit(uint64_t seq) { + LOG(ERROR) << " set last commit:" << seq; + last_seq_ = seq; + std::lock_guard lk(lt_mutex_); + committed_status_.clear(); +} + +uint64_t CheckPointManager::GetLastCommit() { return last_seq_; } + +int CheckPointManager::ProcessStatusSync(std::unique_ptr context, + std::unique_ptr request) { + CheckPointData checkpoint_data; + if (!checkpoint_data.ParseFromString(request->data())) { + LOG(ERROR) << "parse checkpont data fail:"; + return -2; + } + uint64_t seq = checkpoint_data.seq(); + uint32_t sender_id = request->sender_id(); + uint32_t primary_id = checkpoint_data.primary_id(); + uint32_t view = checkpoint_data.view(); + + status_[sender_id] = seq; + last_update_time_[sender_id] = time(nullptr); + view_status_[sender_id] = std::make_pair(primary_id, view); + LOG(ERROR) << " received from :" << sender_id << " commit status:" << seq + << " primary:" << primary_id << " view:" << view; + return 0; +} + +void CheckPointManager::CheckStatus(uint64_t last_seq) { + std::vector seqs; + for (auto it : status_) { + seqs.push_back(it.second); + } + + sort(seqs.begin(), seqs.end()); + int f = config_.GetMaxMaliciousReplicaNum(); + + if (seqs.size() <= f + 1) { + return; + } + // uint64_t min_seq = seqs[f + 1]; + uint64_t min_seq = seqs.back(); + + LOG(ERROR) << " check last seq:" << last_seq << " max seq:" << min_seq; + if (last_seq < min_seq) { + // need recovery from others + reset_execute_func_(last_seq + 1); + BroadcastRecovery(last_seq + 1, std::min(min_seq, last_seq + 500)); + } +} + +void CheckPointManager::CheckSysStatus() { + int f = config_.GetMaxMaliciousReplicaNum(); + + std::map, int> views; + int current_primary = 0; + uint64_t current_view = 0; + for (auto it : view_status_) { + views[it.second]++; + if (views[it.second] >= 2 * f + 1) { + current_primary = it.second.first; + current_view = it.second.second; + } + } + + if (current_primary > 0 && current_primary != 
sys_info_->GetPrimaryId() && + current_view > sys_info_->GetCurrentView()) { + sys_info_->SetCurrentView(current_view); + sys_info_->SetPrimary(current_primary); + LOG(ERROR) << " change to primary:" << current_primary + << " view:" << current_view; + } +} + +void CheckPointManager::SyncStatus() { + uint64_t last_check_seq = 0; + uint64_t last_time = 0; + while (!stop_) { + uint64_t last_seq = last_seq_; + + CheckPointData checkpoint_data; + std::unique_ptr checkpoint_request = NewRequest( + Request::TYPE_STATUS_SYNC, Request(), config_.GetSelfInfo().id()); + checkpoint_data.set_seq(last_seq); + checkpoint_data.set_view(sys_info_->GetCurrentView()); + checkpoint_data.set_primary_id(sys_info_->GetPrimaryId()); + checkpoint_data.SerializeToString(checkpoint_request->mutable_data()); + replica_communicator_->BroadCast(*checkpoint_request); + + LOG(ERROR) << " sync status last seq:" << last_seq + << " last time:" << last_time + << " primary:" << sys_info_->GetPrimaryId() + << " view:" << sys_info_->GetCurrentView(); + if (last_check_seq == last_seq && last_time > 5) { + CheckStatus(last_seq); + last_time = 0; + } + CheckSysStatus(); + + if (last_seq != last_check_seq) { + last_check_seq = last_seq; + last_time = 0; + } + CheckHealthy(); + sleep(10); + last_time++; + } +} + +void CheckPointManager::UpdateCheckPointStatus() { + uint64_t last_ckpt_seq = 0; + int water_mark = config_.GetCheckPointWaterMark(); + int timeout_ms = config_.GetViewchangeCommitTimeout(); + std::vector stable_hashs; + std::vector stable_seqs; + std::map> pendings; + while (!stop_) { + std::unique_ptr request = nullptr; + if (!pendings.empty()) { + LOG(ERROR) << " last seq:" << last_seq_ + << " pending:" << pendings.begin()->second->seq(); + if (pendings.begin()->second->seq() == last_seq_ + 1) { + request = std::move(pendings.begin()->second); + pendings.erase(pendings.begin()); + } + } + if (request == nullptr) { + request = data_queue_.Pop(timeout_ms); + } + if (request == nullptr) { + 
continue; + } + std::string hash_ = request->hash(); + uint64_t current_seq = request->seq(); + LOG(ERROR) << "update checkpoint seq :" << last_seq_ + << " current:" << current_seq; + if (current_seq != last_seq_ + 1) { + LOG(ERROR) << "seq invalid:" << last_seq_ << " current:" << current_seq; + if (current_seq > last_seq_ + 1) { + pendings[current_seq] = std::move(request); + } + continue; + } + { + std::lock_guard lk(lt_mutex_); + last_hash_ = GetHash(last_hash_, request->hash()); + last_seq_++; + } + bool is_recovery = request->is_recovery(); + + LOG(ERROR) << " current seq:" << current_seq << " water mark:" << water_mark + << " current stable seq:" << current_stable_seq_; + if (current_seq > 0 && current_seq % water_mark == 0) { + last_ckpt_seq = current_seq; + BroadcastCheckPoint(last_ckpt_seq, last_hash_, stable_hashs, stable_seqs); + } + ClearCommittedStatus(current_seq); + } + return; +} + +void CheckPointManager::BroadcastCheckPoint( + uint64_t seq, const std::string& hash, + const std::vector& stable_hashs, + const std::vector& stable_seqs) { + CheckPointData checkpoint_data; + std::unique_ptr checkpoint_request = NewRequest( + Request::TYPE_CHECKPOINT, Request(), config_.GetSelfInfo().id()); + checkpoint_data.set_seq(seq); + checkpoint_data.set_hash(hash); + if (verifier_) { + auto signature_or = verifier_->SignMessage(hash); + if (!signature_or.ok()) { + LOG(ERROR) << "Sign message fail"; + return; + } + *checkpoint_data.mutable_hash_signature() = *signature_or; + } + + checkpoint_data.SerializeToString(checkpoint_request->mutable_data()); + replica_communicator_->BroadCast(*checkpoint_request); +} + +void CheckPointManager::BroadcastRecovery(uint64_t min_seq, uint64_t max_seq) { + RecoveryRequest recovery_data; + std::unique_ptr recovery_request = NewRequest( + Request::TYPE_RECOVERY_DATA, Request(), config_.GetSelfInfo().id()); + recovery_data.set_min_seq(min_seq); + recovery_data.set_max_seq(max_seq); + 
recovery_data.SerializeToString(recovery_request->mutable_data()); + + LOG(ERROR) << " recovery request [" << min_seq << "," << max_seq << "]"; + replica_communicator_->BroadCast(*recovery_request); +} + +void CheckPointManager::WaitSignal() { + std::unique_lock lk(mutex_); + signal_.wait(lk, [&] { return !stable_hash_queue_.Empty(); }); +} + +std::unique_ptr> +CheckPointManager::PopStableSeqHash() { + return stable_hash_queue_.Pop(); +} + +uint64_t CheckPointManager::GetHighestPreparedSeq() { + std::lock_guard lk(lt_mutex_); + LOG(ERROR) << "get high prepared seq:" << highest_prepared_seq_; + return highest_prepared_seq_; +} + +void CheckPointManager::SetHighestPreparedSeq(uint64_t seq) { + LOG(ERROR) << "set high prepared seq:" << seq; + std::lock_guard lk(lt_mutex_); + highest_prepared_seq_ = seq; +} + +sem_t* CheckPointManager::CommitableSeqSignal() { + std::lock_guard lk(lt_mutex_); + return &committable_seq_signal_; +} + +uint64_t CheckPointManager::GetCommittableSeq() { + std::lock_guard lk(lt_mutex_); + return committable_seq_; +} + +void CheckPointManager::SetUnstableCkpt(uint64_t unstable_check_ckpt) { + LOG(ERROR) << " set unstable ckpt:" << unstable_check_ckpt; + { + std::lock_guard lk(lt_mutex_); + unstable_check_ckpt_ = unstable_check_ckpt; + } +} + +uint64_t CheckPointManager::GetUnstableCkpt() { + std::lock_guard lk(lt_mutex_); + LOG(ERROR) << " get unstable ckpt:" << unstable_check_ckpt_; + return unstable_check_ckpt_; +} + +void CheckPointManager::AddCommitState(uint64_t seq) { + LOG(ERROR) << " add commited state:" << seq; + std::lock_guard lk(lt_mutex_); + committed_status_[seq] = true; +} + +bool CheckPointManager::IsCommitted(uint64_t seq) { + std::lock_guard lk(lt_mutex_); + if (seq < last_seq_) { + return true; + } + return committed_status_.find(seq) != committed_status_.end(); +} + +void CheckPointManager::ClearCommittedStatus(uint64_t seq) { + std::lock_guard lk(lt_mutex_); + while (!committed_status_.empty()) { + if 
(committed_status_.begin()->first <= seq) { + committed_status_.erase(committed_status_.begin()); + } else { + break; + } + } +} + +// void CheckPointManager::SetLastExecutedSeq(uint64_t latest_executed_seq){ +// latest_executed_seq = executor_->get_latest_executed_seq(); +// } + +} // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/checkpoint_manager.h b/platform/consensus/ordering/raft/framework/checkpoint_manager.h new file mode 100644 index 0000000000..8a91638574 --- /dev/null +++ b/platform/consensus/ordering/raft/framework/checkpoint_manager.h @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include "chain/state/chain_state.h" +#include "common/crypto/signature_verifier.h" +#include "interface/common/resdb_txn_accessor.h" +#include "platform/config/resdb_config.h" +#include "platform/consensus/checkpoint/checkpoint.h" +#include "platform/consensus/execution/transaction_executor.h" +#include "platform/networkstrate/replica_communicator.h" +#include "platform/networkstrate/server_comm.h" +#include "platform/proto/checkpoint_info.pb.h" +#include "platform/proto/resdb.pb.h" + +namespace resdb { + +class CheckPointManager : public CheckPoint { + public: + CheckPointManager(const ResDBConfig& config, + ReplicaCommunicator* replica_communicator, + SignatureVerifier* verifier, SystemInfo* sys_info); + virtual ~CheckPointManager(); + + void SetLastCommit(uint64_t seq); + uint64_t GetLastCommit(); + + void AddCommitData(std::unique_ptr request); + int ProcessCheckPoint(std::unique_ptr context, + std::unique_ptr request); + int ProcessStatusSync(std::unique_ptr context, + std::unique_ptr request); + + uint64_t GetStableCheckpoint() override; + // void SetLastExecutedSeq(uint64_t latest_executed_seq); + StableCheckPoint GetStableCheckpointWithVotes(); + bool IsValidCheckpointProof(const StableCheckPoint& stable_ckpt); + + void SetTimeoutHandler(std::function timeout_handler); + virtual void UpdateStableCheckPointCallback( + int64_t current_stable_checkpoint) {} + + void Stop(); + + void TimeoutHandler(); + void TimeoutHandler(uint32_t replica); + + void WaitSignal(); + std::unique_ptr> PopStableSeqHash(); + + void SetExecutor(TransactionExecutor* executor) { executor_ = executor; } + + uint64_t GetHighestPreparedSeq(); + + void SetHighestPreparedSeq(uint64_t seq); + + sem_t* CommitableSeqSignal(); + + uint64_t GetCommittableSeq(); + + void SetUnstableCkpt(uint64_t unstable_check_ckpt); + + uint64_t GetUnstableCkpt(); + + void AddCommitState(uint64_t seq); + + bool IsCommitted(uint64_t seq); + void ClearCommittedStatus(uint64_t 
seq); + + void SetResetExecute(std::function); + + private: + void UpdateCheckPointStatus(); + void UpdateStableCheckPointStatus(); + void BroadcastCheckPoint(uint64_t seq, const std::string& hash, + const std::vector& stable_hashs, + const std::vector& stable_seqs); + + void Notify(); + bool Wait(); + void BroadcastRecovery(uint64_t min_seq, uint64_t max_seq); + + void SyncStatus(); + void StatusProcess(); + void CheckStatus(uint64_t last_seq); + void CheckSysStatus(); + void CheckHealthy(); + + protected: + uint64_t last_executed_seq_ = 0; + ResDBConfig config_; + ReplicaCommunicator* replica_communicator_; + std::thread checkpoint_thread_, stable_checkpoint_thread_, status_thread_; + SignatureVerifier* verifier_; + std::atomic stop_; + std::map, std::set> sender_ckpt_; + std::map, std::vector> + sign_ckpt_; + std::map, std::vector> + hash_ckpt_; + std::atomic current_stable_seq_; + std::mutex mutex_; + LockFreeQueue data_queue_; + std::mutex cv_mutex_; + std::condition_variable cv_; + std::function timeout_handler_; + StableCheckPoint stable_ckpt_; + int new_data_ = 0; + LockFreeQueue> stable_hash_queue_; + std::condition_variable signal_; + ResDBTxnAccessor txn_accessor_; + std::mutex lt_mutex_, seq_mutex_; + uint64_t last_seq_ = 0; + uint64_t max_seq_ = 0; + TransactionExecutor* executor_; + std::atomic highest_prepared_seq_; + uint64_t committable_seq_ = 0; + std::string last_hash_, committable_hash_; + sem_t committable_seq_signal_; + std::map status_; + std::map last_update_time_; + int replica_timeout_ = 60; + uint64_t unstable_check_ckpt_; + std::map committed_status_; + std::function reset_execute_func_; + SystemInfo* sys_info_; + std::map> view_status_; +}; + +} // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 4ac40d49f8..956621040c 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ 
b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -32,7 +32,14 @@ namespace raft { Consensus::Consensus(const ResDBConfig& config, std::unique_ptr executor) : common::Consensus(config, std::move(executor)), - leader_election_manager_(std::make_unique(config_)) { + leader_election_manager_(std::make_unique(config_)), + system_info_(std::make_unique(config_)), + checkpoint_manager_(std::make_unique( + config_, GetBroadCastClient(), GetSignatureVerifier(), + system_info_.get())), + recovery_(std::make_unique(config_, checkpoint_manager_.get(), + system_info_.get(), + transaction_executor_->GetStorage())) { //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": In consensus constructor"; int total_replicas = config_.GetReplicaNum(); int f = (total_replicas - 1) / 3; @@ -45,10 +52,22 @@ Consensus::Consensus(const ResDBConfig& config, .type() != CertificateKeyInfo::CLIENT) { raft_ = std::make_unique(config_.GetSelfInfo().id(), f, total_replicas, GetSignatureVerifier(), leader_election_manager_.get(), - replica_communicator_); + replica_communicator_, recovery_.get()); leader_election_manager_->SetRaft(raft_.get()); leader_election_manager_->MayStart(); + + recovery_->ReadLogs( + [&](const RaftMetadata& metadata) { + LOG(ERROR) << " read current term: " << metadata.current_term + << " voted for: " << metadata.voted_for; + raft_->SetCurrentTerm(metadata.current_term, false); + raft_->SetVotedFor(metadata.voted_for, false); + }, + [&](std::unique_ptr request) { + return CommitMsg(std::move(request)); + }, + [&](int seq) { raft_->SetSeqIndexCoveredBySnapshot(seq); }); InitProtocol(raft_.get()); } @@ -107,6 +126,7 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { performance_manager_->SetPrimary(dtl->leaderid()); return 0; } + LOG(ERROR) << "Unknown message type"; return 0; } @@ -125,5 +145,10 @@ int Consensus::CommitMsg(const google::protobuf::Message& msg) { return 0; } +// int Consensus::CommitMsg(std::unique_ptr request) { +// 
transaction_executor_->Commit(std::move(request)); +// return 0; +// } + } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/consensus.h b/platform/consensus/ordering/raft/framework/consensus.h index 0b197e9751..89dc494139 100644 --- a/platform/consensus/ordering/raft/framework/consensus.h +++ b/platform/consensus/ordering/raft/framework/consensus.h @@ -25,6 +25,9 @@ #include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" #include "platform/networkstrate/consensus_manager.h" +#include "platform/consensus/ordering/raft/framework/checkpoint_manager.h" +#include "platform/consensus/ordering/raft/framework/raft_recovery.h" + namespace resdb { namespace raft { @@ -38,11 +41,15 @@ class Consensus : public common::Consensus { int ProcessCustomConsensus(std::unique_ptr request) override; int ProcessNewTransaction(std::unique_ptr request) override; int CommitMsg(const google::protobuf::Message& msg) override; + // int CommitMsg(const std::unique_ptr request); int CommitMsgInternal(const AppendEntries& txn); protected: std::unique_ptr raft_; std::unique_ptr leader_election_manager_; + std::unique_ptr system_info_; + std::unique_ptr checkpoint_manager_; + std::unique_ptr recovery_; }; } // namespace raft diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.cpp b/platform/consensus/ordering/raft/framework/raft_recovery.cpp new file mode 100644 index 0000000000..76bea4daff --- /dev/null +++ b/platform/consensus/ordering/raft/framework/raft_recovery.cpp @@ -0,0 +1,323 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "platform/consensus/ordering/raft/framework/raft_recovery.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "common/utils/utils.h" + +namespace resdb { +namespace raft { + +RaftRecovery::RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, SystemInfo* system_info, Storage* storage) + : Recovery(config, checkpoint, system_info, storage) { + LOG(INFO) << "Raft Recovery constructor"; + Init(); +}; + +void RaftRecovery::Init() { + // Recovery::Init(); + + meta_file_path_ = std::filesystem::path(base_file_path_).parent_path() + / "raft_metadata.dat"; + LOG(INFO) << "Meta file path: " << meta_file_path_; + OpenMetadataFile(); +} + +RaftRecovery::~RaftRecovery() { + if (recovery_enabled_ == false) { + return; + } + if (metadata_fd_ >= 0) { + close(metadata_fd_); + } +} + +void RaftRecovery::OpenMetadataFile() { + metadata_fd_ = open(meta_file_path_.c_str(), O_CREAT | O_RDWR, 0666); + if (metadata_fd_ < 0) { + LOG(ERROR) << "Failed to open metadata file: " << strerror(errno); + return; + } + + // Read existing metadata if it exists, otherwise defaults are used + metadata_ = ReadMetadata(); + LOG(INFO) << "Opened metadata file: term: " << metadata_.current_term + << " votedFor: " << metadata_.voted_for; +} + +void RaftRecovery::WriteMetadata(int64_t current_term, int32_t voted_for) { + if (metadata_fd_ < 0) { + LOG(ERROR) << "Metadata file not open"; + return; + } + + metadata_.current_term = current_term; + metadata_.voted_for = voted_for; + + lseek(metadata_fd_, 0, 
SEEK_SET); + write(metadata_fd_, &metadata_, sizeof(metadata_)); + fsync(metadata_fd_); + + LOG(INFO) << "Wrote metadata: term: " << current_term + << " votedFor: " << voted_for; + LOG(INFO) << "METADATA location: " << meta_file_path_; +} + +RaftMetadata RaftRecovery::ReadMetadata() { + RaftMetadata metadata; + if (metadata_fd_ < 0) { + LOG(ERROR) << "Metadata file not open"; + return metadata; + } + + lseek(metadata_fd_, 0, SEEK_SET); + int bytes = read(metadata_fd_, &metadata, sizeof(metadata)); + if (bytes != sizeof(metadata)) { + LOG(INFO) << "No existing metadata, using defaults"; + return RaftMetadata{}; + } + return metadata; +} + +void RaftRecovery::SwitchFile(const std::string& file_path) { + std::unique_lock lk(mutex_); + + min_seq_ = -1; + max_seq_ = -1; + + ReadLogsFromFiles( + file_path, 0, 0, [&](const RaftMetadata& data) {}, + [&](std::unique_ptr request) { + min_seq_ == -1 + ? min_seq_ = request->seq() + : std::min(min_seq_, static_cast(request->seq())); + max_seq_ = std::max(max_seq_, static_cast(request->seq())); + }); + + OpenFile(file_path); + LOG(INFO) << "switch to file:" << file_path << " seq:" + << "[" << min_seq_ << "," << max_seq_ << "]"; +} + +void RaftRecovery::AddLogEntry(const Entry* entry) { + if (recovery_enabled_ == false) { + return; + } + return WriteLog(entry); +} + +void RaftRecovery::WriteLog(const Entry* entry) { + std::string data; + if (entry) { + entry->SerializeToString(&data); + } + + std::unique_lock lk(mutex_); + min_seq_ = min_seq_ == -1 + ? 
entry->term() + : std::min(min_seq_, static_cast(entry->term())); + max_seq_ = std::max(max_seq_, static_cast(entry->term())); + AppendData(data); + + Flush(); +} + +std::vector> RaftRecovery::ParseData(const std::string& data) { + std::vector> request_list; + + std::vector data_list; + int pos = 0; + while (pos < data.size()) { + size_t len; + memcpy(&len, data.c_str() + pos, sizeof(len)); + pos += sizeof(len); + + std::string item = data.substr(pos, len); + pos += len; + data_list.push_back(item); + } + + for (size_t i = 0; i < data_list.size(); i += 1) { + std::unique_ptr recovery_data = + std::make_unique(); + recovery_data->request = std::make_unique(); + + if (!recovery_data->request->ParseFromString(data_list[i])) { + LOG(ERROR) << "Parse from data fail"; + break; + } + + request_list.push_back(std::move(recovery_data)); + } + return request_list; +} + +void RaftRecovery::ReadLogs( + std::function system_callback, + std::function request)> + call_back, + std::function set_start_point) { + if (recovery_enabled_ == false) { + return; + } + assert(storage_); + int64_t storage_ckpt = storage_->GetLastCheckpoint(); + LOG(ERROR) << " storage ckpt:" << storage_ckpt; + std::unique_lock lk(mutex_); + + system_callback(ReadMetadata()); + auto recovery_files_pair = GetRecoveryFiles(storage_ckpt); + int64_t ckpt = recovery_files_pair.second; + if (set_start_point) { + set_start_point(ckpt); + } + int idx = 0; + for (auto path : recovery_files_pair.first) { + ReadLogsFromFiles(path.second, ckpt, idx++, system_callback, call_back); + } +} + +void RaftRecovery::ReadLogsFromFiles( + const std::string& path, int64_t ckpt, int file_idx, + std::function system_callback, + std::function request)> + call_back) { + int fd = open(path.c_str(), O_CREAT | O_RDONLY, 0666); + if (fd < 0) { + LOG(ERROR) << " open file fail:" << path; + } + LOG(INFO) << "read logs:" << path << " pos:" << lseek(fd, 0, SEEK_CUR); + assert(fd >= 0); + + size_t data_len = 0; + std::vector> request_list; + + 
while (Read(fd, sizeof(data_len), reinterpret_cast(&data_len))) { + std::string data; + char* buf = new char[data_len]; + if (!Read(fd, data_len, buf)) { + LOG(ERROR) << "Read data log fail"; + break; + } + data = std::string(buf, data_len); + delete buf; + + std::vector> list = ParseData(data); + if (list.size() == 0) { + request_list.clear(); + break; + } + for (auto& l : list) { + request_list.push_back(std::move(l)); + } + } + if (request_list.size() == 0) { + ftruncate(fd, 0); + } + uint64_t max_seq = 0; + for (std::unique_ptr& recovery_data : request_list) { + // LOG(ERROR)<<" ckpt :"<request->seq()<<" + // type:"<request->type(); + if (ckpt < recovery_data->request->seq() || + recovery_data->request->type() == Request::TYPE_NEWVIEW) { + recovery_data->request->set_is_recovery(true); + max_seq = recovery_data->request->seq(); + call_back(std::move(recovery_data->request)); + } + } + + LOG(ERROR) << "read log from files:" << path << " done" + << " recovery max seq:" << max_seq; + + close(fd); +} + +std::map>> RaftRecovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq) { + std::string dir = std::filesystem::path(file_path_).parent_path(); + + std::vector> list; + std::vector> e_list; + + for (const auto& entry : std::filesystem::directory_iterator(dir)) { + std::string dir = std::filesystem::path(entry.path()).parent_path(); + std::string file_name = std::filesystem::path(entry.path()).stem(); + std::string ext = std::filesystem::path(entry.path()).extension(); + if (ext != ".log") continue; + int pos = file_name.rfind("_"); + + int max_seq_pos = file_name.rfind("_", pos - 1); + int64_t max_seq = + std::stoll(file_name.substr(max_seq_pos + 1, pos - max_seq_pos - 1)); + + int min_seq_pos = file_name.rfind("_", max_seq_pos - 1); + int64_t min_seq = std::stoll( + file_name.substr(min_seq_pos + 1, max_seq_pos - min_seq_pos - 1)); + + int time_pos = file_name.rfind("_", min_seq_pos - 1); + int64_t time = + 
std::stoll(file_name.substr(time_pos + 1, min_seq_pos - time_pos - 1)); + + // LOG(ERROR)<<" min seq:"< need_max_seq) { + continue; + } + // LOG(ERROR)<<" get min seq:"<>> res; + for (const auto& path : list) { + ReadLogsFromFiles( + path.second, need_min_seq - 1, 0, [&](const RaftMetadata& data) {}, + [&](std::unique_ptr request) { + // LOG(ERROR) << "check get data from recovery file seq:" + // << request->seq(); + if (request->seq() >= need_min_seq && + request->seq() <= need_max_seq) { + LOG(ERROR) << "get data from recovery file seq:" << request->seq(); + res[request->seq()].push_back(std::move(request)); + } + }); + } + + return res; +} + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.h b/platform/consensus/ordering/raft/framework/raft_recovery.h new file mode 100644 index 0000000000..dc490a7dda --- /dev/null +++ b/platform/consensus/ordering/raft/framework/raft_recovery.h @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include "chain/storage/storage.h" +#include "platform/config/resdb_config.h" +#include "platform/consensus/checkpoint/checkpoint.h" +#include "platform/consensus/execution/system_info.h" +#include "platform/networkstrate/server_comm.h" +#include "platform/proto/resdb.pb.h" +#include "platform/consensus/recovery/recovery.h" +#include "platform/proto/system_info_data.pb.h" +#include "platform/consensus/ordering/raft/proto/proposal.pb.h" + +namespace resdb { + +namespace raft { + +struct RaftMetadata { + int64_t current_term = 0; + int32_t voted_for = -1; +}; + +class RaftRecovery : public Recovery { + public: + RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, + SystemInfo* system_info, Storage* storage); + ~RaftRecovery(); + + RaftMetadata ReadMetadata(); + void Init(); + void WriteMetadata(int64_t current_term, int32_t voted_for); + void ReadLogs(std::function system_callback,std::function request)> call_back, std::function set_start_point); + void AddLogEntry(const Entry* entry); + + private: + void OpenMetadataFile(); + std::vector> ParseData(const std::string& data); + void WriteLog(const Entry* entry); + void ReadLogsFromFiles(const std::string& path, int64_t ckpt, int file_idx, std::function system_callback, std::function request)> call_back); + std::map>> GetDataFromRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); + void SwitchFile(const std::string& path); + + int metadata_fd_; + std::string meta_file_path_; + RaftMetadata metadata_; +}; + +} // namespace raft +} // namespace resdb \ No newline at end of file diff --git a/platform/consensus/ordering/raft/framework/transaction_utils.cpp b/platform/consensus/ordering/raft/framework/transaction_utils.cpp new file mode 100644 index 0000000000..b423407e32 --- /dev/null +++ b/platform/consensus/ordering/raft/framework/transaction_utils.cpp @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "platform/consensus/ordering/raft/framework/transaction_utils.h" + +namespace resdb { + +std::unique_ptr NewRequest(Request::Type type, const Request& request, + int sender_id) { + auto new_request = std::make_unique(request); + new_request->set_type(type); + new_request->set_sender_id(sender_id); + return new_request; +} + +std::unique_ptr NewRequest(Request::Type type, const Request& request, + int sender_id, int region_id) { + auto new_request = std::make_unique(request); + new_request->set_type(type); + new_request->set_sender_id(sender_id); + new_request->mutable_region_info()->set_region_id(region_id); + return new_request; +} + +} // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/transaction_utils.h b/platform/consensus/ordering/raft/framework/transaction_utils.h new file mode 100644 index 0000000000..e5e3eac222 --- /dev/null +++ b/platform/consensus/ordering/raft/framework/transaction_utils.h @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once +#include "platform/proto/replica_info.pb.h" +#include "platform/proto/resdb.pb.h" + +namespace resdb { + +enum CollectorResultCode { + INVALID = -2, + OK = 0, + STATE_CHANGED = 1, +}; + +std::unique_ptr NewRequest(Request::Type type, const Request& request, + int sender_id); + +std::unique_ptr NewRequest(Request::Type type, const Request& request, + int sender_id, int region_info); + +} // namespace resdb diff --git a/platform/consensus/recovery/recovery.h b/platform/consensus/recovery/recovery.h index 426bffd275..5ba9b2c816 100644 --- a/platform/consensus/recovery/recovery.h +++ b/platform/consensus/recovery/recovery.h @@ -37,7 +37,7 @@ class Recovery { SystemInfo* system_info, Storage* storage); virtual ~Recovery(); - void Init(); + virtual void Init(); virtual void AddRequest(const Context* context, const Request* request); void ReadLogs(std::function system_callback, @@ -55,33 +55,31 @@ class Recovery { std::unique_ptr>>> GetDataFromRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); - private: + protected: struct RecoveryData { std::unique_ptr context; std::unique_ptr request; }; + private: + void WriteLog(const Context* context, const Request* request); - void AppendData(const std::string& data); + std::vector> ParseData(const std::string& data); std::vector ParseRawData(const std::string& data); - void Flush(); + void MayFlush(); void Write(const 
char* data, size_t len); - bool Read(int fd, size_t len, char* data); + std::string GenerateFile(int64_t seq, int64_t min_seq, int64_t max_seq); void GetLastFile(); void WriteSystemInfo(); - void OpenFile(const std::string& path); void FinishFile(int64_t seq); - void SwitchFile(const std::string& path); void UpdateStableCheckPoint(); - std::pair>, int64_t> - GetRecoveryFiles(int64_t ckpt); void ReadLogsFromFiles( const std::string& path, int64_t ckpt, int file_idx, std::function system_callback, @@ -92,6 +90,13 @@ class Recovery { void InsertCache(const Context& context, const Request& request); protected: + void Flush(); + void AppendData(const std::string& data); + bool Read(int fd, size_t len, char* data); + std::pair>, int64_t> GetRecoveryFiles(int64_t ckpt); + virtual void SwitchFile(const std::string& path); + void OpenFile(const std::string& path); + ResDBConfig config_; CheckPoint* checkpoint_; std::thread ckpt_thread_; From 3f5711f88493191c15cf0b4968175fb47071c07e Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 25 Mar 2026 17:09:14 -0700 Subject: [PATCH 54/66] WIP Extract common Recovery code to BaseRecovery --- .../ordering/pbft/consensus_manager_pbft.cpp | 4 +- .../ordering/pbft/consensus_manager_pbft.h | 4 +- platform/consensus/ordering/pbft/query.cpp | 2 +- platform/consensus/ordering/pbft/query.h | 6 +- .../ordering/raft/algorithm/raft.cpp | 22 +- .../consensus/ordering/raft/algorithm/raft.h | 2 +- .../ordering/raft/algorithm/raft_tests.h | 4 +- .../ordering/raft/algorithm/recovery_tests.h | 160 +++++++++ .../consensus/ordering/raft/framework/BUILD | 19 + .../ordering/raft/framework/consensus.cpp | 31 +- .../ordering/raft/framework/consensus.h | 5 +- .../ordering/raft/framework/raft_recovery.cpp | 238 +++++-------- .../ordering/raft/framework/raft_recovery.h | 23 +- .../raft/framework/raft_recovery_test.cpp | 105 ++++++ platform/consensus/ordering/raft/proto/BUILD | 11 + platform/consensus/recovery/BUILD | 14 +- 
platform/consensus/recovery/pbft_recovery.cpp | 225 ++++++++++++ platform/consensus/recovery/pbft_recovery.h | 60 ++++ platform/consensus/recovery/recovery.h | 80 +++-- .../{recovery.cpp => recovery_impl.h} | 336 +++--------------- .../recovery/recovery_template_functions.h | 147 ++++++++ platform/consensus/recovery/recovery_test.cpp | 155 +++++--- 22 files changed, 1072 insertions(+), 581 deletions(-) create mode 100644 platform/consensus/ordering/raft/algorithm/recovery_tests.h create mode 100644 platform/consensus/ordering/raft/framework/raft_recovery_test.cpp create mode 100644 platform/consensus/recovery/pbft_recovery.cpp create mode 100644 platform/consensus/recovery/pbft_recovery.h rename platform/consensus/recovery/{recovery.cpp => recovery_impl.h} (53%) create mode 100644 platform/consensus/recovery/recovery_template_functions.h diff --git a/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp b/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp index d7b3880766..a05291fa86 100644 --- a/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp +++ b/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp @@ -53,7 +53,7 @@ ConsensusManagerPBFT::ConsensusManagerPBFT( view_change_manager_(std::make_unique( config_, checkpoint_manager_.get(), message_manager_.get(), system_info_.get(), GetBroadCastClient(), GetSignatureVerifier())), - recovery_(std::make_unique(config_, checkpoint_manager_.get(), + recovery_(std::make_unique(config_, checkpoint_manager_.get(), system_info_.get(), message_manager_->GetStorage())), query_(std::make_unique(config_, recovery_.get(), @@ -64,7 +64,7 @@ ConsensusManagerPBFT::ConsensusManagerPBFT( view_change_manager_->SetDuplicateManager(commitment_->GetDuplicateManager()); - recovery_->ReadLogs( + recovery_->ReadLogs( [&](const SystemInfoData& data) { LOG(ERROR) << " read data info:" << data.view() << " primary:" << data.primary_id(); diff --git a/platform/consensus/ordering/pbft/consensus_manager_pbft.h 
b/platform/consensus/ordering/pbft/consensus_manager_pbft.h index 4df5e9c2e3..947ede07ac 100644 --- a/platform/consensus/ordering/pbft/consensus_manager_pbft.h +++ b/platform/consensus/ordering/pbft/consensus_manager_pbft.h @@ -28,7 +28,7 @@ #include "platform/consensus/ordering/pbft/query.h" #include "platform/consensus/ordering/pbft/response_manager.h" #include "platform/consensus/ordering/pbft/viewchange_manager.h" -#include "platform/consensus/recovery/recovery.h" +#include "platform/consensus/recovery/pbft_recovery.h" #include "platform/networkstrate/consensus_manager.h" namespace resdb { @@ -84,7 +84,7 @@ class ConsensusManagerPBFT : public ConsensusManager { std::unique_ptr response_manager_; std::unique_ptr performance_manager_; std::unique_ptr view_change_manager_; - std::unique_ptr recovery_; + std::unique_ptr recovery_; Stats* global_stats_; std::queue, std::unique_ptr>> request_pending_; diff --git a/platform/consensus/ordering/pbft/query.cpp b/platform/consensus/ordering/pbft/query.cpp index 197caac485..732fa437a4 100644 --- a/platform/consensus/ordering/pbft/query.cpp +++ b/platform/consensus/ordering/pbft/query.cpp @@ -24,7 +24,7 @@ namespace resdb { -Query::Query(const ResDBConfig& config, Recovery* recovery, +Query::Query(const ResDBConfig& config, PBFTRecovery* recovery, std::unique_ptr executor) : config_(config), recovery_(recovery), diff --git a/platform/consensus/ordering/pbft/query.h b/platform/consensus/ordering/pbft/query.h index 85f2e4c566..4678fb83ef 100644 --- a/platform/consensus/ordering/pbft/query.h +++ b/platform/consensus/ordering/pbft/query.h @@ -21,13 +21,13 @@ #include "executor/common/custom_query.h" #include "platform/config/resdb_config.h" -#include "platform/consensus/recovery/recovery.h" +#include "platform/consensus/recovery/pbft_recovery.h" namespace resdb { class Query { public: - Query(const ResDBConfig& config, Recovery* recovery, + Query(const ResDBConfig& config, PBFTRecovery* recovery, std::unique_ptr executor = 
nullptr); virtual ~Query(); @@ -41,7 +41,7 @@ class Query { protected: ResDBConfig config_; - Recovery* recovery_; + PBFTRecovery* recovery_; std::unique_ptr custom_query_executor_; }; diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index f3769c6b2f..e2a6618ee9 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -837,17 +837,17 @@ bool Raft::InFlightPerFollowerLimitReachedLocked(int followerId) const { void Raft::SetCurrentTerm(uint64_t currentTerm, bool writeMetadata) { LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; currentTerm_ = currentTerm; - // if (writeMetadata) { - // recovery_->WriteMetadata(currentTerm_, votedFor_); - // } + if (writeMetadata) { + recovery_->WriteMetadata(currentTerm_, votedFor_); + } } void Raft::SetVotedFor(int votedFor, bool writeMetadata) { LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; votedFor_ = votedFor; - // if (writeMetadata) { - // recovery_->WriteMetadata(currentTerm_, votedFor_); - // } + if (writeMetadata) { + recovery_->WriteMetadata(currentTerm_, votedFor_); + } } void Raft::SetSeqIndexCoveredBySnapshot(int seq) { @@ -855,11 +855,11 @@ void Raft::SetSeqIndexCoveredBySnapshot(int seq) { } void Raft::AddToLog(LogEntry logEntryToAdd, bool writeMetadata) { - // Entry* entry; - // entry = &logEntryToAdd.entry; - // if (writeMetadata) { - // recovery_->AddLogEntry(entry); - // } + Entry* entry; + entry = &logEntryToAdd.entry; + if (writeMetadata) { + recovery_->AddLogEntry(entry); + } log_.push_back(logEntryToAdd); } diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 58971071ae..b89f0d8248 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -226,7 +226,7 
@@ class Raft : public common::ProtocolBase { const auto& entry = log_[i]; os << " [" << i << "] " - << "term=" << entry.term << ", command=\"" << entry.command << "\"" + << "term=" << entry.entry.term() << ", command=\"" << entry.entry.command() << "\"" << ", serializedSize=" << entry.GetSerializedSize() << "\n"; } } diff --git a/platform/consensus/ordering/raft/algorithm/raft_tests.h b/platform/consensus/ordering/raft/algorithm/raft_tests.h index 49cd2e499d..234d6352ef 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_tests.h +++ b/platform/consensus/ordering/raft/algorithm/raft_tests.h @@ -46,11 +46,12 @@ class RaftTest : public ::testing::Test { leader_election_manager_ = std::make_unique(GenerateConfig()); replica_communicator_ = std::make_unique(); + recovery_ = std::make_unique(); raft_ = std::make_unique( /*id=*/1, /*f=*/1, /*total=*/4, verifier_.get(), leader_election_manager_.get(), - replica_communicator_.get()); + replica_communicator_.get(), recovery_.get()); raft_->SetSingleCallFunc( [&](int type, const google::protobuf::Message& msg, int node_id) { @@ -147,6 +148,7 @@ class RaftTest : public ::testing::Test { std::unique_ptr verifier_; std::unique_ptr leader_election_manager_; std::unique_ptr replica_communicator_; + std::unique_ptr recovery_; std::unique_ptr raft_; MockSendMessageFunction mock_call; MockBroadcastFunction mock_broadcast; diff --git a/platform/consensus/ordering/raft/algorithm/recovery_tests.h b/platform/consensus/ordering/raft/algorithm/recovery_tests.h new file mode 100644 index 0000000000..38fb7e6dcb --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/recovery_tests.h @@ -0,0 +1,160 @@ +#include + +#include "common/crypto/mock_signature_verifier.h" +#include "platform/config/resdb_config_utils.h" +#include "platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h" +#include "platform/consensus/ordering/raft/algorithm/raft.h" +#include "platform/networkstrate/mock_replica_communicator.h" 
+#include "platform/proto/client_test.pb.h" + +namespace resdb { +namespace raft { +using ::testing::_; +using ::testing::AnyNumber; +using ::testing::Invoke; +using ::testing::Matcher; + +ResDBConfig GenerateConfig() { + ResConfigData data; + data.set_duplicate_check_frequency_useconds(100000); + data.set_enable_viewchange(true); + return ResDBConfig({GenerateReplicaInfo(1, "127.0.0.1", 1234), + GenerateReplicaInfo(2, "127.0.0.1", 1235), + GenerateReplicaInfo(3, "127.0.0.1", 1236), + GenerateReplicaInfo(4, "127.0.0.1", 1237)}, + GenerateReplicaInfo(1, "127.0.0.1", 1234), data); +} + +class RecoveryTest : public ::testing::Test { + private: + class MockSendMessageFunction { + public: + MOCK_METHOD(int, Call, (int, const google::protobuf::Message&, int)); + }; + class MockBroadcastFunction { + public: + MOCK_METHOD(int, Broadcast, (int, const google::protobuf::Message&)); + }; + class MockCommitFunction { + public: + MOCK_METHOD(int, Commit, (const google::protobuf::Message&)); + }; + + protected: + void SetUp() override { + auto config = GenerateConfig(); + verifier_ = std::make_unique(); + leader_election_manager_ = + std::make_unique(config); + replica_communicator_ = std::make_unique(); + recovery_ = std::make_unique(config, CheckPoint* checkpoint, SystemInfo* system_info, Storage* storage); + raft_ = std::make_unique( + /*id=*/1, + /*f=*/1, + /*total=*/4, verifier_.get(), leader_election_manager_.get(), + replica_communicator_.get(), recovery_.get()); + + raft_->SetSingleCallFunc( + [&](int type, const google::protobuf::Message& msg, int node_id) { + return mock_call.Call(type, msg, node_id); + }); + + raft_->SetBroadcastCallFunc( + [&](int type, const google::protobuf::Message& msg) { + return mock_broadcast.Broadcast(type, msg); + }); + + raft_->SetCommitFunc([&](const google::protobuf::Message& msg) { + return mock_commit.Commit(msg); + }); + } + + AeFields CreateAeFields(uint64_t term, int leaderId, uint64_t prevLogIndex, + uint64_t prevLogTerm, + const 
std::vector& entries, + uint64_t leaderCommit, int followerId) { + AeFields fields{}; + fields.term = term; + fields.leaderId = leaderId; + fields.leaderCommit = leaderCommit; + fields.prevLogIndex = prevLogIndex; + fields.prevLogTerm = prevLogTerm; + fields.followerId = followerId; + + for (const auto& logEntry : entries) { + LogEntry entry; + entry.term = logEntry.term; + entry.command = logEntry.command; + fields.entries.push_back(std::move(entry)); + } + + return fields; + }; + + // Helper to create a single log entry. + LogEntry CreateLogEntry(uint64_t term, + const std::string& command_data) { + LogEntry entry; + entry.term = term; + entry.command = command_data; + return entry; + } + + // Helper to create a vector of log entries for testing. + std::vector CreateLogEntries( + const std::vector>& term_and_cmds, + bool usedForLogPatch = false) { + std::vector entries; + + if (usedForLogPatch) { + LogEntry first_entry; + first_entry.term = 0; + first_entry.command = "COMMON_PREFIX"; + entries.push_back(first_entry); + } + + for (const auto& [term, cmd] : term_and_cmds) { + LogEntry entry; + entry.term = term; + + ClientTestRequest req; + req.set_value(cmd); + + std::string serialized; + req.SerializeToString(&serialized); + entry.command = serialized; + + entries.push_back(entry); + } + + return entries; + } + + AppendEntries CreateAeMessage(const AeFields& fields) { + AppendEntries ae; + ae.set_term(fields.term); + ae.set_leaderid(fields.leaderId); + ae.set_prevlogindex(fields.prevLogIndex); + ae.set_prevlogterm(fields.prevLogTerm); + ae.set_leadercommitindex(fields.leaderCommit); + for (const auto& entry : fields.entries) { + auto* newEntry = ae.add_entries(); + newEntry->set_term(entry.term); + newEntry->set_command(entry.command); + } + + return ae; + } + + std::unique_ptr verifier_; + std::unique_ptr leader_election_manager_; + std::unique_ptr replica_communicator_; + std::unique_ptr recovery_; + std::unique_ptr raft_; + MockSendMessageFunction mock_call; + 
MockBroadcastFunction mock_broadcast; + MockCommitFunction mock_commit; +}; + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/BUILD b/platform/consensus/ordering/raft/framework/BUILD index 0b6103b498..bdf13fdbd6 100644 --- a/platform/consensus/ordering/raft/framework/BUILD +++ b/platform/consensus/ordering/raft/framework/BUILD @@ -78,3 +78,22 @@ cc_library( "//platform/consensus/ordering/raft/algorithm:raft", ], ) + +cc_test( + name = "raft_recovery_test", + srcs = [ + "raft_recovery_test.cpp", + ], + copts = ["-DRAFT_TEST_MODE"], + deps = [ + ":raft_recovery", + "//chain/storage:mock_storage", + "//platform/consensus/ordering/raft/proto:proposal_cc_proto", + "//platform/consensus/checkpoint:mock_checkpoint", + ":transaction_utils", + "//common/test:test_main", + "//platform/proto:client_test_cc_proto", + "//platform/consensus/ordering/raft/algorithm:raft" + ], + size="small" +) \ No newline at end of file diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 956621040c..4e4d53f0d3 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -38,7 +38,6 @@ Consensus::Consensus(const ResDBConfig& config, config_, GetBroadCastClient(), GetSignatureVerifier(), system_info_.get())), recovery_(std::make_unique(config_, checkpoint_manager_.get(), - system_info_.get(), transaction_executor_->GetStorage())) { //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": In consensus constructor"; int total_replicas = config_.GetReplicaNum(); @@ -57,17 +56,7 @@ Consensus::Consensus(const ResDBConfig& config, leader_election_manager_->SetRaft(raft_.get()); leader_election_manager_->MayStart(); - recovery_->ReadLogs( - [&](const RaftMetadata& metadata) { - LOG(ERROR) << " read current term: " << metadata.current_term - << " voted for: " << metadata.voted_for; - 
raft_->SetCurrentTerm(metadata.current_term, false); - raft_->SetVotedFor(metadata.voted_for, false); - }, - [&](std::unique_ptr request) { - return CommitMsg(std::move(request)); - }, - [&](int seq) { raft_->SetSeqIndexCoveredBySnapshot(seq); }); + RecoverFromLogs(); InitProtocol(raft_.get()); } @@ -130,6 +119,20 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { return 0; } +void Consensus::RecoverFromLogs() { + recovery_->ReadLogs( + [&](const RaftMetadata& metadata) { + LOG(ERROR) << " read current term: " << metadata.current_term + << " voted for: " << metadata.voted_for; + raft_->SetCurrentTerm(metadata.current_term, false); + raft_->SetVotedFor(metadata.voted_for, false); + }, + [&](std::unique_ptr request) { + return CommitMsg(*request); + }, + [&](int seq) { raft_->SetSeqIndexCoveredBySnapshot(seq); }); +} + int Consensus::ProcessNewTransaction(std::unique_ptr request) { return raft_->ReceiveTransaction(std::move(request)); } @@ -145,10 +148,6 @@ int Consensus::CommitMsg(const google::protobuf::Message& msg) { return 0; } -// int Consensus::CommitMsg(std::unique_ptr request) { -// transaction_executor_->Commit(std::move(request)); -// return 0; -// } } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/consensus.h b/platform/consensus/ordering/raft/framework/consensus.h index 89dc494139..e6580f60c9 100644 --- a/platform/consensus/ordering/raft/framework/consensus.h +++ b/platform/consensus/ordering/raft/framework/consensus.h @@ -41,8 +41,11 @@ class Consensus : public common::Consensus { int ProcessCustomConsensus(std::unique_ptr request) override; int ProcessNewTransaction(std::unique_ptr request) override; int CommitMsg(const google::protobuf::Message& msg) override; - // int CommitMsg(const std::unique_ptr request); int CommitMsgInternal(const AppendEntries& txn); +#ifdef RAFT_TEST_MODE + public: +#endif + void RecoverFromLogs(); protected: std::unique_ptr raft_; diff --git 
a/platform/consensus/ordering/raft/framework/raft_recovery.cpp b/platform/consensus/ordering/raft/framework/raft_recovery.cpp index 76bea4daff..a99e9b9cb4 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery.cpp +++ b/platform/consensus/ordering/raft/framework/raft_recovery.cpp @@ -35,31 +35,54 @@ namespace resdb { namespace raft { -RaftRecovery::RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, SystemInfo* system_info, Storage* storage) - : Recovery(config, checkpoint, system_info, storage) { - LOG(INFO) << "Raft Recovery constructor"; +using CallbackType = std::function)>; + +RaftRecovery::RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage) + : RecoveryBase(config, checkpoint, storage) { Init(); -}; +} void RaftRecovery::Init() { - // Recovery::Init(); + if (recovery_enabled_ == false) { + LOG(INFO) << "recovery is not enabled:" << recovery_enabled_; + return; + } + + LOG(ERROR) << " init"; + GetLastFile(); + + CallbackType callback = + [this](std::unique_ptr request) { + min_seq_ == -1 + ? 
min_seq_ = request->seq() + : std::min(min_seq_, static_cast(request->seq())); + max_seq_ = std::max(max_seq_, static_cast(request->seq())); + }; + + SwitchFile(file_path_, callback); + LOG(ERROR) << " init done"; meta_file_path_ = std::filesystem::path(base_file_path_).parent_path() / "raft_metadata.dat"; LOG(INFO) << "Meta file path: " << meta_file_path_; OpenMetadataFile(); + + ckpt_thread_ = std::thread([this]{ this->UpdateStableCheckPoint(); }); } RaftRecovery::~RaftRecovery() { + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; if (recovery_enabled_ == false) { return; } + Flush(); if (metadata_fd_ >= 0) { close(metadata_fd_); } } void RaftRecovery::OpenMetadataFile() { + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; metadata_fd_ = open(meta_file_path_.c_str(), O_CREAT | O_RDWR, 0666); if (metadata_fd_ < 0) { LOG(ERROR) << "Failed to open metadata file: " << strerror(errno); @@ -91,6 +114,7 @@ void RaftRecovery::WriteMetadata(int64_t current_term, int32_t voted_for) { } RaftMetadata RaftRecovery::ReadMetadata() { + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; RaftMetadata metadata; if (metadata_fd_ < 0) { LOG(ERROR) << "Metadata file not open"; @@ -106,25 +130,7 @@ RaftMetadata RaftRecovery::ReadMetadata() { return metadata; } -void RaftRecovery::SwitchFile(const std::string& file_path) { - std::unique_lock lk(mutex_); - - min_seq_ = -1; - max_seq_ = -1; - - ReadLogsFromFiles( - file_path, 0, 0, [&](const RaftMetadata& data) {}, - [&](std::unique_ptr request) { - min_seq_ == -1 - ? 
min_seq_ = request->seq() - : std::min(min_seq_, static_cast(request->seq())); - max_seq_ = std::max(max_seq_, static_cast(request->seq())); - }); - - OpenFile(file_path); - LOG(INFO) << "switch to file:" << file_path << " seq:" - << "[" << min_seq_ << "," << max_seq_ << "]"; -} +void RaftRecovery::WriteSystemInfo() { } void RaftRecovery::AddLogEntry(const Entry* entry) { if (recovery_enabled_ == false) { @@ -134,6 +140,7 @@ void RaftRecovery::AddLogEntry(const Entry* entry) { } void RaftRecovery::WriteLog(const Entry* entry) { +LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; std::string data; if (entry) { entry->SerializeToString(&data); @@ -149,22 +156,11 @@ void RaftRecovery::WriteLog(const Entry* entry) { Flush(); } -std::vector> RaftRecovery::ParseData(const std::string& data) { +std::vector::RecoveryData>> RaftRecovery::ParseDataListItem( + std::vector &data_list) { std::vector> request_list; - std::vector data_list; - int pos = 0; - while (pos < data.size()) { - size_t len; - memcpy(&len, data.c_str() + pos, sizeof(len)); - pos += sizeof(len); - - std::string item = data.substr(pos, len); - pos += len; - data_list.push_back(item); - } - - for (size_t i = 0; i < data_list.size(); i += 1) { + for (size_t i = 0; i < data_list.size(); i += 2) { std::unique_ptr recovery_data = std::make_unique(); recovery_data->request = std::make_unique(); @@ -179,68 +175,9 @@ std::vector> RaftRecovery::ParseData return request_list; } -void RaftRecovery::ReadLogs( - std::function system_callback, - std::function request)> - call_back, - std::function set_start_point) { - if (recovery_enabled_ == false) { - return; - } - assert(storage_); - int64_t storage_ckpt = storage_->GetLastCheckpoint(); - LOG(ERROR) << " storage ckpt:" << storage_ckpt; - std::unique_lock lk(mutex_); - - system_callback(ReadMetadata()); - auto recovery_files_pair = GetRecoveryFiles(storage_ckpt); - int64_t ckpt = recovery_files_pair.second; - if 
(set_start_point) { - set_start_point(ckpt); - } - int idx = 0; - for (auto path : recovery_files_pair.first) { - ReadLogsFromFiles(path.second, ckpt, idx++, system_callback, call_back); - } -} - -void RaftRecovery::ReadLogsFromFiles( - const std::string& path, int64_t ckpt, int file_idx, - std::function system_callback, - std::function request)> - call_back) { - int fd = open(path.c_str(), O_CREAT | O_RDONLY, 0666); - if (fd < 0) { - LOG(ERROR) << " open file fail:" << path; - } - LOG(INFO) << "read logs:" << path << " pos:" << lseek(fd, 0, SEEK_CUR); - assert(fd >= 0); - - size_t data_len = 0; - std::vector> request_list; - - while (Read(fd, sizeof(data_len), reinterpret_cast(&data_len))) { - std::string data; - char* buf = new char[data_len]; - if (!Read(fd, data_len, buf)) { - LOG(ERROR) << "Read data log fail"; - break; - } - data = std::string(buf, data_len); - delete buf; - - std::vector> list = ParseData(data); - if (list.size() == 0) { - request_list.clear(); - break; - } - for (auto& l : list) { - request_list.push_back(std::move(l)); - } - } - if (request_list.size() == 0) { - ftruncate(fd, 0); - } +void RaftRecovery::PerformCallback( + std::vector> &request_list, + CallbackType call_back, int64_t ckpt) { uint64_t max_seq = 0; for (std::unique_ptr& recovery_data : request_list) { // LOG(ERROR)<<" ckpt :"<>> RaftRecovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq) { - std::string dir = std::filesystem::path(file_path_).parent_path(); - - std::vector> list; - std::vector> e_list; - - for (const auto& entry : std::filesystem::directory_iterator(dir)) { - std::string dir = std::filesystem::path(entry.path()).parent_path(); - std::string file_name = std::filesystem::path(entry.path()).stem(); - std::string ext = std::filesystem::path(entry.path()).extension(); - if (ext != ".log") continue; - int pos = file_name.rfind("_"); - - int max_seq_pos = file_name.rfind("_", pos - 1); - int64_t max_seq = - 
std::stoll(file_name.substr(max_seq_pos + 1, pos - max_seq_pos - 1)); - - int min_seq_pos = file_name.rfind("_", max_seq_pos - 1); - int64_t min_seq = std::stoll( - file_name.substr(min_seq_pos + 1, max_seq_pos - min_seq_pos - 1)); - - int time_pos = file_name.rfind("_", min_seq_pos - 1); - int64_t time = - std::stoll(file_name.substr(time_pos + 1, min_seq_pos - time_pos - 1)); - - // LOG(ERROR)<<" min seq:"< need_max_seq) { - continue; - } - // LOG(ERROR)<<" get min seq:"< data_list, std::function system_callback) { + RaftMetadata info = ReadMetadata(); + system_callback(info); + return true; +} - std::map>> res; +std::map< + uint64_t, + std::vector, std::unique_ptr>>> +RaftRecovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, + uint64_t need_max_seq) { + auto list = GetSortedRecoveryFiles(need_min_seq, need_max_seq); + + std::map, + std::unique_ptr>>> + res; + std::function system_cb = [&](const RaftMetadata&) {}; for (const auto& path : list) { - ReadLogsFromFiles( - path.second, need_min_seq - 1, 0, [&](const RaftMetadata& data) {}, + CallbackType callback = [&](std::unique_ptr request) { - // LOG(ERROR) << "check get data from recovery file seq:" - // << request->seq(); - if (request->seq() >= need_min_seq && - request->seq() <= need_max_seq) { - LOG(ERROR) << "get data from recovery file seq:" << request->seq(); - res[request->seq()].push_back(std::move(request)); - } - }); + if (request->seq() >= need_min_seq && + request->seq() <= need_max_seq) { + LOG(ERROR) << "get data from recovery file seq:" << request->seq(); + res[request->seq()].push_back( + std::make_pair(nullptr, std::move(request))); + } + }; + + this->template ReadLogsFromFiles( + path.second, need_min_seq - 1, 0, + system_cb, // system callback + callback); // typed callback } return res; } } // namespace raft + +template class RecoveryBase; + +template void RecoveryBase::ReadLogs( + std::function, + raft::CallbackType, + std::function); + +template void RecoveryBase::SwitchFile( + const 
std::string&, + raft::CallbackType); + +template void RecoveryBase::ReadLogsFromFiles( + const std::string&, int64_t, int, + std::function, + raft::CallbackType); } // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.h b/platform/consensus/ordering/raft/framework/raft_recovery.h index dc490a7dda..5168c3aaf0 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery.h +++ b/platform/consensus/ordering/raft/framework/raft_recovery.h @@ -40,25 +40,32 @@ struct RaftMetadata { int32_t voted_for = -1; }; -class RaftRecovery : public Recovery { +class RaftRecovery : public RecoveryBase { + friend class RecoveryBase; public: - RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, - SystemInfo* system_info, Storage* storage); + RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage); ~RaftRecovery(); RaftMetadata ReadMetadata(); void Init(); void WriteMetadata(int64_t current_term, int32_t voted_for); - void ReadLogs(std::function system_callback,std::function request)> call_back, std::function set_start_point); void AddLogEntry(const Entry* entry); + std::map, std::unique_ptr>>> + GetDataFromRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); private: void OpenMetadataFile(); - std::vector> ParseData(const std::string& data); + void WriteSystemInfo(); + std::vector> ParseDataListItem( + std::vector &data_list); void WriteLog(const Entry* entry); - void ReadLogsFromFiles(const std::string& path, int64_t ckpt, int file_idx, std::function system_callback, std::function request)> call_back); - std::map>> GetDataFromRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); - void SwitchFile(const std::string& path); + + void PerformCallback( + std::vector> &request_list, + std::function request)> + call_back, int64_t ckpt); + + bool PerformSystemCallback(std::vector data_list, std::function system_callback); int metadata_fd_; std::string meta_file_path_; diff --git 
a/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp b/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp new file mode 100644 index 0000000000..e22497cbb2 --- /dev/null +++ b/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp @@ -0,0 +1,105 @@ +#include +#include +#include +#include + + +#include "platform/consensus/ordering/raft/framework/raft_recovery.h" +#include "platform/consensus/ordering/raft/proto/proposal.pb.h" +#include "chain/storage/mock_storage.h" +#include "platform/consensus/checkpoint/mock_checkpoint.h" +#include "platform/consensus/ordering/raft/framework/transaction_utils.h" + + +namespace resdb { +namespace raft { +using ::testing::_; +using ::testing::AnyNumber; +using ::testing::Invoke; +using ::testing::Matcher; +using ::testing::Test; + +const std::string log_path = "./log/test_log"; + +ResConfigData GetConfigData(int buf_size = 10) { + ResConfigData data; + data.set_recovery_enabled(true); + data.set_recovery_path(log_path); + data.set_recovery_buffer_size(buf_size); + data.set_recovery_ckpt_time_s(1); + + return data; +} + +std::vector Listlogs(const std::string &path) { + std::vector ret; + std::string dir = std::filesystem::path(path).parent_path(); + for (const auto &entry : std::filesystem::directory_iterator(dir)) { + LOG(ERROR) << "path:" << entry.path(); + ret.push_back(entry.path()); + } + return ret; +} + +class RaftRecoveryTest : public Test { + public: + RaftRecoveryTest() + : config_(GetConfigData(), ReplicaInfo(), KeyInfo(), CertificateInfo()), + system_info_() { + std::string dir = std::filesystem::path(log_path).parent_path(); + std::filesystem::remove_all(dir); + } + + protected: + ResDBConfig config_; + SystemInfo system_info_; + MockCheckPoint checkpoint_; +}; + +TEST_F(RaftRecoveryTest, ReadLog) { + std::vector types = {Request::TYPE_PRE_PREPARE, Request::TYPE_PREPARE, + Request::TYPE_COMMIT, Request::TYPE_CHECKPOINT, + Request::TYPE_NEWVIEW, Request::TYPE_NEW_TXNS}; + + 
std::vector expected_types = { + Request::TYPE_PRE_PREPARE, Request::TYPE_PREPARE, Request::TYPE_COMMIT, + Request::TYPE_CHECKPOINT, Request::TYPE_NEWVIEW, + }; + + int entries_to_add = 3; + { + RaftRecovery recovery(config_, &checkpoint_, &system_info_, nullptr); + + for (int i = 0; i < entries_to_add; i++) { + // Set up the Log Entry to be added + Entry logEntry; + logEntry.set_term(i); + auto req = std::make_unique(); + req->set_seq(i); + std::string serialized; + if (!req->SerializeToString(&serialized)) { + assert(false); + } + logEntry.set_command(std::move(serialized)); + + recovery.AddLogEntry(&logEntry); + } + } + { + std::vector list; + RaftRecovery recovery(config_, &checkpoint_, &system_info_, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &data) {}, + [&](std::unique_ptr request) { list.push_back(*request); }, + nullptr); + + EXPECT_EQ(list.size(), entries_to_add); + + for (size_t i = 0; i < entries_to_add; ++i) { + EXPECT_EQ(list[i].seq(), i); + } + } +} + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/proto/BUILD b/platform/consensus/ordering/raft/proto/BUILD index 144a6751f3..2d2a90c38d 100644 --- a/platform/consensus/ordering/raft/proto/BUILD +++ b/platform/consensus/ordering/raft/proto/BUILD @@ -32,3 +32,14 @@ cc_proto_library( name = "proposal_cc_proto", deps = [":proposal_proto"], ) + +proto_library( + name = "persistent_state_proto", + srcs = ["persistent_state.proto"], + #visibility = ["//visibility:public"], +) + +cc_proto_library( + name = "persistent_state_cc_proto", + deps = [":persistent_state_proto"], +) diff --git a/platform/consensus/recovery/BUILD b/platform/consensus/recovery/BUILD index d429c6a7e8..b23a046f95 100644 --- a/platform/consensus/recovery/BUILD +++ b/platform/consensus/recovery/BUILD @@ -19,9 +19,8 @@ package(default_visibility = ["//platform/consensus:__subpackages__"]) cc_library( - name = "recovery", - srcs = ["recovery.cpp"], - hdrs = ["recovery.h"], + name = 
"recovery_base", + hdrs = ["recovery.h", "recovery_impl.h", "recovery_template_functions.h"], deps = [ "//chain/storage", "//common/utils", @@ -34,6 +33,15 @@ cc_library( ], ) +cc_library( + name = "recovery", + srcs = ["pbft_recovery.cpp"], + hdrs = ["pbft_recovery.h"], + deps = [ + ":recovery_base", + ], +) + cc_test( name = "recovery_test", srcs = ["recovery_test.cpp"], diff --git a/platform/consensus/recovery/pbft_recovery.cpp b/platform/consensus/recovery/pbft_recovery.cpp new file mode 100644 index 0000000000..492395bd23 --- /dev/null +++ b/platform/consensus/recovery/pbft_recovery.cpp @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "platform/consensus/recovery/pbft_recovery.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "common/utils/utils.h" + +namespace resdb { + +using CallbackType = std::function, std::unique_ptr)>; + +PBFTRecovery::PBFTRecovery(const ResDBConfig& config, CheckPoint* checkpoint, + SystemInfo* system_info, Storage* storage) + : RecoveryBase(config, checkpoint, storage), + system_info_(system_info) { + Init(); +} + +void PBFTRecovery::Init() { + if (recovery_enabled_ == false) { + LOG(INFO) << "recovery is not enabled:" << recovery_enabled_; + return; + } + + LOG(ERROR) << " init"; + GetLastFile(); + + CallbackType callback = + [this](std::unique_ptr context, std::unique_ptr request) { + min_seq_ == -1 + ? min_seq_ = request->seq() + : std::min(min_seq_, static_cast(request->seq())); + max_seq_ = std::max(max_seq_, static_cast(request->seq())); + }; + + SwitchFile(file_path_, callback); + + LOG(ERROR) << " init done"; + + ckpt_thread_ = std::thread([this]{ this->UpdateStableCheckPoint(); }); +} + +void PBFTRecovery::WriteSystemInfo() { + int view = system_info_->GetCurrentView(); + int primary_id = system_info_->GetPrimaryId(); + LOG(ERROR) << "write system info:" << primary_id << " view:" << view; + SystemInfoData data; + data.set_view(view); + data.set_primary_id(primary_id); + + std::string data_str; + data.SerializeToString(&data_str); + + AppendData(data_str); + Flush(); +} + +void PBFTRecovery::AddRequest(const Context* context, const Request* request) { + if (recovery_enabled_ == false) { + return; + } + switch (request->type()) { + case Request::TYPE_PRE_PREPARE: + case Request::TYPE_PREPARE: + case Request::TYPE_COMMIT: + case Request::TYPE_NEWVIEW: + return WriteLog(context, request); + default: + break; + } +} + +void PBFTRecovery::WriteLog(const Context* context, const Request* request) { + std::string data; + if (request) { + request->SerializeToString(&data); + } + + std::string 
sig; + if (context) { + context->signature.SerializeToString(&sig); + } + + std::unique_lock lk(mutex_); + min_seq_ = min_seq_ == -1 + ? request->seq() + : std::min(min_seq_, static_cast(request->seq())); + max_seq_ = std::max(max_seq_, static_cast(request->seq())); + AppendData(data); + AppendData(sig); + + Flush(); +} + +std::vector::RecoveryData>> PBFTRecovery::ParseDataListItem( + std::vector &data_list) { + std::vector> request_list; + + for (size_t i = 0; i < data_list.size(); i += 2) { + std::unique_ptr recovery_data = + std::make_unique(); + recovery_data->request = std::make_unique(); + recovery_data->context = std::make_unique(); + + if (!recovery_data->request->ParseFromString(data_list[i])) { + LOG(ERROR) << "Parse from data fail"; + break; + } + + if (!recovery_data->context->signature.ParseFromString(data_list[i + 1])) { + LOG(ERROR) << "Parse from data fail"; + break; + } + + request_list.push_back(std::move(recovery_data)); + } + return request_list; +} + +void PBFTRecovery::PerformCallback( + std::vector> &request_list, + CallbackType call_back, int64_t ckpt) { + uint64_t max_seq = 0; + for (std::unique_ptr& recovery_data : request_list) { + // LOG(ERROR)<<" ckpt :"<request->seq()<<" + // type:"<request->type(); + if (ckpt < recovery_data->request->seq() || + recovery_data->request->type() == Request::TYPE_NEWVIEW) { + recovery_data->request->set_is_recovery(true); + max_seq = recovery_data->request->seq(); + call_back(std::move(recovery_data->context), + std::move(recovery_data->request)); + } + } + + LOG(ERROR) << " recovery max seq:" << max_seq; +} + + +bool PBFTRecovery::PerformSystemCallback(std::vector data_list, std::function system_callback) { + SystemInfoData info; + if (data_list.empty() || !info.ParseFromString(data_list[0])) { + return false; + } + LOG(ERROR) << "read system info:" << info.DebugString(); + system_callback(info); + return true; +} + +std::map< + uint64_t, + std::vector, std::unique_ptr>>> 
+PBFTRecovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, + uint64_t need_max_seq) { + auto list = GetSortedRecoveryFiles(need_min_seq, need_max_seq); + + std::map, + std::unique_ptr>>> + res; + for (const auto& path : list) { + CallbackType callback = + [&](std::unique_ptr context, std::unique_ptr request) { + if (request->seq() >= need_min_seq && + request->seq() <= need_max_seq) { + LOG(ERROR) << "get data from recovery file seq:" << request->seq(); + res[request->seq()].push_back( + std::make_pair(std::move(context), std::move(request))); + } + }; + + ReadLogsFromFiles( + path.second, need_min_seq - 1, 0, + [&](const SystemInfoData& data) {}, // system callback + callback); // typed callback + } + + return res; +} + + +template class RecoveryBase; + +template void RecoveryBase::ReadLogs( + std::function, + CallbackType, + std::function); + +template void RecoveryBase::SwitchFile( + const std::string&, + CallbackType); + +template void RecoveryBase::ReadLogsFromFiles( + const std::string&, int64_t, int, + std::function, + CallbackType); + +} // namespace resdb diff --git a/platform/consensus/recovery/pbft_recovery.h b/platform/consensus/recovery/pbft_recovery.h new file mode 100644 index 0000000000..556e5b157e --- /dev/null +++ b/platform/consensus/recovery/pbft_recovery.h @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include "platform/consensus/recovery/recovery.h" +#include "platform/proto/system_info_data.pb.h" +#include "platform/consensus/execution/system_info.h" + +namespace resdb { + +class PBFTRecovery : public RecoveryBase { + friend class RecoveryBase; + public: + PBFTRecovery(const ResDBConfig& config, CheckPoint* checkpoint, + SystemInfo* system_info, Storage* storage); + ~PBFTRecovery() = default; + + void AddRequest(const Context* context, const Request* request); + + std::map, + std::unique_ptr>>> + GetDataFromRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); + + private: + void Init(); + void WriteLog(const Context* context, const Request* request); + void WriteSystemInfo(); + + std::vector> ParseDataListItem( + std::vector &data_list); + + void PerformCallback( + std::vector> &request_list, + std::function context, + std::unique_ptr request)> + call_back, int64_t ckpt); + + bool PerformSystemCallback(std::vector data_list, std::function system_callback); + + SystemInfo* system_info_; +}; + +} // namespace resdb diff --git a/platform/consensus/recovery/recovery.h b/platform/consensus/recovery/recovery.h index 5ba9b2c816..7c5e2abaa4 100644 --- a/platform/consensus/recovery/recovery.h +++ b/platform/consensus/recovery/recovery.h @@ -24,26 +24,32 @@ #include "chain/storage/storage.h" #include "platform/config/resdb_config.h" #include "platform/consensus/checkpoint/checkpoint.h" -#include "platform/consensus/execution/system_info.h" #include "platform/networkstrate/server_comm.h" #include 
"platform/proto/resdb.pb.h" -#include "platform/proto/system_info_data.pb.h" +#include +#include +#include +#include +#include +#include -namespace resdb { +#include +#include +#include -class Recovery { - public: - Recovery(const ResDBConfig& config, CheckPoint* checkpoint, - SystemInfo* system_info, Storage* storage); - virtual ~Recovery(); +#include "common/utils/utils.h" - virtual void Init(); +namespace resdb { - virtual void AddRequest(const Context* context, const Request* request); - void ReadLogs(std::function system_callback, - std::function context, - std::unique_ptr request)> - call_back, +template +class RecoveryBase { + public: + RecoveryBase(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage); + ~RecoveryBase(); + // void Init(); + template + void ReadLogs(std::function system_callback, + TCallback call_back, std::function start_point); int64_t GetMaxSeq(); @@ -51,11 +57,8 @@ class Recovery { int GetData(const RecoveryRequest& request, RecoveryResponse& response); - std::map, - std::unique_ptr>>> - GetDataFromRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); - protected: + std::vector> GetSortedRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); struct RecoveryData { std::unique_ptr context; std::unique_ptr request; @@ -72,37 +75,48 @@ class Recovery { void Write(const char* data, size_t len); - std::string GenerateFile(int64_t seq, int64_t min_seq, int64_t max_seq); - void GetLastFile(); - void WriteSystemInfo(); void FinishFile(int64_t seq); - void UpdateStableCheckPoint(); - void ReadLogsFromFiles( - const std::string& path, int64_t ckpt, int file_idx, - std::function system_callback, - std::function context, - std::unique_ptr request)> - call_back); - void InsertCache(const Context& context, const Request& request); protected: + void GetLastFile(); + void UpdateStableCheckPoint(); void Flush(); + void AppendData(const std::string& data); bool Read(int fd, size_t len, char* data); std::pair>, int64_t> 
GetRecoveryFiles(int64_t ckpt); - virtual void SwitchFile(const std::string& path); + + template + void SwitchFile(const std::string& path, TCallback call_back); void OpenFile(const std::string& path); + template + void ReadLogsFromFiles( + const std::string& path, int64_t ckpt, int file_idx, + std::function system_callback, + TCallback call_back); + + std::string file_path_; ResDBConfig config_; + // Derived class must implement these + std::vector> ParseDataListItem(std::vector& data_list); + + template + void PerformCallback(std::vector>& request_list, TCallback call_back); + + void WriteSystemInfo(); + + CheckPoint* checkpoint_; std::thread ckpt_thread_; bool recovery_enabled_ = false; std::string buffer_; - std::string file_path_, base_file_path_; + + std::string base_file_path_; size_t buffer_size_ = 0; int fd_; std::mutex mutex_, data_mutex_; @@ -112,8 +126,10 @@ class Recovery { std::mutex ckpt_mutex_; std::atomic stop_; int recovery_ckpt_time_s_; - SystemInfo* system_info_; Storage* storage_; }; +#include "platform/consensus/recovery/recovery_impl.h" +#include "platform/consensus/recovery/recovery_template_functions.h" + } // namespace resdb diff --git a/platform/consensus/recovery/recovery.cpp b/platform/consensus/recovery/recovery_impl.h similarity index 53% rename from platform/consensus/recovery/recovery.cpp rename to platform/consensus/recovery/recovery_impl.h index bf24ebc746..6bf044877f 100644 --- a/platform/consensus/recovery/recovery.cpp +++ b/platform/consensus/recovery/recovery_impl.h @@ -17,28 +17,24 @@ * under the License. 
*/ -#include "platform/consensus/recovery/recovery.h" +// #include "platform/consensus/recovery/recovery.h" -#include -#include -#include -#include -#include -#include +// #include +// #include +// #include -#include -#include -#include +// #include +// #include +// #include -#include "common/utils/utils.h" +// #include "common/utils/utils.h" -namespace resdb { +// namespace resdb { -Recovery::Recovery(const ResDBConfig& config, CheckPoint* checkpoint, - SystemInfo* system_info, Storage* storage) +template +RecoveryBase::RecoveryBase(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage) : config_(config), checkpoint_(checkpoint), - system_info_(system_info), storage_(storage) { recovery_enabled_ = config_.GetConfigData().recovery_enabled(); file_path_ = config_.GetConfigData().recovery_path(); @@ -80,19 +76,10 @@ Recovery::Recovery(const ResDBConfig& config, CheckPoint* checkpoint, fd_ = -1; stop_ = false; - Init(); } -void Recovery::Init() { - LOG(ERROR) << " init"; - GetLastFile(); - SwitchFile(file_path_); - LOG(ERROR) << " init done"; - - ckpt_thread_ = std::thread(&Recovery::UpdateStableCheckPoint, this); -} - -Recovery::~Recovery() { +template +RecoveryBase::~RecoveryBase() { if (recovery_enabled_ == false) { return; } @@ -104,11 +91,14 @@ Recovery::~Recovery() { } } -int64_t Recovery::GetMaxSeq() { return max_seq_; } +template +int64_t RecoveryBase::GetMaxSeq() { return max_seq_; } -int64_t Recovery::GetMinSeq() { return min_seq_; } +template +int64_t RecoveryBase::GetMinSeq() { return min_seq_; } -void Recovery::UpdateStableCheckPoint() { +template +void RecoveryBase::UpdateStableCheckPoint() { if (checkpoint_ == nullptr) { return; } @@ -124,7 +114,8 @@ void Recovery::UpdateStableCheckPoint() { } } -void Recovery::GetLastFile() { +template +void RecoveryBase::GetLastFile() { std::string dir = std::filesystem::path(file_path_).parent_path(); last_ckpt_ = -1; uint64_t m_time_s = 0; @@ -162,7 +153,8 @@ void Recovery::GetLastFile() { } } 
-std::string Recovery::GenerateFile(int64_t seq, int64_t min_seq, +template +std::string RecoveryBase::GenerateFile(int64_t seq, int64_t min_seq, int64_t max_seq) { std::string dir = std::filesystem::path(file_path_).parent_path(); std::string file_name = std::filesystem::path(base_file_path_).stem(); @@ -175,7 +167,8 @@ std::string Recovery::GenerateFile(int64_t seq, int64_t min_seq, return dir + "/" + file_name + "." + ext; } -void Recovery::FinishFile(int64_t seq) { +template +void RecoveryBase::FinishFile(int64_t seq) { std::unique_lock lk(mutex_); Flush(); if (storage_) { @@ -198,109 +191,16 @@ void Recovery::FinishFile(int64_t seq) { OpenFile(file_path_); } -void Recovery::SwitchFile(const std::string& file_path) { - std::unique_lock lk(mutex_); - - min_seq_ = -1; - max_seq_ = -1; - - ReadLogsFromFiles( - file_path, 0, 0, [&](const SystemInfoData& data) {}, - [&](std::unique_ptr context, std::unique_ptr request) { - min_seq_ == -1 - ? min_seq_ = request->seq() - : std::min(min_seq_, static_cast(request->seq())); - max_seq_ = std::max(max_seq_, static_cast(request->seq())); - }); - - OpenFile(file_path); - LOG(INFO) << "switch to file:" << file_path << " seq:" - << "[" << min_seq_ << "," << max_seq_ << "]"; -} - -void Recovery::OpenFile(const std::string& path) { - if (fd_ >= 0) { - close(fd_); - } - fd_ = open(path.c_str(), O_CREAT | O_WRONLY, 0666); - if (fd_ < 0) { - LOG(ERROR) << "open file fail:" << path << " error:" << strerror(errno); - } - - int pos = lseek(fd_, 0, SEEK_END); - LOG(INFO) << "file path:" << path << " len:" << pos << " fd:" << fd_; - - if (pos == 0) { - WriteSystemInfo(); - } - - lseek(fd_, 0, SEEK_END); - LOG(ERROR) << "open file:" << path << " pos:" << lseek(fd_, 0, SEEK_CUR) - << " fd:" << fd_; - assert(fd_ >= 0); -} - -void Recovery::WriteSystemInfo() { - int view = system_info_->GetCurrentView(); - int primary_id = system_info_->GetPrimaryId(); - LOG(ERROR) << "write system info:" << primary_id << " view:" << view; - SystemInfoData 
data; - data.set_view(view); - data.set_primary_id(primary_id); - - std::string data_str; - data.SerializeToString(&data_str); - - AppendData(data_str); - Flush(); -} - -void Recovery::AddRequest(const Context* context, const Request* request) { - if (recovery_enabled_ == false) { - return; - } - switch (request->type()) { - case Request::TYPE_PRE_PREPARE: - case Request::TYPE_PREPARE: - case Request::TYPE_COMMIT: - case Request::TYPE_NEWVIEW: - return WriteLog(context, request); - default: - break; - } -} - -void Recovery::WriteLog(const Context* context, const Request* request) { - std::string data; - if (request) { - request->SerializeToString(&data); - } - - std::string sig; - if (context) { - context->signature.SerializeToString(&sig); - } - - std::unique_lock lk(mutex_); - min_seq_ = min_seq_ == -1 - ? request->seq() - : std::min(min_seq_, static_cast(request->seq())); - max_seq_ = std::max(max_seq_, static_cast(request->seq())); - AppendData(data); - AppendData(sig); - - Flush(); -} - -void Recovery::AppendData(const std::string& data) { +template +void RecoveryBase::AppendData(const std::string& data) { size_t len = data.size(); buffer_.append(reinterpret_cast(&len), sizeof(len)); buffer_.append(data); } -std::vector> Recovery::ParseData( +template +std::vector::RecoveryData>> RecoveryBase::ParseData( const std::string& data) { - std::vector> request_list; std::vector data_list; int pos = 0; @@ -314,28 +214,11 @@ std::vector> Recovery::ParseData( data_list.push_back(item); } - for (size_t i = 0; i < data_list.size(); i += 2) { - std::unique_ptr recovery_data = - std::make_unique(); - recovery_data->request = std::make_unique(); - recovery_data->context = std::make_unique(); - - if (!recovery_data->request->ParseFromString(data_list[i])) { - LOG(ERROR) << "Parse from data fail"; - break; - } - - if (!recovery_data->context->signature.ParseFromString(data_list[i + 1])) { - LOG(ERROR) << "Parse from data fail"; - break; - } - - 
request_list.push_back(std::move(recovery_data)); - } - return request_list; + return static_cast(this)->ParseDataListItem(data_list); } -std::vector Recovery::ParseRawData(const std::string& data) { +template +std::vector RecoveryBase::ParseRawData(const std::string& data) { std::vector data_list; int pos = 0; while (pos < data.size()) { @@ -350,13 +233,15 @@ std::vector Recovery::ParseRawData(const std::string& data) { return data_list; } -void Recovery::MayFlush() { +template +void RecoveryBase::MayFlush() { if (buffer_.size() > buffer_size_) { Flush(); } } -void Recovery::Flush() { +template +void RecoveryBase::Flush() { size_t len = buffer_.size(); if (len == 0) { return; @@ -368,7 +253,8 @@ void Recovery::Flush() { fsync(fd_); } -void Recovery::Write(const char* data, size_t len) { +template +void RecoveryBase::Write(const char* data, size_t len) { int pos = 0; while (len > 0) { int write_len = write(fd_, data + pos, len); @@ -377,7 +263,8 @@ void Recovery::Write(const char* data, size_t len) { } } -bool Recovery::Read(int fd, size_t len, char* data) { +template +bool RecoveryBase::Read(int fd, size_t len, char* data) { int pos = 0; while (len > 0) { int read_len = read(fd, data + pos, len); @@ -390,8 +277,9 @@ bool Recovery::Read(int fd, size_t len, char* data) { return true; } +template std::pair>, int64_t> -Recovery::GetRecoveryFiles(int64_t ckpt) { +RecoveryBase::GetRecoveryFiles(int64_t ckpt) { std::string dir = std::filesystem::path(file_path_).parent_path(); int64_t last_ckpt = 0; for (const auto& entry : std::filesystem::directory_iterator(dir)) { @@ -443,113 +331,10 @@ Recovery::GetRecoveryFiles(int64_t ckpt) { return std::make_pair(list, last_ckpt); } -void Recovery::ReadLogs( - std::function system_callback, - std::function context, - std::unique_ptr request)> - call_back, - std::function set_start_point) { - if (recovery_enabled_ == false) { - return; - } - assert(storage_); - int64_t storage_ckpt = storage_->GetLastCheckpoint(); - LOG(ERROR) << " 
storage ckpt:" << storage_ckpt; - std::unique_lock lk(mutex_); - - auto recovery_files_pair = GetRecoveryFiles(storage_ckpt); - int64_t ckpt = recovery_files_pair.second; - if (set_start_point) { - set_start_point(ckpt); - } - int idx = 0; - for (auto path : recovery_files_pair.first) { - ReadLogsFromFiles(path.second, ckpt, idx++, system_callback, call_back); - } -} - -void Recovery::ReadLogsFromFiles( - const std::string& path, int64_t ckpt, int file_idx, - std::function system_callback, - std::function context, - std::unique_ptr request)> - call_back) { - int fd = open(path.c_str(), O_CREAT | O_RDONLY, 0666); - if (fd < 0) { - LOG(ERROR) << " open file fail:" << path; - } - LOG(INFO) << "read logs:" << path << " pos:" << lseek(fd, 0, SEEK_CUR); - assert(fd >= 0); - - size_t data_len = 0; - Read(fd, sizeof(data_len), reinterpret_cast(&data_len)); - { - std::string data; - char* buf = new char[data_len]; - if (!Read(fd, data_len, buf)) { - LOG(ERROR) << "Read system info fail"; - return; - } - data = std::string(buf, data_len); - delete buf; - std::vector data_list = ParseRawData(data); - - SystemInfoData info; - if (data_list.empty() || !info.ParseFromString(data_list[0])) { - LOG(ERROR) << "parse info fail:" << data.size(); - return; - } - LOG(ERROR) << "read system info:" << info.DebugString(); - system_callback(info); - } - - std::vector> request_list; - - while (Read(fd, sizeof(data_len), reinterpret_cast(&data_len))) { - std::string data; - char* buf = new char[data_len]; - if (!Read(fd, data_len, buf)) { - LOG(ERROR) << "Read data log fail"; - break; - } - data = std::string(buf, data_len); - delete buf; - - std::vector> list = ParseData(data); - if (list.size() == 0) { - request_list.clear(); - break; - } - for (auto& l : list) { - request_list.push_back(std::move(l)); - } - } - if (request_list.size() == 0) { - ftruncate(fd, 0); - } - uint64_t max_seq = 0; - for (std::unique_ptr& recovery_data : request_list) { - // LOG(ERROR)<<" ckpt :"<request->seq()<<" 
- // type:"<request->type(); - if (ckpt < recovery_data->request->seq() || - recovery_data->request->type() == Request::TYPE_NEWVIEW) { - recovery_data->request->set_is_recovery(true); - max_seq = recovery_data->request->seq(); - call_back(std::move(recovery_data->context), - std::move(recovery_data->request)); - } - } - - LOG(ERROR) << "read log from files:" << path << " done" - << " recovery max seq:" << max_seq; - - close(fd); -} - -int Recovery::GetData(const RecoveryRequest& request, +template +int RecoveryBase::GetData(const RecoveryRequest& request, RecoveryResponse& response) { - auto res = GetDataFromRecoveryFiles(request.min_seq(), request.max_seq()); + auto res = static_cast(this)->GetDataFromRecoveryFiles(request.min_seq(), request.max_seq()); for (const auto& it : res) { for (const auto& req : it.second) { @@ -560,10 +345,9 @@ int Recovery::GetData(const RecoveryRequest& request, return 0; } -std::map< - uint64_t, - std::vector, std::unique_ptr>>> -Recovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, +template +std::vector> +RecoveryBase::GetSortedRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq) { std::string dir = std::filesystem::path(file_path_).parent_path(); @@ -604,27 +388,7 @@ Recovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, sort(e_list.begin(), e_list.end()); list.push_back(e_list.back()); sort(list.begin(), list.end()); - - std::map, - std::unique_ptr>>> - res; - for (const auto& path : list) { - ReadLogsFromFiles( - path.second, need_min_seq - 1, 0, [&](const SystemInfoData& data) {}, - [&](std::unique_ptr context, - std::unique_ptr request) { - // LOG(ERROR) << "check get data from recovery file seq:" - // << request->seq(); - if (request->seq() >= need_min_seq && - request->seq() <= need_max_seq) { - LOG(ERROR) << "get data from recovery file seq:" << request->seq(); - res[request->seq()].push_back( - std::make_pair(std::move(context), std::move(request))); - } - }); - } - - return res; + return list; } -} // 
namespace resdb +// } // namespace resdb diff --git a/platform/consensus/recovery/recovery_template_functions.h b/platform/consensus/recovery/recovery_template_functions.h new file mode 100644 index 0000000000..15666c782d --- /dev/null +++ b/platform/consensus/recovery/recovery_template_functions.h @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +template +template +void RecoveryBase::ReadLogs(std::function system_callback, + TCallback call_back, + std::function set_start_point) { + if (recovery_enabled_ == false) { + return; + } + // assert(storage_); + int64_t storage_ckpt = storage_ ? 
storage_->GetLastCheckpoint() : -1; + LOG(ERROR) << " storage ckpt:" << storage_ckpt; + std::unique_lock lk(mutex_); + + auto recovery_files_pair = GetRecoveryFiles(storage_ckpt); + int64_t ckpt = recovery_files_pair.second; + if (set_start_point) { + set_start_point(ckpt); + } + int idx = 0; + for (auto path : recovery_files_pair.first) { + ReadLogsFromFiles(path.second, ckpt, idx++, system_callback, call_back); + } +} + +template +template +void RecoveryBase::SwitchFile(const std::string& file_path, TCallback call_back) { + std::unique_lock lk(mutex_); + + min_seq_ = -1; + max_seq_ = -1; + ReadLogsFromFiles( + file_path, 0, 0, + [&](const TSystemInfoData& data) {}, + call_back); + OpenFile(file_path); + LOG(INFO) << "switch to file:" << file_path << " seq:" + << "[" << min_seq_ << "," << max_seq_ << "]"; +} + +template +void RecoveryBase::OpenFile(const std::string& path) { + if (fd_ >= 0) { + close(fd_); + } + fd_ = open(path.c_str(), O_CREAT | O_WRONLY, 0666); + if (fd_ < 0) { + LOG(ERROR) << "open file fail:" << path << " error:" << strerror(errno); + } + + int pos = lseek(fd_, 0, SEEK_END); + LOG(INFO) << "file path:" << path << " len:" << pos << " fd:" << fd_; + + if (pos == 0) { + static_cast(this)->WriteSystemInfo(); + } + + lseek(fd_, 0, SEEK_END); + LOG(ERROR) << "open file:" << path << " pos:" << lseek(fd_, 0, SEEK_CUR) + << " fd:" << fd_; + assert(fd_ >= 0); +} + +template +template +void RecoveryBase::ReadLogsFromFiles( + const std::string& path, int64_t ckpt, int file_idx, + std::function system_callback, + TCallback call_back) { + int fd = open(path.c_str(), O_CREAT | O_RDONLY, 0666); + if (fd < 0) { + LOG(ERROR) << " open file fail:" << path; + } + LOG(INFO) << "read logs:" << path << " pos:" << lseek(fd, 0, SEEK_CUR); + assert(fd >= 0); + + size_t data_len = 0; + Read(fd, sizeof(data_len), reinterpret_cast(&data_len)); + { + std::string data; + char* buf = new char[data_len]; + if (!Read(fd, data_len, buf)) { + LOG(ERROR) << "Read system info 
fail"; + return; + } + data = std::string(buf, data_len); + delete buf; + std::vector data_list = ParseRawData(data); + + bool successful_callback = static_cast(this)->PerformSystemCallback(data_list, system_callback); + + if (!successful_callback) { + LOG(ERROR) << "parse info fail:" << data.size(); + } + } + + std::vector::RecoveryData>> request_list; + + while (Read(fd, sizeof(data_len), reinterpret_cast(&data_len))) { + std::string data; + char* buf = new char[data_len]; + if (!Read(fd, data_len, buf)) { + LOG(ERROR) << "Read data log fail"; + break; + } + data = std::string(buf, data_len); + delete buf; + + std::vector::RecoveryData>> list = ParseData(data); + if (list.size() == 0) { + request_list.clear(); + break; + } + for (auto& l : list) { + request_list.push_back(std::move(l)); + } + } + if (request_list.size() == 0) { + ftruncate(fd, 0); + } + + static_cast(this)->PerformCallback(request_list, call_back, ckpt); + + LOG(ERROR) << "read log from files:" << path << " done"; + close(fd); +} \ No newline at end of file diff --git a/platform/consensus/recovery/recovery_test.cpp b/platform/consensus/recovery/recovery_test.cpp index 6c0dd25b56..ffe15f1251 100644 --- a/platform/consensus/recovery/recovery_test.cpp +++ b/platform/consensus/recovery/recovery_test.cpp @@ -17,7 +17,7 @@ * under the License. 
*/ -#include "platform/consensus/recovery/recovery.h" +#include "platform/consensus/recovery/pbft_recovery.h" #include #include @@ -87,7 +87,7 @@ TEST_F(RecoveryTest, ReadLog) { }; { - Recovery recovery(config_, &checkpoint_, &system_info_, nullptr); + PBFTRecovery recovery(config_, &checkpoint_, &system_info_, nullptr); for (int t : types) { std::unique_ptr request = @@ -98,12 +98,19 @@ TEST_F(RecoveryTest, ReadLog) { } { std::vector list; - Recovery recovery(config_, &checkpoint_, &system_info_, nullptr); - recovery.ReadLogs( + PBFTRecovery recovery(config_, &checkpoint_, &system_info_, nullptr); + + std::function, std::unique_ptr)> call_back = + [&](std::unique_ptr context, std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; + + recovery.ReadLogs( [&](const SystemInfoData &data) {}, - [&](std::unique_ptr context, - std::unique_ptr request) { list.push_back(*request); }, - nullptr); + call_back, + nullptr + ); EXPECT_EQ(list.size(), expected_types.size()); @@ -127,7 +134,7 @@ TEST_F(RecoveryTest, ReadLog_FlushOnce) { }; { - Recovery recovery(config, &checkpoint_, &system_info_, nullptr); + PBFTRecovery recovery(config, &checkpoint_, &system_info_, nullptr); for (int t : types) { std::unique_ptr request = @@ -138,14 +145,19 @@ TEST_F(RecoveryTest, ReadLog_FlushOnce) { } { std::vector list; - Recovery recovery(config, &checkpoint_, &system_info_, nullptr); - recovery.ReadLogs([&](const SystemInfoData &data) {}, - [&](std::unique_ptr context, - std::unique_ptr request) { - LOG(ERROR) << "call back:" << request->seq(); - list.push_back(*request); - }, - nullptr); + PBFTRecovery recovery(config, &checkpoint_, &system_info_, nullptr); + + std::function, std::unique_ptr)> call_back = + [&](std::unique_ptr context, std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; + + recovery.ReadLogs( + [&](const SystemInfoData &data) {}, + call_back, + nullptr + ); 
EXPECT_EQ(list.size(), expected_types.size()); @@ -180,7 +192,7 @@ TEST_F(RecoveryTest, CheckPoint) { })); { - Recovery recovery(config, &checkpoint_, &system_info_, nullptr); + PBFTRecovery recovery(config, &checkpoint_, &system_info_, nullptr); for (int i = 1; i < 10; ++i) { for (int t : types) { @@ -205,14 +217,19 @@ TEST_F(RecoveryTest, CheckPoint) { EXPECT_EQ(log_list.size(), 2); { std::vector list; - Recovery recovery(config, &checkpoint_, &system_info_, nullptr); - recovery.ReadLogs([&](const SystemInfoData &data) {}, - [&](std::unique_ptr context, - std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR)<<"call back:"<seq(); - }, - nullptr); + PBFTRecovery recovery(config, &checkpoint_, &system_info_, nullptr); + + std::function, std::unique_ptr)> call_back = + [&](std::unique_ptr context, std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; + + recovery.ReadLogs( + [&](const SystemInfoData &data) {}, + call_back, + nullptr + ); EXPECT_EQ(list.size(), types.size() * 14); @@ -258,7 +275,7 @@ TEST_F(RecoveryTest, CheckPoint2) { })); { - Recovery recovery(config, &checkpoint_, &system_info_, &storage); + PBFTRecovery recovery(config, &checkpoint_, &system_info_, &storage); for (int i = 1; i < 10; ++i) { for (int t : types) { @@ -283,14 +300,19 @@ TEST_F(RecoveryTest, CheckPoint2) { EXPECT_EQ(log_list.size(), 2); { std::vector list; - Recovery recovery(config, &checkpoint_, &system_info_, &storage); - recovery.ReadLogs([&](const SystemInfoData &data) {}, - [&](std::unique_ptr context, - std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR)<<"call back:"<seq(); - }, - nullptr); + PBFTRecovery recovery(config, &checkpoint_, &system_info_, &storage); + + std::function, std::unique_ptr)> call_back = + [&](std::unique_ptr context, std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; + + recovery.ReadLogs( + [&](const 
SystemInfoData &data) {}, + call_back, + nullptr + ); EXPECT_EQ(list.size(), types.size() * 14); @@ -321,14 +343,19 @@ TEST_F(RecoveryTest, CheckPoint2) { { std::vector list; - Recovery recovery(config, &checkpoint_, &system_info_, &storage); - recovery.ReadLogs([&](const SystemInfoData &data) {}, - [&](std::unique_ptr context, - std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR)<<"call back:"<seq(); - }, - nullptr); + PBFTRecovery recovery(config, &checkpoint_, &system_info_, &storage); + + std::function, std::unique_ptr)> call_back = + [&](std::unique_ptr context, std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; + + recovery.ReadLogs( + [&](const SystemInfoData &data) {}, + call_back, + nullptr + ); EXPECT_EQ(list.size(), types.size() * 9); @@ -376,7 +403,7 @@ TEST_F(RecoveryTest, SystemInfo) { })); { - Recovery recovery(config, &checkpoint_, &system_info_, &storage); + PBFTRecovery recovery(config, &checkpoint_, &system_info_, &storage); system_info_.SetCurrentView(2); system_info_.SetPrimary(2); @@ -404,14 +431,19 @@ TEST_F(RecoveryTest, SystemInfo) { { std::vector list; SystemInfoData data; - Recovery recovery(config, &checkpoint_, &system_info_, &storage); - recovery.ReadLogs([&](const SystemInfoData &r_data) { data = r_data; }, - [&](std::unique_ptr context, - std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR)<<"call back:"<seq(); - }, - nullptr); + PBFTRecovery recovery(config, &checkpoint_, &system_info_, &storage); + + std::function, std::unique_ptr)> call_back = + [&](std::unique_ptr context, std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; + + recovery.ReadLogs( + [&](const SystemInfoData &r_data) {data = r_data;}, + call_back, + nullptr + ); EXPECT_EQ(list.size(), types.size() * 14); @@ -443,14 +475,19 @@ TEST_F(RecoveryTest, SystemInfo) { { std::vector list; SystemInfoData data; - Recovery 
recovery(config, &checkpoint_, &system_info_, &storage); - recovery.ReadLogs([&](const SystemInfoData &r_data) { data = r_data; }, - [&](std::unique_ptr context, - std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR)<<"call back:"<seq(); - }, - nullptr); + PBFTRecovery recovery(config, &checkpoint_, &system_info_, &storage); + + std::function, std::unique_ptr)> call_back = + [&](std::unique_ptr context, std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; + + recovery.ReadLogs( + [&](const SystemInfoData &r_data) {data = r_data;}, + call_back, + nullptr + ); EXPECT_EQ(data.view(), 2); EXPECT_EQ(data.primary_id(), 2); From 89ed7fdd3f1b209bba3dad7a4d5aa129abee2639 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 1 Apr 2026 13:47:19 -0700 Subject: [PATCH 55/66] WIP Update RaftRecovery and tests, tentatively working --- .../ordering/common/algorithm/protocol_base.h | 4 +- .../ordering/raft/algorithm/raft.cpp | 20 ++++- .../consensus/ordering/raft/algorithm/raft.h | 1 + .../ordering/raft/framework/consensus.cpp | 7 +- .../ordering/raft/framework/raft_recovery.cpp | 83 +++++++------------ .../ordering/raft/framework/raft_recovery.h | 9 +- .../raft/framework/raft_recovery_test.cpp | 25 ++---- platform/consensus/recovery/pbft_recovery.cpp | 15 +++- platform/consensus/recovery/pbft_recovery.h | 8 +- platform/consensus/recovery/recovery.h | 13 ++- platform/consensus/recovery/recovery_impl.h | 17 +--- .../recovery/recovery_template_functions.h | 46 +++++----- platform/consensus/recovery/recovery_test.cpp | 5 +- scripts/deploy/config/pbft.config | 8 +- scripts/deploy/config/raft.config | 12 +-- 15 files changed, 132 insertions(+), 141 deletions(-) diff --git a/platform/consensus/ordering/common/algorithm/protocol_base.h b/platform/consensus/ordering/common/algorithm/protocol_base.h index d180746bda..f8e47052a2 100644 --- a/platform/consensus/ordering/common/algorithm/protocol_base.h +++ 
b/platform/consensus/ordering/common/algorithm/protocol_base.h @@ -63,9 +63,9 @@ class ProtocolBase { } protected: - virtual int SendMessage(int msg_type, const google::protobuf::Message& msg, + int SendMessage(int msg_type, const google::protobuf::Message& msg, int node_id); - virtual int Broadcast(int msg_type, const google::protobuf::Message& msg); + int Broadcast(int msg_type, const google::protobuf::Message& msg); int Commit(const google::protobuf::Message& msg); bool IsStop(); diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index e2a6618ee9..b7cf23a082 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -221,9 +221,11 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { // append remaining entries const auto appendSize = entriesSize - entriesIdx; log_.reserve(log_.size() + appendSize); + std::vector log_entries_to_add; for (uint64_t i = entriesIdx; i < entriesSize; ++i) { - AddToLog(CreateLogEntry(ae->entries(i))); + log_entries_to_add.push_back(CreateLogEntry(ae->entries(i))); } + AddToLog(log_entries_to_add); // update lastLogIndex after appends uint64_t firstAppendIdx = lastLogIndex_ + 1; lastLogIndex_ = log_.size() - 1; @@ -863,6 +865,22 @@ void Raft::AddToLog(LogEntry logEntryToAdd, bool writeMetadata) { log_.push_back(logEntryToAdd); } +void Raft::AddToLog(std::vector logEntriesToAdd, bool writeMetadata) { + if (writeMetadata) { + std::vector entries_to_add; + for (const auto &entry : logEntriesToAdd) { + entries_to_add.push_back(entry.entry); + } + + recovery_->AddLogEntry(entries_to_add); + } + + log_.reserve(log_.size() + logEntriesToAdd.size()); + log_.insert(log_.end(), + std::make_move_iterator(logEntriesToAdd.begin()), + std::make_move_iterator(logEntriesToAdd.end())); +} + void Raft::TruncateLog(std::vector::iterator first, std::vector::iterator last, bool writeMetadata) { diff --git 
a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index b89f0d8248..6ae3c11f08 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -117,6 +117,7 @@ class Raft : public common::ProtocolBase { virtual void SetVotedFor(int votedFor, bool writeMetadata = true); virtual void SetSeqIndexCoveredBySnapshot(int seq); void AddToLog(LogEntry logEntry, bool writeMetadata = true); + void AddToLog(std::vector logEntriesToAdd, bool writeMetadata = true); void TruncateLog(std::vector::iterator first, std::vector::iterator last, bool writeMetadata = true); diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 4e4d53f0d3..10f6207f34 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -122,12 +122,15 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { void Consensus::RecoverFromLogs() { recovery_->ReadLogs( [&](const RaftMetadata& metadata) { - LOG(ERROR) << " read current term: " << metadata.current_term + LOG(INFO) << " read current term: " << metadata.current_term << " voted for: " << metadata.voted_for; raft_->SetCurrentTerm(metadata.current_term, false); raft_->SetVotedFor(metadata.voted_for, false); }, - [&](std::unique_ptr request) { + [&](std::unique_ptr entry) { + auto request = std::make_unique(); + if (!request->ParseFromString(entry->command())) + LOG(ERROR) << "Error parsing entry in Recovery"; return CommitMsg(*request); }, [&](int seq) { raft_->SetSeqIndexCoveredBySnapshot(seq); }); diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.cpp b/platform/consensus/ordering/raft/framework/raft_recovery.cpp index a99e9b9cb4..1ca20a1fff 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery.cpp +++ 
b/platform/consensus/ordering/raft/framework/raft_recovery.cpp @@ -35,7 +35,7 @@ namespace resdb { namespace raft { -using CallbackType = std::function)>; +using CallbackType = std::function)>; RaftRecovery::RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage) : RecoveryBase(config, checkpoint, storage) { @@ -52,11 +52,11 @@ void RaftRecovery::Init() { GetLastFile(); CallbackType callback = - [this](std::unique_ptr request) { + [this](std::unique_ptr entry) { min_seq_ == -1 - ? min_seq_ = request->seq() - : std::min(min_seq_, static_cast(request->seq())); - max_seq_ = std::max(max_seq_, static_cast(request->seq())); + ? min_seq_ = entry->term() + : std::min(min_seq_, static_cast(entry->term())); + max_seq_ = std::max(max_seq_, static_cast(entry->term())); }; SwitchFile(file_path_, callback); @@ -136,7 +136,19 @@ void RaftRecovery::AddLogEntry(const Entry* entry) { if (recovery_enabled_ == false) { return; } - return WriteLog(entry); + + WriteLog(entry); + Flush(); +} + +void RaftRecovery::AddLogEntry(std::vector &entries_to_add) { + if (recovery_enabled_ == false) { + return; + } + for (const auto &entry : entries_to_add) { + WriteLog(&entry); + } + Flush(); } void RaftRecovery::WriteLog(const Entry* entry) { @@ -152,42 +164,36 @@ LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __ : std::min(min_seq_, static_cast(entry->term())); max_seq_ = std::max(max_seq_, static_cast(entry->term())); AppendData(data); - - Flush(); } -std::vector::RecoveryData>> RaftRecovery::ParseDataListItem( +std::vector> RaftRecovery::ParseDataListItem( std::vector &data_list) { - std::vector> request_list; + std::vector> request_list; - for (size_t i = 0; i < data_list.size(); i += 2) { - std::unique_ptr recovery_data = - std::make_unique(); - recovery_data->request = std::make_unique(); + for (size_t i = 0; i < data_list.size(); i++) { + std::unique_ptr entry = std::make_unique(); - if 
(!recovery_data->request->ParseFromString(data_list[i])) { + if (!entry->ParseFromString(data_list[i])) { LOG(ERROR) << "Parse from data fail"; break; } - request_list.push_back(std::move(recovery_data)); + request_list.push_back(std::move(entry)); } return request_list; } void RaftRecovery::PerformCallback( - std::vector> &request_list, + std::vector> &request_list, CallbackType call_back, int64_t ckpt) { uint64_t max_seq = 0; - for (std::unique_ptr& recovery_data : request_list) { + for (std::unique_ptr& entry : request_list) { // LOG(ERROR)<<" ckpt :"<request->seq()<<" // type:"<request->type(); - if (ckpt < recovery_data->request->seq() || - recovery_data->request->type() == Request::TYPE_NEWVIEW) { - recovery_data->request->set_is_recovery(true); - max_seq = recovery_data->request->seq(); - call_back(std::move(recovery_data->request)); + if (ckpt < entry->term()) { + max_seq = entry->term(); + call_back(std::move(entry)); } } @@ -200,37 +206,6 @@ bool RaftRecovery::PerformSystemCallback(std::vector data_list, std return true; } -std::map< - uint64_t, - std::vector, std::unique_ptr>>> -RaftRecovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, - uint64_t need_max_seq) { - auto list = GetSortedRecoveryFiles(need_min_seq, need_max_seq); - - std::map, - std::unique_ptr>>> - res; - std::function system_cb = [&](const RaftMetadata&) {}; - for (const auto& path : list) { - CallbackType callback = - [&](std::unique_ptr request) { - if (request->seq() >= need_min_seq && - request->seq() <= need_max_seq) { - LOG(ERROR) << "get data from recovery file seq:" << request->seq(); - res[request->seq()].push_back( - std::make_pair(nullptr, std::move(request))); - } - }; - - this->template ReadLogsFromFiles( - path.second, need_min_seq - 1, 0, - system_cb, // system callback - callback); // typed callback - } - - return res; -} - } // namespace raft template class RecoveryBase; diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.h 
b/platform/consensus/ordering/raft/framework/raft_recovery.h index 5168c3aaf0..65705b399c 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery.h +++ b/platform/consensus/ordering/raft/framework/raft_recovery.h @@ -50,19 +50,18 @@ class RaftRecovery : public RecoveryBase { void Init(); void WriteMetadata(int64_t current_term, int32_t voted_for); void AddLogEntry(const Entry* entry); - std::map, std::unique_ptr>>> - GetDataFromRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); + void AddLogEntry(std::vector &entries_to_add); private: void OpenMetadataFile(); void WriteSystemInfo(); - std::vector> ParseDataListItem( + std::vector> ParseDataListItem( std::vector &data_list); void WriteLog(const Entry* entry); void PerformCallback( - std::vector> &request_list, - std::function request)> + std::vector> &request_list, + std::function entry)> call_back, int64_t ckpt); bool PerformSystemCallback(std::vector data_list, std::function system_callback); diff --git a/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp b/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp index e22497cbb2..beee10eaae 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp +++ b/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp @@ -57,25 +57,16 @@ class RaftRecoveryTest : public Test { }; TEST_F(RaftRecoveryTest, ReadLog) { - std::vector types = {Request::TYPE_PRE_PREPARE, Request::TYPE_PREPARE, - Request::TYPE_COMMIT, Request::TYPE_CHECKPOINT, - Request::TYPE_NEWVIEW, Request::TYPE_NEW_TXNS}; - - std::vector expected_types = { - Request::TYPE_PRE_PREPARE, Request::TYPE_PREPARE, Request::TYPE_COMMIT, - Request::TYPE_CHECKPOINT, Request::TYPE_NEWVIEW, - }; - int entries_to_add = 3; { - RaftRecovery recovery(config_, &checkpoint_, &system_info_, nullptr); + RaftRecovery recovery(config_, &checkpoint_, nullptr); for (int i = 0; i < entries_to_add; i++) { // Set up the Log Entry to be added Entry logEntry; - 
logEntry.set_term(i); + logEntry.set_term(i + 1); auto req = std::make_unique(); - req->set_seq(i); + req->set_seq(i + 1); std::string serialized; if (!req->SerializeToString(&serialized)) { assert(false); @@ -86,17 +77,17 @@ TEST_F(RaftRecoveryTest, ReadLog) { } } { - std::vector list; - RaftRecovery recovery(config_, &checkpoint_, &system_info_, nullptr); - recovery.ReadLogs( + std::vector list; + RaftRecovery recovery(config_, &checkpoint_, nullptr); + recovery.ReadLogs( [&](const RaftMetadata &data) {}, - [&](std::unique_ptr request) { list.push_back(*request); }, + [&](std::unique_ptr entry) { list.push_back(*entry); }, nullptr); EXPECT_EQ(list.size(), entries_to_add); for (size_t i = 0; i < entries_to_add; ++i) { - EXPECT_EQ(list[i].seq(), i); + EXPECT_EQ(list[i].term(), i + 1); } } } diff --git a/platform/consensus/recovery/pbft_recovery.cpp b/platform/consensus/recovery/pbft_recovery.cpp index 492395bd23..18524c728c 100644 --- a/platform/consensus/recovery/pbft_recovery.cpp +++ b/platform/consensus/recovery/pbft_recovery.cpp @@ -119,7 +119,7 @@ void PBFTRecovery::WriteLog(const Context* context, const Request* request) { Flush(); } -std::vector::RecoveryData>> PBFTRecovery::ParseDataListItem( +std::vector> PBFTRecovery::ParseDataListItem( std::vector &data_list) { std::vector> request_list; @@ -205,6 +205,19 @@ PBFTRecovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, return res; } +int PBFTRecovery::GetData(const RecoveryRequest& request, + RecoveryResponse& response) { + auto res = GetDataFromRecoveryFiles(request.min_seq(), request.max_seq()); + + for (const auto& it : res) { + for (const auto& req : it.second) { + *response.add_signature() = req.first->signature; + *response.add_request() = *req.second; + } + } + return 0; +} + template class RecoveryBase; diff --git a/platform/consensus/recovery/pbft_recovery.h b/platform/consensus/recovery/pbft_recovery.h index 556e5b157e..1b47b172cc 100644 --- a/platform/consensus/recovery/pbft_recovery.h +++ 
b/platform/consensus/recovery/pbft_recovery.h @@ -20,7 +20,6 @@ #pragma once #include "platform/consensus/recovery/recovery.h" -#include "platform/proto/system_info_data.pb.h" #include "platform/consensus/execution/system_info.h" namespace resdb { @@ -38,7 +37,14 @@ class PBFTRecovery : public RecoveryBase { std::unique_ptr>>> GetDataFromRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); + int GetData(const RecoveryRequest& request, RecoveryResponse& response); + private: + struct RecoveryData { + std::unique_ptr context; + std::unique_ptr request; + }; + void Init(); void WriteLog(const Context* context, const Request* request); void WriteSystemInfo(); diff --git a/platform/consensus/recovery/recovery.h b/platform/consensus/recovery/recovery.h index 7c5e2abaa4..d98790d308 100644 --- a/platform/consensus/recovery/recovery.h +++ b/platform/consensus/recovery/recovery.h @@ -26,6 +26,7 @@ #include "platform/consensus/checkpoint/checkpoint.h" #include "platform/networkstrate/server_comm.h" #include "platform/proto/resdb.pb.h" +#include "platform/proto/system_info_data.pb.h" #include #include #include @@ -55,20 +56,16 @@ class RecoveryBase { int64_t GetMaxSeq(); int64_t GetMinSeq(); - int GetData(const RecoveryRequest& request, RecoveryResponse& response); + // int GetData(const RecoveryRequest& request, RecoveryResponse& response); protected: std::vector> GetSortedRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); - struct RecoveryData { - std::unique_ptr context; - std::unique_ptr request; - }; private: void WriteLog(const Context* context, const Request* request); - std::vector> ParseData(const std::string& data); + auto ParseData(const std::string& data); std::vector ParseRawData(const std::string& data); void MayFlush(); @@ -103,10 +100,10 @@ class RecoveryBase { std::string file_path_; ResDBConfig config_; // Derived class must implement these - std::vector> ParseDataListItem(std::vector& data_list); + auto ParseDataListItem(std::vector& 
data_list); template - void PerformCallback(std::vector>& request_list, TCallback call_back); + void PerformCallback(auto& request_list, TCallback call_back); void WriteSystemInfo(); diff --git a/platform/consensus/recovery/recovery_impl.h b/platform/consensus/recovery/recovery_impl.h index 6bf044877f..2f374a2a92 100644 --- a/platform/consensus/recovery/recovery_impl.h +++ b/platform/consensus/recovery/recovery_impl.h @@ -199,9 +199,7 @@ void RecoveryBase::AppendData(const std::string& data) { } template -std::vector::RecoveryData>> RecoveryBase::ParseData( - const std::string& data) { - +auto RecoveryBase::ParseData(const std::string& data) { std::vector data_list; int pos = 0; while (pos < data.size()) { @@ -331,19 +329,6 @@ RecoveryBase::GetRecoveryFiles(int64_t ckpt) { return std::make_pair(list, last_ckpt); } -template -int RecoveryBase::GetData(const RecoveryRequest& request, - RecoveryResponse& response) { - auto res = static_cast(this)->GetDataFromRecoveryFiles(request.min_seq(), request.max_seq()); - - for (const auto& it : res) { - for (const auto& req : it.second) { - *response.add_signature() = req.first->signature; - *response.add_request() = *req.second; - } - } - return 0; -} template std::vector> diff --git a/platform/consensus/recovery/recovery_template_functions.h b/platform/consensus/recovery/recovery_template_functions.h index 15666c782d..94a8f544c4 100644 --- a/platform/consensus/recovery/recovery_template_functions.h +++ b/platform/consensus/recovery/recovery_template_functions.h @@ -27,9 +27,11 @@ void RecoveryBase::ReadLogs(std::functionGetLastCheckpoint() : -1; - LOG(ERROR) << " storage ckpt:" << storage_ckpt; + + int64_t storage_ckpt = 0; + if(storage_) { + storage_ckpt = storage_->GetLastCheckpoint(); + } std::unique_lock lk(mutex_); auto recovery_files_pair = GetRecoveryFiles(storage_ckpt); @@ -96,26 +98,28 @@ void RecoveryBase::ReadLogsFromFiles( assert(fd >= 0); size_t data_len = 0; - Read(fd, sizeof(data_len), 
reinterpret_cast(&data_len)); - { - std::string data; - char* buf = new char[data_len]; - if (!Read(fd, data_len, buf)) { - LOG(ERROR) << "Read system info fail"; - return; - } - data = std::string(buf, data_len); - delete buf; - std::vector data_list = ParseRawData(data); - - bool successful_callback = static_cast(this)->PerformSystemCallback(data_list, system_callback); - - if (!successful_callback) { - LOG(ERROR) << "parse info fail:" << data.size(); + if constexpr (std::is_same_v) { + Read(fd, sizeof(data_len), reinterpret_cast(&data_len)); + { + std::string data; + char* buf = new char[data_len]; + if (!Read(fd, data_len, buf)) { + LOG(ERROR) << "Read system info fail"; + return; + } + data = std::string(buf, data_len); + delete buf; + std::vector data_list = ParseRawData(data); + + bool successful_callback = static_cast(this)->PerformSystemCallback(data_list, system_callback); + + if (!successful_callback) { + LOG(ERROR) << "parse info fail:" << data.size(); + } } } - std::vector::RecoveryData>> request_list; + decltype(ParseData(std::string{})) request_list; while (Read(fd, sizeof(data_len), reinterpret_cast(&data_len))) { std::string data; @@ -127,7 +131,7 @@ void RecoveryBase::ReadLogsFromFiles( data = std::string(buf, data_len); delete buf; - std::vector::RecoveryData>> list = ParseData(data); + auto list = ParseData(data); if (list.size() == 0) { request_list.clear(); break; diff --git a/platform/consensus/recovery/recovery_test.cpp b/platform/consensus/recovery/recovery_test.cpp index ffe15f1251..2181996249 100644 --- a/platform/consensus/recovery/recovery_test.cpp +++ b/platform/consensus/recovery/recovery_test.cpp @@ -83,7 +83,7 @@ TEST_F(RecoveryTest, ReadLog) { std::vector expected_types = { Request::TYPE_PRE_PREPARE, Request::TYPE_PREPARE, Request::TYPE_COMMIT, - Request::TYPE_CHECKPOINT, Request::TYPE_NEWVIEW, + Request::TYPE_NEWVIEW, }; { @@ -130,7 +130,7 @@ TEST_F(RecoveryTest, ReadLog_FlushOnce) { std::vector expected_types = { 
Request::TYPE_PRE_PREPARE, Request::TYPE_PREPARE, Request::TYPE_COMMIT, - Request::TYPE_CHECKPOINT, Request::TYPE_NEWVIEW, + Request::TYPE_NEWVIEW, }; { @@ -150,7 +150,6 @@ TEST_F(RecoveryTest, ReadLog_FlushOnce) { std::function, std::unique_ptr)> call_back = [&](std::unique_ptr context, std::unique_ptr request) { list.push_back(*request); - // LOG(ERROR) << "call back:" << request->seq(); }; recovery.ReadLogs( diff --git a/scripts/deploy/config/pbft.config b/scripts/deploy/config/pbft.config index 1de013abf3..e13d03df63 100644 --- a/scripts/deploy/config/pbft.config +++ b/scripts/deploy/config/pbft.config @@ -22,8 +22,8 @@ "enable_viewchange": true, "recovery_enabled": true, "max_client_complaint_num":10, - "max_process_txn": 2048, - "worker_num": 2, - "input_worker_num": 1, - "output_worker_num": 10 + "max_process_txn": 64, + "worker_num": 16, + "input_worker_num": 5, + "output_worker_num": 5 } diff --git a/scripts/deploy/config/raft.config b/scripts/deploy/config/raft.config index 8d30825870..d24bb8493b 100644 --- a/scripts/deploy/config/raft.config +++ b/scripts/deploy/config/raft.config @@ -1,11 +1,11 @@ { - "clientBatchNum": 30, + "clientBatchNum": 100, "enable_viewchange": true, - "recovery_enabled": false, + "recovery_enabled": true, "not_need_signature": true, "max_client_complaint_num":10, - "max_process_txn": 512, - "worker_num": 1, - "input_worker_num": 1, - "output_worker_num": 10 + "max_process_txn": 64, + "worker_num": 16, + "input_worker_num": 5, + "output_worker_num": 5 } From db4689b548ef937fa6eaaa895a19c5aaf7450968 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 1 Apr 2026 13:51:02 -0700 Subject: [PATCH 56/66] Run clang format --- .../ordering/pbft/consensus_manager_pbft.cpp | 6 +- .../ordering/raft/algorithm/raft.cpp | 7 +- .../consensus/ordering/raft/algorithm/raft.h | 6 +- .../ordering/raft/algorithm/recovery_tests.h | 7 +- .../ordering/raft/framework/consensus.cpp | 6 +- .../ordering/raft/framework/raft_recovery.cpp | 65 +++++----- 
.../ordering/raft/framework/raft_recovery.h | 20 +-- .../raft/framework/raft_recovery_test.cpp | 35 +++-- platform/consensus/recovery/pbft_recovery.cpp | 73 +++++------ platform/consensus/recovery/pbft_recovery.h | 21 +-- platform/consensus/recovery/recovery.h | 46 +++---- platform/consensus/recovery/recovery_impl.h | 57 ++++---- .../recovery/recovery_template_functions.h | 36 +++--- platform/consensus/recovery/recovery_test.cpp | 122 ++++++++---------- 14 files changed, 256 insertions(+), 251 deletions(-) diff --git a/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp b/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp index a05291fa86..b4b625d4ca 100644 --- a/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp +++ b/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp @@ -53,9 +53,9 @@ ConsensusManagerPBFT::ConsensusManagerPBFT( view_change_manager_(std::make_unique( config_, checkpoint_manager_.get(), message_manager_.get(), system_info_.get(), GetBroadCastClient(), GetSignatureVerifier())), - recovery_(std::make_unique(config_, checkpoint_manager_.get(), - system_info_.get(), - message_manager_->GetStorage())), + recovery_(std::make_unique( + config_, checkpoint_manager_.get(), system_info_.get(), + message_manager_->GetStorage())), query_(std::make_unique(config_, recovery_.get(), std::move(query_executor))) { LOG(INFO) << "is running is performance mode:" diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index b7cf23a082..f3d2b5fcf8 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -868,16 +868,15 @@ void Raft::AddToLog(LogEntry logEntryToAdd, bool writeMetadata) { void Raft::AddToLog(std::vector logEntriesToAdd, bool writeMetadata) { if (writeMetadata) { std::vector entries_to_add; - for (const auto &entry : logEntriesToAdd) { + for (const auto& entry : logEntriesToAdd) { 
entries_to_add.push_back(entry.entry); } - + recovery_->AddLogEntry(entries_to_add); } log_.reserve(log_.size() + logEntriesToAdd.size()); - log_.insert(log_.end(), - std::make_move_iterator(logEntriesToAdd.begin()), + log_.insert(log_.end(), std::make_move_iterator(logEntriesToAdd.begin()), std::make_move_iterator(logEntriesToAdd.end())); } diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 6ae3c11f08..83fc230640 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -117,7 +117,8 @@ class Raft : public common::ProtocolBase { virtual void SetVotedFor(int votedFor, bool writeMetadata = true); virtual void SetSeqIndexCoveredBySnapshot(int seq); void AddToLog(LogEntry logEntry, bool writeMetadata = true); - void AddToLog(std::vector logEntriesToAdd, bool writeMetadata = true); + void AddToLog(std::vector logEntriesToAdd, + bool writeMetadata = true); void TruncateLog(std::vector::iterator first, std::vector::iterator last, bool writeMetadata = true); @@ -227,7 +228,8 @@ class Raft : public common::ProtocolBase { const auto& entry = log_[i]; os << " [" << i << "] " - << "term=" << entry.entry.term() << ", command=\"" << entry.entry.command() << "\"" + << "term=" << entry.entry.term() << ", command=\"" + << entry.entry.command() << "\"" << ", serializedSize=" << entry.GetSerializedSize() << "\n"; } } diff --git a/platform/consensus/ordering/raft/algorithm/recovery_tests.h b/platform/consensus/ordering/raft/algorithm/recovery_tests.h index 38fb7e6dcb..409344fb77 100644 --- a/platform/consensus/ordering/raft/algorithm/recovery_tests.h +++ b/platform/consensus/ordering/raft/algorithm/recovery_tests.h @@ -47,7 +47,9 @@ class RecoveryTest : public ::testing::Test { leader_election_manager_ = std::make_unique(config); replica_communicator_ = std::make_unique(); - recovery_ = std::make_unique(config, CheckPoint* checkpoint, SystemInfo* 
system_info, Storage* storage); + recovery_ = std::make_unique( + config, CheckPoint * checkpoint, SystemInfo * system_info, + Storage * storage); raft_ = std::make_unique( /*id=*/1, /*f=*/1, @@ -92,8 +94,7 @@ class RecoveryTest : public ::testing::Test { }; // Helper to create a single log entry. - LogEntry CreateLogEntry(uint64_t term, - const std::string& command_data) { + LogEntry CreateLogEntry(uint64_t term, const std::string& command_data) { LogEntry entry; entry.term = term; entry.command = command_data; diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 10f6207f34..197b842989 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -57,7 +57,7 @@ Consensus::Consensus(const ResDBConfig& config, leader_election_manager_->MayStart(); RecoverFromLogs(); - + InitProtocol(raft_.get()); } } @@ -123,14 +123,14 @@ void Consensus::RecoverFromLogs() { recovery_->ReadLogs( [&](const RaftMetadata& metadata) { LOG(INFO) << " read current term: " << metadata.current_term - << " voted for: " << metadata.voted_for; + << " voted for: " << metadata.voted_for; raft_->SetCurrentTerm(metadata.current_term, false); raft_->SetVotedFor(metadata.voted_for, false); }, [&](std::unique_ptr entry) { auto request = std::make_unique(); if (!request->ParseFromString(entry->command())) - LOG(ERROR) << "Error parsing entry in Recovery"; + LOG(ERROR) << "Error parsing entry in Recovery"; return CommitMsg(*request); }, [&](int seq) { raft_->SetSeqIndexCoveredBySnapshot(seq); }); diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.cpp b/platform/consensus/ordering/raft/framework/raft_recovery.cpp index 1ca20a1fff..112bbec083 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery.cpp +++ b/platform/consensus/ordering/raft/framework/raft_recovery.cpp @@ -37,7 +37,8 @@ namespace raft { using CallbackType 
= std::function)>; -RaftRecovery::RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage) +RaftRecovery::RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, + Storage* storage) : RecoveryBase(config, checkpoint, storage) { Init(); } @@ -51,27 +52,26 @@ void RaftRecovery::Init() { LOG(ERROR) << " init"; GetLastFile(); - CallbackType callback = - [this](std::unique_ptr entry) { - min_seq_ == -1 - ? min_seq_ = entry->term() - : std::min(min_seq_, static_cast(entry->term())); - max_seq_ = std::max(max_seq_, static_cast(entry->term())); - }; + CallbackType callback = [this](std::unique_ptr entry) { + min_seq_ == -1 ? min_seq_ = entry->term() + : std::min(min_seq_, static_cast(entry->term())); + max_seq_ = std::max(max_seq_, static_cast(entry->term())); + }; SwitchFile(file_path_, callback); LOG(ERROR) << " init done"; - + meta_file_path_ = std::filesystem::path(base_file_path_).parent_path() / "raft_metadata.dat"; LOG(INFO) << "Meta file path: " << meta_file_path_; OpenMetadataFile(); - ckpt_thread_ = std::thread([this]{ this->UpdateStableCheckPoint(); }); + ckpt_thread_ = std::thread([this] { this->UpdateStableCheckPoint(); }); } RaftRecovery::~RaftRecovery() { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " + << __func__ << "\n"; if (recovery_enabled_ == false) { return; } @@ -82,7 +82,8 @@ RaftRecovery::~RaftRecovery() { } void RaftRecovery::OpenMetadataFile() { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " + << __func__ << "\n"; metadata_fd_ = open(meta_file_path_.c_str(), O_CREAT | O_RDWR, 0666); if (metadata_fd_ < 0) { LOG(ERROR) << "Failed to open metadata file: " << strerror(errno); @@ -114,7 +115,8 @@ void RaftRecovery::WriteMetadata(int64_t current_term, int32_t 
voted_for) { } RaftMetadata RaftRecovery::ReadMetadata() { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " + << __func__ << "\n"; RaftMetadata metadata; if (metadata_fd_ < 0) { LOG(ERROR) << "Metadata file not open"; @@ -130,7 +132,7 @@ RaftMetadata RaftRecovery::ReadMetadata() { return metadata; } -void RaftRecovery::WriteSystemInfo() { } +void RaftRecovery::WriteSystemInfo() {} void RaftRecovery::AddLogEntry(const Entry* entry) { if (recovery_enabled_ == false) { @@ -141,18 +143,19 @@ void RaftRecovery::AddLogEntry(const Entry* entry) { Flush(); } -void RaftRecovery::AddLogEntry(std::vector &entries_to_add) { +void RaftRecovery::AddLogEntry(std::vector& entries_to_add) { if (recovery_enabled_ == false) { return; } - for (const auto &entry : entries_to_add) { + for (const auto& entry : entries_to_add) { WriteLog(&entry); } Flush(); } void RaftRecovery::WriteLog(const Entry* entry) { -LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " + << __func__ << "\n"; std::string data; if (entry) { entry->SerializeToString(&data); @@ -167,7 +170,7 @@ LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __ } std::vector> RaftRecovery::ParseDataListItem( - std::vector &data_list) { + std::vector& data_list) { std::vector> request_list; for (size_t i = 0; i < data_list.size(); i++) { @@ -184,8 +187,8 @@ std::vector> RaftRecovery::ParseDataListItem( } void RaftRecovery::PerformCallback( - std::vector> &request_list, - CallbackType call_back, int64_t ckpt) { + std::vector>& request_list, CallbackType call_back, + int64_t ckpt) { uint64_t max_seq = 0; for (std::unique_ptr& entry : request_list) { // LOG(ERROR)<<" ckpt :"< data_list, std::function system_callback) { +bool RaftRecovery::PerformSystemCallback( + 
std::vector data_list, + std::function system_callback) { RaftMetadata info = ReadMetadata(); system_callback(info); return true; @@ -210,17 +215,17 @@ bool RaftRecovery::PerformSystemCallback(std::vector data_list, std template class RecoveryBase; -template void RecoveryBase::ReadLogs( - std::function, - raft::CallbackType, +template void RecoveryBase::ReadLogs( + std::function, raft::CallbackType, std::function); -template void RecoveryBase::SwitchFile( - const std::string&, - raft::CallbackType); +template void RecoveryBase::SwitchFile< + raft::RaftMetadata, raft::CallbackType>(const std::string&, + raft::CallbackType); -template void RecoveryBase::ReadLogsFromFiles( +template void RecoveryBase::ReadLogsFromFiles< + raft::RaftMetadata, raft::CallbackType>( const std::string&, int64_t, int, - std::function, - raft::CallbackType); + std::function, raft::CallbackType); } // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.h b/platform/consensus/ordering/raft/framework/raft_recovery.h index 65705b399c..e9ba865634 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery.h +++ b/platform/consensus/ordering/raft/framework/raft_recovery.h @@ -42,29 +42,33 @@ struct RaftMetadata { class RaftRecovery : public RecoveryBase { friend class RecoveryBase; + public: - RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage); + RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, + Storage* storage); ~RaftRecovery(); RaftMetadata ReadMetadata(); void Init(); void WriteMetadata(int64_t current_term, int32_t voted_for); void AddLogEntry(const Entry* entry); - void AddLogEntry(std::vector &entries_to_add); + void AddLogEntry(std::vector& entries_to_add); private: void OpenMetadataFile(); void WriteSystemInfo(); std::vector> ParseDataListItem( - std::vector &data_list); + std::vector& data_list); void WriteLog(const Entry* entry); void PerformCallback( - std::vector> &request_list, - std::function 
entry)> - call_back, int64_t ckpt); - - bool PerformSystemCallback(std::vector data_list, std::function system_callback); + std::vector>& request_list, + std::function entry)> call_back, + int64_t ckpt); + + bool PerformSystemCallback( + std::vector data_list, + std::function system_callback); int metadata_fd_; std::string meta_file_path_; diff --git a/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp b/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp index beee10eaae..896a96bc92 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp +++ b/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp @@ -1,15 +1,15 @@ -#include +#include "platform/consensus/ordering/raft/framework/raft_recovery.h" + #include #include #include +#include -#include "platform/consensus/ordering/raft/framework/raft_recovery.h" -#include "platform/consensus/ordering/raft/proto/proposal.pb.h" #include "chain/storage/mock_storage.h" #include "platform/consensus/checkpoint/mock_checkpoint.h" #include "platform/consensus/ordering/raft/framework/transaction_utils.h" - +#include "platform/consensus/ordering/raft/proto/proposal.pb.h" namespace resdb { namespace raft { @@ -62,18 +62,18 @@ TEST_F(RaftRecoveryTest, ReadLog) { RaftRecovery recovery(config_, &checkpoint_, nullptr); for (int i = 0; i < entries_to_add; i++) { - // Set up the Log Entry to be added - Entry logEntry; - logEntry.set_term(i + 1); - auto req = std::make_unique(); - req->set_seq(i + 1); - std::string serialized; - if (!req->SerializeToString(&serialized)) { - assert(false); - } - logEntry.set_command(std::move(serialized)); - - recovery.AddLogEntry(&logEntry); + // Set up the Log Entry to be added + Entry logEntry; + logEntry.set_term(i + 1); + auto req = std::make_unique(); + req->set_seq(i + 1); + std::string serialized; + if (!req->SerializeToString(&serialized)) { + assert(false); + } + logEntry.set_command(std::move(serialized)); + + 
recovery.AddLogEntry(&logEntry); } } { @@ -81,8 +81,7 @@ TEST_F(RaftRecoveryTest, ReadLog) { RaftRecovery recovery(config_, &checkpoint_, nullptr); recovery.ReadLogs( [&](const RaftMetadata &data) {}, - [&](std::unique_ptr entry) { list.push_back(*entry); }, - nullptr); + [&](std::unique_ptr entry) { list.push_back(*entry); }, nullptr); EXPECT_EQ(list.size(), entries_to_add); diff --git a/platform/consensus/recovery/pbft_recovery.cpp b/platform/consensus/recovery/pbft_recovery.cpp index 18524c728c..8bd762b5b1 100644 --- a/platform/consensus/recovery/pbft_recovery.cpp +++ b/platform/consensus/recovery/pbft_recovery.cpp @@ -34,10 +34,11 @@ namespace resdb { -using CallbackType = std::function, std::unique_ptr)>; - +using CallbackType = + std::function, std::unique_ptr)>; + PBFTRecovery::PBFTRecovery(const ResDBConfig& config, CheckPoint* checkpoint, - SystemInfo* system_info, Storage* storage) + SystemInfo* system_info, Storage* storage) : RecoveryBase(config, checkpoint, storage), system_info_(system_info) { Init(); @@ -52,19 +53,18 @@ void PBFTRecovery::Init() { LOG(ERROR) << " init"; GetLastFile(); - CallbackType callback = - [this](std::unique_ptr context, std::unique_ptr request) { - min_seq_ == -1 - ? min_seq_ = request->seq() - : std::min(min_seq_, static_cast(request->seq())); - max_seq_ = std::max(max_seq_, static_cast(request->seq())); - }; + CallbackType callback = [this](std::unique_ptr context, + std::unique_ptr request) { + min_seq_ == -1 ? 
min_seq_ = request->seq() + : std::min(min_seq_, static_cast(request->seq())); + max_seq_ = std::max(max_seq_, static_cast(request->seq())); + }; SwitchFile(file_path_, callback); LOG(ERROR) << " init done"; - ckpt_thread_ = std::thread([this]{ this->UpdateStableCheckPoint(); }); + ckpt_thread_ = std::thread([this] { this->UpdateStableCheckPoint(); }); } void PBFTRecovery::WriteSystemInfo() { @@ -119,8 +119,8 @@ void PBFTRecovery::WriteLog(const Context* context, const Request* request) { Flush(); } -std::vector> PBFTRecovery::ParseDataListItem( - std::vector &data_list) { +std::vector> +PBFTRecovery::ParseDataListItem(std::vector& data_list) { std::vector> request_list; for (size_t i = 0; i < data_list.size(); i += 2) { @@ -145,7 +145,7 @@ std::vector> PBFTRecovery::ParseData } void PBFTRecovery::PerformCallback( - std::vector> &request_list, + std::vector>& request_list, CallbackType call_back, int64_t ckpt) { uint64_t max_seq = 0; for (std::unique_ptr& recovery_data : request_list) { @@ -164,8 +164,9 @@ void PBFTRecovery::PerformCallback( LOG(ERROR) << " recovery max seq:" << max_seq; } - -bool PBFTRecovery::PerformSystemCallback(std::vector data_list, std::function system_callback) { +bool PBFTRecovery::PerformSystemCallback( + std::vector data_list, + std::function system_callback) { SystemInfoData info; if (data_list.empty() || !info.ParseFromString(data_list[0])) { return false; @@ -179,22 +180,21 @@ std::map< uint64_t, std::vector, std::unique_ptr>>> PBFTRecovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, - uint64_t need_max_seq) { + uint64_t need_max_seq) { auto list = GetSortedRecoveryFiles(need_min_seq, need_max_seq); std::map, std::unique_ptr>>> res; for (const auto& path : list) { - CallbackType callback = - [&](std::unique_ptr context, std::unique_ptr request) { - if (request->seq() >= need_min_seq && - request->seq() <= need_max_seq) { - LOG(ERROR) << "get data from recovery file seq:" << request->seq(); - res[request->seq()].push_back( - 
std::make_pair(std::move(context), std::move(request))); - } - }; + CallbackType callback = [&](std::unique_ptr context, + std::unique_ptr request) { + if (request->seq() >= need_min_seq && request->seq() <= need_max_seq) { + LOG(ERROR) << "get data from recovery file seq:" << request->seq(); + res[request->seq()].push_back( + std::make_pair(std::move(context), std::move(request))); + } + }; ReadLogsFromFiles( path.second, need_min_seq - 1, 0, @@ -206,7 +206,7 @@ PBFTRecovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, } int PBFTRecovery::GetData(const RecoveryRequest& request, - RecoveryResponse& response) { + RecoveryResponse& response) { auto res = GetDataFromRecoveryFiles(request.min_seq(), request.max_seq()); for (const auto& it : res) { @@ -218,21 +218,18 @@ int PBFTRecovery::GetData(const RecoveryRequest& request, return 0; } - template class RecoveryBase; -template void RecoveryBase::ReadLogs( - std::function, - CallbackType, - std::function); +template void RecoveryBase::ReadLogs< + SystemInfoData, CallbackType>(std::function, + CallbackType, std::function); -template void RecoveryBase::SwitchFile( - const std::string&, - CallbackType); +template void RecoveryBase::SwitchFile< + SystemInfoData, CallbackType>(const std::string&, CallbackType); -template void RecoveryBase::ReadLogsFromFiles( +template void +RecoveryBase::ReadLogsFromFiles( const std::string&, int64_t, int, - std::function, - CallbackType); + std::function, CallbackType); } // namespace resdb diff --git a/platform/consensus/recovery/pbft_recovery.h b/platform/consensus/recovery/pbft_recovery.h index 1b47b172cc..53debb3dac 100644 --- a/platform/consensus/recovery/pbft_recovery.h +++ b/platform/consensus/recovery/pbft_recovery.h @@ -19,16 +19,17 @@ #pragma once -#include "platform/consensus/recovery/recovery.h" #include "platform/consensus/execution/system_info.h" +#include "platform/consensus/recovery/recovery.h" namespace resdb { class PBFTRecovery : public RecoveryBase { friend class 
RecoveryBase; + public: PBFTRecovery(const ResDBConfig& config, CheckPoint* checkpoint, - SystemInfo* system_info, Storage* storage); + SystemInfo* system_info, Storage* storage); ~PBFTRecovery() = default; void AddRequest(const Context* context, const Request* request); @@ -50,15 +51,17 @@ class PBFTRecovery : public RecoveryBase { void WriteSystemInfo(); std::vector> ParseDataListItem( - std::vector &data_list); + std::vector& data_list); - void PerformCallback( - std::vector> &request_list, - std::function context, - std::unique_ptr request)> - call_back, int64_t ckpt); + void PerformCallback(std::vector>& request_list, + std::function context, + std::unique_ptr request)> + call_back, + int64_t ckpt); - bool PerformSystemCallback(std::vector data_list, std::function system_callback); + bool PerformSystemCallback( + std::vector data_list, + std::function system_callback); SystemInfo* system_info_; }; diff --git a/platform/consensus/recovery/recovery.h b/platform/consensus/recovery/recovery.h index d98790d308..110a5c65a5 100644 --- a/platform/consensus/recovery/recovery.h +++ b/platform/consensus/recovery/recovery.h @@ -19,16 +19,8 @@ #pragma once -#include - -#include "chain/storage/storage.h" -#include "platform/config/resdb_config.h" -#include "platform/consensus/checkpoint/checkpoint.h" -#include "platform/networkstrate/server_comm.h" -#include "platform/proto/resdb.pb.h" -#include "platform/proto/system_info_data.pb.h" -#include #include +#include #include #include #include @@ -37,21 +29,29 @@ #include #include #include +#include +#include "chain/storage/storage.h" #include "common/utils/utils.h" +#include "platform/config/resdb_config.h" +#include "platform/consensus/checkpoint/checkpoint.h" +#include "platform/networkstrate/server_comm.h" +#include "platform/proto/resdb.pb.h" +#include "platform/proto/system_info_data.pb.h" namespace resdb { -template +template class RecoveryBase { public: - RecoveryBase(const ResDBConfig& config, CheckPoint* checkpoint, 
Storage* storage); + RecoveryBase(const ResDBConfig& config, CheckPoint* checkpoint, + Storage* storage); ~RecoveryBase(); // void Init(); - template - void ReadLogs(std::function system_callback, - TCallback call_back, - std::function start_point); + template + void ReadLogs( + std::function system_callback, + TCallback call_back, std::function start_point); int64_t GetMaxSeq(); int64_t GetMinSeq(); @@ -59,7 +59,8 @@ class RecoveryBase { // int GetData(const RecoveryRequest& request, RecoveryResponse& response); protected: - std::vector> GetSortedRecoveryFiles(uint64_t need_min_seq, uint64_t need_max_seq); + std::vector> GetSortedRecoveryFiles( + uint64_t need_min_seq, uint64_t need_max_seq); private: @@ -87,27 +88,26 @@ class RecoveryBase { bool Read(int fd, size_t len, char* data); std::pair>, int64_t> GetRecoveryFiles(int64_t ckpt); - template + template void SwitchFile(const std::string& path, TCallback call_back); void OpenFile(const std::string& path); - template + template void ReadLogsFromFiles( - const std::string& path, int64_t ckpt, int file_idx, - std::function system_callback, - TCallback call_back); + const std::string& path, int64_t ckpt, int file_idx, + std::function system_callback, + TCallback call_back); std::string file_path_; ResDBConfig config_; // Derived class must implement these auto ParseDataListItem(std::vector& data_list); - template + template void PerformCallback(auto& request_list, TCallback call_back); void WriteSystemInfo(); - CheckPoint* checkpoint_; std::thread ckpt_thread_; bool recovery_enabled_ = false; diff --git a/platform/consensus/recovery/recovery_impl.h b/platform/consensus/recovery/recovery_impl.h index 2f374a2a92..7a92d1d713 100644 --- a/platform/consensus/recovery/recovery_impl.h +++ b/platform/consensus/recovery/recovery_impl.h @@ -31,11 +31,10 @@ // namespace resdb { -template -RecoveryBase::RecoveryBase(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage) - : config_(config), - 
checkpoint_(checkpoint), - storage_(storage) { +template +RecoveryBase::RecoveryBase(const ResDBConfig& config, + CheckPoint* checkpoint, Storage* storage) + : config_(config), checkpoint_(checkpoint), storage_(storage) { recovery_enabled_ = config_.GetConfigData().recovery_enabled(); file_path_ = config_.GetConfigData().recovery_path(); if (file_path_.empty()) { @@ -78,7 +77,7 @@ RecoveryBase::RecoveryBase(const ResDBConfig& config, CheckPoint* chec stop_ = false; } -template +template RecoveryBase::~RecoveryBase() { if (recovery_enabled_ == false) { return; @@ -91,13 +90,17 @@ RecoveryBase::~RecoveryBase() { } } -template -int64_t RecoveryBase::GetMaxSeq() { return max_seq_; } +template +int64_t RecoveryBase::GetMaxSeq() { + return max_seq_; +} -template -int64_t RecoveryBase::GetMinSeq() { return min_seq_; } +template +int64_t RecoveryBase::GetMinSeq() { + return min_seq_; +} -template +template void RecoveryBase::UpdateStableCheckPoint() { if (checkpoint_ == nullptr) { return; @@ -114,7 +117,7 @@ void RecoveryBase::UpdateStableCheckPoint() { } } -template +template void RecoveryBase::GetLastFile() { std::string dir = std::filesystem::path(file_path_).parent_path(); last_ckpt_ = -1; @@ -153,9 +156,9 @@ void RecoveryBase::GetLastFile() { } } -template +template std::string RecoveryBase::GenerateFile(int64_t seq, int64_t min_seq, - int64_t max_seq) { + int64_t max_seq) { std::string dir = std::filesystem::path(file_path_).parent_path(); std::string file_name = std::filesystem::path(base_file_path_).stem(); int64_t time = GetCurrentTime(); @@ -167,7 +170,7 @@ std::string RecoveryBase::GenerateFile(int64_t seq, int64_t min_seq, return dir + "/" + file_name + "." 
+ ext; } -template +template void RecoveryBase::FinishFile(int64_t seq) { std::unique_lock lk(mutex_); Flush(); @@ -191,14 +194,14 @@ void RecoveryBase::FinishFile(int64_t seq) { OpenFile(file_path_); } -template +template void RecoveryBase::AppendData(const std::string& data) { size_t len = data.size(); buffer_.append(reinterpret_cast(&len), sizeof(len)); buffer_.append(data); } -template +template auto RecoveryBase::ParseData(const std::string& data) { std::vector data_list; int pos = 0; @@ -215,8 +218,9 @@ auto RecoveryBase::ParseData(const std::string& data) { return static_cast(this)->ParseDataListItem(data_list); } -template -std::vector RecoveryBase::ParseRawData(const std::string& data) { +template +std::vector RecoveryBase::ParseRawData( + const std::string& data) { std::vector data_list; int pos = 0; while (pos < data.size()) { @@ -231,14 +235,14 @@ std::vector RecoveryBase::ParseRawData(const std::string& return data_list; } -template +template void RecoveryBase::MayFlush() { if (buffer_.size() > buffer_size_) { Flush(); } } -template +template void RecoveryBase::Flush() { size_t len = buffer_.size(); if (len == 0) { @@ -251,7 +255,7 @@ void RecoveryBase::Flush() { fsync(fd_); } -template +template void RecoveryBase::Write(const char* data, size_t len) { int pos = 0; while (len > 0) { @@ -261,7 +265,7 @@ void RecoveryBase::Write(const char* data, size_t len) { } } -template +template bool RecoveryBase::Read(int fd, size_t len, char* data) { int pos = 0; while (len > 0) { @@ -275,7 +279,7 @@ bool RecoveryBase::Read(int fd, size_t len, char* data) { return true; } -template +template std::pair>, int64_t> RecoveryBase::GetRecoveryFiles(int64_t ckpt) { std::string dir = std::filesystem::path(file_path_).parent_path(); @@ -329,11 +333,10 @@ RecoveryBase::GetRecoveryFiles(int64_t ckpt) { return std::make_pair(list, last_ckpt); } - -template +template std::vector> RecoveryBase::GetSortedRecoveryFiles(uint64_t need_min_seq, - uint64_t need_max_seq) { + uint64_t 
need_max_seq) { std::string dir = std::filesystem::path(file_path_).parent_path(); std::vector> list; diff --git a/platform/consensus/recovery/recovery_template_functions.h b/platform/consensus/recovery/recovery_template_functions.h index 94a8f544c4..c47561a3e4 100644 --- a/platform/consensus/recovery/recovery_template_functions.h +++ b/platform/consensus/recovery/recovery_template_functions.h @@ -19,17 +19,17 @@ #pragma once -template -template -void RecoveryBase::ReadLogs(std::function system_callback, - TCallback call_back, - std::function set_start_point) { +template +template +void RecoveryBase::ReadLogs( + std::function system_callback, + TCallback call_back, std::function set_start_point) { if (recovery_enabled_ == false) { return; } int64_t storage_ckpt = 0; - if(storage_) { + if (storage_) { storage_ckpt = storage_->GetLastCheckpoint(); } std::unique_lock lk(mutex_); @@ -41,27 +41,27 @@ void RecoveryBase::ReadLogs(std::function(path.second, ckpt, idx++, system_callback, call_back); + ReadLogsFromFiles(path.second, ckpt, idx++, + system_callback, call_back); } } -template -template -void RecoveryBase::SwitchFile(const std::string& file_path, TCallback call_back) { +template +template +void RecoveryBase::SwitchFile(const std::string& file_path, + TCallback call_back) { std::unique_lock lk(mutex_); min_seq_ = -1; max_seq_ = -1; ReadLogsFromFiles( - file_path, 0, 0, - [&](const TSystemInfoData& data) {}, - call_back); + file_path, 0, 0, [&](const TSystemInfoData& data) {}, call_back); OpenFile(file_path); LOG(INFO) << "switch to file:" << file_path << " seq:" << "[" << min_seq_ << "," << max_seq_ << "]"; } -template +template void RecoveryBase::OpenFile(const std::string& path) { if (fd_ >= 0) { close(fd_); @@ -84,8 +84,8 @@ void RecoveryBase::OpenFile(const std::string& path) { assert(fd_ >= 0); } -template -template +template +template void RecoveryBase::ReadLogsFromFiles( const std::string& path, int64_t ckpt, int file_idx, std::function system_callback, @@ 
-111,7 +111,9 @@ void RecoveryBase::ReadLogsFromFiles( delete buf; std::vector data_list = ParseRawData(data); - bool successful_callback = static_cast(this)->PerformSystemCallback(data_list, system_callback); + bool successful_callback = + static_cast(this)->PerformSystemCallback(data_list, + system_callback); if (!successful_callback) { LOG(ERROR) << "parse info fail:" << data.size(); diff --git a/platform/consensus/recovery/recovery_test.cpp b/platform/consensus/recovery/recovery_test.cpp index 2181996249..91f9d97e2b 100644 --- a/platform/consensus/recovery/recovery_test.cpp +++ b/platform/consensus/recovery/recovery_test.cpp @@ -17,8 +17,6 @@ * under the License. */ -#include "platform/consensus/recovery/pbft_recovery.h" - #include #include #include @@ -30,6 +28,7 @@ #include "common/test/test_macros.h" #include "platform/consensus/checkpoint/mock_checkpoint.h" #include "platform/consensus/ordering/common/transaction_utils.h" +#include "platform/consensus/recovery/pbft_recovery.h" namespace resdb { namespace { @@ -82,7 +81,9 @@ TEST_F(RecoveryTest, ReadLog) { Request::TYPE_NEWVIEW, Request::TYPE_NEW_TXNS}; std::vector expected_types = { - Request::TYPE_PRE_PREPARE, Request::TYPE_PREPARE, Request::TYPE_COMMIT, + Request::TYPE_PRE_PREPARE, + Request::TYPE_PREPARE, + Request::TYPE_COMMIT, Request::TYPE_NEWVIEW, }; @@ -100,17 +101,15 @@ TEST_F(RecoveryTest, ReadLog) { std::vector list; PBFTRecovery recovery(config_, &checkpoint_, &system_info_, nullptr); - std::function, std::unique_ptr)> call_back = - [&](std::unique_ptr context, std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR) << "call back:" << request->seq(); - }; + std::function, std::unique_ptr)> + call_back = [&](std::unique_ptr context, + std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; recovery.ReadLogs( - [&](const SystemInfoData &data) {}, - call_back, - nullptr - ); + [&](const SystemInfoData &data) {}, call_back, 
nullptr); EXPECT_EQ(list.size(), expected_types.size()); @@ -129,7 +128,9 @@ TEST_F(RecoveryTest, ReadLog_FlushOnce) { Request::TYPE_NEWVIEW, Request::TYPE_NEW_TXNS}; std::vector expected_types = { - Request::TYPE_PRE_PREPARE, Request::TYPE_PREPARE, Request::TYPE_COMMIT, + Request::TYPE_PRE_PREPARE, + Request::TYPE_PREPARE, + Request::TYPE_COMMIT, Request::TYPE_NEWVIEW, }; @@ -147,16 +148,13 @@ TEST_F(RecoveryTest, ReadLog_FlushOnce) { std::vector list; PBFTRecovery recovery(config, &checkpoint_, &system_info_, nullptr); - std::function, std::unique_ptr)> call_back = - [&](std::unique_ptr context, std::unique_ptr request) { - list.push_back(*request); - }; + std::function, std::unique_ptr)> + call_back = + [&](std::unique_ptr context, + std::unique_ptr request) { list.push_back(*request); }; recovery.ReadLogs( - [&](const SystemInfoData &data) {}, - call_back, - nullptr - ); + [&](const SystemInfoData &data) {}, call_back, nullptr); EXPECT_EQ(list.size(), expected_types.size()); @@ -218,17 +216,15 @@ TEST_F(RecoveryTest, CheckPoint) { std::vector list; PBFTRecovery recovery(config, &checkpoint_, &system_info_, nullptr); - std::function, std::unique_ptr)> call_back = - [&](std::unique_ptr context, std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR) << "call back:" << request->seq(); - }; + std::function, std::unique_ptr)> + call_back = [&](std::unique_ptr context, + std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; recovery.ReadLogs( - [&](const SystemInfoData &data) {}, - call_back, - nullptr - ); + [&](const SystemInfoData &data) {}, call_back, nullptr); EXPECT_EQ(list.size(), types.size() * 14); @@ -301,17 +297,15 @@ TEST_F(RecoveryTest, CheckPoint2) { std::vector list; PBFTRecovery recovery(config, &checkpoint_, &system_info_, &storage); - std::function, std::unique_ptr)> call_back = - [&](std::unique_ptr context, std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR) 
<< "call back:" << request->seq(); - }; + std::function, std::unique_ptr)> + call_back = [&](std::unique_ptr context, + std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; recovery.ReadLogs( - [&](const SystemInfoData &data) {}, - call_back, - nullptr - ); + [&](const SystemInfoData &data) {}, call_back, nullptr); EXPECT_EQ(list.size(), types.size() * 14); @@ -344,17 +338,15 @@ TEST_F(RecoveryTest, CheckPoint2) { std::vector list; PBFTRecovery recovery(config, &checkpoint_, &system_info_, &storage); - std::function, std::unique_ptr)> call_back = - [&](std::unique_ptr context, std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR) << "call back:" << request->seq(); - }; + std::function, std::unique_ptr)> + call_back = [&](std::unique_ptr context, + std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; recovery.ReadLogs( - [&](const SystemInfoData &data) {}, - call_back, - nullptr - ); + [&](const SystemInfoData &data) {}, call_back, nullptr); EXPECT_EQ(list.size(), types.size() * 9); @@ -432,17 +424,16 @@ TEST_F(RecoveryTest, SystemInfo) { SystemInfoData data; PBFTRecovery recovery(config, &checkpoint_, &system_info_, &storage); - std::function, std::unique_ptr)> call_back = - [&](std::unique_ptr context, std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR) << "call back:" << request->seq(); - }; + std::function, std::unique_ptr)> + call_back = [&](std::unique_ptr context, + std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; recovery.ReadLogs( - [&](const SystemInfoData &r_data) {data = r_data;}, - call_back, - nullptr - ); + [&](const SystemInfoData &r_data) { data = r_data; }, call_back, + nullptr); EXPECT_EQ(list.size(), types.size() * 14); @@ -476,17 +467,16 @@ TEST_F(RecoveryTest, SystemInfo) { SystemInfoData data; PBFTRecovery recovery(config, 
&checkpoint_, &system_info_, &storage); - std::function, std::unique_ptr)> call_back = - [&](std::unique_ptr context, std::unique_ptr request) { - list.push_back(*request); - // LOG(ERROR) << "call back:" << request->seq(); - }; + std::function, std::unique_ptr)> + call_back = [&](std::unique_ptr context, + std::unique_ptr request) { + list.push_back(*request); + // LOG(ERROR) << "call back:" << request->seq(); + }; recovery.ReadLogs( - [&](const SystemInfoData &r_data) {data = r_data;}, - call_back, - nullptr - ); + [&](const SystemInfoData &r_data) { data = r_data; }, call_back, + nullptr); EXPECT_EQ(data.view(), 2); EXPECT_EQ(data.primary_id(), 2); From aa5e923a215ab09f8724087b03cba895257b330d Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 1 Apr 2026 16:54:00 -0700 Subject: [PATCH 57/66] Change the templating of Recovery --- .../ordering/raft/framework/raft_recovery.cpp | 23 +---- .../ordering/raft/framework/raft_recovery.h | 7 +- .../raft/framework/raft_recovery_test.cpp | 2 +- platform/consensus/recovery/pbft_recovery.cpp | 26 ++--- platform/consensus/recovery/pbft_recovery.h | 12 +-- platform/consensus/recovery/recovery.h | 10 +- platform/consensus/recovery/recovery_impl.h | 99 +++++++++---------- .../recovery/recovery_template_functions.h | 27 +++-- platform/consensus/recovery/recovery_test.cpp | 25 ++--- 9 files changed, 90 insertions(+), 141 deletions(-) diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.cpp b/platform/consensus/ordering/raft/framework/raft_recovery.cpp index 112bbec083..8350f16f08 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery.cpp +++ b/platform/consensus/ordering/raft/framework/raft_recovery.cpp @@ -39,7 +39,8 @@ using CallbackType = std::function)>; RaftRecovery::RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage) - : RecoveryBase(config, checkpoint, storage) { + : RecoveryBase(config, checkpoint, + storage) { Init(); } @@ -58,7 +59,7 @@ void 
RaftRecovery::Init() { max_seq_ = std::max(max_seq_, static_cast(entry->term())); }; - SwitchFile(file_path_, callback); + SwitchFile(file_path_, callback); LOG(ERROR) << " init done"; meta_file_path_ = std::filesystem::path(base_file_path_).parent_path() @@ -191,9 +192,6 @@ void RaftRecovery::PerformCallback( int64_t ckpt) { uint64_t max_seq = 0; for (std::unique_ptr& entry : request_list) { - // LOG(ERROR)<<" ckpt :"<request->seq()<<" - // type:"<request->type(); if (ckpt < entry->term()) { max_seq = entry->term(); call_back(std::move(entry)); @@ -213,19 +211,4 @@ bool RaftRecovery::PerformSystemCallback( } // namespace raft -template class RecoveryBase; - -template void RecoveryBase::ReadLogs( - std::function, raft::CallbackType, - std::function); - -template void RecoveryBase::SwitchFile< - raft::RaftMetadata, raft::CallbackType>(const std::string&, - raft::CallbackType); - -template void RecoveryBase::ReadLogsFromFiles< - raft::RaftMetadata, raft::CallbackType>( - const std::string&, int64_t, int, - std::function, raft::CallbackType); } // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.h b/platform/consensus/ordering/raft/framework/raft_recovery.h index e9ba865634..c0ef85e606 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery.h +++ b/platform/consensus/ordering/raft/framework/raft_recovery.h @@ -40,8 +40,11 @@ struct RaftMetadata { int32_t voted_for = -1; }; -class RaftRecovery : public RecoveryBase { - friend class RecoveryBase; +using CallbackType = std::function)>; + +class RaftRecovery + : public RecoveryBase { + friend class RecoveryBase; public: RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, diff --git a/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp b/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp index 896a96bc92..477b692596 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp +++ 
b/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp @@ -79,7 +79,7 @@ TEST_F(RaftRecoveryTest, ReadLog) { { std::vector list; RaftRecovery recovery(config_, &checkpoint_, nullptr); - recovery.ReadLogs( + recovery.ReadLogs( [&](const RaftMetadata &data) {}, [&](std::unique_ptr entry) { list.push_back(*entry); }, nullptr); diff --git a/platform/consensus/recovery/pbft_recovery.cpp b/platform/consensus/recovery/pbft_recovery.cpp index 8bd762b5b1..d54cff4e12 100644 --- a/platform/consensus/recovery/pbft_recovery.cpp +++ b/platform/consensus/recovery/pbft_recovery.cpp @@ -39,7 +39,8 @@ using CallbackType = PBFTRecovery::PBFTRecovery(const ResDBConfig& config, CheckPoint* checkpoint, SystemInfo* system_info, Storage* storage) - : RecoveryBase(config, checkpoint, storage), + : RecoveryBase( + config, checkpoint, storage), system_info_(system_info) { Init(); } @@ -60,7 +61,7 @@ void PBFTRecovery::Init() { max_seq_ = std::max(max_seq_, static_cast(request->seq())); }; - SwitchFile(file_path_, callback); + SwitchFile(file_path_, callback); LOG(ERROR) << " init done"; @@ -196,10 +197,9 @@ PBFTRecovery::GetDataFromRecoveryFiles(uint64_t need_min_seq, } }; - ReadLogsFromFiles( - path.second, need_min_seq - 1, 0, - [&](const SystemInfoData& data) {}, // system callback - callback); // typed callback + ReadLogsFromFiles( + path.second, need_min_seq - 1, 0, [&](const SystemInfoData& data) {}, + callback); } return res; @@ -218,18 +218,4 @@ int PBFTRecovery::GetData(const RecoveryRequest& request, return 0; } -template class RecoveryBase; - -template void RecoveryBase::ReadLogs< - SystemInfoData, CallbackType>(std::function, - CallbackType, std::function); - -template void RecoveryBase::SwitchFile< - SystemInfoData, CallbackType>(const std::string&, CallbackType); - -template void -RecoveryBase::ReadLogsFromFiles( - const std::string&, int64_t, int, - std::function, CallbackType); - } // namespace resdb diff --git a/platform/consensus/recovery/pbft_recovery.h 
b/platform/consensus/recovery/pbft_recovery.h index 53debb3dac..1de2714e23 100644 --- a/platform/consensus/recovery/pbft_recovery.h +++ b/platform/consensus/recovery/pbft_recovery.h @@ -23,9 +23,12 @@ #include "platform/consensus/recovery/recovery.h" namespace resdb { +using CallbackType = + std::function, std::unique_ptr)>; -class PBFTRecovery : public RecoveryBase { - friend class RecoveryBase; +class PBFTRecovery + : public RecoveryBase { + friend class RecoveryBase; public: PBFTRecovery(const ResDBConfig& config, CheckPoint* checkpoint, @@ -54,10 +57,7 @@ class PBFTRecovery : public RecoveryBase { std::vector& data_list); void PerformCallback(std::vector>& request_list, - std::function context, - std::unique_ptr request)> - call_back, - int64_t ckpt); + CallbackType call_back, int64_t ckpt); bool PerformSystemCallback( std::vector data_list, diff --git a/platform/consensus/recovery/recovery.h b/platform/consensus/recovery/recovery.h index 110a5c65a5..ec55337b62 100644 --- a/platform/consensus/recovery/recovery.h +++ b/platform/consensus/recovery/recovery.h @@ -41,14 +41,13 @@ namespace resdb { -template +template class RecoveryBase { public: RecoveryBase(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage); ~RecoveryBase(); - // void Init(); - template + void ReadLogs( std::function system_callback, TCallback call_back, std::function start_point); @@ -56,8 +55,6 @@ class RecoveryBase { int64_t GetMaxSeq(); int64_t GetMinSeq(); - // int GetData(const RecoveryRequest& request, RecoveryResponse& response); - protected: std::vector> GetSortedRecoveryFiles( uint64_t need_min_seq, uint64_t need_max_seq); @@ -88,11 +85,9 @@ class RecoveryBase { bool Read(int fd, size_t len, char* data); std::pair>, int64_t> GetRecoveryFiles(int64_t ckpt); - template void SwitchFile(const std::string& path, TCallback call_back); void OpenFile(const std::string& path); - template void ReadLogsFromFiles( const std::string& path, int64_t ckpt, int file_idx, std::function 
system_callback, @@ -103,7 +98,6 @@ class RecoveryBase { // Derived class must implement these auto ParseDataListItem(std::vector& data_list); - template void PerformCallback(auto& request_list, TCallback call_back); void WriteSystemInfo(); diff --git a/platform/consensus/recovery/recovery_impl.h b/platform/consensus/recovery/recovery_impl.h index 7a92d1d713..a6e4875572 100644 --- a/platform/consensus/recovery/recovery_impl.h +++ b/platform/consensus/recovery/recovery_impl.h @@ -17,23 +17,9 @@ * under the License. */ -// #include "platform/consensus/recovery/recovery.h" - -// #include -// #include -// #include - -// #include -// #include -// #include - -// #include "common/utils/utils.h" - -// namespace resdb { - -template -RecoveryBase::RecoveryBase(const ResDBConfig& config, - CheckPoint* checkpoint, Storage* storage) +template +RecoveryBase::RecoveryBase( + const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage) : config_(config), checkpoint_(checkpoint), storage_(storage) { recovery_enabled_ = config_.GetConfigData().recovery_enabled(); file_path_ = config_.GetConfigData().recovery_path(); @@ -77,8 +63,8 @@ RecoveryBase::RecoveryBase(const ResDBConfig& config, stop_ = false; } -template -RecoveryBase::~RecoveryBase() { +template +RecoveryBase::~RecoveryBase() { if (recovery_enabled_ == false) { return; } @@ -90,18 +76,19 @@ RecoveryBase::~RecoveryBase() { } } -template -int64_t RecoveryBase::GetMaxSeq() { +template +int64_t RecoveryBase::GetMaxSeq() { return max_seq_; } -template -int64_t RecoveryBase::GetMinSeq() { +template +int64_t RecoveryBase::GetMinSeq() { return min_seq_; } -template -void RecoveryBase::UpdateStableCheckPoint() { +template +void RecoveryBase::UpdateStableCheckPoint() { if (checkpoint_ == nullptr) { return; } @@ -117,8 +104,8 @@ void RecoveryBase::UpdateStableCheckPoint() { } } -template -void RecoveryBase::GetLastFile() { +template +void RecoveryBase::GetLastFile() { std::string dir = 
std::filesystem::path(file_path_).parent_path(); last_ckpt_ = -1; uint64_t m_time_s = 0; @@ -156,9 +143,9 @@ void RecoveryBase::GetLastFile() { } } -template -std::string RecoveryBase::GenerateFile(int64_t seq, int64_t min_seq, - int64_t max_seq) { +template +std::string RecoveryBase::GenerateFile( + int64_t seq, int64_t min_seq, int64_t max_seq) { std::string dir = std::filesystem::path(file_path_).parent_path(); std::string file_name = std::filesystem::path(base_file_path_).stem(); int64_t time = GetCurrentTime(); @@ -170,8 +157,9 @@ std::string RecoveryBase::GenerateFile(int64_t seq, int64_t min_seq, return dir + "/" + file_name + "." + ext; } -template -void RecoveryBase::FinishFile(int64_t seq) { +template +void RecoveryBase::FinishFile( + int64_t seq) { std::unique_lock lk(mutex_); Flush(); if (storage_) { @@ -194,15 +182,17 @@ void RecoveryBase::FinishFile(int64_t seq) { OpenFile(file_path_); } -template -void RecoveryBase::AppendData(const std::string& data) { +template +void RecoveryBase::AppendData( + const std::string& data) { size_t len = data.size(); buffer_.append(reinterpret_cast(&len), sizeof(len)); buffer_.append(data); } -template -auto RecoveryBase::ParseData(const std::string& data) { +template +auto RecoveryBase::ParseData( + const std::string& data) { std::vector data_list; int pos = 0; while (pos < data.size()) { @@ -218,8 +208,9 @@ auto RecoveryBase::ParseData(const std::string& data) { return static_cast(this)->ParseDataListItem(data_list); } -template -std::vector RecoveryBase::ParseRawData( +template +std::vector +RecoveryBase::ParseRawData( const std::string& data) { std::vector data_list; int pos = 0; @@ -235,15 +226,15 @@ std::vector RecoveryBase::ParseRawData( return data_list; } -template -void RecoveryBase::MayFlush() { +template +void RecoveryBase::MayFlush() { if (buffer_.size() > buffer_size_) { Flush(); } } -template -void RecoveryBase::Flush() { +template +void RecoveryBase::Flush() { size_t len = buffer_.size(); if (len == 0) 
{ return; @@ -255,8 +246,9 @@ void RecoveryBase::Flush() { fsync(fd_); } -template -void RecoveryBase::Write(const char* data, size_t len) { +template +void RecoveryBase::Write(const char* data, + size_t len) { int pos = 0; while (len > 0) { int write_len = write(fd_, data + pos, len); @@ -265,8 +257,10 @@ void RecoveryBase::Write(const char* data, size_t len) { } } -template -bool RecoveryBase::Read(int fd, size_t len, char* data) { +template +bool RecoveryBase::Read(int fd, + size_t len, + char* data) { int pos = 0; while (len > 0) { int read_len = read(fd, data + pos, len); @@ -279,9 +273,10 @@ bool RecoveryBase::Read(int fd, size_t len, char* data) { return true; } -template +template std::pair>, int64_t> -RecoveryBase::GetRecoveryFiles(int64_t ckpt) { +RecoveryBase::GetRecoveryFiles( + int64_t ckpt) { std::string dir = std::filesystem::path(file_path_).parent_path(); int64_t last_ckpt = 0; for (const auto& entry : std::filesystem::directory_iterator(dir)) { @@ -333,10 +328,10 @@ RecoveryBase::GetRecoveryFiles(int64_t ckpt) { return std::make_pair(list, last_ckpt); } -template +template std::vector> -RecoveryBase::GetSortedRecoveryFiles(uint64_t need_min_seq, - uint64_t need_max_seq) { +RecoveryBase::GetSortedRecoveryFiles( + uint64_t need_min_seq, uint64_t need_max_seq) { std::string dir = std::filesystem::path(file_path_).parent_path(); std::vector> list; @@ -378,5 +373,3 @@ RecoveryBase::GetSortedRecoveryFiles(uint64_t need_min_seq, sort(list.begin(), list.end()); return list; } - -// } // namespace resdb diff --git a/platform/consensus/recovery/recovery_template_functions.h b/platform/consensus/recovery/recovery_template_functions.h index c47561a3e4..306e505f48 100644 --- a/platform/consensus/recovery/recovery_template_functions.h +++ b/platform/consensus/recovery/recovery_template_functions.h @@ -19,9 +19,8 @@ #pragma once -template -template -void RecoveryBase::ReadLogs( +template +void RecoveryBase::ReadLogs( std::function system_callback, TCallback 
call_back, std::function set_start_point) { if (recovery_enabled_ == false) { @@ -41,28 +40,27 @@ void RecoveryBase::ReadLogs( } int idx = 0; for (auto path : recovery_files_pair.first) { - ReadLogsFromFiles(path.second, ckpt, idx++, - system_callback, call_back); + ReadLogsFromFiles(path.second, ckpt, idx++, system_callback, call_back); } } -template -template -void RecoveryBase::SwitchFile(const std::string& file_path, - TCallback call_back) { +template +void RecoveryBase::SwitchFile( + const std::string& file_path, TCallback call_back) { std::unique_lock lk(mutex_); min_seq_ = -1; max_seq_ = -1; - ReadLogsFromFiles( + ReadLogsFromFiles( file_path, 0, 0, [&](const TSystemInfoData& data) {}, call_back); OpenFile(file_path); LOG(INFO) << "switch to file:" << file_path << " seq:" << "[" << min_seq_ << "," << max_seq_ << "]"; } -template -void RecoveryBase::OpenFile(const std::string& path) { +template +void RecoveryBase::OpenFile( + const std::string& path) { if (fd_ >= 0) { close(fd_); } @@ -84,9 +82,8 @@ void RecoveryBase::OpenFile(const std::string& path) { assert(fd_ >= 0); } -template -template -void RecoveryBase::ReadLogsFromFiles( +template +void RecoveryBase::ReadLogsFromFiles( const std::string& path, int64_t ckpt, int file_idx, std::function system_callback, TCallback call_back) { diff --git a/platform/consensus/recovery/recovery_test.cpp b/platform/consensus/recovery/recovery_test.cpp index 91f9d97e2b..a2efa981e5 100644 --- a/platform/consensus/recovery/recovery_test.cpp +++ b/platform/consensus/recovery/recovery_test.cpp @@ -108,8 +108,7 @@ TEST_F(RecoveryTest, ReadLog) { // LOG(ERROR) << "call back:" << request->seq(); }; - recovery.ReadLogs( - [&](const SystemInfoData &data) {}, call_back, nullptr); + recovery.ReadLogs([&](const SystemInfoData &data) {}, call_back, nullptr); EXPECT_EQ(list.size(), expected_types.size()); @@ -153,8 +152,7 @@ TEST_F(RecoveryTest, ReadLog_FlushOnce) { [&](std::unique_ptr context, std::unique_ptr request) { 
list.push_back(*request); }; - recovery.ReadLogs( - [&](const SystemInfoData &data) {}, call_back, nullptr); + recovery.ReadLogs([&](const SystemInfoData &data) {}, call_back, nullptr); EXPECT_EQ(list.size(), expected_types.size()); @@ -223,8 +221,7 @@ TEST_F(RecoveryTest, CheckPoint) { // LOG(ERROR) << "call back:" << request->seq(); }; - recovery.ReadLogs( - [&](const SystemInfoData &data) {}, call_back, nullptr); + recovery.ReadLogs([&](const SystemInfoData &data) {}, call_back, nullptr); EXPECT_EQ(list.size(), types.size() * 14); @@ -304,8 +301,7 @@ TEST_F(RecoveryTest, CheckPoint2) { // LOG(ERROR) << "call back:" << request->seq(); }; - recovery.ReadLogs( - [&](const SystemInfoData &data) {}, call_back, nullptr); + recovery.ReadLogs([&](const SystemInfoData &data) {}, call_back, nullptr); EXPECT_EQ(list.size(), types.size() * 14); @@ -345,8 +341,7 @@ TEST_F(RecoveryTest, CheckPoint2) { // LOG(ERROR) << "call back:" << request->seq(); }; - recovery.ReadLogs( - [&](const SystemInfoData &data) {}, call_back, nullptr); + recovery.ReadLogs([&](const SystemInfoData &data) {}, call_back, nullptr); EXPECT_EQ(list.size(), types.size() * 9); @@ -431,9 +426,8 @@ TEST_F(RecoveryTest, SystemInfo) { // LOG(ERROR) << "call back:" << request->seq(); }; - recovery.ReadLogs( - [&](const SystemInfoData &r_data) { data = r_data; }, call_back, - nullptr); + recovery.ReadLogs([&](const SystemInfoData &r_data) { data = r_data; }, + call_back, nullptr); EXPECT_EQ(list.size(), types.size() * 14); @@ -474,9 +468,8 @@ TEST_F(RecoveryTest, SystemInfo) { // LOG(ERROR) << "call back:" << request->seq(); }; - recovery.ReadLogs( - [&](const SystemInfoData &r_data) { data = r_data; }, call_back, - nullptr); + recovery.ReadLogs([&](const SystemInfoData &r_data) { data = r_data; }, + call_back, nullptr); EXPECT_EQ(data.view(), 2); EXPECT_EQ(data.primary_id(), 2); From e53e41d4d5fd5d4d3a396cc89f40084f6aff83b4 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 1 Apr 2026 17:18:55 -0700 
Subject: [PATCH 58/66] Reorganize file structure of recovery files --- .../consensus/ordering/raft/algorithm/BUILD | 4 +- .../consensus/ordering/raft/algorithm/raft.h | 6 +- .../consensus/ordering/raft/framework/BUILD | 47 +----- .../raft/framework/transaction_utils.cpp | 41 ----- .../raft/framework/transaction_utils.h | 38 ----- platform/consensus/ordering/raft/proto/BUILD | 13 +- platform/consensus/recovery/BUILD | 39 ++++- .../framework => recovery}/raft_recovery.cpp | 6 +- .../framework => recovery}/raft_recovery.h | 4 +- .../raft_recovery_test.cpp | 4 +- platform/consensus/recovery/recovery.h | 1 - platform/consensus/recovery/recovery_impl.h | 130 +++++++++++++++ .../recovery/recovery_template_functions.h | 150 ------------------ 13 files changed, 184 insertions(+), 299 deletions(-) delete mode 100644 platform/consensus/ordering/raft/framework/transaction_utils.cpp delete mode 100644 platform/consensus/ordering/raft/framework/transaction_utils.h rename platform/consensus/{ordering/raft/framework => recovery}/raft_recovery.cpp (98%) rename platform/consensus/{ordering/raft/framework => recovery}/raft_recovery.h (100%) rename platform/consensus/{ordering/raft/framework => recovery}/raft_recovery_test.cpp (94%) delete mode 100644 platform/consensus/recovery/recovery_template_functions.h diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD index ff1a7ac3f5..fd7e90ec56 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -39,8 +39,10 @@ cc_library( "//platform/consensus/execution:system_info", "//platform/networkstrate:replica_communicator", "//platform/proto:viewchange_message_cc_proto", - "//platform/consensus/ordering/raft/framework:raft_recovery" + "//platform/consensus/recovery:raft_recovery" ], + visibility = ["//platform/consensus/ordering/raft:__subpackages__", + "//platform/consensus/recovery:__subpackages__"], ) cc_library( diff 
--git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 83fc230640..a96789e0d0 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -33,12 +33,12 @@ #include "platform/common/queue/lock_free_queue.h" #include "platform/consensus/ordering/common/algorithm/protocol_base.h" +#include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" #include "platform/consensus/ordering/raft/proto/proposal.pb.h" +#include "platform/consensus/recovery/raft_recovery.h" +#include "platform/networkstrate/replica_communicator.h" #include "platform/proto/resdb.pb.h" #include "platform/statistic/stats.h" -#include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" -#include "platform/networkstrate/replica_communicator.h" -#include "platform/consensus/ordering/raft/framework/raft_recovery.h" namespace resdb { namespace raft { diff --git a/platform/consensus/ordering/raft/framework/BUILD b/platform/consensus/ordering/raft/framework/BUILD index bdf13fdbd6..c92e301190 100644 --- a/platform/consensus/ordering/raft/framework/BUILD +++ b/platform/consensus/ordering/raft/framework/BUILD @@ -18,21 +18,13 @@ package(default_visibility = ["//platform/consensus/ordering/raft:__subpackages__"]) -cc_library( - name = "transaction_utils", - srcs = ["transaction_utils.cpp"], - hdrs = ["transaction_utils.h"], - deps = [ - "//platform/proto:resdb_cc_proto", - ], -) cc_library( name = "checkpoint_manager", srcs = ["checkpoint_manager.cpp"], hdrs = ["checkpoint_manager.h"], deps = [ - ":transaction_utils", + "//platform/consensus/ordering/common:transaction_utils", "//chain/state:chain_state", "//common/crypto:signature_verifier", "//interface/common:resdb_txn_accessor", @@ -45,24 +37,6 @@ cc_library( ], ) -cc_library( - name = "raft_recovery", - srcs = ["raft_recovery.cpp"], - hdrs = ["raft_recovery.h"], - deps = [ - "//chain/storage", - 
"//common/utils", - "//platform/consensus/ordering/raft/proto:proposal_cc_proto", - "//platform/config:resdb_config", - "//platform/consensus/checkpoint", - "//platform/consensus/execution:system_info", - "//platform/networkstrate:server_comm", - "//platform/proto:resdb_cc_proto", - "//platform/proto:system_info_data_cc_proto", - "//platform/consensus/recovery:recovery" - ], -) - cc_library( name = "consensus", srcs = ["consensus.cpp"], @@ -78,22 +52,3 @@ cc_library( "//platform/consensus/ordering/raft/algorithm:raft", ], ) - -cc_test( - name = "raft_recovery_test", - srcs = [ - "raft_recovery_test.cpp", - ], - copts = ["-DRAFT_TEST_MODE"], - deps = [ - ":raft_recovery", - "//chain/storage:mock_storage", - "//platform/consensus/ordering/raft/proto:proposal_cc_proto", - "//platform/consensus/checkpoint:mock_checkpoint", - ":transaction_utils", - "//common/test:test_main", - "//platform/proto:client_test_cc_proto", - "//platform/consensus/ordering/raft/algorithm:raft" - ], - size="small" -) \ No newline at end of file diff --git a/platform/consensus/ordering/raft/framework/transaction_utils.cpp b/platform/consensus/ordering/raft/framework/transaction_utils.cpp deleted file mode 100644 index b423407e32..0000000000 --- a/platform/consensus/ordering/raft/framework/transaction_utils.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "platform/consensus/ordering/raft/framework/transaction_utils.h" - -namespace resdb { - -std::unique_ptr NewRequest(Request::Type type, const Request& request, - int sender_id) { - auto new_request = std::make_unique(request); - new_request->set_type(type); - new_request->set_sender_id(sender_id); - return new_request; -} - -std::unique_ptr NewRequest(Request::Type type, const Request& request, - int sender_id, int region_id) { - auto new_request = std::make_unique(request); - new_request->set_type(type); - new_request->set_sender_id(sender_id); - new_request->mutable_region_info()->set_region_id(region_id); - return new_request; -} - -} // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/transaction_utils.h b/platform/consensus/ordering/raft/framework/transaction_utils.h deleted file mode 100644 index e5e3eac222..0000000000 --- a/platform/consensus/ordering/raft/framework/transaction_utils.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#pragma once -#include "platform/proto/replica_info.pb.h" -#include "platform/proto/resdb.pb.h" - -namespace resdb { - -enum CollectorResultCode { - INVALID = -2, - OK = 0, - STATE_CHANGED = 1, -}; - -std::unique_ptr NewRequest(Request::Type type, const Request& request, - int sender_id); - -std::unique_ptr NewRequest(Request::Type type, const Request& request, - int sender_id, int region_info); - -} // namespace resdb diff --git a/platform/consensus/ordering/raft/proto/BUILD b/platform/consensus/ordering/raft/proto/BUILD index 2d2a90c38d..114b78e24b 100644 --- a/platform/consensus/ordering/raft/proto/BUILD +++ b/platform/consensus/ordering/raft/proto/BUILD @@ -31,15 +31,6 @@ proto_library( cc_proto_library( name = "proposal_cc_proto", deps = [":proposal_proto"], -) - -proto_library( - name = "persistent_state_proto", - srcs = ["persistent_state.proto"], - #visibility = ["//visibility:public"], -) - -cc_proto_library( - name = "persistent_state_cc_proto", - deps = [":persistent_state_proto"], + visibility = ["//platform/consensus/ordering/raft:__subpackages__", + "//platform/consensus/recovery:__subpackages__"], ) diff --git a/platform/consensus/recovery/BUILD b/platform/consensus/recovery/BUILD index b23a046f95..10facb9809 100644 --- a/platform/consensus/recovery/BUILD +++ b/platform/consensus/recovery/BUILD @@ -20,7 +20,7 @@ package(default_visibility = ["//platform/consensus:__subpackages__"]) cc_library( name = "recovery_base", - hdrs = ["recovery.h", "recovery_impl.h", "recovery_template_functions.h"], + hdrs = ["recovery.h", 
"recovery_impl.h"], deps = [ "//chain/storage", "//common/utils", @@ -53,3 +53,40 @@ cc_test( "//platform/consensus/ordering/common:transaction_utils", ], ) + +cc_library( + name = "raft_recovery", + srcs = ["raft_recovery.cpp"], + hdrs = ["raft_recovery.h"], + deps = [ + "//chain/storage", + "//common/utils", + "//platform/consensus/ordering/raft/proto:proposal_cc_proto", + "//platform/config:resdb_config", + "//platform/consensus/checkpoint", + "//platform/consensus/execution:system_info", + "//platform/networkstrate:server_comm", + "//platform/proto:resdb_cc_proto", + "//platform/proto:system_info_data_cc_proto", + "//platform/consensus/recovery:recovery_base" + ], +) + +cc_test( + name = "raft_recovery_test", + srcs = [ + "raft_recovery_test.cpp", + ], + copts = ["-DRAFT_TEST_MODE"], + deps = [ + ":raft_recovery", + "//chain/storage:mock_storage", + "//platform/consensus/ordering/raft/proto:proposal_cc_proto", + "//platform/consensus/checkpoint:mock_checkpoint", + "//platform/consensus/ordering/common:transaction_utils", + "//common/test:test_main", + "//platform/proto:client_test_cc_proto", + "//platform/consensus/ordering/raft/algorithm:raft" + ], + size="small" +) diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.cpp b/platform/consensus/recovery/raft_recovery.cpp similarity index 98% rename from platform/consensus/ordering/raft/framework/raft_recovery.cpp rename to platform/consensus/recovery/raft_recovery.cpp index 8350f16f08..60a031ada0 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery.cpp +++ b/platform/consensus/recovery/raft_recovery.cpp @@ -17,7 +17,7 @@ * under the License. 
*/ -#include "platform/consensus/ordering/raft/framework/raft_recovery.h" +#include "platform/consensus/recovery/raft_recovery.h" #include #include @@ -62,8 +62,8 @@ void RaftRecovery::Init() { SwitchFile(file_path_, callback); LOG(ERROR) << " init done"; - meta_file_path_ = std::filesystem::path(base_file_path_).parent_path() - / "raft_metadata.dat"; + meta_file_path_ = std::filesystem::path(base_file_path_).parent_path() / + "raft_metadata.dat"; LOG(INFO) << "Meta file path: " << meta_file_path_; OpenMetadataFile(); diff --git a/platform/consensus/ordering/raft/framework/raft_recovery.h b/platform/consensus/recovery/raft_recovery.h similarity index 100% rename from platform/consensus/ordering/raft/framework/raft_recovery.h rename to platform/consensus/recovery/raft_recovery.h index c0ef85e606..0eab36fdb1 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery.h +++ b/platform/consensus/recovery/raft_recovery.h @@ -25,11 +25,11 @@ #include "platform/config/resdb_config.h" #include "platform/consensus/checkpoint/checkpoint.h" #include "platform/consensus/execution/system_info.h" +#include "platform/consensus/ordering/raft/proto/proposal.pb.h" +#include "platform/consensus/recovery/recovery.h" #include "platform/networkstrate/server_comm.h" #include "platform/proto/resdb.pb.h" -#include "platform/consensus/recovery/recovery.h" #include "platform/proto/system_info_data.pb.h" -#include "platform/consensus/ordering/raft/proto/proposal.pb.h" namespace resdb { diff --git a/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp b/platform/consensus/recovery/raft_recovery_test.cpp similarity index 94% rename from platform/consensus/ordering/raft/framework/raft_recovery_test.cpp rename to platform/consensus/recovery/raft_recovery_test.cpp index 477b692596..554c36079d 100644 --- a/platform/consensus/ordering/raft/framework/raft_recovery_test.cpp +++ b/platform/consensus/recovery/raft_recovery_test.cpp @@ -1,4 +1,4 @@ -#include 
"platform/consensus/ordering/raft/framework/raft_recovery.h" +#include "platform/consensus/recovery/raft_recovery.h" #include #include @@ -8,7 +8,7 @@ #include "chain/storage/mock_storage.h" #include "platform/consensus/checkpoint/mock_checkpoint.h" -#include "platform/consensus/ordering/raft/framework/transaction_utils.h" +#include "platform/consensus/ordering/common/transaction_utils.h" #include "platform/consensus/ordering/raft/proto/proposal.pb.h" namespace resdb { diff --git a/platform/consensus/recovery/recovery.h b/platform/consensus/recovery/recovery.h index ec55337b62..233cbf0e26 100644 --- a/platform/consensus/recovery/recovery.h +++ b/platform/consensus/recovery/recovery.h @@ -121,6 +121,5 @@ class RecoveryBase { }; #include "platform/consensus/recovery/recovery_impl.h" -#include "platform/consensus/recovery/recovery_template_functions.h" } // namespace resdb diff --git a/platform/consensus/recovery/recovery_impl.h b/platform/consensus/recovery/recovery_impl.h index a6e4875572..aefdb399c5 100644 --- a/platform/consensus/recovery/recovery_impl.h +++ b/platform/consensus/recovery/recovery_impl.h @@ -373,3 +373,133 @@ RecoveryBase::GetSortedRecoveryFiles( sort(list.begin(), list.end()); return list; } + +template +void RecoveryBase::ReadLogs( + std::function system_callback, + TCallback call_back, std::function set_start_point) { + if (recovery_enabled_ == false) { + return; + } + + int64_t storage_ckpt = 0; + if (storage_) { + storage_ckpt = storage_->GetLastCheckpoint(); + } + std::unique_lock lk(mutex_); + + auto recovery_files_pair = GetRecoveryFiles(storage_ckpt); + int64_t ckpt = recovery_files_pair.second; + if (set_start_point) { + set_start_point(ckpt); + } + int idx = 0; + for (auto path : recovery_files_pair.first) { + ReadLogsFromFiles(path.second, ckpt, idx++, system_callback, call_back); + } +} + +template +void RecoveryBase::SwitchFile( + const std::string& file_path, TCallback call_back) { + std::unique_lock lk(mutex_); + + min_seq_ = -1; + 
max_seq_ = -1; + ReadLogsFromFiles( + file_path, 0, 0, [&](const TSystemInfoData& data) {}, call_back); + OpenFile(file_path); + LOG(INFO) << "switch to file:" << file_path << " seq:" + << "[" << min_seq_ << "," << max_seq_ << "]"; +} + +template +void RecoveryBase::OpenFile( + const std::string& path) { + if (fd_ >= 0) { + close(fd_); + } + fd_ = open(path.c_str(), O_CREAT | O_WRONLY, 0666); + if (fd_ < 0) { + LOG(ERROR) << "open file fail:" << path << " error:" << strerror(errno); + } + + int pos = lseek(fd_, 0, SEEK_END); + LOG(INFO) << "file path:" << path << " len:" << pos << " fd:" << fd_; + + if (pos == 0) { + static_cast(this)->WriteSystemInfo(); + } + + lseek(fd_, 0, SEEK_END); + LOG(ERROR) << "open file:" << path << " pos:" << lseek(fd_, 0, SEEK_CUR) + << " fd:" << fd_; + assert(fd_ >= 0); +} + +template +void RecoveryBase::ReadLogsFromFiles( + const std::string& path, int64_t ckpt, int file_idx, + std::function system_callback, + TCallback call_back) { + int fd = open(path.c_str(), O_CREAT | O_RDONLY, 0666); + if (fd < 0) { + LOG(ERROR) << " open file fail:" << path; + } + LOG(INFO) << "read logs:" << path << " pos:" << lseek(fd, 0, SEEK_CUR); + assert(fd >= 0); + + size_t data_len = 0; + if constexpr (std::is_same_v) { + Read(fd, sizeof(data_len), reinterpret_cast(&data_len)); + { + std::string data; + char* buf = new char[data_len]; + if (!Read(fd, data_len, buf)) { + LOG(ERROR) << "Read system info fail"; + return; + } + data = std::string(buf, data_len); + delete buf; + std::vector data_list = ParseRawData(data); + + bool successful_callback = + static_cast(this)->PerformSystemCallback(data_list, + system_callback); + + if (!successful_callback) { + LOG(ERROR) << "parse info fail:" << data.size(); + } + } + } + + decltype(ParseData(std::string{})) request_list; + + while (Read(fd, sizeof(data_len), reinterpret_cast(&data_len))) { + std::string data; + char* buf = new char[data_len]; + if (!Read(fd, data_len, buf)) { + LOG(ERROR) << "Read data log 
fail"; + break; + } + data = std::string(buf, data_len); + delete buf; + + auto list = ParseData(data); + if (list.size() == 0) { + request_list.clear(); + break; + } + for (auto& l : list) { + request_list.push_back(std::move(l)); + } + } + if (request_list.size() == 0) { + ftruncate(fd, 0); + } + + static_cast(this)->PerformCallback(request_list, call_back, ckpt); + + LOG(ERROR) << "read log from files:" << path << " done"; + close(fd); +} \ No newline at end of file diff --git a/platform/consensus/recovery/recovery_template_functions.h b/platform/consensus/recovery/recovery_template_functions.h deleted file mode 100644 index 306e505f48..0000000000 --- a/platform/consensus/recovery/recovery_template_functions.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#pragma once - -template -void RecoveryBase::ReadLogs( - std::function system_callback, - TCallback call_back, std::function set_start_point) { - if (recovery_enabled_ == false) { - return; - } - - int64_t storage_ckpt = 0; - if (storage_) { - storage_ckpt = storage_->GetLastCheckpoint(); - } - std::unique_lock lk(mutex_); - - auto recovery_files_pair = GetRecoveryFiles(storage_ckpt); - int64_t ckpt = recovery_files_pair.second; - if (set_start_point) { - set_start_point(ckpt); - } - int idx = 0; - for (auto path : recovery_files_pair.first) { - ReadLogsFromFiles(path.second, ckpt, idx++, system_callback, call_back); - } -} - -template -void RecoveryBase::SwitchFile( - const std::string& file_path, TCallback call_back) { - std::unique_lock lk(mutex_); - - min_seq_ = -1; - max_seq_ = -1; - ReadLogsFromFiles( - file_path, 0, 0, [&](const TSystemInfoData& data) {}, call_back); - OpenFile(file_path); - LOG(INFO) << "switch to file:" << file_path << " seq:" - << "[" << min_seq_ << "," << max_seq_ << "]"; -} - -template -void RecoveryBase::OpenFile( - const std::string& path) { - if (fd_ >= 0) { - close(fd_); - } - fd_ = open(path.c_str(), O_CREAT | O_WRONLY, 0666); - if (fd_ < 0) { - LOG(ERROR) << "open file fail:" << path << " error:" << strerror(errno); - } - - int pos = lseek(fd_, 0, SEEK_END); - LOG(INFO) << "file path:" << path << " len:" << pos << " fd:" << fd_; - - if (pos == 0) { - static_cast(this)->WriteSystemInfo(); - } - - lseek(fd_, 0, SEEK_END); - LOG(ERROR) << "open file:" << path << " pos:" << lseek(fd_, 0, SEEK_CUR) - << " fd:" << fd_; - assert(fd_ >= 0); -} - -template -void RecoveryBase::ReadLogsFromFiles( - const std::string& path, int64_t ckpt, int file_idx, - std::function system_callback, - TCallback call_back) { - int fd = open(path.c_str(), O_CREAT | O_RDONLY, 0666); - if (fd < 0) { - LOG(ERROR) << " open file fail:" << path; - } - LOG(INFO) << "read logs:" << path << " pos:" << lseek(fd, 0, SEEK_CUR); - assert(fd >= 0); - - size_t 
data_len = 0; - if constexpr (std::is_same_v) { - Read(fd, sizeof(data_len), reinterpret_cast(&data_len)); - { - std::string data; - char* buf = new char[data_len]; - if (!Read(fd, data_len, buf)) { - LOG(ERROR) << "Read system info fail"; - return; - } - data = std::string(buf, data_len); - delete buf; - std::vector data_list = ParseRawData(data); - - bool successful_callback = - static_cast(this)->PerformSystemCallback(data_list, - system_callback); - - if (!successful_callback) { - LOG(ERROR) << "parse info fail:" << data.size(); - } - } - } - - decltype(ParseData(std::string{})) request_list; - - while (Read(fd, sizeof(data_len), reinterpret_cast(&data_len))) { - std::string data; - char* buf = new char[data_len]; - if (!Read(fd, data_len, buf)) { - LOG(ERROR) << "Read data log fail"; - break; - } - data = std::string(buf, data_len); - delete buf; - - auto list = ParseData(data); - if (list.size() == 0) { - request_list.clear(); - break; - } - for (auto& l : list) { - request_list.push_back(std::move(l)); - } - } - if (request_list.size() == 0) { - ftruncate(fd, 0); - } - - static_cast(this)->PerformCallback(request_list, call_back, ckpt); - - LOG(ERROR) << "read log from files:" << path << " done"; - close(fd); -} \ No newline at end of file From 9bc819c487fc5f8acb4ed508f2810c944bd863a9 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 1 Apr 2026 17:26:56 -0700 Subject: [PATCH 59/66] Update things missed in previous commit --- .../consensus/ordering/pbft/consensus_manager_pbft.cpp | 2 +- platform/consensus/ordering/raft/framework/BUILD | 2 +- .../ordering/raft/framework/checkpoint_manager.cpp | 2 +- platform/consensus/ordering/raft/framework/consensus.cpp | 2 +- platform/consensus/ordering/raft/framework/consensus.h | 7 +++---- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp b/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp index b4b625d4ca..9023365f49 100644 --- 
a/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp +++ b/platform/consensus/ordering/pbft/consensus_manager_pbft.cpp @@ -64,7 +64,7 @@ ConsensusManagerPBFT::ConsensusManagerPBFT( view_change_manager_->SetDuplicateManager(commitment_->GetDuplicateManager()); - recovery_->ReadLogs( + recovery_->ReadLogs( [&](const SystemInfoData& data) { LOG(ERROR) << " read data info:" << data.view() << " primary:" << data.primary_id(); diff --git a/platform/consensus/ordering/raft/framework/BUILD b/platform/consensus/ordering/raft/framework/BUILD index c92e301190..0d9d75f728 100644 --- a/platform/consensus/ordering/raft/framework/BUILD +++ b/platform/consensus/ordering/raft/framework/BUILD @@ -46,7 +46,7 @@ cc_library( ], deps = [ ":checkpoint_manager", - ":raft_recovery", + "//platform/consensus/recovery:raft_recovery", "//common/utils", "//platform/consensus/ordering/common/framework:consensus", "//platform/consensus/ordering/raft/algorithm:raft", diff --git a/platform/consensus/ordering/raft/framework/checkpoint_manager.cpp b/platform/consensus/ordering/raft/framework/checkpoint_manager.cpp index ca9bddeca7..1a34bbbf47 100644 --- a/platform/consensus/ordering/raft/framework/checkpoint_manager.cpp +++ b/platform/consensus/ordering/raft/framework/checkpoint_manager.cpp @@ -21,7 +21,7 @@ #include -#include "platform/consensus/ordering/raft/framework/transaction_utils.h" +#include "platform/consensus/ordering/common/transaction_utils.h" #include "platform/proto/checkpoint_info.pb.h" namespace resdb { diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 197b842989..81a77cb3ba 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -120,7 +120,7 @@ int Consensus::ProcessCustomConsensus(std::unique_ptr request) { } void Consensus::RecoverFromLogs() { - recovery_->ReadLogs( + recovery_->ReadLogs( [&](const 
RaftMetadata& metadata) { LOG(INFO) << " read current term: " << metadata.current_term << " voted for: " << metadata.voted_for; diff --git a/platform/consensus/ordering/raft/framework/consensus.h b/platform/consensus/ordering/raft/framework/consensus.h index e6580f60c9..063fcfc069 100644 --- a/platform/consensus/ordering/raft/framework/consensus.h +++ b/platform/consensus/ordering/raft/framework/consensus.h @@ -21,12 +21,11 @@ #include "executor/common/transaction_manager.h" #include "platform/consensus/ordering/common/framework/consensus.h" -#include "platform/consensus/ordering/raft/algorithm/raft.h" #include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" -#include "platform/networkstrate/consensus_manager.h" - +#include "platform/consensus/ordering/raft/algorithm/raft.h" #include "platform/consensus/ordering/raft/framework/checkpoint_manager.h" -#include "platform/consensus/ordering/raft/framework/raft_recovery.h" +#include "platform/consensus/recovery/raft_recovery.h" +#include "platform/networkstrate/consensus_manager.h" namespace resdb { namespace raft { From 341ea182f1c1bfaa0e38f333bae9089e247c93d9 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Thu, 2 Apr 2026 11:30:03 -0700 Subject: [PATCH 60/66] Fix RaftRecovery issue, add test for it, adjust dependencies --- .../ordering/raft/algorithm/recovery_tests.h | 161 ------------------ platform/consensus/recovery/BUILD | 6 +- platform/consensus/recovery/pbft_recovery.cpp | 21 +++ platform/consensus/recovery/pbft_recovery.h | 4 + platform/consensus/recovery/raft_recovery.cpp | 8 +- platform/consensus/recovery/raft_recovery.h | 7 +- .../consensus/recovery/raft_recovery_test.cpp | 33 +++- platform/consensus/recovery/recovery.h | 7 +- platform/consensus/recovery/recovery_impl.h | 23 +-- 9 files changed, 64 insertions(+), 206 deletions(-) delete mode 100644 platform/consensus/ordering/raft/algorithm/recovery_tests.h diff --git a/platform/consensus/ordering/raft/algorithm/recovery_tests.h 
b/platform/consensus/ordering/raft/algorithm/recovery_tests.h deleted file mode 100644 index 409344fb77..0000000000 --- a/platform/consensus/ordering/raft/algorithm/recovery_tests.h +++ /dev/null @@ -1,161 +0,0 @@ -#include - -#include "common/crypto/mock_signature_verifier.h" -#include "platform/config/resdb_config_utils.h" -#include "platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h" -#include "platform/consensus/ordering/raft/algorithm/raft.h" -#include "platform/networkstrate/mock_replica_communicator.h" -#include "platform/proto/client_test.pb.h" - -namespace resdb { -namespace raft { -using ::testing::_; -using ::testing::AnyNumber; -using ::testing::Invoke; -using ::testing::Matcher; - -ResDBConfig GenerateConfig() { - ResConfigData data; - data.set_duplicate_check_frequency_useconds(100000); - data.set_enable_viewchange(true); - return ResDBConfig({GenerateReplicaInfo(1, "127.0.0.1", 1234), - GenerateReplicaInfo(2, "127.0.0.1", 1235), - GenerateReplicaInfo(3, "127.0.0.1", 1236), - GenerateReplicaInfo(4, "127.0.0.1", 1237)}, - GenerateReplicaInfo(1, "127.0.0.1", 1234), data); -} - -class RecoveryTest : public ::testing::Test { - private: - class MockSendMessageFunction { - public: - MOCK_METHOD(int, Call, (int, const google::protobuf::Message&, int)); - }; - class MockBroadcastFunction { - public: - MOCK_METHOD(int, Broadcast, (int, const google::protobuf::Message&)); - }; - class MockCommitFunction { - public: - MOCK_METHOD(int, Commit, (const google::protobuf::Message&)); - }; - - protected: - void SetUp() override { - auto config = GenerateConfig(); - verifier_ = std::make_unique(); - leader_election_manager_ = - std::make_unique(config); - replica_communicator_ = std::make_unique(); - recovery_ = std::make_unique( - config, CheckPoint * checkpoint, SystemInfo * system_info, - Storage * storage); - raft_ = std::make_unique( - /*id=*/1, - /*f=*/1, - /*total=*/4, verifier_.get(), leader_election_manager_.get(), - 
replica_communicator_.get(), recovery_.get()); - - raft_->SetSingleCallFunc( - [&](int type, const google::protobuf::Message& msg, int node_id) { - return mock_call.Call(type, msg, node_id); - }); - - raft_->SetBroadcastCallFunc( - [&](int type, const google::protobuf::Message& msg) { - return mock_broadcast.Broadcast(type, msg); - }); - - raft_->SetCommitFunc([&](const google::protobuf::Message& msg) { - return mock_commit.Commit(msg); - }); - } - - AeFields CreateAeFields(uint64_t term, int leaderId, uint64_t prevLogIndex, - uint64_t prevLogTerm, - const std::vector& entries, - uint64_t leaderCommit, int followerId) { - AeFields fields{}; - fields.term = term; - fields.leaderId = leaderId; - fields.leaderCommit = leaderCommit; - fields.prevLogIndex = prevLogIndex; - fields.prevLogTerm = prevLogTerm; - fields.followerId = followerId; - - for (const auto& logEntry : entries) { - LogEntry entry; - entry.term = logEntry.term; - entry.command = logEntry.command; - fields.entries.push_back(std::move(entry)); - } - - return fields; - }; - - // Helper to create a single log entry. - LogEntry CreateLogEntry(uint64_t term, const std::string& command_data) { - LogEntry entry; - entry.term = term; - entry.command = command_data; - return entry; - } - - // Helper to create a vector of log entries for testing. 
- std::vector CreateLogEntries( - const std::vector>& term_and_cmds, - bool usedForLogPatch = false) { - std::vector entries; - - if (usedForLogPatch) { - LogEntry first_entry; - first_entry.term = 0; - first_entry.command = "COMMON_PREFIX"; - entries.push_back(first_entry); - } - - for (const auto& [term, cmd] : term_and_cmds) { - LogEntry entry; - entry.term = term; - - ClientTestRequest req; - req.set_value(cmd); - - std::string serialized; - req.SerializeToString(&serialized); - entry.command = serialized; - - entries.push_back(entry); - } - - return entries; - } - - AppendEntries CreateAeMessage(const AeFields& fields) { - AppendEntries ae; - ae.set_term(fields.term); - ae.set_leaderid(fields.leaderId); - ae.set_prevlogindex(fields.prevLogIndex); - ae.set_prevlogterm(fields.prevLogTerm); - ae.set_leadercommitindex(fields.leaderCommit); - for (const auto& entry : fields.entries) { - auto* newEntry = ae.add_entries(); - newEntry->set_term(entry.term); - newEntry->set_command(entry.command); - } - - return ae; - } - - std::unique_ptr verifier_; - std::unique_ptr leader_election_manager_; - std::unique_ptr replica_communicator_; - std::unique_ptr recovery_; - std::unique_ptr raft_; - MockSendMessageFunction mock_call; - MockBroadcastFunction mock_broadcast; - MockCommitFunction mock_commit; -}; - -} // namespace raft -} // namespace resdb diff --git a/platform/consensus/recovery/BUILD b/platform/consensus/recovery/BUILD index 10facb9809..72d30613c4 100644 --- a/platform/consensus/recovery/BUILD +++ b/platform/consensus/recovery/BUILD @@ -26,10 +26,8 @@ cc_library( "//common/utils", "//platform/config:resdb_config", "//platform/consensus/checkpoint", - "//platform/consensus/execution:system_info", "//platform/networkstrate:server_comm", "//platform/proto:resdb_cc_proto", - "//platform/proto:system_info_data_cc_proto", ], ) @@ -39,6 +37,8 @@ cc_library( hdrs = ["pbft_recovery.h"], deps = [ ":recovery_base", + "//platform/consensus/execution:system_info", + 
"//platform/proto:system_info_data_cc_proto", ], ) @@ -64,10 +64,8 @@ cc_library( "//platform/consensus/ordering/raft/proto:proposal_cc_proto", "//platform/config:resdb_config", "//platform/consensus/checkpoint", - "//platform/consensus/execution:system_info", "//platform/networkstrate:server_comm", "//platform/proto:resdb_cc_proto", - "//platform/proto:system_info_data_cc_proto", "//platform/consensus/recovery:recovery_base" ], ) diff --git a/platform/consensus/recovery/pbft_recovery.cpp b/platform/consensus/recovery/pbft_recovery.cpp index d54cff4e12..ca04343280 100644 --- a/platform/consensus/recovery/pbft_recovery.cpp +++ b/platform/consensus/recovery/pbft_recovery.cpp @@ -177,6 +177,27 @@ bool PBFTRecovery::PerformSystemCallback( return true; } +void PBFTRecovery::HandleSystemInfo( + int fd, std::function system_callback) { + size_t data_len = 0; + Read(fd, sizeof(data_len), reinterpret_cast(&data_len)); + std::string data; + char* buf = new char[data_len]; + if (!Read(fd, data_len, buf)) { + LOG(ERROR) << "Read system info fail"; + return; + } + data = std::string(buf, data_len); + delete buf; + std::vector data_list = ParseRawData(data); + + bool successful_callback = PerformSystemCallback(data_list, system_callback); + + if (!successful_callback) { + LOG(ERROR) << "parse info fail:" << data.size(); + } +} + std::map< uint64_t, std::vector, std::unique_ptr>>> diff --git a/platform/consensus/recovery/pbft_recovery.h b/platform/consensus/recovery/pbft_recovery.h index 1de2714e23..6925b8ba16 100644 --- a/platform/consensus/recovery/pbft_recovery.h +++ b/platform/consensus/recovery/pbft_recovery.h @@ -21,6 +21,7 @@ #include "platform/consensus/execution/system_info.h" #include "platform/consensus/recovery/recovery.h" +#include "platform/proto/system_info_data.pb.h" namespace resdb { using CallbackType = @@ -63,6 +64,9 @@ class PBFTRecovery std::vector data_list, std::function system_callback); + void HandleSystemInfo( + int fd, std::function system_callback); + 
SystemInfo* system_info_; }; diff --git a/platform/consensus/recovery/raft_recovery.cpp b/platform/consensus/recovery/raft_recovery.cpp index 60a031ada0..46ad423118 100644 --- a/platform/consensus/recovery/raft_recovery.cpp +++ b/platform/consensus/recovery/raft_recovery.cpp @@ -201,12 +201,12 @@ void RaftRecovery::PerformCallback( LOG(ERROR) << " recovery max seq:" << max_seq; } -bool RaftRecovery::PerformSystemCallback( - std::vector data_list, - std::function system_callback) { +void RaftRecovery::HandleSystemInfo( + int /*fd*/, std::function system_callback) { RaftMetadata info = ReadMetadata(); + LOG(ERROR) << " info.voted_for: " << info.voted_for << "\ninfo.current_term " + << info.current_term; system_callback(info); - return true; } } // namespace raft diff --git a/platform/consensus/recovery/raft_recovery.h b/platform/consensus/recovery/raft_recovery.h index 0eab36fdb1..f1f5d1aa37 100644 --- a/platform/consensus/recovery/raft_recovery.h +++ b/platform/consensus/recovery/raft_recovery.h @@ -24,12 +24,10 @@ #include "chain/storage/storage.h" #include "platform/config/resdb_config.h" #include "platform/consensus/checkpoint/checkpoint.h" -#include "platform/consensus/execution/system_info.h" #include "platform/consensus/ordering/raft/proto/proposal.pb.h" #include "platform/consensus/recovery/recovery.h" #include "platform/networkstrate/server_comm.h" #include "platform/proto/resdb.pb.h" -#include "platform/proto/system_info_data.pb.h" namespace resdb { @@ -69,9 +67,8 @@ class RaftRecovery std::function entry)> call_back, int64_t ckpt); - bool PerformSystemCallback( - std::vector data_list, - std::function system_callback); + void HandleSystemInfo( + int /*fd*/, std::function system_callback); int metadata_fd_; std::string meta_file_path_; diff --git a/platform/consensus/recovery/raft_recovery_test.cpp b/platform/consensus/recovery/raft_recovery_test.cpp index 554c36079d..04141a4fc9 100644 --- a/platform/consensus/recovery/raft_recovery_test.cpp +++ 
b/platform/consensus/recovery/raft_recovery_test.cpp @@ -44,29 +44,27 @@ std::vector Listlogs(const std::string &path) { class RaftRecoveryTest : public Test { public: RaftRecoveryTest() - : config_(GetConfigData(), ReplicaInfo(), KeyInfo(), CertificateInfo()), - system_info_() { + : config_(GetConfigData(), ReplicaInfo(), KeyInfo(), CertificateInfo()) { std::string dir = std::filesystem::path(log_path).parent_path(); std::filesystem::remove_all(dir); } protected: ResDBConfig config_; - SystemInfo system_info_; MockCheckPoint checkpoint_; }; -TEST_F(RaftRecoveryTest, ReadLog) { +TEST_F(RaftRecoveryTest, WriteAndReadLog) { int entries_to_add = 3; { RaftRecovery recovery(config_, &checkpoint_, nullptr); for (int i = 0; i < entries_to_add; i++) { - // Set up the Log Entry to be added Entry logEntry; logEntry.set_term(i + 1); auto req = std::make_unique(); req->set_seq(i + 1); + req->set_data("Request " + std::to_string(i + 1)); std::string serialized; if (!req->SerializeToString(&serialized)) { assert(false); @@ -87,9 +85,34 @@ TEST_F(RaftRecoveryTest, ReadLog) { for (size_t i = 0; i < entries_to_add; ++i) { EXPECT_EQ(list[i].term(), i + 1); + Request req; + req.ParseFromString(list[i].command()); + EXPECT_EQ(req.data(), "Request " + std::to_string(i + 1)); } } } +TEST_F(RaftRecoveryTest, WriteAndReadMetadata) { + { + RaftRecovery recovery(config_, &checkpoint_, nullptr); + + recovery.WriteMetadata(2, 1); + } + { + int64_t current_term; + int32_t voted_for; + RaftRecovery recovery(config_, &checkpoint_, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &data) { + current_term = data.current_term; + voted_for = data.voted_for; + }, + [&](std::unique_ptr entry) {}, nullptr); + + EXPECT_EQ(current_term, 2); + EXPECT_EQ(voted_for, 1); + } +} + } // namespace raft } // namespace resdb diff --git a/platform/consensus/recovery/recovery.h b/platform/consensus/recovery/recovery.h index 233cbf0e26..07ed6de53f 100644 --- a/platform/consensus/recovery/recovery.h +++ 
b/platform/consensus/recovery/recovery.h @@ -37,7 +37,6 @@ #include "platform/consensus/checkpoint/checkpoint.h" #include "platform/networkstrate/server_comm.h" #include "platform/proto/resdb.pb.h" -#include "platform/proto/system_info_data.pb.h" namespace resdb { @@ -59,12 +58,10 @@ class RecoveryBase { std::vector> GetSortedRecoveryFiles( uint64_t need_min_seq, uint64_t need_max_seq); - private: - - void WriteLog(const Context* context, const Request* request); + std::vector ParseRawData(const std::string& data); + private: auto ParseData(const std::string& data); - std::vector ParseRawData(const std::string& data); void MayFlush(); diff --git a/platform/consensus/recovery/recovery_impl.h b/platform/consensus/recovery/recovery_impl.h index aefdb399c5..dc3506c583 100644 --- a/platform/consensus/recovery/recovery_impl.h +++ b/platform/consensus/recovery/recovery_impl.h @@ -450,28 +450,7 @@ void RecoveryBase::ReadLogsFromFiles( assert(fd >= 0); size_t data_len = 0; - if constexpr (std::is_same_v) { - Read(fd, sizeof(data_len), reinterpret_cast(&data_len)); - { - std::string data; - char* buf = new char[data_len]; - if (!Read(fd, data_len, buf)) { - LOG(ERROR) << "Read system info fail"; - return; - } - data = std::string(buf, data_len); - delete buf; - std::vector data_list = ParseRawData(data); - - bool successful_callback = - static_cast(this)->PerformSystemCallback(data_list, - system_callback); - - if (!successful_callback) { - LOG(ERROR) << "parse info fail:" << data.size(); - } - } - } + static_cast(this)->HandleSystemInfo(fd, system_callback); decltype(ParseData(std::string{})) request_list; From c14204cd4ed5b7fe94a9647398a9c7aa20fe9266 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Thu, 2 Apr 2026 11:44:25 -0700 Subject: [PATCH 61/66] Remove access modifier change for test compilation, change whitespace --- platform/consensus/ordering/raft/algorithm/raft.cpp | 8 -------- platform/consensus/ordering/raft/framework/consensus.cpp | 1 - 
platform/consensus/ordering/raft/framework/consensus.h | 3 --- platform/consensus/recovery/raft_recovery.h | 2 +- 4 files changed, 1 insertion(+), 13 deletions(-) diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index f3d2b5fcf8..1f317422d0 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -123,13 +123,6 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { logEntry.entry.set_command(std::move(serialized)); logEntry.GetSerializedSize(); AddToLog(logEntry); - - - - // TODO - // durably store the new entry somehow - // otherwise it is a safety violation to treat it as "appended" - // should not be responding to RPCs before durable. lastLogIndex_++; nextIndex_[id_] = lastLogIndex_ + 1; @@ -229,7 +222,6 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { // update lastLogIndex after appends uint64_t firstAppendIdx = lastLogIndex_ + 1; lastLogIndex_ = log_.size() - 1; - // TODO: have to actually store the entry durably before follower can respond to RPC lastLogIndex = lastLogIndex_; if (replicationLoggingFlag_ && appendSize > 0) { diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 81a77cb3ba..b1274d5a92 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -151,6 +151,5 @@ int Consensus::CommitMsg(const google::protobuf::Message& msg) { return 0; } - } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/consensus.h b/platform/consensus/ordering/raft/framework/consensus.h index 063fcfc069..4ca3fe711f 100644 --- a/platform/consensus/ordering/raft/framework/consensus.h +++ b/platform/consensus/ordering/raft/framework/consensus.h @@ -41,9 +41,6 @@ class Consensus : public common::Consensus { int 
ProcessNewTransaction(std::unique_ptr request) override; int CommitMsg(const google::protobuf::Message& msg) override; int CommitMsgInternal(const AppendEntries& txn); -#ifdef RAFT_TEST_MODE - public: -#endif void RecoverFromLogs(); protected: diff --git a/platform/consensus/recovery/raft_recovery.h b/platform/consensus/recovery/raft_recovery.h index f1f5d1aa37..1fe0272aaf 100644 --- a/platform/consensus/recovery/raft_recovery.h +++ b/platform/consensus/recovery/raft_recovery.h @@ -76,4 +76,4 @@ class RaftRecovery }; } // namespace raft -} // namespace resdb \ No newline at end of file +} // namespace resdb From 7745b38931f927e561f600ef829c53902692318e Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Wed, 8 Apr 2026 14:21:15 -0700 Subject: [PATCH 62/66] Address some PR comments, fix broken tests Tests now work again after adding Recovery. Addressed minor test concerns. Add Log Truncation support to Raft Recovery. This includes new WALRecord and TruncationRecord protobuf messages. Move LastLogIndex and log_ resizing into AddToLog and TruncateLog. Add PrintDebugStateLocked() so that PrintDebugState() can be called under the lock. Change Recovery to not ignore all recovery data if a portion of the data is wrong.
--- .../ordering/common/algorithm/protocol_base.h | 4 +- .../consensus/ordering/raft/algorithm/BUILD | 7 + .../leader_election_manager_test.cpp | 6 +- .../ordering/raft/algorithm/mock_raft.h | 4 +- .../ordering/raft/algorithm/raft.cpp | 50 ++++--- .../consensus/ordering/raft/algorithm/raft.h | 7 +- .../algorithm/raft_append_entries_test.cpp | 20 +-- .../ordering/raft/algorithm/raft_tests.h | 42 +++--- .../ordering/raft/framework/consensus.cpp | 20 ++- .../ordering/raft/proto/proposal.proto | 13 ++ platform/consensus/recovery/BUILD | 11 ++ .../consensus/recovery/mock_raft_recovery.h | 46 +++++++ platform/consensus/recovery/raft_recovery.cpp | 130 +++++++++++++----- platform/consensus/recovery/raft_recovery.h | 12 +- .../consensus/recovery/raft_recovery_test.cpp | 105 ++++++++++++-- platform/consensus/recovery/recovery_impl.h | 5 +- 16 files changed, 367 insertions(+), 115 deletions(-) create mode 100644 platform/consensus/recovery/mock_raft_recovery.h diff --git a/platform/consensus/ordering/common/algorithm/protocol_base.h b/platform/consensus/ordering/common/algorithm/protocol_base.h index f8e47052a2..d180746bda 100644 --- a/platform/consensus/ordering/common/algorithm/protocol_base.h +++ b/platform/consensus/ordering/common/algorithm/protocol_base.h @@ -63,9 +63,9 @@ class ProtocolBase { } protected: - int SendMessage(int msg_type, const google::protobuf::Message& msg, + virtual int SendMessage(int msg_type, const google::protobuf::Message& msg, int node_id); - int Broadcast(int msg_type, const google::protobuf::Message& msg); + virtual int Broadcast(int msg_type, const google::protobuf::Message& msg); int Commit(const google::protobuf::Message& msg); bool IsStop(); diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD index fd7e90ec56..214b570d88 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -48,6 +48,7 @@ cc_library( cc_library( 
name = "mock_raft", hdrs = ["mock_raft.h"], + testonly = True, deps = [ ":raft", ], @@ -56,6 +57,7 @@ cc_library( cc_library( name = "mock_leader_election_manager", hdrs = ["mock_leader_election_manager.h"], + testonly = True, deps = [ ":raft", ], @@ -67,6 +69,7 @@ cc_test( deps = [ ":raft", ":mock_raft", + "//platform/consensus/recovery:mock_raft_recovery", "//platform/config:resdb_config_utils", "//common/test:test_main" ], @@ -83,6 +86,7 @@ cc_test( deps = [ ":raft", ":mock_leader_election_manager", + "//platform/consensus/recovery:mock_raft_recovery", "//platform/networkstrate:mock_replica_communicator", "//common/crypto:mock_signature_verifier", "//platform/config:resdb_config_utils", @@ -102,6 +106,7 @@ cc_test( deps = [ ":raft", ":mock_leader_election_manager", + "//platform/consensus/recovery:mock_raft_recovery", "//platform/networkstrate:mock_replica_communicator", "//common/crypto:mock_signature_verifier", "//platform/config:resdb_config_utils", @@ -121,6 +126,7 @@ cc_test( deps = [ ":raft", ":mock_leader_election_manager", + "//platform/consensus/recovery:mock_raft_recovery", "//platform/networkstrate:mock_replica_communicator", "//common/crypto:mock_signature_verifier", "//platform/config:resdb_config_utils", @@ -140,6 +146,7 @@ cc_test( deps = [ ":raft", ":mock_leader_election_manager", + "//platform/consensus/recovery:mock_raft_recovery", "//platform/networkstrate:mock_replica_communicator", "//common/crypto:mock_signature_verifier", "//platform/config:resdb_config_utils", diff --git a/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp b/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp index 1ebd251a62..acb6cc5673 100644 --- a/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp +++ b/platform/consensus/ordering/raft/algorithm/leader_election_manager_test.cpp @@ -7,6 +7,7 @@ #include "platform/config/resdb_config_utils.h" #include 
"platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" #include "platform/consensus/ordering/raft/algorithm/mock_raft.h" +#include "platform/consensus/recovery/mock_raft_recovery.h" namespace resdb { namespace raft { @@ -52,9 +53,11 @@ class LeaderElectionManagerTest : public ::testing::Test { replica_communicator_ = nullptr; leader_election_manager_ = std::make_unique(config_); + mock_recovery_ = std::make_unique(config_); mock_raft_ = std::make_unique(1, 1, 3, verifier_.get(), leader_election_manager_.get(), - replica_communicator_.get()); + replica_communicator_.get(), + mock_recovery_.get()); } void TearDown() override { @@ -71,6 +74,7 @@ class LeaderElectionManagerTest : public ::testing::Test { std::unique_ptr replica_communicator_; std::unique_ptr leader_election_manager_; std::unique_ptr mock_raft_; + std::unique_ptr mock_recovery_; }; // Test 1: Follower timeout should trigger election. diff --git a/platform/consensus/ordering/raft/algorithm/mock_raft.h b/platform/consensus/ordering/raft/algorithm/mock_raft.h index 12a4d5d027..2cd58089cf 100644 --- a/platform/consensus/ordering/raft/algorithm/mock_raft.h +++ b/platform/consensus/ordering/raft/algorithm/mock_raft.h @@ -30,9 +30,9 @@ class MockRaft : public Raft { public: MockRaft(int id, int f, int total_num, SignatureVerifier* verifier, LeaderElectionManager* leaderelection_manager, - ReplicaCommunicator* replica_communicator) + ReplicaCommunicator* replica_communicator, RaftRecovery* recovery) : Raft(id, f, total_num, verifier, leaderelection_manager, - replica_communicator) {} + replica_communicator, recovery) {} MOCK_METHOD(void, SendHeartBeat, (), ()); MOCK_METHOD(void, StartElection, (), ()); diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 1f317422d0..e36a1e1570 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -59,7 +59,7 @@ 
Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, : ProtocolBase(id, f, total_num), currentTerm_(0), votedFor_(-1), - lastLogIndex_(0), + lastLogIndex_(-1), commitIndex_(0), lastApplied_(0), role_(Role::FOLLOWER), @@ -80,7 +80,8 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, LogEntry sentinel; sentinel.entry.set_term(0); sentinel.entry.set_command("COMMON_PREFIX"); - AddToLog(sentinel); + AddToLog(sentinel, false); + lastLogIndex_ = 0; inflightVecs_.resize(total_num_ + 1); for (auto& vec : inflightVecs_) { @@ -124,7 +125,6 @@ bool Raft::ReceiveTransaction(std::unique_ptr req) { logEntry.GetSerializedSize(); AddToLog(logEntry); - lastLogIndex_++; nextIndex_[id_] = lastLogIndex_ + 1; matchIndex_[id_] = lastLogIndex_; @@ -196,10 +196,7 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { while (logIdx < log_.size() && entriesIdx < entriesSize) { uint64_t term = ae->entries(entriesIdx).term(); if (term != log_[logIdx].entry.term()) { - auto first = log_.begin() + logIdx; - auto last = log_.begin() + lastLogIndex_ + 1; - TruncateLog(first, last); - lastLogIndex_ = log_.size() - 1; + TruncateLog(logIdx); if (replicationLoggingFlag_) { LOG(INFO) << "JIM -> " << parent_fn << ": follower saw term mismatch at index " << logIdx << ". 
Suffix erased from log"; @@ -213,15 +210,13 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { // append remaining entries const auto appendSize = entriesSize - entriesIdx; - log_.reserve(log_.size() + appendSize); std::vector log_entries_to_add; for (uint64_t i = entriesIdx; i < entriesSize; ++i) { log_entries_to_add.push_back(CreateLogEntry(ae->entries(i))); } - AddToLog(log_entries_to_add); - // update lastLogIndex after appends + uint64_t firstAppendIdx = lastLogIndex_ + 1; - lastLogIndex_ = log_.size() - 1; + AddToLog(std::move(log_entries_to_add)); lastLogIndex = lastLogIndex_; if (replicationLoggingFlag_ && appendSize > 0) { @@ -346,6 +341,7 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a if (!aer->success() || (nextIndex_[followerId] < lastLogIndex_ + 1)) { if (!aer->success()) { LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << followerId; + LOG(INFO) << "NextIndex is: " << nextIndex_[followerId] << " their lastLogIndex is: " << aer->lastlogindex(); } if (!InFlightPerFollowerLimitReachedLocked(followerId)) { fields = GatherAeFieldsLocked(followerId); @@ -829,7 +825,6 @@ bool Raft::InFlightPerFollowerLimitReachedLocked(int followerId) const { } void Raft::SetCurrentTerm(uint64_t currentTerm, bool writeMetadata) { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; currentTerm_ = currentTerm; if (writeMetadata) { recovery_->WriteMetadata(currentTerm_, votedFor_); @@ -837,7 +832,6 @@ void Raft::SetCurrentTerm(uint64_t currentTerm, bool writeMetadata) { } void Raft::SetVotedFor(int votedFor, bool writeMetadata) { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; votedFor_ = votedFor; if (writeMetadata) { recovery_->WriteMetadata(currentTerm_, votedFor_); @@ -848,39 +842,55 @@ void Raft::SetSeqIndexCoveredBySnapshot(int seq) { seqAfterCheckpoint_ = seq; } -void Raft::AddToLog(LogEntry logEntryToAdd, bool writeMetadata) { +void 
Raft::AddToLog(LogEntry &logEntryToAdd, bool writeMetadata) { Entry* entry; entry = &logEntryToAdd.entry; if (writeMetadata) { recovery_->AddLogEntry(entry); } log_.push_back(logEntryToAdd); + lastLogIndex_++; } void Raft::AddToLog(std::vector logEntriesToAdd, bool writeMetadata) { if (writeMetadata) { std::vector entries_to_add; for (const auto& entry : logEntriesToAdd) { + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; entries_to_add.push_back(entry.entry); } - + LOG(INFO) << "Entries to add: " << logEntriesToAdd.size(); recovery_->AddLogEntry(entries_to_add); } + lastLogIndex_ += logEntriesToAdd.size(); log_.reserve(log_.size() + logEntriesToAdd.size()); log_.insert(log_.end(), std::make_move_iterator(logEntriesToAdd.begin()), std::make_move_iterator(logEntriesToAdd.end())); + + assert(lastLogIndex_ == log_.size() - 1); } -void Raft::TruncateLog(std::vector::iterator first, - std::vector::iterator last, - bool writeMetadata) { - log_.erase(first, last); +void Raft::TruncateLog(uint64_t firstIndex, bool writeMetadata) { + auto first = log_.begin() + firstIndex; + auto last = log_.begin() + lastLogIndex_ + 1; + if (writeMetadata) { + TruncationRecord truncation; + truncation.set_truncate_from_index(firstIndex); + truncation.set_truncate_from_term(log_[firstIndex].entry.term()); + recovery_->TruncateLog(truncation); + } + + log_.erase(first, last); + lastLogIndex_ = log_.size() - 1; } -void Raft::PrintDebugState() const { +void Raft::PrintDebugStateLocked() const { std::lock_guard lk(mutex_); + PrintDebugState(); +} +void Raft::PrintDebugState() const { LOG(INFO) << "---- Raft Debug State ----\n"; LOG(INFO) << "currentTerm_: " << currentTerm_ << "\n"; LOG(INFO) << "votedFor_: " << votedFor_ << "\n"; diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index a96789e0d0..0fd8865aef 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ 
b/platform/consensus/ordering/raft/algorithm/raft.h @@ -112,16 +112,15 @@ class Raft : public common::ProtocolBase { virtual void SendHeartBeat(); virtual Role GetRoleSnapshot() const; virtual void SetRole(Role role); + virtual void PrintDebugStateLocked() const; virtual void PrintDebugState() const; virtual void SetCurrentTerm(uint64_t currentTerm, bool writeMetadata = true); virtual void SetVotedFor(int votedFor, bool writeMetadata = true); virtual void SetSeqIndexCoveredBySnapshot(int seq); - void AddToLog(LogEntry logEntry, bool writeMetadata = true); + void AddToLog(LogEntry &logEntry, bool writeMetadata = true); void AddToLog(std::vector logEntriesToAdd, bool writeMetadata = true); - void TruncateLog(std::vector::iterator first, - std::vector::iterator last, - bool writeMetadata = true); + void TruncateLog(uint64_t first, bool writeMetadata = true); private: mutable std::mutex mutex_; diff --git a/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp b/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp index e195fab9f4..3366028666 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp @@ -240,7 +240,7 @@ TEST_F(RaftTest, FollowerRejectsMismatchedTermAtPrevLogIndex) { EXPECT_TRUE(success); } -// Test 7: A follower rejects Append Entries because it does not have a term at +// Test 7: A follower rejects Append Entries because it does not have an entry at // prevLogIndex. 
TEST_F(RaftTest, FollowerRejectsMissingIndex) { EXPECT_CALL(mock_call, Call(_, _, _)) @@ -315,15 +315,15 @@ TEST_F(RaftTest, FollowerAddsAppendEntriesAndTruncatesLog) { std::make_unique(std::move(aemessage))); const auto& raft_log = raft_->GetLog(); - EXPECT_EQ(raft_log[0].term, 0); - EXPECT_EQ(raft_log[0].command, "COMMON_PREFIX"); - EXPECT_EQ(raft_log[1].term, 0); + EXPECT_EQ(raft_log[0].entry.term(), 0); + EXPECT_EQ(raft_log[0].entry.command(), "COMMON_PREFIX"); + EXPECT_EQ(raft_log[1].entry.term(), 0); // TODO: Use serialized string instead of manually doing it. - EXPECT_EQ(raft_log[1].command, "\n\x14Term 0 Transaction 1"); - EXPECT_EQ(raft_log[2].term, 1); - EXPECT_EQ(raft_log[2].command, "\n\x14Term 1 Transaction 1"); - EXPECT_EQ(raft_log[3].term, 1); - EXPECT_EQ(raft_log[3].command, "\n\x14Term 1 Transaction 2"); + EXPECT_EQ(raft_log[1].entry.command(), "\n\x14Term 0 Transaction 1"); + EXPECT_EQ(raft_log[2].entry.term(), 1); + EXPECT_EQ(raft_log[2].entry.command(), "\n\x14Term 1 Transaction 1"); + EXPECT_EQ(raft_log[3].entry.term(), 1); + EXPECT_EQ(raft_log[3].entry.command(), "\n\x14Term 1 Transaction 2"); EXPECT_TRUE(success); } @@ -559,7 +559,7 @@ TEST_F(RaftTest, CandidateReceivesSameTermWithAppendEntriesItCanAccept) { EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); auto aefields = CreateAeFields( - /*term=*/1, + /*term=*/2, /*leaderId=*/2, /*prevLogIndex=*/2, /*prevLogTerm=*/0, diff --git a/platform/consensus/ordering/raft/algorithm/raft_tests.h b/platform/consensus/ordering/raft/algorithm/raft_tests.h index 234d6352ef..932d9a4a79 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_tests.h +++ b/platform/consensus/ordering/raft/algorithm/raft_tests.h @@ -6,6 +6,7 @@ #include "platform/consensus/ordering/raft/algorithm/raft.h" #include "platform/networkstrate/mock_replica_communicator.h" #include "platform/proto/client_test.pb.h" +#include "platform/consensus/recovery/mock_raft_recovery.h" namespace resdb { namespace raft { @@ 
-43,10 +44,11 @@ class RaftTest : public ::testing::Test { protected: void SetUp() override { verifier_ = std::make_unique(); + ResDBConfig config_ = GenerateConfig(); leader_election_manager_ = - std::make_unique(GenerateConfig()); + std::make_unique(config_); replica_communicator_ = std::make_unique(); - recovery_ = std::make_unique(); + recovery_ = std::make_unique(config_); raft_ = std::make_unique( /*id=*/1, /*f=*/1, @@ -81,10 +83,10 @@ class RaftTest : public ::testing::Test { fields.followerId = followerId; for (const auto& logEntry : entries) { - LogEntry entry; - entry.term = logEntry.term; - entry.command = logEntry.command; - fields.entries.push_back(std::move(entry)); + LogEntry log_entry; + log_entry.entry.set_term(logEntry.entry.term()); + log_entry.entry.set_command(logEntry.entry.command()); + fields.entries.push_back(std::move(log_entry)); } return fields; @@ -93,10 +95,10 @@ class RaftTest : public ::testing::Test { // Helper to create a single log entry. LogEntry CreateLogEntry(uint64_t term, const std::string& command_data) { - LogEntry entry; - entry.term = term; - entry.command = command_data; - return entry; + LogEntry log_entry; + log_entry.entry.set_term(term); + log_entry.entry.set_command(command_data); + return log_entry; } // Helper to create a vector of log entries for testing. 
@@ -107,23 +109,23 @@ class RaftTest : public ::testing::Test { if (usedForLogPatch) { LogEntry first_entry; - first_entry.term = 0; - first_entry.command = "COMMON_PREFIX"; + first_entry.entry.set_term(0); + first_entry.entry.set_command("COMMON_PREFIX"); entries.push_back(first_entry); } for (const auto& [term, cmd] : term_and_cmds) { - LogEntry entry; - entry.term = term; + LogEntry log_entry; + log_entry.entry.set_term(term); ClientTestRequest req; req.set_value(cmd); std::string serialized; req.SerializeToString(&serialized); - entry.command = serialized; + log_entry.entry.set_command(serialized); - entries.push_back(entry); + entries.push_back(log_entry); } return entries; @@ -136,10 +138,10 @@ class RaftTest : public ::testing::Test { ae.set_prevlogindex(fields.prevLogIndex); ae.set_prevlogterm(fields.prevLogTerm); ae.set_leadercommitindex(fields.leaderCommit); - for (const auto& entry : fields.entries) { + for (const auto& log_entry : fields.entries) { auto* newEntry = ae.add_entries(); - newEntry->set_term(entry.term); - newEntry->set_command(entry.command); + newEntry->set_term(log_entry.entry.term()); + newEntry->set_command(log_entry.entry.command()); } return ae; @@ -148,7 +150,7 @@ class RaftTest : public ::testing::Test { std::unique_ptr verifier_; std::unique_ptr leader_election_manager_; std::unique_ptr replica_communicator_; - std::unique_ptr recovery_; + std::unique_ptr recovery_; std::unique_ptr raft_; MockSendMessageFunction mock_call; MockBroadcastFunction mock_broadcast; diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index b1274d5a92..5b50fdd561 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -127,11 +127,21 @@ void Consensus::RecoverFromLogs() { raft_->SetCurrentTerm(metadata.current_term, false); raft_->SetVotedFor(metadata.voted_for, false); }, - [&](std::unique_ptr entry) 
{ - auto request = std::make_unique(); - if (!request->ParseFromString(entry->command())) - LOG(ERROR) << "Error parsing entry in Recovery"; - return CommitMsg(*request); + [&](std::unique_ptr record) { + switch (record->payload_case()) { + case WALRecord::kEntry: { + LogEntry logEntry; + logEntry.entry = record->entry(); + raft_->AddToLog(logEntry, false); + break; + } + case WALRecord::kTruncation: + raft_->TruncateLog(record->truncation().truncate_from_index(), false); + break; + case WALRecord::PAYLOAD_NOT_SET: + assert(false && "WALRecord does not contain Truncation or Entry"); + break; + } }, [&](int seq) { raft_->SetSeqIndexCoveredBySnapshot(seq); }); } diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index 1f69aabe9a..762594c640 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -26,6 +26,19 @@ message Entry { bytes command = 2; } +message TruncationRecord { + uint64 truncate_from_index = 1; + uint64 truncate_from_term = 2; +} + +message WALRecord { + uint64 seq = 1; + oneof payload { + Entry entry = 2; + TruncationRecord truncation = 3; + } +} + message AppendEntries{ uint64 term = 1; int32 leaderId = 2; diff --git a/platform/consensus/recovery/BUILD b/platform/consensus/recovery/BUILD index 72d30613c4..8f4b24b436 100644 --- a/platform/consensus/recovery/BUILD +++ b/platform/consensus/recovery/BUILD @@ -54,6 +54,17 @@ cc_test( ], ) +cc_library( + name = "mock_raft_recovery", + hdrs = ["mock_raft_recovery.h"], + testonly = True, + deps = [ + ":raft_recovery", + "//chain/storage:mock_storage", + "//platform/consensus/checkpoint:mock_checkpoint" + ], +) + cc_library( name = "raft_recovery", srcs = ["raft_recovery.cpp"], diff --git a/platform/consensus/recovery/mock_raft_recovery.h b/platform/consensus/recovery/mock_raft_recovery.h new file mode 100644 index 0000000000..936c2cd2dd --- /dev/null +++ 
b/platform/consensus/recovery/mock_raft_recovery.h @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +#include "platform/consensus/recovery/raft_recovery.h" +#include "platform/consensus/checkpoint/mock_checkpoint.h" +#include "chain/storage/mock_storage.h" + +namespace resdb { +namespace raft { + +class MockRaftRecovery : public RaftRecovery { + public: + MockRaftRecovery(const ResDBConfig& config) + : RaftRecovery(config, mock_checkpoint_.get(), mock_storage_.get()) {} + + MOCK_METHOD(void, AddLogEntry, (const Entry* entry), ()); + MOCK_METHOD(void, WriteMetadata, (int64_t current_term, int32_t voted_for), ()); + MOCK_METHOD(void, AddLogEntry, (std::vector& entries_to_add), ()); + MOCK_METHOD(void, TruncateLog, (TruncationRecord truncate_beginning_at), ()); + + std::unique_ptr mock_checkpoint_; + std::unique_ptr mock_storage_; +}; + +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/recovery/raft_recovery.cpp b/platform/consensus/recovery/raft_recovery.cpp index 46ad423118..5ee820b960 100644 --- a/platform/consensus/recovery/raft_recovery.cpp +++ b/platform/consensus/recovery/raft_recovery.cpp @@ -35,7 +35,7 @@ namespace resdb { namespace 
raft { -using CallbackType = std::function)>; +using CallbackType = std::function)>; RaftRecovery::RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage) @@ -50,23 +50,25 @@ void RaftRecovery::Init() { return; } + wal_seq_ = 0; + LOG(ERROR) << " init"; GetLastFile(); - CallbackType callback = [this](std::unique_ptr entry) { - min_seq_ == -1 ? min_seq_ = entry->term() - : std::min(min_seq_, static_cast(entry->term())); - max_seq_ = std::max(max_seq_, static_cast(entry->term())); - }; - - SwitchFile(file_path_, callback); - LOG(ERROR) << " init done"; - meta_file_path_ = std::filesystem::path(base_file_path_).parent_path() / "raft_metadata.dat"; LOG(INFO) << "Meta file path: " << meta_file_path_; OpenMetadataFile(); + CallbackType callback = [this](std::unique_ptr record) { + min_seq_ == -1 ? min_seq_ = record->seq() + : std::min(min_seq_, static_cast(record->seq())); + max_seq_ = std::max(max_seq_, static_cast(record->seq())); + }; + + SwitchFile(file_path_, callback); + LOG(ERROR) << " init done"; + ckpt_thread_ = std::thread([this] { this->UpdateStableCheckPoint(); }); } @@ -98,24 +100,47 @@ void RaftRecovery::OpenMetadataFile() { } void RaftRecovery::WriteMetadata(int64_t current_term, int32_t voted_for) { + if (recovery_enabled_ == false) { + return; + } + + std::string tmp_path = meta_file_path_ + ".tmp"; + LOG(ERROR) << "tmp_path = [" << tmp_path << "]"; + LOG(ERROR) << "meta_file_path_ = [" << meta_file_path_ << "]"; + int temp_fd = open(tmp_path.c_str(), O_CREAT | O_WRONLY | O_TRUNC, 0666); if (metadata_fd_ < 0) { LOG(ERROR) << "Metadata file not open"; return; } + if (temp_fd < 0) { + LOG(ERROR) << "Failed to open tmp metadata file: " << strerror(errno); + return; + } metadata_.current_term = current_term; metadata_.voted_for = voted_for; - lseek(metadata_fd_, 0, SEEK_SET); - write(metadata_fd_, &metadata_, sizeof(metadata_)); - fsync(metadata_fd_); + lseek(temp_fd, 0, SEEK_SET); + write(temp_fd, &metadata_, sizeof(metadata_)); + 
fsync(temp_fd); + close(temp_fd); + rename(tmp_path.c_str(), meta_file_path_.c_str()); + + std::string dir_path = std::filesystem::path(meta_file_path_).parent_path().string(); + int dir_fd = open(dir_path.c_str(), O_RDONLY); + fsync(dir_fd); + close(dir_fd); + LOG(INFO) << "Wrote metadata: term: " << current_term << " votedFor: " << voted_for; LOG(INFO) << "METADATA location: " << meta_file_path_; } RaftMetadata RaftRecovery::ReadMetadata() { + if (recovery_enabled_ == false) { + return RaftMetadata{}; + } LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; RaftMetadata metadata; @@ -140,61 +165,96 @@ void RaftRecovery::AddLogEntry(const Entry* entry) { return; } - WriteLog(entry); + std::unique_lock lk(mutex_); + WALRecord record; + *record.mutable_entry() = *entry; + record.set_seq(++wal_seq_); + WriteLog(record); Flush(); } void RaftRecovery::AddLogEntry(std::vector& entries_to_add) { - if (recovery_enabled_ == false) { + if (recovery_enabled_ == false || entries_to_add.size() == 0) { return; } + + std::unique_lock lk(mutex_); for (const auto& entry : entries_to_add) { - WriteLog(&entry); + LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; + WALRecord record; + *record.mutable_entry() = entry; + record.set_seq(++wal_seq_); + WriteLog(record); + } + Flush(); +} + +void RaftRecovery::TruncateLog(TruncationRecord truncate_beginning_at) { + if (recovery_enabled_ == false) { + return; } + + std::unique_lock lk(mutex_); + + WALRecord record; + record.set_seq(++wal_seq_); + *record.mutable_truncation() = std::move(truncate_beginning_at); + + WriteLog(record); Flush(); } -void RaftRecovery::WriteLog(const Entry* entry) { +void RaftRecovery::WriteLog(const WALRecord& record) { LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; std::string data; - if (entry) { - entry->SerializeToString(&data); + + record.SerializeToString(&data); + + + 
switch (record.payload_case()) { + case WALRecord::kEntry: + min_seq_ = min_seq_ == -1 + ? record.seq() + : std::min(min_seq_, static_cast(record.seq())); + max_seq_ = std::max(max_seq_, static_cast(record.seq())); + break; + case WALRecord::kTruncation: + max_seq_ = record.seq(); + break; + case WALRecord::PAYLOAD_NOT_SET: + assert(false && "WALRecord does not contain Truncation or Entry"); + break; } - std::unique_lock lk(mutex_); - min_seq_ = min_seq_ == -1 - ? entry->term() - : std::min(min_seq_, static_cast(entry->term())); - max_seq_ = std::max(max_seq_, static_cast(entry->term())); AppendData(data); } -std::vector> RaftRecovery::ParseDataListItem( +std::vector> RaftRecovery::ParseDataListItem( std::vector& data_list) { - std::vector> request_list; + std::vector> record_list; for (size_t i = 0; i < data_list.size(); i++) { - std::unique_ptr entry = std::make_unique(); + std::unique_ptr record = std::make_unique(); - if (!entry->ParseFromString(data_list[i])) { + if (!record->ParseFromString(data_list[i])) { LOG(ERROR) << "Parse from data fail"; break; } - request_list.push_back(std::move(entry)); + record_list.push_back(std::move(record)); } - return request_list; + return record_list; } void RaftRecovery::PerformCallback( - std::vector>& request_list, CallbackType call_back, + std::vector>& record_list, CallbackType call_back, int64_t ckpt) { uint64_t max_seq = 0; - for (std::unique_ptr& entry : request_list) { - if (ckpt < entry->term()) { - max_seq = entry->term(); - call_back(std::move(entry)); + for (std::unique_ptr& record : record_list) { + if (ckpt < record->seq()) { + max_seq = record->seq(); + call_back(std::move(record)); } } diff --git a/platform/consensus/recovery/raft_recovery.h b/platform/consensus/recovery/raft_recovery.h index 1fe0272aaf..00bf28f1d5 100644 --- a/platform/consensus/recovery/raft_recovery.h +++ b/platform/consensus/recovery/raft_recovery.h @@ -38,7 +38,7 @@ struct RaftMetadata { int32_t voted_for = -1; }; -using CallbackType = 
std::function)>; +using CallbackType = std::function)>; class RaftRecovery : public RecoveryBase { @@ -54,17 +54,18 @@ class RaftRecovery void WriteMetadata(int64_t current_term, int32_t voted_for); void AddLogEntry(const Entry* entry); void AddLogEntry(std::vector& entries_to_add); + void TruncateLog(TruncationRecord truncate_beginning_at); private: void OpenMetadataFile(); void WriteSystemInfo(); - std::vector> ParseDataListItem( + std::vector> ParseDataListItem( std::vector& data_list); - void WriteLog(const Entry* entry); + void WriteLog(const WALRecord& record); void PerformCallback( - std::vector>& request_list, - std::function entry)> call_back, + std::vector>& request_list, + std::function record)> call_back, int64_t ckpt); void HandleSystemInfo( @@ -73,6 +74,7 @@ class RaftRecovery int metadata_fd_; std::string meta_file_path_; RaftMetadata metadata_; + uint64_t wal_seq_; }; } // namespace raft diff --git a/platform/consensus/recovery/raft_recovery_test.cpp b/platform/consensus/recovery/raft_recovery_test.cpp index 04141a4fc9..a8e8b51bc1 100644 --- a/platform/consensus/recovery/raft_recovery_test.cpp +++ b/platform/consensus/recovery/raft_recovery_test.cpp @@ -59,12 +59,12 @@ TEST_F(RaftRecoveryTest, WriteAndReadLog) { { RaftRecovery recovery(config_, &checkpoint_, nullptr); - for (int i = 0; i < entries_to_add; i++) { + for (int i = 1; i <= entries_to_add; i++) { Entry logEntry; - logEntry.set_term(i + 1); + logEntry.set_term(i); auto req = std::make_unique(); - req->set_seq(i + 1); - req->set_data("Request " + std::to_string(i + 1)); + req->set_seq(i); + req->set_data("Request " + std::to_string(i)); std::string serialized; if (!req->SerializeToString(&serialized)) { assert(false); @@ -75,18 +75,20 @@ TEST_F(RaftRecoveryTest, WriteAndReadLog) { } } { - std::vector list; + std::vector list; RaftRecovery recovery(config_, &checkpoint_, nullptr); recovery.ReadLogs( [&](const RaftMetadata &data) {}, - [&](std::unique_ptr entry) { list.push_back(*entry); }, 
nullptr); + [&](std::unique_ptr record) { list.push_back(*record); }, nullptr); EXPECT_EQ(list.size(), entries_to_add); for (size_t i = 0; i < entries_to_add; ++i) { - EXPECT_EQ(list[i].term(), i + 1); + EXPECT_EQ(list[i].payload_case(), WALRecord::kEntry); + + EXPECT_EQ(list[i].entry().term(), i + 1); Request req; - req.ParseFromString(list[i].command()); + req.ParseFromString(list[i].entry().command()); EXPECT_EQ(req.data(), "Request " + std::to_string(i + 1)); } } @@ -107,12 +109,97 @@ TEST_F(RaftRecoveryTest, WriteAndReadMetadata) { current_term = data.current_term; voted_for = data.voted_for; }, - [&](std::unique_ptr entry) {}, nullptr); + [&](std::unique_ptr record) {}, nullptr); EXPECT_EQ(current_term, 2); EXPECT_EQ(voted_for, 1); } } +TEST_F(RaftRecoveryTest, TruncateLog) { + int entries_to_add = 4; + { + RaftRecovery recovery(config_, &checkpoint_, nullptr); + + for (int i = 1; i <= entries_to_add; i++) { + Entry logEntry; + logEntry.set_term(i); + auto req = std::make_unique(); + req->set_seq(i); + req->set_data("Request " + std::to_string(i)); + std::string serialized; + if (!req->SerializeToString(&serialized)) { + assert(false); + } + logEntry.set_command(std::move(serialized)); + + recovery.AddLogEntry(&logEntry); + } + + TruncationRecord truncation; + truncation.set_truncate_from_index(3); + truncation.set_truncate_from_term(3); + recovery.TruncateLog(truncation); + + for (int i = 5; i <= entries_to_add*2; i++) { + Entry logEntry; + logEntry.set_term(i + 1); + auto req = std::make_unique(); + req->set_seq(i); + req->set_data("Request " + std::to_string(i)); + std::string serialized; + if (!req->SerializeToString(&serialized)) { + assert(false); + } + logEntry.set_command(std::move(serialized)); + + recovery.AddLogEntry(&logEntry); + } + + } + /* Recovery WAL + Term Seq Data + list[0] 1 1 Request 1 + list[1] 2 2 Request 2 + list[2] 3 3 Request 3 + list[3] 4 4 Request 4 + list[4] Truncate beginning at Seq 3 + list[5] 6 5 Request 5 + list[6] 7 6 Request 
6 + list[7] 8 7 Request 7 + list[8] 9 8 Request 8 + */ + { + std::vector list; + RaftRecovery recovery(config_, &checkpoint_, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &data) {}, + [&](std::unique_ptr record) { list.push_back(*record); }, nullptr); + + EXPECT_EQ(list.size(), 2*entries_to_add + 1); + + for (size_t i = 0; i < entries_to_add; ++i) { + EXPECT_EQ(list[i].payload_case(), WALRecord::kEntry); + EXPECT_EQ(list[i].entry().term(), i + 1); + Request req; + req.ParseFromString(list[i].entry().command()); + EXPECT_EQ(req.data(), "Request " + std::to_string(i + 1)); + EXPECT_EQ(req.seq(), i + 1); + } + + EXPECT_EQ(list[4].payload_case(), WALRecord::kTruncation); + EXPECT_EQ(list[4].truncation().truncate_from_index(), 3); + + for (size_t i = entries_to_add + 1; i < 2*entries_to_add + 1; ++i) { + EXPECT_EQ(list[i].payload_case(), WALRecord::kEntry); + EXPECT_EQ(list[i].entry().term(), i + 1); + Request req; + req.ParseFromString(list[i].entry().command()); + EXPECT_EQ(req.data(), "Request " + std::to_string(i)); + EXPECT_EQ(req.seq(), i); + } + } +} + } // namespace raft } // namespace resdb diff --git a/platform/consensus/recovery/recovery_impl.h b/platform/consensus/recovery/recovery_impl.h index dc3506c583..2154dc5251 100644 --- a/platform/consensus/recovery/recovery_impl.h +++ b/platform/consensus/recovery/recovery_impl.h @@ -322,7 +322,9 @@ RecoveryBase::GetRecoveryFiles( } sort(e_list.begin(), e_list.end()); - list.push_back(e_list.back()); + if (!e_list.empty()) { + list.push_back(e_list.back()); + } sort(list.begin(), list.end()); return std::make_pair(list, last_ckpt); @@ -466,7 +468,6 @@ void RecoveryBase::ReadLogsFromFiles( auto list = ParseData(data); if (list.size() == 0) { - request_list.clear(); break; } for (auto& l : list) { From 4ebb0ef604e5b1db9571f5a50d8325d296dd0d2c Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Mon, 20 Apr 2026 11:41:34 -0700 Subject: [PATCH 63/66] Improve reliability of RaftRecovery --- 
chain/storage/leveldb.cpp | 6 +- chain/storage/leveldb.h | 2 +- chain/storage/storage.h | 2 +- platform/consensus/recovery/BUILD | 1 + platform/consensus/recovery/raft_recovery.cpp | 63 ++++++++++---- platform/consensus/recovery/raft_recovery.h | 6 ++ .../consensus/recovery/raft_recovery_test.cpp | 87 +++++++++++++++++++ platform/consensus/recovery/recovery_impl.h | 8 +- 8 files changed, 152 insertions(+), 23 deletions(-) diff --git a/chain/storage/leveldb.cpp b/chain/storage/leveldb.cpp index de49376e75..47ae7a84eb 100644 --- a/chain/storage/leveldb.cpp +++ b/chain/storage/leveldb.cpp @@ -228,8 +228,10 @@ bool ResLevelDB::UpdateMetrics() { return true; } -bool ResLevelDB::Flush() { - leveldb::Status status = db_->Write(leveldb::WriteOptions(), &batch_); +bool ResLevelDB::Flush(bool should_sync) { + leveldb::WriteOptions opts = leveldb::WriteOptions(); + opts.sync = should_sync; + leveldb::Status status = db_->Write(opts, &batch_); if (status.ok()) { batch_.Clear(); return true; diff --git a/chain/storage/leveldb.h b/chain/storage/leveldb.h index 67ec7a40c2..f55e062ecc 100644 --- a/chain/storage/leveldb.h +++ b/chain/storage/leveldb.h @@ -74,7 +74,7 @@ class ResLevelDB : public Storage { bool UpdateMetrics(); - bool Flush() override; + bool Flush(bool should_sync = false) override; virtual uint64_t GetLastCheckpoint() override; diff --git a/chain/storage/storage.h b/chain/storage/storage.h index 8fa95fee9d..351d9104a8 100644 --- a/chain/storage/storage.h +++ b/chain/storage/storage.h @@ -59,7 +59,7 @@ class Storage { virtual std::vector> GetTopHistory( const std::string& key, int number) = 0; - virtual bool Flush() { return true; }; + virtual bool Flush(bool should_sync = false) { return true; }; virtual uint64_t GetLastCheckpoint() { return 0; } diff --git a/platform/consensus/recovery/BUILD b/platform/consensus/recovery/BUILD index 8f4b24b436..eeb25edbee 100644 --- a/platform/consensus/recovery/BUILD +++ b/platform/consensus/recovery/BUILD @@ -69,6 +69,7 @@ 
cc_library( name = "raft_recovery", srcs = ["raft_recovery.cpp"], hdrs = ["raft_recovery.h"], + copts = ["-DRAFT_TEST_MODE"], deps = [ "//chain/storage", "//common/utils", diff --git a/platform/consensus/recovery/raft_recovery.cpp b/platform/consensus/recovery/raft_recovery.cpp index 5ee820b960..ab6991a3ac 100644 --- a/platform/consensus/recovery/raft_recovery.cpp +++ b/platform/consensus/recovery/raft_recovery.cpp @@ -95,6 +95,10 @@ void RaftRecovery::OpenMetadataFile() { // Read existing metadata if it exists, otherwise defaults are used metadata_ = ReadMetadata(); + + close(metadata_fd_); + metadata_fd_ = -1; + LOG(INFO) << "Opened metadata file: term: " << metadata_.current_term << " votedFor: " << metadata_.voted_for; } @@ -105,33 +109,56 @@ void RaftRecovery::WriteMetadata(int64_t current_term, int32_t voted_for) { } std::string tmp_path = meta_file_path_ + ".tmp"; - LOG(ERROR) << "tmp_path = [" << tmp_path << "]"; - LOG(ERROR) << "meta_file_path_ = [" << meta_file_path_ << "]"; + LOG(INFO) << "tmp_path = [" << tmp_path << "]"; + LOG(INFO) << "meta_file_path_ = [" << meta_file_path_ << "]"; + int temp_fd = open(tmp_path.c_str(), O_CREAT | O_WRONLY | O_TRUNC, 0666); - if (metadata_fd_ < 0) { - LOG(ERROR) << "Metadata file not open"; - return; - } if (temp_fd < 0) { LOG(ERROR) << "Failed to open tmp metadata file: " << strerror(errno); return; } - metadata_.current_term = current_term; - metadata_.voted_for = voted_for; + RaftMetadata new_metadata; + new_metadata.current_term = current_term; + new_metadata.voted_for = voted_for; - lseek(temp_fd, 0, SEEK_SET); - write(temp_fd, &metadata_, sizeof(metadata_)); - fsync(temp_fd); + ssize_t bytes_written = write(temp_fd, &new_metadata, sizeof(new_metadata)); + if (bytes_written != static_cast(sizeof(new_metadata))) { + LOG(ERROR) << "Failed to write metadata (wrote " << bytes_written << " of " + << sizeof(new_metadata) << " bytes): " << strerror(errno); + close(temp_fd); + unlink(tmp_path.c_str()); + return; + } + + 
if (fsync(temp_fd) < 0) { + LOG(ERROR) << "Failed to fsync tmp metadata file: " << strerror(errno); + close(temp_fd); + unlink(tmp_path.c_str()); + return; + } close(temp_fd); - rename(tmp_path.c_str(), meta_file_path_.c_str()); + if (rename(tmp_path.c_str(), meta_file_path_.c_str()) < 0) { + LOG(ERROR) << "Failed to rename tmp metadata file: " << strerror(errno); + unlink(tmp_path.c_str()); + return; + } + // Only fsync and close the dir if it opens properly std::string dir_path = std::filesystem::path(meta_file_path_).parent_path().string(); int dir_fd = open(dir_path.c_str(), O_RDONLY); - fsync(dir_fd); - close(dir_fd); - + if (dir_fd < 0) { + LOG(ERROR) << "Failed to open directory for fsync: " << strerror(errno); + } else { + if (fsync(dir_fd) < 0) { + LOG(ERROR) << "Failed to fsync directory: " << strerror(errno); + } + close(dir_fd); + } + + metadata_ = new_metadata; + LOG(INFO) << "Wrote metadata: term: " << current_term << " votedFor: " << voted_for; LOG(INFO) << "METADATA location: " << meta_file_path_; @@ -141,11 +168,11 @@ RaftMetadata RaftRecovery::ReadMetadata() { if (recovery_enabled_ == false) { return RaftMetadata{}; } - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " - << __func__ << "\n"; + RaftMetadata metadata; if (metadata_fd_ < 0) { - LOG(ERROR) << "Metadata file not open"; + LOG(ERROR) << "Metadata file either never opened or already closed " + "(meaning ReadMetadata() has been called before)"; return metadata; } diff --git a/platform/consensus/recovery/raft_recovery.h b/platform/consensus/recovery/raft_recovery.h index 00bf28f1d5..3341d512f0 100644 --- a/platform/consensus/recovery/raft_recovery.h +++ b/platform/consensus/recovery/raft_recovery.h @@ -56,6 +56,12 @@ class RaftRecovery void AddLogEntry(std::vector& entries_to_add); void TruncateLog(TruncationRecord truncate_beginning_at); +#ifdef RAFT_RECOVERY_TEST_MODE + std::string GetMetadataFilePath() { return meta_file_path_; } + + std::string GetFilePath() { 
return file_path_; } +#endif + private: void OpenMetadataFile(); void WriteSystemInfo(); diff --git a/platform/consensus/recovery/raft_recovery_test.cpp b/platform/consensus/recovery/raft_recovery_test.cpp index a8e8b51bc1..5a1a0b70d4 100644 --- a/platform/consensus/recovery/raft_recovery_test.cpp +++ b/platform/consensus/recovery/raft_recovery_test.cpp @@ -116,6 +116,91 @@ TEST_F(RaftRecoveryTest, WriteAndReadMetadata) { } } +TEST_F(RaftRecoveryTest, TruncateLog) { + int entries_to_add = 4; + { + RaftRecovery recovery(config_, &checkpoint_, nullptr); + + for (int i = 1; i <= entries_to_add; i++) { + Entry logEntry; + logEntry.set_term(i); + auto req = std::make_unique(); + req->set_seq(i); + req->set_data("Request " + std::to_string(i)); + std::string serialized; + if (!req->SerializeToString(&serialized)) { + assert(false); + } + logEntry.set_command(std::move(serialized)); + + recovery.AddLogEntry(&logEntry); + } + + TruncationRecord truncation; + truncation.set_truncate_from_index(3); + truncation.set_truncate_from_term(3); + recovery.TruncateLog(truncation); + + for (int i = 5; i <= entries_to_add * 2; i++) { + Entry logEntry; + logEntry.set_term(i + 1); + auto req = std::make_unique(); + req->set_seq(i); + req->set_data("Request " + std::to_string(i)); + std::string serialized; + if (!req->SerializeToString(&serialized)) { + assert(false); + } + logEntry.set_command(std::move(serialized)); + + recovery.AddLogEntry(&logEntry); + } + } + /* Recovery WAL + Term Seq Data + list[0] 1 1 Request 1 + list[1] 2 2 Request 2 + list[2] 3 3 Request 3 + list[3] 4 4 Request 4 + list[4] Truncate beginning at Seq 3 + list[5] 6 5 Request 5 + list[6] 7 6 Request 6 + list[7] 8 7 Request 7 + list[8] 9 8 Request 8 + */ + { + std::vector list; + RaftRecovery recovery(config_, &checkpoint_, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &data) {}, + [&](std::unique_ptr record) { list.push_back(*record); }, + nullptr); + + EXPECT_EQ(list.size(), 2 * entries_to_add + 1); + + 
for (size_t i = 0; i < entries_to_add; ++i) { + EXPECT_EQ(list[i].payload_case(), WALRecord::kEntry); + EXPECT_EQ(list[i].entry().term(), i + 1); + Request req; + req.ParseFromString(list[i].entry().command()); + EXPECT_EQ(req.data(), "Request " + std::to_string(i + 1)); + EXPECT_EQ(req.seq(), i + 1); + } + + EXPECT_EQ(list[4].payload_case(), WALRecord::kTruncation); + EXPECT_EQ(list[4].truncation().truncate_from_index(), 3); + + for (size_t i = entries_to_add + 1; i < 2 * entries_to_add + 1; ++i) { + EXPECT_EQ(list[i].payload_case(), WALRecord::kEntry); + EXPECT_EQ(list[i].entry().term(), i + 1); + Request req; + req.ParseFromString(list[i].entry().command()); + EXPECT_EQ(req.data(), "Request " + std::to_string(i)); + EXPECT_EQ(req.seq(), i); + } + } +} + TEST_F(RaftRecoveryTest, TruncateLog) { int entries_to_add = 4; { @@ -201,5 +286,7 @@ TEST_F(RaftRecoveryTest, TruncateLog) { } } +// TODO: Create tests that corrupt recovery files to test our handling of them. + } // namespace raft } // namespace resdb diff --git a/platform/consensus/recovery/recovery_impl.h b/platform/consensus/recovery/recovery_impl.h index 2154dc5251..0a36712ab8 100644 --- a/platform/consensus/recovery/recovery_impl.h +++ b/platform/consensus/recovery/recovery_impl.h @@ -163,7 +163,7 @@ void RecoveryBase::FinishFile( std::unique_lock lk(mutex_); Flush(); if (storage_) { - if (!storage_->Flush()) { + if (!storage_->Flush(true)) { return; } } @@ -175,6 +175,12 @@ void RecoveryBase::FinishFile( std::rename(file_path_.c_str(), new_file_path.c_str()); + std::string dir_path = + std::filesystem::path(file_path_).parent_path().string(); + int dir_fd = open(dir_path.c_str(), O_RDONLY); + fsync(dir_fd); + close(dir_fd); + LOG(INFO) << "rename:" << file_path_ << " to:" << new_file_path; std::string next_file_path = GenerateFile(seq, -1, -1); file_path_ = next_file_path; From 3cfafbe896ebb275e9d3020ab90fa5fab0b5ccd1 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Mon, 20 Apr 2026 15:50:44 -0700 
Subject: [PATCH 64/66] Update changes for tests, change how log is accessed For checkpointing, we need to separate the logical log index and size from the actual indexing and size of the vector. --- chain/storage/mock_storage.h | 2 +- .../ordering/raft/algorithm/raft.cpp | 82 +++++++++++------- .../consensus/ordering/raft/algorithm/raft.h | 2 + .../ordering/raft/framework/consensus.cpp | 27 +++--- platform/consensus/recovery/raft_recovery.cpp | 20 ++--- .../consensus/recovery/raft_recovery_test.cpp | 85 ------------------- 6 files changed, 73 insertions(+), 145 deletions(-) diff --git a/chain/storage/mock_storage.h b/chain/storage/mock_storage.h index 6b17c32620..14681db728 100644 --- a/chain/storage/mock_storage.h +++ b/chain/storage/mock_storage.h @@ -57,7 +57,7 @@ class MockStorage : public Storage { MOCK_METHOD(ItemsType, GetAllItems, (), (override)); MOCK_METHOD(ValuesSeqType, GetAllItemsWithSeq, (), (override)); - MOCK_METHOD(bool, Flush, (), (override)); + MOCK_METHOD(bool, Flush, (bool should_sync), (override)); }; } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index e36a1e1570..14bd1a1eb9 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -55,22 +55,22 @@ uint32_t LogEntry::ComputeSerializedEntrySize() const { } Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, - LeaderElectionManager* leaderelection_manager, ReplicaCommunicator* replica_communicator, RaftRecovery* recovery) + LeaderElectionManager* leaderelection_manager, + ReplicaCommunicator* replica_communicator, RaftRecovery* recovery) : ProtocolBase(id, f, total_num), - currentTerm_(0), - votedFor_(-1), - lastLogIndex_(-1), - commitIndex_(0), - lastApplied_(0), - role_(Role::FOLLOWER), - seqAfterCheckpoint_(0), - is_stop_(false), - quorum_((total_num/2) + 1), - verifier_(verifier), - 
leader_election_manager_(leaderelection_manager), - replica_communicator_(replica_communicator), - recovery_(recovery) { - + currentTerm_(0), + votedFor_(-1), + lastLogIndex_(-1), + commitIndex_(0), + lastApplied_(0), + role_(Role::FOLLOWER), + seqAfterCheckpoint_(0), + is_stop_(false), + quorum_((total_num / 2) + 1), + verifier_(verifier), + leader_election_manager_(leaderelection_manager), + replica_communicator_(replica_communicator), + recovery_(recovery) { id_ = id; total_num_ = total_num; f_ = (total_num-1)/2; @@ -177,8 +177,9 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { if (tr != TermRelation::STALE && role_ == Role::FOLLOWER) { uint64_t i = ae->prevlogindex(); - if (i < static_cast(log_.size()) && ae->prevlogterm() == log_[i].entry.term()) { - success = true; + if (i < static_cast(GetLogicalLogSize()) && + ae->prevlogterm() == GetLogEntryAtIndex(i).entry.term()) { + success = true; } } term = currentTerm_; @@ -193,9 +194,9 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { uint64_t entriesSize = static_cast(ae->entries_size()); // check for conflicting entry terms in existing indices // if conflict, delete suffix and short circuit out of loop - while (logIdx < log_.size() && entriesIdx < entriesSize) { + while (logIdx < GetLogicalLogSize() && entriesIdx < entriesSize) { uint64_t term = ae->entries(entriesIdx).term(); - if (term != log_[logIdx].entry.term()) { + if (term != GetLogEntryAtIndex(logIdx).entry.term()) { TruncateLog(logIdx); if (replicationLoggingFlag_) { @@ -329,7 +330,9 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a std::sort(sorted.begin(), sorted.end(), std::greater()); uint64_t lastReplicatedIndex = sorted[quorum_ - 1]; // Need to check the lastReplicatedIndex contains entry from current term - if (lastReplicatedIndex > commitIndex_ && log_[lastReplicatedIndex].entry.term() == currentTerm_) { + if (lastReplicatedIndex > commitIndex_ && + GetLogEntryAtIndex(lastReplicatedIndex).entry.term() == + currentTerm_) { 
LOG(INFO) << "JIM -> " << parent_fn << ": Raised commitIndex_ from " << commitIndex_ << " to " << lastReplicatedIndex; commitIndex_ = lastReplicatedIndex; @@ -637,7 +640,7 @@ TermRelation Raft::TermCheckLocked(uint64_t term) const { // requires raft mutex to be held uint64_t Raft::getLastLogTermLocked() const { - return log_[lastLogIndex_].entry.term(); + return GetLogEntryAtIndex(lastLogIndex_).entry.term(); } // requires raft mutex to be held @@ -648,7 +651,8 @@ std::vector> Raft::PrepareCommitLocked() { while (lastApplied_ < commitIndex_) { ++lastApplied_; auto command = std::make_unique(); - if (!command->ParseFromString(log_[lastApplied_].entry.command())) { + if (!command->ParseFromString( + GetLogEntryAtIndex(lastApplied_).entry.command())) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Failed to parse command"; continue; } @@ -676,7 +680,7 @@ AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { fields.leaderId = id_; fields.leaderCommit = commitIndex_; fields.prevLogIndex = nextIndex_[followerId] - 1; - fields.prevLogTerm = log_[fields.prevLogIndex].entry.term(); + fields.prevLogTerm = GetLogEntryAtIndex(fields.prevLogIndex).entry.term(); fields.followerId = followerId; if (heartBeat) { return fields; @@ -685,13 +689,13 @@ AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { const uint64_t firstNew = nextIndex_[followerId]; const uint64_t limit = std::min(lastLogIndex_, (firstNew + maxEntries) - 1); for (uint64_t i = firstNew; i <= limit; ++i) { - msgBytes += log_[i].GetSerializedSize(); + msgBytes += GetLogEntryAtIndex(i).GetSerializedSize(); // Always include at least 1 entry, after that limit by maxBytes. 
if (i != firstNew && msgBytes >= maxBytes) { break; } LogEntry entry; - entry.entry = log_[i].entry; + entry.entry = GetLogEntryAtIndex(i).entry; fields.entries.push_back(entry); } return fields; @@ -824,6 +828,18 @@ bool Raft::InFlightPerFollowerLimitReachedLocked(int followerId) const { return size == maxInFlightPerFollower; } +const LogEntry& Raft::GetLogEntryAtIndex(uint64_t index) const { + assert(index >= seqAfterCheckpoint_ && + "Tried to access entry that has been snapshotted"); + assert(index - seqAfterCheckpoint_ < log_.size() && + "Tried to access element that has not been added yet"); + return log_[index - seqAfterCheckpoint_]; +} + +int Raft::GetLogicalLogSize() const { + return log_.size() + seqAfterCheckpoint_; +} + void Raft::SetCurrentTerm(uint64_t currentTerm, bool writeMetadata) { currentTerm_ = currentTerm; if (writeMetadata) { @@ -856,7 +872,6 @@ void Raft::AddToLog(std::vector logEntriesToAdd, bool writeMetadata) { if (writeMetadata) { std::vector entries_to_add; for (const auto& entry : logEntriesToAdd) { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; entries_to_add.push_back(entry.entry); } LOG(INFO) << "Entries to add: " << logEntriesToAdd.size(); @@ -868,7 +883,7 @@ void Raft::AddToLog(std::vector logEntriesToAdd, bool writeMetadata) { log_.insert(log_.end(), std::make_move_iterator(logEntriesToAdd.begin()), std::make_move_iterator(logEntriesToAdd.end())); - assert(lastLogIndex_ == log_.size() - 1); + assert(lastLogIndex_ == GetLogicalLogSize() - 1); } void Raft::TruncateLog(uint64_t firstIndex, bool writeMetadata) { @@ -877,12 +892,13 @@ void Raft::TruncateLog(uint64_t firstIndex, bool writeMetadata) { if (writeMetadata) { TruncationRecord truncation; truncation.set_truncate_from_index(firstIndex); - truncation.set_truncate_from_term(log_[firstIndex].entry.term()); + truncation.set_truncate_from_term( + GetLogEntryAtIndex(firstIndex).entry.term()); recovery_->TruncateLog(truncation); } 
log_.erase(first, last); - lastLogIndex_ = log_.size() - 1; + lastLogIndex_ = GetLogicalLogSize() - 1; } void Raft::PrintDebugStateLocked() const { @@ -895,10 +911,10 @@ void Raft::PrintDebugState() const { LOG(INFO) << "currentTerm_: " << currentTerm_ << "\n"; LOG(INFO) << "votedFor_: " << votedFor_ << "\n"; - LOG(INFO) << "log_ (size " << log_.size() << "): ["; - for (size_t i = 0; i < log_.size(); ++i) { - LOG(INFO) << "{term: " << log_[i].entry.term(); - if (i + 1 != log_.size()) LOG(INFO) << ", "; + LOG(INFO) << "log_ (size " << GetLogicalLogSize() << "): ["; + for (size_t i = 0; i < GetLogicalLogSize(); ++i) { + LOG(INFO) << "{term: " << GetLogEntryAtIndex(i).entry.term(); + if (i + 1 != GetLogicalLogSize()) LOG(INFO) << ", "; } LOG(INFO) << "]\n"; diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index 0fd8865aef..f7b64fba9e 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -145,6 +145,8 @@ class Raft : public common::ProtocolBase { virtual void RecordNewInFlightMsgLocked( const AeFields& msg, std::chrono::steady_clock::time_point timestamp); virtual bool InFlightPerFollowerLimitReachedLocked(int followerId) const; + int GetLogicalLogSize() const; + const LogEntry& GetLogEntryAtIndex(uint64_t index) const; // Persistent state on all servers: uint64_t currentTerm_; // Protected by mutex_ diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index 5b50fdd561..fb3e06735c 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -128,20 +128,21 @@ void Consensus::RecoverFromLogs() { raft_->SetVotedFor(metadata.voted_for, false); }, [&](std::unique_ptr record) { - switch (record->payload_case()) { - case WALRecord::kEntry: { - LogEntry logEntry; - logEntry.entry = 
record->entry(); - raft_->AddToLog(logEntry, false); - break; - } - case WALRecord::kTruncation: - raft_->TruncateLog(record->truncation().truncate_from_index(), false); - break; - case WALRecord::PAYLOAD_NOT_SET: - assert(false && "WALRecord does not contain Truncation or Entry"); - break; + switch (record->payload_case()) { + case WALRecord::kEntry: { + LogEntry logEntry; + logEntry.entry = record->entry(); + raft_->AddToLog(logEntry, false); + break; } + case WALRecord::kTruncation: + raft_->TruncateLog(record->truncation().truncate_from_index(), + false); + break; + case WALRecord::PAYLOAD_NOT_SET: + assert(false && "WALRecord does not contain Truncation or Entry"); + break; + } }, [&](int seq) { raft_->SetSeqIndexCoveredBySnapshot(seq); }); } diff --git a/platform/consensus/recovery/raft_recovery.cpp b/platform/consensus/recovery/raft_recovery.cpp index ab6991a3ac..f0b5c72cb0 100644 --- a/platform/consensus/recovery/raft_recovery.cpp +++ b/platform/consensus/recovery/raft_recovery.cpp @@ -92,15 +92,6 @@ void RaftRecovery::OpenMetadataFile() { LOG(ERROR) << "Failed to open metadata file: " << strerror(errno); return; } - - // Read existing metadata if it exists, otherwise defaults are used - metadata_ = ReadMetadata(); - - close(metadata_fd_); - metadata_fd_ = -1; - - LOG(INFO) << "Opened metadata file: term: " << metadata_.current_term - << " votedFor: " << metadata_.voted_for; } void RaftRecovery::WriteMetadata(int64_t current_term, int32_t voted_for) { @@ -182,6 +173,9 @@ RaftMetadata RaftRecovery::ReadMetadata() { LOG(INFO) << "No existing metadata, using defaults"; return RaftMetadata{}; } + + LOG(INFO) << "Read metadata file: term: " << metadata.current_term + << " votedFor: " << metadata.voted_for; return metadata; } @@ -290,10 +284,10 @@ void RaftRecovery::PerformCallback( void RaftRecovery::HandleSystemInfo( int /*fd*/, std::function system_callback) { - RaftMetadata info = ReadMetadata(); - LOG(ERROR) << " info.voted_for: " << info.voted_for << 
"\ninfo.current_term " - << info.current_term; - system_callback(info); + metadata_ = ReadMetadata(); + LOG(ERROR) << " metadata_.voted_for: " << metadata_.voted_for + << "\nmetadata_.current_term " << metadata_.current_term; + system_callback(metadata_); } } // namespace raft diff --git a/platform/consensus/recovery/raft_recovery_test.cpp b/platform/consensus/recovery/raft_recovery_test.cpp index 5a1a0b70d4..cf60d5d0a1 100644 --- a/platform/consensus/recovery/raft_recovery_test.cpp +++ b/platform/consensus/recovery/raft_recovery_test.cpp @@ -201,91 +201,6 @@ TEST_F(RaftRecoveryTest, TruncateLog) { } } -TEST_F(RaftRecoveryTest, TruncateLog) { - int entries_to_add = 4; - { - RaftRecovery recovery(config_, &checkpoint_, nullptr); - - for (int i = 1; i <= entries_to_add; i++) { - Entry logEntry; - logEntry.set_term(i); - auto req = std::make_unique(); - req->set_seq(i); - req->set_data("Request " + std::to_string(i)); - std::string serialized; - if (!req->SerializeToString(&serialized)) { - assert(false); - } - logEntry.set_command(std::move(serialized)); - - recovery.AddLogEntry(&logEntry); - } - - TruncationRecord truncation; - truncation.set_truncate_from_index(3); - truncation.set_truncate_from_term(3); - recovery.TruncateLog(truncation); - - for (int i = 5; i <= entries_to_add*2; i++) { - Entry logEntry; - logEntry.set_term(i + 1); - auto req = std::make_unique(); - req->set_seq(i); - req->set_data("Request " + std::to_string(i)); - std::string serialized; - if (!req->SerializeToString(&serialized)) { - assert(false); - } - logEntry.set_command(std::move(serialized)); - - recovery.AddLogEntry(&logEntry); - } - - } - /* Recovery WAL - Term Seq Data - list[0] 1 1 Request 1 - list[1] 2 2 Request 2 - list[2] 3 3 Request 3 - list[3] 4 4 Request 4 - list[4] Truncate beginning at Seq 3 - list[5] 6 5 Request 5 - list[6] 7 6 Request 6 - list[7] 8 7 Request 7 - list[8] 9 8 Request 8 - */ - { - std::vector list; - RaftRecovery recovery(config_, &checkpoint_, nullptr); - 
recovery.ReadLogs( - [&](const RaftMetadata &data) {}, - [&](std::unique_ptr record) { list.push_back(*record); }, nullptr); - - EXPECT_EQ(list.size(), 2*entries_to_add + 1); - - for (size_t i = 0; i < entries_to_add; ++i) { - EXPECT_EQ(list[i].payload_case(), WALRecord::kEntry); - EXPECT_EQ(list[i].entry().term(), i + 1); - Request req; - req.ParseFromString(list[i].entry().command()); - EXPECT_EQ(req.data(), "Request " + std::to_string(i + 1)); - EXPECT_EQ(req.seq(), i + 1); - } - - EXPECT_EQ(list[4].payload_case(), WALRecord::kTruncation); - EXPECT_EQ(list[4].truncation().truncate_from_index(), 3); - - for (size_t i = entries_to_add + 1; i < 2*entries_to_add + 1; ++i) { - EXPECT_EQ(list[i].payload_case(), WALRecord::kEntry); - EXPECT_EQ(list[i].entry().term(), i + 1); - Request req; - req.ParseFromString(list[i].entry().command()); - EXPECT_EQ(req.data(), "Request " + std::to_string(i)); - EXPECT_EQ(req.seq(), i); - } - } -} - // TODO: Create tests that corrupt recovery files to test our handling of them. } // namespace raft From 43660a9a89c858a7e0b325486a810b90dbb0047f Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Fri, 1 May 2026 13:03:42 -0700 Subject: [PATCH 65/66] Add Checkpointing, new tests, miscellaneous changes Added some handling to deal with snapshotting in Raft. Checkpoints will be taken, the last snapshot index and term need to be stored, and index arithmetic needs to be accounted for. Note that this largely is not yet in use, as the hook to actually truncate a prefix in Consensus is currently commented out. Once there is handling to send and receive Snapshots, this will be used. Added a callback hook in recovery to let the consensus protocol know when a checkpoint has finished and up to what seq it covers. Moved some Raft test header functionality into a general Raft test utility to not be attached to that specific test fixture. Added some new tests for existing Raft features.
Updated lastApplied_ in Raft to be lastCommitted (meaning we have queued it to be committed). lastApplied is now tracked in Raft's Consensus class. Fixed a bug with min_seq_ tracking in Recovery. Fixed potential issues with edge cases in ReadLogsFromFiles. --- .../ordering/common/framework/consensus.h | 2 +- .../consensus/ordering/raft/algorithm/BUILD | 26 + .../raft/algorithm/leaderelection_manager.h | 1 + .../ordering/raft/algorithm/raft.cpp | 301 ++++++-- .../consensus/ordering/raft/algorithm/raft.h | 39 +- .../raft_append_entries_response_test.cpp | 45 +- .../algorithm/raft_append_entries_test.cpp | 225 +++++- .../raft/algorithm/raft_integration_test.cpp | 480 +++++++++++++ .../raft_request_vote_response_test.cpp | 8 +- .../raft/algorithm/raft_request_vote_test.cpp | 91 ++- .../ordering/raft/algorithm/raft_test_util.h | 118 +++ .../ordering/raft/algorithm/raft_tests.h | 117 +-- .../consensus/ordering/raft/framework/BUILD | 21 +- .../raft/framework/checkpoint_manager.cpp | 558 -------------- .../raft/framework/checkpoint_manager.h | 149 ---- .../ordering/raft/framework/consensus.cpp | 42 +- .../ordering/raft/framework/consensus.h | 9 +- .../raft/framework/raft_checkpoint_manager.h | 40 ++ .../ordering/raft/proto/proposal.proto | 1 + platform/consensus/recovery/BUILD | 2 +- .../consensus/recovery/mock_raft_recovery.h | 8 +- platform/consensus/recovery/pbft_recovery.cpp | 3 +- platform/consensus/recovery/raft_recovery.cpp | 75 +- platform/consensus/recovery/raft_recovery.h | 13 +- .../consensus/recovery/raft_recovery_test.cpp | 679 ++++++++++++++++-- platform/consensus/recovery/recovery.h | 7 +- platform/consensus/recovery/recovery_impl.h | 70 +- 27 files changed, 2064 insertions(+), 1066 deletions(-) create mode 100644 platform/consensus/ordering/raft/algorithm/raft_integration_test.cpp create mode 100644 platform/consensus/ordering/raft/algorithm/raft_test_util.h delete mode 100644 platform/consensus/ordering/raft/framework/checkpoint_manager.cpp delete mode 
100644 platform/consensus/ordering/raft/framework/checkpoint_manager.h create mode 100644 platform/consensus/ordering/raft/framework/raft_checkpoint_manager.h diff --git a/platform/consensus/ordering/common/framework/consensus.h b/platform/consensus/ordering/common/framework/consensus.h index 2f2884b893..022cc58bf3 100644 --- a/platform/consensus/ordering/common/framework/consensus.h +++ b/platform/consensus/ordering/common/framework/consensus.h @@ -53,7 +53,7 @@ class Consensus : public ConsensusManager { protected: int SendMsg(int type, const google::protobuf::Message& msg, int node_id); int Broadcast(int type, const google::protobuf::Message& msg); - int ResponseMsg(const BatchUserResponse& batch_resp); + virtual int ResponseMsg(const BatchUserResponse& batch_resp); void AsyncSend(); bool IsStop(); diff --git a/platform/consensus/ordering/raft/algorithm/BUILD b/platform/consensus/ordering/raft/algorithm/BUILD index 214b570d88..9b713d7845 100644 --- a/platform/consensus/ordering/raft/algorithm/BUILD +++ b/platform/consensus/ordering/raft/algorithm/BUILD @@ -81,6 +81,7 @@ cc_test( srcs = [ "raft_append_entries_test.cpp", "raft_tests.h", + "raft_test_util.h" ], copts = ["-DRAFT_TEST_MODE"], deps = [ @@ -101,6 +102,7 @@ cc_test( srcs = [ "raft_append_entries_response_test.cpp", "raft_tests.h", + "raft_test_util.h" ], copts = ["-DRAFT_TEST_MODE"], deps = [ @@ -121,6 +123,7 @@ cc_test( srcs = [ "raft_request_vote_test.cpp", "raft_tests.h", + "raft_test_util.h" ], copts = ["-DRAFT_TEST_MODE"], deps = [ @@ -141,6 +144,7 @@ cc_test( srcs = [ "raft_request_vote_response_test.cpp", "raft_tests.h", + "raft_test_util.h" ], copts = ["-DRAFT_TEST_MODE"], deps = [ @@ -154,4 +158,26 @@ cc_test( "//platform/proto:client_test_cc_proto", ], size="small" +) + +cc_test( + name = "raft_integration_test", + srcs = [ + "raft_integration_test.cpp", + "raft_tests.h", + "raft_test_util.h" + ], + copts = ["-DRAFT_TEST_MODE"], + deps = [ + ":raft", + ":mock_leader_election_manager", + 
"//platform/consensus/recovery:raft_recovery", + "//platform/networkstrate:mock_replica_communicator", + "//platform/consensus/checkpoint:mock_checkpoint", + "//common/crypto:mock_signature_verifier", + "//platform/config:resdb_config_utils", + "//common/test:test_main", + "//platform/proto:client_test_cc_proto", + ], + size="small" ) \ No newline at end of file diff --git a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h index 638aa42350..987cadd367 100644 --- a/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h +++ b/platform/consensus/ordering/raft/algorithm/leaderelection_manager.h @@ -47,6 +47,7 @@ class LeaderElectionManager { // If the monitor is not running, start to monitor. void MayStart(); void SetRaft(raft::Raft*); + // This function is called upon receiving a heartbeat virtual void OnHeartBeat(); virtual void OnRoleChange(); virtual void OnAeBroadcast(); diff --git a/platform/consensus/ordering/raft/algorithm/raft.cpp b/platform/consensus/ordering/raft/algorithm/raft.cpp index 14bd1a1eb9..4b78b13359 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft.cpp @@ -19,10 +19,14 @@ #include "platform/consensus/ordering/raft/algorithm/raft.h" +#include #include + #include #include #include +#include +#include #include #include "common/crypto/signature_verifier.h" @@ -33,6 +37,18 @@ namespace resdb { namespace raft { +void PrintStackTrace() { + void* buffer[64]; + int n = backtrace(buffer, 64); + char** symbols = backtrace_symbols(buffer, n); + + for (int i = 0; i < n; ++i) { + LOG(INFO) << symbols[i]; + } + + free(symbols); +} + std::ostream& operator<<(std::ostream& stream, Role role) { const char* nameRole[] = {"FOLLOWER", "CANDIDATE", "LEADER"}; return stream << nameRole[static_cast(role)]; @@ -60,17 +76,21 @@ Raft::Raft(int id, int f, int total_num, SignatureVerifier* verifier, : 
ProtocolBase(id, f, total_num), currentTerm_(0), votedFor_(-1), - lastLogIndex_(-1), + lastLogIndex_(-1), // This value is unsigned, but after the sentinel is + // added wraps back around to 0 commitIndex_(0), - lastApplied_(0), + lastCommitted_(0), role_(Role::FOLLOWER), - seqAfterCheckpoint_(0), + snapshot_last_index_(0), + snapshot_last_term_(0), + heartBeatsSentThisTerm_(0), is_stop_(false), quorum_((total_num / 2) + 1), verifier_(verifier), leader_election_manager_(leaderelection_manager), replica_communicator_(replica_communicator), recovery_(recovery) { + assert(recovery_); id_ = id; total_num_ = total_num; f_ = (total_num-1)/2; @@ -99,7 +119,12 @@ bool Raft::IsStop() { return is_stop_; } -void Raft::SetRole(Role role) { role_ = role; } +void Raft::SetRoleLocked(Role role) { role_ = role; } + +void Raft::SetRole(Role role) { + std::lock_guard lk(mutex_); + role_ = role; +} bool Raft::ReceiveTransaction(std::unique_ptr req) { std::vector messages; @@ -177,8 +202,10 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { if (tr != TermRelation::STALE && role_ == Role::FOLLOWER) { uint64_t i = ae->prevlogindex(); - if (i < static_cast(GetLogicalLogSize()) && - ae->prevlogterm() == GetLogEntryAtIndex(i).entry.term()) { + + if (i <= snapshot_last_index_ || + (i < static_cast(GetLogicalLogSize()) && + ae->prevlogterm() == GetLogTermAtIndex(i))) { success = true; } } @@ -191,12 +218,19 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { // ---------- Appending entries ---------- uint64_t logIdx = ae->prevlogindex() + 1; uint64_t entriesIdx = 0; + // If we receive an entry that has already been snapshotted, that means it + // was committed, which means it must be identical to what we have. So, skip + // to the first entry after a snapshot. 
+ if (logIdx <= snapshot_last_index_) { + entriesIdx = snapshot_last_index_ - logIdx + 1; + logIdx = snapshot_last_index_ + 1; + } uint64_t entriesSize = static_cast(ae->entries_size()); // check for conflicting entry terms in existing indices // if conflict, delete suffix and short circuit out of loop while (logIdx < GetLogicalLogSize() && entriesIdx < entriesSize) { uint64_t term = ae->entries(entriesIdx).term(); - if (term != GetLogEntryAtIndex(logIdx).entry.term()) { + if (term != GetLogTermAtIndex(logIdx)) { TruncateLog(logIdx); if (replicationLoggingFlag_) { @@ -238,7 +272,6 @@ bool Raft::ReceiveAppendEntries(std::unique_ptr ae) { LOG(INFO) << "JIM -> " << parent_fn << ": Raised commitIndex_ from " << prevCommitIndex << " to " << commitIndex_; } - } // build vector to apply committed entries outside mutex @@ -331,8 +364,7 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a uint64_t lastReplicatedIndex = sorted[quorum_ - 1]; // Need to check the lastReplicatedIndex contains entry from current term if (lastReplicatedIndex > commitIndex_ && - GetLogEntryAtIndex(lastReplicatedIndex).entry.term() == - currentTerm_) { + GetLogTermAtIndex(lastReplicatedIndex) == currentTerm_) { LOG(INFO) << "JIM -> " << parent_fn << ": Raised commitIndex_ from " << commitIndex_ << " to " << lastReplicatedIndex; commitIndex_ = lastReplicatedIndex; @@ -346,7 +378,10 @@ bool Raft::ReceiveAppendEntriesResponse(std::unique_ptr a LOG(INFO) << "AppendEntriesResponse indicates FAILURE from follower " << followerId; LOG(INFO) << "NextIndex is: " << nextIndex_[followerId] << " their lastLogIndex is: " << aer->lastlogindex(); } - if (!InFlightPerFollowerLimitReachedLocked(followerId)) { + if (aer->lastlogindex() < snapshot_last_index_) { + LOG(INFO) << "snapshot_last_index_ is: " << snapshot_last_index_; + SendInstallSnapshot(followerId); + } else if (!InFlightPerFollowerLimitReachedLocked(followerId)) { fields = GatherAeFieldsLocked(followerId); resending = true; auto now = 
std::chrono::steady_clock::now(); @@ -402,6 +437,7 @@ void Raft::ReceiveRequestVote(std::unique_ptr rv) { // Then we continue voting process term = currentTerm_; votedFor = votedFor_; + uint64_t lastLogTerm = getLastLogTermLocked(); if (rv->lastlogterm() < lastLogTerm) { return; @@ -472,7 +508,7 @@ void Raft::ReceiveRequestVoteResponse(std::unique_ptr rvr) << votes_.size() << "/" << quorum_ << " in term " << currentTerm_; if (votes_.size() >= quorum_) { elected = true; - SetRole(Role::LEADER); + SetRoleLocked(Role::LEADER); ClearInFlightsLocked(); nextIndex_.assign(total_num_ + 1, lastLogIndex_ + 1); @@ -514,13 +550,11 @@ void Raft::StartElection() { return; } if (role_ == Role::FOLLOWER) { - SetRole(Role::CANDIDATE); + SetRoleLocked(Role::CANDIDATE); roleChanged = true; } heartBeatsSentThisTerm_ = 0; - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; - SetCurrentTerm(currentTerm_ + 1, false); - SetVotedFor(id_); + SetCurrentTermAndVotedFor(currentTerm_ + 1, id_); votes_.clear(); votes_.push_back(id_); LOG(INFO) << "JIM -> " << __FUNCTION__ << ": I voted for myself. 
Votes: " @@ -613,12 +647,10 @@ void Raft::SendHeartBeat() { // returns true if demoted bool Raft::DemoteSelfLocked(uint64_t term) { if (term > currentTerm_) { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; - SetCurrentTerm(term, false); - SetVotedFor(-1); + SetCurrentTermAndVotedFor(term, -1); } if (role_ != Role::FOLLOWER) { - SetRole(Role::FOLLOWER); + SetRoleLocked(Role::FOLLOWER); //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Demoted to FOLLOWER"; return true; } @@ -640,35 +672,42 @@ TermRelation Raft::TermCheckLocked(uint64_t term) const { // requires raft mutex to be held uint64_t Raft::getLastLogTermLocked() const { - return GetLogEntryAtIndex(lastLogIndex_).entry.term(); + if (lastLogIndex_ <= snapshot_last_index_) { + return snapshot_last_term_; + } + + return GetLogTermAtIndex(lastLogIndex_); } // requires raft mutex to be held std::vector> Raft::PrepareCommitLocked() { std::vector> commitVec; - uint64_t begin = lastApplied_ + 1; + uint64_t begin = lastCommitted_ + 1; bool applying = false; - while (lastApplied_ < commitIndex_) { - ++lastApplied_; + while (lastCommitted_ < commitIndex_ && + lastCommitted_ < GetLogicalLogSize() - 1) { + ++lastCommitted_; auto command = std::make_unique(); + if (!command->ParseFromString( - GetLogEntryAtIndex(lastApplied_).entry.command())) { + GetLogEntryAtIndex(lastCommitted_).entry.command())) { LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Failed to parse command"; continue; } // assign seq number as log index for the request or executing transactions fails. 
- command->set_seq(lastApplied_); + command->set_seq(lastCommitted_); commitVec.push_back(std::move(command)); applying = true; } if (applying && replicationLoggingFlag_) { - if (lastApplied_ > begin) { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entries " << begin << " to " << lastApplied_; - } - else { - LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " << lastApplied_; - } + if (lastCommitted_ > begin) { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entries " + << begin << " to " << lastCommitted_; + } else { + LOG(INFO) << "JIM -> " << __FUNCTION__ << ": Applying index entry " + << lastCommitted_; + } } return commitVec; @@ -676,11 +715,14 @@ std::vector> Raft::PrepareCommitLocked() { AeFields Raft::GatherAeFieldsLocked(int followerId, bool heartBeat) const { AeFields fields{}; + LOG(INFO) << "snapshot_last_index_ is: " << snapshot_last_index_; + assert((nextIndex_[followerId] - 1 >= snapshot_last_index_) || heartBeat); + fields.term = currentTerm_; fields.leaderId = id_; fields.leaderCommit = commitIndex_; fields.prevLogIndex = nextIndex_[followerId] - 1; - fields.prevLogTerm = GetLogEntryAtIndex(fields.prevLogIndex).entry.term(); + fields.prevLogTerm = GetLogTermAtIndex(fields.prevLogIndex); fields.followerId = followerId; if (heartBeat) { return fields; @@ -716,8 +758,10 @@ std::vector Raft::GatherAeFieldsForBroadcastLocked(bool heartBeat) con if (!heartBeat && InFlightPerFollowerLimitReachedLocked(i)) { continue; } - AeFields fields = GatherAeFieldsLocked(i, heartBeat); - fieldsVec.push_back(fields); + if (nextIndex_[i] - 1 >= snapshot_last_index_) { + AeFields fields = GatherAeFieldsLocked(i, heartBeat); + fieldsVec.push_back(fields); + } } return fieldsVec; } @@ -829,43 +873,100 @@ bool Raft::InFlightPerFollowerLimitReachedLocked(int followerId) const { } const LogEntry& Raft::GetLogEntryAtIndex(uint64_t index) const { - assert(index >= seqAfterCheckpoint_ && + assert(index > snapshot_last_index_ && 
"Tried to access entry that has been snapshotted"); - assert(index - seqAfterCheckpoint_ < log_.size() && + // A sentinel value is always included after a snapshot + // Example: snapshot_last_index_ = 5, we have truncated the entire log, added + // 1 entry, then log.size() == 2 with the sentinel. index could be 6, and + // snapshot_last_index_ + log.size() == 7 + assert(index < snapshot_last_index_ + log_.size() && "Tried to access element that has not been added yet"); - return log_[index - seqAfterCheckpoint_]; + return log_[index - snapshot_last_index_]; } +const uint64_t Raft::GetLogTermAtIndex(uint64_t index) const { + assert(index >= snapshot_last_index_ && + "Tried to access entry that has been snapshotted"); + // A sentinel value is always included after a snapshot + // Example: snapshot_last_index_ = 5, we have truncated the entire log, added + // 1 entry, then log.size() == 2 with the sentinel. index could be 6, and + // snapshot_last_index_ + log.size() == 7 + assert(index < snapshot_last_index_ + log_.size() && + "Tried to access element that has not been added yet"); + if (index == snapshot_last_index_) { + return snapshot_last_term_; + } + + return log_[index - snapshot_last_index_].entry.term(); +} + +// This would be what log.size() returns if no prefix truncation occurred. 
int Raft::GetLogicalLogSize() const { - return log_.size() + seqAfterCheckpoint_; + return log_.size() + snapshot_last_index_; } void Raft::SetCurrentTerm(uint64_t currentTerm, bool writeMetadata) { currentTerm_ = currentTerm; if (writeMetadata) { - recovery_->WriteMetadata(currentTerm_, votedFor_); + WriteMetadata(); } } void Raft::SetVotedFor(int votedFor, bool writeMetadata) { votedFor_ = votedFor; if (writeMetadata) { - recovery_->WriteMetadata(currentTerm_, votedFor_); + WriteMetadata(); + } +} + +void Raft::SetCurrentTermAndVotedFor(uint64_t currentTerm, int votedFor, + bool writeMetadata) { + currentTerm_ = currentTerm; + votedFor_ = votedFor; + if (writeMetadata) { + WriteMetadata(); } } -void Raft::SetSeqIndexCoveredBySnapshot(int seq) { - seqAfterCheckpoint_ = seq; +void Raft::SetSnapshotLastIndexAndTerm(uint64_t snapshot_last_index, + uint64_t snapshot_last_term, + bool writeMetadata) { + uint64_t old_snapshot_last_index = snapshot_last_index_; + snapshot_last_index_ = snapshot_last_index; + snapshot_last_term_ = snapshot_last_term; + LOG(INFO) << "setting snapshot_last_index " << snapshot_last_index + << " and snapshot_last_term" << snapshot_last_term; + if (writeMetadata) { + WriteMetadata(); + return; + } + if (old_snapshot_last_index) { + LOG(INFO) << "snapshot_last_index already set during recovery"; + return; + } + + lastLogIndex_ = snapshot_last_index_; + commitIndex_ = snapshot_last_index_; + lastCommitted_ = snapshot_last_index_; + log_[0].entry.set_term(snapshot_last_term_); +} + +uint64_t Raft::GetSnapshotLastIndex() { return snapshot_last_index_; } + +void Raft::WriteMetadata() { + recovery_->WriteMetadata(currentTerm_, votedFor_, snapshot_last_index_, + snapshot_last_term_); } void Raft::AddToLog(LogEntry &logEntryToAdd, bool writeMetadata) { + lastLogIndex_++; Entry* entry; entry = &logEntryToAdd.entry; if (writeMetadata) { - recovery_->AddLogEntry(entry); + recovery_->AddLogEntry(entry, lastLogIndex_); } log_.push_back(logEntryToAdd); - 
lastLogIndex_++; + assert(lastLogIndex_ == GetLogicalLogSize() - 1); } void Raft::AddToLog(std::vector logEntriesToAdd, bool writeMetadata) { @@ -874,11 +975,10 @@ void Raft::AddToLog(std::vector logEntriesToAdd, bool writeMetadata) { for (const auto& entry : logEntriesToAdd) { entries_to_add.push_back(entry.entry); } - LOG(INFO) << "Entries to add: " << logEntriesToAdd.size(); - recovery_->AddLogEntry(entries_to_add); + recovery_->AddLogEntry(entries_to_add, lastLogIndex_ + 1); } - lastLogIndex_ += logEntriesToAdd.size(); + lastLogIndex_ += logEntriesToAdd.size(); log_.reserve(log_.size() + logEntriesToAdd.size()); log_.insert(log_.end(), std::make_move_iterator(logEntriesToAdd.begin()), std::make_move_iterator(logEntriesToAdd.end())); @@ -887,65 +987,110 @@ void Raft::AddToLog(std::vector logEntriesToAdd, bool writeMetadata) { } void Raft::TruncateLog(uint64_t firstIndex, bool writeMetadata) { - auto first = log_.begin() + firstIndex; - auto last = log_.begin() + lastLogIndex_ + 1; + assert(firstIndex > commitIndex_); + auto first = log_.begin() + (firstIndex - snapshot_last_index_); + auto last = log_.begin() + (lastLogIndex_ - snapshot_last_index_) + 1; + auto num_elements_erased = lastLogIndex_ - firstIndex + 1; if (writeMetadata) { TruncationRecord truncation; truncation.set_truncate_from_index(firstIndex); - truncation.set_truncate_from_term( - GetLogEntryAtIndex(firstIndex).entry.term()); + truncation.set_truncate_from_term(GetLogTermAtIndex(firstIndex)); recovery_->TruncateLog(truncation); } log_.erase(first, last); - lastLogIndex_ = GetLogicalLogSize() - 1; + lastLogIndex_ -= num_elements_erased; + assert(lastLogIndex_ == GetLogicalLogSize() - 1); +} + +void Raft::TruncatePrefix(uint64_t index) { + std::lock_guard lk(mutex_); + TruncatePrefixLocked(index); } +void Raft::TruncatePrefixLocked(uint64_t index) { + assert(index > snapshot_last_index_ && + "Tried to truncate an entry that has been snapshotted"); + assert(index <= lastCommitted_ && + "Tried to 
prefix truncate an element that has not been committed"); + LOG(INFO) << "Setting Snapshot last index to:" << index + 1; + + // Keep the sentinel, erase everything up to the index. + auto erase_end = log_.begin() + (index - snapshot_last_index_); + auto last_snapshotted_entry_term = GetLogTermAtIndex(index); + log_.erase(log_.begin() + 1, erase_end + 1); + assert(log_[0].entry.term() == last_snapshotted_entry_term); + SetSnapshotLastIndexAndTerm(index, last_snapshotted_entry_term); + + assert(lastLogIndex_ == GetLogicalLogSize() - 1); +} + +void Raft::SendInstallSnapshot(int followerId) {} + +/* +void Raft::ReceiveInstallSnapshot() { + +} + +void Raft::ReceiveInstallSnapshotResponse() { + +} +*/ + void Raft::PrintDebugStateLocked() const { std::lock_guard lk(mutex_); PrintDebugState(); } void Raft::PrintDebugState() const { - LOG(INFO) << "---- Raft Debug State ----\n"; - LOG(INFO) << "currentTerm_: " << currentTerm_ << "\n"; - LOG(INFO) << "votedFor_: " << votedFor_ << "\n"; + std::ostringstream oss; + + oss << "---- Raft Debug State ----\n"; - LOG(INFO) << "log_ (size " << GetLogicalLogSize() << "): ["; + oss << "currentTerm_: " << currentTerm_ << "\n"; + oss << "votedFor_: " << votedFor_ << "\n"; + + // log_ + oss << "log_ (size " << GetLogicalLogSize() << "): ["; for (size_t i = 0; i < GetLogicalLogSize(); ++i) { - LOG(INFO) << "{term: " << GetLogEntryAtIndex(i).entry.term(); - if (i + 1 != GetLogicalLogSize()) LOG(INFO) << ", "; + oss << "{term: " << GetLogTermAtIndex(i) << "}"; + if (i + 1 != GetLogicalLogSize()) oss << ", "; } - LOG(INFO) << "]\n"; + oss << "]\n"; - LOG(INFO) << "nextIndex_: ["; + // nextIndex_ + oss << "nextIndex_: ["; for (size_t i = 0; i < nextIndex_.size(); ++i) { - LOG(INFO) << nextIndex_[i]; - if (i + 1 != nextIndex_.size()) LOG(INFO) << ", "; + oss << nextIndex_[i]; + if (i + 1 != nextIndex_.size()) oss << ", "; } - LOG(INFO) << "]\n"; + oss << "]\n"; - LOG(INFO) << "matchIndex_: ["; + // matchIndex_ + oss << "matchIndex_: ["; for 
(size_t i = 0; i < matchIndex_.size(); ++i) { - LOG(INFO) << matchIndex_[i]; - if (i + 1 != matchIndex_.size()) LOG(INFO) << ", "; + oss << matchIndex_[i]; + if (i + 1 != matchIndex_.size()) oss << ", "; } - LOG(INFO) << "]\n"; + oss << "]\n"; - LOG(INFO) << "heartBeatsSentThisTerm_: " << heartBeatsSentThisTerm_ << "\n"; - LOG(INFO) << "lastLogIndex_: " << lastLogIndex_ << "\n"; - LOG(INFO) << "commitIndex_: " << commitIndex_ << "\n"; - LOG(INFO) << "lastApplied_: " << lastApplied_ << "\n"; - LOG(INFO) << "role_: " << static_cast(role_) << "\n"; + oss << "heartBeatsSentThisTerm_: " << heartBeatsSentThisTerm_ << "\n"; + oss << "lastLogIndex_: " << lastLogIndex_ << "\n"; + oss << "commitIndex_: " << commitIndex_ << "\n"; + oss << "lastCommitted_: " << lastCommitted_ << "\n"; + oss << "role_: " << static_cast(role_) << "\n"; - LOG(INFO) << "votes_: ["; + // votes_ + oss << "votes_: ["; for (size_t i = 0; i < votes_.size(); ++i) { - LOG(INFO) << votes_[i]; - if (i + 1 != votes_.size()) LOG(INFO) << ", "; + oss << votes_[i]; + if (i + 1 != votes_.size()) oss << ", "; } - LOG(INFO) << "]\n"; + oss << "]\n"; + + oss << "--------------------------"; - LOG(INFO) << "--------------------------\n"; + LOG(INFO) << oss.str(); } } // namespace raft diff --git a/platform/consensus/ordering/raft/algorithm/raft.h b/platform/consensus/ordering/raft/algorithm/raft.h index f7b64fba9e..5d33142c32 100644 --- a/platform/consensus/ordering/raft/algorithm/raft.h +++ b/platform/consensus/ordering/raft/algorithm/raft.h @@ -78,7 +78,7 @@ struct RaftStatePatch { std::optional currentTerm; std::optional votedFor; std::optional commitIndex; - std::optional lastApplied; + std::optional lastCommitted; std::optional role; std::optional> log; @@ -114,13 +114,24 @@ class Raft : public common::ProtocolBase { virtual void SetRole(Role role); virtual void PrintDebugStateLocked() const; virtual void PrintDebugState() const; + void WriteMetadata(); + uint64_t GetSnapshotLastIndex(); + + // These functions 
with writeMetadata are also used to replay information upon + // recovery. So, they are called with false during recovery, and true + // everywhere else. virtual void SetCurrentTerm(uint64_t currentTerm, bool writeMetadata = true); virtual void SetVotedFor(int votedFor, bool writeMetadata = true); - virtual void SetSeqIndexCoveredBySnapshot(int seq); + virtual void SetCurrentTermAndVotedFor(uint64_t currentTerm, int votedFor, + bool writeMetadata = true); + void SetSnapshotLastIndexAndTerm(uint64_t snapshot_last_index, + uint64_t snapshot_last_term, + bool writeMetadata = true); void AddToLog(LogEntry &logEntry, bool writeMetadata = true); void AddToLog(std::vector logEntriesToAdd, bool writeMetadata = true); void TruncateLog(uint64_t first, bool writeMetadata = true); + void TruncatePrefix(uint64_t index); private: mutable std::mutex mutex_; @@ -146,7 +157,17 @@ class Raft : public common::ProtocolBase { const AeFields& msg, std::chrono::steady_clock::time_point timestamp); virtual bool InFlightPerFollowerLimitReachedLocked(int followerId) const; int GetLogicalLogSize() const; +#ifdef RAFT_TEST_MODE + public: +#endif const LogEntry& GetLogEntryAtIndex(uint64_t index) const; + const uint64_t GetLogTermAtIndex(uint64_t index) const; +#ifdef RAFT_TEST_MODE + private: +#endif + void SendInstallSnapshot(int followerId); + void TruncatePrefixLocked(uint64_t index); + void SetRoleLocked(Role role); // Persistent state on all servers: uint64_t currentTerm_; // Protected by mutex_ @@ -161,14 +182,17 @@ class Raft : public common::ProtocolBase { // Volatile state on all servers: uint64_t commitIndex_; // Protected by mutex_ - uint64_t lastApplied_; // Protected by mutex_ + // lastCommitted stores the last entry that has been passed to commit_, but it + // may not yet have been executed. 
Raft's Consensus file holds lastApplied_ + uint64_t lastCommitted_; // Protected by mutex_ Role role_; // Protected by mutex_ //int leaderId_; // Protected by mutex_ std::vector votes_; // Protected by mutex_ std::vector> inflightVecs_; // Protected by mutex_ //std::chrono::steady_clock::time_point last_ae_time_; //std::chrono::steady_clock::time_point last_heartbeat_time_; // Protected by mutex_ - int seqAfterCheckpoint_; + int64_t snapshot_last_index_; + int64_t snapshot_last_term_; bool is_stop_; const uint64_t quorum_; @@ -190,11 +214,10 @@ class Raft : public common::ProtocolBase { public: void SetStateForTest(RaftStatePatch patch) { std::lock_guard lk(mutex_); - if (patch.currentTerm) currentTerm_ = *patch.currentTerm; if (patch.votedFor) votedFor_ = *patch.votedFor; if (patch.commitIndex) commitIndex_ = *patch.commitIndex; - if (patch.lastApplied) lastApplied_ = *patch.lastApplied; + if (patch.lastCommitted) lastCommitted_ = *patch.lastCommitted; if (patch.role) role_ = *patch.role; if (patch.log) { @@ -270,9 +293,9 @@ class Raft : public common::ProtocolBase { return commitIndex_; } - uint64_t GetLastApplied() const { + uint64_t GetLastCommitted() const { std::lock_guard lock(mutex_); - return lastApplied_; + return lastCommitted_; } Role GetRole() const { diff --git a/platform/consensus/ordering/raft/algorithm/raft_append_entries_response_test.cpp b/platform/consensus/ordering/raft/algorithm/raft_append_entries_response_test.cpp index cc30602963..aaad9e8d8f 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_append_entries_response_test.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_append_entries_response_test.cpp @@ -2,10 +2,6 @@ namespace resdb { namespace raft { -using ::testing::_; -using ::testing::AnyNumber; -using ::testing::Invoke; -using ::testing::Matcher; // Test 1: A leader receiving an AppendEntriesResponse success and updating the // follower's matchIndex. 
@@ -69,7 +65,7 @@ TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseSuccessAndCommits) { raft_->SetStateForTest({.currentTerm = 1, .commitIndex = 0, - .lastApplied = 0, + .lastCommitted = 0, .role = Role::LEADER, .log = CreateLogEntries( { @@ -77,7 +73,7 @@ TEST_F(RaftTest, LeaderReceivesAppendEntriesResponseSuccessAndCommits) { {1, "Transaction 2"}, }, true), - .nextIndex = std::vector{0, 2, 2, 2, 2}, + .nextIndex = std::vector{1, 2, 2, 2, 2}, .matchIndex = std::vector{0, 2, 0, 1, 0}}); bool success = raft_->ReceiveAppendEntriesResponse( @@ -111,7 +107,7 @@ TEST_F(RaftTest, LeaderCatchesUpFollowerThatIsBehind) { raft_->SetStateForTest({ .currentTerm = 1, .commitIndex = 0, - .lastApplied = 0, + .lastCommitted = 0, .role = Role::LEADER, .log = CreateLogEntries( { @@ -151,7 +147,7 @@ TEST_F(RaftTest, LeaderCatchesUpFollowerThatIsBehindFailure) { raft_->SetStateForTest({ .currentTerm = 1, .commitIndex = 0, - .lastApplied = 0, + .lastCommitted = 0, .role = Role::LEADER, .log = CreateLogEntries( { @@ -202,5 +198,38 @@ TEST_F(RaftTest, LeaderIgnoresAppendEntriesResponseFromOutdatedTerm) { EXPECT_TRUE(success); } +// Test 8: A leader does not advance its commit index from a previous term if it +// has not replicated an entry from its current term. 
+TEST_F(RaftTest, + LeaderReceivesAppendEntriesResponseSuccessAndDoesNotCommitOldTerm) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_commit, Commit(_)).Times(0); + + AppendEntriesResponse aeResponse; + aeResponse.set_success(true); + aeResponse.set_term(1); + aeResponse.set_id(2); + aeResponse.set_lastlogindex(2); + + raft_->SetStateForTest({.currentTerm = 1, + .commitIndex = 0, + .lastCommitted = 0, + .role = Role::LEADER, + .log = CreateLogEntries( + { + {0, "Transaction 1"}, + {0, "Transaction 2"}, + }, + true), + .nextIndex = std::vector{0, 2, 2, 2, 2}, + .matchIndex = std::vector{0, 2, 0, 1, 0}}); + + bool success = raft_->ReceiveAppendEntriesResponse( + std::make_unique(aeResponse)); + EXPECT_TRUE(success); + EXPECT_THAT(raft_->GetMatchIndex(), ::testing::ElementsAre(0, 2, 2, 1, 0)); + EXPECT_EQ(raft_->GetCommitIndex(), 0); +} + } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp b/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp index 3366028666..fbbba46b35 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_append_entries_test.cpp @@ -2,10 +2,6 @@ namespace resdb { namespace raft { -using ::testing::_; -using ::testing::AnyNumber; -using ::testing::Invoke; -using ::testing::Matcher; // Test 1: A follower receiving a client transaction should reject it. 
TEST_F(RaftTest, FollowerRejectsClientTransaction) { @@ -14,7 +10,11 @@ TEST_F(RaftTest, FollowerRejectsClientTransaction) { auto req = std::make_unique(); req->set_seq(1); - raft_->SetRole(Role::FOLLOWER); + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::FOLLOWER, + .log = CreateLogEntries({}, true), + }); bool success = raft_->ReceiveTransaction(std::move(req)); EXPECT_FALSE(success); @@ -28,7 +28,11 @@ TEST_F(RaftTest, LeaderSendsAppendEntriesUponClientTransaction) { auto req = std::make_unique(); req->set_seq(1); - raft_->SetRole(Role::LEADER); + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::LEADER, + .log = CreateLogEntries({}, true), + }); bool success = raft_->ReceiveTransaction(std::move(req)); EXPECT_TRUE(success); @@ -73,7 +77,7 @@ TEST_F(RaftTest, LeaderSendsAppendEntriesBasedOnNextIndex) { {0, "Term 0 Transaction 4"}, }, true), - .nextIndex = std::vector{0, 4, 3, 2, 1}}); + .nextIndex = std::vector{1, 4, 3, 2, 1}}); auto req = std::make_unique(); req->set_seq(5); @@ -110,7 +114,11 @@ TEST_F(RaftTest, FollowerAddsAppendEntriesWithMultipleEntries) { /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); - raft_->SetRole(Role::FOLLOWER); + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::FOLLOWER, + .log = CreateLogEntries({}, true), + }); bool success = raft_->ReceiveAppendEntries( std::make_unique(std::move(aemessage))); @@ -182,7 +190,12 @@ TEST_F(RaftTest, FollowerAddsMultipleAppendEntries) { auto aemessage1 = CreateAeMessage(aefields1); auto aemessage2 = CreateAeMessage(aefields2); auto aemessage3 = CreateAeMessage(aefields3); - raft_->SetRole(Role::FOLLOWER); + + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::FOLLOWER, + .log = CreateLogEntries({}, true), + }); bool success1 = raft_->ReceiveAppendEntries( std::make_unique(std::move(aemessage1))); @@ -233,7 +246,6 @@ TEST_F(RaftTest, FollowerRejectsMismatchedTermAtPrevLogIndex) { }); auto aemessage = CreateAeMessage(aefields); - 
raft_->SetRole(Role::FOLLOWER); bool success = raft_->ReceiveAppendEntries( std::make_unique(std::move(aemessage))); @@ -266,7 +278,12 @@ TEST_F(RaftTest, FollowerRejectsMissingIndex) { /*followerId=*/1); auto aemessage = CreateAeMessage(aefields); - raft_->SetRole(Role::FOLLOWER); + + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::FOLLOWER, + .log = CreateLogEntries({}, true), + }); bool success = raft_->ReceiveAppendEntries( std::make_unique(std::move(aemessage))); @@ -353,7 +370,7 @@ TEST_F(RaftTest, FollowerIncreasesCommitIndex) { raft_->SetStateForTest({ .currentTerm = 1, .commitIndex = 1, - .lastApplied = 1, + .lastCommitted = 1, .role = Role::FOLLOWER, .log = CreateLogEntries( { @@ -399,7 +416,7 @@ TEST_F(RaftTest, FollowerIncreasesCommitIndexCappedAtLogSize) { raft_->SetStateForTest({ .currentTerm = 1, .commitIndex = 1, - .lastApplied = 1, + .lastCommitted = 1, .role = Role::FOLLOWER, .log = CreateLogEntries( { @@ -528,7 +545,7 @@ TEST_F(RaftTest, CandidateReceivesNewerTermWithAppendEntriesItCanAccept) { raft_->SetStateForTest({ .currentTerm = 1, - .lastApplied = 2, + .lastCommitted = 2, .role = Role::CANDIDATE, .log = CreateLogEntries( { @@ -544,7 +561,7 @@ TEST_F(RaftTest, CandidateReceivesNewerTermWithAppendEntriesItCanAccept) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); } -// Test 14: A candidate receiving an AppendEntries that it can accept from a the +// Test 14: A candidate receiving an AppendEntries that it can accept from the // same term but further along. 
TEST_F(RaftTest, CandidateReceivesSameTermWithAppendEntriesItCanAccept) { EXPECT_CALL(mock_call, Call(_, _, _)) @@ -572,8 +589,8 @@ TEST_F(RaftTest, CandidateReceivesSameTermWithAppendEntriesItCanAccept) { auto aemessage = CreateAeMessage(aefields); raft_->SetStateForTest({ - .currentTerm = 1, - .lastApplied = 2, + .currentTerm = 2, + .lastCommitted = 2, .role = Role::CANDIDATE, .log = CreateLogEntries( { @@ -589,5 +606,179 @@ TEST_F(RaftTest, CandidateReceivesSameTermWithAppendEntriesItCanAccept) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); } +// Test 15: A follower receiving a leaderCommit whose index is less than its own +// commitIndex does not lower its commitIndex. +TEST_F(RaftTest, FollowerWillNotLowerCommitIndex) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce( + ::testing::Invoke([](int type, const google::protobuf::Message& msg, + int node_id) { return 0; })); + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); + + auto aefields = CreateAeFields( + /*term=*/1, + /*leaderId=*/2, + /*prevLogIndex=*/0, + /*prevLogTerm=*/2, + /*entries=*/ + CreateLogEntries({}), + /*leaderCommit=*/2, + /*followerId=*/1); + auto aemessage = CreateAeMessage(aefields); + + raft_->SetStateForTest({ + .currentTerm = 0, + .commitIndex = 4, + .lastCommitted = 0, + .role = Role::FOLLOWER, + .log = CreateLogEntries( + { + {0, "Transaction 1"}, + {0, "Transaction 2"}, + }, + true), + }); + + raft_->PrintDebugStateLocked(); + + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); +} + +// Test 16: A leader ignores an AppendEntries from itself +TEST_F(RaftTest, LeaderIgnoresAppendEntriesFromSelf) { + EXPECT_CALL(mock_call, Call(_, _, _)).Times(0); + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + auto aefields = CreateAeFields( + /*term=*/0, + 
/*leaderId=*/1, + /*prevLogIndex=*/0, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({ + {0, "Transaction 1"}, + }), + /*leaderCommit=*/0, + /*followerId=*/1); + auto aemessage = CreateAeMessage(aefields); + + raft_->SetStateForTest({ + .currentTerm = 0, + .lastCommitted = 0, + .role = Role::LEADER, + .log = CreateLogEntries({}, true), + }); + + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); + EXPECT_FALSE(success); +} + +// Test 17: A follower receiving a heartbeat will advance its commit index. +TEST_F(RaftTest, FollowerAdvancesCommitIndexOnHeartbeat) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce( + ::testing::Invoke([](int type, const google::protobuf::Message& msg, + int node_id) { return 0; })); + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(1); + + auto aefields = CreateAeFields( + /*term=*/0, + /*leaderId=*/2, + /*prevLogIndex=*/2, + /*prevLogTerm=*/0, + /*entries=*/ + CreateLogEntries({}), + /*leaderCommit=*/2, + /*followerId=*/1); + auto aemessage = CreateAeMessage(aefields); + + raft_->SetStateForTest({ + .currentTerm = 0, + .commitIndex = 0, + .lastCommitted = 0, + .role = Role::FOLLOWER, + .log = CreateLogEntries( + { + {0, "Transaction 1"}, + {0, "Transaction 2"}, + }, + true), + }); + + raft_->PrintDebugStateLocked(); + + bool success = raft_->ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); + EXPECT_EQ(raft_->GetCommitIndex(), 2); +} + +// Test 18: A leader correctly sends a heartbeat.
+TEST_F(RaftTest, LeaderCorrectlySendsHeartbeat) { + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& ae = dynamic_cast(msg); + EXPECT_EQ(node_id, 2); + EXPECT_EQ(ae.prevlogindex(), 2); + EXPECT_EQ(ae.entries().size(), 0); + return 0; + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& ae = dynamic_cast(msg); + EXPECT_EQ(node_id, 3); + EXPECT_EQ(ae.prevlogindex(), 1); + EXPECT_EQ(ae.entries().size(), 0); + return 0; + })) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& ae = dynamic_cast(msg); + EXPECT_EQ(node_id, 4); + EXPECT_EQ(ae.prevlogindex(), 0); + EXPECT_EQ(ae.entries().size(), 0); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + raft_->SetStateForTest({.currentTerm = 1, + .votedFor = 1, + .commitIndex = 0, + .lastCommitted = 0, + .role = Role::LEADER, + .log = CreateLogEntries( + { + {0, "Transaction 1"}, + {1, "Transaction 2"}, + }, + true), + .nextIndex = std::vector{1, 4, 3, 2, 1}, + .matchIndex = std::vector{0, 2, 0, 1, 0}, + .votes = std::vector{1, 3, 2}}); + + raft_->SendHeartBeat(); + + EXPECT_EQ(raft_->GetCurrentTerm(), 1); + EXPECT_EQ(raft_->GetVotedFor(), 1); + EXPECT_EQ(raft_->GetCommitIndex(), 0); + EXPECT_EQ(raft_->GetLastCommitted(), 0); + EXPECT_EQ(raft_->GetRole(), Role::LEADER); + auto log = raft_->GetLog(); + // Maybe check that the log itself is equal + EXPECT_EQ(log.size(), 3); + EXPECT_EQ(raft_->GetLastLogIndex(), log.size() - 1); + EXPECT_THAT(raft_->GetNextIndex(), ::testing::ElementsAre(1, 4, 3, 2, 1)); + EXPECT_THAT(raft_->GetMatchIndex(), ::testing::ElementsAre(0, 2, 0, 1, 0)); + EXPECT_THAT(raft_->GetVotes(), ::testing::ElementsAre(1, 3, 2)); +} + } // namespace raft } // namespace resdb diff 
--git a/platform/consensus/ordering/raft/algorithm/raft_integration_test.cpp b/platform/consensus/ordering/raft/algorithm/raft_integration_test.cpp new file mode 100644 index 0000000000..61841dc0ad --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/raft_integration_test.cpp @@ -0,0 +1,480 @@ +// raft_integration_test.cpp +// +// Integration test: Raft state correctly restored after RecoverFromLogs(). +// Uses a real RaftRecovery (seeded with WAL data) and a real Raft. + +#include +#include + +#include + +#include "platform/consensus/checkpoint/mock_checkpoint.h" +#include "platform/consensus/ordering/raft/algorithm/raft_test_util.h" +#include "platform/consensus/recovery/raft_recovery.h" + +namespace resdb { +namespace raft { + +using resdb::raft::test_utils::CreateAeFields; +using resdb::raft::test_utils::CreateAeMessage; +using resdb::raft::test_utils::CreateLogEntries; +using resdb::raft::test_utils::GenerateConfig; +using resdb::raft::test_utils::MockBroadcastFunction; +using resdb::raft::test_utils::MockCommitFunction; +using resdb::raft::test_utils::MockSendMessageFunction; +using ::testing::_; +using ::testing::AnyNumber; +using ::testing::Invoke; + +namespace { + +const std::string kLogPath = "./log/raft_integration_test_log"; + +ResDBConfig MakeConfig() { + ResConfigData data; + data.set_recovery_enabled(true); + data.set_recovery_path(kLogPath); + data.set_recovery_buffer_size(1024); + data.set_recovery_ckpt_time_s(3); + return ResDBConfig({GenerateReplicaInfo(1, "127.0.0.1", 1234), + GenerateReplicaInfo(2, "127.0.0.1", 1235), + GenerateReplicaInfo(3, "127.0.0.1", 1236), + GenerateReplicaInfo(4, "127.0.0.1", 1237)}, + GenerateReplicaInfo(1, "127.0.0.1", 1234), data); +} + +// Mirrors what Consensus::RecoverFromLogs() does. 
+void RecoverFromLogs(RaftRecovery& recovery, Raft& raft) { + recovery.ReadLogs( + [&](const RaftMetadata& metadata) { + LOG(INFO) << "loading metadata file: term: " << metadata.current_term + << " votedFor: " << metadata.voted_for + << " snapshot_last_index: " << metadata.snapshot_last_index + << " snapshot_last_term: " << metadata.snapshot_last_term; + raft.SetCurrentTerm(metadata.current_term, /*writeMetadata=*/false); + raft.SetVotedFor(metadata.voted_for, /*writeMetadata=*/false); + raft.SetSnapshotLastIndexAndTerm(metadata.snapshot_last_index, + metadata.snapshot_last_term, + /*writeMetadata=*/false); + }, + [&](std::unique_ptr record) { + LOG(INFO) << "Replaying record with seq: " << record->seq(); + switch (record->payload_case()) { + case WALRecord::kEntry: { + LogEntry logEntry; + logEntry.entry = record->entry(); + LOG(INFO) << "Adding entry from term: " << logEntry.entry.term(); + raft.AddToLog(logEntry, /*writeMetadata=*/false); + break; + } + case WALRecord::kTruncation: + raft.TruncateLog(record->truncation().truncate_from_index(), + /*writeMetadata=*/false); + break; + case WALRecord::PAYLOAD_NOT_SET: + FAIL() << "Unexpected PAYLOAD_NOT_SET record"; + break; + } + }, + /*set_start_point=*/[](int) {}); +} + +} // namespace + +class RaftRecoveryIntegrationTest : public ::testing::Test { + private: + class MockCommitFunction { + public: + MOCK_METHOD(int, Commit, (const google::protobuf::Message&)); + }; + + protected: + void SetUp() override { + std::filesystem::remove_all(std::filesystem::path(kLogPath).parent_path()); + } + + ResDBConfig config_ = MakeConfig(); + MockCheckPoint checkpoint_; + MockSendMessageFunction mock_call; + MockBroadcastFunction mock_broadcast; + MockCommitFunction mock_commit; +}; + +// Test 1: Restore basic metadata and log entries. 
+TEST_F(RaftRecoveryIntegrationTest, RaftStateRestoredAfterRecovery) { + { + RaftRecovery recovery(config_, nullptr, nullptr, nullptr); + + recovery.WriteMetadata(/*current_term=*/5, /*voted_for=*/2, + /*snapshot_last_index=*/0, /*snapshot_last_term=*/0); + + for (int i = 1; i <= 3; ++i) { + Entry e; + e.set_term(i); + ClientTestRequest req; + req.set_value("cmd-" + std::to_string(i)); + req.SerializeToString(e.mutable_command()); + recovery.AddLogEntry(&e, i); + } + } + + MockSignatureVerifier verifier; + ResDBConfig config = MakeConfig(); + MockLeaderElectionManager lem(config); + MockReplicaCommunicator comm; + MockCheckPoint ckpt; + + RaftRecovery recovery(config_, nullptr, nullptr, nullptr); + + Raft raft(/*id=*/1, /*f=*/1, /*total=*/4, &verifier, &lem, &comm, &recovery); + + RecoverFromLogs(recovery, raft); + + // --- Assertions --- + EXPECT_EQ(raft.GetCurrentTerm(), 5u); + EXPECT_EQ(raft.GetVotedFor(), 2); + EXPECT_EQ(raft.GetSnapshotLastIndex(), 0u); + + // Log: index 0 is the sentinel (term=0), indices 1–3 are the replayed + // entries. + ASSERT_EQ(raft.GetLogSize(), 4u); + for (int i = 1; i <= 3; ++i) { + const auto& le = raft.GetLog()[i]; + EXPECT_EQ(le.entry.term(), i); + ClientTestRequest req; + req.ParseFromString(le.entry.command()); + EXPECT_EQ(req.value(), "cmd-" + std::to_string(i)); + } +} + +// Test 2: Restore the log using a checkpoint and the Recovery WAL. 
+TEST_F(RaftRecoveryIntegrationTest, + RaftStateRestoredAfterRecoveryWithCheckpoint) { + EXPECT_CALL(mock_commit, Commit(_)).Times(2); + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& aer = dynamic_cast(msg); + EXPECT_TRUE(aer.success()); + EXPECT_EQ(aer.lastlogindex(), 13); + return 0; + })); + + { + std::promise insert_done, ckpt_fired; + auto insert_done_future = insert_done.get_future(); + auto ckpt_fired_future = ckpt_fired.get_future(); + + int call_count = 0; + EXPECT_CALL(checkpoint_, GetStableCheckpoint()) + .WillRepeatedly(Invoke([&]() -> uint64_t { + ++call_count; + if (call_count == 1) + insert_done_future.get(); + else if (call_count == 2) + ckpt_fired.set_value(true); + return 5; + })); + + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + + recovery.WriteMetadata(/*current_term=*/5, /*voted_for=*/2, + /*snapshot_last_index=*/5, /*snapshot_last_term=*/5); + + for (int i = 1; i <= 8; ++i) { + Entry e; + e.set_term(i); + ClientTestRequest req; + req.set_value("Transaction " + std::to_string(i)); + req.SerializeToString(e.mutable_command()); + recovery.AddLogEntry(&e, i); + } + + insert_done.set_value(true); + ckpt_fired_future.get(); + + for (int i = 9; i <= 10; ++i) { + Entry e; + e.set_term(i); + ClientTestRequest req; + req.set_value("Transaction " + std::to_string(i)); + req.SerializeToString(e.mutable_command()); + recovery.AddLogEntry(&e, i); + } + } + + MockSignatureVerifier verifier; + ResDBConfig config = MakeConfig(); + MockLeaderElectionManager lem(config); + MockReplicaCommunicator comm; + MockCheckPoint ckpt; + + RaftRecovery recovery(config_, nullptr, nullptr, nullptr); + + Raft raft(/*id=*/1, /*f=*/1, /*total=*/4, &verifier, &lem, &comm, &recovery); + + raft.SetCommitFunc([&](const google::protobuf::Message& msg) { + return mock_commit.Commit(msg); + }); + raft.SetSingleCallFunc( + [&](int type, const 
google::protobuf::Message& msg, int node_id) { + return mock_call.Call(type, msg, node_id); + }); + + RecoverFromLogs(recovery, raft); + + EXPECT_EQ(raft.GetCurrentTerm(), 5u); + EXPECT_EQ(raft.GetVotedFor(), 2); + EXPECT_EQ(raft.GetSnapshotLastIndex(), 5u); + + auto aefields = CreateAeFields( + /*term=*/11, + /*leaderId=*/2, + /*prevLogIndex=*/10, + /*prevLogTerm=*/10, + /*entries=*/ + CreateLogEntries({ + {11, "Transaction 11"}, + {11, "Transaction 12"}, + {11, "Transaction 13"}, + }), + /*leaderCommit=*/7, + /*followerId=*/1); + + auto aemessage = CreateAeMessage(aefields); + + bool success = raft.ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); + + EXPECT_EQ(raft.GetCurrentTerm(), 11u); + // --- Assertions --- + EXPECT_EQ(raft.GetVotedFor(), -1); + EXPECT_EQ(raft.GetSnapshotLastIndex(), 5u); + EXPECT_EQ(raft.GetLastLogIndex(), 13u); + + // Log: index 0 is the sentinel (term/index=5), slots 1–5 are the replayed + // entries (6–10), slots 6–8 are the AppendEntries entries above (11–13). + ASSERT_EQ(raft.GetLogSize(), 9u); + const auto& le = raft.GetLog()[0]; + EXPECT_EQ(le.entry.term(), 5); + EXPECT_EQ(raft.GetLogTermAtIndex(5), 5); + + for (int i = 1; i <= 8; ++i) { + const auto& le = raft.GetLog()[i]; + if (i <= 5) { + EXPECT_EQ(le.entry.term(), i + 5); + } else { + EXPECT_EQ(le.entry.term(), 11); + } + EXPECT_EQ(raft.GetLogTermAtIndex(i + 5), le.entry.term()); + ClientTestRequest req; + req.ParseFromString(le.entry.command()); + EXPECT_EQ(req.value(), "Transaction " + std::to_string(i + 5)); + ClientTestRequest req2; + auto log_entry = raft.GetLogEntryAtIndex(i + 5); + req2.ParseFromString(log_entry.entry.command()); + EXPECT_EQ(req.value(), req2.value()); + } +} + +// Test 3: Demotion (higher-term AppendEntries) triggers WriteMetadata, and the +// updated metadata is visible after recovery. 
+TEST_F(RaftRecoveryIntegrationTest, DemotionTriggersWriteMetadata) { + { + MockSignatureVerifier verifier; + ResDBConfig config = MakeConfig(); + MockLeaderElectionManager lem(config); + MockReplicaCommunicator comm; + + RaftRecovery recovery(config_, nullptr, nullptr, nullptr); + + Raft raft(/*id=*/1, /*f=*/1, /*total=*/4, &verifier, &lem, &comm, + &recovery); + + recovery.WriteMetadata(/*current_term=*/3, /*voted_for=*/1, + /*snapshot_last_index=*/0, + /*snapshot_last_term=*/0); + + // Add a couple of entries so the log is non-trivial. + for (int i = 1; i <= 2; ++i) { + Entry e; + e.set_term(3); + ClientTestRequest req; + req.set_value("cmd-" + std::to_string(i)); + req.SerializeToString(e.mutable_command()); + recovery.AddLogEntry(&e, i); + } + + raft.SetStateForTest({ + .currentTerm = 6, + .role = Role::LEADER, + .log = CreateLogEntries({}, true), + }); + + raft.SetSingleCallFunc( + [&](int type, const google::protobuf::Message& msg, int node_id) { + return mock_call.Call(type, msg, node_id); + }); + + // Receive an AppendEntries from node 2 at a higher term. 
+ auto aefields = CreateAeFields( + /*term=*/7, + /*leaderId=*/2, + /*prevLogIndex=*/0, + /*prevLogTerm=*/0, + /*entries=*/{}, + /*leaderCommit=*/0, + /*followerId=*/1); + auto aemessage = CreateAeMessage(aefields); + + raft.PrintDebugState(); + bool success = raft.ReceiveAppendEntries( + std::make_unique(std::move(aemessage))); + EXPECT_TRUE(success); + + EXPECT_EQ(raft.GetCurrentTerm(), 7u); + EXPECT_EQ(raft.GetVotedFor(), -1); + } + + { + MockSignatureVerifier verifier; + ResDBConfig config = MakeConfig(); + MockLeaderElectionManager lem(config); + MockReplicaCommunicator comm; + + RaftRecovery recovery(config_, nullptr, nullptr, nullptr); + + Raft raft(/*id=*/1, /*f=*/1, /*total=*/4, &verifier, &lem, &comm, + &recovery); + + RecoverFromLogs(recovery, raft); + + EXPECT_EQ(raft.GetCurrentTerm(), 7u); + EXPECT_EQ(raft.GetVotedFor(), -1); + + // The two entries written before the demotion should still be present. + ASSERT_EQ(raft.GetLogSize(), 3u); + for (int i = 1; i <= 2; ++i) { + const auto& le = raft.GetLog()[i]; + EXPECT_EQ(le.entry.term(), 3); + ClientTestRequest req; + req.ParseFromString(le.entry.command()); + EXPECT_EQ(req.value(), "cmd-" + std::to_string(i)); + } + } +} + +// Test 4: A truncation that occurs after a checkpoint is replayed correctly. +TEST_F(RaftRecoveryIntegrationTest, TruncationPersistsAfterCheckpoint) { + // Timeline: + // - Write entries 1–5 (all term 3). + // - Checkpoint fires at stable index 2 → WAL is compacted up to index 2. + // - Truncate from index 4 onward. + // - Write new entries at index 4–5 with term 6 and different commands. 
+ { + std::promise insert_done, ckpt_fired; + auto insert_done_future = insert_done.get_future(); + auto ckpt_fired_future = ckpt_fired.get_future(); + + int call_count = 0; + EXPECT_CALL(checkpoint_, GetStableCheckpoint()) + .WillRepeatedly(Invoke([&]() -> uint64_t { + ++call_count; + if (call_count == 1) + insert_done_future.get(); // block until initial entries are in + else if (call_count == 2) + ckpt_fired.set_value(true); // signal that the checkpoint fired + return 2; // checkpoint covers indices 1–2 + })); + + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + + // Seed metadata. + recovery.WriteMetadata(/*current_term=*/3, /*voted_for=*/1, + /*snapshot_last_index=*/0, + /*snapshot_last_term=*/0); + + // Write entries 1–5 at term 3. + for (int i = 1; i <= 5; ++i) { + Entry e; + e.set_term(3); + ClientTestRequest req; + req.set_value("original-" + std::to_string(i)); + req.SerializeToString(e.mutable_command()); + recovery.AddLogEntry(&e, i); + } + + // Unblock the checkpoint poll and wait for it to fire. + insert_done.set_value(true); + ckpt_fired_future.get(); + + recovery.WriteMetadata(/*current_term=*/3, /*voted_for=*/1, + /*snapshot_last_index=*/2, + /*snapshot_last_term=*/3); + + // Truncate from index 4 onward (entries 4 and 5 are discarded). + // NOTE: Assumes RaftRecovery::TruncateLog(from_index) writes a + // kTruncation WAL record. Adjust the call if the API differs. + TruncationRecord truncation; + truncation.set_truncate_from_index(4); + truncation.set_truncate_from_term(3); + recovery.TruncateLog(truncation); + + // Rewrite indices 4–5 under term 6 with different commands. 
+ for (int i = 4; i <= 5; ++i) { + Entry e; + e.set_term(6); + ClientTestRequest req; + req.set_value("rewritten-" + std::to_string(i)); + req.SerializeToString(e.mutable_command()); + recovery.AddLogEntry(&e, i); + } + } + + { + MockSignatureVerifier verifier; + ResDBConfig config = MakeConfig(); + MockLeaderElectionManager lem(config); + MockReplicaCommunicator comm; + + RaftRecovery recovery(config_, nullptr, nullptr, nullptr); + + Raft raft(/*id=*/1, /*f=*/1, /*total=*/4, &verifier, &lem, &comm, + &recovery); + + // Recover and verify: indices 1–3 are untouched, 4–5 carry the new data. + RecoverFromLogs(recovery, raft); + + EXPECT_EQ(raft.GetSnapshotLastIndex(), 2u); + + // Sentinel (snapshot index 2) + entries 3–5 after truncation/rewrite = 4 total. + // The WAL after compaction starts from the checkpoint (index 2 sentinel), + // then replays entries 3, 4 (rewritten), 5 (rewritten). + EXPECT_EQ(raft.GetLogSize(), 4u); + + // Entry at absolute index 3 should be original. + { + auto le = raft.GetLogEntryAtIndex(3); + EXPECT_EQ(le.entry.term(), 3); + ClientTestRequest req; + req.ParseFromString(le.entry.command()); + EXPECT_EQ(req.value(), "original-3"); + } + + // Entries at absolute indices 4–5 must reflect the post-truncation rewrite. 
+ for (int i = 4; i <= 5; ++i) { + auto le = raft.GetLogEntryAtIndex(i); + EXPECT_EQ(le.entry.term(), 6); + ClientTestRequest req; + req.ParseFromString(le.entry.command()); + EXPECT_EQ(req.value(), "rewritten-" + std::to_string(i)); + } + + EXPECT_EQ(raft.GetLogTermAtIndex(4), 6); + EXPECT_EQ(raft.GetLogTermAtIndex(5), 6); + } +} + +} // namespace raft +} // namespace resdb \ No newline at end of file diff --git a/platform/consensus/ordering/raft/algorithm/raft_request_vote_response_test.cpp b/platform/consensus/ordering/raft/algorithm/raft_request_vote_response_test.cpp index b45c2c032f..08cd6c456c 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_request_vote_response_test.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_request_vote_response_test.cpp @@ -2,10 +2,6 @@ namespace resdb { namespace raft { -using ::testing::_; -using ::testing::AnyNumber; -using ::testing::Invoke; -using ::testing::Matcher; // Test 1: A candidate gets elected. TEST_F(RaftTest, CandidateGetsElected) { @@ -50,7 +46,7 @@ TEST_F(RaftTest, CandidateGetsElected) { raft_->SetStateForTest({.currentTerm = 2, .commitIndex = 1, - .lastApplied = 1, + .lastCommitted = 1, .role = Role::CANDIDATE, .log = CreateLogEntries( { @@ -185,7 +181,7 @@ TEST_F(RaftTest, CandidateIgnoresDuplicateVote) { raft_->SetStateForTest({.currentTerm = 2, .commitIndex = 1, - .lastApplied = 1, + .lastCommitted = 1, .role = Role::CANDIDATE, .log = CreateLogEntries( { diff --git a/platform/consensus/ordering/raft/algorithm/raft_request_vote_test.cpp b/platform/consensus/ordering/raft/algorithm/raft_request_vote_test.cpp index 8601234ed3..156a8ab138 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_request_vote_test.cpp +++ b/platform/consensus/ordering/raft/algorithm/raft_request_vote_test.cpp @@ -2,10 +2,6 @@ namespace resdb { namespace raft { -using ::testing::_; -using ::testing::AnyNumber; -using ::testing::Invoke; -using ::testing::Matcher; // Test 1: A follower times out, transitions 
to candidate, and starts an // election. @@ -239,5 +235,92 @@ TEST_F(RaftTest, FollowerRejectsRequestVoteBecauseAlreadyVoted) { EXPECT_EQ(raft_->GetRoleSnapshot(), Role::FOLLOWER); } +// Test 7: A follower times out and starts an election. Then, as a candidate +// times out and starts another election. +TEST_F(RaftTest, CandidateTimesOutAndStartsAnotherElection) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(1); + EXPECT_CALL(mock_broadcast, Broadcast(_, _)) + .WillOnce( + ::testing::Invoke([](int type, const google::protobuf::Message& msg) { + const auto& requestVote = dynamic_cast(msg); + EXPECT_EQ(requestVote.term(), 1); + EXPECT_EQ(requestVote.candidateid(), 1); + EXPECT_EQ(requestVote.lastlogindex(), 1); + EXPECT_EQ(requestVote.lastlogterm(), 0); + return 0; + })) + .WillOnce( + ::testing::Invoke([](int type, const google::protobuf::Message& msg) { + const auto& requestVote = dynamic_cast(msg); + EXPECT_EQ(requestVote.term(), 2); + EXPECT_EQ(requestVote.candidateid(), 1); + EXPECT_EQ(requestVote.lastlogindex(), 1); + EXPECT_EQ(requestVote.lastlogterm(), 0); + return 0; + })); + + raft_->SetStateForTest({ + .currentTerm = 0, + .role = Role::FOLLOWER, + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), + }); + + raft_->StartElection(); + EXPECT_EQ(raft_->GetVotedFor(), 1); + EXPECT_EQ(raft_->GetCurrentTerm(), 1); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::CANDIDATE); + + // Start another election after a timeout + raft_->StartElection(); + EXPECT_EQ(raft_->GetVotedFor(), 1); + EXPECT_EQ(raft_->GetCurrentTerm(), 2); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::CANDIDATE); +} + +// Test 8: A candidate receives a RequestVote from another candidate in the same +// term and does not demote. 
+TEST_F(RaftTest, CandidateReceivesRequestVoteFromSameTermAndDoesNotDemote) { + EXPECT_CALL(*leader_election_manager_, OnRoleChange()).Times(0); + EXPECT_CALL(mock_call, Call(_, _, _)) + .WillOnce(::testing::Invoke( + [](int type, const google::protobuf::Message& msg, int node_id) { + const auto& requestVoteResponse = + dynamic_cast(msg); + EXPECT_EQ(node_id, 2); + EXPECT_EQ(requestVoteResponse.term(), 1); + EXPECT_EQ(requestVoteResponse.voterid(), 1); + EXPECT_FALSE(requestVoteResponse.votegranted()); + return 0; + })); + EXPECT_CALL(*leader_election_manager_, OnHeartBeat()).Times(0); + + RequestVote rv; + rv.set_term(1); + rv.set_candidateid(2); + rv.set_lastlogindex(1); + rv.set_lastlogterm(0); + + raft_->SetStateForTest({ + .currentTerm = 1, + .votedFor = 1, + .role = Role::CANDIDATE, + .log = CreateLogEntries( + { + {0, "Term 0 Transaction 1"}, + }, + true), + }); + + raft_->ReceiveRequestVote(std::make_unique(rv)); + + EXPECT_EQ(raft_->GetVotedFor(), 1); + EXPECT_EQ(raft_->GetCurrentTerm(), 1); + EXPECT_EQ(raft_->GetRoleSnapshot(), Role::CANDIDATE); +} + } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft_test_util.h b/platform/consensus/ordering/raft/algorithm/raft_test_util.h new file mode 100644 index 0000000000..b468edc230 --- /dev/null +++ b/platform/consensus/ordering/raft/algorithm/raft_test_util.h @@ -0,0 +1,118 @@ +#pragma once + +#include + +#include "common/crypto/mock_signature_verifier.h" +#include "platform/config/resdb_config_utils.h" +#include "platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h" +#include "platform/consensus/ordering/raft/algorithm/raft.h" +#include "platform/networkstrate/mock_replica_communicator.h" +#include "platform/proto/client_test.pb.h" + +namespace resdb { +namespace raft { +namespace test_utils { + +inline ResDBConfig GenerateConfig() { + ResConfigData data; + data.set_duplicate_check_frequency_useconds(100000); + 
data.set_enable_viewchange(true); + return ResDBConfig({GenerateReplicaInfo(1, "127.0.0.1", 1234), + GenerateReplicaInfo(2, "127.0.0.1", 1235), + GenerateReplicaInfo(3, "127.0.0.1", 1236), + GenerateReplicaInfo(4, "127.0.0.1", 1237)}, + GenerateReplicaInfo(1, "127.0.0.1", 1234), data); +} + +class MockSendMessageFunction { + public: + MOCK_METHOD(int, Call, (int, const google::protobuf::Message&, int)); +}; +class MockBroadcastFunction { + public: + MOCK_METHOD(int, Broadcast, (int, const google::protobuf::Message&)); +}; +class MockCommitFunction { + public: + MOCK_METHOD(int, Commit, (const google::protobuf::Message&)); +}; + +inline AeFields CreateAeFields(uint64_t term, int leaderId, + uint64_t prevLogIndex, uint64_t prevLogTerm, + const std::vector& entries, + uint64_t leaderCommit, int followerId) { + AeFields fields{}; + fields.term = term; + fields.leaderId = leaderId; + fields.leaderCommit = leaderCommit; + fields.prevLogIndex = prevLogIndex; + fields.prevLogTerm = prevLogTerm; + fields.followerId = followerId; + + for (const auto& logEntry : entries) { + LogEntry log_entry; + log_entry.entry.set_term(logEntry.entry.term()); + log_entry.entry.set_command(logEntry.entry.command()); + fields.entries.push_back(std::move(log_entry)); + } + + return fields; +}; + +// Helper to create a single log entry. +inline LogEntry CreateLogEntry(uint64_t term, const std::string& command_data) { + LogEntry log_entry; + log_entry.entry.set_term(term); + log_entry.entry.set_command(command_data); + return log_entry; +} + +// Helper to create a vector of log entries for testing. 
+inline std::vector CreateLogEntries( + const std::vector>& term_and_cmds, + bool usedForLogPatch = false) { + std::vector entries; + + if (usedForLogPatch) { + LogEntry first_entry; + first_entry.entry.set_term(0); + first_entry.entry.set_command("COMMON_PREFIX"); + entries.push_back(first_entry); + } + + for (const auto& [term, cmd] : term_and_cmds) { + LogEntry log_entry; + log_entry.entry.set_term(term); + + ClientTestRequest req; + req.set_value(cmd); + + std::string serialized; + req.SerializeToString(&serialized); + log_entry.entry.set_command(serialized); + + entries.push_back(log_entry); + } + + return entries; +} + +inline AppendEntries CreateAeMessage(const AeFields& fields) { + AppendEntries ae; + ae.set_term(fields.term); + ae.set_leaderid(fields.leaderId); + ae.set_prevlogindex(fields.prevLogIndex); + ae.set_prevlogterm(fields.prevLogTerm); + ae.set_leadercommitindex(fields.leaderCommit); + for (const auto& log_entry : fields.entries) { + auto* newEntry = ae.add_entries(); + newEntry->set_term(log_entry.entry.term()); + newEntry->set_command(log_entry.entry.command()); + } + + return ae; +} + +} // namespace test_utils +} // namespace raft +} // namespace resdb diff --git a/platform/consensus/ordering/raft/algorithm/raft_tests.h b/platform/consensus/ordering/raft/algorithm/raft_tests.h index 932d9a4a79..5ead2ec963 100644 --- a/platform/consensus/ordering/raft/algorithm/raft_tests.h +++ b/platform/consensus/ordering/raft/algorithm/raft_tests.h @@ -1,45 +1,21 @@ -#include - -#include "common/crypto/mock_signature_verifier.h" -#include "platform/config/resdb_config_utils.h" -#include "platform/consensus/ordering/raft/algorithm/mock_leader_election_manager.h" -#include "platform/consensus/ordering/raft/algorithm/raft.h" -#include "platform/networkstrate/mock_replica_communicator.h" -#include "platform/proto/client_test.pb.h" +#include "platform/consensus/ordering/raft/algorithm/raft_test_util.h" #include "platform/consensus/recovery/mock_raft_recovery.h" 
namespace resdb { namespace raft { +using resdb::raft::test_utils::CreateAeFields; +using resdb::raft::test_utils::CreateAeMessage; +using resdb::raft::test_utils::CreateLogEntries; +using resdb::raft::test_utils::GenerateConfig; +using resdb::raft::test_utils::MockBroadcastFunction; +using resdb::raft::test_utils::MockCommitFunction; +using resdb::raft::test_utils::MockSendMessageFunction; using ::testing::_; using ::testing::AnyNumber; using ::testing::Invoke; using ::testing::Matcher; -ResDBConfig GenerateConfig() { - ResConfigData data; - data.set_duplicate_check_frequency_useconds(100000); - data.set_enable_viewchange(true); - return ResDBConfig({GenerateReplicaInfo(1, "127.0.0.1", 1234), - GenerateReplicaInfo(2, "127.0.0.1", 1235), - GenerateReplicaInfo(3, "127.0.0.1", 1236), - GenerateReplicaInfo(4, "127.0.0.1", 1237)}, - GenerateReplicaInfo(1, "127.0.0.1", 1234), data); -} - class RaftTest : public ::testing::Test { - private: - class MockSendMessageFunction { - public: - MOCK_METHOD(int, Call, (int, const google::protobuf::Message&, int)); - }; - class MockBroadcastFunction { - public: - MOCK_METHOD(int, Broadcast, (int, const google::protobuf::Message&)); - }; - class MockCommitFunction { - public: - MOCK_METHOD(int, Commit, (const google::protobuf::Message&)); - }; protected: void SetUp() override { @@ -70,83 +46,6 @@ class RaftTest : public ::testing::Test { }); } - AeFields CreateAeFields(uint64_t term, int leaderId, uint64_t prevLogIndex, - uint64_t prevLogTerm, - const std::vector& entries, - uint64_t leaderCommit, int followerId) { - AeFields fields{}; - fields.term = term; - fields.leaderId = leaderId; - fields.leaderCommit = leaderCommit; - fields.prevLogIndex = prevLogIndex; - fields.prevLogTerm = prevLogTerm; - fields.followerId = followerId; - - for (const auto& logEntry : entries) { - LogEntry log_entry; - log_entry.entry.set_term(logEntry.entry.term()); - log_entry.entry.set_command(logEntry.entry.command()); - 
fields.entries.push_back(std::move(log_entry)); - } - - return fields; - }; - - // Helper to create a single log entry. - LogEntry CreateLogEntry(uint64_t term, - const std::string& command_data) { - LogEntry log_entry; - log_entry.entry.set_term(term); - log_entry.entry.set_command(command_data); - return log_entry; - } - - // Helper to create a vector of log entries for testing. - std::vector CreateLogEntries( - const std::vector>& term_and_cmds, - bool usedForLogPatch = false) { - std::vector entries; - - if (usedForLogPatch) { - LogEntry first_entry; - first_entry.entry.set_term(0); - first_entry.entry.set_command("COMMON_PREFIX"); - entries.push_back(first_entry); - } - - for (const auto& [term, cmd] : term_and_cmds) { - LogEntry log_entry; - log_entry.entry.set_term(term); - - ClientTestRequest req; - req.set_value(cmd); - - std::string serialized; - req.SerializeToString(&serialized); - log_entry.entry.set_command(serialized); - - entries.push_back(log_entry); - } - - return entries; - } - - AppendEntries CreateAeMessage(const AeFields& fields) { - AppendEntries ae; - ae.set_term(fields.term); - ae.set_leaderid(fields.leaderId); - ae.set_prevlogindex(fields.prevLogIndex); - ae.set_prevlogterm(fields.prevLogTerm); - ae.set_leadercommitindex(fields.leaderCommit); - for (const auto& log_entry : fields.entries) { - auto* newEntry = ae.add_entries(); - newEntry->set_term(log_entry.entry.term()); - newEntry->set_command(log_entry.entry.command()); - } - - return ae; - } - std::unique_ptr verifier_; std::unique_ptr leader_election_manager_; std::unique_ptr replica_communicator_; diff --git a/platform/consensus/ordering/raft/framework/BUILD b/platform/consensus/ordering/raft/framework/BUILD index 0d9d75f728..6c70d57835 100644 --- a/platform/consensus/ordering/raft/framework/BUILD +++ b/platform/consensus/ordering/raft/framework/BUILD @@ -20,20 +20,13 @@ package(default_visibility = ["//platform/consensus/ordering/raft:__subpackages_ cc_library( - name = 
"checkpoint_manager", - srcs = ["checkpoint_manager.cpp"], - hdrs = ["checkpoint_manager.h"], + name = "raft_checkpoint_manager", + hdrs = ["raft_checkpoint_manager.h"], + visibility = [ + "//visibility:public", + ], deps = [ - "//platform/consensus/ordering/common:transaction_utils", - "//chain/state:chain_state", - "//common/crypto:signature_verifier", - "//interface/common:resdb_txn_accessor", - "//platform/config:resdb_config", - "//platform/consensus/checkpoint", - "//platform/consensus/execution:transaction_executor", - "//platform/networkstrate:replica_communicator", - "//platform/networkstrate:server_comm", - "//platform/proto:checkpoint_info_cc_proto", + "//platform/consensus/checkpoint:checkpoint" ], ) @@ -45,7 +38,7 @@ cc_library( "//visibility:public", ], deps = [ - ":checkpoint_manager", + ":raft_checkpoint_manager", "//platform/consensus/recovery:raft_recovery", "//common/utils", "//platform/consensus/ordering/common/framework:consensus", diff --git a/platform/consensus/ordering/raft/framework/checkpoint_manager.cpp b/platform/consensus/ordering/raft/framework/checkpoint_manager.cpp deleted file mode 100644 index 1a34bbbf47..0000000000 --- a/platform/consensus/ordering/raft/framework/checkpoint_manager.cpp +++ /dev/null @@ -1,558 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "platform/consensus/ordering/raft/framework/checkpoint_manager.h" - -#include - -#include "platform/consensus/ordering/common/transaction_utils.h" -#include "platform/proto/checkpoint_info.pb.h" - -namespace resdb { - -CheckPointManager::CheckPointManager(const ResDBConfig& config, - ReplicaCommunicator* replica_communicator, - SignatureVerifier* verifier, - SystemInfo* sys_info) - : config_(config), - replica_communicator_(replica_communicator), - verifier_(verifier), - stop_(false), - txn_accessor_(config), - highest_prepared_seq_(0), - sys_info_(sys_info) { - current_stable_seq_ = 0; - if (config_.GetConfigData().enable_viewchange()) { - config_.EnableCheckPoint(true); - } - if (config_.IsCheckPointEnabled()) { - stable_checkpoint_thread_ = - std::thread(&CheckPointManager::UpdateStableCheckPointStatus, this); - checkpoint_thread_ = - std::thread(&CheckPointManager::UpdateCheckPointStatus, this); - status_thread_ = std::thread(&CheckPointManager::SyncStatus, this); - } - sem_init(&committable_seq_signal_, 0, 0); -} - -CheckPointManager::~CheckPointManager() { Stop(); } - -void CheckPointManager::Stop() { - stop_ = true; - if (checkpoint_thread_.joinable()) { - checkpoint_thread_.join(); - } - if (stable_checkpoint_thread_.joinable()) { - stable_checkpoint_thread_.join(); - } - if (status_thread_.joinable()) { - status_thread_.join(); - } -} - -void CheckPointManager::SetResetExecute( - std::function func) { - reset_execute_func_ = func; -} - -std::string GetHash(const std::string& h1, const std::string& h2) { - return SignatureVerifier::CalculateHash(h1 + h2); -} - -uint64_t CheckPointManager::GetStableCheckpoint() { - std::lock_guard lk(mutex_); - return current_stable_seq_; -} - -StableCheckPoint CheckPointManager::GetStableCheckpointWithVotes() { - std::lock_guard lk(mutex_); - return stable_ckpt_; -} - -void 
CheckPointManager::AddCommitData(std::unique_ptr request) { - if (config_.IsCheckPointEnabled()) { - data_queue_.Push(std::move(request)); - } -} - -// check whether there are 2f+1 valid checkpoint proof. -bool CheckPointManager::IsValidCheckpointProof( - const StableCheckPoint& stable_ckpt) { - std::string hash = stable_ckpt_.hash(); - std::set senders; - for (const auto& signature : stable_ckpt_.signatures()) { - if (!verifier_->VerifyMessage(hash, signature)) { - return false; - } - senders.insert(signature.node_id()); - } - - return (static_cast(senders.size()) >= config_.GetMinDataReceiveNum()) || - (stable_ckpt.seq() == 0 && senders.size() == 0); -} - -int CheckPointManager::ProcessCheckPoint(std::unique_ptr context, - std::unique_ptr request) { - CheckPointData checkpoint_data; - if (!checkpoint_data.ParseFromString(request->data())) { - LOG(ERROR) << "parse checkpont data fail:"; - return -2; - } - uint64_t checkpoint_seq = checkpoint_data.seq(); - uint32_t sender_id = request->sender_id(); - LOG(ERROR) << " receive ckpt:" << checkpoint_seq << " from:" << sender_id; - int water_mark = config_.GetCheckPointWaterMark(); - if (checkpoint_seq % water_mark) { - LOG(ERROR) << "checkpoint seq not invalid:" << checkpoint_seq; - return -2; - } - - if (verifier_) { - // check signatures - bool valid = verifier_->VerifyMessage(checkpoint_data.hash(), - checkpoint_data.hash_signature()); - if (!valid) { - LOG(ERROR) << "request is not valid:" - << checkpoint_data.hash_signature().DebugString(); - return -2; - } - } - - { - std::lock_guard lk(mutex_); - auto res = - sender_ckpt_[std::make_pair(checkpoint_seq, checkpoint_data.hash())] - .insert(sender_id); - if (res.second) { - sign_ckpt_[std::make_pair(checkpoint_seq, checkpoint_data.hash())] - .push_back(checkpoint_data.hash_signature()); - new_data_++; - } - if (sender_ckpt_[std::make_pair(checkpoint_seq, checkpoint_data.hash())] - .size() == 1) { - for (auto& hash_ : checkpoint_data.hashs()) { - 
hash_ckpt_[std::make_pair(checkpoint_seq, checkpoint_data.hash())] - .push_back(hash_); - } - } - Notify(); - } - return 0; -} - -void CheckPointManager::Notify() { - std::lock_guard lk(cv_mutex_); - cv_.notify_all(); -} - -bool CheckPointManager::Wait() { - int timeout_ms = 1000; - std::unique_lock lk(cv_mutex_); - return cv_.wait_for(lk, std::chrono::milliseconds(timeout_ms), - [&] { return new_data_ > 0; }); -} - -void CheckPointManager::CheckHealthy() { - uint32_t current_time = time(nullptr); - - std::map seqs; - - for (int i = 1; i <= config_.GetReplicaNum(); ++i) { - if (last_update_time_.find(i) == last_update_time_.end() || - last_update_time_[i] == 0) { - continue; - } - LOG(ERROR) << " check healthy, replica:" << i - << " current time:" << current_time - << " last time:" << last_update_time_[i] - << " timeout:" << replica_timeout_ - << " pass:" << current_time - last_update_time_[i]; - if (current_time - last_update_time_[i] > replica_timeout_) { - TimeoutHandler(i); - } - seqs[status_[i]]++; - } - - uint64_t unstable_check_ckpt = 0; - for (auto it : seqs) { - int num = 0; - for (auto sit : seqs) { - if (sit.first < it.first) { - continue; - } - num += sit.second; - } - if (num >= config_.GetMinDataReceiveNum()) { - unstable_check_ckpt = std::max(unstable_check_ckpt, it.first); - } - } - SetUnstableCkpt(unstable_check_ckpt); -} - -void CheckPointManager::UpdateStableCheckPointStatus() { - uint64_t last_committable_seq = 0; - while (!stop_) { - if (!Wait()) { - continue; - } - uint64_t stable_seq = 0; - std::string stable_hash; - { - std::lock_guard lk(mutex_); - for (auto it : sender_ckpt_) { - if (it.second.size() >= - static_cast(config_.GetMinCheckpointReceiveNum())) { - committable_seq_ = it.first.first; - committable_hash_ = it.first.second; - std::set senders_ = - sender_ckpt_[std::make_pair(committable_seq_, committable_hash_)]; - sem_post(&committable_seq_signal_); - } - if (it.second.size() >= - static_cast(config_.GetMinDataReceiveNum())) { - 
stable_seq = it.first.first; - stable_hash = it.first.second; - } - } - new_data_ = 0; - } - - LOG(ERROR) << "current stable seq:" << current_stable_seq_ - << " stable seq:" << stable_seq; - if (stable_seq == 0) { - continue; - } - std::vector votes; - if (current_stable_seq_ < stable_seq) { - std::lock_guard lk(mutex_); - votes = sign_ckpt_[std::make_pair(stable_seq, stable_hash)]; - std::set senders_ = - sender_ckpt_[std::make_pair(stable_seq, stable_hash)]; - - auto it = sender_ckpt_.begin(); - while (it != sender_ckpt_.end()) { - if (it->first.first <= stable_seq) { - sign_ckpt_.erase(sign_ckpt_.find(it->first)); - auto tmp = it++; - sender_ckpt_.erase(tmp); - } else { - it++; - } - } - stable_ckpt_.set_seq(stable_seq); - stable_ckpt_.set_hash(stable_hash); - stable_ckpt_.mutable_signatures()->Clear(); - for (auto vote : votes) { - *stable_ckpt_.add_signatures() = vote; - } - current_stable_seq_ = stable_seq; - } - UpdateStableCheckPointCallback(current_stable_seq_); - } -} - -void CheckPointManager::SetTimeoutHandler( - std::function timeout_handler) { - timeout_handler_ = timeout_handler; -} - -void CheckPointManager::TimeoutHandler() { - if (timeout_handler_) { - timeout_handler_(0); - } -} - -void CheckPointManager::TimeoutHandler(uint32_t replica) { - if (timeout_handler_) { - timeout_handler_(replica); - } -} - -void CheckPointManager::SetLastCommit(uint64_t seq) { - LOG(ERROR) << " set last commit:" << seq; - last_seq_ = seq; - std::lock_guard lk(lt_mutex_); - committed_status_.clear(); -} - -uint64_t CheckPointManager::GetLastCommit() { return last_seq_; } - -int CheckPointManager::ProcessStatusSync(std::unique_ptr context, - std::unique_ptr request) { - CheckPointData checkpoint_data; - if (!checkpoint_data.ParseFromString(request->data())) { - LOG(ERROR) << "parse checkpont data fail:"; - return -2; - } - uint64_t seq = checkpoint_data.seq(); - uint32_t sender_id = request->sender_id(); - uint32_t primary_id = checkpoint_data.primary_id(); - uint32_t 
view = checkpoint_data.view(); - - status_[sender_id] = seq; - last_update_time_[sender_id] = time(nullptr); - view_status_[sender_id] = std::make_pair(primary_id, view); - LOG(ERROR) << " received from :" << sender_id << " commit status:" << seq - << " primary:" << primary_id << " view:" << view; - return 0; -} - -void CheckPointManager::CheckStatus(uint64_t last_seq) { - std::vector seqs; - for (auto it : status_) { - seqs.push_back(it.second); - } - - sort(seqs.begin(), seqs.end()); - int f = config_.GetMaxMaliciousReplicaNum(); - - if (seqs.size() <= f + 1) { - return; - } - // uint64_t min_seq = seqs[f + 1]; - uint64_t min_seq = seqs.back(); - - LOG(ERROR) << " check last seq:" << last_seq << " max seq:" << min_seq; - if (last_seq < min_seq) { - // need recovery from others - reset_execute_func_(last_seq + 1); - BroadcastRecovery(last_seq + 1, std::min(min_seq, last_seq + 500)); - } -} - -void CheckPointManager::CheckSysStatus() { - int f = config_.GetMaxMaliciousReplicaNum(); - - std::map, int> views; - int current_primary = 0; - uint64_t current_view = 0; - for (auto it : view_status_) { - views[it.second]++; - if (views[it.second] >= 2 * f + 1) { - current_primary = it.second.first; - current_view = it.second.second; - } - } - - if (current_primary > 0 && current_primary != sys_info_->GetPrimaryId() && - current_view > sys_info_->GetCurrentView()) { - sys_info_->SetCurrentView(current_view); - sys_info_->SetPrimary(current_primary); - LOG(ERROR) << " change to primary:" << current_primary - << " view:" << current_view; - } -} - -void CheckPointManager::SyncStatus() { - uint64_t last_check_seq = 0; - uint64_t last_time = 0; - while (!stop_) { - uint64_t last_seq = last_seq_; - - CheckPointData checkpoint_data; - std::unique_ptr checkpoint_request = NewRequest( - Request::TYPE_STATUS_SYNC, Request(), config_.GetSelfInfo().id()); - checkpoint_data.set_seq(last_seq); - checkpoint_data.set_view(sys_info_->GetCurrentView()); - 
checkpoint_data.set_primary_id(sys_info_->GetPrimaryId()); - checkpoint_data.SerializeToString(checkpoint_request->mutable_data()); - replica_communicator_->BroadCast(*checkpoint_request); - - LOG(ERROR) << " sync status last seq:" << last_seq - << " last time:" << last_time - << " primary:" << sys_info_->GetPrimaryId() - << " view:" << sys_info_->GetCurrentView(); - if (last_check_seq == last_seq && last_time > 5) { - CheckStatus(last_seq); - last_time = 0; - } - CheckSysStatus(); - - if (last_seq != last_check_seq) { - last_check_seq = last_seq; - last_time = 0; - } - CheckHealthy(); - sleep(10); - last_time++; - } -} - -void CheckPointManager::UpdateCheckPointStatus() { - uint64_t last_ckpt_seq = 0; - int water_mark = config_.GetCheckPointWaterMark(); - int timeout_ms = config_.GetViewchangeCommitTimeout(); - std::vector stable_hashs; - std::vector stable_seqs; - std::map> pendings; - while (!stop_) { - std::unique_ptr request = nullptr; - if (!pendings.empty()) { - LOG(ERROR) << " last seq:" << last_seq_ - << " pending:" << pendings.begin()->second->seq(); - if (pendings.begin()->second->seq() == last_seq_ + 1) { - request = std::move(pendings.begin()->second); - pendings.erase(pendings.begin()); - } - } - if (request == nullptr) { - request = data_queue_.Pop(timeout_ms); - } - if (request == nullptr) { - continue; - } - std::string hash_ = request->hash(); - uint64_t current_seq = request->seq(); - LOG(ERROR) << "update checkpoint seq :" << last_seq_ - << " current:" << current_seq; - if (current_seq != last_seq_ + 1) { - LOG(ERROR) << "seq invalid:" << last_seq_ << " current:" << current_seq; - if (current_seq > last_seq_ + 1) { - pendings[current_seq] = std::move(request); - } - continue; - } - { - std::lock_guard lk(lt_mutex_); - last_hash_ = GetHash(last_hash_, request->hash()); - last_seq_++; - } - bool is_recovery = request->is_recovery(); - - LOG(ERROR) << " current seq:" << current_seq << " water mark:" << water_mark - << " current stable seq:" << 
current_stable_seq_; - if (current_seq > 0 && current_seq % water_mark == 0) { - last_ckpt_seq = current_seq; - BroadcastCheckPoint(last_ckpt_seq, last_hash_, stable_hashs, stable_seqs); - } - ClearCommittedStatus(current_seq); - } - return; -} - -void CheckPointManager::BroadcastCheckPoint( - uint64_t seq, const std::string& hash, - const std::vector& stable_hashs, - const std::vector& stable_seqs) { - CheckPointData checkpoint_data; - std::unique_ptr checkpoint_request = NewRequest( - Request::TYPE_CHECKPOINT, Request(), config_.GetSelfInfo().id()); - checkpoint_data.set_seq(seq); - checkpoint_data.set_hash(hash); - if (verifier_) { - auto signature_or = verifier_->SignMessage(hash); - if (!signature_or.ok()) { - LOG(ERROR) << "Sign message fail"; - return; - } - *checkpoint_data.mutable_hash_signature() = *signature_or; - } - - checkpoint_data.SerializeToString(checkpoint_request->mutable_data()); - replica_communicator_->BroadCast(*checkpoint_request); -} - -void CheckPointManager::BroadcastRecovery(uint64_t min_seq, uint64_t max_seq) { - RecoveryRequest recovery_data; - std::unique_ptr recovery_request = NewRequest( - Request::TYPE_RECOVERY_DATA, Request(), config_.GetSelfInfo().id()); - recovery_data.set_min_seq(min_seq); - recovery_data.set_max_seq(max_seq); - recovery_data.SerializeToString(recovery_request->mutable_data()); - - LOG(ERROR) << " recovery request [" << min_seq << "," << max_seq << "]"; - replica_communicator_->BroadCast(*recovery_request); -} - -void CheckPointManager::WaitSignal() { - std::unique_lock lk(mutex_); - signal_.wait(lk, [&] { return !stable_hash_queue_.Empty(); }); -} - -std::unique_ptr> -CheckPointManager::PopStableSeqHash() { - return stable_hash_queue_.Pop(); -} - -uint64_t CheckPointManager::GetHighestPreparedSeq() { - std::lock_guard lk(lt_mutex_); - LOG(ERROR) << "get high prepared seq:" << highest_prepared_seq_; - return highest_prepared_seq_; -} - -void CheckPointManager::SetHighestPreparedSeq(uint64_t seq) { - LOG(ERROR) 
<< "set high prepared seq:" << seq; - std::lock_guard lk(lt_mutex_); - highest_prepared_seq_ = seq; -} - -sem_t* CheckPointManager::CommitableSeqSignal() { - std::lock_guard lk(lt_mutex_); - return &committable_seq_signal_; -} - -uint64_t CheckPointManager::GetCommittableSeq() { - std::lock_guard lk(lt_mutex_); - return committable_seq_; -} - -void CheckPointManager::SetUnstableCkpt(uint64_t unstable_check_ckpt) { - LOG(ERROR) << " set unstable ckpt:" << unstable_check_ckpt; - { - std::lock_guard lk(lt_mutex_); - unstable_check_ckpt_ = unstable_check_ckpt; - } -} - -uint64_t CheckPointManager::GetUnstableCkpt() { - std::lock_guard lk(lt_mutex_); - LOG(ERROR) << " get unstable ckpt:" << unstable_check_ckpt_; - return unstable_check_ckpt_; -} - -void CheckPointManager::AddCommitState(uint64_t seq) { - LOG(ERROR) << " add commited state:" << seq; - std::lock_guard lk(lt_mutex_); - committed_status_[seq] = true; -} - -bool CheckPointManager::IsCommitted(uint64_t seq) { - std::lock_guard lk(lt_mutex_); - if (seq < last_seq_) { - return true; - } - return committed_status_.find(seq) != committed_status_.end(); -} - -void CheckPointManager::ClearCommittedStatus(uint64_t seq) { - std::lock_guard lk(lt_mutex_); - while (!committed_status_.empty()) { - if (committed_status_.begin()->first <= seq) { - committed_status_.erase(committed_status_.begin()); - } else { - break; - } - } -} - -// void CheckPointManager::SetLastExecutedSeq(uint64_t latest_executed_seq){ -// latest_executed_seq = executor_->get_latest_executed_seq(); -// } - -} // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/checkpoint_manager.h b/platform/consensus/ordering/raft/framework/checkpoint_manager.h deleted file mode 100644 index 8a91638574..0000000000 --- a/platform/consensus/ordering/raft/framework/checkpoint_manager.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#pragma once - -#include - -#include "chain/state/chain_state.h" -#include "common/crypto/signature_verifier.h" -#include "interface/common/resdb_txn_accessor.h" -#include "platform/config/resdb_config.h" -#include "platform/consensus/checkpoint/checkpoint.h" -#include "platform/consensus/execution/transaction_executor.h" -#include "platform/networkstrate/replica_communicator.h" -#include "platform/networkstrate/server_comm.h" -#include "platform/proto/checkpoint_info.pb.h" -#include "platform/proto/resdb.pb.h" - -namespace resdb { - -class CheckPointManager : public CheckPoint { - public: - CheckPointManager(const ResDBConfig& config, - ReplicaCommunicator* replica_communicator, - SignatureVerifier* verifier, SystemInfo* sys_info); - virtual ~CheckPointManager(); - - void SetLastCommit(uint64_t seq); - uint64_t GetLastCommit(); - - void AddCommitData(std::unique_ptr request); - int ProcessCheckPoint(std::unique_ptr context, - std::unique_ptr request); - int ProcessStatusSync(std::unique_ptr context, - std::unique_ptr request); - - uint64_t GetStableCheckpoint() override; - // void SetLastExecutedSeq(uint64_t latest_executed_seq); - StableCheckPoint GetStableCheckpointWithVotes(); - bool IsValidCheckpointProof(const StableCheckPoint& stable_ckpt); - - void 
SetTimeoutHandler(std::function timeout_handler); - virtual void UpdateStableCheckPointCallback( - int64_t current_stable_checkpoint) {} - - void Stop(); - - void TimeoutHandler(); - void TimeoutHandler(uint32_t replica); - - void WaitSignal(); - std::unique_ptr> PopStableSeqHash(); - - void SetExecutor(TransactionExecutor* executor) { executor_ = executor; } - - uint64_t GetHighestPreparedSeq(); - - void SetHighestPreparedSeq(uint64_t seq); - - sem_t* CommitableSeqSignal(); - - uint64_t GetCommittableSeq(); - - void SetUnstableCkpt(uint64_t unstable_check_ckpt); - - uint64_t GetUnstableCkpt(); - - void AddCommitState(uint64_t seq); - - bool IsCommitted(uint64_t seq); - void ClearCommittedStatus(uint64_t seq); - - void SetResetExecute(std::function); - - private: - void UpdateCheckPointStatus(); - void UpdateStableCheckPointStatus(); - void BroadcastCheckPoint(uint64_t seq, const std::string& hash, - const std::vector& stable_hashs, - const std::vector& stable_seqs); - - void Notify(); - bool Wait(); - void BroadcastRecovery(uint64_t min_seq, uint64_t max_seq); - - void SyncStatus(); - void StatusProcess(); - void CheckStatus(uint64_t last_seq); - void CheckSysStatus(); - void CheckHealthy(); - - protected: - uint64_t last_executed_seq_ = 0; - ResDBConfig config_; - ReplicaCommunicator* replica_communicator_; - std::thread checkpoint_thread_, stable_checkpoint_thread_, status_thread_; - SignatureVerifier* verifier_; - std::atomic stop_; - std::map, std::set> sender_ckpt_; - std::map, std::vector> - sign_ckpt_; - std::map, std::vector> - hash_ckpt_; - std::atomic current_stable_seq_; - std::mutex mutex_; - LockFreeQueue data_queue_; - std::mutex cv_mutex_; - std::condition_variable cv_; - std::function timeout_handler_; - StableCheckPoint stable_ckpt_; - int new_data_ = 0; - LockFreeQueue> stable_hash_queue_; - std::condition_variable signal_; - ResDBTxnAccessor txn_accessor_; - std::mutex lt_mutex_, seq_mutex_; - uint64_t last_seq_ = 0; - uint64_t max_seq_ = 0; - 
TransactionExecutor* executor_; - std::atomic highest_prepared_seq_; - uint64_t committable_seq_ = 0; - std::string last_hash_, committable_hash_; - sem_t committable_seq_signal_; - std::map status_; - std::map last_update_time_; - int replica_timeout_ = 60; - uint64_t unstable_check_ckpt_; - std::map committed_status_; - std::function reset_execute_func_; - SystemInfo* sys_info_; - std::map> view_status_; -}; - -} // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/consensus.cpp b/platform/consensus/ordering/raft/framework/consensus.cpp index fb3e06735c..c6c01117cb 100644 --- a/platform/consensus/ordering/raft/framework/consensus.cpp +++ b/platform/consensus/ordering/raft/framework/consensus.cpp @@ -32,13 +32,14 @@ namespace raft { Consensus::Consensus(const ResDBConfig& config, std::unique_ptr executor) : common::Consensus(config, std::move(executor)), - leader_election_manager_(std::make_unique(config_)), - system_info_(std::make_unique(config_)), - checkpoint_manager_(std::make_unique( - config_, GetBroadCastClient(), GetSignatureVerifier(), - system_info_.get())), - recovery_(std::make_unique(config_, checkpoint_manager_.get(), - transaction_executor_->GetStorage())) { + leader_election_manager_( + std::make_unique(config_)), + system_info_(std::make_unique(config_)), + raft_checkpoint_manager_(std::make_unique()), + recovery_(std::make_unique( + config_, raft_checkpoint_manager_.get(), + transaction_executor_->GetStorage(), + [this](uint64_t seq) { OnCheckpointFinish(seq); })) { //LOG(INFO) << "JIM -> " << __FUNCTION__ << ": In consensus constructor"; int total_replicas = config_.GetReplicaNum(); int f = (total_replicas - 1) / 3; @@ -126,6 +127,8 @@ void Consensus::RecoverFromLogs() { << " voted for: " << metadata.voted_for; raft_->SetCurrentTerm(metadata.current_term, false); raft_->SetVotedFor(metadata.voted_for, false); + raft_->SetSnapshotLastIndexAndTerm(metadata.snapshot_last_index, + metadata.snapshot_last_term, false); }, 
[&](std::unique_ptr record) { switch (record->payload_case()) { @@ -144,7 +147,7 @@ void Consensus::RecoverFromLogs() { break; } }, - [&](int seq) { raft_->SetSeqIndexCoveredBySnapshot(seq); }); + [](int) {}); } int Consensus::ProcessNewTransaction(std::unique_ptr request) { @@ -162,5 +165,28 @@ int Consensus::CommitMsg(const google::protobuf::Message& msg) { return 0; } +int Consensus::ResponseMsg(const BatchUserResponse& batch_resp) { + // While we may receive these ResponseMsg's out of order, we do know the + // execution of transactions are guaranteed to be in order, so we know all + // transactions before batch_resp.seq() have been executed. + last_applied_ = std::max(batch_resp.seq(), last_applied_); + + // raft_checkpoint_manager_->SetStableCheckpoint(batch_resp.seq()); + if (batch_resp.seq() >= snapshot_interval_ + last_snapshot_initiated_at_) { + LOG(INFO) << "Initiating checkpoint at seq: " << batch_resp.seq(); + // Update the checkpoint in the manager + raft_checkpoint_manager_->SetStableCheckpoint(batch_resp.seq()); + last_snapshot_initiated_at_ = batch_resp.seq(); + LOG(INFO) << "Next Checkpoint will be after " + << (snapshot_interval_ + last_snapshot_initiated_at_); + } + return common::Consensus::ResponseMsg(batch_resp); +}; + +void Consensus::OnCheckpointFinish(uint64_t seq) { + LOG(INFO) << "Checkpointed all entries up to " << seq; + // raft_->TruncatePrefix(seq); +} + } // namespace raft } // namespace resdb diff --git a/platform/consensus/ordering/raft/framework/consensus.h b/platform/consensus/ordering/raft/framework/consensus.h index 4ca3fe711f..68ed55c44e 100644 --- a/platform/consensus/ordering/raft/framework/consensus.h +++ b/platform/consensus/ordering/raft/framework/consensus.h @@ -23,7 +23,7 @@ #include "platform/consensus/ordering/common/framework/consensus.h" #include "platform/consensus/ordering/raft/algorithm/leaderelection_manager.h" #include "platform/consensus/ordering/raft/algorithm/raft.h" -#include 
"platform/consensus/ordering/raft/framework/checkpoint_manager.h" +#include "platform/consensus/ordering/raft/framework/raft_checkpoint_manager.h" #include "platform/consensus/recovery/raft_recovery.h" #include "platform/networkstrate/consensus_manager.h" @@ -41,14 +41,19 @@ class Consensus : public common::Consensus { int ProcessNewTransaction(std::unique_ptr request) override; int CommitMsg(const google::protobuf::Message& msg) override; int CommitMsgInternal(const AppendEntries& txn); + int ResponseMsg(const BatchUserResponse& batch_resp) override; void RecoverFromLogs(); + void OnCheckpointFinish(uint64_t seq); protected: std::unique_ptr raft_; std::unique_ptr leader_election_manager_; std::unique_ptr system_info_; - std::unique_ptr checkpoint_manager_; + std::unique_ptr raft_checkpoint_manager_; std::unique_ptr recovery_; + uint32_t snapshot_interval_ = 1000; + uint64_t last_applied_ = 0; + uint32_t last_snapshot_initiated_at_ = 0; }; } // namespace raft diff --git a/platform/consensus/ordering/raft/framework/raft_checkpoint_manager.h b/platform/consensus/ordering/raft/framework/raft_checkpoint_manager.h new file mode 100644 index 0000000000..c4774a27c7 --- /dev/null +++ b/platform/consensus/ordering/raft/framework/raft_checkpoint_manager.h @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include "platform/consensus/checkpoint/checkpoint.h" + +namespace resdb { + +class RaftCheckPoint : public CheckPoint { + public: + RaftCheckPoint() = default; + virtual ~RaftCheckPoint() = default; + + virtual uint64_t GetStableCheckpoint() { return current_stable_seq_.load(); } + virtual void SetStableCheckpoint(uint64_t current_stable_seq) { + current_stable_seq_.store(current_stable_seq); + } + + private: + std::atomic current_stable_seq_ = 0; +}; + +} // namespace resdb diff --git a/platform/consensus/ordering/raft/proto/proposal.proto b/platform/consensus/ordering/raft/proto/proposal.proto index 762594c640..835d5132f3 100644 --- a/platform/consensus/ordering/raft/proto/proposal.proto +++ b/platform/consensus/ordering/raft/proto/proposal.proto @@ -27,6 +27,7 @@ message Entry { } message TruncationRecord { + // Including this index, remove it and everything after it. 
uint64 truncate_from_index = 1; uint64 truncate_from_term = 2; } diff --git a/platform/consensus/recovery/BUILD b/platform/consensus/recovery/BUILD index eeb25edbee..e9e74755ad 100644 --- a/platform/consensus/recovery/BUILD +++ b/platform/consensus/recovery/BUILD @@ -75,7 +75,7 @@ cc_library( "//common/utils", "//platform/consensus/ordering/raft/proto:proposal_cc_proto", "//platform/config:resdb_config", - "//platform/consensus/checkpoint", + "//platform/consensus/ordering/raft/framework:raft_checkpoint_manager", "//platform/networkstrate:server_comm", "//platform/proto:resdb_cc_proto", "//platform/consensus/recovery:recovery_base" diff --git a/platform/consensus/recovery/mock_raft_recovery.h b/platform/consensus/recovery/mock_raft_recovery.h index 936c2cd2dd..136997263a 100644 --- a/platform/consensus/recovery/mock_raft_recovery.h +++ b/platform/consensus/recovery/mock_raft_recovery.h @@ -31,10 +31,14 @@ namespace raft { class MockRaftRecovery : public RaftRecovery { public: MockRaftRecovery(const ResDBConfig& config) - : RaftRecovery(config, mock_checkpoint_.get(), mock_storage_.get()) {} + : RaftRecovery(config, mock_checkpoint_.get(), mock_storage_.get(), + nullptr) {} MOCK_METHOD(void, AddLogEntry, (const Entry* entry), ()); - MOCK_METHOD(void, WriteMetadata, (int64_t current_term, int32_t voted_for), ()); + MOCK_METHOD(void, WriteMetadata, + (int64_t current_term, int32_t voted_for, + uint64_t snapshot_last_index, uint64_t snapshot_last_term), + ()); MOCK_METHOD(void, AddLogEntry, (std::vector& entries_to_add), ()); MOCK_METHOD(void, TruncateLog, (TruncationRecord truncate_beginning_at), ()); diff --git a/platform/consensus/recovery/pbft_recovery.cpp b/platform/consensus/recovery/pbft_recovery.cpp index ca04343280..65e893ed0a 100644 --- a/platform/consensus/recovery/pbft_recovery.cpp +++ b/platform/consensus/recovery/pbft_recovery.cpp @@ -56,7 +56,8 @@ void PBFTRecovery::Init() { CallbackType callback = [this](std::unique_ptr context, std::unique_ptr request) 
{ - min_seq_ == -1 ? min_seq_ = request->seq() + min_seq_ = (min_seq_ == -1) + ? request->seq() : std::min(min_seq_, static_cast(request->seq())); max_seq_ = std::max(max_seq_, static_cast(request->seq())); }; diff --git a/platform/consensus/recovery/raft_recovery.cpp b/platform/consensus/recovery/raft_recovery.cpp index f0b5c72cb0..b0d5d421df 100644 --- a/platform/consensus/recovery/raft_recovery.cpp +++ b/platform/consensus/recovery/raft_recovery.cpp @@ -38,9 +38,10 @@ namespace raft { using CallbackType = std::function)>; RaftRecovery::RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, - Storage* storage) - : RecoveryBase(config, checkpoint, - storage) { + Storage* storage, + std::function on_checkpoint) + : RecoveryBase( + config, checkpoint, storage, on_checkpoint) { Init(); } @@ -50,8 +51,6 @@ void RaftRecovery::Init() { return; } - wal_seq_ = 0; - LOG(ERROR) << " init"; GetLastFile(); @@ -61,7 +60,8 @@ void RaftRecovery::Init() { OpenMetadataFile(); CallbackType callback = [this](std::unique_ptr record) { - min_seq_ == -1 ? min_seq_ = record->seq() + min_seq_ = min_seq_ == -1 + ? 
record->seq() : std::min(min_seq_, static_cast(record->seq())); max_seq_ = std::max(max_seq_, static_cast(record->seq())); }; @@ -73,8 +73,6 @@ void RaftRecovery::Init() { } RaftRecovery::~RaftRecovery() { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " - << __func__ << "\n"; if (recovery_enabled_ == false) { return; } @@ -85,8 +83,7 @@ RaftRecovery::~RaftRecovery() { } void RaftRecovery::OpenMetadataFile() { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " - << __func__ << "\n"; + LOG(INFO) << "Opening Metadata File"; metadata_fd_ = open(meta_file_path_.c_str(), O_CREAT | O_RDWR, 0666); if (metadata_fd_ < 0) { LOG(ERROR) << "Failed to open metadata file: " << strerror(errno); @@ -94,7 +91,9 @@ void RaftRecovery::OpenMetadataFile() { } } -void RaftRecovery::WriteMetadata(int64_t current_term, int32_t voted_for) { +void RaftRecovery::WriteMetadata(int64_t current_term, int32_t voted_for, + uint64_t snapshot_last_index, + uint64_t snapshot_last_term) { if (recovery_enabled_ == false) { return; } @@ -112,6 +111,8 @@ void RaftRecovery::WriteMetadata(int64_t current_term, int32_t voted_for) { RaftMetadata new_metadata; new_metadata.current_term = current_term; new_metadata.voted_for = voted_for; + new_metadata.snapshot_last_index = snapshot_last_index; + new_metadata.snapshot_last_term = snapshot_last_term; ssize_t bytes_written = write(temp_fd, &new_metadata, sizeof(new_metadata)); if (bytes_written != static_cast(sizeof(new_metadata))) { @@ -151,7 +152,9 @@ void RaftRecovery::WriteMetadata(int64_t current_term, int32_t voted_for) { metadata_ = new_metadata; LOG(INFO) << "Wrote metadata: term: " << current_term - << " votedFor: " << voted_for; + << " votedFor: " << voted_for + << " snapshot last index: " << snapshot_last_index + << " snapshot last term: " << snapshot_last_term; LOG(INFO) << "METADATA location: " << meta_file_path_; } @@ -175,13 +178,15 @@ RaftMetadata RaftRecovery::ReadMetadata() { } LOG(INFO) << 
"Read metadata file: term: " << metadata.current_term - << " votedFor: " << metadata.voted_for; + << " votedFor: " << metadata.voted_for + << " snapshot_last_index: " << metadata.snapshot_last_index + << " snapshot_last_term: " << metadata.snapshot_last_term; return metadata; } void RaftRecovery::WriteSystemInfo() {} -void RaftRecovery::AddLogEntry(const Entry* entry) { +void RaftRecovery::AddLogEntry(const Entry* entry, int64_t seq) { if (recovery_enabled_ == false) { return; } @@ -189,22 +194,22 @@ void RaftRecovery::AddLogEntry(const Entry* entry) { std::unique_lock lk(mutex_); WALRecord record; *record.mutable_entry() = *entry; - record.set_seq(++wal_seq_); + record.set_seq(seq); WriteLog(record); Flush(); } -void RaftRecovery::AddLogEntry(std::vector& entries_to_add) { +void RaftRecovery::AddLogEntry(std::vector& entries_to_add, + int64_t seq) { if (recovery_enabled_ == false || entries_to_add.size() == 0) { return; } std::unique_lock lk(mutex_); for (const auto& entry : entries_to_add) { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " << __func__ << "\n"; WALRecord record; *record.mutable_entry() = entry; - record.set_seq(++wal_seq_); + record.set_seq(seq++); WriteLog(record); } Flush(); @@ -218,7 +223,7 @@ void RaftRecovery::TruncateLog(TruncationRecord truncate_beginning_at) { std::unique_lock lk(mutex_); WALRecord record; - record.set_seq(++wal_seq_); + record.set_seq(truncate_beginning_at.truncate_from_index() - 1); *record.mutable_truncation() = std::move(truncate_beginning_at); WriteLog(record); @@ -226,26 +231,38 @@ void RaftRecovery::TruncateLog(TruncationRecord truncate_beginning_at) { } void RaftRecovery::WriteLog(const WALRecord& record) { - LOG(INFO) << "Debug at " << __FILE__ << ":" << __LINE__ << " in function " - << __func__ << "\n"; std::string data; record.SerializeToString(&data); switch (record.payload_case()) { - case WALRecord::kEntry: + case WALRecord::kEntry: { min_seq_ = min_seq_ == -1 ? 
record.seq() : std::min(min_seq_, static_cast(record.seq())); max_seq_ = std::max(max_seq_, static_cast(record.seq())); break; - case WALRecord::kTruncation: - max_seq_ = record.seq(); + } + case WALRecord::kTruncation: { + int64_t keep_up_to = static_cast(record.seq()); + if (max_seq_ > keep_up_to) { + max_seq_ = keep_up_to; + } + // If we truncate everything, reset min and max seq + if (max_seq_ <= last_ckpt_) { + min_seq_ = -1; + max_seq_ = -1; + } else { + min_seq_ = + (min_seq_ == -1) ? keep_up_to : std::min(min_seq_, keep_up_to); + } break; - case WALRecord::PAYLOAD_NOT_SET: + } + case WALRecord::PAYLOAD_NOT_SET: { assert(false && "WALRecord does not contain Truncation or Entry"); break; + } } AppendData(data); @@ -273,7 +290,13 @@ void RaftRecovery::PerformCallback( int64_t ckpt) { uint64_t max_seq = 0; for (std::unique_ptr& record : record_list) { - if (ckpt < record->seq()) { + // Only replay entries that are after the latest checkpoint. + // Since truncation records store the seq of the last index remaining in the + // log, it could be equal to the ckpt, meaning that everything since the + // checkpoint is to be truncated. 
+ if (ckpt < record->seq() || + (ckpt == record->seq() && + record->payload_case() == WALRecord::kTruncation)) { max_seq = record->seq(); call_back(std::move(record)); } diff --git a/platform/consensus/recovery/raft_recovery.h b/platform/consensus/recovery/raft_recovery.h index 3341d512f0..0ab7ffaa97 100644 --- a/platform/consensus/recovery/raft_recovery.h +++ b/platform/consensus/recovery/raft_recovery.h @@ -24,6 +24,7 @@ #include "chain/storage/storage.h" #include "platform/config/resdb_config.h" #include "platform/consensus/checkpoint/checkpoint.h" +#include "platform/consensus/ordering/raft/framework/raft_checkpoint_manager.h" #include "platform/consensus/ordering/raft/proto/proposal.pb.h" #include "platform/consensus/recovery/recovery.h" #include "platform/networkstrate/server_comm.h" @@ -36,6 +37,8 @@ namespace raft { struct RaftMetadata { int64_t current_term = 0; int32_t voted_for = -1; + uint64_t snapshot_last_index = 0; + uint64_t snapshot_last_term = 0; }; using CallbackType = std::function)>; @@ -46,14 +49,15 @@ class RaftRecovery public: RaftRecovery(const ResDBConfig& config, CheckPoint* checkpoint, - Storage* storage); + Storage* storage, std::function on_checkpoint); ~RaftRecovery(); RaftMetadata ReadMetadata(); void Init(); - void WriteMetadata(int64_t current_term, int32_t voted_for); - void AddLogEntry(const Entry* entry); - void AddLogEntry(std::vector& entries_to_add); + void WriteMetadata(int64_t current_term, int32_t voted_for, + uint64_t snapshot_last_index, uint64_t snapshot_last_term); + void AddLogEntry(const Entry* entry, int64_t seq); + void AddLogEntry(std::vector& entries_to_add, int64_t seq); void TruncateLog(TruncationRecord truncate_beginning_at); #ifdef RAFT_RECOVERY_TEST_MODE @@ -80,7 +84,6 @@ class RaftRecovery int metadata_fd_; std::string meta_file_path_; RaftMetadata metadata_; - uint64_t wal_seq_; }; } // namespace raft diff --git a/platform/consensus/recovery/raft_recovery_test.cpp 
b/platform/consensus/recovery/raft_recovery_test.cpp index cf60d5d0a1..3bcabfa292 100644 --- a/platform/consensus/recovery/raft_recovery_test.cpp +++ b/platform/consensus/recovery/raft_recovery_test.cpp @@ -5,6 +5,7 @@ #include #include +#include #include "chain/storage/mock_storage.h" #include "platform/consensus/checkpoint/mock_checkpoint.h" @@ -17,6 +18,7 @@ using ::testing::_; using ::testing::AnyNumber; using ::testing::Invoke; using ::testing::Matcher; +using ::testing::Return; using ::testing::Test; const std::string log_path = "./log/test_log"; @@ -41,6 +43,23 @@ std::vector Listlogs(const std::string &path) { return ret; } +static Entry CreateTestEntry(RaftRecovery &recovery, int term, int seq) { + Entry logEntry; + logEntry.set_term(term); + auto req = std::make_unique(); + req->set_seq(seq); + req->set_data("Request " + std::to_string(seq)); + std::string serialized; + EXPECT_TRUE(req->SerializeToString(&serialized)); + logEntry.set_command(std::move(serialized)); + return logEntry; +} + +static void AddTestEntry(RaftRecovery &recovery, int term, int seq) { + Entry logEntry = CreateTestEntry(recovery, term, seq); + recovery.AddLogEntry(&logEntry, seq); +} + class RaftRecoveryTest : public Test { public: RaftRecoveryTest() @@ -57,26 +76,46 @@ class RaftRecoveryTest : public Test { TEST_F(RaftRecoveryTest, WriteAndReadLog) { int entries_to_add = 3; { - RaftRecovery recovery(config_, &checkpoint_, nullptr); + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); for (int i = 1; i <= entries_to_add; i++) { - Entry logEntry; - logEntry.set_term(i); - auto req = std::make_unique(); - req->set_seq(i); - req->set_data("Request " + std::to_string(i)); - std::string serialized; - if (!req->SerializeToString(&serialized)) { - assert(false); - } - logEntry.set_command(std::move(serialized)); + AddTestEntry(recovery, i, i); + } + } + { + std::vector list; + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + recovery.ReadLogs( + [&](const 
RaftMetadata &data) {}, + [&](std::unique_ptr record) { list.push_back(*record); }, + nullptr); + + EXPECT_EQ(list.size(), entries_to_add); + + for (size_t i = 0; i < entries_to_add; ++i) { + EXPECT_EQ(list[i].payload_case(), WALRecord::kEntry); - recovery.AddLogEntry(&logEntry); + EXPECT_EQ(list[i].entry().term(), i + 1); + Request req; + req.ParseFromString(list[i].entry().command()); + EXPECT_EQ(req.data(), "Request " + std::to_string(i + 1)); } } +} + +TEST_F(RaftRecoveryTest, WriteMultipleEntriesAndReadLog) { + int entries_to_add = 3; + { + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + std::vector log_entries; + for (int i = 1; i <= entries_to_add; i++) { + log_entries.push_back(CreateTestEntry(recovery, i, i)); + } + recovery.AddLogEntry(log_entries, 1); + } { std::vector list; - RaftRecovery recovery(config_, &checkpoint_, nullptr); + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); recovery.ReadLogs( [&](const RaftMetadata &data) {}, [&](std::unique_ptr record) { list.push_back(*record); }, nullptr); @@ -96,44 +135,94 @@ TEST_F(RaftRecoveryTest, WriteAndReadLog) { TEST_F(RaftRecoveryTest, WriteAndReadMetadata) { { - RaftRecovery recovery(config_, &checkpoint_, nullptr); + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); - recovery.WriteMetadata(2, 1); + recovery.WriteMetadata(2, 3, 100, 1); } { int64_t current_term; int32_t voted_for; - RaftRecovery recovery(config_, &checkpoint_, nullptr); + uint64_t snapshot_last_index; + uint64_t snapshot_last_term; + + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); recovery.ReadLogs( [&](const RaftMetadata &data) { current_term = data.current_term; voted_for = data.voted_for; + snapshot_last_index = data.snapshot_last_index; + snapshot_last_term = data.snapshot_last_term; }, [&](std::unique_ptr record) {}, nullptr); EXPECT_EQ(current_term, 2); - EXPECT_EQ(voted_for, 1); + EXPECT_EQ(voted_for, 3); + EXPECT_EQ(snapshot_last_index, 100); + 
EXPECT_EQ(snapshot_last_term, 1); + } +} + +TEST_F(RaftRecoveryTest, WriteAndReadMetadataTwice) { + { + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + + recovery.WriteMetadata(2, 3, 100, 1); + recovery.WriteMetadata(4, 2, 200, 2); + } + { + int64_t current_term; + int32_t voted_for; + uint64_t snapshot_last_index; + uint64_t snapshot_last_term; + + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &data) { + current_term = data.current_term; + voted_for = data.voted_for; + snapshot_last_index = data.snapshot_last_index; + snapshot_last_term = data.snapshot_last_term; + }, + [&](std::unique_ptr record) {}, nullptr); + + EXPECT_EQ(current_term, 4); + EXPECT_EQ(voted_for, 2); + EXPECT_EQ(snapshot_last_index, 200); + EXPECT_EQ(snapshot_last_term, 2); + } +} + +TEST_F(RaftRecoveryTest, ReadMetadataDefaultValues) { + { + int64_t current_term; + int32_t voted_for; + uint64_t snapshot_last_index; + uint64_t snapshot_last_term; + + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &data) { + current_term = data.current_term; + voted_for = data.voted_for; + snapshot_last_index = data.snapshot_last_index; + snapshot_last_term = data.snapshot_last_term; + }, + [&](std::unique_ptr record) {}, nullptr); + + EXPECT_EQ(current_term, 0); + EXPECT_EQ(voted_for, -1); + EXPECT_EQ(snapshot_last_index, 0); + EXPECT_EQ(snapshot_last_term, 0); } } TEST_F(RaftRecoveryTest, TruncateLog) { int entries_to_add = 4; { - RaftRecovery recovery(config_, &checkpoint_, nullptr); + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); for (int i = 1; i <= entries_to_add; i++) { - Entry logEntry; - logEntry.set_term(i); - auto req = std::make_unique(); - req->set_seq(i); - req->set_data("Request " + std::to_string(i)); - std::string serialized; - if (!req->SerializeToString(&serialized)) { - assert(false); - } - 
logEntry.set_command(std::move(serialized)); - - recovery.AddLogEntry(&logEntry); + AddTestEntry(recovery, i, i); } TruncationRecord truncation; @@ -142,18 +231,7 @@ TEST_F(RaftRecoveryTest, TruncateLog) { recovery.TruncateLog(truncation); for (int i = 5; i <= entries_to_add * 2; i++) { - Entry logEntry; - logEntry.set_term(i + 1); - auto req = std::make_unique(); - req->set_seq(i); - req->set_data("Request " + std::to_string(i)); - std::string serialized; - if (!req->SerializeToString(&serialized)) { - assert(false); - } - logEntry.set_command(std::move(serialized)); - - recovery.AddLogEntry(&logEntry); + AddTestEntry(recovery, i + 1, i); } } /* Recovery WAL @@ -170,7 +248,7 @@ TEST_F(RaftRecoveryTest, TruncateLog) { */ { std::vector list; - RaftRecovery recovery(config_, &checkpoint_, nullptr); + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); recovery.ReadLogs( [&](const RaftMetadata &data) {}, [&](std::unique_ptr record) { list.push_back(*record); }, @@ -201,6 +279,521 @@ TEST_F(RaftRecoveryTest, TruncateLog) { } } +// After a checkpoint fires and the log file is rotated, there should be exactly +// two .log files on disk: the sealed (checkpointed) file and the new active +// one. +TEST_F(RaftRecoveryTest, CheckpointCreatesNewLogFile) { + std::promise insert_done, ckpt_fired; + auto insert_done_future = insert_done.get_future(); + auto ckpt_fired_future = ckpt_fired.get_future(); + + int call_count = 0; + EXPECT_CALL(checkpoint_, GetStableCheckpoint()) + .WillRepeatedly(Invoke([&]() -> uint64_t { + ++call_count; + if (call_count == 1) + insert_done_future.get(); + else if (call_count == 2) + ckpt_fired.set_value(true); + return 5; + })); + + { + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + + for (int i = 1; i <= 9; i++) { + AddTestEntry(recovery, i, i); + } + insert_done.set_value(true); + ckpt_fired_future.get(); + + // Write some more entries into the new file. 
+ for (int i = 10; i <= 18; i++) { + AddTestEntry(recovery, i, i); + } + } + + std::vector log_list = Listlogs(log_path); + // 2 log files and one metadata file + EXPECT_EQ(log_list.size(), 3); +} + +// After a checkpoint at stable_seq=5, ReadLogs should only replay WAL records +// whose seq is strictly greater than 5. +TEST_F(RaftRecoveryTest, CheckpointFiltersOldEntries) { + std::promise insert_done, ckpt_fired; + auto insert_done_future = insert_done.get_future(); + auto ckpt_fired_future = ckpt_fired.get_future(); + + int call_count = 0; + EXPECT_CALL(checkpoint_, GetStableCheckpoint()) + .WillRepeatedly(Invoke([&]() -> uint64_t { + ++call_count; + if (call_count == 1) + insert_done_future.get(); + else if (call_count == 2) + ckpt_fired.set_value(true); + return 5; + })); + + { + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + + for (int i = 1; i <= 9; i++) { + AddTestEntry(recovery, i, i); + } + insert_done.set_value(true); + ckpt_fired_future.get(); + } + + { + std::vector list; + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &) {}, + [&](std::unique_ptr record) { list.push_back(*record); }, + nullptr); + + // Only WAL seqs 6-9 should be replayed (4 entries). + ASSERT_EQ(list.size(), 4u); + for (size_t i = 0; i < list.size(); ++i) { + EXPECT_EQ(list[i].payload_case(), WALRecord::kEntry); + Request req; + req.ParseFromString(list[i].entry().command()); + EXPECT_EQ(req.seq(), (int)(i + 6)); + } + } +} + +// After a checkpoint rotation, GetMinSeq()/GetMaxSeq() should reset to -1 for +// the newly opened (empty) file, then update as new entries are appended. 
+TEST_F(RaftRecoveryTest, CheckpointResetsMinMaxSeq) { + std::promise insert_done, ckpt_fired; + auto insert_done_future = insert_done.get_future(); + auto ckpt_fired_future = ckpt_fired.get_future(); + + int call_count = 0; + EXPECT_CALL(checkpoint_, GetStableCheckpoint()) + .WillRepeatedly(Invoke([&]() -> uint64_t { + ++call_count; + if (call_count == 1) + insert_done_future.get(); + else if (call_count == 2) + ckpt_fired.set_value(true); + return 5; + })); + + { + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + + for (int i = 1; i <= 5; i++) { + AddTestEntry(recovery, i, i); + } + insert_done.set_value(true); + ckpt_fired_future.get(); + + EXPECT_EQ(recovery.GetMinSeq(), -1); + EXPECT_EQ(recovery.GetMaxSeq(), -1); + + // Add entries to the new file and verify the range is tracked correctly. + for (int i = 6; i <= 9; i++) { + AddTestEntry(recovery, i, i); + } + + EXPECT_EQ(recovery.GetMinSeq(), 6); + EXPECT_EQ(recovery.GetMaxSeq(), 9); + } +} + +// Two successive checkpoints. After both fires, only entries whose WAL seq +// exceeds the second checkpoint value (15) survive replay. +TEST_F(RaftRecoveryTest, TwoCheckpoints) { + std::promise ins1, ck1, ins2, ck2; + auto ins1f = ins1.get_future(), ck1f = ck1.get_future(); + auto ins2f = ins2.get_future(), ck2f = ck2.get_future(); + + int call_count = 0; + EXPECT_CALL(checkpoint_, GetStableCheckpoint()) + .WillRepeatedly(Invoke([&]() -> uint64_t { + ++call_count; + if (call_count == 1) + ins1f.get(); + else if (call_count == 2) + ck1.set_value(true); + else if (call_count == 3) + ins2f.get(); + else if (call_count == 4) + ck2.set_value(true); + return (call_count <= 2) ? 
5 : 15; + })); + + { + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + + for (int i = 1; i <= 9; i++) { + AddTestEntry(recovery, i, i); + } + ins1.set_value(true); + ck1f.get(); + + for (int i = 10; i <= 18; i++) { + AddTestEntry(recovery, i, i); + } + ins2.set_value(true); + ck2f.get(); + + // Third window: entries 19-22. + for (int i = 19; i <= 22; i++) { + AddTestEntry(recovery, i, i); + } + } + + std::vector log_list = Listlogs(log_path); + // 3 log files and one metadata file + EXPECT_EQ(log_list.size(), 4); + + { + std::vector list; + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &) {}, + [&](std::unique_ptr record) { list.push_back(*record); }, + nullptr); + + // ckpt=15: entries with WAL seq > 15 survive: seqs 16-22 (7 entries). + ASSERT_EQ(list.size(), 7u); + for (size_t i = 0; i < list.size(); ++i) { + Request req; + req.ParseFromString(list[i].entry().command()); + EXPECT_EQ(req.seq(), (int)(i + 16)); + } + // Even though seqs 16-22 survive, min seq and max seq refer to the most + // recent log. + EXPECT_EQ(recovery.GetMinSeq(), 19); + EXPECT_EQ(recovery.GetMaxSeq(), 22); + } +} + +// Metadata lives in a separate file and should be fully preserved across log +// rotations caused by a checkpoint. 
+TEST_F(RaftRecoveryTest, MetadataPersistedAcrossCheckpoint) { + std::promise insert_done, ckpt_fired; + auto insert_done_future = insert_done.get_future(); + auto ckpt_fired_future = ckpt_fired.get_future(); + + int call_count = 0; + EXPECT_CALL(checkpoint_, GetStableCheckpoint()) + .WillRepeatedly(Invoke([&]() -> uint64_t { + ++call_count; + if (call_count == 1) + insert_done_future.get(); + else if (call_count == 2) + ckpt_fired.set_value(true); + return 5; + })); + + { + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + recovery.WriteMetadata(7, 2, 50, 3); + + for (int i = 1; i <= 5; i++) { + AddTestEntry(recovery, i, i); + } + insert_done.set_value(true); + ckpt_fired_future.get(); + + for (int i = 6; i <= 8; i++) { + AddTestEntry(recovery, i, i); + } + } + + { + int64_t current_term = 0; + int32_t voted_for = 0; + uint64_t snapshot_last_index = 0; + uint64_t snapshot_last_term = 0; + + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &data) { + current_term = data.current_term; + voted_for = data.voted_for; + snapshot_last_index = data.snapshot_last_index; + snapshot_last_term = data.snapshot_last_term; + }, + [&](std::unique_ptr) {}, nullptr); + + EXPECT_EQ(current_term, 7); + EXPECT_EQ(voted_for, 2); + EXPECT_EQ(snapshot_last_index, 50); + EXPECT_EQ(snapshot_last_term, 3); + } +} + +// When Storage::Flush() fails, FinishFile() bails out early and the log file +// must NOT be rotated — only one file should remain on disk. 
+TEST_F(RaftRecoveryTest, CheckpointNotFinalizedWhenStorageFlushFails) { + MockStorage storage; + EXPECT_CALL(storage, Flush).WillRepeatedly(Return(false)); + + std::promise insert_done, ckpt_fired; + auto insert_done_future = insert_done.get_future(); + auto ckpt_fired_future = ckpt_fired.get_future(); + + int call_count = 0; + EXPECT_CALL(checkpoint_, GetStableCheckpoint()) + .WillRepeatedly(Invoke([&]() -> uint64_t { + ++call_count; + if (call_count == 1) + insert_done_future.get(); + else if (call_count == 2) + ckpt_fired.set_value(true); + return 5; + })); + + { + RaftRecovery recovery(config_, &checkpoint_, &storage, nullptr); + + for (int i = 1; i <= 5; i++) { + AddTestEntry(recovery, i, i); + } + insert_done.set_value(true); + ckpt_fired_future.get(); + + for (int i = 6; i <= 8; i++) { + AddTestEntry(recovery, i, i); + } + } + + // The file should never have been renamed; only one .log file exists. + std::vector log_list = Listlogs(log_path); + // 1 log file and one metadata file + EXPECT_EQ(log_list.size(), 2); +} + +ResConfigData GetConfigDataNoRecovery(int buf_size = 10) { + ResConfigData data; + data.set_recovery_enabled(false); + data.set_recovery_path(log_path); + data.set_recovery_buffer_size(buf_size); + data.set_recovery_ckpt_time_s(1); + return data; +} + +// When recovery_enabled=false, all write operations are no-ops and the WAL +// directory is never created on disk. +TEST_F(RaftRecoveryTest, RecoveryDisabledNoOpsAndCreatesNoDirectory) { + ResDBConfig config(GetConfigDataNoRecovery(1024), ReplicaInfo(), KeyInfo(), + CertificateInfo()); + + const std::string log_dir = + std::filesystem::path(log_path).parent_path().string(); + + // Precondition: directory does not exist (the fixture removes it in SetUp). + ASSERT_FALSE(std::filesystem::exists(log_dir)); + + { + RaftRecovery recovery(config, &checkpoint_, nullptr, nullptr); + + // All of these must be silent no-ops. 
+ for (int i = 1; i <= 5; ++i) { + AddTestEntry(recovery, i, i); + } + + recovery.WriteMetadata(7, 2, 50, 3); + + TruncationRecord trunc; + trunc.set_truncate_from_index(3); + trunc.set_truncate_from_term(2); + recovery.TruncateLog(trunc); + + // ReadLogs must also be a no-op and invoke neither callback. + bool metadata_cb_called = false; + bool record_cb_called = false; + recovery.ReadLogs( + [&](const RaftMetadata &) { metadata_cb_called = true; }, + [&](std::unique_ptr) { record_cb_called = true; }, nullptr); + + EXPECT_FALSE(metadata_cb_called); + EXPECT_FALSE(record_cb_called); + } + + // The WAL directory must never have been created. + EXPECT_FALSE(std::filesystem::exists(log_dir)) + << "WAL directory was created even though recovery is disabled"; +} + +// When recovery is disabled, ReadMetadata returns the zero-value struct. +TEST_F(RaftRecoveryTest, RecoveryDisabledReadMetadataReturnsDefaults) { + ResDBConfig config(GetConfigDataNoRecovery(1024), ReplicaInfo(), KeyInfo(), + CertificateInfo()); + + RaftRecovery recovery(config, &checkpoint_, nullptr, nullptr); + + RaftMetadata meta = recovery.ReadMetadata(); + EXPECT_EQ(meta.current_term, 0); + EXPECT_EQ(meta.voted_for, -1); + EXPECT_EQ(meta.snapshot_last_index, 0u); + EXPECT_EQ(meta.snapshot_last_term, 0u); +} + +// Truncation record seq == checkpoint value. +// +// Layout written to WAL: +// seq 1 – entry (term 1) +// seq 2 – entry (term 2) +// seq 3 – entry (term 3) +// seq 4 – entry (term 4) +// truncation with truncate_from_index=3 → stored at seq = 3-1 = 2 +// seq 3 – entry (term 13) +// seq 4 – entry (term 14) +// +// The checkpoint fires at seq=2, directly before the truncation. +// +// What survives: only records with seq > 2, i.e. the two post-truncation +// entries at seq 3 and 4. 
+TEST_F(RaftRecoveryTest, TruncationAtCheckpointBoundary) { + std::promise insert_done, ckpt_fired; + auto insert_done_f = insert_done.get_future(); + auto ckpt_fired_f = ckpt_fired.get_future(); + + int call_count = 0; + EXPECT_CALL(checkpoint_, GetStableCheckpoint()) + .WillRepeatedly(Invoke([&]() -> uint64_t { + ++call_count; + if (call_count == 1) + insert_done_f.get(); + else if (call_count == 2) + ckpt_fired.set_value(true); + // Checkpoint at 2 — the same seq as the truncation record. + return 2; + })); + + { + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + + // Write entries 1–4 at seq 1–4. + for (int i = 1; i <= 4; ++i) { + AddTestEntry(recovery, i, i); + } + + // Truncate from index 3 → stored at seq = 2. + TruncationRecord trunc; + trunc.set_truncate_from_index(3); + trunc.set_truncate_from_term(2); + recovery.TruncateLog(trunc); + + // Write two replacement entries at seq 3–4 (new leader's branch). + for (int i = 3; i <= 4; ++i) { + AddTestEntry(recovery, 10 + i, i); + } + + insert_done.set_value(true); + ckpt_fired_f.get(); + // File is now sealed at ckpt=2. The active window starts fresh. 
+ } + + { + std::vector list; + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &) {}, + [&](std::unique_ptr record) { list.push_back(*record); }, + nullptr); + + ASSERT_EQ(list.size(), 5u); + EXPECT_EQ(list[0].payload_case(), WALRecord::kEntry); + EXPECT_EQ(list[1].payload_case(), WALRecord::kEntry); + EXPECT_EQ(list[2].payload_case(), WALRecord::kTruncation); + EXPECT_EQ(list[3].payload_case(), WALRecord::kEntry); + EXPECT_EQ(list[4].payload_case(), WALRecord::kEntry); + + Request req3, req4, req3again, req4again; + req3.ParseFromString(list[0].entry().command()); + req4.ParseFromString(list[1].entry().command()); + EXPECT_EQ(req3.seq(), 3); + EXPECT_EQ(req4.seq(), 4); + EXPECT_EQ(list[0].entry().term(), 3); + EXPECT_EQ(list[1].entry().term(), 4); + + EXPECT_EQ(list[2].truncation().truncate_from_index(), 3); + + req3again.ParseFromString(list[3].entry().command()); + req4again.ParseFromString(list[4].entry().command()); + EXPECT_EQ(req3again.seq(), 3); + EXPECT_EQ(req4again.seq(), 4); + EXPECT_EQ(list[3].entry().term(), 13); + EXPECT_EQ(list[4].entry().term(), 14); + } +} + +// Truncation record seq BELOW checkpoint value: also dropped. +// +// Same layout but checkpoint fires at stable_seq=5 (above the truncation's +// seq=2). All records with seq ≤ 5 are behind the checkpoint; only seq 3 +// and 4 survive if they came from the pre-checkpoint file selected by +// GetRecoveryFiles. In this variant we check that no truncation record +// bleeds through in the surviving window. 
+TEST_F(RaftRecoveryTest, TruncationBelowCheckpointIsDropped) { + std::promise insert_done, ckpt_fired; + auto insert_done_f = insert_done.get_future(); + auto ckpt_fired_f = ckpt_fired.get_future(); + + int call_count = 0; + EXPECT_CALL(checkpoint_, GetStableCheckpoint()) + .WillRepeatedly(Invoke([&]() -> uint64_t { + ++call_count; + if (call_count == 1) + insert_done_f.get(); + else if (call_count == 2) + ckpt_fired.set_value(true); + return 5; + })); + + { + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + + for (int i = 1; i <= 4; ++i) { + AddTestEntry(recovery, i, i); + } + + TruncationRecord trunc; + trunc.set_truncate_from_index(3); + trunc.set_truncate_from_term(2); + recovery.TruncateLog(trunc); + + for (int i = 3; i <= 8; ++i) { + AddTestEntry(recovery, 10 + i, i); + } + + insert_done.set_value(true); + ckpt_fired_f.get(); + } + + { + std::vector list; + RaftRecovery recovery(config_, &checkpoint_, nullptr, nullptr); + recovery.ReadLogs( + [&](const RaftMetadata &) {}, + [&](std::unique_ptr record) { list.push_back(*record); }, + nullptr); + + // Entries at seq 6, 7, 8 survive (strictly > ckpt=5). + // The truncation at seq=2 is entirely behind the checkpoint and must not + // appear. + for (const auto &r : list) { + EXPECT_EQ(r.payload_case(), WALRecord::kEntry) + << "Truncation record below checkpoint leaked into replay"; + } + ASSERT_EQ(list.size(), 3u); + for (size_t i = 0; i < list.size(); ++i) { + Request req; + req.ParseFromString(list[i].entry().command()); + EXPECT_EQ(req.seq(), (int)(i + 6)); + } + } +} + // TODO: Create tests that corrupt recovery files to test our handling of them. 
} // namespace raft diff --git a/platform/consensus/recovery/recovery.h b/platform/consensus/recovery/recovery.h index 07ed6de53f..1517f6dd8a 100644 --- a/platform/consensus/recovery/recovery.h +++ b/platform/consensus/recovery/recovery.h @@ -44,7 +44,8 @@ template class RecoveryBase { public: RecoveryBase(const ResDBConfig& config, CheckPoint* checkpoint, - Storage* storage); + Storage* storage, + std::function on_checkpoint = nullptr); ~RecoveryBase(); void ReadLogs( @@ -95,7 +96,8 @@ class RecoveryBase { // Derived class must implement these auto ParseDataListItem(std::vector& data_list); - void PerformCallback(auto& request_list, TCallback call_back); + template + void PerformCallback(RequestList& request_list, TCallback call_back); void WriteSystemInfo(); @@ -115,6 +117,7 @@ class RecoveryBase { std::atomic stop_; int recovery_ckpt_time_s_; Storage* storage_; + std::function on_checkpoint_callback_; }; #include "platform/consensus/recovery/recovery_impl.h" diff --git a/platform/consensus/recovery/recovery_impl.h b/platform/consensus/recovery/recovery_impl.h index 0a36712ab8..4085934687 100644 --- a/platform/consensus/recovery/recovery_impl.h +++ b/platform/consensus/recovery/recovery_impl.h @@ -19,8 +19,12 @@ template RecoveryBase::RecoveryBase( - const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage) - : config_(config), checkpoint_(checkpoint), storage_(storage) { + const ResDBConfig& config, CheckPoint* checkpoint, Storage* storage, + std::function on_checkpoint) + : config_(config), + checkpoint_(checkpoint), + storage_(storage), + on_checkpoint_callback_(on_checkpoint) { recovery_enabled_ = config_.GetConfigData().recovery_enabled(); file_path_ = config_.GetConfigData().recovery_path(); if (file_path_.empty()) { @@ -50,7 +54,7 @@ RecoveryBase::RecoveryBase( recovery_ckpt_time_s_ = config_.GetConfigData().recovery_ckpt_time_s(); if (recovery_ckpt_time_s_ == 0) { - recovery_ckpt_time_s_ = 60; + recovery_ckpt_time_s_ = 30; } int ret = @@ 
-160,32 +164,38 @@ std::string RecoveryBase::GenerateFile( template void RecoveryBase::FinishFile( int64_t seq) { - std::unique_lock lk(mutex_); - Flush(); - if (storage_) { - if (!storage_->Flush(true)) { - return; + { + std::unique_lock lk(mutex_); + Flush(); + if (storage_) { + if (!storage_->Flush(true)) { + return; + } } - } - std::string new_file_path = GenerateFile(seq, min_seq_, max_seq_); - close(fd_); + std::string new_file_path = GenerateFile(seq, min_seq_, max_seq_); + close(fd_); - min_seq_ = -1; - max_seq_ = -1; + min_seq_ = -1; + max_seq_ = -1; + + std::rename(file_path_.c_str(), new_file_path.c_str()); - std::rename(file_path_.c_str(), new_file_path.c_str()); + std::string dir_path = + std::filesystem::path(file_path_).parent_path().string(); + int dir_fd = open(dir_path.c_str(), O_RDONLY); + fsync(dir_fd); + close(dir_fd); - std::string dir_path = - std::filesystem::path(file_path_).parent_path().string(); - int dir_fd = open(dir_path.c_str(), O_RDONLY); - fsync(dir_fd); - close(dir_fd); + LOG(INFO) << "rename:" << file_path_ << " to:" << new_file_path; + std::string next_file_path = GenerateFile(seq, -1, -1); + file_path_ = next_file_path; - LOG(INFO) << "rename:" << file_path_ << " to:" << new_file_path; - std::string next_file_path = GenerateFile(seq, -1, -1); - file_path_ = next_file_path; + OpenFile(file_path_); + } - OpenFile(file_path_); + if (on_checkpoint_callback_) { + on_checkpoint_callback_(seq); + } } template @@ -258,6 +268,7 @@ void RecoveryBase::Write(const char* data, int pos = 0; while (len > 0) { int write_len = write(fd_, data + pos, len); + if (write_len <= 0) break; len -= write_len; pos += write_len; } @@ -377,7 +388,9 @@ RecoveryBase::GetSortedRecoveryFiles( } sort(e_list.begin(), e_list.end()); - list.push_back(e_list.back()); + if (!e_list.empty()) { + list.push_back(e_list.back()); + } sort(list.begin(), list.end()); return list; } @@ -481,6 +494,15 @@ void RecoveryBase::ReadLogsFromFiles( } } if (request_list.size() == 
0) { + LOG(ERROR) << " Request list is empty"; + close(fd); + fd = open(path.c_str(), O_RDWR); + if (fd < 0) { + LOG(ERROR) << " open file as O_RDWR to truncate fail:" << path; + } + if (ftruncate(fd, 0) != 0) { + LOG(ERROR) << " Failed to truncate file"; + } ftruncate(fd, 0); } From db35d9e493320f41056c6b47325ffeb758da3149 Mon Sep 17 00:00:00 2001 From: Josh Hutton Date: Fri, 1 May 2026 13:43:30 -0700 Subject: [PATCH 66/66] Undo change to recovery_ckpt_time_s_ --- platform/consensus/recovery/recovery_impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/platform/consensus/recovery/recovery_impl.h b/platform/consensus/recovery/recovery_impl.h index 4085934687..463dc905e5 100644 --- a/platform/consensus/recovery/recovery_impl.h +++ b/platform/consensus/recovery/recovery_impl.h @@ -54,7 +54,7 @@ RecoveryBase::RecoveryBase( recovery_ckpt_time_s_ = config_.GetConfigData().recovery_ckpt_time_s(); if (recovery_ckpt_time_s_ == 0) { - recovery_ckpt_time_s_ = 30; + recovery_ckpt_time_s_ = 60; } int ret = @@ -510,4 +510,4 @@ void RecoveryBase::ReadLogsFromFiles( LOG(ERROR) << "read log from files:" << path << " done"; close(fd); -} \ No newline at end of file +}