Skip to content

Commit b50ff41

Browse files
Raft: Fix coprocessor ignore new added RegionReadStatus thus cause inconsistent result (pingcap#10543) (pingcap#10544)
close pingcap#10510 * Handling new added RegionReadStatus enum values for error response of coprocessor * Do not set the `default` branch for switch (e.status) so that the compiler will throw an error if there is not covered new enum values Signed-off-by: ti-chi-bot <ti-community-prow-bot@tidb.io> Signed-off-by: JaySon-Huang <tshent@qq.com> Co-authored-by: JaySon <tshent@qq.com> Co-authored-by: JaySon-Huang <tshent@qq.com>
1 parent e7d70a3 commit b50ff41

14 files changed

Lines changed: 380 additions & 125 deletions

dbms/src/Common/TiFlashMetrics.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,9 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva
580580
F(type_key_not_in_region, {{"type", "key_not_in_region"}}), \
581581
F(type_tikv_server_issue, {{"type", "tikv_server_issue"}}), \
582582
F(type_tikv_lock, {{"type", "tikv_lock"}}), \
583+
F(type_server_is_busy, {{"type", "server_is_busy"}}), \
584+
F(type_stale_command, {{"type", "stale_command"}}), \
585+
F(type_store_not_match, {{"type", "store_not_match"}}), \
583586
F(type_other, {{"type", "other"}})) \
584587
/* required by DBaaS */ \
585588
M(tiflash_server_info, \

dbms/src/Debug/ReadIndexStressTest.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ static const std::map<std::string, ReadIndexStressTest::TestType> TestName2Type
3838

3939
ReadIndexStressTest::ReadIndexStressTest(const TMTContext & tmt_)
4040
: tmt(tmt_)
41+
, logger(Logger::get("ReadIndexStressTest"))
4142
{
4243
MockStressTestCfg::enable = true;
4344
LOG_WARNING(logger, "enable MockStressTest");

dbms/src/Debug/ReadIndexStressTest.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ class ReadIndexStressTest
5757

5858
private:
5959
const TMTContext & tmt;
60-
Poco::Logger * logger{&Poco::Logger::get("ReadIndexStressTest")};
60+
LoggerPtr logger;
6161
};
6262

6363

dbms/src/Flash/CoprocessorHandler.cpp

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -231,31 +231,8 @@ grpc::Status CoprocessorHandler<is_stream>::execute()
231231
response = cop_response;
232232
}
233233

234-
errorpb::Error * region_err;
235-
switch (e.status)
236-
{
237-
case RegionException::RegionReadStatus::OTHER:
238-
case RegionException::RegionReadStatus::BUCKET_EPOCH_NOT_MATCH:
239-
case RegionException::RegionReadStatus::FLASHBACK:
240-
case RegionException::RegionReadStatus::KEY_NOT_IN_REGION:
241-
case RegionException::RegionReadStatus::TIKV_SERVER_ISSUE:
242-
case RegionException::RegionReadStatus::READ_INDEX_TIMEOUT:
243-
case RegionException::RegionReadStatus::NOT_LEADER:
244-
case RegionException::RegionReadStatus::NOT_FOUND_TIKV:
245-
case RegionException::RegionReadStatus::NOT_FOUND:
246-
GET_METRIC(tiflash_coprocessor_request_error, reason_region_not_found).Increment();
247-
region_err = response->mutable_region_error();
248-
region_err->mutable_region_not_found()->set_region_id(cop_request->context().region_id());
249-
break;
250-
case RegionException::RegionReadStatus::EPOCH_NOT_MATCH:
251-
GET_METRIC(tiflash_coprocessor_request_error, reason_epoch_not_match).Increment();
252-
region_err = response->mutable_region_error();
253-
region_err->mutable_epoch_not_match();
254-
break;
255-
default:
256-
// should not happen
257-
break;
258-
}
234+
setResponseByRegionException(response, e, cop_request->context().region_id());
235+
259236
if constexpr (is_stream)
260237
{
261238
cop_writer->Write(stream_response);

dbms/src/Storages/KVStore/FFI/ProxyFFI.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,6 @@ const std::string ColumnFamilyName::Lock = "lock";
5151
const std::string ColumnFamilyName::Default = "default";
5252
const std::string ColumnFamilyName::Write = "write";
5353

54-
extern const uint64_t DEFAULT_BATCH_READ_INDEX_TIMEOUT_MS;
55-
5654
ColumnFamilyType NameToCF(const std::string & cf)
5755
{
5856
if (cf.empty() || cf == ColumnFamilyName::Default)

dbms/src/Storages/KVStore/Read/LearnerReadWorker.cpp

Lines changed: 141 additions & 51 deletions
Large diffs are not rendered by default.

dbms/src/Storages/KVStore/Read/LearnerReadWorker.h

Lines changed: 13 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -48,57 +48,40 @@ struct UnavailableRegions
4848
, is_wn_disagg_read(is_wn_disagg_read_)
4949
{}
5050

51-
size_t size() const { return ids.size(); }
51+
size_t size() const { return unavailable_ids.size(); }
5252

53-
bool empty() const { return ids.empty(); }
53+
bool empty() const { return unavailable_ids.empty(); }
5454

55-
bool contains(RegionID region_id) const { return ids.contains(region_id); }
55+
bool contains(RegionID region_id) const { return unavailable_ids.contains(region_id); }
5656

57-
void addStatus(RegionID id, RegionException::RegionReadStatus status_, std::string && extra_msg_)
57+
void addStatus(RegionID region_id, RegionException::RegionReadStatus status_, std::string && extra_msg_)
5858
{
59-
status = status_;
60-
ids.emplace(id);
61-
extra_msg = std::move(extra_msg_);
59+
unavailable_ids[region_id] = UnavailableDesc{status_, std::move(extra_msg_)};
6260
}
6361

6462
void addRegionLock(RegionID region_id_, LockInfoPtr && region_lock_)
6563
{
6664
region_locks.emplace_back(region_id_, std::move(region_lock_));
67-
ids.emplace(region_id_);
65+
unavailable_ids[region_id_] = UnavailableDesc{RegionException::RegionReadStatus::MEET_LOCK, ""};
6866
}
6967

7068
void tryThrowRegionException();
7169

7270
void addRegionWaitIndexTimeout(RegionID region_id, UInt64 index_to_wait, UInt64 current_applied_index);
7371

74-
String toDebugString() const
75-
{
76-
FmtBuffer buffer;
77-
buffer.append("{ids=[");
78-
buffer.joinStr(
79-
ids.begin(),
80-
ids.end(),
81-
[](const auto & v, FmtBuffer & f) { f.fmtAppend("{}", v); },
82-
"|");
83-
buffer.append("] locks=");
84-
buffer.append("[");
85-
buffer.joinStr(
86-
region_locks.begin(),
87-
region_locks.end(),
88-
[](const auto & v, FmtBuffer & f) { f.fmtAppend("{}({})", v.first, v.second->DebugString()); },
89-
"|");
90-
buffer.append("]}");
91-
return buffer.toString();
92-
}
72+
String toDebugString(size_t num_show) const;
9373

9474
private:
9575
const bool batch_cop;
9676
const bool is_wn_disagg_read;
9777

98-
RegionException::UnavailableRegions ids;
78+
struct UnavailableDesc
79+
{
80+
RegionException::RegionReadStatus s;
81+
std::string extra_msg;
82+
};
83+
std::unordered_map<RegionID, UnavailableDesc> unavailable_ids;
9984
std::vector<std::pair<RegionID, LockInfoPtr>> region_locks;
100-
RegionException::RegionReadStatus status{RegionException::RegionReadStatus::NOT_FOUND};
101-
std::string extra_msg;
10285
};
10386

10487
using RegionsReadIndexResult = std::unordered_map<RegionID, kvrpcpb::ReadIndexResponse>;

dbms/src/Storages/KVStore/Read/ReadIndex.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,9 +137,13 @@ void WaitCheckRegionReadyImpl(
137137
static constexpr double BATCH_READ_INDEX_TIME_RATE = 0.2;
138138
auto log = Logger::get(__FUNCTION__);
139139

140+
UInt64 read_index_timeout = tmt.batchReadIndexTimeout();
141+
140142
LOG_INFO(
141143
log,
142-
"start to check regions ready, min_wait_tick={:.3f}s max_wait_tick={:.3f}s wait_region_ready_timeout={:.3f}s",
144+
"start to check regions ready, read_index_timeout={} min_wait_tick={:.3f}s max_wait_tick={:.3f}s "
145+
"wait_region_ready_timeout={:.3f}s",
146+
read_index_timeout,
143147
wait_tick_time,
144148
max_wait_tick_time,
145149
get_wait_region_ready_timeout_sec);
@@ -172,7 +176,7 @@ void WaitCheckRegionReadyImpl(
172176
}
173177

174178
// Record the latest commit index in TiKV
175-
auto read_index_res = kvstore.batchReadIndex(batch_read_index_req, tmt.batchReadIndexTimeout());
179+
auto read_index_res = kvstore.batchReadIndex(batch_read_index_req, read_index_timeout);
176180
for (auto && [resp, region_id] : read_index_res)
177181
{
178182
bool need_retry = resp.read_index() == 0;

dbms/src/Storages/KVStore/Read/ReadIndexWorker.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121
#include <common/logger_useful.h>
2222
#include <kvproto/kvrpcpb.pb.h>
2323

24-
#include <algorithm>
25-
#include <condition_variable>
2624
#include <memory>
2725
#include <thread>
2826

@@ -32,6 +30,7 @@ namespace tests
3230
{
3331
class ReadIndexTest;
3432
} // namespace tests
33+
class KVStore;
3534

3635
struct AsyncWaker
3736
{
@@ -95,9 +94,7 @@ class ReadIndexWorkerManager : boost::noncopyable
9594
void runOneRound(SteadyClock::duration min_dur, size_t id);
9695
void stop();
9796
~ReadIndexWorkerManager();
98-
BatchReadIndexRes batchReadIndex(
99-
const std::vector<kvrpcpb::ReadIndexRequest> & reqs,
100-
uint64_t timeout_ms = 10 * 1000);
97+
BatchReadIndexRes batchReadIndex(const std::vector<kvrpcpb::ReadIndexRequest> & reqs, uint64_t timeout_ms);
10198

10299
static std::unique_ptr<ReadIndexWorkerManager> newReadIndexWorkerManager(
103100
const TiFlashRaftProxyHelper & proxy_helper,

dbms/src/Storages/KVStore/Read/ReadIndexWorkerManager.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include <Common/setThreadName.h>
1616
#include <Storages/KVStore/Read/ReadIndexWorkerImpl.h>
17+
#include <common/logger_useful.h>
1718

1819
namespace DB
1920
{
@@ -244,8 +245,16 @@ BatchReadIndexRes ReadIndexWorkerManager::batchReadIndex(
244245
}
245246
else
246247
{
248+
// The read index request might be dropped by a leader/candidate/follower.
249+
// The learner will retry the read index request internally in raftstore.
250+
// See https://github.com/tikv/tikv/pull/19071. The default retry interval is 4s at the time of writing.
251+
// Reaching this point means the request still timed out after retries.
252+
GET_METRIC(tiflash_raft_learner_read_failures_count, type_read_index_timeout).Increment();
253+
// Generate a "region not found" error response for the region that still has no response after timeout
247254
kvrpcpb::ReadIndexResponse tmp;
248-
tmp.mutable_region_error()->mutable_region_not_found();
255+
auto * e = tmp.mutable_region_error();
256+
e->mutable_region_not_found()->set_region_id(it.first);
257+
e->set_message("tiflash read index timeout(" + std::to_string(timeout_ms) + "ms)");
249258
resps.emplace_back(std::move(tmp), it.first);
250259
}
251260
tasks.pop();

0 commit comments

Comments
 (0)