Skip to content

Commit 9a6dda3

Browse files
author
huangjun
committed
Fix time source mismatch in client RPC path after #3268 migration
PR #3268 ("Use monotonic time instead of wall time") migrated several hot paths from butil::gettimeofday_us() to butil::cpuwide_time_us(), including LocalityAwareLoadBalancer::Weight::Update which now reads end_time_us from the cpuwide clock. The migration was incomplete on the caller side: Channel::CallMethod and the retry sites in Controller::OnVersionedRPCReturned still passed gettimeofday_us() as the begin time, which is then plumbed through Controller::IssueRPC -> Controller::Call::begin_time_us -> SelectIn::begin_time_us -> CallInfo::begin_time_us into Weight::Update. In Weight::Update the latency is computed as latency = end_time_us - ci.begin_time_us = cpuwide_now - wallclock_begin ~= -1.7e15 us (huge negative) triggering the if (latency <= 0) { /* time skews, ignore the sample */ return 0; } short-circuit. _time_q never accumulates samples, _avg_latency stays at 0, and locality-aware feedback is silently disabled. The dual time domain also leaks into Controller::latency_us() (controller.h:215) where _begin_time_us was wallclock and the fallback used cpuwide. Visible downstream effect: cold-start `list://` channels with `lb=la` and 2 backends occasionally fail RPCs with EHOSTDOWN (`Fail to select server from list://...`) on retry even when one backend is healthy. Bisected reproduction in xsky/brpc fork: - 51-commit range c41e838..604dad0c (1.16.1 .. 1.17.0-rc2) - master code + LA-driven multipath probe at 2 backends, max_retry=1, repeat 500x: * commit 771de31 (one before #3268): 0/500 fail * commit 12fb539 (#3268): 25/500 fail * commit 12fb539 + revert only Weight::Update::end_time_us back to gettimeofday: 0/500 fail - Reverting the single end_time_us assignment confirms the time source mismatch is the regression vector. 
This commit aligns the client-side callers with #3268's intent, so all RPC time accounting uses cpuwide_time_us consistently: src/brpc/channel.cpp:451 CallMethod entry begin time src/brpc/channel.cpp:628 sync RPC end time src/brpc/controller.cpp:672 backup-request retry IssueRPC src/brpc/controller.cpp:705 retry-backoff deadline check src/brpc/controller.cpp:715 regular retry IssueRPC src/brpc/controller.cpp:803 circuit-breaker feedback latency src/brpc/controller.cpp:980 async OnRPCEnd src/brpc/controller.cpp:1019 backup-thread OnRPCEnd Adds test/brpc_load_balancer_unittest.cpp::la_records_latency_with_consistent_time_source, which documents the LB-side invariant: when CallInfo::begin_time_us is in the cpuwide_time domain (matching Weight::Update::end_time_us) the LA feedback path produces a positive _avg_latency; when it is in the wallclock domain the time-skew short-circuit fires and _avg_latency stays 0.
1 parent 5fdb0d8 commit 9a6dda3

3 files changed

Lines changed: 86 additions & 8 deletions

File tree

src/brpc/channel.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,7 @@ void Channel::CallMethod(const google::protobuf::MethodDescriptor* method,
448448
const google::protobuf::Message* request,
449449
google::protobuf::Message* response,
450450
google::protobuf::Closure* done) {
451-
const int64_t start_send_real_us = butil::gettimeofday_us();
451+
const int64_t start_send_real_us = butil::cpuwide_time_us();
452452
Controller* cntl = static_cast<Controller*>(controller_base);
453453
cntl->OnRPCBegin(start_send_real_us);
454454
// Override max_retry first to reset the range of correlation_id
@@ -625,7 +625,7 @@ void Channel::CallMethod(const google::protobuf::MethodDescriptor* method,
625625
// fails after retry)
626626
Join(correlation_id);
627627
cntl->SubmitSpan();
628-
cntl->OnRPCEnd(butil::gettimeofday_us());
628+
cntl->OnRPCEnd(butil::cpuwide_time_us());
629629
}
630630
}
631631

src/brpc/controller.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -669,7 +669,7 @@ void Controller::OnVersionedRPCReturned(const CompletionInfo& info,
669669
}
670670
++_current_call.nretry;
671671
add_flag(FLAGS_BACKUP_REQUEST);
672-
return IssueRPC(butil::gettimeofday_us());
672+
return IssueRPC(butil::cpuwide_time_us());
673673
} else {
674674
auto retry_policy = _retry_policy ? _retry_policy : DefaultRetryPolicy();
675675
if (retry_policy->DoRetry(this)) {
@@ -702,7 +702,7 @@ void Controller::OnVersionedRPCReturned(const CompletionInfo& info,
702702
bthread::TaskGroup* g = bthread::tls_task_group;
703703
int64_t backoff_time_us = retry_policy->GetBackoffTimeMs(this) * 1000L;
704704
if (backoff_time_us > 0 &&
705-
backoff_time_us < _deadline_us - butil::gettimeofday_us()) {
705+
backoff_time_us < _deadline_us - butil::cpuwide_time_us()) {
706706
// No need to do retry backoff when the backoff time is longer than the remaining rpc time.
707707
if (retry_policy->CanRetryBackoffInPthread() ||
708708
(g && !g->is_current_pthread_task())) {
@@ -712,7 +712,7 @@ void Controller::OnVersionedRPCReturned(const CompletionInfo& info,
712712
"skip retry backoff in pthread.";
713713
}
714714
}
715-
return IssueRPC(butil::gettimeofday_us());
715+
return IssueRPC(butil::cpuwide_time_us());
716716
}
717717
}
718718

@@ -800,7 +800,7 @@ void Controller::Call::OnComplete(
800800

801801
if (enable_circuit_breaker) {
802802
sending_sock->FeedbackCircuitBreaker(error_code,
803-
butil::gettimeofday_us() - begin_time_us);
803+
butil::cpuwide_time_us() - begin_time_us);
804804
}
805805
}
806806

@@ -977,7 +977,7 @@ void Controller::EndRPC(const CompletionInfo& info) {
977977
// Join is not signalled when the done does not Run() and the done
978978
// can't Run() because all backup threads are blocked by Join().
979979

980-
OnRPCEnd(butil::gettimeofday_us());
980+
OnRPCEnd(butil::cpuwide_time_us());
981981
const bool destroy_cid_in_done = has_flag(FLAGS_DESTROY_CID_IN_DONE);
982982
_done->Run();
983983
// NOTE: Don't touch this Controller anymore, because it's likely to be
@@ -1016,7 +1016,7 @@ void Controller::RunDoneInBackupThread(void* arg) {
10161016
void Controller::DoneInBackupThread() {
10171017
// OnRPCEnd for sync RPC is called in Channel::CallMethod to count in
10181018
// latency of the context-switch.
1019-
OnRPCEnd(butil::gettimeofday_us());
1019+
OnRPCEnd(butil::cpuwide_time_us());
10201020
const CallId saved_cid = _correlation_id;
10211021
const bool destroy_cid_in_done = has_flag(FLAGS_DESTROY_CID_IN_DONE);
10221022
_done->Run();

test/brpc_load_balancer_unittest.cpp

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1303,6 +1303,84 @@ TEST_F(LoadBalancerTest, revived_from_all_failed_intergrated) {
13031303
}
13041304
#endif // BUTIL_USE_ASAN
13051305

1306+
// Regression for #3268's incomplete migration. Weight::Update reads
1307+
// `end_time_us = butil::cpuwide_time_us()`, so every caller that supplies a
1308+
// `CallInfo::begin_time_us` MUST also be in the cpuwide_time domain.
1309+
// Before this fix, `Channel::CallMethod` (and the retry sites in
1310+
// `Controller::OnVersionedRPCReturned`) supplied `butil::gettimeofday_us()`,
1311+
// making `latency = end - begin` a huge negative number. Weight::Update then
1312+
// hit the `if (latency <= 0) { /* time skews, ignore */ return 0; }`
1313+
// short-circuit, _time_q never accumulated samples, _avg_latency stayed at 0
1314+
// and the locality-aware feedback was effectively disabled. Visible
1315+
// downstream symptom: cold-start `list://` channels with `lb=la` and 2
1316+
// backends occasionally fail RPCs with `EHOSTDOWN` (Fail to select server)
1317+
// on retry even when one backend is healthy.
1318+
//
1319+
// This test runs purely against LocalityAwareLoadBalancer (no Server / Channel
1320+
// is involved), so it is hermetic. It exercises only the invariant that
1321+
// Weight::Update relies on: matching cpuwide begin/end produces a positive
1322+
// latency sample.
1323+
TEST_F(LoadBalancerTest, la_records_latency_with_consistent_time_source) {
1324+
LALB lalb;
1325+
char addr[] = "192.168.1.1:8080";
1326+
butil::EndPoint dummy;
1327+
ASSERT_EQ(0, str2endpoint(addr, &dummy));
1328+
brpc::ServerId id(8888);
1329+
brpc::SocketOptions options;
1330+
options.remote_side = dummy;
1331+
ASSERT_EQ(0, brpc::Socket::Create(options, &id.id));
1332+
ASSERT_TRUE(lalb.AddServer(id));
1333+
1334+
auto avg_latency = [&]() -> int64_t {
1335+
std::ostringstream os;
1336+
brpc::DescribeOptions opts;
1337+
opts.verbose = true;
1338+
lalb.Describe(os, opts);
1339+
const std::string s = os.str();
1340+
const size_t p = s.find("avg_latency=");
1341+
if (p == std::string::npos) return -1;
1342+
return strtoll(s.c_str() + p + strlen("avg_latency="), NULL, 10);
1343+
};
1344+
1345+
// Path A (regression repro): caller supplies wall-clock begin_time_us.
1346+
// Weight::Update sees `latency = cpuwide_now - wallclock_begin = HUGE
1347+
// NEGATIVE`, drops the sample. _avg_latency stays at 0.
1348+
for (int i = 0; i < 8; ++i) {
1349+
const int64_t bad_begin = butil::gettimeofday_us();
1350+
brpc::SocketUniquePtr ptr;
1351+
brpc::LoadBalancer::SelectIn in = { bad_begin, true, false, 0u, NULL };
1352+
brpc::LoadBalancer::SelectOut out(&ptr);
1353+
ASSERT_EQ(0, lalb.SelectServer(in, &out));
1354+
bthread_usleep(2000);
1355+
brpc::LoadBalancer::CallInfo ci = { bad_begin, id.id, 0, NULL };
1356+
lalb.Feedback(ci);
1357+
}
1358+
EXPECT_EQ(0, avg_latency())
1359+
<< "Wall-clock begin_time_us must trip the time-skew short-circuit; "
1360+
<< "if avg_latency is non-zero here, either Weight::Update changed "
1361+
<< "its time source or the loop above is misconfigured.";
1362+
1363+
// Path B (post-fix): caller supplies cpuwide begin_time_us, matching
1364+
// Weight::Update::end_time_us. _avg_latency now reflects the actual
1365+
// elapsed latency (~2ms = 2000us).
1366+
for (int i = 0; i < 8; ++i) {
1367+
const int64_t good_begin = butil::cpuwide_time_us();
1368+
brpc::SocketUniquePtr ptr;
1369+
brpc::LoadBalancer::SelectIn in = { good_begin, true, false, 0u, NULL };
1370+
brpc::LoadBalancer::SelectOut out(&ptr);
1371+
ASSERT_EQ(0, lalb.SelectServer(in, &out));
1372+
bthread_usleep(2000);
1373+
brpc::LoadBalancer::CallInfo ci = { good_begin, id.id, 0, NULL };
1374+
lalb.Feedback(ci);
1375+
}
1376+
EXPECT_GT(avg_latency(), 0)
1377+
<< "cpuwide begin_time_us should yield positive latency feedback. "
1378+
<< "If this is 0, callers in Channel/Controller may have regressed "
1379+
<< "back to gettimeofday_us() for begin_time_us.";
1380+
1381+
ASSERT_EQ(0, brpc::Socket::SetFailed(id.id));
1382+
}
1383+
13061384
TEST_F(LoadBalancerTest, la_selection_too_long) {
13071385
brpc::GlobalInitializeOrDie();
13081386
brpc::LoadBalancerWithNaming lb;

0 commit comments

Comments
 (0)