@@ -86,6 +86,39 @@ void* run_create_span_func() {
8686 return BAIDU_GET_VOLATILE_THREAD_LOCAL (tls_bls).rpcz_parent_span ;
8787}
8888
89+ AtomicInteger128::Value AtomicInteger128::load () const {
90+ #if __x86_64__ || __ARM_NEON
91+ // Supress compiler warning.
92+ (void )_mutex;
93+ #endif // __x86_64__ || __ARM_NEON
94+
95+ #if __x86_64__ || __ARM_NEON
96+ #ifdef __x86_64__
97+ __m128i value = _mm_load_si128 (reinterpret_cast <__m128i*>(&_value));
98+ #else // __ARM_NEON
99+ int64x2_t value = vld1q_s64 (reinterpret_cast <const int64_t *>(&_value));
100+ #endif // __x86_64__
101+ return {value[0 ], value[1 ]};
102+ #else // __x86_64__ || __ARM_NEON
103+ BAIDU_SCOPED_LOCK (g->_mutex );
104+ return _value;
105+ #endif // __x86_64__ || __ARM_NEON
106+ }
107+
108+ void AtomicInteger128::store (Value value) {
109+ #if __x86_64__
110+ __m128i v = _mm_load_si128 (reinterpret_cast <const __m128i*>(&value));
111+ _mm_store_si128 (reinterpret_cast <__m128i*>(&_value), v);
112+ #elif __ARM_NEON
113+ int64x2_t v = vld1q_s64 (reinterpret_cast <const int64_t *>(&value));
114+ vst1q_s64 (reinterpret_cast <int64_t *>(&_value), v);
115+ #else
116+ BAIDU_SCOPED_LOCK (g->_mutex );
117+ _value = value;
118+ #endif // __x86_64__ || __ARM_NEON
119+ }
120+
121+
89122int TaskGroup::get_attr (bthread_t tid, bthread_attr_t * out) {
90123 TaskMeta* const m = address_meta (tid);
91124 if (m != NULL ) {
@@ -152,6 +185,16 @@ static double get_cumulated_cputime_from_this(void* arg) {
152185 return static_cast <TaskGroup*>(arg)->cumulated_cputime_ns () / 1000000000.0 ;
153186}
154187
188+ int64_t TaskGroup::cumulated_cputime_ns () const {
189+ CPUTimeStat cpu_time_stat = _cpu_time_stat.load ();
190+ // Add the elapsed time of running bthread.
191+ int64_t cumulated_cputime_ns = cpu_time_stat.cumulated_cputime_ns ();
192+ if (!cpu_time_stat.is_main_task ()) {
193+ cumulated_cputime_ns += butil::cpuwide_time_ns () - cpu_time_stat.last_run_ns ();
194+ }
195+ return cumulated_cputime_ns;
196+ }
197+
155198void TaskGroup::run_main_task () {
156199 bvar::PassiveStatus<double > cumulated_cputime (
157200 get_cumulated_cputime_from_this, this );
@@ -160,11 +203,11 @@ void TaskGroup::run_main_task() {
160203 TaskGroup* dummy = this ;
161204 bthread_t tid;
162205 while (wait_task (&tid)) {
163- TaskGroup:: sched_to (&dummy, tid);
206+ sched_to (&dummy, tid);
164207 DCHECK_EQ (this , dummy);
165208 DCHECK_EQ (_cur_meta->stack , _main_stack);
166209 if (_cur_meta->tid != _main_tid) {
167- TaskGroup:: task_runner (1 /* skip remained*/ );
210+ task_runner (1 /* skip remained*/ );
168211 }
169212 if (FLAGS_show_per_worker_usage_in_vars && !usage_bvar) {
170213 char name[32 ];
@@ -181,17 +224,12 @@ void TaskGroup::run_main_task() {
181224 }
182225 // Don't forget to add elapse of last wait_task.
183226 current_task ()->stat .cputime_ns +=
184- butil::cpuwide_time_ns () - std::abs (_cpu_time_stat.last_run_ns );
227+ butil::cpuwide_time_ns () - std::abs (_cpu_time_stat.load_unsafe (). last_run_ns () );
185228}
186229
187230TaskGroup::TaskGroup (TaskControl* c)
188231 : _control(c) {
189232 CHECK (c);
190- #if __x86_64__ || __ARM_NEON
191- // Supress compiler warning.
192- (void )_cpu_time_stat_mutex;
193- #endif // __x86_64__ || __ARM_NEON
194-
195233}
196234
197235TaskGroup::~TaskGroup () {
@@ -282,8 +320,12 @@ int TaskGroup::init(size_t runqueue_capacity) {
282320 _cur_meta = m;
283321 _main_tid = m->tid ;
284322 _main_stack = stk;
285- _cpu_time_stat.last_run_ns = -m->cpuwide_start_ns ;
323+
324+ CPUTimeStat cpu_time_stat;
325+ cpu_time_stat.set_last_run_ns (m->cpuwide_start_ns , true );
326+ _cpu_time_stat.store (cpu_time_stat);
286327 _last_cpu_clock_ns = 0 ;
328+
287329 return 0 ;
288330}
289331
@@ -404,7 +446,7 @@ void TaskGroup::task_runner(intptr_t skip_remained) {
404446
405447 g->_control ->_nbthreads << -1 ;
406448 g->_control ->tag_nbthreads (g->tag ()) << -1 ;
407- g->set_remained (TaskGroup:: _release_last_context, m);
449+ g->set_remained (_release_last_context, m);
408450 ending_sched (&g);
409451
410452 } while (g->_cur_meta ->tid != g->_main_tid );
@@ -481,12 +523,10 @@ int TaskGroup::start_foreground(TaskGroup** pg,
481523 fn = ready_to_run_in_worker;
482524 }
483525 ReadyToRunArgs args = {
484- g->tag (),
485- g->_cur_meta ,
486- (bool )(using_attr.flags & BTHREAD_NOSIGNAL)
526+ g->tag (), g->_cur_meta , (bool )(using_attr.flags & BTHREAD_NOSIGNAL)
487527 };
488528 g->set_remained (fn, &args);
489- TaskGroup:: sched_to (pg, m->tid );
529+ sched_to (pg, m->tid );
490530 }
491531 return 0 ;
492532}
@@ -668,13 +708,18 @@ void TaskGroup::sched_to(TaskGroup** pg, TaskMeta* next_meta, bool cur_ending) {
668708 }
669709#endif
670710 // Save errno so that errno is bthread-specific.
671- const int saved_errno = errno;
711+ int saved_errno = errno;
672712 void * saved_unique_user_ptr = tls_unique_user_ptr;
673713
674714 TaskMeta* const cur_meta = g->_cur_meta ;
675- const int64_t now = butil::cpuwide_time_ns ();
676- const int64_t elp_ns = now - std::abs (g->_cpu_time_stat .last_run_ns );
715+ int64_t now = butil::cpuwide_time_ns ();
716+ CPUTimeStat cpu_time_stat = g->_cpu_time_stat .load_unsafe ();
717+ int64_t elp_ns = now - cpu_time_stat.last_run_ns ();
677718 cur_meta->stat .cputime_ns += elp_ns;
719+ // Update cpu_time_stat.
720+ cpu_time_stat.set_last_run_ns (now, is_main_task (g, next_meta->tid ));
721+ cpu_time_stat.add_cumulated_cputime_ns (elp_ns, is_main_task (g, cur_meta->tid ));
722+ g->_cpu_time_stat .store (cpu_time_stat);
678723
679724 if (FLAGS_bthread_enable_cpu_clock_stat) {
680725 const int64_t cpu_thread_time = butil::cputhread_time_ns ();
@@ -686,36 +731,6 @@ void TaskGroup::sched_to(TaskGroup** pg, TaskMeta* next_meta, bool cur_ending) {
686731 g->_last_cpu_clock_ns = 0 ;
687732 }
688733
689- #if __x86_64__ || __ARM_NEON
690- // Refer to https://rigtorp.se/isatomic/, On the modern CPU microarchitectures
691- // (Skylake and Zen 2) AVX/AVX2 128b/256b aligned loads and stores are atomic
692- // even though Intel and AMD officially doesn’t guarantee this.
693- CPUTimeStat cpu_time_stat{
694- next_meta->tid != g->main_tid () ? now : -now,
695- g->_cpu_time_stat .cumulated_cputime_ns
696- };
697- if (cur_meta->tid != g->main_tid ()) {
698- cpu_time_stat.cumulated_cputime_ns += elp_ns;
699- }
700- #if __x86_64__
701- // On X86, SSE instructions can ensure atomic loads and stores.
702- __m128i value = _mm_load_si128 (reinterpret_cast <__m128i*>(&cpu_time_stat));
703- _mm_store_si128 (reinterpret_cast <__m128i*>(&g->_cpu_time_stat ), value);
704- #else // __ARM_NEON
705- // Starting from Armv8.4-A, neon can ensure atomic loads and stores.
706- int64x2_t value = vld1q_s64 (reinterpret_cast <const int64_t *>(&cpu_time_stat));
707- vst1q_s64 (reinterpret_cast <int64_t *>(&g->_cpu_time_stat ), value);
708- #endif // __x86_64__
709- #else // __x86_64__ || __ARM_NEON
710- {
711- BAIDU_SCOPED_LOCK (g->_cpu_time_stat_mutex );
712- g->_cpu_time_stat .last_run_ns = next_meta->tid != g->main_tid () ? now : -now;
713- if (cur_meta->tid != g->main_tid ()) {
714- g->_cpu_time_stat .cumulated_cputime_ns += elp_ns;
715- }
716- }
717- #endif // __x86_64__ || __ARM_NEON
718-
719734 ++cur_meta->stat .nswitch ;
720735 ++ g->_nswitch ;
721736 // Switch to the task
@@ -1047,14 +1062,14 @@ int TaskGroup::interrupt(bthread_t tid, TaskControl* c, bthread_tag_t tag) {
10471062 }
10481063 } else if (sleep_id != 0 ) {
10491064 if (get_global_timer_thread ()->unschedule (sleep_id) == 0 ) {
1050- bthread:: TaskGroup* g = bthread:: tls_task_group;
1065+ TaskGroup* g = tls_task_group;
10511066 if (g) {
1052- g->ready_to_run (TaskGroup:: address_meta (tid));
1067+ g->ready_to_run (address_meta (tid));
10531068 } else {
10541069 if (!c) {
10551070 return EINVAL;
10561071 }
1057- c->choose_one_group (tag)->ready_to_run_remote (TaskGroup:: address_meta (tid));
1072+ c->choose_one_group (tag)->ready_to_run_remote (address_meta (tid));
10581073 }
10591074 }
10601075 }
0 commit comments