Skip to content

Commit e9e8303

Browse files
committed
[hist] Use release-acquire in SnapshotAtomic
This ensures correctness on weakly-ordered systems. If not using SnapshotAtomic, release stores would not be needed in FillAtomic. However, tests indicate that it's not measurable in practice so we use it unconditionally to keep the implementation simple.
1 parent 07cf897 commit e9e8303

6 files changed

Lines changed: 137 additions & 31 deletions

File tree

hist/histv7/benchmark/hist_benchmark_atomic.cxx

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAdd)(benchmark::State &state)
3838
}
3939
BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicAdd);
4040

41+
BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAddRelease)(benchmark::State &state)
42+
{
43+
for (auto _ : state) {
44+
ROOT::Experimental::Internal::AtomicAddRelease(&fAtomic, 1);
45+
benchmark::ClobberMemory();
46+
}
47+
}
48+
BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicAddRelease);
49+
4150
BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicLoad)(benchmark::State &state)
4251
{
4352
int load;
@@ -90,6 +99,15 @@ BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicAdd)(benchmark::State &state)
9099
}
91100
BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicAdd);
92101

102+
BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicAddRelease)(benchmark::State &state)
103+
{
104+
for (auto _ : state) {
105+
ROOT::Experimental::Internal::AtomicAddRelease(&fAtomic, 1.0f);
106+
benchmark::ClobberMemory();
107+
}
108+
}
109+
BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicAddRelease);
110+
93111
BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicLoad)(benchmark::State &state)
94112
{
95113
float load;
@@ -142,6 +160,15 @@ BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicAdd)(benchmark::State &state)
142160
}
143161
BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicAdd);
144162

163+
BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicAddRelease)(benchmark::State &state)
164+
{
165+
for (auto _ : state) {
166+
ROOT::Experimental::Internal::AtomicAddRelease(&fAtomic, 1.0);
167+
benchmark::ClobberMemory();
168+
}
169+
}
170+
BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicAddRelease);
171+
145172
BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicLoad)(benchmark::State &state)
146173
{
147174
double load;
@@ -185,14 +212,14 @@ BENCHMARK_DEFINE_F(RBinWithError, Inc)(benchmark::State &state)
185212
}
186213
BENCHMARK_REGISTER_F(RBinWithError, Inc);
187214

188-
BENCHMARK_DEFINE_F(RBinWithError, AtomicInc)(benchmark::State &state)
215+
BENCHMARK_DEFINE_F(RBinWithError, AtomicIncRelease)(benchmark::State &state)
189216
{
190217
for (auto _ : state) {
191-
fBin.AtomicInc();
218+
fBin.AtomicIncRelease();
192219
benchmark::ClobberMemory();
193220
}
194221
}
195-
BENCHMARK_REGISTER_F(RBinWithError, AtomicInc);
222+
BENCHMARK_REGISTER_F(RBinWithError, AtomicIncRelease);
196223

197224
BENCHMARK_DEFINE_F(RBinWithError, Add)(benchmark::State &state)
198225
{
@@ -203,14 +230,14 @@ BENCHMARK_DEFINE_F(RBinWithError, Add)(benchmark::State &state)
203230
}
204231
BENCHMARK_REGISTER_F(RBinWithError, Add);
205232

206-
BENCHMARK_DEFINE_F(RBinWithError, AtomicAdd)(benchmark::State &state)
233+
BENCHMARK_DEFINE_F(RBinWithError, AtomicAddRelease)(benchmark::State &state)
207234
{
208235
for (auto _ : state) {
209-
fBin.AtomicAdd(1.0);
236+
fBin.AtomicAddRelease(1.0);
210237
benchmark::ClobberMemory();
211238
}
212239
}
213-
BENCHMARK_REGISTER_F(RBinWithError, AtomicAdd);
240+
BENCHMARK_REGISTER_F(RBinWithError, AtomicAddRelease);
214241

215242
BENCHMARK_DEFINE_F(RBinWithError, AtomicLoad)(benchmark::State &state)
216243
{

hist/histv7/inc/ROOT/RBinWithError.hxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,9 @@ private:
9797
}
9898

9999
public:
100-
void AtomicInc() { AtomicAdd(1.0, 1.0); }
100+
void AtomicIncRelease() { AtomicAdd(1.0, 1.0); }
101101

102-
void AtomicAdd(double w) { AtomicAdd(w, w * w); }
102+
void AtomicAddRelease(double w) { AtomicAdd(w, w * w); }
103103

104104
/// Add another bin content using atomic instructions.
105105
///

hist/histv7/inc/ROOT/RHistEngine.hxx

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "RWeight.hxx"
1818

1919
#include <array>
20+
#include <atomic>
2021
#include <cassert>
2122
#include <cstddef>
2223
#include <cstdint>
@@ -500,7 +501,7 @@ public:
500501
RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<sizeof...(A)>(args);
501502
if (index.fValid) {
502503
assert(index.fIndex < fBinContents.size());
503-
Internal::AtomicInc(&fBinContents[index.fIndex]);
504+
Internal::AtomicIncRelease(&fBinContents[index.fIndex]);
504505
}
505506
}
506507

@@ -524,7 +525,7 @@ public:
524525
RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<sizeof...(A)>(args);
525526
if (index.fValid) {
526527
assert(index.fIndex < fBinContents.size());
527-
Internal::AtomicAdd(&fBinContents[index.fIndex], weight.fValue);
528+
Internal::AtomicAddRelease(&fBinContents[index.fIndex], weight.fValue);
528529
}
529530
}
530531

@@ -549,7 +550,7 @@ public:
549550
RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<sizeof...(A)>(args);
550551
if (index.fValid) {
551552
assert(index.fIndex < fBinContents.size());
552-
Internal::AtomicAdd(&fBinContents[index.fIndex], weight);
553+
Internal::AtomicAddRelease(&fBinContents[index.fIndex], weight);
553554
}
554555
}
555556

@@ -573,7 +574,7 @@ public:
573574
RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<N>(t);
574575
if (index.fValid) {
575576
assert(index.fIndex < fBinContents.size());
576-
Internal::AtomicAdd(&fBinContents[index.fIndex], weight.fValue);
577+
Internal::AtomicAddRelease(&fBinContents[index.fIndex], weight.fValue);
577578
}
578579
} else {
579580
FillAtomic(t);
@@ -836,6 +837,10 @@ public:
836837
BinContentType tmp;
837838
bool changed;
838839
do {
840+
// To guarantee correctness, we let the release operation(s) in FillAtomic synchronize with this acquire fence.
841+
// This ensures that all previous writes become visible side-effects and the atomic loads will see them.
842+
std::atomic_thread_fence(std::memory_order_acquire);
843+
839844
changed = false;
840845
for (std::size_t i = 0; i < fBinContents.size(); i++) {
841846
Internal::AtomicLoad(&fBinContents[i], &tmp);

hist/histv7/inc/ROOT/RHistUtils.hxx

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,18 @@ std::enable_if_t<std::is_arithmetic_v<T>, bool> AtomicCompareExchangeAcquire(T *
202202
#endif
203203
}
204204

205+
template <typename T>
206+
std::enable_if_t<std::is_arithmetic_v<T>, bool> AtomicCompareExchangeRelease(T *ptr, T *expected, T *desired)
207+
{
208+
#ifndef _MSC_VER
209+
return __atomic_compare_exchange(ptr, expected, desired, /*weak=*/false, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
210+
#else
211+
// Cannot specify the memory order directly, use an unconditional fence to avoid branching code.
212+
std::atomic_thread_fence(std::memory_order_release);
213+
return MSVC::AtomicOps<sizeof(T)>::CompareExchange(ptr, expected, desired);
214+
#endif
215+
}
216+
205217
template <typename T>
206218
std::enable_if_t<std::is_arithmetic_v<T>> AtomicAddCompareExchangeLoop(T *ptr, T val)
207219
{
@@ -214,6 +226,18 @@ std::enable_if_t<std::is_arithmetic_v<T>> AtomicAddCompareExchangeLoop(T *ptr, T
214226
}
215227
}
216228

229+
template <typename T>
230+
std::enable_if_t<std::is_arithmetic_v<T>> AtomicAddCompareExchangeReleaseLoop(T *ptr, T val)
231+
{
232+
T expected;
233+
AtomicLoad(ptr, &expected);
234+
T desired = expected + val;
235+
while (!AtomicCompareExchangeRelease(ptr, &expected, &desired)) {
236+
// expected holds the new value; try again.
237+
desired = expected + val;
238+
}
239+
}
240+
217241
#ifdef _MSC_VER
218242
namespace MSVC {
219243
inline void AtomicOps<8>::Add(void *ptr, const void *val)
@@ -243,17 +267,35 @@ std::enable_if_t<std::is_floating_point_v<T>> AtomicAdd(T *ptr, T val)
243267
AtomicAddCompareExchangeLoop(ptr, val);
244268
}
245269

270+
template <typename T>
271+
std::enable_if_t<std::is_integral_v<T>> AtomicAddRelease(T *ptr, T val)
272+
{
273+
#ifndef _MSC_VER
274+
__atomic_fetch_add(ptr, val, __ATOMIC_RELEASE);
275+
#else
276+
// Cannot specify the memory order directly, use a fence.
277+
std::atomic_thread_fence(std::memory_order_release);
278+
MSVC::AtomicOps<sizeof(T)>::Add(ptr, &val);
279+
#endif
280+
}
281+
282+
template <typename T>
283+
std::enable_if_t<std::is_floating_point_v<T>> AtomicAddRelease(T *ptr, T val)
284+
{
285+
AtomicAddCompareExchangeReleaseLoop(ptr, val);
286+
}
287+
246288
// For adding a double-precision weight to a single-precision bin content type, cast the argument once before the
247289
// compare-exchange loop.
248-
static inline void AtomicAdd(float *ptr, double val)
290+
static inline void AtomicAddRelease(float *ptr, double val)
249291
{
250-
AtomicAdd(ptr, static_cast<float>(val));
292+
AtomicAddRelease(ptr, static_cast<float>(val));
251293
}
252294

253295
template <typename T>
254-
std::enable_if_t<std::is_arithmetic_v<T>> AtomicInc(T *ptr)
296+
std::enable_if_t<std::is_arithmetic_v<T>> AtomicIncRelease(T *ptr)
255297
{
256-
AtomicAdd(ptr, static_cast<T>(1));
298+
AtomicAddRelease(ptr, static_cast<T>(1));
257299
}
258300

259301
template <typename T, typename U>
@@ -262,10 +304,16 @@ auto AtomicAdd(T *ptr, const U &add) -> decltype(ptr->AtomicAdd(add))
262304
return ptr->AtomicAdd(add);
263305
}
264306

307+
template <typename T, typename U>
308+
auto AtomicAddRelease(T *ptr, const U &add) -> decltype(ptr->AtomicAddRelease(add))
309+
{
310+
return ptr->AtomicAddRelease(add);
311+
}
312+
265313
template <typename T>
266-
auto AtomicInc(T *ptr) -> decltype(ptr->AtomicInc())
314+
auto AtomicIncRelease(T *ptr) -> decltype(ptr->AtomicIncRelease())
267315
{
268-
return ptr->AtomicInc();
316+
return ptr->AtomicIncRelease();
269317
}
270318

271319
} // namespace Internal

hist/histv7/test/hist_atomic.cxx

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@ class RHistAtomic : public testing::Test {};
1313
using AtomicTypes = testing::Types<char, short, int, long, long long, float, double>;
1414
TYPED_TEST_SUITE(RHistAtomic, AtomicTypes);
1515

16-
TYPED_TEST(RHistAtomic, AtomicInc)
16+
TYPED_TEST(RHistAtomic, AtomicIncRelease)
1717
{
1818
TypeParam a = 1;
19-
ROOT::Experimental::Internal::AtomicInc(&a);
19+
ROOT::Experimental::Internal::AtomicIncRelease(&a);
2020
EXPECT_EQ(a, 2);
2121
}
2222

@@ -28,7 +28,15 @@ TYPED_TEST(RHistAtomic, AtomicAdd)
2828
EXPECT_EQ(a, 3);
2929
}
3030

31-
// AtomicInc is implemented in terms of AtomicAdd, so it's sufficient to stress one of them.
31+
TYPED_TEST(RHistAtomic, AtomicAddRelease)
32+
{
33+
TypeParam a = 1;
34+
const TypeParam b = 2;
35+
ROOT::Experimental::Internal::AtomicAddRelease(&a, b);
36+
EXPECT_EQ(a, 3);
37+
}
38+
39+
// AtomicInc* is implemented in terms of AtomicAdd*, so it's sufficient to stress one of them.
3240
TYPED_TEST(RHistAtomic, StressAtomicAdd)
3341
{
3442
static constexpr TypeParam Addend = 1;
@@ -47,25 +55,43 @@ TYPED_TEST(RHistAtomic, StressAtomicAdd)
4755
EXPECT_EQ(a, NAdds * Addend);
4856
}
4957

50-
TEST(AtomicAdd, FloatDouble)
58+
TYPED_TEST(RHistAtomic, StressAtomicAddRelease)
59+
{
60+
static constexpr TypeParam Addend = 1;
61+
static constexpr std::size_t NThreads = 4;
62+
// Reduce number of additions for char to avoid overflow.
63+
static constexpr std::size_t NAddsPerThread = sizeof(TypeParam) == 1 ? 20 : 8000;
64+
static constexpr std::size_t NAdds = NThreads * NAddsPerThread;
65+
66+
TypeParam a = 0;
67+
StressInParallel(NThreads, [&] {
68+
for (std::size_t i = 0; i < NAddsPerThread; i++) {
69+
ROOT::Experimental::Internal::AtomicAddRelease(&a, Addend);
70+
}
71+
});
72+
73+
EXPECT_EQ(a, NAdds * Addend);
74+
}
75+
76+
TEST(AtomicAddRelease, FloatDouble)
5177
{
5278
float a = 1;
5379
const double b = 2;
54-
ROOT::Experimental::Internal::AtomicAdd(&a, b);
80+
ROOT::Experimental::Internal::AtomicAddRelease(&a, b);
5581
EXPECT_EQ(a, 3);
5682
}
5783

58-
TEST(RBinWithError, AtomicAdd)
84+
TEST(RBinWithError, AtomicAddRelease)
5985
{
6086
RBinWithError bin;
6187
bin.fSum = 1;
6288
bin.fSum2 = 2;
63-
bin.AtomicAdd(1.5);
89+
bin.AtomicAddRelease(1.5);
6490
EXPECT_EQ(bin.fSum, 2.5);
6591
EXPECT_EQ(bin.fSum2, 4.25);
6692
}
6793

68-
TEST(RBinWithError, StressAtomicAdd)
94+
TEST(RBinWithError, StressAtomicAddRelease)
6995
{
7096
static constexpr double Addend = 2.0;
7197
static constexpr std::size_t NThreads = 4;
@@ -75,7 +101,7 @@ TEST(RBinWithError, StressAtomicAdd)
75101
RBinWithError bin;
76102
StressInParallel(NThreads, [&] {
77103
for (std::size_t i = 0; i < NAddsPerThread; i++) {
78-
bin.AtomicAdd(Addend);
104+
bin.AtomicAddRelease(Addend);
79105
}
80106
});
81107

@@ -120,7 +146,7 @@ TEST(RBinWithError, StressAtomicLoad)
120146
}
121147
} else {
122148
for (std::size_t i = 0; i < NAddsPerThread; i++) {
123-
bin.AtomicAdd(Addend);
149+
bin.AtomicAddRelease(Addend);
124150
}
125151
}
126152
});

hist/histv7/test/hist_user.cxx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,11 @@ struct User {
5454
return *this;
5555
}
5656

57-
void AtomicInc() { ROOT::Experimental::Internal::AtomicInc(&fValue); }
57+
void AtomicIncRelease() { ROOT::Experimental::Internal::AtomicIncRelease(&fValue); }
5858

59-
void AtomicAdd(double w) { ROOT::Experimental::Internal::AtomicAdd(&fValue, w); }
59+
void AtomicAddRelease(double w) { ROOT::Experimental::Internal::AtomicAddRelease(&fValue, w); }
6060

61-
void AtomicAdd(const UserWeight &w) { ROOT::Experimental::Internal::AtomicAdd(&fValue, w.fWeight); }
61+
void AtomicAddRelease(const UserWeight &w) { ROOT::Experimental::Internal::AtomicAddRelease(&fValue, w.fWeight); }
6262

6363
void AtomicAdd(const User &rhs) { ROOT::Experimental::Internal::AtomicAdd(&fValue, rhs.fValue); }
6464

0 commit comments

Comments
 (0)