Skip to content

Commit c957255

Browse files
committed
[hist] Use release-acquire in SnapshotAtomic
This ensures correctness on weakly-ordered systems. If not using SnapshotAtomic, release stores would not be needed in FillAtomic. However, tests indicate that it's not measurable in practice so we use it unconditionally to keep the implementation simple.
1 parent ae96b56 commit c957255

6 files changed

Lines changed: 140 additions & 34 deletions

File tree

hist/histv7/benchmark/hist_benchmark_atomic.cxx

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAdd)(benchmark::State &state)
3838
}
3939
BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicAdd);
4040

41+
BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicAddRelease)(benchmark::State &state)
42+
{
43+
for (auto _ : state) {
44+
ROOT::Experimental::Internal::AtomicAddRelease(&fAtomic, 1);
45+
benchmark::ClobberMemory();
46+
}
47+
}
48+
BENCHMARK_REGISTER_F(RHistAtomic_int, AtomicAddRelease);
49+
4150
BENCHMARK_DEFINE_F(RHistAtomic_int, AtomicLoad)(benchmark::State &state)
4251
{
4352
int load;
@@ -90,6 +99,15 @@ BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicAdd)(benchmark::State &state)
9099
}
91100
BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicAdd);
92101

102+
BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicAddRelease)(benchmark::State &state)
103+
{
104+
for (auto _ : state) {
105+
ROOT::Experimental::Internal::AtomicAddRelease(&fAtomic, 1.0f);
106+
benchmark::ClobberMemory();
107+
}
108+
}
109+
BENCHMARK_REGISTER_F(RHistAtomic_float, AtomicAddRelease);
110+
93111
BENCHMARK_DEFINE_F(RHistAtomic_float, AtomicLoad)(benchmark::State &state)
94112
{
95113
float load;
@@ -142,6 +160,15 @@ BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicAdd)(benchmark::State &state)
142160
}
143161
BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicAdd);
144162

163+
BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicAddRelease)(benchmark::State &state)
164+
{
165+
for (auto _ : state) {
166+
ROOT::Experimental::Internal::AtomicAddRelease(&fAtomic, 1.0);
167+
benchmark::ClobberMemory();
168+
}
169+
}
170+
BENCHMARK_REGISTER_F(RHistAtomic_double, AtomicAddRelease);
171+
145172
BENCHMARK_DEFINE_F(RHistAtomic_double, AtomicLoad)(benchmark::State &state)
146173
{
147174
double load;
@@ -185,14 +212,14 @@ BENCHMARK_DEFINE_F(RBinWithError, Inc)(benchmark::State &state)
185212
}
186213
BENCHMARK_REGISTER_F(RBinWithError, Inc);
187214

188-
BENCHMARK_DEFINE_F(RBinWithError, AtomicInc)(benchmark::State &state)
215+
BENCHMARK_DEFINE_F(RBinWithError, AtomicIncRelease)(benchmark::State &state)
189216
{
190217
for (auto _ : state) {
191-
fBin.AtomicInc();
218+
fBin.AtomicIncRelease();
192219
benchmark::ClobberMemory();
193220
}
194221
}
195-
BENCHMARK_REGISTER_F(RBinWithError, AtomicInc);
222+
BENCHMARK_REGISTER_F(RBinWithError, AtomicIncRelease);
196223

197224
BENCHMARK_DEFINE_F(RBinWithError, Add)(benchmark::State &state)
198225
{
@@ -203,14 +230,14 @@ BENCHMARK_DEFINE_F(RBinWithError, Add)(benchmark::State &state)
203230
}
204231
BENCHMARK_REGISTER_F(RBinWithError, Add);
205232

206-
BENCHMARK_DEFINE_F(RBinWithError, AtomicAdd)(benchmark::State &state)
233+
BENCHMARK_DEFINE_F(RBinWithError, AtomicAddRelease)(benchmark::State &state)
207234
{
208235
for (auto _ : state) {
209-
fBin.AtomicAdd(1.0);
236+
fBin.AtomicAddRelease(1.0);
210237
benchmark::ClobberMemory();
211238
}
212239
}
213-
BENCHMARK_REGISTER_F(RBinWithError, AtomicAdd);
240+
BENCHMARK_REGISTER_F(RBinWithError, AtomicAddRelease);
214241

215242
BENCHMARK_DEFINE_F(RBinWithError, AtomicLoad)(benchmark::State &state)
216243
{

hist/histv7/inc/ROOT/RBinWithError.hxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,9 @@ private:
9797
}
9898

9999
public:
100-
void AtomicInc() { AtomicAdd(1.0, 1.0); }
100+
void AtomicIncRelease() { AtomicAdd(1.0, 1.0); }
101101

102-
void AtomicAdd(double w) { AtomicAdd(w, w * w); }
102+
void AtomicAddRelease(double w) { AtomicAdd(w, w * w); }
103103

104104
/// Add another bin content using atomic instructions.
105105
///

hist/histv7/inc/ROOT/RHistEngine.hxx

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "RWeight.hxx"
1818

1919
#include <array>
20+
#include <atomic>
2021
#include <cassert>
2122
#include <cstddef>
2223
#include <cstdint>
@@ -501,7 +502,7 @@ public:
501502
RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<sizeof...(A)>(args);
502503
if (index.fValid) {
503504
assert(index.fIndex < fBinContents.size());
504-
Internal::AtomicInc(&fBinContents[index.fIndex]);
505+
Internal::AtomicIncRelease(&fBinContents[index.fIndex]);
505506
}
506507
}
507508

@@ -525,7 +526,7 @@ public:
525526
RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<sizeof...(A)>(args);
526527
if (index.fValid) {
527528
assert(index.fIndex < fBinContents.size());
528-
Internal::AtomicAdd(&fBinContents[index.fIndex], weight.fValue);
529+
Internal::AtomicAddRelease(&fBinContents[index.fIndex], weight.fValue);
529530
}
530531
}
531532

@@ -550,7 +551,7 @@ public:
550551
RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<sizeof...(A)>(args);
551552
if (index.fValid) {
552553
assert(index.fIndex < fBinContents.size());
553-
Internal::AtomicAdd(&fBinContents[index.fIndex], weight);
554+
Internal::AtomicAddRelease(&fBinContents[index.fIndex], weight);
554555
}
555556
}
556557

@@ -574,7 +575,7 @@ public:
574575
RLinearizedIndex index = fAxes.ComputeGlobalIndexImpl<N>(t);
575576
if (index.fValid) {
576577
assert(index.fIndex < fBinContents.size());
577-
Internal::AtomicAdd(&fBinContents[index.fIndex], weight.fValue);
578+
Internal::AtomicAddRelease(&fBinContents[index.fIndex], weight.fValue);
578579
}
579580
} else {
580581
FillAtomic(t);
@@ -837,6 +838,10 @@ public:
837838
BinContentType tmp;
838839
bool changed;
839840
do {
841+
// To guarantee correctness, we let the release operation(s) in FillAtomic synchronize with this acquire fence.
842+
// This ensures that all previous writes become visible side-effects and the atomic loads will see them.
843+
std::atomic_thread_fence(std::memory_order_acquire);
844+
840845
changed = false;
841846
for (std::size_t i = 0; i < fBinContents.size(); i++) {
842847
Internal::AtomicLoad(&fBinContents[i], &tmp);

hist/histv7/inc/ROOT/RHistUtils.hxx

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,18 @@ std::enable_if_t<std::is_arithmetic_v<T>, bool> AtomicCompareExchangeAcquire(T *
204204
#endif
205205
}
206206

207+
template <typename T>
208+
std::enable_if_t<std::is_arithmetic_v<T>, bool> AtomicCompareExchangeRelease(T *ptr, T *expected, T *desired)
209+
{
210+
#ifndef _MSC_VER
211+
return __atomic_compare_exchange(ptr, expected, desired, /*weak=*/false, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
212+
#else
213+
// Cannot specify the memory order directly, use an unconditional fence to avoid branching code.
214+
std::atomic_thread_fence(std::memory_order_release);
215+
return MSVC::AtomicOps<sizeof(T)>::CompareExchange(ptr, expected, desired);
216+
#endif
217+
}
218+
207219
template <typename T>
208220
std::enable_if_t<std::is_arithmetic_v<T>> AtomicAddCompareExchangeLoop(T *ptr, T val)
209221
{
@@ -216,6 +228,18 @@ std::enable_if_t<std::is_arithmetic_v<T>> AtomicAddCompareExchangeLoop(T *ptr, T
216228
}
217229
}
218230

231+
template <typename T>
232+
std::enable_if_t<std::is_arithmetic_v<T>> AtomicAddCompareExchangeReleaseLoop(T *ptr, T val)
233+
{
234+
T expected;
235+
AtomicLoad(ptr, &expected);
236+
T desired = expected + val;
237+
while (!AtomicCompareExchangeRelease(ptr, &expected, &desired)) {
238+
// expected holds the new value; try again.
239+
desired = expected + val;
240+
}
241+
}
242+
219243
#ifdef _MSC_VER
220244
namespace MSVC {
221245
inline void AtomicOps<8>::Add(void *ptr, const void *val)
@@ -245,17 +269,35 @@ std::enable_if_t<std::is_floating_point_v<T>> AtomicAdd(T *ptr, T val)
245269
AtomicAddCompareExchangeLoop(ptr, val);
246270
}
247271

272+
template <typename T>
273+
std::enable_if_t<std::is_integral_v<T>> AtomicAddRelease(T *ptr, T val)
274+
{
275+
#ifndef _MSC_VER
276+
__atomic_fetch_add(ptr, val, __ATOMIC_RELEASE);
277+
#else
278+
// Cannot specify the memory order directly, use a fence.
279+
std::atomic_thread_fence(std::memory_order_release);
280+
MSVC::AtomicOps<sizeof(T)>::Add(ptr, &val);
281+
#endif
282+
}
283+
284+
template <typename T>
285+
std::enable_if_t<std::is_floating_point_v<T>> AtomicAddRelease(T *ptr, T val)
286+
{
287+
AtomicAddCompareExchangeReleaseLoop(ptr, val);
288+
}
289+
248290
// For adding a double-precision weight to a single-precision bin content type, cast the argument once before the
249291
// compare-exchange loop.
250-
static inline void AtomicAdd(float *ptr, double val)
292+
static inline void AtomicAddRelease(float *ptr, double val)
251293
{
252-
AtomicAdd(ptr, static_cast<float>(val));
294+
AtomicAddRelease(ptr, static_cast<float>(val));
253295
}
254296

255297
template <typename T>
256-
std::enable_if_t<std::is_arithmetic_v<T>> AtomicInc(T *ptr)
298+
std::enable_if_t<std::is_arithmetic_v<T>> AtomicIncRelease(T *ptr)
257299
{
258-
AtomicAdd(ptr, static_cast<T>(1));
300+
AtomicAddRelease(ptr, static_cast<T>(1));
259301
}
260302

261303
template <typename T, typename U>
@@ -264,10 +306,16 @@ auto AtomicAdd(T *ptr, const U &add) -> decltype(ptr->AtomicAdd(add))
264306
return ptr->AtomicAdd(add);
265307
}
266308

309+
template <typename T, typename U>
310+
auto AtomicAddRelease(T *ptr, const U &add) -> decltype(ptr->AtomicAddRelease(add))
311+
{
312+
return ptr->AtomicAddRelease(add);
313+
}
314+
267315
template <typename T>
268-
auto AtomicInc(T *ptr) -> decltype(ptr->AtomicInc())
316+
auto AtomicIncRelease(T *ptr) -> decltype(ptr->AtomicIncRelease())
269317
{
270-
return ptr->AtomicInc();
318+
return ptr->AtomicIncRelease();
271319
}
272320

273321
} // namespace Internal

hist/histv7/test/hist_atomic.cxx

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@ class RHistAtomic : public testing::Test {};
1313
using AtomicTypes = testing::Types<char, short, int, long, long long, float, double>;
1414
TYPED_TEST_SUITE(RHistAtomic, AtomicTypes);
1515

16-
TYPED_TEST(RHistAtomic, AtomicInc)
16+
TYPED_TEST(RHistAtomic, AtomicIncRelease)
1717
{
1818
TypeParam a = 1;
19-
ROOT::Experimental::Internal::AtomicInc(&a);
19+
ROOT::Experimental::Internal::AtomicIncRelease(&a);
2020
EXPECT_EQ(a, 2);
2121
}
2222

@@ -28,7 +28,15 @@ TYPED_TEST(RHistAtomic, AtomicAdd)
2828
EXPECT_EQ(a, 3);
2929
}
3030

31-
// AtomicInc is implemented in terms of AtomicAdd, so it's sufficient to stress one of them.
31+
TYPED_TEST(RHistAtomic, AtomicAddRelease)
32+
{
33+
TypeParam a = 1;
34+
const TypeParam b = 2;
35+
ROOT::Experimental::Internal::AtomicAddRelease(&a, b);
36+
EXPECT_EQ(a, 3);
37+
}
38+
39+
// AtomicInc* is implemented in terms of AtomicAdd*, so it's sufficient to stress one of them.
3240
TYPED_TEST(RHistAtomic, StressAtomicAdd)
3341
{
3442
static constexpr TypeParam Addend = 1;
@@ -47,25 +55,43 @@ TYPED_TEST(RHistAtomic, StressAtomicAdd)
4755
EXPECT_EQ(a, NAdds * Addend);
4856
}
4957

50-
TEST(AtomicAdd, FloatDouble)
58+
TYPED_TEST(RHistAtomic, StressAtomicAddRelease)
59+
{
60+
static constexpr TypeParam Addend = 1;
61+
static constexpr std::size_t NThreads = 4;
62+
// Reduce number of additions for char to avoid overflow.
63+
static constexpr std::size_t NAddsPerThread = sizeof(TypeParam) == 1 ? 20 : 8000;
64+
static constexpr std::size_t NAdds = NThreads * NAddsPerThread;
65+
66+
TypeParam a = 0;
67+
StressInParallel(NThreads, [&] {
68+
for (std::size_t i = 0; i < NAddsPerThread; i++) {
69+
ROOT::Experimental::Internal::AtomicAddRelease(&a, Addend);
70+
}
71+
});
72+
73+
EXPECT_EQ(a, NAdds * Addend);
74+
}
75+
76+
TEST(AtomicAddRelease, FloatDouble)
5177
{
5278
float a = 1;
5379
const double b = 2;
54-
ROOT::Experimental::Internal::AtomicAdd(&a, b);
80+
ROOT::Experimental::Internal::AtomicAddRelease(&a, b);
5581
EXPECT_EQ(a, 3);
5682
}
5783

58-
TEST(RBinWithError, AtomicAdd)
84+
TEST(RBinWithError, AtomicAddRelease)
5985
{
6086
RBinWithError bin;
6187
bin.fSum = 1;
6288
bin.fSum2 = 2;
63-
bin.AtomicAdd(1.5);
89+
bin.AtomicAddRelease(1.5);
6490
EXPECT_EQ(bin.fSum, 2.5);
6591
EXPECT_EQ(bin.fSum2, 4.25);
6692
}
6793

68-
TEST(RBinWithError, StressAtomicAdd)
94+
TEST(RBinWithError, StressAtomicAddRelease)
6995
{
7096
static constexpr double Addend = 2.0;
7197
static constexpr std::size_t NThreads = 4;
@@ -75,7 +101,7 @@ TEST(RBinWithError, StressAtomicAdd)
75101
RBinWithError bin;
76102
StressInParallel(NThreads, [&] {
77103
for (std::size_t i = 0; i < NAddsPerThread; i++) {
78-
bin.AtomicAdd(Addend);
104+
bin.AtomicAddRelease(Addend);
79105
}
80106
});
81107

@@ -120,7 +146,7 @@ TEST(RBinWithError, StressAtomicLoad)
120146
}
121147
} else {
122148
for (std::size_t i = 0; i < NAddsPerThread; i++) {
123-
bin.AtomicAdd(Addend);
149+
bin.AtomicAddRelease(Addend);
124150
}
125151
}
126152
});

hist/histv7/test/hist_user.cxx

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,11 @@ struct User {
5454
return *this;
5555
}
5656

57-
void AtomicInc() { ROOT::Experimental::Internal::AtomicInc(&fValue); }
57+
void AtomicIncRelease() { ROOT::Experimental::Internal::AtomicIncRelease(&fValue); }
5858

59-
void AtomicAdd(double w) { ROOT::Experimental::Internal::AtomicAdd(&fValue, w); }
59+
void AtomicAddRelease(double w) { ROOT::Experimental::Internal::AtomicAddRelease(&fValue, w); }
6060

61-
void AtomicAdd(const UserWeight &w) { ROOT::Experimental::Internal::AtomicAdd(&fValue, w.fWeight); }
61+
void AtomicAddRelease(const UserWeight &w) { ROOT::Experimental::Internal::AtomicAddRelease(&fValue, w.fWeight); }
6262

6363
void AtomicAdd(const User &rhs) { ROOT::Experimental::Internal::AtomicAdd(&fValue, rhs.fValue); }
6464

@@ -194,7 +194,7 @@ TEST(RHistEngineUser, FillUserWeightInvalidNumberOfArguments)
194194

195195
TEST(RHistEngineUser, FillAtomic)
196196
{
197-
// Unweighted filling with atomic instructions uses AtomicInc
197+
// Unweighted filling with atomic instructions uses AtomicIncRelease
198198
static constexpr std::size_t Bins = 20;
199199
const RRegularAxis axis(Bins, {0, Bins});
200200
RHistEngine<User> engine({axis});
@@ -209,7 +209,7 @@ TEST(RHistEngineUser, FillAtomic)
209209

210210
TEST(RHistEngineUser, FillAtomicWeight)
211211
{
212-
// Weighted filling with atomic instructions uses AtomicAdd(double)
212+
// Weighted filling with atomic instructions uses AtomicAddRelease(double)
213213
static constexpr std::size_t Bins = 20;
214214
const RRegularAxis axis(Bins, {0, Bins});
215215
RHistEngine<User> engine({axis});
@@ -224,7 +224,7 @@ TEST(RHistEngineUser, FillAtomicWeight)
224224

225225
TEST(RHistEngineUser, FillAtomicUserWeight)
226226
{
227-
// Weighted filling with user-defined weight and atomic instructions uses AtomicAdd(const UserWeight &)
227+
// Weighted filling with user-defined weight and atomic instructions uses AtomicAddRelease(const UserWeight &)
228228
static constexpr std::size_t Bins = 20;
229229
const RRegularAxis axis(Bins, {0, Bins});
230230
RHistEngine<User> engine({axis});

0 commit comments

Comments
 (0)