Skip to content

Commit b02293a

Browse files
Martin Bruse authored and copybara-github committed
Added HWY_DASSERTs asserting buffer alignment to all Load and Store operations that require alignment.
Fixed some bugs where LoadU/StoreU called Load/Store.
PiperOrigin-RevId: 578459068
1 parent 46c9056 commit b02293a

12 files changed

Lines changed: 172 additions & 94 deletions

hwy/base.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454

5555
#endif // !HWY_IDE
5656

57-
#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \
57+
#if !defined(HWY_NO_LIBCXX) && HWY_CXX_LANG > 201703L && \
5858
__cpp_impl_three_way_comparison >= 201907L && defined(__has_include) && \
5959
!defined(HWY_DISABLE_CXX20_THREE_WAY_COMPARE)
6060
#if __has_include(<compare>)
@@ -293,6 +293,11 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
293293
} while (0)
294294
#endif
295295

296+
#define HWY_DASSERT_ALIGNED(d, addr) \
297+
HWY_DASSERT(reinterpret_cast<uintptr_t>(addr) % \
298+
(Lanes(d) * sizeof(TFromD<decltype(d)>)) == \
299+
0)
300+
296301
#if __cpp_constexpr >= 201304L
297302
#define HWY_CXX14_CONSTEXPR constexpr
298303
#else

hwy/ops/arm_neon-inl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3538,6 +3538,7 @@ HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
35383538
// On Arm, Load is the same as LoadU.
35393539
template <class D>
35403540
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
3541+
HWY_DASSERT_ALIGNED(d, p);
35413542
return LoadU(d, p);
35423543
}
35433544

@@ -3742,6 +3743,7 @@ HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
37423743
// On Arm, Store is the same as StoreU.
37433744
template <class D>
37443745
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
3746+
HWY_DASSERT_ALIGNED(d, aligned);
37453747
StoreU(v, d, aligned);
37463748
}
37473749

hwy/ops/arm_sve-inl.h

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include <arm_sve.h>
2020

21+
#include "hwy/base.h"
2122
#include "hwy/ops/shared-inl.h"
2223

2324
// Arm C215 declares that SVE vector lengths will always be a power of two.
@@ -1635,10 +1636,10 @@ HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq)
16351636
#if !HWY_SVE_HAVE_BFLOAT16
16361637

16371638
template <size_t N, int kPow2>
1638-
HWY_API VBF16 Load(Simd<bfloat16_t, N, kPow2> d,
1639-
const bfloat16_t* HWY_RESTRICT p) {
1640-
return BitCast(d, Load(RebindToUnsigned<decltype(d)>(),
1641-
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p)));
1639+
HWY_API VBF16 LoadU(Simd<bfloat16_t, N, kPow2> d,
1640+
const bfloat16_t* HWY_RESTRICT p) {
1641+
return BitCast(d, LoadU(RebindToUnsigned<decltype(d)>(),
1642+
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p)));
16421643
}
16431644

16441645
template <size_t N, int kPow2>
@@ -1688,10 +1689,10 @@ HWY_API VBF16 LoadDup128(D d, const bfloat16_t* HWY_RESTRICT p) {
16881689
#if !HWY_SVE_HAVE_BFLOAT16
16891690

16901691
template <size_t N, int kPow2>
1691-
HWY_API void Store(VBF16 v, Simd<bfloat16_t, N, kPow2> d,
1692-
bfloat16_t* HWY_RESTRICT p) {
1692+
HWY_API void StoreU(VBF16 v, Simd<bfloat16_t, N, kPow2> d,
1693+
bfloat16_t* HWY_RESTRICT p) {
16931694
const RebindToUnsigned<decltype(d)> du;
1694-
Store(BitCast(du, v), du, reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
1695+
StoreU(BitCast(du, v), du, reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
16951696
}
16961697

16971698
template <size_t N, int kPow2>
@@ -1711,18 +1712,20 @@ HWY_API void BlendedStore(VBF16 v, svbool_t m, Simd<bfloat16_t, N, kPow2> d,
17111712

17121713
#endif
17131714

1714-
// ------------------------------ Load/StoreU
1715+
// ------------------------------ Load/Store
17151716

17161717
// SVE only requires lane alignment, not natural alignment of the entire
17171718
// vector.
17181719
template <class D>
1719-
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1720-
return Load(d, p);
1720+
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
1721+
HWY_DASSERT_ALIGNED(d, p);
1722+
return LoadU(d, p);
17211723
}
17221724

17231725
template <class V, class D>
1724-
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1725-
Store(v, d, p);
1726+
HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1727+
HWY_DASSERT_ALIGNED(d, p);
1728+
StoreU(v, d, p);
17261729
}
17271730

17281731
// ------------------------------ MaskedLoadOr

hwy/ops/emu128-inl.h

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1364,9 +1364,9 @@ HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
13641364
// ------------------------------ Load
13651365

13661366
template <class D>
1367-
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
1367+
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
13681368
VFromD<D> v;
1369-
CopyBytes<d.MaxBytes()>(aligned, v.raw); // copy from array
1369+
CopyBytes<d.MaxBytes()>(p, v.raw); // copy from array
13701370
return v;
13711371
}
13721372

@@ -1383,8 +1383,9 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
13831383
}
13841384

13851385
template <class D>
1386-
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1387-
return Load(d, p);
1386+
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT aligned) {
1387+
HWY_DASSERT_ALIGNED(d, aligned);
1388+
return LoadU(d, aligned);
13881389
}
13891390

13901391
// In some use cases, "load single lane" is sufficient; otherwise avoid this.
@@ -1422,13 +1423,14 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const TFromD<D>* HWY_RESTRICT p,
14221423
// ------------------------------ Store
14231424

14241425
template <class D>
1425-
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
1426-
CopyBytes<d.MaxBytes()>(v.raw, aligned); // copy to array
1426+
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1427+
CopyBytes<d.MaxBytes()>(v.raw, p); // copy to array
14271428
}
14281429

14291430
template <class D>
1430-
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1431-
Store(v, d, p);
1431+
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
1432+
HWY_DASSERT_ALIGNED(d, aligned);
1433+
StoreU(v, d, aligned);
14321434
}
14331435

14341436
template <class D>

hwy/ops/ppc_vsx-inl.h

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,9 @@ HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
614614
// ------------------------------ Load
615615

616616
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
617-
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
617+
HWY_API Vec128<T> Load(D d, const T* HWY_RESTRICT aligned) {
618+
HWY_DASSERT_ALIGNED(d, aligned);
619+
(void)d;
618620
using LoadRaw = typename detail::Raw128<T>::AlignedRawVec;
619621
const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned);
620622
using ResultRaw = typename detail::Raw128<T>::type;
@@ -623,7 +625,7 @@ HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
623625

624626
// Any <= 64 bit
625627
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
626-
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
628+
HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
627629
using BitsT = UnsignedFromSize<d.MaxBytes()>;
628630

629631
BitsT bits;
@@ -1072,8 +1074,9 @@ HWY_API Vec128<T> LoadU(D /* tag */, const T* HWY_RESTRICT p) {
10721074

10731075
// For < 128 bit, LoadU == Load.
10741076
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
1075-
HWY_API VFromD<D> LoadU(D d, const T* HWY_RESTRICT p) {
1076-
return Load(d, p);
1077+
HWY_API VFromD<D> Load(D d, const T* HWY_RESTRICT p) {
1078+
HWY_DASSERT_ALIGNED(d, p);
1079+
return LoadU(d, p);
10771080
}
10781081

10791082
// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
@@ -1212,7 +1215,9 @@ HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
12121215
// ------------------------------ Store
12131216

12141217
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
1215-
HWY_API void Store(Vec128<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
1218+
HWY_API void Store(Vec128<T> v, D d, T* HWY_RESTRICT aligned) {
1219+
HWY_DASSERT_ALIGNED(d, aligned);
1220+
(void)d;
12161221
using StoreRaw = typename detail::Raw128<T>::AlignedRawVec;
12171222
*HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast<StoreRaw>(v.raw);
12181223
}
@@ -1224,7 +1229,7 @@ HWY_API void StoreU(Vec128<T> v, D /* tag */, T* HWY_RESTRICT p) {
12241229
}
12251230

12261231
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
1227-
HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
1232+
HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
12281233
using BitsT = UnsignedFromSize<d.MaxBytes()>;
12291234

12301235
const Repartition<BitsT, decltype(d)> d_bits;
@@ -1234,8 +1239,9 @@ HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
12341239

12351240
// For < 128 bit, StoreU == Store.
12361241
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T = TFromD<D>>
1237-
HWY_API void StoreU(VFromD<D> v, D d, T* HWY_RESTRICT p) {
1238-
Store(v, d, p);
1242+
HWY_API void Store(VFromD<D> v, D d, T* HWY_RESTRICT p) {
1243+
HWY_DASSERT_ALIGNED(d, p);
1244+
StoreU(v, d, p);
12391245
}
12401246

12411247
#if HWY_PPC_HAVE_9

hwy/ops/rvv-inl.h

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1627,15 +1627,16 @@ HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
16271627

16281628
// There is no native BF16, treat as int16_t.
16291629
template <size_t N, int kPow2>
1630-
HWY_API VFromD<Simd<int16_t, N, kPow2>> Load(Simd<bfloat16_t, N, kPow2> d,
1630+
HWY_API VFromD<Simd<int16_t, N, kPow2>> LoadU(Simd<bfloat16_t, N, kPow2> d,
16311631
const bfloat16_t* HWY_RESTRICT p) {
1632-
return Load(RebindToSigned<decltype(d)>(),
1632+
return LoadU(RebindToSigned<decltype(d)>(),
16331633
reinterpret_cast<const int16_t * HWY_RESTRICT>(p));
16341634
}
16351635

16361636
template <size_t N, int kPow2>
16371637
HWY_API void Store(VFromD<Simd<int16_t, N, kPow2>> v,
16381638
Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
1639+
HWY_DASSERT_ALIGNED(d, p);
16391640
Store(v, RebindToSigned<decltype(d)>(),
16401641
reinterpret_cast<int16_t * HWY_RESTRICT>(p));
16411642
}
@@ -1644,26 +1645,26 @@ HWY_API void Store(VFromD<Simd<int16_t, N, kPow2>> v,
16441645

16451646
// NOTE: different type for float16_t than bfloat16_t, see Set().
16461647
template <size_t N, int kPow2>
1647-
HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(Simd<float16_t, N, kPow2> d,
1648+
HWY_API VFromD<Simd<uint16_t, N, kPow2>> LoadU(Simd<float16_t, N, kPow2> d,
16481649
const float16_t* HWY_RESTRICT p) {
16491650
return Load(RebindToUnsigned<decltype(d)>(),
16501651
reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
16511652
}
16521653

16531654
template <size_t N, int kPow2>
1654-
HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
1655+
HWY_API void StoreU(VFromD<Simd<uint16_t, N, kPow2>> v,
16551656
Simd<float16_t, N, kPow2> d, float16_t* HWY_RESTRICT p) {
1656-
Store(v, RebindToUnsigned<decltype(d)>(),
1657+
StoreU(v, RebindToUnsigned<decltype(d)>(),
16571658
reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
16581659
}
16591660

16601661
#endif // !HWY_HAVE_FLOAT16
16611662

1662-
// ------------------------------ LoadU
16631663
template <class D>
1664-
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1664+
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
1665+
HWY_DASSERT_ALIGNED(d, p);
16651666
// RVV only requires element alignment, not vector alignment.
1666-
return Load(d, p);
1667+
return LoadU(d, p);
16671668
}
16681669

16691670
// ------------------------------ MaskedLoad
@@ -1858,11 +1859,12 @@ HWY_API void StoreN(VFromD<D> v, D /*d*/, T* HWY_RESTRICT p,
18581859
reinterpret_cast<TStore * HWY_RESTRICT>(p));
18591860
}
18601861

1861-
// ------------------------------ StoreU
1862+
// ------------------------------ Store
18621863
template <class V, class D>
1863-
HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1864+
HWY_API void Store(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1865+
HWY_DASSERT_ALIGNED(d, p);
18641866
// RVV only requires element alignment, not vector alignment.
1865-
Store(v, d, p);
1867+
StoreU(v, d, p);
18661868
}
18671869

18681870
// ------------------------------ Stream

hwy/ops/scalar-inl.h

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1068,7 +1068,7 @@ HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
10681068
// ------------------------------ Load
10691069

10701070
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
1071-
HWY_API Vec1<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
1071+
HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT aligned) {
10721072
T t;
10731073
CopySameSize(aligned, &t);
10741074
return Vec1<T>(t);
@@ -1086,8 +1086,9 @@ HWY_API Vec1<T> MaskedLoadOr(Vec1<T> v, Mask1<T> m, D d,
10861086
}
10871087

10881088
template <class D, HWY_IF_LANES_D(D, 1), typename T = TFromD<D>>
1089-
HWY_API Vec1<T> LoadU(D d, const T* HWY_RESTRICT p) {
1090-
return Load(d, p);
1089+
HWY_API Vec1<T> Load(D d, const T* HWY_RESTRICT p) {
1090+
HWY_DASSERT_ALIGNED(d, p);
1091+
return LoadU(d, p);
10911092
}
10921093

10931094
// In some use cases, "load single lane" is sufficient; otherwise avoid this.
@@ -1117,13 +1118,17 @@ HWY_API VFromD<D> LoadNOr(VFromD<D> no, D d, const T* HWY_RESTRICT p,
11171118
// ------------------------------ Store
11181119

11191120
template <class D, typename T = TFromD<D>>
1120-
HWY_API void Store(const Vec1<T> v, D /* tag */, T* HWY_RESTRICT aligned) {
1121+
HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT aligned) {
1122+
(void)d;
11211123
CopySameSize(&v.raw, aligned);
11221124
}
11231125

11241126
template <class D, typename T = TFromD<D>>
1125-
HWY_API void StoreU(const Vec1<T> v, D d, T* HWY_RESTRICT p) {
1126-
return Store(v, d, p);
1127+
HWY_API void Store(const Vec1<T> v, D d, T* HWY_RESTRICT p) {
1128+
HWY_DASSERT_ALIGNED(d, p);
1129+
(void)d;
1130+
CopySameSize(&v.raw, p);
1131+
return StoreU(v, d, p);
11271132
}
11281133

11291134
template <class D, typename T = TFromD<D>>

hwy/ops/wasm_128-inl.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1860,13 +1860,15 @@ HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
18601860
// ------------------------------ Load
18611861

18621862
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T = TFromD<D>>
1863-
HWY_API Vec128<T> Load(D /* tag */, const T* HWY_RESTRICT aligned) {
1863+
HWY_API Vec128<T> Load(D d, const T* HWY_RESTRICT aligned) {
1864+
HWY_DASSERT_ALIGNED(d, aligned);
18641865
return Vec128<T>{wasm_v128_load(aligned)};
18651866
}
18661867

18671868
// Partial
18681869
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
18691870
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
1871+
HWY_DASSERT_ALIGNED(d, p);
18701872
VFromD<D> v;
18711873
CopyBytes<d.MaxBytes()>(p, &v);
18721874
return v;
@@ -1939,24 +1941,27 @@ HWY_INLINE double ExtractLane(const Vec128<double, N> v) {
19391941
} // namespace detail
19401942

19411943
template <class D, HWY_IF_V_SIZE_D(D, 16)>
1942-
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT aligned) {
1944+
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
1945+
HWY_DASSERT_ALIGNED(d, aligned);
1946+
(void)d;
19431947
wasm_v128_store(aligned, v.raw);
19441948
}
19451949

19461950
// Partial
19471951
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
1948-
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1952+
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
19491953
CopyBytes<d.MaxBytes()>(&v, p);
19501954
}
19511955

19521956
template <class D, HWY_IF_LANES_D(D, 1)>
1953-
HWY_API void Store(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
1957+
HWY_API void StoreU(VFromD<D> v, D /* tag */, TFromD<D>* HWY_RESTRICT p) {
19541958
*p = detail::ExtractLane<0>(v);
19551959
}
19561960

19571961
// StoreU == Store.
19581962
template <class D>
1959-
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1963+
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1964+
HWY_DASSERT_ALIGNED(d, p);
19601965
Store(v, d, p);
19611966
}
19621967

0 commit comments

Comments
 (0)