Skip to content

Commit c3652a8

Browse files
DiamonDinoia and serge-sans-paille
authored and committed
1. Adding stream API for non-temporal data transfers
2. Adding xsimd::fence as a wrapper around std::atomic for cache coherence
3. Adding tests
1 parent 99587dd commit c3652a8

File tree

10 files changed

+350
-0
lines changed

10 files changed

+350
-0
lines changed

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,12 @@ namespace xsimd
292292
return load_unaligned(mem, b, A {});
293293
}
294294

295+
template <class A, class T>
296+
XSIMD_INLINE batch_bool<T, A> load_stream(bool const* mem, batch_bool<T, A> b, requires_arch<common>) noexcept
297+
{
298+
return load_aligned(mem, b, A {});
299+
}
300+
295301
// load_aligned
296302
namespace detail
297303
{
@@ -438,6 +444,12 @@ namespace xsimd
438444
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
439445
}
440446

447+
template <class A, class T_in, class T_out>
448+
XSIMD_INLINE batch<T_out, A> load_stream(T_in const* mem, convert<T_out> cvt, requires_arch<common>) noexcept
449+
{
450+
return load_aligned<A>(mem, cvt, A {});
451+
}
452+
441453
// rotate_right
442454
template <size_t N, class A, class T>
443455
XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<common>) noexcept
@@ -679,6 +691,12 @@ namespace xsimd
679691
mem[i] = bool(buffer[i]);
680692
}
681693

694+
template <class A, class T>
695+
XSIMD_INLINE void store_stream(batch_bool<T, A> const& self, bool* mem, requires_arch<common>) noexcept
696+
{
697+
store(self, mem, A {});
698+
}
699+
682700
// store_aligned
683701
template <class A, class T_in, class T_out>
684702
XSIMD_INLINE void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
@@ -697,6 +715,12 @@ namespace xsimd
697715
return store_aligned<A>(mem, self, common {});
698716
}
699717

718+
template <class A, class T_in, class T_out>
719+
XSIMD_INLINE void store_stream(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
720+
{
721+
store_aligned<A>(mem, self, A {});
722+
}
723+
700724
// swizzle
701725
template <class A, class T, class ITy, ITy... Vs>
702726
XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<common>) noexcept
@@ -778,6 +802,12 @@ namespace xsimd
778802
return detail::load_complex(hi, lo, A {});
779803
}
780804

805+
template <class A, class T_out, class T_in>
806+
XSIMD_INLINE batch<std::complex<T_out>, A> load_complex_stream(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<common>) noexcept
807+
{
808+
return load_complex_aligned<A>(mem, kernel::convert<std::complex<T_out>> {}, A {});
809+
}
810+
781811
// store_complex_aligned
782812
template <class A, class T_out, class T_in>
783813
XSIMD_INLINE void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
@@ -802,6 +832,12 @@ namespace xsimd
802832
hi.store_unaligned(buffer + real_batch::size);
803833
}
804834

835+
template <class A, class T_out, class T_in>
836+
XSIMD_INLINE void store_complex_stream(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
837+
{
838+
store_complex_aligned<A>(dst, src, A {});
839+
}
840+
805841
// transpose
806842
template <class A, class T>
807843
XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<common>) noexcept

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,6 +1515,23 @@ namespace xsimd
15151515
return _mm256_storeu_pd(mem, self);
15161516
}
15171517

1518+
// store_stream
1519+
template <class A>
1520+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
1521+
{
1522+
_mm256_stream_ps(mem, self);
1523+
}
1524+
template <class A>
1525+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
1526+
{
1527+
_mm256_stream_pd(mem, self);
1528+
}
1529+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1530+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
1531+
{
1532+
_mm256_stream_si256((__m256i*)mem, self);
1533+
}
1534+
15181535
// sub
15191536
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
15201537
XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,23 @@ namespace xsimd
230230
store_masked<A>(reinterpret_cast<int64_t*>(mem), s64, batch_bool_constant<int64_t, A, Values...> {}, Mode {}, avx2 {});
231231
}
232232

233+
// load_stream
234+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
235+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
236+
{
237+
return _mm256_stream_load_si256((__m256i const*)mem);
238+
}
239+
template <class A>
240+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx2>) noexcept
241+
{
242+
return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i const*)mem));
243+
}
244+
template <class A>
245+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx2>) noexcept
246+
{
247+
return _mm256_castsi256_pd(_mm256_stream_load_si256((__m256i const*)mem));
248+
}
249+
233250
// bitwise_and
234251
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
235252
XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,6 +1513,23 @@ namespace xsimd
15131513
return _mm512_loadu_pd(mem);
15141514
}
15151515

1516+
// load_stream
1517+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1518+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
1519+
{
1520+
return _mm512_stream_load_si512((__m512i*)mem);
1521+
}
1522+
template <class A>
1523+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
1524+
{
1525+
return _mm512_castsi512_ps(_mm512_stream_load_si512((__m512i*)mem));
1526+
}
1527+
template <class A>
1528+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
1529+
{
1530+
return _mm512_castsi512_pd(_mm512_stream_load_si512((__m512i*)mem));
1531+
}
1532+
15161533
// lt
15171534
template <class A>
15181535
XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
@@ -2285,6 +2302,23 @@ namespace xsimd
22852302
return _mm512_storeu_pd(mem, self);
22862303
}
22872304

2305+
// store_stream
2306+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
2307+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
2308+
{
2309+
_mm512_stream_si512((__m512i*)mem, self);
2310+
}
2311+
template <class A>
2312+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
2313+
{
2314+
_mm512_stream_ps(mem, self);
2315+
}
2316+
template <class A>
2317+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
2318+
{
2319+
_mm512_stream_pd(mem, self);
2320+
}
2321+
22882322
// sub
22892323
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
22902324
XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1958,6 +1958,23 @@ namespace xsimd
19581958
return _mm_storeu_pd(mem, self);
19591959
}
19601960

1961+
// store_stream
1962+
template <class A>
1963+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
1964+
{
1965+
_mm_stream_ps(mem, self);
1966+
}
1967+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1968+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
1969+
{
1970+
_mm_stream_si128((__m128i*)mem, self);
1971+
}
1972+
template <class A>
1973+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
1974+
{
1975+
_mm_stream_pd(mem, self);
1976+
}
1977+
19611978
// sub
19621979
template <class A>
19631980
XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,23 @@ namespace xsimd
237237
}
238238
}
239239

240+
// load_stream
241+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
242+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<sse4_1>) noexcept
243+
{
244+
return _mm_stream_load_si128((__m128i*)mem);
245+
}
246+
template <class A>
247+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<sse4_1>) noexcept
248+
{
249+
return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)mem));
250+
}
251+
template <class A>
252+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<sse4_1>) noexcept
253+
{
254+
return _mm_castsi128_pd(_mm_stream_load_si128((__m128i*)mem));
255+
}
256+
240257
// min
241258
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
242259
XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept

include/xsimd/memory/xsimd_alignment.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,17 @@ namespace xsimd
3333
{
3434
};
3535

36+
/**
37+
* @struct stream_mode
38+
* @brief tag for load and store of aligned non-temporal memory.
39+
*
40+
* Streaming accesses expect aligned pointers. When no architecture-specific
41+
* implementation is available, they fall back to aligned semantics.
42+
*/
43+
struct stream_mode
44+
{
45+
};
46+
3647
/***********************
3748
* Allocator alignment *
3849
***********************/

include/xsimd/types/xsimd_api.hpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#ifndef XSIMD_API_HPP
1313
#define XSIMD_API_HPP
1414

15+
#include <atomic>
1516
#include <complex>
1617
#include <cstddef>
1718
#include <limits>
@@ -1378,6 +1379,30 @@ namespace xsimd
13781379
return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
13791380
}
13801381

1382+
template <class To, class A = default_arch, class From>
1383+
XSIMD_INLINE simd_return_type<From, To, A> load_as(From const* ptr, stream_mode) noexcept
1384+
{
1385+
using batch_value_type = typename simd_return_type<From, To, A>::value_type;
1386+
detail::static_check_supported_config<From, A>();
1387+
detail::static_check_supported_config<To, A>();
1388+
return kernel::load_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
1389+
}
1390+
1391+
template <class To, class A = default_arch>
1392+
XSIMD_INLINE simd_return_type<bool, To, A> load_as(bool const* ptr, stream_mode) noexcept
1393+
{
1394+
detail::static_check_supported_config<To, A>();
1395+
return simd_return_type<bool, To, A>::load_stream(ptr);
1396+
}
1397+
1398+
template <class To, class A = default_arch, class From>
1399+
XSIMD_INLINE simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, stream_mode) noexcept
1400+
{
1401+
detail::static_check_supported_config<To, A>();
1402+
using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
1403+
return kernel::load_complex_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
1404+
}
1405+
13811406
#ifdef XSIMD_ENABLE_XTL_COMPLEX
13821407
template <class To, class A = default_arch, class From, bool i3ec>
13831408
XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
@@ -1386,6 +1411,14 @@ namespace xsimd
13861411
detail::static_check_supported_config<From, A>();
13871412
return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), aligned_mode());
13881413
}
1414+
1415+
template <class To, class A = default_arch, class From, bool i3ec>
1416+
XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, stream_mode) noexcept
1417+
{
1418+
detail::static_check_supported_config<To, A>();
1419+
detail::static_check_supported_config<From, A>();
1420+
return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), stream_mode());
1421+
}
13891422
#endif
13901423

13911424
/**
@@ -1460,6 +1493,13 @@ namespace xsimd
14601493
return load_as<From, A>(ptr, unaligned_mode {});
14611494
}
14621495

1496+
template <class A = default_arch, class From>
1497+
XSIMD_INLINE batch<From, A> load(From const* ptr, stream_mode) noexcept
1498+
{
1499+
detail::static_check_supported_config<From, A>();
1500+
return load_as<From, A>(ptr, stream_mode {});
1501+
}
1502+
14631503
/**
14641504
* @ingroup batch_data_transfer
14651505
*
@@ -2464,12 +2504,40 @@ namespace xsimd
24642504
kernel::store_complex_aligned<A>(dst, src, A {});
24652505
}
24662506

2507+
template <class To, class A = default_arch, class From>
2508+
XSIMD_INLINE void store_as(To* dst, batch<From, A> const& src, stream_mode) noexcept
2509+
{
2510+
detail::static_check_supported_config<From, A>();
2511+
kernel::store_stream<A>(dst, src, A {});
2512+
}
2513+
2514+
template <class A = default_arch, class From>
2515+
XSIMD_INLINE void store_as(bool* dst, batch_bool<From, A> const& src, stream_mode) noexcept
2516+
{
2517+
detail::static_check_supported_config<From, A>();
2518+
kernel::store_stream<A>(src, dst, A {});
2519+
}
2520+
2521+
template <class To, class A = default_arch, class From>
2522+
XSIMD_INLINE void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
2523+
{
2524+
detail::static_check_supported_config<std::complex<From>, A>();
2525+
kernel::store_complex_stream<A>(dst, src, A {});
2526+
}
2527+
24672528
#ifdef XSIMD_ENABLE_XTL_COMPLEX
24682529
template <class To, class A = default_arch, class From, bool i3ec>
24692530
XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
24702531
{
24712532
store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode());
24722533
}
2534+
2535+
template <class To, class A = default_arch, class From, bool i3ec>
2536+
XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
2537+
{
2538+
detail::static_check_supported_config<std::complex<From>, A>();
2539+
store_as(reinterpret_cast<std::complex<To>*>(dst), src, stream_mode());
2540+
}
24732541
#endif
24742542

24752543
/**
@@ -2538,6 +2606,22 @@ namespace xsimd
25382606
store_as<T, A>(mem, val, unaligned_mode {});
25392607
}
25402608

2609+
template <class A, class T>
2610+
XSIMD_INLINE void store(T* mem, batch<T, A> const& val, stream_mode) noexcept
2611+
{
2612+
store_as<T, A>(mem, val, stream_mode {});
2613+
}
2614+
2615+
/**
2616+
* @ingroup batch_data_transfer
2617+
*
2618+
* Issues a sequentially consistent memory fence.
2619+
*/
2620+
XSIMD_INLINE void fence() noexcept
2621+
{
2622+
std::atomic_thread_fence(std::memory_order_seq_cst);
2623+
}
2624+
25412625
/**
25422626
* @ingroup batch_data_transfer
25432627
*

0 commit comments

Comments (0)