Skip to content

Commit b68bfe5

Browse files
DiamonDinoia and claude
authored and committed
Add AArch64 NEON non-temporal load/store (ldnp/stnp)
Implement store_stream and load_stream for neon64 using inline asm with LDNP/STNP instructions, providing non-temporal cache hints on AArch64. Covers float, double, and integral types. Guarded behind __GNUC__ so MSVC ARM64 falls back to aligned load/store. Also remove xsimd::fence (std::atomic wrapper) and its test, which were unrelated additions from a prior commit. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c3652a8 commit b68bfe5

File tree

3 files changed

+84
-16
lines changed

3 files changed

+84
-16
lines changed

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <cassert>
1616
#include <complex>
1717
#include <cstddef>
18+
#include <cstring>
1819
#include <tuple>
1920
#include <utility>
2021

@@ -179,6 +180,89 @@ namespace xsimd
179180
return store_aligned<A>(dst, src, A {});
180181
}
181182

183+
/****************
184+
* store_stream *
185+
****************/
186+
187+
#if defined(__GNUC__)
188+
template <class A>
189+
XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& val, requires_arch<neon64>) noexcept
190+
{
191+
float32x2_t lo = vget_low_f32(val);
192+
float32x2_t hi = vget_high_f32(val);
193+
__asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
194+
:
195+
: [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
196+
: "memory");
197+
}
198+
199+
template <class A>
200+
XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& val, requires_arch<neon64>) noexcept
201+
{
202+
float64x1_t lo = vget_low_f64(val);
203+
float64x1_t hi = vget_high_f64(val);
204+
__asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
205+
:
206+
: [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
207+
: "memory");
208+
}
209+
210+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
211+
XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& val, requires_arch<neon64>) noexcept
212+
{
213+
uint64x2_t u64;
214+
std::memcpy(&u64, &val, sizeof(u64));
215+
uint64x1_t lo = vget_low_u64(u64);
216+
uint64x1_t hi = vget_high_u64(u64);
217+
__asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
218+
:
219+
: [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
220+
: "memory");
221+
}
222+
#endif
223+
224+
/***************
225+
* load_stream *
226+
***************/
227+
228+
#if defined(__GNUC__)
229+
template <class A>
230+
XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<neon64>) noexcept
231+
{
232+
float32x2_t lo, hi;
233+
__asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
234+
: [lo] "=w"(lo), [hi] "=w"(hi)
235+
: [mem] "r"(mem)
236+
: "memory");
237+
return vcombine_f32(lo, hi);
238+
}
239+
240+
template <class A>
241+
XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<neon64>) noexcept
242+
{
243+
float64x1_t lo, hi;
244+
__asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
245+
: [lo] "=w"(lo), [hi] "=w"(hi)
246+
: [mem] "r"(mem)
247+
: "memory");
248+
return vcombine_f64(lo, hi);
249+
}
250+
251+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
252+
XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<neon64>) noexcept
253+
{
254+
uint64x1_t lo, hi;
255+
__asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
256+
: [lo] "=w"(lo), [hi] "=w"(hi)
257+
: [mem] "r"(mem)
258+
: "memory");
259+
uint64x2_t u64 = vcombine_u64(lo, hi);
260+
batch<T, A> result;
261+
std::memcpy(&result, &u64, sizeof(u64));
262+
return result;
263+
}
264+
#endif
265+
182266
/*********************
183267
* store<batch_bool> *
184268
*********************/

include/xsimd/types/xsimd_api.hpp

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
#ifndef XSIMD_API_HPP
1313
#define XSIMD_API_HPP
1414

15-
#include <atomic>
1615
#include <complex>
1716
#include <cstddef>
1817
#include <limits>
@@ -2612,16 +2611,6 @@ namespace xsimd
26122611
store_as<T, A>(mem, val, stream_mode {});
26132612
}
26142613

2615-
/**
2616-
* @ingroup batch_data_transfer
2617-
*
2618-
* Issues a sequentially consistent memory fence.
2619-
*/
2620-
XSIMD_INLINE void fence() noexcept
2621-
{
2622-
std::atomic_thread_fence(std::memory_order_seq_cst);
2623-
}
2624-
26252614
/**
26262615
* @ingroup batch_data_transfer
26272616
*

test/test_load_store.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -606,9 +606,4 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES)
606606
SUBCASE("masked") { Test.test_masked(); }
607607
}
608608

609-
TEST_CASE("[fence] sequential consistency")
610-
{
611-
xsimd::fence();
612-
CHECK(true);
613-
}
614609
#endif

0 commit comments

Comments
 (0)