Skip to content

Commit 50822d3

Browse files
authored
SIMD implementation for XMFloatLoad/Store3SE (#308)
1 parent 74a0f33 commit 50822d3

4 files changed

Lines changed: 176 additions & 0 deletions

File tree

.github/workflows/arm64.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,39 @@ jobs:
6262
- name: 'Build'
6363
working-directory: ${{ github.workspace }}
6464
run: cmake --build out/build/${{ matrix.build_type }}
65+
66+
buildarm32:
67+
runs-on: windows-11-arm
68+
69+
strategy:
70+
fail-fast: false
71+
72+
matrix:
73+
build_type: [arm-Debug, arm-Release]
74+
75+
steps:
76+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
77+
78+
- name: Clone test repository
79+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
80+
with:
81+
repository: walbourn/directxmathtest
82+
path: Tests
83+
ref: main
84+
85+
- name: 'Install Ninja'
86+
run: choco install ninja
87+
88+
# ARM32 is deprecated, so more recent Windows SDKs no longer support it
89+
- uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
90+
with:
91+
arch: arm64_arm
92+
sdk: 10.0.22621.0
93+
94+
- name: 'Configure CMake'
95+
working-directory: ${{ github.workspace }}
96+
run: cmake --preset=${{ matrix.build_type }}
97+
98+
- name: 'Build'
99+
working-directory: ${{ github.workspace }}
100+
run: cmake --build out/build/${{ matrix.build_type }}

CMakePresets.json

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,17 @@
3636
},
3737
"hidden": true
3838
},
39+
{
40+
"name": "ARM32",
41+
"architecture": {
42+
"value": "arm",
43+
"strategy": "external"
44+
},
45+
"cacheVariables": {
46+
"DXMATH_ARCHITECTURE": "arm"
47+
},
48+
"hidden": true
49+
},
3950
{
4051
"name": "ARM64",
4152
"architecture": {
@@ -205,6 +216,8 @@
205216
{ "name": "arm64-Release" , "description": "MSVC for ARM64 (Release) - ARM-NEON", "inherits": [ "base", "ARM64", "Release", "MSVC" ] },
206217
{ "name": "arm64ec-Debug" , "description": "MSVC for ARM64EC (Debug) - ARM-NEON", "inherits": [ "base", "ARM64EC", "Debug", "MSVC" ] },
207218
{ "name": "arm64ec-Release", "description": "MSVC for ARM64EC (Release) - ARM-NEON", "inherits": [ "base", "ARM64EC", "Release", "MSVC" ] },
219+
{ "name": "arm-Debug" , "description": "MSVC for ARM32 [Deprecated] (Debug) - ARM-NEON", "inherits": [ "base", "ARM32", "Debug", "MSVC" ] },
220+
{ "name": "arm-Release" , "description": "MSVC for ARM32 [Deprecated] (Release) - ARM-NEON", "inherits": [ "base", "ARM32", "Release", "MSVC" ] },
208221

209222
{ "name": "x64-Debug-Clang" , "description": "Clang/LLVM for x64 (Debug) - SSE/SSE2", "inherits": [ "base", "x64", "Debug", "Clang" ] },
210223
{ "name": "x64-Release-Clang" , "description": "Clang/LLVM for x64 (Release) - SSE/SSE2", "inherits": [ "base", "x64", "Release", "Clang" ] },

Inc/DirectXPackedVector.inl

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,8 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept
12791279
{
12801280
assert(pSource);
12811281

1282+
#if defined(_XM_NO_INTRINSICS_)
1283+
12821284
union { float f; int32_t i; } fi;
12831285
fi.i = 0x33800000 + (pSource->e << 23);
12841286
float Scale = fi.f;
@@ -1289,6 +1291,41 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept
12891291
Scale * float(pSource->zm),
12901292
1.0f } } };
12911293
return v;
1294+
1295+
#elif defined(_XM_ARM_NEON_INTRINSICS_)
1296+
1297+
// Build scale factor from shared exponent
1298+
union { float f; int32_t i; } fi;
1299+
fi.i = 0x33800000 + (pSource->e << 23);
1300+
1301+
// Extract 9-bit mantissas into vector lanes
1302+
uint32x4_t mantissas = vdupq_n_u32(0);
1303+
mantissas = vsetq_lane_u32(pSource->xm, mantissas, 0);
1304+
mantissas = vsetq_lane_u32(pSource->ym, mantissas, 1);
1305+
mantissas = vsetq_lane_u32(pSource->zm, mantissas, 2);
1306+
1307+
// Convert to float, scale, and set w = 1.0f
1308+
float32x4_t result = vmulq_n_f32(vcvtq_f32_u32(mantissas), fi.f);
1309+
return vsetq_lane_f32(1.0f, result, 3);
1310+
1311+
#elif defined(_XM_SSE_INTRINSICS_)
1312+
1313+
// Build scale factor from shared exponent
1314+
union { float f; int32_t i; } fi;
1315+
fi.i = 0x33800000 + (pSource->e << 23);
1316+
1317+
// Extract 9-bit mantissas, convert to float, and scale
1318+
__m128i mantissas = _mm_set_epi32(
1319+
0,
1320+
static_cast<int>(pSource->zm),
1321+
static_cast<int>(pSource->ym),
1322+
static_cast<int>(pSource->xm));
1323+
__m128 result = _mm_mul_ps(_mm_cvtepi32_ps(mantissas), _mm_set1_ps(fi.f));
1324+
1325+
// Set w = 1.0f (w lane is +0.0f so bitwise OR inserts 1.0f cleanly)
1326+
return _mm_or_ps(result, g_XMIdentityR3);
1327+
1328+
#endif
12921329
}
12931330

12941331
//------------------------------------------------------------------------------
@@ -2639,6 +2676,8 @@ inline void XM_CALLCONV XMStoreFloat3SE
26392676
{
26402677
assert(pDestination);
26412678

2679+
#if defined(_XM_NO_INTRINSICS_)
2680+
26422681
XMFLOAT3A tmp;
26432682
XMStoreFloat3A(&tmp, V);
26442683

@@ -2667,6 +2706,92 @@ inline void XM_CALLCONV XMStoreFloat3SE
26672706
pDestination->xm = static_cast<uint32_t>(MathInternal::round_to_nearest(x * ScaleR));
26682707
pDestination->ym = static_cast<uint32_t>(MathInternal::round_to_nearest(y * ScaleR));
26692708
pDestination->zm = static_cast<uint32_t>(MathInternal::round_to_nearest(z * ScaleR));
2709+
2710+
#elif defined(_XM_ARM_NEON_INTRINSICS_)
2711+
2712+
static const XMVECTORF32 MaxFloat9 = { { { float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7) } } };
2713+
static constexpr float minf9 = float(1.f / (1 << 16));
2714+
2715+
// Clamp to [0, maxf9] then zero w lane
2716+
float32x4_t clamped = vminq_f32(vmaxq_f32(V, vdupq_n_f32(0)), MaxFloat9);
2717+
clamped = vsetq_lane_f32(0.0f, clamped, 3);
2718+
2719+
// Horizontal max of xyz for shared exponent
2720+
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
2721+
float maxVal = vmaxvq_f32(clamped);
2722+
#else
2723+
float32x2_t vlow = vget_low_f32(clamped);
2724+
float32x2_t vhigh = vget_high_f32(clamped);
2725+
float32x2_t maxPair = vpmax_f32(vlow, vhigh);
2726+
maxPair = vpmax_f32(maxPair, maxPair);
2727+
float maxVal = vget_lane_f32(maxPair, 0);
2728+
#endif
2729+
2730+
if (maxVal < minf9) maxVal = minf9;
2731+
2732+
// Compute shared exponent (inherently scalar)
2733+
union { float f; int32_t i; } fi;
2734+
fi.f = maxVal;
2735+
fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)
2736+
2737+
auto exp = static_cast<uint32_t>(fi.i) >> 23;
2738+
fi.i = static_cast<int32_t>(0x83000000 - (exp << 23));
2739+
2740+
// Scale all channels and convert to integer
2741+
float32x4_t scaled = vmulq_n_f32(clamped, fi.f);
2742+
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
2743+
uint32x4_t ints = vcvtnq_u32_f32(scaled);
2744+
#else
2745+
scaled = vaddq_f32(scaled, vdupq_n_f32(0.5f));
2746+
uint32x4_t ints = vcvtq_u32_f32(scaled);
2747+
#endif
2748+
2749+
// Extract and pack into bitfields
2750+
pDestination->xm = vgetq_lane_u32(ints, 0) & 0x1FF;
2751+
pDestination->ym = vgetq_lane_u32(ints, 1) & 0x1FF;
2752+
pDestination->zm = vgetq_lane_u32(ints, 2) & 0x1FF;
2753+
pDestination->e = exp - 0x6f;
2754+
2755+
#elif defined(_XM_SSE_INTRINSICS_)
2756+
2757+
static const XMVECTORF32 MaxFloat9 = { { { float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7) } } };
2758+
static constexpr float minf9 = float(1.f / (1 << 16));
2759+
2760+
// Clamp to [0, maxf9] then mask w to zero
2761+
__m128 clamped = _mm_min_ps(_mm_max_ps(V, _mm_setzero_ps()), MaxFloat9);
2762+
clamped = _mm_and_ps(clamped, g_XMMask3);
2763+
2764+
// Horizontal max of xyz for shared exponent
2765+
__m128 maxV = clamped;
2766+
__m128 temp = XM_PERMUTE_PS(maxV, _MM_SHUFFLE(1, 1, 1, 1));
2767+
maxV = _mm_max_ps(maxV, temp);
2768+
temp = XM_PERMUTE_PS(clamped, _MM_SHUFFLE(2, 2, 2, 2));
2769+
maxV = _mm_max_ps(maxV, temp);
2770+
2771+
// Ensure minimum threshold
2772+
maxV = _mm_max_ss(maxV, _mm_set_ss(minf9));
2773+
2774+
// Compute shared exponent (inherently scalar)
2775+
union { float f; int32_t i; } fi;
2776+
_mm_store_ss(&fi.f, maxV);
2777+
fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)
2778+
2779+
auto exp = static_cast<uint32_t>(fi.i) >> 23;
2780+
fi.i = static_cast<int32_t>(0x83000000 - (exp << 23));
2781+
2782+
// Scale all channels and round to nearest integer
2783+
__m128 scaled = _mm_mul_ps(clamped, _mm_set1_ps(fi.f));
2784+
__m128i ints = _mm_cvtps_epi32(scaled);
2785+
2786+
// Extract and pack into bitfields
2787+
XM_ALIGNED_DATA(16) uint32_t ivals[4];
2788+
_mm_store_si128(reinterpret_cast<__m128i*>(ivals), ints);
2789+
2790+
pDestination->xm = ivals[0] & 0x1FF;
2791+
pDestination->ym = ivals[1] & 0x1FF;
2792+
pDestination->zm = ivals[2] & 0x1FF;
2793+
pDestination->e = exp - 0x6f;
2794+
#endif
26702795
}
26712796

26722797
//------------------------------------------------------------------------------

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ For a full change history, see [CHANGELOG.md](https://github.com/microsoft/Direc
9393

9494
* The clang/LLVM toolset currently does not respect the ``float_control`` pragma for SSE intrinsics. Therefore, the use of ``/fp:fast`` is not recommended on clang/LLVM until this issue is fixed. See [55713](https://github.com/llvm/llvm-project/issues/55713).
9595

96+
* AArch32/ARM32 (ARMv7) support is deprecated in Windows 11. Compiler support for ARM32 is deprecated in Visual Studio 2026, and the ARM32 system libraries are no longer present in the Windows SDK (26100) or later releases. Therefore, support for ARM32 is deprecated in DirectXMath and will be removed in a future release. Since most codepaths are shared between AArch32 and AArch64, they will be refactored to assume AArch64 (ARMv8).
97+
9698
## Support
9799

98100
For questions, consider using [Stack Overflow](https://stackoverflow.com/questions/tagged/directxmath) with the *directxmath* tag, or the [DirectX Discord Server](https://discord.gg/directx) in the *dx12-developers* or *dx9-dx11-developers* channel.

0 commit comments

Comments
 (0)