Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .github/workflows/arm64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,39 @@ jobs:
- name: 'Build'
working-directory: ${{ github.workspace }}
run: cmake --build out/build/${{ matrix.build_type }}

# Job: configure and build the library for the deprecated 32-bit ARM (ARM32) target.
# One job instance runs per entry in matrix.build_type.
buildarm32:
runs-on: windows-11-arm

strategy:
# Do not cancel the remaining matrix entry when one configuration fails.
fail-fast: false

matrix:
# Each value is passed to `cmake --preset` below, so it must match a configure
# preset name (arm-Debug / arm-Release) in CMakePresets.json.
build_type: [arm-Debug, arm-Release]

steps:
# Check out this repository (action pinned to a commit SHA; tag noted alongside).
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

# Check out the separate test-suite repository into the Tests/ subdirectory.
- name: Clone test repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
repository: walbourn/directxmathtest
path: Tests
ref: main

# Ninja is installed via Chocolatey before the CMake configure step.
# NOTE(review): presumably the presets use the Ninja generator -- confirm in CMakePresets.json.
- name: 'Install Ninja'
run: choco install ninja

# ARM32 is deprecated, so more recent Windows SDKs no longer support it
# The SDK version is therefore pinned explicitly rather than using the runner default.
# NOTE(review): arch 'arm64_arm' is presumably ARM64-hosted tools targeting 32-bit ARM --
# confirm against the msvc-dev-cmd documentation.
- uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
with:
arch: arm64_arm
sdk: 10.0.22621.0

# Configure using the preset named by the current matrix entry.
- name: 'Configure CMake'
working-directory: ${{ github.workspace }}
run: cmake --preset=${{ matrix.build_type }}

# Build the tree produced by the configure step.
# NOTE(review): assumes the preset's binaryDir is out/build/<presetName>; that setting
# is not visible here -- confirm in the 'base' preset.
- name: 'Build'
working-directory: ${{ github.workspace }}
run: cmake --build out/build/${{ matrix.build_type }}
13 changes: 13 additions & 0 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,17 @@
},
"hidden": true
},
{
"name": "ARM32",
"architecture": {
"value": "arm",
"strategy": "external"
},
"cacheVariables": {
"DXMATH_ARCHITECTURE": "arm"
},
"hidden": true
},
{
"name": "ARM64",
"architecture": {
Expand Down Expand Up @@ -205,6 +216,8 @@
{ "name": "arm64-Release" , "description": "MSVC for ARM64 (Release) - ARM-NEON", "inherits": [ "base", "ARM64", "Release", "MSVC" ] },
{ "name": "arm64ec-Debug" , "description": "MSVC for ARM64EC (Debug) - ARM-NEON", "inherits": [ "base", "ARM64EC", "Debug", "MSVC" ] },
{ "name": "arm64ec-Release", "description": "MSVC for ARM64EC (Release) - ARM-NEON", "inherits": [ "base", "ARM64EC", "Release", "MSVC" ] },
{ "name": "arm-Debug" , "description": "MSVC for ARM32 [Deprecated] (Debug) - ARM-NEON", "inherits": [ "base", "ARM32", "Debug", "MSVC" ] },
{ "name": "arm-Release" , "description": "MSVC for ARM32 [Deprecated] (Release) - ARM-NEON", "inherits": [ "base", "ARM32", "Release", "MSVC" ] },

{ "name": "x64-Debug-Clang" , "description": "Clang/LLVM for x64 (Debug) - SSE/SSE2", "inherits": [ "base", "x64", "Debug", "Clang" ] },
{ "name": "x64-Release-Clang" , "description": "Clang/LLVM for x64 (Release) - SSE/SSE2", "inherits": [ "base", "x64", "Release", "Clang" ] },
Expand Down
125 changes: 125 additions & 0 deletions Inc/DirectXPackedVector.inl
Original file line number Diff line number Diff line change
Expand Up @@ -1279,6 +1279,8 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept
{
assert(pSource);

#if defined(_XM_NO_INTRINSICS_)

union { float f; int32_t i; } fi;
fi.i = 0x33800000 + (pSource->e << 23);
float Scale = fi.f;
Expand All @@ -1289,6 +1291,41 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept
Scale * float(pSource->zm),
1.0f } } };
return v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)

// Build scale factor from shared exponent
union { float f; int32_t i; } fi;
fi.i = 0x33800000 + (pSource->e << 23);

// Extract 9-bit mantissas into vector lanes
uint32x4_t mantissas = vdupq_n_u32(0);
mantissas = vsetq_lane_u32(pSource->xm, mantissas, 0);
mantissas = vsetq_lane_u32(pSource->ym, mantissas, 1);
mantissas = vsetq_lane_u32(pSource->zm, mantissas, 2);

// Convert to float, scale, and set w = 1.0f
float32x4_t result = vmulq_n_f32(vcvtq_f32_u32(mantissas), fi.f);
return vsetq_lane_f32(1.0f, result, 3);

#elif defined(_XM_SSE_INTRINSICS_)

// Build scale factor from shared exponent
union { float f; int32_t i; } fi;
fi.i = 0x33800000 + (pSource->e << 23);

// Extract 9-bit mantissas, convert to float, and scale
__m128i mantissas = _mm_set_epi32(
0,
static_cast<int>(pSource->zm),
static_cast<int>(pSource->ym),
static_cast<int>(pSource->xm));
__m128 result = _mm_mul_ps(_mm_cvtepi32_ps(mantissas), _mm_set1_ps(fi.f));

// Set w = 1.0f (w lane is +0.0f so bitwise OR inserts 1.0f cleanly)
return _mm_or_ps(result, g_XMIdentityR3);

#endif
}

//------------------------------------------------------------------------------
Expand Down Expand Up @@ -2639,6 +2676,8 @@ inline void XM_CALLCONV XMStoreFloat3SE
{
assert(pDestination);

#if defined(_XM_NO_INTRINSICS_)

XMFLOAT3A tmp;
XMStoreFloat3A(&tmp, V);

Expand Down Expand Up @@ -2667,6 +2706,92 @@ inline void XM_CALLCONV XMStoreFloat3SE
pDestination->xm = static_cast<uint32_t>(MathInternal::round_to_nearest(x * ScaleR));
pDestination->ym = static_cast<uint32_t>(MathInternal::round_to_nearest(y * ScaleR));
pDestination->zm = static_cast<uint32_t>(MathInternal::round_to_nearest(z * ScaleR));

#elif defined(_XM_ARM_NEON_INTRINSICS_)

static const XMVECTORF32 MaxFloat9 = { { { float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7) } } };
static constexpr float minf9 = float(1.f / (1 << 16));

// Clamp to [0, maxf9] then zero w lane
float32x4_t clamped = vminq_f32(vmaxq_f32(V, vdupq_n_f32(0)), MaxFloat9);
clamped = vsetq_lane_f32(0.0f, clamped, 3);

// Horizontal max of xyz for shared exponent
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
float maxVal = vmaxvq_f32(clamped);
#else
float32x2_t vlow = vget_low_f32(clamped);
float32x2_t vhigh = vget_high_f32(clamped);
float32x2_t maxPair = vpmax_f32(vlow, vhigh);
maxPair = vpmax_f32(maxPair, maxPair);
float maxVal = vget_lane_f32(maxPair, 0);
#endif

if (maxVal < minf9) maxVal = minf9;

// Compute shared exponent (inherently scalar)
union { float f; int32_t i; } fi;
fi.f = maxVal;
fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)

auto exp = static_cast<uint32_t>(fi.i) >> 23;
fi.i = static_cast<int32_t>(0x83000000 - (exp << 23));

// Scale all channels and convert to integer
float32x4_t scaled = vmulq_n_f32(clamped, fi.f);
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
uint32x4_t ints = vcvtnq_u32_f32(scaled);
#else
scaled = vaddq_f32(scaled, vdupq_n_f32(0.5f));
uint32x4_t ints = vcvtq_u32_f32(scaled);
#endif

// Extract and pack into bitfields
pDestination->xm = vgetq_lane_u32(ints, 0) & 0x1FF;
pDestination->ym = vgetq_lane_u32(ints, 1) & 0x1FF;
pDestination->zm = vgetq_lane_u32(ints, 2) & 0x1FF;
pDestination->e = exp - 0x6f;

#elif defined(_XM_SSE_INTRINSICS_)

static const XMVECTORF32 MaxFloat9 = { { { float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7) } } };
static constexpr float minf9 = float(1.f / (1 << 16));

// Clamp to [0, maxf9] then mask w to zero
__m128 clamped = _mm_min_ps(_mm_max_ps(V, _mm_setzero_ps()), MaxFloat9);
clamped = _mm_and_ps(clamped, g_XMMask3);

// Horizontal max of xyz for shared exponent
__m128 maxV = clamped;
__m128 temp = XM_PERMUTE_PS(maxV, _MM_SHUFFLE(1, 1, 1, 1));
maxV = _mm_max_ps(maxV, temp);
temp = XM_PERMUTE_PS(clamped, _MM_SHUFFLE(2, 2, 2, 2));
maxV = _mm_max_ps(maxV, temp);

// Ensure minimum threshold
maxV = _mm_max_ss(maxV, _mm_set_ss(minf9));

// Compute shared exponent (inherently scalar)
union { float f; int32_t i; } fi;
_mm_store_ss(&fi.f, maxV);
fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)

auto exp = static_cast<uint32_t>(fi.i) >> 23;
fi.i = static_cast<int32_t>(0x83000000 - (exp << 23));

// Scale all channels and round to nearest integer
__m128 scaled = _mm_mul_ps(clamped, _mm_set1_ps(fi.f));
__m128i ints = _mm_cvtps_epi32(scaled);

// Extract and pack into bitfields
XM_ALIGNED_DATA(16) uint32_t ivals[4];
_mm_store_si128(reinterpret_cast<__m128i*>(ivals), ints);

pDestination->xm = ivals[0] & 0x1FF;
pDestination->ym = ivals[1] & 0x1FF;
pDestination->zm = ivals[2] & 0x1FF;
pDestination->e = exp - 0x6f;
#endif
}

//------------------------------------------------------------------------------
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ For a full change history, see [CHANGELOG.md](https://github.com/microsoft/Direc

* The clang/LLVM toolset currently does not respect the ``float_control`` pragma for SSE intrinsics. Therefore, the use of ``/fp:fast`` is not recommended on clang/LLVM until this issue is fixed. See [55713](https://github.com/llvm/llvm-project/issues/55713).

* AArch32/ARM32 (ARMv7) support is deprecated in Windows 11. Compiler support for ARM32 is deprecated in Visual Studio 2026, and the system libraries are no longer present in the Windows SDK (26100) or later. Therefore, support for ARM32 is deprecated in DirectXMath and will be removed in a future release. Since most codepaths are shared for AArch32 and AArch64, the codepaths will be refactored to assume AArch64 (ARMv8).

## Support

For questions, consider using [Stack Overflow](https://stackoverflow.com/questions/tagged/directxmath) with the *directxmath* tag, or the [DirectX Discord Server](https://discord.gg/directx) in the *dx12-developers* or *dx9-dx11-developers* channel.
Expand Down
Loading