From 89689e631418828605b2a688d3e3d0c955b8dfbd Mon Sep 17 00:00:00 2001
From: Chuck Walbourn
Date: Wed, 13 May 2026 21:57:34 -0700
Subject: [PATCH 1/3] Minor XMLoadFloat3A SSE4 optimization

---
 Inc/DirectXMathConvert.inl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Inc/DirectXMathConvert.inl b/Inc/DirectXMathConvert.inl
index 69218dc..1d3f1c1 100644
--- a/Inc/DirectXMathConvert.inl
+++ b/Inc/DirectXMathConvert.inl
@@ -541,6 +541,10 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3A(const XMFLOAT3A* pSource) noexcept
     float32x4_t V = vld1q_f32(reinterpret_cast<const float*>(pSource));
 #endif
     return vsetq_lane_f32(0, V, 3);
+#elif defined(_XM_SSE4_INTRINSICS_)
+    // Reads an extra float which is zero'd
+    __m128 V = _mm_load_ps(&pSource->x);
+    return _mm_blend_ps(_mm_setzero_ps(), V, 0x7);
 #elif defined(_XM_SSE_INTRINSICS_)
     // Reads an extra float which is zero'd
     __m128 V = _mm_load_ps(&pSource->x);

From 47c89f2683dd1e6df776cb1653d484df6c82f221 Mon Sep 17 00:00:00 2001
From: Chuck Walbourn
Date: Thu, 14 May 2026 09:42:23 -0700
Subject: [PATCH 2/3] XMLoadFloat4x3 can use the same trick

---
 Inc/DirectXMathConvert.inl | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/Inc/DirectXMathConvert.inl b/Inc/DirectXMathConvert.inl
index 1d3f1c1..3563051 100644
--- a/Inc/DirectXMathConvert.inl
+++ b/Inc/DirectXMathConvert.inl
@@ -869,6 +869,24 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
     M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
     M.r[3] = vsetq_lane_f32(1.f, T3, 3);
     return M;
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
+    XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
+    XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
+    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2));
+    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0));
+    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
+    XMVECTOR zero = _mm_setzero_ps();
+    vTemp1 = _mm_blend_ps(zero, vTemp1, 0x7);
+    vTemp2 = _mm_blend_ps(zero, vTemp2, 0x7);
+    vTemp3 = _mm_blend_ps(zero, vTemp3, 0x7);
+    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
+    vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3);
+    XMMATRIX M(vTemp1,
+        vTemp2,
+        vTemp3,
+        _mm_castsi128_ps(vTemp4i));
+    return M;
 #elif defined(_XM_SSE_INTRINSICS_)
     // Use unaligned load instructions to
     // load the 12 floats
@@ -953,6 +971,24 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
     M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
     M.r[3] = vsetq_lane_f32(1.f, T3, 3);
     return M;
+#elif defined(_XM_SSE4_INTRINSICS_)
+    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
+    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
+    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
+    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2));
+    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0));
+    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
+    XMVECTOR zero = _mm_setzero_ps();
+    vTemp1 = _mm_blend_ps(zero, vTemp1, 0x7);
+    vTemp2 = _mm_blend_ps(zero, vTemp2, 0x7);
+    vTemp3 = _mm_blend_ps(zero, vTemp3, 0x7);
+    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
+    vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3);
+    XMMATRIX M(vTemp1,
+        vTemp2,
+        vTemp3,
+        _mm_castsi128_ps(vTemp4i));
+    return M;
 #elif defined(_XM_SSE_INTRINSICS_)
     // Use aligned load instructions to
     // load the 12 floats
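Both patches above swap the SSE2 masking idiom, _mm_and_ps(V, g_XMMask3), for the SSE4.1 blend _mm_blend_ps(_mm_setzero_ps(), V, 0x7): mask bits 0-2 select the x, y, z lanes from V, and lane 3 comes from the zeroed register. The likely win is that _mm_setzero_ps() compiles to a register-zeroing idiom, while g_XMMask3 is a constant that must be loaded from memory. Below is a minimal standalone sketch of the equivalence; the main() harness, array names, and the locally defined mask3 (standing in for the library's g_XMMask3) are illustrative only and not part of the patches:

    // Compare the SSE4.1 blend against the SSE2 and-mask for zeroing the w lane.
    #include <smmintrin.h> // SSE4.1
    #include <cstdio>

    int main()
    {
        alignas(16) float src[4] = { 1.0f, 2.0f, 3.0f, 99.0f }; // w lane is garbage
        __m128 v = _mm_load_ps(src);

        // SSE4.1: lanes 0-2 from v (mask 0b0111), lane 3 from a zeroed register.
        __m128 blended = _mm_blend_ps(_mm_setzero_ps(), v, 0x7);

        // SSE2: AND with a {~0,~0,~0,0} mask; mask3 plays the role of g_XMMask3.
        const __m128 mask3 = _mm_castsi128_ps(_mm_setr_epi32(-1, -1, -1, 0));
        __m128 masked = _mm_and_ps(v, mask3);

        alignas(16) float a[4], b[4];
        _mm_store_ps(a, blended);
        _mm_store_ps(b, masked);
        std::printf("blend: %g %g %g %g\n", a[0], a[1], a[2], a[3]); // 1 2 3 0
        std::printf("mask:  %g %g %g %g\n", b[0], b[1], b[2], b[3]); // 1 2 3 0
        return 0;
    }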
From 0fcaeff33df0166e90ef01518c61494be4cae46b Mon Sep 17 00:00:00 2001
From: Chuck Walbourn
Date: Thu, 14 May 2026 09:49:42 -0700
Subject: [PATCH 3/3] Code review feedback

---
 Inc/DirectXMathConvert.inl | 58 ++++++++++++--------------------------
 1 file changed, 18 insertions(+), 40 deletions(-)

diff --git a/Inc/DirectXMathConvert.inl b/Inc/DirectXMathConvert.inl
index 3563051..e3a6bdc 100644
--- a/Inc/DirectXMathConvert.inl
+++ b/Inc/DirectXMathConvert.inl
@@ -869,24 +869,6 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
     M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
     M.r[3] = vsetq_lane_f32(1.f, T3, 3);
     return M;
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
-    XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
-    XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
-    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2));
-    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0));
-    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
-    XMVECTOR zero = _mm_setzero_ps();
-    vTemp1 = _mm_blend_ps(zero, vTemp1, 0x7);
-    vTemp2 = _mm_blend_ps(zero, vTemp2, 0x7);
-    vTemp3 = _mm_blend_ps(zero, vTemp3, 0x7);
-    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
-    vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3);
-    XMMATRIX M(vTemp1,
-        vTemp2,
-        vTemp3,
-        _mm_castsi128_ps(vTemp4i));
-    return M;
 #elif defined(_XM_SSE_INTRINSICS_)
     // Use unaligned load instructions to
     // load the 12 floats
@@ -903,11 +885,18 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
     // vTemp2 = x2,y2,z2,z2
     vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
     // vTemp1 = x1,y1,z1,0
-    vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
     // vTemp2 = x2,y2,z2,0
-    vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
     // vTemp3 = x3,y3,z3,0
+#ifdef _XM_SSE4_INTRINSICS_
+    XMVECTOR zero = _mm_setzero_ps();
+    vTemp1 = _mm_blend_ps(zero, vTemp1, 0x7);
+    vTemp2 = _mm_blend_ps(zero, vTemp2, 0x7);
+    vTemp3 = _mm_blend_ps(zero, vTemp3, 0x7);
+#else
+    vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
+    vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
     vTemp3 = _mm_and_ps(vTemp3, g_XMMask3);
+#endif
     // vTemp4i = x4,y4,z4,0
     __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
     // vTemp4i = x4,y4,z4,1.0f
@@ -971,24 +960,6 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
     M.r[2] = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(T2), g_XMMask3));
     M.r[3] = vsetq_lane_f32(1.f, T3, 3);
     return M;
-#elif defined(_XM_SSE4_INTRINSICS_)
-    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
-    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
-    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
-    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2, vTemp4, _MM_SHUFFLE(0, 0, 3, 2));
-    vTemp2 = _mm_shuffle_ps(vTemp2, vTemp1, _MM_SHUFFLE(3, 3, 1, 0));
-    vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
-    XMVECTOR zero = _mm_setzero_ps();
-    vTemp1 = _mm_blend_ps(zero, vTemp1, 0x7);
-    vTemp2 = _mm_blend_ps(zero, vTemp2, 0x7);
-    vTemp3 = _mm_blend_ps(zero, vTemp3, 0x7);
-    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
-    vTemp4i = _mm_or_si128(vTemp4i, g_XMIdentityR3);
-    XMMATRIX M(vTemp1,
-        vTemp2,
-        vTemp3,
-        _mm_castsi128_ps(vTemp4i));
-    return M;
 #elif defined(_XM_SSE_INTRINSICS_)
     // Use aligned load instructions to
     // load the 12 floats
@@ -1005,11 +976,18 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
     // vTemp2 = x2,y2,z2,z2
     vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
     // vTemp1 = x1,y1,z1,0
-    vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
     // vTemp2 = x2,y2,z2,0
-    vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
     // vTemp3 = x3,y3,z3,0
+#ifdef _XM_SSE4_INTRINSICS_
+    XMVECTOR zero = _mm_setzero_ps();
+    vTemp1 = _mm_blend_ps(zero, vTemp1, 0x7);
+    vTemp2 = _mm_blend_ps(zero, vTemp2, 0x7);
+    vTemp3 = _mm_blend_ps(zero, vTemp3, 0x7);
+#else
+    vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
+    vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
     vTemp3 = _mm_and_ps(vTemp3, g_XMMask3);
+#endif
     // vTemp4i = x4,y4,z4,0
     __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
     // vTemp4i = x4,y4,z4,1.0f
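All three patches leave the last-row construction unchanged: the final load picks up {z3, x4, y4, z4}, _mm_srli_si128 shifts the vector right one lane (32 / 8 = 4 bytes) to give {x4, y4, z4, 0}, and ORing with g_XMIdentityR3 (bit pattern {0, 0, 0, 1.0f}) sets w to exactly 1.0f. A standalone sketch of that step, assuming a row-major 4x3 float array; row3_from and IDENTITY_R3 are hypothetical stand-ins, not DirectXMath names:

    // Build the homogeneous row {x4, y4, z4, 1.0f} from the last 4 packed floats.
    #include <emmintrin.h> // SSE2 is enough for this part
    #include <cstdio>

    static __m128 row3_from(const float* p) // p points at m[2][2], i.e. z3
    {
        __m128 v = _mm_loadu_ps(p);                                // z3 x4 y4 z4
        __m128i vi = _mm_srli_si128(_mm_castps_si128(v), 32 / 8);  // x4 y4 z4 0
        const __m128i IDENTITY_R3 =
            _mm_setr_epi32(0, 0, 0, 0x3F800000);                   // 0 0 0 1.0f
        return _mm_castsi128_ps(_mm_or_si128(vi, IDENTITY_R3));    // x4 y4 z4 1
    }

    int main()
    {
        // A 4x3 matrix stored row-major as 12 floats; the last row is 10,11,12.
        float m[12] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
        alignas(16) float out[4];
        _mm_store_ps(out, row3_from(&m[8])); // &m[2][2] in XMFLOAT4X3 terms
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 10 11 12 1
        return 0;
    }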