Skip to content

Commit 74a0f33

Browse files
authored
Minor XMLoadFloat3A/4x3(A) SSE4 optimization (#307)
1 parent 6504050 commit 74a0f33

1 file changed

Lines changed: 22 additions & 4 deletions

File tree

Inc/DirectXMathConvert.inl

Lines changed: 22 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -541,6 +541,10 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3A(const XMFLOAT3A* pSource) noexcept
541541
float32x4_t V = vld1q_f32(reinterpret_cast<const float*>(pSource));
542542
#endif
543543
return vsetq_lane_f32(0, V, 3);
544+
#elif defined(_XM_SSE4_INTRINSICS_)
545+
// Reads an extra float which is zero'd
546+
__m128 V = _mm_load_ps(&pSource->x);
547+
return _mm_blend_ps(_mm_setzero_ps(), V, 0x7);
544548
#elif defined(_XM_SSE_INTRINSICS_)
545549
// Reads an extra float which is zero'd
546550
__m128 V = _mm_load_ps(&pSource->x);
@@ -881,11 +885,18 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
881885
// vTemp2 = x2,y2,z2,z2
882886
vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
883887
// vTemp1 = x1,y1,z1,0
884-
vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
885888
// vTemp2 = x2,y2,z2,0
886-
vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
887889
// vTemp3 = x3,y3,z3,0
890+
#ifdef _XM_SSE4_INTRINSICS_
891+
XMVECTOR zero = _mm_setzero_ps();
892+
vTemp1 = _mm_blend_ps(zero, vTemp1, 0x7);
893+
vTemp2 = _mm_blend_ps(zero, vTemp2, 0x7);
894+
vTemp3 = _mm_blend_ps(zero, vTemp3, 0x7);
895+
#else
896+
vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
897+
vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
888898
vTemp3 = _mm_and_ps(vTemp3, g_XMMask3);
899+
#endif
889900
// vTemp4i = x4,y4,z4,0
890901
__m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
891902
// vTemp4i = x4,y4,z4,1.0f
@@ -965,11 +976,18 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
965976
// vTemp2 = x2,y2,z2,z2
966977
vTemp2 = XM_PERMUTE_PS(vTemp2, _MM_SHUFFLE(1, 1, 0, 2));
967978
// vTemp1 = x1,y1,z1,0
968-
vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
969979
// vTemp2 = x2,y2,z2,0
970-
vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
971980
// vTemp3 = x3,y3,z3,0
981+
#ifdef _XM_SSE4_INTRINSICS_
982+
XMVECTOR zero = _mm_setzero_ps();
983+
vTemp1 = _mm_blend_ps(zero, vTemp1, 0x7);
984+
vTemp2 = _mm_blend_ps(zero, vTemp2, 0x7);
985+
vTemp3 = _mm_blend_ps(zero, vTemp3, 0x7);
986+
#else
987+
vTemp1 = _mm_and_ps(vTemp1, g_XMMask3);
988+
vTemp2 = _mm_and_ps(vTemp2, g_XMMask3);
972989
vTemp3 = _mm_and_ps(vTemp3, g_XMMask3);
990+
#endif
973991
// vTemp4i = x4,y4,z4,0
974992
__m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4), 32 / 8);
975993
// vTemp4i = x4,y4,z4,1.0f

0 commit comments

Comments
 (0)