@@ -541,6 +541,10 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3A(const XMFLOAT3A* pSource) noexcept
541541 float32x4_t V = vld1q_f32 (reinterpret_cast <const float *>(pSource));
542542#endif
543543 return vsetq_lane_f32 (0 , V, 3 );
544+ #elif defined(_XM_SSE4_INTRINSICS_)
545+ // Reads an extra float which is zero'd
546+ __m128 V = _mm_load_ps (&pSource->x );
547+ return _mm_blend_ps (_mm_setzero_ps (), V, 0x7 );
544548#elif defined(_XM_SSE_INTRINSICS_)
545549 // Reads an extra float which is zero'd
546550 __m128 V = _mm_load_ps (&pSource->x );
@@ -881,11 +885,18 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
881885 // vTemp2 = x2,y2,z2,z2
882886 vTemp2 = XM_PERMUTE_PS (vTemp2, _MM_SHUFFLE (1 , 1 , 0 , 2 ));
883887 // vTemp1 = x1,y1,z1,0
884- vTemp1 = _mm_and_ps (vTemp1, g_XMMask3);
885888 // vTemp2 = x2,y2,z2,0
886- vTemp2 = _mm_and_ps (vTemp2, g_XMMask3);
887889 // vTemp3 = x3,y3,z3,0
890+ #ifdef _XM_SSE4_INTRINSICS_
891+ XMVECTOR zero = _mm_setzero_ps ();
892+ vTemp1 = _mm_blend_ps (zero, vTemp1, 0x7 );
893+ vTemp2 = _mm_blend_ps (zero, vTemp2, 0x7 );
894+ vTemp3 = _mm_blend_ps (zero, vTemp3, 0x7 );
895+ #else
896+ vTemp1 = _mm_and_ps (vTemp1, g_XMMask3);
897+ vTemp2 = _mm_and_ps (vTemp2, g_XMMask3);
888898 vTemp3 = _mm_and_ps (vTemp3, g_XMMask3);
899+ #endif
889900 // vTemp4i = x4,y4,z4,0
890901 __m128i vTemp4i = _mm_srli_si128 (_mm_castps_si128 (vTemp4), 32 / 8 );
891902 // vTemp4i = x4,y4,z4,1.0f
@@ -965,11 +976,18 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
965976 // vTemp2 = x2,y2,z2,z2
966977 vTemp2 = XM_PERMUTE_PS (vTemp2, _MM_SHUFFLE (1 , 1 , 0 , 2 ));
967978 // vTemp1 = x1,y1,z1,0
968- vTemp1 = _mm_and_ps (vTemp1, g_XMMask3);
969979 // vTemp2 = x2,y2,z2,0
970- vTemp2 = _mm_and_ps (vTemp2, g_XMMask3);
971980 // vTemp3 = x3,y3,z3,0
981+ #ifdef _XM_SSE4_INTRINSICS_
982+ XMVECTOR zero = _mm_setzero_ps ();
983+ vTemp1 = _mm_blend_ps (zero, vTemp1, 0x7 );
984+ vTemp2 = _mm_blend_ps (zero, vTemp2, 0x7 );
985+ vTemp3 = _mm_blend_ps (zero, vTemp3, 0x7 );
986+ #else
987+ vTemp1 = _mm_and_ps (vTemp1, g_XMMask3);
988+ vTemp2 = _mm_and_ps (vTemp2, g_XMMask3);
972989 vTemp3 = _mm_and_ps (vTemp3, g_XMMask3);
990+ #endif
973991 // vTemp4i = x4,y4,z4,0
974992 __m128i vTemp4i = _mm_srli_si128 (_mm_castps_si128 (vTemp4), 32 / 8 );
975993 // vTemp4i = x4,y4,z4,1.0f
0 commit comments