@@ -869,24 +869,6 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
869869 M.r [2 ] = vreinterpretq_f32_u32 (vandq_u32 (vreinterpretq_u32_f32 (T2), g_XMMask3));
870870 M.r [3 ] = vsetq_lane_f32 (1 .f , T3, 3 );
871871 return M;
872- #elif defined(_XM_SSE4_INTRINSICS_)
873- XMVECTOR vTemp1 = _mm_loadu_ps (&pSource->m [0 ][0 ]);
874- XMVECTOR vTemp2 = _mm_loadu_ps (&pSource->m [1 ][1 ]);
875- XMVECTOR vTemp4 = _mm_loadu_ps (&pSource->m [2 ][2 ]);
876- XMVECTOR vTemp3 = _mm_shuffle_ps (vTemp2, vTemp4, _MM_SHUFFLE (0 , 0 , 3 , 2 ));
877- vTemp2 = _mm_shuffle_ps (vTemp2, vTemp1, _MM_SHUFFLE (3 , 3 , 1 , 0 ));
878- vTemp2 = XM_PERMUTE_PS (vTemp2, _MM_SHUFFLE (1 , 1 , 0 , 2 ));
879- XMVECTOR zero = _mm_setzero_ps ();
880- vTemp1 = _mm_blend_ps (zero, vTemp1, 0x7 );
881- vTemp2 = _mm_blend_ps (zero, vTemp2, 0x7 );
882- vTemp3 = _mm_blend_ps (zero, vTemp3, 0x7 );
883- __m128i vTemp4i = _mm_srli_si128 (_mm_castps_si128 (vTemp4), 32 / 8 );
884- vTemp4i = _mm_or_si128 (vTemp4i, g_XMIdentityR3);
885- XMMATRIX M (vTemp1,
886- vTemp2,
887- vTemp3,
888- _mm_castsi128_ps (vTemp4i));
889- return M;
890872#elif defined(_XM_SSE_INTRINSICS_)
891873 // Use unaligned load instructions to
892874 // load the 12 floats
@@ -903,11 +885,18 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3(const XMFLOAT4X3* pSource) noexcept
903885 // vTemp2 = x2,y2,z2,z2
904886 vTemp2 = XM_PERMUTE_PS (vTemp2, _MM_SHUFFLE (1 , 1 , 0 , 2 ));
905887 // vTemp1 = x1,y1,z1,0
906- vTemp1 = _mm_and_ps (vTemp1, g_XMMask3);
907888 // vTemp2 = x2,y2,z2,0
908- vTemp2 = _mm_and_ps (vTemp2, g_XMMask3);
909889 // vTemp3 = x3,y3,z3,0
890+ #ifdef _XM_SSE4_INTRINSICS_
891+ XMVECTOR zero = _mm_setzero_ps ();
892+ vTemp1 = _mm_blend_ps (zero, vTemp1, 0x7 );
893+ vTemp2 = _mm_blend_ps (zero, vTemp2, 0x7 );
894+ vTemp3 = _mm_blend_ps (zero, vTemp3, 0x7 );
895+ #else
896+ vTemp1 = _mm_and_ps (vTemp1, g_XMMask3);
897+ vTemp2 = _mm_and_ps (vTemp2, g_XMMask3);
910898 vTemp3 = _mm_and_ps (vTemp3, g_XMMask3);
899+ #endif
911900 // vTemp4i = x4,y4,z4,0
912901 __m128i vTemp4i = _mm_srli_si128 (_mm_castps_si128 (vTemp4), 32 / 8 );
913902 // vTemp4i = x4,y4,z4,1.0f
@@ -971,24 +960,6 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
971960 M.r [2 ] = vreinterpretq_f32_u32 (vandq_u32 (vreinterpretq_u32_f32 (T2), g_XMMask3));
972961 M.r [3 ] = vsetq_lane_f32 (1 .f , T3, 3 );
973962 return M;
974- #elif defined(_XM_SSE4_INTRINSICS_)
975- XMVECTOR vTemp1 = _mm_load_ps (&pSource->m [0 ][0 ]);
976- XMVECTOR vTemp2 = _mm_load_ps (&pSource->m [1 ][1 ]);
977- XMVECTOR vTemp4 = _mm_load_ps (&pSource->m [2 ][2 ]);
978- XMVECTOR vTemp3 = _mm_shuffle_ps (vTemp2, vTemp4, _MM_SHUFFLE (0 , 0 , 3 , 2 ));
979- vTemp2 = _mm_shuffle_ps (vTemp2, vTemp1, _MM_SHUFFLE (3 , 3 , 1 , 0 ));
980- vTemp2 = XM_PERMUTE_PS (vTemp2, _MM_SHUFFLE (1 , 1 , 0 , 2 ));
981- XMVECTOR zero = _mm_setzero_ps ();
982- vTemp1 = _mm_blend_ps (zero, vTemp1, 0x7 );
983- vTemp2 = _mm_blend_ps (zero, vTemp2, 0x7 );
984- vTemp3 = _mm_blend_ps (zero, vTemp3, 0x7 );
985- __m128i vTemp4i = _mm_srli_si128 (_mm_castps_si128 (vTemp4), 32 / 8 );
986- vTemp4i = _mm_or_si128 (vTemp4i, g_XMIdentityR3);
987- XMMATRIX M (vTemp1,
988- vTemp2,
989- vTemp3,
990- _mm_castsi128_ps (vTemp4i));
991- return M;
992963#elif defined(_XM_SSE_INTRINSICS_)
993964 // Use aligned load instructions to
994965 // load the 12 floats
@@ -1005,11 +976,18 @@ inline XMMATRIX XM_CALLCONV XMLoadFloat4x3A(const XMFLOAT4X3A* pSource) noexcept
1005976 // vTemp2 = x2,y2,z2,z2
1006977 vTemp2 = XM_PERMUTE_PS (vTemp2, _MM_SHUFFLE (1 , 1 , 0 , 2 ));
1007978 // vTemp1 = x1,y1,z1,0
1008- vTemp1 = _mm_and_ps (vTemp1, g_XMMask3);
1009979 // vTemp2 = x2,y2,z2,0
1010- vTemp2 = _mm_and_ps (vTemp2, g_XMMask3);
1011980 // vTemp3 = x3,y3,z3,0
981+ #ifdef _XM_SSE4_INTRINSICS_
982+ XMVECTOR zero = _mm_setzero_ps ();
983+ vTemp1 = _mm_blend_ps (zero, vTemp1, 0x7 );
984+ vTemp2 = _mm_blend_ps (zero, vTemp2, 0x7 );
985+ vTemp3 = _mm_blend_ps (zero, vTemp3, 0x7 );
986+ #else
987+ vTemp1 = _mm_and_ps (vTemp1, g_XMMask3);
988+ vTemp2 = _mm_and_ps (vTemp2, g_XMMask3);
1012989 vTemp3 = _mm_and_ps (vTemp3, g_XMMask3);
990+ #endif
1013991 // vTemp4i = x4,y4,z4,0
1014992 __m128i vTemp4i = _mm_srli_si128 (_mm_castps_si128 (vTemp4), 32 / 8 );
1015993 // vTemp4i = x4,y4,z4,1.0f
0 commit comments