@@ -1279,6 +1279,8 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept
12791279{
12801280 assert (pSource);
12811281
1282+ #if defined(_XM_NO_INTRINSICS_)
1283+
12821284 union { float f; int32_t i; } fi;
12831285 fi.i = 0x33800000 + (pSource->e << 23 );
12841286 float Scale = fi.f ;
@@ -1289,6 +1291,45 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept
12891291 Scale * float (pSource->zm ),
12901292 1 .0f } } };
12911293 return v;
1294+
1295+ #elif defined(_XM_ARM_NEON_INTRINSICS_)
1296+
1297+ uint32_t v = pSource->v ;
1298+
1299+ // Build scale factor from shared exponent
1300+ union { float f; int32_t i; } fi;
1301+ fi.i = 0x33800000 + (static_cast <int >(v >> 27 ) << 23 );
1302+
1303+ // Extract 9-bit mantissas into vector lanes
1304+ uint32x4_t mantissas = vdupq_n_u32 (0 );
1305+ mantissas = vsetq_lane_u32 (v & 0x1FFu , mantissas, 0 );
1306+ mantissas = vsetq_lane_u32 ((v >> 9 ) & 0x1FFu , mantissas, 1 );
1307+ mantissas = vsetq_lane_u32 ((v >> 18 ) & 0x1FFu , mantissas, 2 );
1308+
1309+ // Convert to float, scale, and set w = 1.0f
1310+ float32x4_t result = vmulq_n_f32 (vcvtq_f32_u32 (mantissas), fi.f );
1311+ return vsetq_lane_f32 (1 .0f , result, 3 );
1312+
1313+ #elif defined(_XM_SSE_INTRINSICS_)
1314+
1315+ uint32_t v = pSource->v ;
1316+
1317+ // Build scale factor from shared exponent
1318+ union { float f; int32_t i; } fi;
1319+ fi.i = 0x33800000 + (static_cast <int >(v >> 27 ) << 23 );
1320+
1321+ // Extract 9-bit mantissas, convert to float, and scale
1322+ __m128i mantissas = _mm_set_epi32 (
1323+ 0 ,
1324+ static_cast <int >((v >> 18 ) & 0x1FF ),
1325+ static_cast <int >((v >> 9 ) & 0x1FF ),
1326+ static_cast <int >(v & 0x1FF ));
1327+ __m128 result = _mm_mul_ps (_mm_cvtepi32_ps (mantissas), _mm_set1_ps (fi.f ));
1328+
1329+ // Set w = 1.0f (w lane is +0.0f so bitwise OR inserts 1.0f cleanly)
1330+ return _mm_or_ps (result, g_XMIdentityR3);
1331+
1332+ #endif
12921333}
12931334
12941335// ------------------------------------------------------------------------------
@@ -2639,6 +2680,8 @@ inline void XM_CALLCONV XMStoreFloat3SE
26392680{
26402681 assert (pDestination);
26412682
2683+ #if defined(_XM_NO_INTRINSICS_)
2684+
26422685 XMFLOAT3A tmp;
26432686 XMStoreFloat3A (&tmp, V);
26442687
@@ -2667,6 +2710,93 @@ inline void XM_CALLCONV XMStoreFloat3SE
26672710 pDestination->xm = static_cast <uint32_t >(MathInternal::round_to_nearest (x * ScaleR));
26682711 pDestination->ym = static_cast <uint32_t >(MathInternal::round_to_nearest (y * ScaleR));
26692712 pDestination->zm = static_cast <uint32_t >(MathInternal::round_to_nearest (z * ScaleR));
2713+
2714+ #elif defined(_XM_ARM_NEON_INTRINSICS_)
2715+
2716+ static const XMVECTORF32 MaxFloat9 = { { { float (0x1FF << 7 ), float (0x1FF << 7 ), float (0x1FF << 7 ), float (0x1FF << 7 ) } } };
2717+ static constexpr float minf9 = float (1 .f / (1 << 16 ));
2718+
2719+ // Clamp to [0, maxf9] then zero w lane
2720+ float32x4_t clamped = vminq_f32 (vmaxq_f32 (V, vdupq_n_f32 (0 )), MaxFloat9);
2721+ clamped = vsetq_lane_f32 (0 .0f , clamped, 3 );
2722+
2723+ // Horizontal max of xyz for shared exponent
2724+ #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
2725+ float maxVal = vmaxvq_f32 (clamped);
2726+ #else
2727+ float32x2_t vlow = vget_low_f32 (clamped);
2728+ float32x2_t vhigh = vget_high_f32 (clamped);
2729+ float32x2_t maxPair = vpmax_f32 (vlow, vhigh);
2730+ maxPair = vpmax_f32 (maxPair, maxPair);
2731+ float maxVal = vget_lane_f32 (maxPair, 0 );
2732+ #endif
2733+
2734+ if (maxVal < minf9) maxVal = minf9;
2735+
2736+ // Compute shared exponent (inherently scalar)
2737+ union { float f; int32_t i; } fi;
2738+ fi.f = maxVal;
2739+ fi.i += 0x00004000 ; // round up leaving 9 bits in fraction (including assumed 1)
2740+
2741+ auto exp = static_cast <uint32_t >(fi.i ) >> 23 ;
2742+ fi.i = static_cast <int32_t >(0x83000000 - (exp << 23 ));
2743+
2744+ // Scale all channels and convert to integer
2745+ float32x4_t scaled = vmulq_n_f32 (clamped, fi.f );
2746+ #if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
2747+ uint32x4_t ints = vcvtnq_u32_f32 (scaled);
2748+ #else
2749+ scaled = vaddq_f32 (scaled, vdupq_n_f32 (0 .5f ));
2750+ uint32x4_t ints = vcvtq_u32_f32 (scaled);
2751+ #endif
2752+
2753+ // Extract and pack into bitfields
2754+ pDestination->v = (vgetq_lane_u32 (ints, 0 ) & 0x1FF )
2755+ | ((vgetq_lane_u32 (ints, 1 ) & 0x1FF ) << 9 )
2756+ | ((vgetq_lane_u32 (ints, 2 ) & 0x1FF ) << 18 )
2757+ | ((exp - 0x6f ) << 27 );
2758+
2759+ #elif defined(_XM_SSE_INTRINSICS_)
2760+
2761+ static const XMVECTORF32 MaxFloat9 = { { { float (0x1FF << 7 ), float (0x1FF << 7 ), float (0x1FF << 7 ), float (0x1FF << 7 ) } } };
2762+ static constexpr float minf9 = float (1 .f / (1 << 16 ));
2763+
2764+ // Clamp to [0, maxf9] then mask w to zero
2765+ __m128 clamped = _mm_min_ps (_mm_max_ps (V, _mm_setzero_ps ()), MaxFloat9);
2766+ clamped = _mm_and_ps (clamped, g_XMMask3);
2767+
2768+ // Horizontal max of xyz for shared exponent
2769+ __m128 maxV = clamped;
2770+ __m128 temp = XM_PERMUTE_PS (maxV, _MM_SHUFFLE (1 , 1 , 1 , 1 ));
2771+ maxV = _mm_max_ps (maxV, temp);
2772+ temp = XM_PERMUTE_PS (clamped, _MM_SHUFFLE (2 , 2 , 2 , 2 ));
2773+ maxV = _mm_max_ps (maxV, temp);
2774+
2775+ // Ensure minimum threshold
2776+ maxV = _mm_max_ss (maxV, _mm_set_ss (minf9));
2777+
2778+ // Compute shared exponent (inherently scalar)
2779+ union { float f; int32_t i; } fi;
2780+ _mm_store_ss (&fi.f , maxV);
2781+ fi.i += 0x00004000 ; // round up leaving 9 bits in fraction (including assumed 1)
2782+
2783+ auto exp = static_cast <uint32_t >(fi.i ) >> 23 ;
2784+ fi.i = static_cast <int32_t >(0x83000000 - (exp << 23 ));
2785+
2786+ // Scale all channels and round to nearest integer
2787+ __m128 scaled = _mm_mul_ps (clamped, _mm_set1_ps (fi.f ));
2788+ __m128i ints = _mm_cvtps_epi32 (scaled);
2789+
2790+ // Extract and pack into bitfields
2791+ XM_ALIGNED_DATA (16 ) uint32_t ivals[4 ];
2792+ _mm_store_si128 (reinterpret_cast <__m128i*>(ivals), ints);
2793+
2794+ pDestination->v = (ivals[0 ] & 0x1FF )
2795+ | ((ivals[1 ] & 0x1FF ) << 9 )
2796+ | ((ivals[2 ] & 0x1FF ) << 18 )
2797+ | ((exp - 0x6f ) << 27 );
2798+
2799+ #endif
26702800}
26712801
26722802// ------------------------------------------------------------------------------
0 commit comments