Skip to content

Commit dddfcd6

Browse files
committed
SIMD implementation for XMLoadFloat3SE/XMStoreFloat3SE
1 parent 74a0f33 commit dddfcd6

1 file changed

Lines changed: 130 additions & 0 deletions

File tree

Inc/DirectXPackedVector.inl

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1279,6 +1279,8 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept
12791279
{
12801280
assert(pSource);
12811281

1282+
#if defined(_XM_NO_INTRINSICS_)
1283+
12821284
union { float f; int32_t i; } fi;
12831285
fi.i = 0x33800000 + (pSource->e << 23);
12841286
float Scale = fi.f;
@@ -1289,6 +1291,45 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept
12891291
Scale * float(pSource->zm),
12901292
1.0f } } };
12911293
return v;
1294+
1295+
#elif defined(_XM_ARM_NEON_INTRINSICS_)
1296+
1297+
uint32_t v = pSource->v;
1298+
1299+
// Build scale factor from shared exponent
1300+
union { float f; int32_t i; } fi;
1301+
fi.i = 0x33800000 + (static_cast<int>(v >> 27) << 23);
1302+
1303+
// Extract 9-bit mantissas into vector lanes
1304+
uint32x4_t mantissas = vdupq_n_u32(0);
1305+
mantissas = vsetq_lane_u32(v & 0x1FFu, mantissas, 0);
1306+
mantissas = vsetq_lane_u32((v >> 9) & 0x1FFu, mantissas, 1);
1307+
mantissas = vsetq_lane_u32((v >> 18) & 0x1FFu, mantissas, 2);
1308+
1309+
// Convert to float, scale, and set w = 1.0f
1310+
float32x4_t result = vmulq_n_f32(vcvtq_f32_u32(mantissas), fi.f);
1311+
return vsetq_lane_f32(1.0f, result, 3);
1312+
1313+
#elif defined(_XM_SSE_INTRINSICS_)
1314+
1315+
uint32_t v = pSource->v;
1316+
1317+
// Build scale factor from shared exponent
1318+
union { float f; int32_t i; } fi;
1319+
fi.i = 0x33800000 + (static_cast<int>(v >> 27) << 23);
1320+
1321+
// Extract 9-bit mantissas, convert to float, and scale
1322+
__m128i mantissas = _mm_set_epi32(
1323+
0,
1324+
static_cast<int>((v >> 18) & 0x1FF),
1325+
static_cast<int>((v >> 9) & 0x1FF),
1326+
static_cast<int>(v & 0x1FF));
1327+
__m128 result = _mm_mul_ps(_mm_cvtepi32_ps(mantissas), _mm_set1_ps(fi.f));
1328+
1329+
// Set w = 1.0f (w lane is +0.0f so bitwise OR inserts 1.0f cleanly)
1330+
return _mm_or_ps(result, g_XMIdentityR3);
1331+
1332+
#endif
12921333
}
12931334

12941335
//------------------------------------------------------------------------------
@@ -2639,6 +2680,8 @@ inline void XM_CALLCONV XMStoreFloat3SE
26392680
{
26402681
assert(pDestination);
26412682

2683+
#if defined(_XM_NO_INTRINSICS_)
2684+
26422685
XMFLOAT3A tmp;
26432686
XMStoreFloat3A(&tmp, V);
26442687

@@ -2667,6 +2710,93 @@ inline void XM_CALLCONV XMStoreFloat3SE
26672710
pDestination->xm = static_cast<uint32_t>(MathInternal::round_to_nearest(x * ScaleR));
26682711
pDestination->ym = static_cast<uint32_t>(MathInternal::round_to_nearest(y * ScaleR));
26692712
pDestination->zm = static_cast<uint32_t>(MathInternal::round_to_nearest(z * ScaleR));
2713+
2714+
#elif defined(_XM_ARM_NEON_INTRINSICS_)
2715+
2716+
static const XMVECTORF32 MaxFloat9 = { { { float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7) } } };
2717+
static constexpr float minf9 = float(1.f / (1 << 16));
2718+
2719+
// Clamp to [0, maxf9] then zero w lane
2720+
float32x4_t clamped = vminq_f32(vmaxq_f32(V, vdupq_n_f32(0)), MaxFloat9);
2721+
clamped = vsetq_lane_f32(0.0f, clamped, 3);
2722+
2723+
// Horizontal max of xyz for shared exponent
2724+
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
2725+
float maxVal = vmaxvq_f32(clamped);
2726+
#else
2727+
float32x2_t vlow = vget_low_f32(clamped);
2728+
float32x2_t vhigh = vget_high_f32(clamped);
2729+
float32x2_t maxPair = vpmax_f32(vlow, vhigh);
2730+
maxPair = vpmax_f32(maxPair, maxPair);
2731+
float maxVal = vget_lane_f32(maxPair, 0);
2732+
#endif
2733+
2734+
if (maxVal < minf9) maxVal = minf9;
2735+
2736+
// Compute shared exponent (inherently scalar)
2737+
union { float f; int32_t i; } fi;
2738+
fi.f = maxVal;
2739+
fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)
2740+
2741+
auto exp = static_cast<uint32_t>(fi.i) >> 23;
2742+
fi.i = static_cast<int32_t>(0x83000000 - (exp << 23));
2743+
2744+
// Scale all channels and convert to integer
2745+
float32x4_t scaled = vmulq_n_f32(clamped, fi.f);
2746+
#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__
2747+
uint32x4_t ints = vcvtnq_u32_f32(scaled);
2748+
#else
2749+
scaled = vaddq_f32(scaled, vdupq_n_f32(0.5f));
2750+
uint32x4_t ints = vcvtq_u32_f32(scaled);
2751+
#endif
2752+
2753+
// Extract and pack into bitfields
2754+
pDestination->v = (vgetq_lane_u32(ints, 0) & 0x1FF)
2755+
| ((vgetq_lane_u32(ints, 1) & 0x1FF) << 9)
2756+
| ((vgetq_lane_u32(ints, 2) & 0x1FF) << 18)
2757+
| ((exp - 0x6f) << 27);
2758+
2759+
#elif defined(_XM_SSE_INTRINSICS_)
2760+
2761+
static const XMVECTORF32 MaxFloat9 = { { { float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7) } } };
2762+
static constexpr float minf9 = float(1.f / (1 << 16));
2763+
2764+
// Clamp to [0, maxf9] then mask w to zero
2765+
__m128 clamped = _mm_min_ps(_mm_max_ps(V, _mm_setzero_ps()), MaxFloat9);
2766+
clamped = _mm_and_ps(clamped, g_XMMask3);
2767+
2768+
// Horizontal max of xyz for shared exponent
2769+
__m128 maxV = clamped;
2770+
__m128 temp = XM_PERMUTE_PS(maxV, _MM_SHUFFLE(1, 1, 1, 1));
2771+
maxV = _mm_max_ps(maxV, temp);
2772+
temp = XM_PERMUTE_PS(clamped, _MM_SHUFFLE(2, 2, 2, 2));
2773+
maxV = _mm_max_ps(maxV, temp);
2774+
2775+
// Ensure minimum threshold
2776+
maxV = _mm_max_ss(maxV, _mm_set_ss(minf9));
2777+
2778+
// Compute shared exponent (inherently scalar)
2779+
union { float f; int32_t i; } fi;
2780+
_mm_store_ss(&fi.f, maxV);
2781+
fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1)
2782+
2783+
auto exp = static_cast<uint32_t>(fi.i) >> 23;
2784+
fi.i = static_cast<int32_t>(0x83000000 - (exp << 23));
2785+
2786+
// Scale all channels and round to nearest integer
2787+
__m128 scaled = _mm_mul_ps(clamped, _mm_set1_ps(fi.f));
2788+
__m128i ints = _mm_cvtps_epi32(scaled);
2789+
2790+
// Extract and pack into bitfields
2791+
XM_ALIGNED_DATA(16) uint32_t ivals[4];
2792+
_mm_store_si128(reinterpret_cast<__m128i*>(ivals), ints);
2793+
2794+
pDestination->v = (ivals[0] & 0x1FF)
2795+
| ((ivals[1] & 0x1FF) << 9)
2796+
| ((ivals[2] & 0x1FF) << 18)
2797+
| ((exp - 0x6f) << 27);
2798+
2799+
#endif
26702800
}
26712801

26722802
//------------------------------------------------------------------------------

0 commit comments

Comments
 (0)