-
Notifications
You must be signed in to change notification settings - Fork 251
Expand file tree
/
Copy pathDirectXMathAVX.h
More file actions
275 lines (219 loc) · 13.2 KB
/
DirectXMathAVX.h
File metadata and controls
275 lines (219 loc) · 13.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
//-------------------------------------------------------------------------------------
// DirectXMathAVX.h -- AVX (version 1) extensions for SIMD C++ Math library
//
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
//
// https://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------
#pragma once
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__
#error AVX not supported on ARM platform
#endif
#include <DirectXMath.h>
namespace DirectX
{
namespace AVX
{
// Runtime check that the CPU (and OS, via OSXSAVE) supports the AVX instructions
// used by this header. Call once at startup before using any AVX:: routine.
inline bool XMVerifyAVXSupport()
{
// Should return true for AMD Bulldozer, Intel "Sandy Bridge", and Intel "Ivy Bridge" or later processors
// with OS support for AVX (Windows 7 Service Pack 1, Windows Server 2008 R2 Service Pack 1, Windows 8, Windows Server 2012)
// See https://msdn.microsoft.com/en-us/library/hskdteyh.aspx
int CPUInfo[4] = { -1 };
#if (defined(__clang__) || defined(__GNUC__)) && defined(__cpuid)
// GCC/clang <cpuid.h> macro form: __cpuid(leaf, eax, ebx, ecx, edx)
__cpuid(0, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
// MSVC intrinsic form: __cpuid(registers, leaf)
__cpuid(CPUInfo, 0);
#endif
// Leaf 0 EAX reports the highest supported standard leaf; leaf 1 is queried below.
if (CPUInfo[0] < 1)
return false;
#if (defined(__clang__) || defined(__GNUC__)) && defined(__cpuid)
__cpuid(1, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
#else
__cpuid(CPUInfo, 1);
#endif
// We check for AVX, OSXSAVE, SSE4.1, and SSE3.
// ECX mask 0x18080001: bit 28 = AVX, bit 27 = OSXSAVE, bit 19 = SSE4.1, bit 0 = SSE3.
return ((CPUInfo[2] & 0x18080001) == 0x18080001);
}
//-------------------------------------------------------------------------------------
// Vector
//-------------------------------------------------------------------------------------
inline XMVECTOR XM_CALLCONV XMVectorReplicatePtr(_In_ const float *pValue)
{
    // Broadcast the addressed scalar into all four lanes (AVX vbroadcastss).
    XMVECTOR vResult = _mm_broadcast_ss(pValue);
    return vResult;
}
inline XMVECTOR XM_CALLCONV XMVectorSplatX(FXMVECTOR V)
{
    // Replicate lane 0 (x) of V into all four lanes via vpermilps.
    XMVECTOR vResult = _mm_permute_ps(V, _MM_SHUFFLE(0, 0, 0, 0));
    return vResult;
}
inline XMVECTOR XM_CALLCONV XMVectorSplatY(FXMVECTOR V)
{
    // Replicate lane 1 (y) of V into all four lanes via vpermilps.
    XMVECTOR vResult = _mm_permute_ps(V, _MM_SHUFFLE(1, 1, 1, 1));
    return vResult;
}
inline XMVECTOR XM_CALLCONV XMVectorSplatZ(FXMVECTOR V)
{
    // Replicate lane 2 (z) of V into all four lanes via vpermilps.
    XMVECTOR vResult = _mm_permute_ps(V, _MM_SHUFFLE(2, 2, 2, 2));
    return vResult;
}
inline XMVECTOR XM_CALLCONV XMVectorSplatW(FXMVECTOR V)
{
    // Replicate lane 3 (w) of V into all four lanes via vpermilps.
    XMVECTOR vResult = _mm_permute_ps(V, _MM_SHUFFLE(3, 3, 3, 3));
    return vResult;
}
// Runtime lane shuffle of a single vector: result lane i is lane E<i> of V.
// Uses AVX vpermilps with a variable control vector.
inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3)
{
    assert((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
    _Analysis_assume_((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
    // Align the control buffer and use an aligned load, consistent with
    // AVX::XMVectorPermute below; the local array is trivially alignable, so
    // there is no reason to take the unaligned-load form here.
    XM_ALIGNED_DATA(16) unsigned int elem[4] = { E0, E1, E2, E3 };
    __m128i vControl = _mm_load_si128(reinterpret_cast<const __m128i *>(&elem[0]));
    return _mm_permutevar_ps(V, vControl);
}
// Runtime permute across two vectors: result lane i is element Permute<i> of the
// logical 8-element pair [V1.x V1.y V1.z V1.w V2.x V2.y V2.z V2.w]
// (indices 0-3 read V1, indices 4-7 read V2).
inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW)
{
assert(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
_Analysis_assume_(PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7);
static const XMVECTORU32 three = { { { 3, 3, 3, 3 } } };
XM_ALIGNED_DATA(16) unsigned int elem[4] = { PermuteX, PermuteY, PermuteZ, PermuteW };
__m128i vControl = _mm_load_si128(reinterpret_cast<const __m128i *>(&elem[0]));
// Per-lane mask: all-ones where the index is > 3, i.e. where the element comes from V2.
__m128i vSelect = _mm_cmpgt_epi32(vControl, three);
// Reduce every index to 0-3 so it is a valid lane selector for vpermilps.
vControl = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(vControl), three));
// Shuffle both sources with the same (reduced) control...
__m128 shuffled1 = _mm_permutevar_ps(V1, vControl);
__m128 shuffled2 = _mm_permutevar_ps(V2, vControl);
// ...then bit-select per lane: masked lanes take the V2 shuffle, the rest the V1 shuffle.
__m128 masked1 = _mm_andnot_ps(_mm_castsi128_ps(vSelect), shuffled1);
__m128 masked2 = _mm_and_ps(_mm_castsi128_ps(vSelect), shuffled2);
return _mm_or_ps(masked1, masked2);
}
inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
{
    // Shift the lanes of V1 left by Elements positions, pulling replacement
    // lanes in from V2: result lane i is element (Elements + i) of [V1|V2].
    assert(Elements < 4);
    _Analysis_assume_(Elements < 4);
    uint32_t e0 = Elements;
    uint32_t e1 = Elements + 1;
    uint32_t e2 = Elements + 2;
    uint32_t e3 = Elements + 3;
    return AVX::XMVectorPermute(V1, V2, e0, e1, e2, e3);
}
inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
{
    // Rotate the four lanes of V left by Elements positions (wrapping around).
    assert(Elements < 4);
    _Analysis_assume_(Elements < 4);
    // Lane i of the result is lane (Elements + i) mod 4 of V.
    uint32_t e0 = Elements & 3;
    uint32_t e1 = (Elements + 1) & 3;
    uint32_t e2 = (Elements + 2) & 3;
    uint32_t e3 = (Elements + 3) & 3;
    return AVX::XMVectorSwizzle(V, e0, e1, e2, e3);
}
inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
{
    // Rotate the four lanes of V right by Elements positions (wrapping around).
    assert(Elements < 4);
    _Analysis_assume_(Elements < 4);
    // Lane i of the result is lane (i - Elements) mod 4 of V,
    // computed as (4 + i - Elements) & 3 to stay in unsigned range.
    uint32_t e0 = (4 - Elements) & 3;
    uint32_t e1 = (5 - Elements) & 3;
    uint32_t e2 = (6 - Elements) & 3;
    uint32_t e3 = (7 - Elements) & 3;
    return AVX::XMVectorSwizzle(V, e0, e1, e2, e3);
}
//-------------------------------------------------------------------------------------
// Permute Templates
//-------------------------------------------------------------------------------------
namespace MathInternal
{
    // Slow path fallback for permutes that do not map to a single SSE opcode:
    // shuffle both sources with the same immediate, then bit-select per lane.
    template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper
    {
        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2)
        {
            // Lanes whose Which* flag is set take their value from v2; the rest from v1.
            static const XMVECTORU32 selectMask =
            { { {
                    WhichX ? 0xFFFFFFFF : 0,
                    WhichY ? 0xFFFFFFFF : 0,
                    WhichZ ? 0xFFFFFFFF : 0,
                    WhichW ? 0xFFFFFFFF : 0,
            } } };
            XMVECTOR shuffled1 = _mm_permute_ps(v1, Shuffle);
            XMVECTOR shuffled2 = _mm_permute_ps(v2, Shuffle);
            XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1);
            XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2);
            return _mm_or_ps(masked1, masked2);
        }
    };
    // Fast path for permutes that only read from the first vector.
    // (Unused source is an unnamed parameter rather than a '(v2);' no-op
    // statement, which triggers -Wunused-value on clang/GCC.)
    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false>
    {
        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR) { return _mm_permute_ps(v1, Shuffle); }
    };
    // Fast path for permutes that only read from the second vector.
    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true>
    {
        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR, FXMVECTOR v2) { return _mm_permute_ps(v2, Shuffle); }
    };
    // Fast path for permutes that read XY from the first vector, ZW from the second.
    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true>
    {
        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); }
    };
    // Fast path for permutes that read XY from the second vector, ZW from the first.
    template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false>
    {
        static XMVECTOR XM_CALLCONV Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); }
    };
} // namespace MathInternal (no trailing ';' — a namespace is not a class)
// General permute template
// Compile-time version of AVX::XMVectorPermute: result lane i is element
// Permute<i> of the logical pair [V1|V2] (0-3 from V1, 4-7 from V2).
// Dispatches through PermuteHelper so single-instruction cases compile to
// one permute/shuffle instead of the generic mask-and-select sequence.
template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
inline XMVECTOR XM_CALLCONV XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
{
static_assert(PermuteX <= 7, "PermuteX template parameter out of range");
static_assert(PermuteY <= 7, "PermuteY template parameter out of range");
static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range");
static_assert(PermuteW <= 7, "PermuteW template parameter out of range");
// The shuffle immediate only needs each index reduced to its source lane (0-3).
const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3);
// Which* records, per result lane, whether the element comes from V2 (index 4-7).
const bool WhichX = PermuteX > 3;
const bool WhichY = PermuteY > 3;
const bool WhichZ = PermuteZ > 3;
const bool WhichW = PermuteW > 3;
return AVX::MathInternal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2);
}
// Special-case permute templates
// Identity permutes: the result is exactly one of the inputs.
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR) { return V1; }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 7>(FXMVECTOR, FXMVECTOR V2) { return V2; }
// Lane-preserving mixes map to a single SSE4.1 blendps: blend-mask bit i set
// means result lane i comes from V2 (permute index i + 4), clear means from V1.
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0x1); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0x2); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0x3); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0x4); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0x5); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0x6); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 6, 3>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0x7); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0x8); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0x9); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0xA); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 5, 2, 7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0xB); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0xC); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<4, 1, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0xD); }
template<> inline XMVECTOR XM_CALLCONV XMVectorPermute<0, 5, 6, 7>(FXMVECTOR V1, FXMVECTOR V2) { return _mm_blend_ps(V1, V2, 0xE); }
//-------------------------------------------------------------------------------------
// Swizzle Templates
//-------------------------------------------------------------------------------------
// General swizzle template
// Compile-time lane shuffle of a single vector: result lane i is lane
// Swizzle<i> of V. Compiles to one vpermilps with an immediate control.
template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
inline XMVECTOR XM_CALLCONV XMVectorSwizzle(FXMVECTOR V)
{
static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range");
static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range");
static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range");
static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range");
// _MM_SHUFFLE packs indices w,z,y,x from high bits to low.
return _mm_permute_ps(V, _MM_SHUFFLE(SwizzleW, SwizzleZ, SwizzleY, SwizzleX));
}
// Specialized swizzles that beat the generic vpermilps form.
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 1, 2, 3>(FXMVECTOR V) { return V; } // identity: no shuffle needed
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0, 0, 2, 2>(FXMVECTOR V) { return _mm_moveldup_ps(V); } // duplicate even lanes (SSE3 movsldup)
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1, 1, 3, 3>(FXMVECTOR V) { return _mm_movehdup_ps(V); } // duplicate odd lanes (SSE3 movshdup)
//-------------------------------------------------------------------------------------
// Other Templates
//-------------------------------------------------------------------------------------
// Compile-time lane shift: result lane i is element (Elements + i) of the
// combined [V1|V2] pair, so lanes vacated from V1 are filled from V2.
template<uint32_t Elements>
inline XMVECTOR XM_CALLCONV XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2)
{
    static_assert(Elements < 4, "Elements template parameter out of range");
    return AVX::XMVectorPermute<Elements, Elements + 1, Elements + 2, Elements + 3>(V1, V2);
}
// Compile-time rotate of V's lanes left by Elements positions (wrapping).
template<uint32_t Elements>
inline XMVECTOR XM_CALLCONV XMVectorRotateLeft(FXMVECTOR V)
{
static_assert(Elements < 4, "Elements template parameter out of range");
// Lane i of the result is lane (Elements + i) mod 4 of V.
return AVX::XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V);
}
// Compile-time rotate of V's lanes right by Elements positions (wrapping).
template<uint32_t Elements>
inline XMVECTOR XM_CALLCONV XMVectorRotateRight(FXMVECTOR V)
{
static_assert(Elements < 4, "Elements template parameter out of range");
// Lane i of the result is lane (i - Elements) mod 4 of V,
// computed as (4 + i - Elements) & 3 to stay in unsigned range.
return AVX::XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V);
}
} // namespace AVX
} // namespace DirectX;