|
12 | 12 | #ifndef XSIMD_CPUID_HPP |
13 | 13 | #define XSIMD_CPUID_HPP |
14 | 14 |
|
15 | | -#include <algorithm> |
16 | 15 | #include <cstring> |
17 | 16 |
|
| 17 | +#include "../types/xsimd_all_registers.hpp" |
| 18 | +#include "../xsimd_cpu_features_x86.hpp" |
| 19 | +#include "xsimd_inline.hpp" |
| 20 | + |
18 | 21 | #if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector)) |
19 | 22 | #include <asm/hwcap.h> |
20 | 23 | #include <sys/auxv.h> |
|
25 | 28 |
|
26 | 29 | #endif |
27 | 30 |
|
28 | | -#if defined(_MSC_VER) |
29 | | -// Contains the definition of __cpuidex |
30 | | -#include <intrin.h> |
31 | | -#endif |
32 | | - |
33 | | -#include "../types/xsimd_all_registers.hpp" |
34 | | - |
35 | 31 | namespace xsimd |
36 | 32 | { |
37 | 33 | namespace detail |
@@ -126,138 +122,54 @@ namespace xsimd |
126 | 122 | #endif |
127 | 123 | rvv = bool(getauxval(AT_HWCAP) & HWCAP_V); |
128 | 124 | #endif |
129 | | - |
130 | | -#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) |
131 | | - |
132 | | - auto get_xcr0_low = []() noexcept |
133 | | - { |
134 | | - uint32_t xcr0; |
135 | | - |
136 | | -#if defined(_MSC_VER) && _MSC_VER >= 1400 |
137 | | - |
138 | | - xcr0 = (uint32_t)_xgetbv(0); |
139 | | - |
140 | | -#elif defined(__GNUC__) |
141 | | - |
142 | | - __asm__( |
143 | | - "xorl %%ecx, %%ecx\n" |
144 | | - "xgetbv\n" |
145 | | - : "=a"(xcr0) |
146 | | - : |
147 | | -#if defined(__i386__) |
148 | | - : "ecx", "edx" |
149 | | -#else |
150 | | - : "rcx", "rdx" |
151 | | -#endif |
152 | | - ); |
153 | | - |
154 | | -#else /* _MSC_VER < 1400 */ |
155 | | -#error "_MSC_VER < 1400 is not supported" |
156 | | -#endif /* _MSC_VER && _MSC_VER >= 1400 */ |
157 | | - return xcr0; |
158 | | - }; |
159 | | - |
160 | | - auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept |
161 | | - { |
162 | | - |
163 | | -#if defined(_MSC_VER) |
164 | | - __cpuidex(reg, level, count); |
165 | | - |
166 | | -#elif defined(__INTEL_COMPILER) |
167 | | - __cpuid(reg, level); |
168 | | - |
169 | | -#elif defined(__GNUC__) || defined(__clang__) |
170 | | - |
171 | | -#if defined(__i386__) && defined(__PIC__) |
172 | | - // %ebx may be the PIC register |
173 | | - __asm__("xchg{l}\t{%%}ebx, %1\n\t" |
174 | | - "cpuid\n\t" |
175 | | - "xchg{l}\t{%%}ebx, %1\n\t" |
176 | | - : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) |
177 | | - : "0"(level), "2"(count)); |
178 | | - |
179 | | -#else |
180 | | - __asm__("cpuid\n\t" |
181 | | - : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) |
182 | | - : "0"(level), "2"(count)); |
183 | | -#endif |
184 | | - |
185 | | -#else |
186 | | -#error "Unsupported configuration" |
187 | 125 | #endif |
188 | | - }; |
189 | | - |
190 | | - int regs1[4]; |
191 | | - |
192 | | - get_cpuid(regs1, 0x1); |
193 | | - |
194 | | - // OS can explicitly disable the usage of SSE/AVX extensions |
195 | | - // by setting an appropriate flag in CR0 register |
196 | | - // |
197 | | - // https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html |
| 126 | + const auto cpuid = xsimd::x86_cpu_id::read(); |
| 127 | + auto xcr0 = xsimd::x86_xcr0::make_false(); |
198 | 128 |
|
199 | | - unsigned sse_state_os_enabled = 1; |
| 129 | + bool sse_state_os_enabled = true; |
200 | 130 | // AVX and AVX512 strictly require OSXSAVE to be enabled by the OS. |
201 | 131 | // If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1), |
202 | 132 | // AVX state won't be preserved across context switches, so AVX cannot be used. |
203 | | - unsigned avx_state_os_enabled = 0; |
204 | | - unsigned avx512_state_os_enabled = 0; |
| 133 | + bool avx_state_os_enabled = false; |
| 134 | + bool avx512_state_os_enabled = false; |
205 | 135 |
|
206 | | - // OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit |
207 | | - // 18] to enable XSETBV/XGETBV instructions to access XCR0 and |
208 | | - // to support processor extended state management using |
209 | | - // XSAVE/XRSTOR. |
210 | | - bool osxsave = regs1[2] >> 27 & 1; |
211 | | - if (osxsave) |
| 136 | + if (cpuid.osxsave()) |
212 | 137 | { |
| 138 | + xcr0 = xsimd::x86_xcr0::read(); |
213 | 139 |
|
214 | | - uint32_t xcr0 = get_xcr0_low(); |
215 | | - |
216 | | - sse_state_os_enabled = xcr0 >> 1 & 1; |
217 | | - avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled; |
218 | | - avx512_state_os_enabled = xcr0 >> 6 & avx_state_os_enabled; |
| 140 | + sse_state_os_enabled = xcr0.sse_state_os_enabled(); |
| 141 | + avx_state_os_enabled = xcr0.avx_state_os_enabled(); |
| 142 | + avx512_state_os_enabled = xcr0.avx512_state_os_enabled(); |
219 | 143 | } |
220 | 144 |
|
221 | | - sse2 = regs1[3] >> 26 & sse_state_os_enabled; |
222 | | - sse3 = regs1[2] >> 0 & sse_state_os_enabled; |
223 | | - ssse3 = regs1[2] >> 9 & sse_state_os_enabled; |
224 | | - sse4_1 = regs1[2] >> 19 & sse_state_os_enabled; |
225 | | - sse4_2 = regs1[2] >> 20 & sse_state_os_enabled; |
226 | | - fma3_sse42 = regs1[2] >> 12 & sse_state_os_enabled; |
227 | | - |
228 | | - avx = regs1[2] >> 28 & avx_state_os_enabled; |
229 | | - fma3_avx = avx && fma3_sse42; |
230 | | - |
231 | | - int regs8[4]; |
232 | | - get_cpuid(regs8, 0x80000001); |
233 | | - fma4 = regs8[2] >> 16 & avx_state_os_enabled; |
234 | | - |
235 | | - // sse4a = regs[2] >> 6 & 1; |
| 145 | + sse2 = cpuid.sse2() && sse_state_os_enabled; |
| 146 | + sse3 = cpuid.sse3() && sse_state_os_enabled; |
| 147 | + ssse3 = cpuid.ssse3() && sse_state_os_enabled; |
| 148 | + sse4_1 = cpuid.sse4_1() && sse_state_os_enabled; |
| 149 | + sse4_2 = cpuid.sse4_2() && sse_state_os_enabled; |
| 150 | + fma3_sse42 = cpuid.fma3() && sse_state_os_enabled; |
236 | 151 |
|
237 | | - // xop = regs[2] >> 11 & 1; |
238 | | - |
239 | | - int regs7[4]; |
240 | | - get_cpuid(regs7, 0x7); |
241 | | - avx2 = regs7[1] >> 5 & avx_state_os_enabled; |
242 | | - |
243 | | - int regs7a[4]; |
244 | | - get_cpuid(regs7a, 0x7, 0x1); |
245 | | - avxvnni = regs7a[0] >> 4 & avx_state_os_enabled; |
| 152 | + // sse4a not implemented in cpu_id yet |
| 153 | + // xop not implemented in cpu_id yet |
246 | 154 |
|
| 155 | + avx = cpuid.avx() && avx_state_os_enabled; |
| 156 | + fma3_avx = avx && fma3_sse42; |
| 157 | + fma4 = cpuid.fma4() && avx_state_os_enabled; |
| 158 | + avx2 = cpuid.avx2() && avx_state_os_enabled; |
| 159 | + avxvnni = cpuid.avxvnni() && avx_state_os_enabled; |
247 | 160 | fma3_avx2 = avx2 && fma3_sse42; |
248 | 161 |
|
249 | | - avx512f = regs7[1] >> 16 & avx512_state_os_enabled; |
250 | | - avx512cd = regs7[1] >> 28 & avx512_state_os_enabled; |
251 | | - avx512dq = regs7[1] >> 17 & avx512_state_os_enabled; |
252 | | - avx512bw = regs7[1] >> 30 & avx512_state_os_enabled; |
253 | | - avx512er = regs7[1] >> 27 & avx512_state_os_enabled; |
254 | | - avx512pf = regs7[1] >> 26 & avx512_state_os_enabled; |
255 | | - avx512ifma = regs7[1] >> 21 & avx512_state_os_enabled; |
256 | | - avx512vbmi = regs7[2] >> 1 & avx512_state_os_enabled; |
257 | | - avx512vbmi2 = regs7[2] >> 6 & avx512_state_os_enabled; |
258 | | - avx512vnni_bw = regs7[2] >> 11 & avx512_state_os_enabled; |
| 162 | + avx512f = cpuid.avx512f() && avx512_state_os_enabled; |
| 163 | + avx512cd = cpuid.avx512cd() && avx512_state_os_enabled; |
| 164 | + avx512dq = cpuid.avx512dq() && avx512_state_os_enabled; |
| 165 | + avx512bw = cpuid.avx512bw() && avx512_state_os_enabled; |
| 166 | + avx512er = cpuid.avx512er() && avx512_state_os_enabled; |
| 167 | + avx512pf = cpuid.avx512pf() && avx512_state_os_enabled; |
| 168 | + avx512ifma = cpuid.avx512ifma() && avx512_state_os_enabled; |
| 169 | + avx512vbmi = cpuid.avx512vbmi() && avx512_state_os_enabled; |
| 170 | + avx512vbmi2 = cpuid.avx512vbmi2() && avx512_state_os_enabled; |
| 171 | + avx512vnni_bw = cpuid.avx512vnni_bw() && avx512_state_os_enabled; |
259 | 172 | avx512vnni_vbmi2 = avx512vbmi2 && avx512vnni_bw; |
260 | | -#endif |
261 | 173 | } |
262 | 174 | }; |
263 | 175 | } // namespace detail |
|
0 commit comments