@@ -1394,155 +1394,6 @@ struct VM {
13941394 }
13951395
13961396
// Compile-time CRC32-C (Castagnoli) hashing of string literals.
// The accumulator starts at 0 and no final XOR is applied, so the result is the raw
// per-byte CRC update chained over the string; this is meant to line up with the
// hardware _mm_crc32_u8 instruction used by the runtime hasher.
// Every function is a single return expression so it remains a valid C++11 constexpr.
struct constexpr_hash {
    // Runs `bits` single-bit steps of the reflected CRC32-C polynomial (0x82F63B78) on `crc`.
    static constexpr u32 crc32_bits(u32 crc, int bits) {
        return (bits != 0)
            ? crc32_bits((crc & 1) ? ((crc >> 1) ^ 0x82F63B78u) : (crc >> 1), bits - 1)
            : crc;
    }

    // Folds each byte of the NUL-terminated string `s` into `crc`, eight bit-steps per byte.
    static constexpr u32 crc32_str(const char* s, u32 crc) {
        return (*s != '\0')
            ? crc32_str(s + 1, crc32_bits(crc ^ static_cast<u8>(*s), 8))
            : crc;
    }

    // Hash of the NUL-terminated string `s`, starting from an accumulator of 0.
    static constexpr u32 get(const char* s) {
        return crc32_str(s, 0);
    }
};
1412-
1413-
1414- // this forces the compiler to calculate the hash when initializing the array while staying C++11 compatible
1415- struct thread_entry {
1416- u32 hash;
1417- u32 threads;
1418- constexpr thread_entry (const char * m, u32 t) : hash(constexpr_hash::get(m)), threads(t) {}
1419- };
1420-
1421-
1422- struct hasher {
1423- static u32 crc32_sw (u32 crc, char data) {
1424- crc ^= static_cast <u8 >(data);
1425- for (int i = 0 ; i < 8 ; ++i)
1426- crc = (crc >> 1 ) ^ ((crc & 1 ) ? 0x82F63B78u : 0 );
1427- return crc;
1428- }
1429-
1430- #if (CLANG || GCC)
1431- __attribute__ ((__target__(" crc32" )))
1432- #endif
1433- static u32 crc32_hw (u32 crc, char data) {
1434- return _mm_crc32_u8 (crc, static_cast <u8 >(data));
1435- }
1436-
1437- using hashfc = u32 (*)(u32 , char );
1438-
1439- static hashfc get () {
1440- // yes, vmaware runs on dinosaur cpus without sse4.2 pretty often
1441- i32 regs[4 ];
1442- cpu::cpuid (regs, 1 );
1443- const bool has_sse42 = (regs[2 ] & (1 << 20 )) != 0 ;
1444-
1445- return has_sse42 ? crc32_hw : crc32_sw;
1446- }
1447- };
1448-
1449-
1450- static void lookup (
1451- bool &found,
1452- const char * str,
1453- const thread_entry* thread_database,
1454- const std::size_t & db_size,
1455- std::size_t best_len,
1456- const u8 max_token_length,
1457- u32 * z_series_threads,
1458- u32 & expected_threads,
1459- const bool is_amd = false
1460- ) {
1461- const hasher::hashfc hash_func = util::hasher::get ();
1462-
1463- for (size_t i = 0 ; str[i] != ' \0 ' ; ) {
1464- char c = str[i];
1465- if (!((c >= ' 0' && c <= ' 9' ) || (c >= ' A' && c <= ' Z' ) || (c >= ' a' && c <= ' z' ))) {
1466- i++;
1467- continue ;
1468- }
1469-
1470- u32 current_hash = 0 ;
1471- size_t current_len = 0 ;
1472- size_t j = i;
1473-
1474- while (true ) {
1475- char k = str[j];
1476- const bool is_valid = (k >= ' 0' && k <= ' 9' ) ||
1477- (k >= ' A' && k <= ' Z' ) ||
1478- (k >= ' a' && k <= ' z' ) ||
1479- (k == ' -' ); // models have hyphen
1480- if (!is_valid) break ;
1481-
1482- if (current_len >= max_token_length) {
1483- while (str[j] != ' \0 ' && str[j] != ' ' ) j++; // fast forward to space/null
1484- break ;
1485- }
1486-
1487- if (is_amd) {
1488- // convert to lowercase on-the-fly to match compile-time keys
1489- if (k >= ' A' && k <= ' Z' ) k += 32 ;
1490- }
1491-
1492- /*
1493- models are usually 8 or more bytes long, i.e. i9-10900K
1494- so imagine we want to use u64, you hash the first 8 bytes i9-10900
1495- but then you are left with K. You have to handle the tail
1496- fetching 8 bytes would include the characters after the token, corrupting the hash
1497- so a byte-by-byte loop is the most optimal choice here
1498- */
1499-
1500- // since this technique is cross-platform, we cannot use a standard C++ try-catch block to catch a missing CPU instruction
1501- // we could use preprocessor directives and add an exception handler (VEH/SEH or SIGHANDLER) but nah
1502- current_hash = hash_func (current_hash, k);
1503- current_len++;
1504- j++;
1505-
1506- // boundary check, only verify match if the token has ended (next char is not alphanumeric)
1507- const char next = str[j];
1508- const bool next_is_alnum = (next >= ' 0' && next <= ' 9' ) ||
1509- (next >= ' A' && next <= ' Z' ) ||
1510- (next >= ' a' && next <= ' z' );
1511-
1512- if (!next_is_alnum) {
1513- // Check specific Z1 Extreme token
1514- // Hash for "extreme" (CRC32-C) is 0x3D09D5B4
1515- if (
1516- is_amd &&
1517- (z_series_threads != nullptr ) &&
1518- (current_hash == 0x3D09D5B4 )
1519- ) {
1520- *z_series_threads = 16 ;
1521- }
1522-
1523- // since it's a contiguous block of integers in .rodata/.rdata, this is extremely fast
1524- for (std::size_t i = 0 ; i < db_size; ++i) {
1525- if (
1526- (thread_database[i].hash == current_hash) &&
1527- (current_len > best_len)
1528- ) {
1529- best_len = current_len;
1530- expected_threads = thread_database[i].threads ;
1531- found = true ;
1532-
1533- // on intel, since hashing implies uniqueness in this dataset, you might say we could break
1534- // here, but we continue to ensure we find the longest substring match if overlaps exist,
1535- // so like it finds both "i9-11900" and "i9-11900K" i.e.
1536- }
1537- }
1538- }
1539- }
1540- i = j;
1541- }
1542- }
1543-
1544-
1545-
15461397 // wrapper for std::make_unique because it's not available for C++11
15471398 template <typename T, typename ... Args>
15481399 [[nodiscard]] static std::unique_ptr<T> make_unique (Args&&... args) {
0 commit comments