Skip to content

Commit 5b03f38

Browse files
thibaultchadebayang
andcommitted
feature: arm64: added support for CRC32 string hashing optimization.
Available only on ARMv8, the CRC32 instructions are enabled when LuaJIT is compiled with `-march=armv8-a+crc`. Co-authored-by: Debayan Ghosh <debayang.qdt@qualcommdatacenter.com>
1 parent 3117e67 commit 5b03f38

15 files changed

Lines changed: 75 additions & 43 deletions

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,7 @@
1212
*~
1313
tags
1414
*.swo
15+
test/*.txt
16+
test/*.o
17+
test/*.d
18+
test/ht_test

src/lj_arch.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -616,7 +616,7 @@ extern void *LJ_WIN_LOADLIBA(const char *path);
616616
#endif
617617

618618
/* Optimized string hashing, added by OpenResty. */
619-
#if LUAJIT_TARGET == LUAJIT_ARCH_X64 && defined(__GNUC__) && defined(__SSE4_2__)
619+
#if (LUAJIT_TARGET == LUAJIT_ARCH_X64 && defined(__SSE4_2__) || LUAJIT_TARGET == LUAJIT_ARCH_ARM64 && __ARM_FEATURE_CRC32) && defined(__GNUC__)
620620
#ifndef LJ_OR_DISABLE_STRHASHCRC32
621621
#define LJ_OR_STRHASHCRC32 1
622622
#endif

src/lj_str_hash.c

Lines changed: 55 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
/*
2-
* This file defines string hash function using CRC32. It takes advantage of
3-
* Intel hardware support (crc32 instruction, SSE 4.2) to speedup the CRC32
4-
* computation. The hash functions try to compute CRC32 of length and up
5-
* to 128 bytes of given string.
2+
* This file defines string hash function using CRC32.
3+
* On Intel architectures, this implementation takes advantage of hardware
4+
* support (CRC32 instruction, SSE 4.2) to speedup the CRC32 computation.
5+
* On ARM64 architectures, this implementation utilizes the ARMv8.1-A extension
6+
* which offers CRC32 instructions.
7+
* The hash functions try to compute CRC32 of length and up to 128 bytes of
8+
* the given string.
69
*/
710

811
#define lj_str_hash_c
@@ -15,13 +18,34 @@
1518
#include <sys/types.h>
1619
#include <unistd.h>
1720
#include <time.h>
18-
#include <smmintrin.h>
1921
#include "lj_vm.h"
2022

23+
#if LUAJIT_TARGET == LUAJIT_ARCH_X64
24+
#include <smmintrin.h>
25+
26+
#define lj_crc32_u32 _mm_crc32_u32
27+
#define lj_crc32_u64 _mm_crc32_u64
28+
2129
#ifndef F_CPU_SSE4_2
2230
#define F_CPU_SSE4_2 (1 << 20)
2331
#endif
2432

33+
#elif LUAJIT_TARGET == LUAJIT_ARCH_ARM64
34+
#include <sys/auxv.h>
35+
#include <arm_acle.h>
36+
#include <errno.h>
37+
38+
#define lj_crc32_u32 __crc32cw
39+
#define lj_crc32_u64 __crc32cd
40+
41+
#ifndef HWCAP_CRC32
42+
#define HWCAP_CRC32 (1 << 7)
43+
#endif
44+
45+
#else
46+
#error "LJ_OR_STRHASHCRC32 not supported on this architecture"
47+
#endif
48+
2549
#ifdef __MINGW32__
2650
#define random() ((long) rand())
2751
#define srandom(seed) srand(seed)
@@ -49,7 +73,7 @@ static LJ_NOINLINE uint32_t lj_str_hash_1_4(const char* str, uint32_t len)
4973
v = (v << 8) | str[len >> 1];
5074
v = (v << 8) | str[len - 1];
5175
v = (v << 8) | len;
52-
return _mm_crc32_u32(0, v);
76+
return lj_crc32_u32(0, v);
5377
#else
5478
uint32_t a, b, h = len;
5579

@@ -79,9 +103,9 @@ static LJ_NOINLINE uint32_t lj_str_hash_4_16(const char* str, uint32_t len)
79103
v2 = *cast_uint32p(str + len - 4);
80104
}
81105

82-
h = _mm_crc32_u32(0, len);
83-
h = _mm_crc32_u64(h, v1);
84-
h = _mm_crc32_u64(h, v2);
106+
h = lj_crc32_u32(0, len);
107+
h = lj_crc32_u64(h, v1);
108+
h = lj_crc32_u64(h, v2);
85109

86110
return h;
87111
}
@@ -92,18 +116,18 @@ static LJ_NOINLINE uint32_t lj_str_hash_16_128(const char* str, uint32_t len)
92116
uint64_t h1, h2;
93117
uint32_t i;
94118

95-
h1 = _mm_crc32_u32(0, len);
119+
h1 = lj_crc32_u32(0, len);
96120
h2 = 0;
97121

98122
for (i = 0; i < len - 16; i += 16) {
99-
h1 += _mm_crc32_u64(h1, *cast_uint64p(str + i));
100-
h2 += _mm_crc32_u64(h2, *cast_uint64p(str + i + 8));
123+
h1 += lj_crc32_u64(h1, *cast_uint64p(str + i));
124+
h2 += lj_crc32_u64(h2, *cast_uint64p(str + i + 8));
101125
};
102126

103-
h1 = _mm_crc32_u64(h1, *cast_uint64p(str + len - 16));
104-
h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));
127+
h1 = lj_crc32_u64(h1, *cast_uint64p(str + len - 16));
128+
h2 = lj_crc32_u64(h2, *cast_uint64p(str + len - 8));
105129

106-
return _mm_crc32_u32(h1, h2);
130+
return lj_crc32_u32(h1, h2);
107131
}
108132

109133
/* **************************************************************************
@@ -167,32 +191,32 @@ static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str,
167191
pos1 = get_random_pos_unsafe(chunk_sz_log2, 0);
168192
pos2 = get_random_pos_unsafe(chunk_sz_log2, 1);
169193

170-
h1 = _mm_crc32_u32(0, len);
194+
h1 = lj_crc32_u32(0, len);
171195
h2 = 0;
172196

173197
/* loop over 14 chunks, 2 chunks at a time */
174198
for (i = 0, chunk_ptr = str; i < (chunk_num / 2 - 1);
175199
chunk_ptr += chunk_sz, i++) {
176200

177201
v = *cast_uint64p(chunk_ptr + pos1);
178-
h1 = _mm_crc32_u64(h1, v);
202+
h1 = lj_crc32_u64(h1, v);
179203

180204
v = *cast_uint64p(chunk_ptr + chunk_sz + pos2);
181-
h2 = _mm_crc32_u64(h2, v);
205+
h2 = lj_crc32_u64(h2, v);
182206
}
183207

184208
/* the last two chunks */
185209
v = *cast_uint64p(chunk_ptr + pos1);
186-
h1 = _mm_crc32_u64(h1, v);
210+
h1 = lj_crc32_u64(h1, v);
187211

188212
v = *cast_uint64p(chunk_ptr + chunk_sz - 8 - pos2);
189-
h2 = _mm_crc32_u64(h2, v);
213+
h2 = lj_crc32_u64(h2, v);
190214

191215
/* process the trailing part */
192-
h1 = _mm_crc32_u64(h1, *cast_uint64p(str));
193-
h2 = _mm_crc32_u64(h2, *cast_uint64p(str + len - 8));
216+
h1 = lj_crc32_u64(h1, *cast_uint64p(str));
217+
h2 = lj_crc32_u64(h2, *cast_uint64p(str + len - 8));
194218

195-
h1 = _mm_crc32_u32(h1, h2);
219+
h1 = lj_crc32_u32(h1, h2);
196220

197221
return h1;
198222
}
@@ -233,8 +257,8 @@ static void lj_str_hash_init_random(void)
233257
}
234258

235259
/* Init seed */
236-
seed = _mm_crc32_u32(0, getpid());
237-
seed = _mm_crc32_u32(seed, time(NULL));
260+
seed = lj_crc32_u32(0, getpid());
261+
seed = lj_crc32_u32(seed, time(NULL));
238262
srandom(seed);
239263

240264
/* Now start to populate the random_pos[][]. */
@@ -266,9 +290,15 @@ static void lj_str_hash_init_random(void)
266290

267291
LJ_FUNC unsigned char lj_check_crc32_support()
268292
{
293+
#if LUAJIT_TARGET == LUAJIT_ARCH_X64
269294
uint32_t features[4];
270295
if (lj_vm_cpuid(1, features))
271296
return (features[2] & F_CPU_SSE4_2) != 0;
297+
#elif LUAJIT_TARGET == LUAJIT_ARCH_ARM64
298+
uint32_t hwcap = getauxval(AT_HWCAP);
299+
if (hwcap != ENOENT)
300+
return (hwcap & HWCAP_CRC32) != 0;
301+
#endif
272302
return 0;
273303
}
274304

src/x64/Makefile

Lines changed: 0 additions & 13 deletions
This file was deleted.
Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,19 @@ else
1616
VALGRIND :=
1717
endif
1818

19-
CXXFLAGS := -O3 -MD -g -Wall -msse4.2 -I../..
20-
LDFLAGS := ../../libluajit.a -ldl -lm
19+
CXXFLAGS := -O3 -MD -g -Wall -I../src
20+
LDFLAGS := ../src/libluajit.a -ldl -lm
21+
22+
TARGET_TESTARCH=$(shell $(CC) -E ../src/lj_arch.h -dM)
23+
ifneq (,$(findstring LJ_TARGET_X64 ,$(TARGET_TESTARCH)))
24+
CXXFLAGS+= -msse4.2
25+
else
26+
ifneq (,$(findstring LJ_TARGET_ARM64 ,$(TARGET_TESTARCH)))
27+
CXXFLAGS+= -march=armv8-a+crc
28+
else
29+
$(error Unsupported target architecture)
30+
endif
31+
endif
2132

2233
%.o: %.cxx
2334
$(CXX) $(CXXFLAGS) -MD -c $<
@@ -28,7 +39,7 @@ test: $(TEST_PROGRAM)
2839
./unit_test.sh
2940

3041
@echo "smoke test"
31-
../../luajit test_str_comp.lua
42+
../src/luajit test_str_comp.lua
3243

3344
benchmark: $(BENCHMARK_PROGRAM)
3445
# micro benchmark
File renamed without changes.

0 commit comments

Comments
 (0)