Skip to content

Commit 0da2e2d

Browse files
performance improvements
1 parent 7f4102a commit 0da2e2d

1 file changed

Lines changed: 9 additions & 9 deletions

File tree

recordio/simd/search.c

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ int cpu_supports_avx512() {
6060
}
6161

6262
// Optimized SSE4 implementation for 3-byte pattern search
63-
// Checks 14 positions per iteration by advancing 1 byte at a time
63+
// Checks 14 positions per iteration by advancing 14 bytes at a time
6464
// Pattern bytes are broadcast once outside the loop for better performance
6565
int find_magic_numbers_sse4(const unsigned char* data, size_t off, size_t len) {
6666
if (len < 3) return -1;
@@ -75,8 +75,8 @@ int find_magic_numbers_sse4(const unsigned char* data, size_t off, size_t len) {
7575
__m128i p2 = _mm_set1_epi8(pattern[2]);
7676

7777
// Process 16 bytes per loop, checking 14 positions per iteration
78-
// Advance by 1 byte to check every possible starting position
79-
for (; i + 16 <= end; i += 1) {
78+
// Advance by 14 bytes (16 - 3 + 1) to check all positions without gaps
79+
for (; i + 16 <= end; i += 14) {
8080
__m128i d0 = _mm_loadu_si128((const __m128i*)(data + i));
8181
__m128i d1 = _mm_loadu_si128((const __m128i*)(data + i + 1));
8282
__m128i d2 = _mm_loadu_si128((const __m128i*)(data + i + 2));
@@ -107,7 +107,7 @@ int find_magic_numbers_sse4(const unsigned char* data, size_t off, size_t len) {
107107
}
108108

109109
// Optimized AVX2 implementation for 3-byte pattern search
110-
// Checks 30 positions per iteration by advancing 1 byte at a time
110+
// Checks 30 positions per iteration by advancing 30 bytes at a time
111111
// Pattern bytes are broadcast once outside the loop for better performance
112112
int find_magic_numbers_avx2(const unsigned char* data, size_t off, size_t len) {
113113
if (len < 3) return -1;
@@ -122,8 +122,8 @@ int find_magic_numbers_avx2(const unsigned char* data, size_t off, size_t len) {
122122
__m256i p2 = _mm256_set1_epi8(pattern[2]);
123123

124124
// Process 32 bytes per loop, checking 30 positions per iteration
125-
// Advance by 1 byte to check every possible starting position
126-
for (; i + 32 <= end; i += 1) {
125+
// Advance by 30 bytes (32 - 3 + 1) to check all positions without gaps
126+
for (; i + 32 <= end; i += 30) {
127127
__m256i d0 = _mm256_loadu_si256((const __m256i*)(data + i));
128128
__m256i d1 = _mm256_loadu_si256((const __m256i*)(data + i + 1));
129129
__m256i d2 = _mm256_loadu_si256((const __m256i*)(data + i + 2));
@@ -154,7 +154,7 @@ int find_magic_numbers_avx2(const unsigned char* data, size_t off, size_t len) {
154154
}
155155

156156
// Optimized AVX512 implementation for 3-byte pattern search
157-
// Checks 62 positions per iteration by advancing 1 byte at a time
157+
// Checks 62 positions per iteration by advancing 62 bytes at a time
158158
// Pattern bytes are broadcast once outside the loop for better performance
159159
int find_magic_numbers_avx512(const unsigned char* data, size_t off, size_t len) {
160160
if (len < 3) return -1;
@@ -169,8 +169,8 @@ int find_magic_numbers_avx512(const unsigned char* data, size_t off, size_t len)
169169
__m512i p2 = _mm512_set1_epi8(pattern[2]);
170170

171171
// Process 64 bytes per loop, checking 62 positions per iteration
172-
// Advance by 1 byte to check every possible starting position
173-
for (; i + 64 <= end; i += 1) {
172+
// Advance by 62 bytes (64 - 3 + 1) to check all positions without gaps
173+
for (; i + 64 <= end; i += 62) {
174174
__m512i d0 = _mm512_loadu_si512((const void*)(data + i));
175175
__m512i d1 = _mm512_loadu_si512((const void*)(data + i + 1));
176176
__m512i d2 = _mm512_loadu_si512((const void*)(data + i + 2));

0 commit comments

Comments
 (0)