@@ -60,7 +60,7 @@ int cpu_supports_avx512() {
6060}
6161
6262// Optimized SSE4 implementation for 3-byte pattern search
63- // Checks 14 positions per iteration by advancing 1 byte at a time
63+ // Checks 14 positions per iteration by advancing 14 bytes at a time
6464// Pattern bytes are broadcast once outside the loop for better performance
6565int find_magic_numbers_sse4 (const unsigned char * data , size_t off , size_t len ) {
6666 if (len < 3 ) return -1 ;
@@ -75,8 +75,8 @@ int find_magic_numbers_sse4(const unsigned char* data, size_t off, size_t len) {
7575 __m128i p2 = _mm_set1_epi8 (pattern [2 ]);
7676
7777 // Process 16 bytes per loop, checking 14 positions per iteration
78- // Advance by 1 byte to check every possible starting position
79- for (; i + 16 <= end ; i += 1 ) {
78+ // Advance by 14 bytes (16 - 3 + 1) to check all positions without gaps
79+ for (; i + 16 <= end ; i += 14 ) {
8080 __m128i d0 = _mm_loadu_si128 ((const __m128i * )(data + i ));
8181 __m128i d1 = _mm_loadu_si128 ((const __m128i * )(data + i + 1 ));
8282 __m128i d2 = _mm_loadu_si128 ((const __m128i * )(data + i + 2 ));
@@ -107,7 +107,7 @@ int find_magic_numbers_sse4(const unsigned char* data, size_t off, size_t len) {
107107}
108108
109109// Optimized AVX2 implementation for 3-byte pattern search
110- // Checks 30 positions per iteration by advancing 1 byte at a time
110+ // Checks 30 positions per iteration by advancing 30 bytes at a time
111111// Pattern bytes are broadcast once outside the loop for better performance
112112int find_magic_numbers_avx2 (const unsigned char * data , size_t off , size_t len ) {
113113 if (len < 3 ) return -1 ;
@@ -122,8 +122,8 @@ int find_magic_numbers_avx2(const unsigned char* data, size_t off, size_t len) {
122122 __m256i p2 = _mm256_set1_epi8 (pattern [2 ]);
123123
124124 // Process 32 bytes per loop, checking 30 positions per iteration
125- // Advance by 1 byte to check every possible starting position
126- for (; i + 32 <= end ; i += 1 ) {
125+ // Advance by 30 bytes (32 - 3 + 1) to check all positions without gaps
126+ for (; i + 32 <= end ; i += 30 ) {
127127 __m256i d0 = _mm256_loadu_si256 ((const __m256i * )(data + i ));
128128 __m256i d1 = _mm256_loadu_si256 ((const __m256i * )(data + i + 1 ));
129129 __m256i d2 = _mm256_loadu_si256 ((const __m256i * )(data + i + 2 ));
@@ -154,7 +154,7 @@ int find_magic_numbers_avx2(const unsigned char* data, size_t off, size_t len) {
154154}
155155
156156// Optimized AVX512 implementation for 3-byte pattern search
157- // Checks 62 positions per iteration by advancing 1 byte at a time
157+ // Checks 62 positions per iteration by advancing 62 bytes at a time
158158// Pattern bytes are broadcast once outside the loop for better performance
159159int find_magic_numbers_avx512 (const unsigned char * data , size_t off , size_t len ) {
160160 if (len < 3 ) return -1 ;
@@ -169,8 +169,8 @@ int find_magic_numbers_avx512(const unsigned char* data, size_t off, size_t len)
169169 __m512i p2 = _mm512_set1_epi8 (pattern [2 ]);
170170
171171 // Process 64 bytes per loop, checking 62 positions per iteration
172- // Advance by 1 byte to check every possible starting position
173- for (; i + 64 <= end ; i += 1 ) {
172+ // Advance by 62 bytes (64 - 3 + 1) to check all positions without gaps
173+ for (; i + 64 <= end ; i += 62 ) {
174174 __m512i d0 = _mm512_loadu_si512 ((const void * )(data + i ));
175175 __m512i d1 = _mm512_loadu_si512 ((const void * )(data + i + 1 ));
176176 __m512i d2 = _mm512_loadu_si512 ((const void * )(data + i + 2 ));
0 commit comments