|
// AVX2 implementation of Adler-32 block processing.
//
// ABI:    Microsoft x64.
// In:     rcx = data pointer; num_blocks * 32 bytes are read from it
//         edx = num_blocks (count of 32-byte blocks); 0 is a no-op
//         r8  = sums pointer, layout [SumA: UInt32, SumB: UInt32], updated in place
//         r9  = constants pointer, layout [weights: 32B, ones_16: 32B]
//               at offsets 0 and 32
//               (NOTE(review): presumably byte weights 32..1 and sixteen
//               16-bit ones; they are defined by the caller - confirm there)
// Out:    [r8]     = SumA + sum of all processed bytes
//         [r8 + 4] = SumB advanced over all processed blocks
//         Neither value is reduced mod 65521 (caller does it).
// Clobb:  rax, rcx, rdx, r10, r11, ymm0-ymm5, flags.
//
// Uses ymm0-ymm5 only (all volatile on Windows x64, no saves needed).
// Weights and ones are reloaded from memory each iteration to avoid
// using non-volatile ymm registers.
// AVX/AVX2 instructions are db-encoded for broad assembler compatibility.
//
// All accumulation is 32-bit and wraps silently.
// NOTE(review): the caller presumably bounds num_blocks so that no lane can
// overflow before the modulo is applied - confirm against the caller.
//
// Register roles inside the loop:
//   ymm0 = v_s1  running byte sums (partial sums spread across lanes)
//   ymm1 = v_s2  running weighted sums, lane 0 seeded with SumB
//   ymm2 = v_ps  sum of "s1 before each block", lane 0 seeded with
//                SumA * num_blocks; multiplied by 32 after the loop
//   ymm3 = zero  second operand for vpsadbw
//   ymm4 = current 32 data bytes / intermediate products
//   ymm5 = scratch (byte sums, weights, ones, shuffle temp)
//   r11d = blocks remaining

  // Zero blocks: leave the sums untouched. The loop below is bottom-tested
  // (dec/jnz), so entering it with a count of 0 would wrap the counter and
  // read far past the end of the buffer.
  test edx, edx
  jz @done

  // Zero register
  db $C5, $E5, $EF, $DB                // vpxor ymm3, ymm3, ymm3

  // Load initial sums
  mov eax, dword [r8]                  // eax  = SumA
  mov r10d, dword [r8 + 4]             // r10d = SumB
  mov r11d, edx                        // r11d = loop counter (edx is reused below)

  // v_ps = [SumA * num_blocks, 0, 0, 0]
  // Every block adds 32 * (s1 at block start) to s2. The SumA part of that
  // is the same for all blocks, so it is folded in once up front.
  imul edx, eax
  db $C5, $F9, $6E, $D2                // vmovd xmm2, edx

  // v_s2 = [SumB, 0, 0, 0]
  db $C4, $C1, $79, $6E, $CA           // vmovd xmm1, r10d

  // v_s1 = 0
  db $C5, $F9, $EF, $C0                // vpxor xmm0, xmm0, xmm0

@loop:
  // v_ps += v_s1: record the byte sum accumulated before this block
  db $C5, $ED, $FE, $D0                // vpaddd ymm2, ymm2, ymm0

  // Load 32 data bytes
  db $C5, $FE, $6F, $21                // vmovdqu ymm4, yword [rcx]

  // Byte sum for s1: sum of absolute differences against zero
  db $C5, $DD, $F6, $EB                // vpsadbw ymm5, ymm4, ymm3
  db $C5, $FD, $FE, $C5                // vpaddd ymm0, ymm0, ymm5

  // Weighted sum for s2: bytes * weights -> 16-bit pair sums,
  // then * ones_16 -> 32-bit sums
  db $C4, $C1, $7E, $6F, $29           // vmovdqu ymm5, yword [r9]
  db $C4, $E2, $5D, $04, $E5           // vpmaddubsw ymm4, ymm4, ymm5
  db $C4, $C1, $7E, $6F, $69, $20      // vmovdqu ymm5, yword [r9 + 32]
  db $C5, $DD, $F5, $E5                // vpmaddwd ymm4, ymm4, ymm5
  db $C5, $F5, $FE, $CC                // vpaddd ymm1, ymm1, ymm4

  add rcx, 32                          // advance to the next block
  dec r11d
  jnz @loop

  // v_s2 += v_ps * 32
  db $C5, $ED, $72, $F2, $05           // vpslld ymm2, ymm2, 5
  db $C5, $F5, $FE, $CA                // vpaddd ymm1, ymm1, ymm2

  // Horizontal reduce v_s1: extract high 128 + 128-bit hsum
  db $C4, $E3, $7D, $39, $C5, $01      // vextracti128 xmm5, ymm0, 1
  db $C5, $F9, $FE, $C5                // vpaddd xmm0, xmm0, xmm5
  db $C5, $F9, $70, $E8, $B1           // vpshufd xmm5, xmm0, $B1 (swap adjacent dwords)
  db $C5, $F9, $FE, $C5                // vpaddd xmm0, xmm0, xmm5
  db $C5, $F9, $70, $E8, $4E           // vpshufd xmm5, xmm0, $4E (swap 64-bit halves)
  db $C5, $F9, $FE, $C5                // vpaddd xmm0, xmm0, xmm5
  db $C5, $F9, $7E, $C2                // vmovd edx, xmm0
  add eax, edx                         // SumA += total byte sum

  // Horizontal reduce v_s2: extract high 128 + 128-bit hsum
  db $C4, $E3, $7D, $39, $CD, $01      // vextracti128 xmm5, ymm1, 1
  db $C5, $F1, $FE, $CD                // vpaddd xmm1, xmm1, xmm5
  db $C5, $F9, $70, $E9, $B1           // vpshufd xmm5, xmm1, $B1 (swap adjacent dwords)
  db $C5, $F1, $FE, $CD                // vpaddd xmm1, xmm1, xmm5
  db $C5, $F9, $70, $E9, $4E           // vpshufd xmm5, xmm1, $4E (swap 64-bit halves)
  db $C5, $F1, $FE, $CD                // vpaddd xmm1, xmm1, xmm5
  db $C4, $C1, $79, $7E, $CA           // vmovd r10d, xmm1 (SumB was seeded into v_s2)

  // Store results
  mov dword [r8], eax
  mov dword [r8 + 4], r10d

@done:
  // Clear the upper ymm halves before returning to non-VEX code.
  // Also reached directly when num_blocks = 0, where it is harmless.
  db $C5, $F8, $77                     // vzeroupper
0 commit comments