|
| 1 | +// VPCLMULQDQ + AVX2 CRC folding + Barrett reduction for reflected (LSB-first) |
| 2 | +// CRCs with width 8..32 (CRC-64 requires a separate path). |
| 3 | +// |
| 4 | +// Function signature (included after SimdProc4Begin.inc): |
| 5 | +// function CRC_Fold_Vpclmul(AData: PByte; ALength: UInt32; |
| 6 | +// AState: Pointer; AConstants: Pointer): UInt64; |
| 7 | +// |
| 8 | +// Register mapping (MS x64 ABI after prologue): |
| 9 | +// rcx = AData, edx = ALength (>= 64), r8 = AState, r9 = AConstants |
| 10 | +// AState: [0..7] = initial CRC (reflected), [8..15] = 0. |
| 11 | +// AConstants layout (TCRCFoldConstants): |
| 12 | +// [0..15] = Fold_4x128 (stride 512) |
| 13 | +// [16..31] = Fold_1x128 (stride 128) |
| 14 | +// [32..47] = Barrett |
| 15 | +// [48..63] = Fold_8x128 (stride 1024) |
| 16 | +// Returns: final CRC in RAX. |
| 17 | +// |
| 18 | +// Reference: zlib-ng crc32_pclmulqdq_tpl.h (256-bit VPCLMULQDQ path) |
| 19 | +// and Linux kernel crc-pclmul-template.S by Eric Biggers (Barrett reduction). |
| 20 | +// |
| 21 | +// All VEX-encoded instructions are db-encoded for FPC compatibility. |
| 22 | +// VEX byte layout reference: |
| 23 | +// 2-byte: C5 [R.vvvv.L.pp] (R and vvvv are stored inverted) |
| 24 | +// 3-byte: C4 [R.X.B.mmmmm] [W.vvvv.L.pp] (R, X, B and vvvv are stored inverted) |
| 25 | + |
| 26 | + cmp edx, 128 |
| 27 | + jb @xmm_path |
| 28 | + |
| 29 | + // ===== YMM PATH (ALength >= 128): ymm0..ymm3 act as four 32-byte accumulators covering 128 bytes of input ===== |
| 30 | + |
| 31 | + // Broadcast the 16-byte Fold_8x128 constant at [r9 + 48] (stride 1024 bits = the 128 bytes consumed per loop pass) into both 128-bit lanes of ymm6. NOTE(review): ymm6 is used as scratch here, yet xmm6 is callee-saved under the MS x64 ABI; presumably the including prologue preserves it - confirm. |
| 32 | + db $C4, $C2, $7D, $5A, $71, $30 // vbroadcasti128 ymm6, [r9 + 48] |
| 33 | + |
| 34 | + // Seed the accumulators with the first 128 input bytes (unaligned 32-byte loads) |
| 35 | + db $C5, $FE, $6F, $01 // vmovdqu ymm0, [rcx] |
| 36 | + db $C5, $FE, $6F, $49, $20 // vmovdqu ymm1, [rcx + 32] |
| 37 | + db $C5, $FE, $6F, $51, $40 // vmovdqu ymm2, [rcx + 64] |
| 38 | + db $C5, $FE, $6F, $59, $60 // vmovdqu ymm3, [rcx + 96] |
| 39 | + |
| 40 | + // Mix in the starting CRC: the 8 bytes at [r8] are XORed into the low qword of ymm0; the other 24 bytes of ymm0 are unchanged |
| 41 | + db $C4, $C1, $7A, $7E, $20 // vmovq xmm4, [r8] (VEX vmovq zero-fills bits 64..255 of ymm4) |
| 42 | + db $C5, $FD, $EF, $C4 // vpxor ymm0, ymm0, ymm4 |
| 43 | + |
| 44 | + add rcx, 128 |
| 45 | + sub edx, 128 |
| 46 | + |
| 47 | + // --- Main fold-by-8 loop (4 ymm accumulators, 128 bytes per iteration); bypassed when fewer than 128 bytes remain --- |
| 48 | + cmp edx, 128 |
| 49 | + jb @ymm_done |
| 50 | + |
| 51 | +@ymm_loop: |
| 52 | + // Fold ymm0 and absorb bytes 0..31 of this pass: ymm0 = clmul(ymm0.hi, ymm6.hi) xor clmul(ymm0.lo, ymm6.lo) xor [rcx], computed independently in each 128-bit lane |
| 53 | + db $C4, $E3, $7D, $44, $E6, $11 // vpclmulqdq ymm4, ymm0, ymm6, $11 (imm $11 = high qword x high qword) |
| 54 | + db $C4, $E3, $7D, $44, $C6, $00 // vpclmulqdq ymm0, ymm0, ymm6, $00 (imm $00 = low qword x low qword) |
| 55 | + db $C5, $FD, $EF, $C4 // vpxor ymm0, ymm0, ymm4 (combine the two partial products) |
| 56 | + db $C5, $FE, $6F, $29 // vmovdqu ymm5, [rcx] (next input for this accumulator) |
| 57 | + db $C5, $FD, $EF, $C5 // vpxor ymm0, ymm0, ymm5 |
| 58 | + |
| 59 | + // Fold ymm1 the same way and absorb bytes 32..63 of this pass |
| 60 | + db $C4, $E3, $75, $44, $E6, $11 // vpclmulqdq ymm4, ymm1, ymm6, $11 |
| 61 | + db $C4, $E3, $75, $44, $CE, $00 // vpclmulqdq ymm1, ymm1, ymm6, $00 |
| 62 | + db $C5, $F5, $EF, $CC // vpxor ymm1, ymm1, ymm4 |
| 63 | + db $C5, $FE, $6F, $69, $20 // vmovdqu ymm5, [rcx + 32] |
| 64 | + db $C5, $F5, $EF, $CD // vpxor ymm1, ymm1, ymm5 |
| 65 | + |
| 66 | + // Fold ymm2 the same way and absorb bytes 64..95 of this pass |
| 67 | + db $C4, $E3, $6D, $44, $E6, $11 // vpclmulqdq ymm4, ymm2, ymm6, $11 |
| 68 | + db $C4, $E3, $6D, $44, $D6, $00 // vpclmulqdq ymm2, ymm2, ymm6, $00 |
| 69 | + db $C5, $ED, $EF, $D4 // vpxor ymm2, ymm2, ymm4 |
| 70 | + db $C5, $FE, $6F, $69, $40 // vmovdqu ymm5, [rcx + 64] |
| 71 | + db $C5, $ED, $EF, $D5 // vpxor ymm2, ymm2, ymm5 |
| 72 | + |
| 73 | + // Fold ymm3 the same way and absorb bytes 96..127 of this pass |
| 74 | + db $C4, $E3, $65, $44, $E6, $11 // vpclmulqdq ymm4, ymm3, ymm6, $11 |
| 75 | + db $C4, $E3, $65, $44, $DE, $00 // vpclmulqdq ymm3, ymm3, ymm6, $00 |
| 76 | + db $C5, $E5, $EF, $DC // vpxor ymm3, ymm3, ymm4 |
| 77 | + db $C5, $FE, $6F, $69, $60 // vmovdqu ymm5, [rcx + 96] |
| 78 | + db $C5, $E5, $EF, $DD // vpxor ymm3, ymm3, ymm5 |
| 79 | + |
| 80 | + add rcx, 128 |
| 81 | + sub edx, 128 |
| 82 | + cmp edx, 128 |
| 83 | + jae @ymm_loop |
| 84 | + |
| 85 | +@ymm_done: |
| 86 | + // --- Reduce 4 ymm -> 2 ymm using Fold_4x128 (stride 512 bits = 64 bytes, the distance from ymm0 to ymm2 and from ymm1 to ymm3); ymm6 is reloaded with that constant from [r9] --- |
| 87 | + db $C4, $C2, $7D, $5A, $31 // vbroadcasti128 ymm6, [r9] |
| 88 | + |
| 89 | + // ymm0 = fold(ymm0) XOR ymm2 |
| 90 | + db $C4, $E3, $7D, $44, $E6, $11 // vpclmulqdq ymm4, ymm0, ymm6, $11 (high qword x high qword, per lane) |
| 91 | + db $C4, $E3, $7D, $44, $C6, $00 // vpclmulqdq ymm0, ymm0, ymm6, $00 (low qword x low qword, per lane) |
| 92 | + db $C5, $FD, $EF, $C4 // vpxor ymm0, ymm0, ymm4 |
| 93 | + db $C5, $FD, $EF, $C2 // vpxor ymm0, ymm0, ymm2 |
| 94 | + |
| 95 | + // ymm1 = fold(ymm1) XOR ymm3 |
| 96 | + db $C4, $E3, $75, $44, $E6, $11 // vpclmulqdq ymm4, ymm1, ymm6, $11 |
| 97 | + db $C4, $E3, $75, $44, $CE, $00 // vpclmulqdq ymm1, ymm1, ymm6, $00 |
| 98 | + db $C5, $F5, $EF, $CC // vpxor ymm1, ymm1, ymm4 |
| 99 | + db $C5, $F5, $EF, $CB // vpxor ymm1, ymm1, ymm3 |
| 100 | + |
| 101 | + // --- Extract 4 xmm from 2 ymm; order matters: both halves of ymm1 are copied out before xmm1 is overwritten --- |
| 102 | + // ymm0 = [result0 | result1], ymm1 = [result2 | result3] |
| 103 | + db $C5, $F9, $6F, $D1 // vmovdqa xmm2, xmm1 (save low of ymm1 = result2) |
| 104 | + db $C4, $E3, $7D, $39, $CB, $01 // vextracti128 xmm3, ymm1, 1 (high of ymm1 = result3) |
| 105 | + db $C4, $E3, $7D, $39, $C1, $01 // vextracti128 xmm1, ymm0, 1 (high of ymm0 = result1; this write discards the old ymm1 contents) |
| 106 | + // xmm0 = low(ymm0) = result0 (untouched) |
| 107 | + |
| 108 | + db $C5, $F8, $77 // vzeroupper (clears bits 128..255 of every ymm register before the 128-bit code that follows) |
| 109 | + jmp @fold4to1 |
| 110 | + |
| 110 | +@xmm_path: |
| 111 | + // ===== XMM PATH (64..127 bytes): reached when edx < 128; the 64-byte lower bound is the caller's contract and is not checked here ===== |
| 112 | + |
| 113 | + // Seed xmm0..xmm3 with the first 64 input bytes (unaligned 16-byte loads) |
| 114 | + db $C5, $FA, $6F, $01 // vmovdqu xmm0, [rcx] |
| 115 | + db $C5, $FA, $6F, $49, $10 // vmovdqu xmm1, [rcx + 16] |
| 116 | + db $C5, $FA, $6F, $51, $20 // vmovdqu xmm2, [rcx + 32] |
| 117 | + db $C5, $FA, $6F, $59, $30 // vmovdqu xmm3, [rcx + 48] |
| 118 | + |
| 119 | + // Mix in the starting CRC: the 8 bytes at [r8] are XORed into the low qword of xmm0; the high qword is unchanged |
| 120 | + db $C4, $C1, $7A, $7E, $20 // vmovq xmm4, [r8] (upper qword of xmm4 is zero-filled) |
| 121 | + db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4 |
| 122 | + |
| 123 | + add rcx, 64 |
| 124 | + sub edx, 64 |
| 126 | + |
| 126 | +@fold4to1: |
| 127 | + // ===== FOLD 4 XMM -> 1 XMM using Fold_1x128 (stride 128 bits = 16 bytes). Both paths arrive with xmm0..xmm3 holding four consecutive 16-byte accumulators. NOTE(review): xmm7 is callee-saved under the MS x64 ABI; presumably the including prologue preserves it - confirm. ===== |
| 128 | + db $C4, $C1, $7A, $6F, $79, $10 // vmovdqu xmm7, [r9 + 16] (xmm7 stays live through the tail loop and step 1 of the final reduction) |
| 129 | + |
| 130 | + // xmm0 = fold(xmm0) XOR xmm1 |
| 131 | + db $C4, $E3, $79, $44, $E7, $11 // vpclmulqdq xmm4, xmm0, xmm7, $11 (high qword x high qword) |
| 132 | + db $C4, $E3, $79, $44, $C7, $00 // vpclmulqdq xmm0, xmm0, xmm7, $00 (low qword x low qword) |
| 133 | + db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4 |
| 134 | + db $C5, $F9, $EF, $C1 // vpxor xmm0, xmm0, xmm1 |
| 135 | + |
| 136 | + // xmm0 = fold(xmm0) XOR xmm2 |
| 137 | + db $C4, $E3, $79, $44, $E7, $11 // vpclmulqdq xmm4, xmm0, xmm7, $11 |
| 138 | + db $C4, $E3, $79, $44, $C7, $00 // vpclmulqdq xmm0, xmm0, xmm7, $00 |
| 139 | + db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4 |
| 140 | + db $C5, $F9, $EF, $C2 // vpxor xmm0, xmm0, xmm2 |
| 141 | + |
| 142 | + // xmm0 = fold(xmm0) XOR xmm3 |
| 143 | + db $C4, $E3, $79, $44, $E7, $11 // vpclmulqdq xmm4, xmm0, xmm7, $11 |
| 144 | + db $C4, $E3, $79, $44, $C7, $00 // vpclmulqdq xmm0, xmm0, xmm7, $00 |
| 145 | + db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4 |
| 146 | + db $C5, $F9, $EF, $C3 // vpxor xmm0, xmm0, xmm3 |
| 147 | + |
| 148 | + // ===== FOLD REMAINING 16-BYTE BLOCKS: skipped entirely when fewer than 16 bytes remain in edx ===== |
| 149 | + cmp edx, 16 |
| 150 | + jb @tail_done |
| 152 | + |
| 152 | +@tail_loop: |
| 153 | + db $C4, $E3, $79, $44, $E7, $11 // vpclmulqdq xmm4, xmm0, xmm7, $11 (each pass folds xmm0 with the constant in xmm7, then XORs in the next 16 input bytes) |
| 154 | + db $C4, $E3, $79, $44, $C7, $00 // vpclmulqdq xmm0, xmm0, xmm7, $00 |
| 155 | + db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4 |
| 156 | + db $C5, $FA, $6F, $29 // vmovdqu xmm5, [rcx] (next 16 input bytes, unaligned load) |
| 157 | + db $C5, $F9, $EF, $C5 // vpxor xmm0, xmm0, xmm5 |
| 158 | + add rcx, 16 |
| 159 | + sub edx, 16 |
| 160 | + cmp edx, 16 |
| 161 | + jae @tail_loop |
| 163 | + |
| 163 | +@tail_done: |
| 164 | + // ================================================================= |
| 165 | + // Final reduction: 128-bit xmm0 -> CRC in rax. Reached with edx < 16; any such leftover bytes are not read anywhere in this listing. |
| 166 | + // Following Linux kernel crc-pclmul-template.S Barrett reduction. |
| 167 | + // ================================================================= |
| 168 | + |
| 169 | + // Step 1: Multiply by x^n and reduce 128 bits to 64+n bits. Reuses xmm7 (the Fold_1x128 pair loaded at @fold4to1): low qword of xmm0 times high qword of xmm7, XORed with the high qword of xmm0 shifted down. |
| 170 | + db $C4, $E3, $79, $44, $CF, $10 // vpclmulqdq xmm1, xmm0, xmm7, $10 (imm $10 = xmm0 low qword x xmm7 high qword) |
| 171 | + db $C5, $F9, $73, $D8, $08 // vpsrldq xmm0, xmm0, 8 (byte shift: high qword moves to the low qword, high qword becomes zero) |
| 172 | + db $C5, $F9, $EF, $C1 // vpxor xmm0, xmm0, xmm1 |
| 173 | + |
| 174 | + // Step 2: Barrett reduction using the 16-byte constant pair at [r9 + 32]: two chained carry-less multiplies whose result is XORed back into xmm0. |
| 175 | + db $C4, $C1, $7A, $6F, $71, $20 // vmovdqu xmm6, [r9 + 32] |
| 176 | + db $C4, $E3, $79, $44, $CE, $00 // vpclmulqdq xmm1, xmm0, xmm6, $00 (xmm0 low qword x xmm6 low qword) |
| 177 | + db $C4, $E3, $71, $44, $CE, $10 // vpclmulqdq xmm1, xmm1, xmm6, $10 (xmm1 low qword x xmm6 high qword) |
| 178 | + db $C5, $F9, $EF, $C1 // vpxor xmm0, xmm0, xmm1 |
| 179 | + |
| 180 | + // Extract high qword (bits 64..127) into rax. No return instruction appears in this listing; presumably the including file's epilogue supplies it - confirm. NOTE(review): all 64 bits are copied; whether bits above the CRC width are already zero for narrower CRCs is not evident here - confirm the caller masks if needed. |
| 181 | + db $C5, $F8, $77 // vzeroupper (clears bits 128..255 of every ymm register; already done once on the YMM path) |
| 182 | + db $C4, $E3, $F9, $16, $C0, $01 // vpextrq rax, xmm0, 1 |
0 commit comments