|
8 | 8 | // AState: Pointer; AConstants: Pointer): UInt64; |
9 | 9 | // |
10 | 10 | // Register mapping (MS x64 ABI after prologue): |
11 | | -// rcx = AData, edx = ALength (>= 64), r8 = AState, r9 = AConstants |
| 11 | +// rcx = AData, edx = ALength (>= 16), r8 = AState, r9 = AConstants |
12 | 12 | // AState: [0..7] = initial CRC pre-shifted left by (64 - Width), [8..15] = 0. |
13 | 13 | // AConstants layout (TCRCFoldConstants): |
14 | 14 | // [0..15] = Fold_4x128 (stride 512) |
|
122 | 122 | jmp @fold4to1 |
123 | 123 |
|
124 | 124 | @xmm_path: |
125 | | - // ===== XMM PATH (64..127 bytes) ===== |
| 125 | + // ===== XMM PATH (16..127 bytes) ===== |
126 | 126 |
|
127 | 127 | // Load BswapMask into xmm5 |
128 | 128 | db $C4, $C1, $7A, $6F, $69, $40 // vmovdqu xmm5, [r9 + 64] |
129 | 129 |
|
130 | | - // Load first 64 bytes with byte-swap |
| 130 | + cmp edx, 64 |
| 131 | + jae @xmm_large |
| 132 | + |
| 133 | + // --- Small input (16..63 bytes): load 1 block with byte-swap --- |
| 134 | + db $C5, $FA, $6F, $01 // vmovdqu xmm0, [rcx] |
| 135 | + db $C4, $E2, $79, $00, $C5 // vpshufb xmm0, xmm0, xmm5 |
| 136 | + db $C4, $C1, $7A, $7E, $20 // vmovq xmm4, [r8] |
| 137 | + db $C5, $D9, $73, $FC, $08 // vpslldq xmm4, xmm4, 8 |
| 138 | + db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4 |
| 139 | + add rcx, 16 |
| 140 | + sub edx, 16 |
| 141 | + db $C4, $C1, $7A, $6F, $79, $10 // vmovdqu xmm7, [r9 + 16] |
| 142 | + jmp @tail_check |
| 143 | + |
| 144 | +@xmm_large: |
| 145 | + // --- Large input (64..127 bytes): load 4 blocks with byte-swap --- |
131 | 146 | db $C5, $FA, $6F, $01 // vmovdqu xmm0, [rcx] |
132 | 147 | db $C4, $E2, $79, $00, $C5 // vpshufb xmm0, xmm0, xmm5 |
133 | 148 | db $C5, $FA, $6F, $49, $10 // vmovdqu xmm1, [rcx + 16] |
|
137 | 152 | db $C5, $FA, $6F, $59, $30 // vmovdqu xmm3, [rcx + 48] |
138 | 153 | db $C4, $E2, $61, $00, $DD // vpshufb xmm3, xmm3, xmm5 |
139 | 154 |
|
140 | | - // XOR pre-shifted CRC into HIGH qword of xmm0 |
141 | 155 | db $C4, $C1, $7A, $7E, $20 // vmovq xmm4, [r8] |
142 | 156 | db $C5, $D9, $73, $FC, $08 // vpslldq xmm4, xmm4, 8 |
143 | 157 | db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4 |
|
168 | 182 | db $C5, $F9, $EF, $C3 // vpxor xmm0, xmm0, xmm3 |
169 | 183 |
|
170 | 184 | // ===== FOLD REMAINING 16-BYTE BLOCKS ===== |
| 185 | +@tail_check: |
171 | 186 | cmp edx, 16 |
172 | 187 | jb @tail_done |
173 | 188 |
|
|
0 commit comments