|
51 | 51 | ; Full 16-byte shuffle table: |
52 | 52 | ; 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10 |
53 | 53 | ; |
54 | | -; Step 2 — Extract high indices (PMADDUBSW with mul_lo = 0x0140, then >> 10): |
55 | | -; PMADDUBSW dst, src: result_word[j] = (dst_even[j] * src_even[j]) |
56 | | -; + (dst_odd[j] * src_odd[j]) |
57 | | -; With our shuffle, word j = [B, A] (even=B unsigned, odd=A unsigned). |
58 | | -; mul_lo word = 0x0140: even byte = 0x40 (+64 signed), odd byte = 0x01 (+1 signed). |
59 | | -; result = B*64 + A*1 (each up to 255*64 = 16320, fits in signed 16-bit) |
60 | | -; Shift right by 10: (B*64 + A) >> 10 = (B<<6 + A) >> 10 |
61 | | -; For the 16-bit range: bits 15..10 give (B*64 + A) / 1024. |
62 | | -; The value i0 = A>>2 falls in bits [9:4] of A, and: |
63 | | -; (A*4 + B) >> 4 ... hmm, let's cross-check with published values. |
| 54 | +; Steps 2-4 — Extract 6-bit indices (aklomp mulhi/mullo method): |
| 55 | +; After shuffle, each dword is [B, A, C, B]. Two pairs of AND+multiply |
| 56 | +; isolate the even indices (i0, i2) and odd indices (i1, i3): |
64 | 57 | ; |
65 | | -; Reference (Muła/Klomp): the shuffle used is slightly different from |
66 | | -; the naive per-group arrangement. The published implementation uses: |
67 | | -; shuffle: 1,0,2,1, 4,3,5,4, 7,6,8,7, 10,9,11,10 |
68 | | -; mul_lo = 0x0140 (repeated as 16-bit words) |
69 | | -; mul_hi = 0x0801 (repeated as 16-bit words) |
70 | | -; After PMADDUBSW with mul_lo and >> 10: 8 "high" indices per 16-byte block |
71 | | -; After PMADDUBSW with mul_hi and & 0x3F: 8 "low" indices per 16-byte block |
72 | | -; After PACKUSWB(hi, lo) + re-interleave PSHUFB: 16 indices in correct order. |
| 58 | +; Even: (dword & 0x0FC0FC00) * 0x04000040 (PMULHUW, takes high 16 bits) |
| 59 | +; Word [B, A] & 0xFC00 = A[7:2] << 10 → mulhi shifts down to A >> 2 = i0 |
| 60 | +; Word [C, B] & 0x0FC0 = (B[3:0]<<2|C[7:6]) << 6 → mulhi shifts down by 6 → i2 |
73 | 61 | ; |
74 | | -; Step 3 — Extract low indices (PMADDUBSW with mul_hi = 0x0801, then & 0x3F): |
75 | | -; mul_hi word = 0x0801: even byte = 0x01 (+1), odd byte = 0x08 (+8 signed). |
76 | | -; result = B*1 + A*8 (for the [B,A] word) or C*1 + B*8 (for [C,B] word) |
77 | | -; AND with 0x3F isolates the low 6 bits. |
| 62 | +; Odd: (dword & 0x003F03F0) * 0x01000010 (PMULLW, takes low 16 bits) |
| 63 | +; Word [B, A] & 0x03F0 = (A[1:0]<<4|B[7:4]) << 4 → mullo shifts to byte 1 → i1 |
| 64 | +; Word [C, B] & 0x003F = C[5:0] → mullo shifts to byte 3 → i3 |
78 | 65 | ; |
79 | | -; Step 4 — Pack (PACKUSWB / VPACKUSWB): |
80 | | -; Pack 8 hi-index words + 8 lo-index words into 16 bytes. |
81 | | -; The packed order is [hi0..hi7, lo0..lo7]; a PSHUFB interleave fixes this. |
| 66 | +; POR merges them: each byte = one 6-bit index in correct sequential order. |
82 | 67 | ; |
83 | 68 | ; Step 5 — Classify (range comparison + offset accumulation): |
84 | 69 | ; Start all 32 (or 16) output bytes with base offset +65 ('A'). |
@@ -131,35 +116,36 @@ section .data |
131 | 116 | db 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9,11,10 |
132 | 117 |
|
133 | 118 | ; ------------------------------------------------------------------------- |
134 | | - ; PMADDUBSW multiplier for HIGH indices (i0, i2 per group). |
135 | | - ; Each 16-bit word = 0x0140: low byte = 0x40 (+64), high byte = 0x01 (+1). |
136 | | - ; After multiply-add and >> 10 the high 6-bit index sits in bits [5:0]. |
| 119 | + ; 6-bit index extraction constants (aklomp mulhi/mullo method). |
| 120 | + ; |
| 121 | + ; After VPSHUFB with b64_shuf, each dword is [B, A, C, B] (little-endian). |
| 122 | + ; The two masks isolate the bit-fields that contribute to even (i0, i2) |
| 123 | + ; and odd (i1, i3) base64 indices respectively. PMULHUW/PMULLW shift |
| 124 | + ; the isolated bits into the correct byte positions so that POR merges |
| 125 | + ; them into 4 sequential index bytes per dword. |
| 126 | + ; |
| 127 | + ; Even indices (i0 = A>>2, i2 = (B&0xF)<<2 | C>>6): |
| 128 | + ; mask 0x0FC0FC00 — isolates A[7:2] in word 0, (B[3:0]<<2|C[7:6]) in word 1 |
| 129 | + ; mulhi 0x04000040 — shifts those fields down to bits [5:0] via high-multiply |
| 130 | + ; |
| 131 | + ; Odd indices (i1 = (A&3)<<4 | B>>4, i3 = C & 0x3F): |
| 132 | + ; mask 0x003F03F0 — isolates (A[1:0]<<4|B[7:4]) in word 0, C[5:0] in word 1 |
| 133 | + ; mullo 0x01000010 — shifts those fields up to bits [13:8] via low-multiply |
137 | 134 | align 32 |
138 | | - b64_mul_lo: |
139 | | - times 16 dw 0x0140 |
| 135 | + b64_mask_hi: |
| 136 | + times 8 dd 0x0FC0FC00 |
140 | 137 |
|
141 | | - ; PMADDUBSW multiplier for LOW indices (i1, i3 per group). |
142 | | - ; Each 16-bit word = 0x0801: low byte = 0x01 (+1), high byte = 0x08 (+8). |
143 | | - ; After multiply-add and & 0x3F the low 6-bit index sits in bits [5:0]. |
144 | 138 | align 32 |
145 | | - b64_mul_hi: |
146 | | - times 16 dw 0x0801 |
| 139 | + b64_mulhi_vec: |
| 140 | + times 8 dd 0x04000040 |
| 141 | + |
| 142 | + align 32 |
| 143 | + b64_mask_lo: |
| 144 | + times 8 dd 0x003F03F0 |
147 | 145 |
|
148 | | - ; ------------------------------------------------------------------------- |
149 | | - ; Post-pack interleave shuffle (Step 4b). |
150 | | - ; After PACKUSWB(hi_words, lo_words) the layout within each 16-byte lane is: |
151 | | - ; [hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7, |
152 | | - ; lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7] |
153 | | - ; hi indices correspond to output characters 0,2,4,6,8,10,12,14 |
154 | | - ; lo indices correspond to output characters 1,3,5,7,9,11,13,15 |
155 | | - ; We interleave to produce the correct sequential order: |
156 | | - ; [hi0,lo0, hi1,lo1, hi2,lo2, hi3,lo3, hi4,lo4, hi5,lo5, hi6,lo6, hi7,lo7] |
157 | | - ; i.e. read position i from slot: hi at i/2 (even i), lo at 8 + i/2 (odd i). |
158 | | - ; Shuffle bytes: 0,8, 1,9, 2,10, 3,11, 4,12, 5,13, 6,14, 7,15 |
159 | 146 | align 32 |
160 | | - b64_pack_shuf: |
161 | | - db 0, 8, 1, 9, 2,10, 3,11, 4,12, 5,13, 6,14, 7,15 |
162 | | - db 0, 8, 1, 9, 2,10, 3,11, 4,12, 5,13, 6,14, 7,15 |
| 147 | + b64_mullo_vec: |
| 148 | + times 8 dd 0x01000010 |
163 | 149 |
|
164 | 150 | ; ------------------------------------------------------------------------- |
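165 | 151 | ; 0x3F mask formerly used to isolate 6-bit indices after PMADDUBSW with mul_hi. NOTE(review): the mulhi/mullo extraction paths no longer reference it — confirm it is still needed.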
@@ -354,17 +340,18 @@ ws_base64_encode: |
354 | 340 | cmp rax, 32 |
355 | 341 | jl .avx512vbmi_tail |
356 | 342 |
|
357 | | - ; ---- Steps 1-4: identical to AVX2 path ---- |
| 343 | + ; ---- Steps 1-4: load, shuffle, extract 6-bit indices ---- |
358 | 344 | ; Same two-load fix: high lane must contain bytes [r15+12..r15+23]. |
359 | 345 | vmovdqu xmm0, [r12 + r15] |
360 | 346 | vinserti128 ymm0, ymm0, [r12 + r15 + 12], 1 |
361 | 347 | vpshufb ymm0, ymm0, [b64_shuf] |
362 | | - vpmaddubsw ymm1, ymm0, [b64_mul_lo] |
363 | | - vpsrlw ymm1, ymm1, 10 |
364 | | - vpmaddubsw ymm2, ymm0, [b64_mul_hi] |
365 | | - vpand ymm2, ymm2, [b64_mask3f] |
366 | | - vpackuswb ymm3, ymm1, ymm2 |
367 | | - vpshufb ymm3, ymm3, [b64_pack_shuf] ; ymm3 = 32 six-bit indices (0-63) |
| 348 | + |
| 349 | + ; Extract indices via mulhi/mullo (aklomp method) |
| 350 | + vpand ymm1, ymm0, [b64_mask_hi] |
| 351 | + vpmulhuw ymm1, ymm1, [b64_mulhi_vec] |
| 352 | + vpand ymm2, ymm0, [b64_mask_lo] |
| 353 | + vpmullw ymm2, ymm2, [b64_mullo_vec] |
| 354 | + vpor ymm3, ymm1, ymm2 ; ymm3 = 32 six-bit indices (0-63) |
368 | 355 |
|
369 | 356 | ; ---- Steps 5+6: map 6-bit index -> ASCII in one instruction ---- |
370 | 357 | ; vpermb dst, idx, src — for each byte i: dst[i] = src[idx[i] & 63] |
@@ -434,17 +421,12 @@ ws_base64_encode: |
434 | 421 | ; ---- Step 2: Shuffle bytes for 6-bit extraction ---- |
435 | 422 | vpshufb ymm0, ymm0, [b64_shuf] |
436 | 423 |
|
437 | | - ; ---- Step 3a: Extract high indices (i0, i2 for each group) ---- |
438 | | - vpmaddubsw ymm1, ymm0, [b64_mul_lo] |
439 | | - vpsrlw ymm1, ymm1, 10 ; shift >> 10 leaves 6-bit index in low bits |
440 | | - |
441 | | - ; ---- Step 3b: Extract low indices (i1, i3 for each group) ---- |
442 | | - vpmaddubsw ymm2, ymm0, [b64_mul_hi] |
443 | | - vpand ymm2, ymm2, [b64_mask3f] |
444 | | - |
445 | | - ; ---- Step 4: Pack words to bytes, then interleave hi/lo ---- |
446 | | - vpackuswb ymm3, ymm1, ymm2 ; [hi0..hi7|lo0..lo7] per 128-bit lane |
447 | | - vpshufb ymm3, ymm3, [b64_pack_shuf] ; interleave -> correct sequential order |
| 424 | + ; ---- Steps 3-4: Extract 6-bit indices via mulhi/mullo (aklomp method) ---- |
| 425 | + vpand ymm1, ymm0, [b64_mask_hi] |
| 426 | + vpmulhuw ymm1, ymm1, [b64_mulhi_vec] |
| 427 | + vpand ymm2, ymm0, [b64_mask_lo] |
| 428 | + vpmullw ymm2, ymm2, [b64_mullo_vec] |
| 429 | + vpor ymm3, ymm1, ymm2 ; 32 six-bit indices in correct byte order |
448 | 430 |
|
449 | 431 | ; ---- Step 5: Classify — build per-byte ASCII offset vector ---- |
450 | 432 | ; Start with base offset +65 for every output byte position. |
@@ -523,21 +505,14 @@ ws_base64_encode: |
523 | 505 | movdqu xmm0, [r12 + r15] |
524 | 506 | pshufb xmm0, [b64_shuf] |
525 | 507 |
|
526 | | - ; ---- Step 3a: Extract high indices ---- |
| 508 | + ; ---- Steps 3-4: Extract 6-bit indices via mulhi/mullo (aklomp method) ---- |
527 | 509 | movdqa xmm1, xmm0 |
528 | | - pmaddubsw xmm1, [b64_mul_lo] |
529 | | - psrlw xmm1, 10 |
530 | | - |
531 | | - ; ---- Step 3b: Extract low indices ---- |
| 510 | + pand xmm1, [b64_mask_hi] |
| 511 | + pmulhuw xmm1, [b64_mulhi_vec] |
532 | 512 | movdqa xmm2, xmm0 |
533 | | - pmaddubsw xmm2, [b64_mul_hi] |
534 | | - pand xmm2, [b64_mask3f] |
535 | | - |
536 | | - ; ---- Step 4: Pack and interleave ---- |
537 | | - packuswb xmm1, xmm2 |
538 | | - pshufb xmm1, [b64_pack_shuf] |
539 | | - |
540 | | - ; xmm1 = 16 six-bit indices in correct output order. |
| 513 | + pand xmm2, [b64_mask_lo] |
| 514 | + pmullw xmm2, [b64_mullo_vec] |
| 515 | + por xmm1, xmm2 ; xmm1 = 16 six-bit indices in correct order |
541 | 516 |
|
542 | 517 | ; ---- Step 5: Build offset vector ---- |
543 | 518 | movdqa xmm4, xmm14 ; offset = +65 |
|
0 commit comments