-; ws_base64_asm.asm — Base64 encoder with AVX2/GFNI/SSE2/scalar dispatch
+; ws_base64_asm.asm — Base64 encoder with AVX-512 VBMI2/VBMI/AVX2/SSE2/scalar dispatch
 ;
 ; C signature:
 ; size_t ws_base64_encode(const uint8_t *in, size_t len, uint8_t *out);
 ; rax = number of output bytes written (always ceil(len/3)*4)
 ;
 ; Dispatch order (fastest-available first):
-; 1. cpu_tier >= 3 + VBMI (bit 4)  -> .avx512vbmi_path  (24 in -> 32 out / iter, VPERMB)
-; 2. cpu_tier >= 2 (AVX2)          -> .avx2_path        (24 in -> 32 out / iter)
-; 3. cpu_tier >= 1 (SSE2)          -> .sse2_path        (12 in -> 16 out / iter)
-; 4. fallback                      -> .scalar_path      ( 3 in ->  4 out / iter)
+; 1. cpu_tier >= 3 + VBMI2 (bit 6) -> .avx512vbmi2_path (24 in -> 32 out / iter, VPMULTISHIFTQB)
+; 2. cpu_tier >= 3 + VBMI (bit 4)  -> .avx512vbmi_path  (24 in -> 32 out / iter, VPERMB)
+; 3. cpu_tier >= 2 (AVX2)          -> .avx2_path        (24 in -> 32 out / iter)
+; 4. cpu_tier >= 1 (SSE2)          -> .sse2_path        (12 in -> 16 out / iter)
+; 5. fallback                      -> .scalar_path      ( 3 in ->  4 out / iter)
 ;
 ; Algorithm: Klomp/Muła VPSHUFB method (vectorised base64 encoding)
 ;
@@ -185,6 +186,40 @@ section .data
 align 32
 b64_const_p03: times 32 db 3 ; +3 correction ('+'->'/' boundary)

+; -------------------------------------------------------------------------
+; VPMULTISHIFTQB shift control for the VBMI2 base64 path.
+; After VPSHUFB with b64_shuf, each dword contains [B, A, C, B] where
+; A,B,C are consecutive input bytes. In a qword (two groups), the
+; layout is [B0,A0,C0,B0', B1,A1,C1,B1'] at bit positions:
+;   B0=[7:0], A0=[15:8], C0=[23:16], B0'=[31:24],
+;   B1=[39:32], A1=[47:40], C1=[55:48], B1'=[63:56]
+;
+; VPMULTISHIFTQB extracts 8 contiguous bits starting at each control byte's
+; bit position (mod 64). After AND 0x3F the result is the 6-bit base64 index.
+;
+; Per group [A,B,C] (6-bit field position -> control byte):
+;   i0 = A >> 2            -> bits [15:10] -> shift = 10
+;   i1 = (A&3)<<4 | B>>4   -> bits [9:4]   -> shift =  4
+;   i2 = (B&0xF)<<2 | C>>6 -> bits [27:22] -> shift = 22
+;   i3 = C & 0x3F          -> bits [21:16] -> shift = 16
+;
+; Group 1 offsets are +32 within the qword.
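+;
+; Worked example: for the input group "Man" (A=0x4D, B=0x61, C=0x6E) the
+; shuffled dword is [0x61,0x4D,0x6E,0x61], i.e. v = 0x616E4D61. Extracting
+; 8 bits at each control position and masking with 0x3F gives:
+;   (v >> 10) & 0x3F = 19 -> 'T'
+;   (v >>  4) & 0x3F = 22 -> 'W'
+;   (v >> 22) & 0x3F =  5 -> 'F'
+;   (v >> 16) & 0x3F = 46 -> 'u'
+; which matches base64("Man") = "TWFu".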
+align 32
+b64_vbmi2_shifts:
+    db 10 , 4 , 22 , 16 , 42 , 36 , 54 , 48
+    db 10 , 4 , 22 , 16 , 42 , 36 , 54 , 48
+    db 10 , 4 , 22 , 16 , 42 , 36 , 54 , 48
+    db 10 , 4 , 22 , 16 , 42 , 36 , 54 , 48
+
 ; -------------------------------------------------------------------------
 ; Standard 64-character base64 alphabet (RFC 4648 §4).
 align 64
@@ -226,7 +261,9 @@ ws_base64_encode:
     ; ------------------------------------------------------------------
     cmp dword [ cpu_tier ], 3
     jl .b64_check_avx2
-    test dword [ cpu_features ], ( 1 << 4 ) ; VBMI bit
+    test dword [ cpu_features ], ( 1 << 6 ) ; VBMI2 bit (implies VBMI in practice)
+    jnz .avx512vbmi2_path
+    test dword [ cpu_features ], ( 1 << 4 ) ; VBMI bit (fallback)
     jnz .avx512vbmi_path

 .b64_check_avx2:
@@ -239,6 +276,74 @@ ws_base64_encode:
     jmp .scalar_path


+; ============================================================================
+; AVX-512 VBMI2 PATH — 24 input bytes -> 32 output characters per iteration
+;
+; Replaces the 6-instruction Klomp/Muła extraction pipeline with 2 instructions:
+;   VPMULTISHIFTQB (VBMI) — extracts 8 arbitrary bit-fields per qword in one uop
+;   VPAND                 — isolates the 6-bit indices (mask with 0x3F)
+;
+; The existing b64_shuf table produces [B,A,C,B] per dword. Within each qword
+; (two groups), VPMULTISHIFTQB with control [10,4,22,16, 42,36,54,48] extracts
+; the four 6-bit base64 indices per group directly.
+;
+; After extraction, VPERMB maps 6-bit indices to ASCII via b64_table (same as
+; the VBMI path below). Net saving: 4 instructions per iteration vs the VBMI path.
+;
+; Requires: AVX-512 VBMI (cpu_tier >= 3): VPMULTISHIFTQB and VPERMB are both
+;           VBMI instructions. Gated on cpu_features bit 6 (VBMI2), which
+;           implies VBMI on every CPU shipped to date.
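+;
+; Scalar model of steps 3-4 below (per qword v, per control byte s; the
+; controls never reach past bit 61, so no rotate wrap occurs):
+;   t   = (v >> s) & 0xFF   ; one VPMULTISHIFTQB output byte
+;   idx = t & 0x3F          ; VPAND
+;   chr = b64_table[idx]    ; one VPERMB output byte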
+; ============================================================================
+align 32
+.avx512vbmi2_path:
+    vmovdqa64 zmm9 , [ b64_table ]       ; 64-byte base64 LUT
+    vmovdqa ymm10 , [ b64_vbmi2_shifts ] ; shift control vector (32 bytes)
+    vmovdqa ymm11 , [ b64_mask3f ]       ; 0x3F mask (32 bytes, pre-filled)
+
+align 32
+.avx512vbmi2_loop:
+    ; Guard: need 32 bytes (the loads touch in[r15..r15+27]; consume 24)
+    mov rax , r13
+    sub rax , r15
+    cmp rax , 32
+    jl .avx512vbmi2_tail
+
+    ; ---- Step 1: Load 24 bytes via two overlapping 16-byte loads ----
+    vmovdqu xmm0 , [ r12 + r15 ]
+    vinserti128 ymm0 , ymm0 , [ r12 + r15 + 12 ], 1
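+    ; ymm0: lane 0 = in[0..15] (bytes 0..11 used), lane 1 = in[12..27] (12..23 used)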
+
+    ; ---- Step 2: Shuffle to [B,A,C,B] per dword ----
+    vpshufb ymm0 , ymm0 , [ b64_shuf ]
+
+    ; ---- Step 3: Extract 6-bit fields (replaces 6-instruction pipeline) ----
+    vpmultishiftqb ymm1 , ymm10 , ymm0 ; extract 8 bit-fields per qword
+    vpand ymm1 , ymm1 , ymm11          ; isolate 6-bit indices
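+    ; ymm1 now holds the 32 six-bit indices, one per output character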
+
+    ; ---- Step 4: Map index -> ASCII via VPERMB ----
+    vpermb zmm1 , zmm1 , zmm9 ; zmm1[i] = b64_table[ymm1[i] & 63]
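+    ; upper 32 bytes of zmm1 are don't-care; only ymm1 is stored below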
+
+    ; ---- Step 5: Store 32 output bytes ----
+    vmovdqu [ r14 + rbx ], ymm1
+
+    add r15 , 24
+    add rbx , 32
+    jmp .avx512vbmi2_loop
+
+.avx512vbmi2_tail:
+    SAFE_VZEROUPPER
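+    ; scalar path finishes the last < 32 bytes via the shared r15/rbx cursors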
+    jmp .scalar_path
+
+
 ; ============================================================================
 ; AVX-512 VBMI PATH — 24 input bytes -> 32 output characters per iteration
 ;