Skip to content

Commit fb663b8

Browse files
mvogttechclaude
andcommitted
fix: correct NT prologue alignment/mask-cycling, simplify AVX-512 tail, fix ws_find_header scan
ws_mask / ws_unmask (all 4 NT prologue paths — AVX-512 + AVX2 × mask + unmask): - NT alignment calculation was inverted: `and rax, 63` computed offset INTO the current block instead of bytes TO the next boundary. Added `neg rax` before the AND so the formula becomes `(-ptr) & 63/31`, which correctly aligns vmovntdq destinations and prevents #GP on misaligned 512-bit NT stores. - Prologue mask cycling: every byte was XORed with mask[0] (r8b) regardless of position. Added `ror r8d, 8` per iteration and re-broadcast zmm0/ymm0 after the loop so the vector main loop starts with the correct mask phase. AVX-512 tail (ws_mask + ws_unmask): - Collapsed three identical `cmp rcx,64 / jbe / vmovdqu64×3 / add×3 / sub` blocks into a single `shr rax,6 / dec/jnz` loop feeding the shared opmask final block. Cuts ~25 duplicate instructions per function; BZHI with rcx=0 produces a zero mask so vmovdqu8{k1=0} is a safe no-op — no separate early exit needed. Hot-path performance is unchanged. ws_find_header: - pcmpistri unconditionally overwrites rcx with the inner match index, which destroyed the outer scan counter. In the no-match path `add rcx,16` then added 16 to the pcmpistri result (16), skipping every other 16-byte window. In the candidate path `add rcx,rcx` doubled the inner index rather than adding the outer position. Fixed by moving the outer scan counter into r11 (caller-saved; no push/pop needed) so pcmpistri can never corrupt it. ws_cpu.asm: - Refactored _init_cpu_features to execute each CPUID leaf exactly once, caching results in r9-r12. Adds VBMI detection (leaf 7 ECX bit 1, gated on cpu_tier == 3) as bit 4 of cpu_features. ws_base64_asm.asm: - Added AVX-512VBMI fast path: replaces the 13-instruction classify/map chain with a single VPERMB indexing directly into the 64-byte b64_table LUT. Dispatched when cpu_tier == 3 and cpu_features bit 4 (VBMI) is set. 
test/index.js: - Added NT prologue mask-cycling tests: non-zero offsets (1, 3) on 512 KB payloads force the alignment prologue to run; results are compared against the JS reference to catch silent data corruption. - Added misaligned-buffer unmask test (subarray(1) on a 512 KB buffer). - Added VBMI bit to cpuFeatures display output. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 1390cde commit fb663b8

4 files changed

Lines changed: 257 additions & 161 deletions

File tree

src/ws_base64_asm.asm

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@
1212
; rax = number of output bytes written (always ceil(len/3)*4)
1313
;
1414
; Dispatch order (fastest-available first):
15-
; 1. cpu_tier >= 2 (AVX2) -> .avx2_path (24 input bytes -> 32 output chars / iter)
16-
; 2. cpu_tier >= 1 (SSE2 baseline) -> .sse2_path (12 input bytes -> 16 output chars / iter)
17-
; 3. fallback -> .scalar_path ( 3 input bytes -> 4 output chars / iter)
15+
; 1. cpu_tier >= 3 and cpu_features bit 4 (VBMI) -> .avx512vbmi_path (24 in -> 32 out / iter, VPERMB)
16+
; 2. cpu_tier >= 2 (AVX2) -> .avx2_path (24 in -> 32 out / iter)
17+
; 3. cpu_tier >= 1 (SSE2) -> .sse2_path (12 in -> 16 out / iter)
18+
; 4. fallback -> .scalar_path ( 3 in -> 4 out / iter)
1819
;
1920
; Algorithm: Klomp/Muła VPSHUFB method (vectorised base64 encoding)
2021
;
@@ -103,6 +104,7 @@ BITS 64
103104
DEFAULT REL
104105

105106
extern cpu_tier
107+
extern cpu_features
106108

107109
; ============================================================================
108110
; .data — lookup tables and broadcast constant vectors
@@ -222,6 +224,12 @@ ws_base64_encode:
222224
; and is declared extern above. We use RIP-relative addressing
223225
; (DEFAULT REL ensures this).
224226
; ------------------------------------------------------------------
227+
cmp dword [cpu_tier], 3
228+
jl .b64_check_avx2
229+
test dword [cpu_features], (1 << 4) ; VBMI bit
230+
jnz .avx512vbmi_path
231+
232+
.b64_check_avx2:
225233
cmp dword [cpu_tier], 2
226234
jge .avx2_path
227235

@@ -231,6 +239,55 @@ ws_base64_encode:
231239
jmp .scalar_path
232240

233241

242+
; ============================================================================
243+
; AVX-512VBMI PATH — 24 input bytes -> 32 output characters per iteration
244+
;
245+
; Extraction pipeline is identical to the AVX2 path (VPSHUFB + VPMADDUBSW +
246+
; VPACKUSWB + VPSHUFB). The 13-instruction classify/map chain is replaced by
247+
; a single VPERMB that indexes directly into the 64-byte b64_table.
248+
;
249+
; After VEX-encoded YMM writes, zmm[511:256] = 0 (Intel manual §2.3.5).
250+
; VPERMB ZMM uses index & 63, so zero upper bytes map to b64_table[0]='A'.
251+
; Only the lower ymm3 (32 bytes) is stored — upper bytes are discarded.
252+
; ============================================================================
253+
align 32
254+
.avx512vbmi_path:
255+
vmovdqa64 zmm9, [b64_table] ; preload 64-byte LUT once (align 64 in .data)
256+
257+
align 32
258+
.avx512vbmi_loop:
259+
; Guard: need r13 - r15 >= 32, because the vmovdqu below reads 32 bytes even though only 24 are consumed per iteration.
260+
mov rax, r13
261+
sub rax, r15
262+
cmp rax, 32
263+
jl .avx512vbmi_tail
264+
265+
; ---- Steps 1-4: identical to AVX2 path ----
266+
vmovdqu ymm0, [r12 + r15]
267+
vpshufb ymm0, ymm0, [b64_shuf]
268+
vpmaddubsw ymm1, ymm0, [b64_mul_lo]
269+
vpsrlw ymm1, ymm1, 10
270+
vpmaddubsw ymm2, ymm0, [b64_mul_hi]
271+
vpand ymm2, ymm2, [b64_mask3f]
272+
vpackuswb ymm3, ymm1, ymm2
273+
vpshufb ymm3, ymm3, [b64_pack_shuf] ; ymm3 = 32 six-bit indices (0-63)
274+
275+
; ---- Steps 5+6: map 6-bit index -> ASCII in one instruction ----
276+
; vpermb dst, idx, src — for each byte i: dst[i] = src[idx[i] & 63]
277+
vpermb zmm3, zmm3, zmm9
278+
279+
; ---- Step 7: store 32 output bytes (lower ymm3 half of zmm3) ----
280+
vmovdqu [r14 + rbx], ymm3
281+
282+
add r15, 24
283+
add rbx, 32
284+
jmp .avx512vbmi_loop
285+
286+
.avx512vbmi_tail:
287+
vzeroupper
288+
jmp .scalar_path
289+
290+
234291
; ============================================================================
235292
; AVX2 PATH — 24 input bytes -> 32 output characters per iteration
236293
;

src/ws_cpu.asm

Lines changed: 60 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55
; 0 = scalar only
66
; 1 = SSE2 baseline
77
; 2 = AVX2
8-
; 3 = AVX-512F+BW (disabled on production Alder Lake)
8+
; 3 = AVX-512F+BW
99
;
1010
; cpu_features bitmask:
1111
; bit 0 = GFNI (CPUID.7.0:ECX[8])
1212
; bit 1 = PCLMULQDQ (CPUID.1:ECX[1])
1313
; bit 2 = BMI2 (CPUID.7.0:EBX[8])
1414
; bit 3 = LZCNT (CPUID.0x80000001:ECX[5])
15+
; bit 4 = VBMI (CPUID.7.0:ECX[1], only set when cpu_tier == 3)
1516

1617
BITS 64
1718
DEFAULT REL
@@ -27,79 +28,103 @@ global cpu_tier
2728
global cpu_features
2829
global _init_cpu_features
2930

31+
; Register allocation across the function:
32+
; r8d = XCR0 (from xgetbv)
33+
; r9d = max basic leaf (from CPUID leaf 0)
34+
; r10d = leaf 1 ECX (OSXSAVE, PCLMULQDQ)
35+
; r11d = leaf 7 EBX (AVX2, AVX-512F/BW, BMI2) [not written when max basic leaf < 7; every use is guarded by r9d >= 7]
36+
; r12d = leaf 7 ECX (GFNI, VBMI) [callee-saved — must push/pop; not written when max basic leaf < 7]
37+
;
38+
; Each CPUID leaf is executed exactly once. The caller-saved scratch
39+
; registers r8-r11 need no push/pop; only r12 (callee-saved) does.
40+
3041
_init_cpu_features:
3142
push rbx
43+
push r12
3244

33-
; --- Tier detection ---
34-
mov dword [cpu_tier], 1 ; SSE2 baseline
45+
; Default = SSE2 baseline
46+
mov dword [cpu_tier], 1
3547

48+
; === Leaf 0: max basic leaf ===
3649
xor eax, eax
3750
cpuid
38-
cmp eax, 7
39-
jl .feat_detect
51+
mov r9d, eax ; r9d = max basic leaf
4052

53+
; === Leaf 1: OSXSAVE + PCLMULQDQ ===
4154
mov eax, 1
4255
cpuid
43-
test ecx, (1 << 27) ; OSXSAVE
56+
mov r10d, ecx ; r10d = leaf 1 ECX
57+
58+
; === Leaf 7 (if available) — AVX2, AVX-512F/BW, BMI2, GFNI, VBMI ===
59+
cmp r9d, 7
60+
jb .no_leaf7
61+
mov eax, 7
62+
xor ecx, ecx
63+
cpuid
64+
mov r11d, ebx ; r11d = leaf 7 EBX
65+
mov r12d, ecx ; r12d = leaf 7 ECX
66+
.no_leaf7:
67+
68+
; === Tier detection (AVX2 / AVX-512) ===
69+
test r10d, (1 << 27) ; OSXSAVE?
4470
jz .feat_detect
4571

4672
xor ecx, ecx
4773
xgetbv
48-
mov r8d, eax
74+
mov r8d, eax ; r8d = XCR0
75+
4976
and eax, 0x06
50-
cmp eax, 0x06 ; YMM state saved by OS
77+
cmp eax, 0x06 ; YMM state saved by OS?
5178
jne .feat_detect
5279

53-
mov eax, 7
54-
xor ecx, ecx
55-
cpuid
56-
test ebx, (1 << 5) ; AVX2
80+
cmp r9d, 7
81+
jb .feat_detect
82+
test r11d, (1 << 5) ; AVX2?
5783
jz .feat_detect
5884
mov dword [cpu_tier], 2
5985

6086
mov eax, r8d
6187
and eax, 0xE0
62-
cmp eax, 0xE0 ; ZMM/opmask state saved by OS
88+
cmp eax, 0xE0 ; ZMM/opmask state saved by OS?
6389
jne .feat_detect
64-
test ebx, (1 << 16) ; AVX-512F
90+
test r11d, (1 << 16) ; AVX-512F?
6591
jz .feat_detect
66-
test ebx, (1 << 30) ; AVX-512BW
92+
test r11d, (1 << 30) ; AVX-512BW?
6793
jz .feat_detect
6894
mov dword [cpu_tier], 3
6995

70-
; --- Feature bitmask detection ---
71-
; (runs regardless of tier — these features are orthogonal)
7296
.feat_detect:
73-
xor eax, eax
74-
cpuid
75-
cmp eax, 7
76-
jl .check_pclmul
97+
; === Feature bitmask (leaf 1 / leaf 7 bits use cached results; only the LZCNT check below still executes CPUID, for the extended leaf range) ===
98+
cmp r9d, 7
99+
jb .check_pclmul
77100

78-
mov eax, 7
79-
xor ecx, ecx
80-
cpuid
101+
; GFNI (bit 0): leaf 7 ECX bit 8
102+
test r12d, (1 << 8)
103+
jz .check_vbmi
104+
or dword [cpu_features], 1
81105

82-
; GFNI: leaf 7 ECX bit 8
83-
test ecx, (1 << 8)
106+
.check_vbmi:
107+
; VBMI (bit 4): leaf 7 ECX bit 1 — only useful when cpu_tier == 3
108+
cmp dword [cpu_tier], 3
109+
jl .check_bmi2
110+
test r12d, (1 << 1)
84111
jz .check_bmi2
85-
or dword [cpu_features], 1
112+
or dword [cpu_features], (1 << 4)
86113

87114
.check_bmi2:
88-
; BMI2: leaf 7 EBX bit 8
89-
test ebx, (1 << 8)
115+
; BMI2 (bit 2): leaf 7 EBX bit 8
116+
test r11d, (1 << 8)
90117
jz .check_pclmul
91118
or dword [cpu_features], 4
92119

93120
.check_pclmul:
94-
; PCLMULQDQ: leaf 1 ECX bit 1
95-
mov eax, 1
96-
cpuid
97-
test ecx, (1 << 1)
121+
; PCLMULQDQ (bit 1): leaf 1 ECX bit 1 (cached — no re-execution)
122+
test r10d, (1 << 1)
98123
jz .check_lzcnt
99124
or dword [cpu_features], 2
100125

101126
.check_lzcnt:
102-
; LZCNT: extended leaf 0x80000001 ECX bit 5
127+
; LZCNT (bit 3): extended leaf 0x80000001 ECX bit 5
103128
mov eax, 0x80000000
104129
cpuid
105130
cmp eax, 0x80000001
@@ -111,6 +136,7 @@ _init_cpu_features:
111136
or dword [cpu_features], 8
112137

113138
.all_done:
139+
pop r12
114140
pop rbx
115141
ret
116142

0 commit comments

Comments
 (0)