Skip to content

Commit 712735e

Browse files
mvogttechclaude
andcommitted
feat: add 7 ISA-level optimizations and SIMD UTF-8 validator
- VBMI2 base64: VPMULTISHIFTQB replaces 6-instruction extraction chain with 2 instructions (shift + mask), adds VBMI2 CPUID detection (bit 6) - AVX-512 findHeader: VPCMPEQB first+last byte filter at 64B/iter replaces PCMPISTRI (11-cycle latency), with TZCNT/BLSR candidate scan - PCLMULQDQ CRC-32C: 4-way parallel folding + Barrett reduction for buffers >= 256B, serial CRC32 fallback for small buffers - Software pipelining: interleaved load-XOR-store in AVX-512 cached mask/unmask loops to overlap memory operations - Multi-buffer parallel masking: ws_unmask4/ws_mask4 process 4 frames simultaneously using independent ZMM register sets - GFNI experiment: confirms GF2P8AFFINEQB cannot replace VPXORD for multi-byte XOR patterns (1-byte imm vs 4-byte mask) - SIMD UTF-8 validation: new ws_utf8_validate with AVX-512/AVX2 ASCII fast path (vpmovb2m + kortestq) and scalar state machine fallback 89 tests passing (was 29). Includes 47 UTF-8 validation tests with TextDecoder cross-check, 14 GFNI equivalence tests, CRC-32C folding tests across multiple buffer sizes. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8cc6e02 commit 712735e

11 files changed

Lines changed: 2690 additions & 96 deletions

File tree

bench/index.js

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,51 @@ for (const { name, size } of sizes) {
195195
);
196196
}
197197

198+
// ── GFNI experiment: mask vs maskGfni (AVX-512 VPXORD baseline) ────────────
199+
200+
if (asmUtil?.maskGfni && (asmUtil.cpuFeatures & 1)) {
201+
console.log('\n=== GFNI Experiment: mask vs maskGfni ===');
202+
console.log('GF2P8AFFINEQB evaluated for XOR masking — cannot replace VPXORD');
203+
console.log('for multi-byte patterns (imm8 is 1 byte, mask is 4). Both use VPXORD.');
204+
console.log('This benchmark confirms identical performance.\n');
205+
console.log(
206+
'Size'.padEnd(10) +
207+
' mask ops/s'.padEnd(14) +
208+
' gfni ops/s'.padEnd(14) +
209+
' gfni/mask'.padEnd(11) +
210+
' spread'
211+
);
212+
console.log('\u2500'.repeat(60));
213+
214+
const gfniSizes = [
215+
{ name: '64 B', size: 64 },
216+
{ name: '256 B', size: 256 },
217+
{ name: '1 KB', size: 1024 },
218+
{ name: '16 KB', size: 16384 },
219+
{ name: '64 KB', size: 65536 },
220+
{ name: '1 MB', size: 1048576 },
221+
];
222+
223+
for (const { name, size } of gfniSizes) {
224+
const source = crypto.randomBytes(size);
225+
const mask = crypto.randomBytes(4);
226+
const output = Buffer.alloc(size);
227+
228+
collectGarbage();
229+
const maskRes = benchmark(() => asmUtil.mask(source, mask, output, 0, size));
230+
collectGarbage();
231+
const gfniRes = benchmark(() => asmUtil.maskGfni(source, mask, output, 0, size));
232+
233+
console.log(
234+
name.padEnd(10) +
235+
fmtOps(maskRes.median) +
236+
fmtOps(gfniRes.median) +
237+
fmtSpeedup(gfniRes.median, maskRes.median).padStart(11) +
238+
(' ' + fmtRange(gfniRes))
239+
);
240+
}
241+
}
242+
198243
// ── Unmask benchmark ────────────────────────────────────────────────────────
199244

200245
console.log('\n=== WebSocket Unmask ===');
@@ -416,6 +461,90 @@ if (asmUtil) {
416461
}
417462
}
418463

464+
// ── UTF-8 validation benchmark ─────────────────────────────────────────────
465+
466+
if (asmUtil?.utf8Validate) {
467+
console.log('\n=== UTF-8 Validation (ASM vs TextDecoder) ===');
468+
console.log(
469+
'Size'.padEnd(10) +
470+
' TextDecoder'.padEnd(14) +
471+
' ASM ops/s'.padEnd(14) +
472+
' ASM MB/s'.padEnd(12) +
473+
' vs TD'.padEnd(9) +
474+
' spread'
475+
);
476+
console.log('\u2500'.repeat(70));
477+
478+
const decoder = new TextDecoder('utf-8', { fatal: true });
479+
480+
// ASCII text (fast path — the common case for English WebSocket traffic)
481+
const asciiSizes = [
482+
{ name: '64 B', size: 64 },
483+
{ name: '256 B', size: 256 },
484+
{ name: '1 KB', size: 1024 },
485+
{ name: '16 KB', size: 16384 },
486+
{ name: '64 KB', size: 65536 },
487+
{ name: '256 KB', size: 262144 },
488+
{ name: '1 MB', size: 1048576 },
489+
];
490+
491+
console.log(' -- ASCII text (SIMD fast path) --');
492+
for (const { name, size } of asciiSizes) {
493+
// Fill with printable ASCII (0x20-0x7E)
494+
const data = Buffer.alloc(size);
495+
for (let i = 0; i < size; i++) data[i] = 0x20 + (i % 95);
496+
497+
collectGarbage();
498+
const tdRes = benchmark(() => { try { decoder.decode(data); } catch {} });
499+
collectGarbage();
500+
const asmRes = benchmark(() => asmUtil.utf8Validate(data));
501+
502+
console.log(
503+
name.padEnd(10) +
504+
fmtOps(tdRes.median) +
505+
fmtOps(asmRes.median) +
506+
fmtThroughput(asmRes.median, size).padStart(12) +
507+
fmtSpeedup(asmRes.median, tdRes.median).padStart(9) +
508+
(' ' + fmtRange(asmRes))
509+
);
510+
}
511+
512+
// Mixed UTF-8 text (Japanese — 3-byte sequences, forces scalar validation)
513+
const mixedText = '\u3053\u3093\u306b\u3061\u306f\u4e16\u754c'; // "こんにちは世界"
514+
const mixedSizes = [
515+
{ name: '64 B', size: 64 },
516+
{ name: '256 B', size: 256 },
517+
{ name: '1 KB', size: 1024 },
518+
{ name: '16 KB', size: 16384 },
519+
];
520+
521+
console.log(' -- Mixed UTF-8 (Japanese, 3-byte sequences) --');
522+
for (const { name, size } of mixedSizes) {
523+
const base = Buffer.from(mixedText.repeat(Math.ceil(size / 21)));
524+
const data = base.subarray(0, size);
525+
// Ensure we don't end on a truncated sequence — trim to valid boundary
526+
let trimmed = data;
527+
while (trimmed.length > 0) {
528+
try { decoder.decode(trimmed); break; } catch { trimmed = trimmed.subarray(0, trimmed.length - 1); }
529+
}
530+
if (trimmed.length === 0) continue;
531+
532+
collectGarbage();
533+
const tdRes = benchmark(() => { try { decoder.decode(trimmed); } catch {} });
534+
collectGarbage();
535+
const asmRes = benchmark(() => asmUtil.utf8Validate(trimmed));
536+
537+
console.log(
538+
name.padEnd(10) +
539+
fmtOps(tdRes.median) +
540+
fmtOps(asmRes.median) +
541+
fmtThroughput(asmRes.median, trimmed.length).padStart(12) +
542+
fmtSpeedup(asmRes.median, tdRes.median).padStart(9) +
543+
(' ' + fmtRange(asmRes))
544+
);
545+
}
546+
}
547+
419548
console.log('\n' + '\u2500'.repeat(80));
420549
console.log('Config: warmup=' + WARMUP_MS + 'ms, sample=' + SAMPLE_MS + 'ms, samples=' + SAMPLES);
421550
console.log('Median of ' + SAMPLES + ' samples shown. Spread = (max-min)/median.');

binding.gyp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,23 @@
5050
"action": ["nasm", "-f", "elf64",
5151
"-o", "<(INTERMEDIATE_DIR)/ws_crc32_asm.o",
5252
"src/ws_crc32_asm.asm"]
53+
},
54+
{
55+
"action_name": "assemble_utf8",
56+
"inputs": ["src/ws_utf8_asm.asm"],
57+
"outputs": ["<(INTERMEDIATE_DIR)/ws_utf8_asm.o"],
58+
"action": ["nasm", "-f", "elf64",
59+
"-o", "<(INTERMEDIATE_DIR)/ws_utf8_asm.o",
60+
"src/ws_utf8_asm.asm"]
5361
}
5462
],
5563
"link_settings": {
5664
"libraries": [
5765
"<(INTERMEDIATE_DIR)/ws_cpu.o",
5866
"<(INTERMEDIATE_DIR)/ws_mask_asm.o",
5967
"<(INTERMEDIATE_DIR)/ws_base64_asm.o",
60-
"<(INTERMEDIATE_DIR)/ws_crc32_asm.o"
68+
"<(INTERMEDIATE_DIR)/ws_crc32_asm.o",
69+
"<(INTERMEDIATE_DIR)/ws_utf8_asm.o"
6170
]
6271
}
6372
}]

src/ws_base64_asm.asm

Lines changed: 90 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; ws_base64_asm.asm — Base64 encoder with AVX2/GFNI/SSE2/scalar dispatch
1+
; ws_base64_asm.asm — Base64 encoder with AVX-512 VBMI2/VBMI/AVX2/SSE2/scalar dispatch
22
;
33
; C signature:
44
; size_t ws_base64_encode(const uint8_t *in, size_t len, uint8_t *out);
@@ -12,10 +12,11 @@
1212
; rax = number of output bytes written (always ceil(len/3)*4)
1313
;
1414
; Dispatch order (fastest-available first):
15-
; 1. cpu_tier >= 3 + VBMI (bit 4) -> .avx512vbmi_path (24 in -> 32 out / iter, VPERMB)
16-
; 2. cpu_tier >= 2 (AVX2) -> .avx2_path (24 in -> 32 out / iter)
17-
; 3. cpu_tier >= 1 (SSE2) -> .sse2_path (12 in -> 16 out / iter)
18-
; 4. fallback -> .scalar_path ( 3 in -> 4 out / iter)
15+
; 1. cpu_tier >= 3 + VBMI2 (bit 6) -> .avx512vbmi2_path (24 in -> 32 out / iter, VPMULTISHIFTQB)
16+
; 2. cpu_tier >= 3 + VBMI (bit 4) -> .avx512vbmi_path (24 in -> 32 out / iter, VPERMB)
17+
; 3. cpu_tier >= 2 (AVX2) -> .avx2_path (24 in -> 32 out / iter)
18+
; 4. cpu_tier >= 1 (SSE2) -> .sse2_path (12 in -> 16 out / iter)
19+
; 5. fallback -> .scalar_path ( 3 in -> 4 out / iter)
1920
;
2021
; Algorithm: Klomp/Muła VPSHUFB method (vectorised base64 encoding)
2122
;
@@ -185,6 +186,31 @@ section .data
185186
align 32
186187
b64_const_p03: times 32 db 3 ; +3 correction ('+'->'/' boundary)
187188

189+
; -------------------------------------------------------------------------
190+
; VPMULTISHIFTQB shift control for VBMI2 base64 path.
191+
; After VPSHUFB with b64_shuf, each dword contains [B, A, C, B] where
192+
; A,B,C are consecutive input bytes. In a qword (two groups), the
193+
; layout is [B0,A0,C0,B0, B1,A1,C1,B1] at bit positions:
194+
; B0=[7:0], A0=[15:8], C0=[23:16], B0'=[31:24],
195+
; B1=[39:32], A1=[47:40], C1=[55:48], B1'=[63:56]
196+
;
197+
; VPMULTISHIFTQB extracts 8 contiguous bits starting at each control byte's
198+
; position (mod 64). After AND 0x3F the result is the 6-bit base64 index.
199+
;
200+
; Per group [A,B,C]:
201+
; i0 = A >> 2 -> bits [15:10] -> shift = 10
202+
; i1 = (A&3)<<4 | B>>4 -> bits [11:4] -> shift = 4 (& 0x3F)
203+
; i2 = (B&F)<<2 | C>>6 -> bits [29:22] -> shift = 22 (& 0x3F)
204+
; i3 = C & 0x3F -> bits [21:16] -> shift = 16
205+
;
206+
; Group 1 offsets are +32 within the qword.
207+
align 32
208+
b64_vbmi2_shifts:
209+
db 10, 4, 22, 16, 42, 36, 54, 48
210+
db 10, 4, 22, 16, 42, 36, 54, 48
211+
db 10, 4, 22, 16, 42, 36, 54, 48
212+
db 10, 4, 22, 16, 42, 36, 54, 48
213+
188214
; -------------------------------------------------------------------------
189215
; Standard 64-character base64 alphabet (RFC 4648 §4).
190216
align 64
@@ -226,7 +252,9 @@ ws_base64_encode:
226252
; ------------------------------------------------------------------
227253
cmp dword [cpu_tier], 3
228254
jl .b64_check_avx2
229-
test dword [cpu_features], (1 << 4) ; VBMI bit
255+
test dword [cpu_features], (1 << 6) ; VBMI2 bit
256+
jnz .avx512vbmi2_path
257+
test dword [cpu_features], (1 << 4) ; VBMI bit (fallback)
230258
jnz .avx512vbmi_path
231259

232260
.b64_check_avx2:
@@ -239,6 +267,62 @@ ws_base64_encode:
239267
jmp .scalar_path
240268

241269

270+
; ============================================================================
271+
; AVX-512 VBMI2 PATH — 24 input bytes -> 32 output characters per iteration
272+
;
273+
; Replaces the 6-instruction Klomp/Mula extraction pipeline with 2 instructions:
274+
; VPMULTISHIFTQB — extracts 8 arbitrary bit-fields per qword in one uop
275+
; VPANDD — isolates the 6-bit indices (mask with 0x3F)
276+
;
277+
; The existing b64_shuf table produces [B,A,C,B] per dword. Within each qword
278+
; (two groups), VPMULTISHIFTQB with control [10,4,22,16, 42,36,54,48] extracts
279+
; the four 6-bit base64 indices per group directly.
280+
;
281+
; After extraction, VPERMB maps 6-bit indices to ASCII via b64_table (same as
282+
; the VBMI path below). Net savings: 4 instructions per iteration vs VBMI path.
283+
;
284+
; Requires: AVX-512 VBMI2 (cpu_tier >= 3, cpu_features bit 6)
285+
; ============================================================================
286+
align 32
287+
.avx512vbmi2_path:
288+
vmovdqa64 zmm9, [b64_table] ; 64-byte base64 LUT
289+
vmovdqa ymm10, [b64_vbmi2_shifts] ; shift control vector (32 bytes)
290+
vmovdqa ymm11, [b64_mask3f] ; 0x3F mask (32 bytes, pre-filled)
291+
292+
align 32
293+
.avx512vbmi2_loop:
294+
; Guard: need 32 bytes for safe overlapping load (consume 24).
295+
mov rax, r13
296+
sub rax, r15
297+
cmp rax, 32
298+
jl .avx512vbmi2_tail
299+
300+
; ---- Step 1: Load 24 bytes via two 16-byte lane-aligned loads ----
301+
vmovdqu xmm0, [r12 + r15]
302+
vinserti128 ymm0, ymm0, [r12 + r15 + 12], 1
303+
304+
; ---- Step 2: Shuffle to [B,A,C,B] per dword ----
305+
vpshufb ymm0, ymm0, [b64_shuf]
306+
307+
; ---- Step 3: Extract 6-bit fields (replaces 6-instruction pipeline) ----
308+
vpmultishiftqb ymm1, ymm10, ymm0 ; extract 8 bit-fields per qword
309+
vpand ymm1, ymm1, ymm11 ; isolate 6-bit indices
310+
311+
; ---- Step 4: Map index -> ASCII via VPERMB ----
312+
vpermb zmm1, zmm1, zmm9 ; zmm1[i] = b64_table[ymm1[i] & 63]
313+
314+
; ---- Step 5: Store 32 output bytes ----
315+
vmovdqu [r14 + rbx], ymm1
316+
317+
add r15, 24
318+
add rbx, 32
319+
jmp .avx512vbmi2_loop
320+
321+
.avx512vbmi2_tail:
322+
SAFE_VZEROUPPER
323+
jmp .scalar_path
324+
325+
242326
; ============================================================================
243327
; AVX-512VBMI PATH — 24 input bytes -> 32 output characters per iteration
244328
;

src/ws_cpu.asm

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
; bit 3 = LZCNT (CPUID.0x80000001:ECX[5])
1515
; bit 4 = VBMI (CPUID.7.0:ECX[1], only set when cpu_tier == 3)
1616
; bit 5 = AMD vendor (skip vzeroupper — no SSE/AVX transition penalty)
17+
; bit 6 = VBMI2 (CPUID.7.0:ECX[6], only set when cpu_tier == 3)
1718

1819
BITS 64
1920
DEFAULT REL
@@ -37,7 +38,7 @@ global _init_cpu_features
3738
; r9d = max basic leaf (from CPUID leaf 0)
3839
; r10d = leaf 1 ECX (OSXSAVE, PCLMULQDQ)
3940
; r11d = leaf 7 EBX (AVX2, AVX-512F/BW, BMI2)
40-
; r12d = leaf 7 ECX (GFNI, VBMI) [callee-saved — must push/pop]
41+
; r12d = leaf 7 ECX (GFNI, VBMI, VBMI2) [callee-saved — must push/pop]
4142
;
4243
; Each CPUID leaf is executed exactly once. The caller-saved scratch
4344
; registers r8-r11 need no push/pop; only r12 (callee-saved) does.
@@ -65,7 +66,7 @@ _init_cpu_features:
6566
cpuid
6667
mov r10d, ecx ; r10d = leaf 1 ECX
6768

68-
; === Leaf 7 (if available) — AVX2, AVX-512F/BW, BMI2, GFNI, VBMI ===
69+
; === Leaf 7 (if available) — AVX2, AVX-512F/BW, BMI2, GFNI, VBMI, VBMI2 ===
6970
cmp r9d, 7
7071
jb .no_leaf7
7172
mov eax, 7
@@ -118,9 +119,17 @@ _init_cpu_features:
118119
cmp dword [cpu_tier], 3
119120
jl .check_bmi2
120121
test r12d, (1 << 1)
121-
jz .check_bmi2
122+
jz .check_vbmi2
122123
or dword [cpu_features], (1 << 4)
123124

125+
.check_vbmi2:
126+
; VBMI2 (bit 6): leaf 7 ECX bit 6 — only useful when cpu_tier == 3
127+
cmp dword [cpu_tier], 3
128+
jl .check_bmi2
129+
test r12d, (1 << 6)
130+
jz .check_bmi2
131+
or dword [cpu_features], (1 << 6)
132+
124133
.check_bmi2:
125134
; BMI2 (bit 2): leaf 7 EBX bit 8
126135
test r11d, (1 << 8)

0 commit comments

Comments
 (0)