mvogttech
diff --git a/‎bench/index.js‎
Lines changed: 129 additions & 0 deletions b/‎bench/index.js‎
Lines changed: 129 additions & 0 deletions
diff --git a/‎binding.gyp‎
Lines changed: 10 additions & 1 deletion b/‎binding.gyp‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎src/ws_base64_asm.asm‎
Lines changed: 90 additions & 6 deletions b/‎src/ws_base64_asm.asm‎
Lines changed: 90 additions & 6 deletions
diff --git a/‎src/ws_cpu.asm‎
Lines changed: 12 additions & 3 deletions b/‎src/ws_cpu.asm‎
Lines changed: 12 additions & 3 deletions
@@ -195,6 +195,51 @@ for (const { name, size } of sizes) {
   );
 }
 
+// ── GFNI experiment: mask vs maskGfni (AVX-512 VPXORD baseline) ────────────
+// Times asmUtil.mask against asmUtil.maskGfni on identical buffers, per size.
+if (asmUtil?.maskGfni && (asmUtil.cpuFeatures & 1) !== 0) {
+  const gfniBanner = [
+    '\n=== GFNI Experiment: mask vs maskGfni ===',
+    'GF2P8AFFINEQB evaluated for XOR masking — cannot replace VPXORD',
+    'for multi-byte patterns (imm8 is 1 byte, mask is 4). Both use VPXORD.',
+    'This benchmark confirms identical performance.\n',
+  ];
+  for (const line of gfniBanner) console.log(line);
+
+  const gfniHeader = [
+    'Size'.padEnd(10),
+    '  mask ops/s'.padEnd(14),
+    '  gfni ops/s'.padEnd(14),
+    '  gfni/mask'.padEnd(11),
+    '  spread',
+  ];
+  console.log(gfniHeader.join(''));
+  console.log('\u2500'.repeat(60));
+
+  // [row label, payload size in bytes] pairs, smallest first.
+  const gfniCases = [
+    ['64 B', 64], ['256 B', 256], ['1 KB', 1024],
+    ['16 KB', 16384], ['64 KB', 65536], ['1 MB', 1048576],
+  ];
+
+  for (const [label, byteCount] of gfniCases) {
+    const source = crypto.randomBytes(byteCount);
+    const mask = crypto.randomBytes(4);
+    const output = Buffer.alloc(byteCount);
+    const runMask = () => asmUtil.mask(source, mask, output, 0, byteCount);
+    const runGfni = () => asmUtil.maskGfni(source, mask, output, 0, byteCount);
+
+    collectGarbage();
+    const baseline = benchmark(runMask);
+    collectGarbage();
+    const candidate = benchmark(runGfni);
+
+    const leadCells = label.padEnd(10) + fmtOps(baseline.median) + fmtOps(candidate.median);
+    const ratioCell = fmtSpeedup(candidate.median, baseline.median).padStart(11);
+    console.log(leadCells + ratioCell + ('  ' + fmtRange(candidate)));
+  }
+}
+
 // ── Unmask benchmark ────────────────────────────────────────────────────────
 
 console.log('\n=== WebSocket Unmask ===');
@@ -416,6 +461,90 @@ if (asmUtil) {
   }
 }
 
+// ── UTF-8 validation benchmark ─────────────────────────────────────────────
+// Compares asmUtil.utf8Validate with a fatal TextDecoder on the same buffers.
+if (asmUtil?.utf8Validate) {
+  console.log('\n=== UTF-8 Validation (ASM vs TextDecoder) ===');
+  console.log(
+    'Size'.padEnd(10) +
+    '  TextDecoder'.padEnd(14) +
+    '  ASM ops/s'.padEnd(14) +
+    '  ASM MB/s'.padEnd(12) +
+    '  vs TD'.padEnd(9) +
+    '  spread'
+  );
+  console.log('\u2500'.repeat(70));
+
+  const decoder = new TextDecoder('utf-8', { fatal: true }); // fatal: decode() throws on invalid UTF-8
+
+  // ASCII text (fast path — the common case for English WebSocket traffic)
+  const asciiSizes = [
+    { name: '64 B',   size: 64 },
+    { name: '256 B',  size: 256 },
+    { name: '1 KB',   size: 1024 },
+    { name: '16 KB',  size: 16384 },
+    { name: '64 KB',  size: 65536 },
+    { name: '256 KB', size: 262144 },
+    { name: '1 MB',   size: 1048576 },
+  ];
+
+  console.log('  -- ASCII text (SIMD fast path) --');
+  for (const { name, size } of asciiSizes) {
+    // Fill with printable ASCII (0x20-0x7E), cycling through all 95 values
+    const data = Buffer.alloc(size);
+    for (let i = 0; i < size; i++) data[i] = 0x20 + (i % 95);
+
+    collectGarbage();
+    const tdRes = benchmark(() => { try { decoder.decode(data); } catch {} });
+    collectGarbage();
+    const asmRes = benchmark(() => asmUtil.utf8Validate(data));
+
+    console.log(
+      name.padEnd(10) +
+      fmtOps(tdRes.median) +
+      fmtOps(asmRes.median) +
+      fmtThroughput(asmRes.median, size).padStart(12) +
+      fmtSpeedup(asmRes.median, tdRes.median).padStart(9) +
+      ('  ' + fmtRange(asmRes))
+    );
+  }
+
+  // Mixed UTF-8 text (Japanese, 3 bytes per character — no ASCII bytes at all)
+  const mixedText = '\u3053\u3093\u306b\u3061\u306f\u4e16\u754c'; // "こんにちは世界"
+  const mixedSizes = [
+    { name: '64 B',   size: 64 },
+    { name: '256 B',  size: 256 },
+    { name: '1 KB',   size: 1024 },
+    { name: '16 KB',  size: 16384 },
+  ];
+
+  console.log('  -- Mixed UTF-8 (Japanese, 3-byte sequences) --');
+  for (const { name, size } of mixedSizes) {
+    const base = Buffer.from(mixedText.repeat(Math.ceil(size / Buffer.byteLength(mixedText))));
+    const data = base.subarray(0, size);
+    // Cutting at `size` can split a character — drop trailing bytes until the buffer decodes cleanly
+    let trimmed = data;
+    while (trimmed.length > 0) {
+      try { decoder.decode(trimmed); break; } catch { trimmed = trimmed.subarray(0, trimmed.length - 1); }
+    }
+    if (trimmed.length === 0) continue;
+
+    collectGarbage();
+    const tdRes = benchmark(() => { try { decoder.decode(trimmed); } catch {} });
+    collectGarbage();
+    const asmRes = benchmark(() => asmUtil.utf8Validate(trimmed));
+
+    console.log(
+      name.padEnd(10) +
+      fmtOps(tdRes.median) +
+      fmtOps(asmRes.median) +
+      fmtThroughput(asmRes.median, trimmed.length).padStart(12) + // actual byte count after trimming
+      fmtSpeedup(asmRes.median, tdRes.median).padStart(9) +
+      ('  ' + fmtRange(asmRes))
+    );
+  }
+}
+
 console.log('\n' + '\u2500'.repeat(80));
 console.log('Config: warmup=' + WARMUP_MS + 'ms, sample=' + SAMPLE_MS + 'ms, samples=' + SAMPLES);
 console.log('Median of ' + SAMPLES + ' samples shown. Spread = (max-min)/median.');
 
@@ -50,14 +50,23 @@
               "action": ["nasm", "-f", "elf64",
                          "-o", "<(INTERMEDIATE_DIR)/ws_crc32_asm.o",
                          "src/ws_crc32_asm.asm"]
+            },
+            {
+              "action_name": "assemble_utf8",
+              "inputs":  ["src/ws_utf8_asm.asm"],
+              "outputs": ["<(INTERMEDIATE_DIR)/ws_utf8_asm.o"],
+              "action": ["nasm", "-f", "elf64",
+                         "-o", "<(INTERMEDIATE_DIR)/ws_utf8_asm.o",
+                         "src/ws_utf8_asm.asm"]
             }
           ],
           "link_settings": {
             "libraries": [
               "<(INTERMEDIATE_DIR)/ws_cpu.o",
               "<(INTERMEDIATE_DIR)/ws_mask_asm.o",
               "<(INTERMEDIATE_DIR)/ws_base64_asm.o",
-              "<(INTERMEDIATE_DIR)/ws_crc32_asm.o"
+              "<(INTERMEDIATE_DIR)/ws_crc32_asm.o",
+              "<(INTERMEDIATE_DIR)/ws_utf8_asm.o"
             ]
           }
         }]
 
@@ -1,4 +1,4 @@
-; ws_base64_asm.asm — Base64 encoder with AVX2/GFNI/SSE2/scalar dispatch
+; ws_base64_asm.asm — Base64 encoder with AVX-512 VBMI2/VBMI/AVX2/SSE2/scalar dispatch
 ;
 ; C signature:
 ;   size_t ws_base64_encode(const uint8_t *in, size_t len, uint8_t *out);
@@ -12,10 +12,11 @@
 ;   rax = number of output bytes written (always ceil(len/3)*4)
 ;
 ; Dispatch order (fastest-available first):
-;   1. cpu_tier >= 3 + VBMI (bit 4) -> .avx512vbmi_path (24 in -> 32 out / iter, VPERMB)
-;   2. cpu_tier >= 2 (AVX2)         -> .avx2_path        (24 in -> 32 out / iter)
-;   3. cpu_tier >= 1 (SSE2)         -> .sse2_path         (12 in -> 16 out / iter)
-;   4. fallback                     -> .scalar_path        ( 3 in ->  4 out / iter)
+;   1. cpu_tier >= 3 + VBMI2 (bit 6) -> .avx512vbmi2_path (24 in -> 32 out / iter, VPMULTISHIFTQB)
+;   2. cpu_tier >= 3 + VBMI  (bit 4) -> .avx512vbmi_path  (24 in -> 32 out / iter, VPERMB)
+;   3. cpu_tier >= 2 (AVX2)          -> .avx2_path         (24 in -> 32 out / iter)
+;   4. cpu_tier >= 1 (SSE2)          -> .sse2_path          (12 in -> 16 out / iter)
+;   5. fallback                      -> .scalar_path         ( 3 in ->  4 out / iter)
 ;
 ; Algorithm: Klomp/Muła VPSHUFB method (vectorised base64 encoding)
 ;
@@ -185,6 +186,31 @@ section .data
     align 32
     b64_const_p03:  times 32 db 3      ; +3  correction ('+'->'/' boundary)
 
+    ; -------------------------------------------------------------------------
+    ; Control bytes for VPMULTISHIFTQB, consumed by .avx512vbmi2_path.
+    ; The instruction sees the output of VPSHUFB with b64_shuf: every dword is
+    ; [B, A, C, B] for one input group A,B,C, so a qword carries two groups
+    ; [B0,A0,C0,B0, B1,A1,C1,B1] occupying these bit ranges:
+    ;   B0=[7:0], A0=[15:8], C0=[23:16], B0'=[31:24],
+    ;   B1=[39:32], A1=[47:40], C1=[55:48], B1'=[63:56]
+    ;
+    ; Each control byte is a bit offset (mod 64); VPMULTISHIFTQB returns the 8
+    ; bits starting there, and the low 6 of them form the base64 index.
+    ;
+    ; Offsets for one group [A,B,C]:
+    ;   i0 = A >> 2            offset 10  (index bits [15:10])
+    ;   i1 = (A&3)<<4 | B>>4   offset  4  (index bits [9:4])
+    ;   i2 = (B&F)<<2 | C>>6   offset 22  (index bits [27:22])
+    ;   i3 = C & 0x3F          offset 16  (index bits [21:16])
+    ;
+    ; The second group lies 32 bits higher, written below as "32 + offset".
+    align 32
+    b64_vbmi2_shifts:
+        db 10, 4, 22, 16, 32 + 10, 32 + 4, 32 + 22, 32 + 16
+        db 10, 4, 22, 16, 32 + 10, 32 + 4, 32 + 22, 32 + 16
+        db 10, 4, 22, 16, 32 + 10, 32 + 4, 32 + 22, 32 + 16
+        db 10, 4, 22, 16, 32 + 10, 32 + 4, 32 + 22, 32 + 16
+
     ; -------------------------------------------------------------------------
     ; Standard 64-character base64 alphabet (RFC 4648 §4).
     align 64
@@ -226,7 +252,9 @@ ws_base64_encode:
     ; ------------------------------------------------------------------
     cmp  dword [cpu_tier], 3
     jl   .b64_check_avx2
-    test dword [cpu_features], (1 << 4)   ; VBMI bit
+    test dword [cpu_features], (1 << 6)   ; VBMI2 bit
+    jnz  .avx512vbmi2_path
+    test dword [cpu_features], (1 << 4)   ; VBMI bit (fallback)
     jnz  .avx512vbmi_path
 
 .b64_check_avx2:
@@ -239,6 +267,62 @@ ws_base64_encode:
     jmp  .scalar_path
 
 
+; ============================================================================
+; AVX-512 VBMI2 PATH — 24 input bytes -> 32 output characters per iteration
+;
+; Collapses the Klomp/Mula index extraction into a single instruction:
+;   VPMULTISHIFTQB — reads eight 8-bit fields from every qword at the bit
+;                    offsets listed in b64_vbmi2_shifts
+;
+; The existing b64_shuf table produces [B,A,C,B] per dword.  Within each qword
+; (two groups), the control bytes [10,4,22,16, 42,36,54,48] put the four 6-bit
+; base64 indices of each group into the low 6 bits of the result bytes.
+;
+; No AND 0x3F is needed afterwards: the 512-bit VPERMB that maps indices to
+; ASCII via b64_table only reads bits [5:0] of each index byte.
+;
+; ISA used: AVX512_VBMI (VPMULTISHIFTQB, VPERMB) + AVX512VL for the ymm form.
+; NOTE(review): dispatch gates on cpu_features bit 6 (VBMI2) — confirm it implies both.
+; ============================================================================
+    align 32
+.avx512vbmi2_path:
+    vmovdqa64  zmm9, [b64_table]          ; 64-byte base64 LUT
+    vmovdqa    ymm10, [b64_vbmi2_shifts]  ; shift control vector (32 bytes)
+
+    align 32
+.avx512vbmi2_loop:
+    ; Guard: the two loads below touch 28 bytes from the cursor; require 32
+    ; bytes remaining (r13 - r15) before running a vector iteration.
+    mov  rax, r13
+    sub  rax, r15
+    cmp  rax, 32
+    jl   .avx512vbmi2_tail
+
+    ; ---- Step 1: Load two 16-byte lanes at offsets 0 and 12 (24 consumed) ----
+    vmovdqu     xmm0, [r12 + r15]
+    vinserti128 ymm0, ymm0, [r12 + r15 + 12], 1
+
+    ; ---- Step 2: Shuffle to [B,A,C,B] per dword ----
+    vpshufb    ymm0, ymm0, [b64_shuf]
+
+    ; ---- Step 3: Extract index fields (bits [7:6] of each byte are don't-care) ----
+    vpmultishiftqb ymm1, ymm10, ymm0     ; control = ymm10, data = ymm0
+
+    ; ---- Step 4: Map index -> ASCII via VPERMB (uses index bits [5:0] only) ----
+    vpermb     zmm1, zmm1, zmm9          ; zmm1[i] = b64_table[zmm1[i] & 63]
+
+    ; ---- Step 5: Store 32 output bytes ----
+    vmovdqu    [r14 + rbx], ymm1
+
+    add  r15, 24
+    add  rbx, 32
+    jmp  .avx512vbmi2_loop
+
+.avx512vbmi2_tail:
+    SAFE_VZEROUPPER
+    jmp  .scalar_path
+
+
 ; ============================================================================
 ; AVX-512VBMI PATH — 24 input bytes -> 32 output characters per iteration
 ;
 
@@ -14,6 +14,7 @@
 ;   bit 3 = LZCNT     (CPUID.0x80000001:ECX[5])
 ;   bit 4 = VBMI      (CPUID.7.0:ECX[1], only set when cpu_tier == 3)
 ;   bit 5 = AMD vendor (skip vzeroupper — no SSE/AVX transition penalty)
+;   bit 6 = VBMI2     (CPUID.7.0:ECX[6], only set when cpu_tier == 3)
 
 BITS 64
 DEFAULT REL
@@ -37,7 +38,7 @@ global _init_cpu_features
 ;   r9d  = max basic leaf (from CPUID leaf 0)
 ;   r10d = leaf 1 ECX  (OSXSAVE, PCLMULQDQ)
 ;   r11d = leaf 7 EBX  (AVX2, AVX-512F/BW, BMI2)
-;   r12d = leaf 7 ECX  (GFNI, VBMI)   [callee-saved — must push/pop]
+;   r12d = leaf 7 ECX  (GFNI, VBMI, VBMI2)  [callee-saved — must push/pop]
 ;
 ; Each CPUID leaf is executed exactly once.  The caller-saved scratch
 ; registers r8-r11 need no push/pop; only r12 (callee-saved) does.
@@ -65,7 +66,7 @@ _init_cpu_features:
     cpuid
     mov r10d, ecx                   ; r10d = leaf 1 ECX
 
-    ; === Leaf 7 (if available) — AVX2, AVX-512F/BW, BMI2, GFNI, VBMI ===
+    ; === Leaf 7 (if available) — AVX2, AVX-512F/BW, BMI2, GFNI, VBMI, VBMI2 ===
     cmp r9d, 7
     jb .no_leaf7
     mov eax, 7
@@ -118,9 +119,17 @@ _init_cpu_features:
     cmp dword [cpu_tier], 3
     jl .check_bmi2
     test r12d, (1 << 1)
-    jz .check_bmi2
+    jz .check_vbmi2
     or dword [cpu_features], (1 << 4)
 
+.check_vbmi2:
+    ; VBMI2 (bit 6): leaf 7 ECX bit 6.  Recorded only together with VBMI (bit 4):
+    test dword [cpu_features], (1 << 4)  ; the base64 path gated on bit 6 runs VPERMB
+    jz .check_bmi2                       ; and VPMULTISHIFTQB, both AVX512_VBMI
+    test r12d, (1 << 6)                  ; bit 4 set above already implies cpu_tier == 3
+    jz .check_bmi2
+    or dword [cpu_features], (1 << 6)
+
 .check_bmi2:
     ; BMI2 (bit 2): leaf 7 EBX bit 8
     test r11d, (1 << 8)