Skip to content

Commit 8d632cf

Browse files
mvogttech and claude committed
fix: replace dual-stream unmask loop with single-stream 8x unroll
The dual-stream AVX-512 unmask loop used the same zmm0 mask vector for both front and back streams. After a misaligned prologue rotates the mask, the back stream starts at a different offset % 4 — producing wrong XOR bytes. Replace with a single-stream 8x-unrolled loop matching ws_mask's proven pattern.

Fixes: "512 KB unmask, misaligned buffer — NT prologue mask cycling"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1088461 commit 8d632cf

1 file changed

Lines changed: 16 additions & 26 deletions

File tree

src/ws_mask_asm.asm

Lines changed: 16 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -620,49 +620,39 @@ ws_unmask:
620620
vpbroadcastd zmm0, r8d
621621

622622
.u_512_aligned:
623-
; Dual-stream: process 256 bytes from front + 256 from back per iteration.
624-
; Two independent memory streams increase page-level parallelism and
625-
; TLB coverage for large in-place buffers.
623+
; 8x unrolled: 512 bytes/iter (rdi now 64-byte aligned, in-place)
626624
mov rax, rcx
627625
shr rax, 9 ; iterations = len / 512
628626
jz .u_512_tail
629627

630-
lea r11, [rdi + rcx - 256] ; r11 = back pointer (last 256-byte block)
631-
632628
align 32
633-
.u_dual_512:
634-
; Memory-operand VPXORD fuses load+XOR; OoO engine overlaps stores naturally.
635-
prefetcht0 [rdi + 1024]
636-
prefetcht0 [r11 - 768]
629+
.u_512_512:
630+
; 8x unrolled: memory-operand VPXORD fuses load+XOR into one instruction.
631+
; OoO engine (Zen 4: 320-entry ROB) overlaps loads and stores naturally.
632+
prefetcht0 [rdi + 2048]
637633

638-
; Front 256 bytes
639634
vpxord zmm1, zmm0, [rdi]
640635
vpxord zmm2, zmm0, [rdi + 64]
641636
vpxord zmm3, zmm0, [rdi + 128]
642637
vpxord zmm4, zmm0, [rdi + 192]
638+
vpxord zmm5, zmm0, [rdi + 256]
639+
vpxord zmm6, zmm0, [rdi + 320]
640+
vpxord zmm7, zmm0, [rdi + 384]
641+
vpxord zmm8, zmm0, [rdi + 448]
643642
vmovdqu64 [rdi], zmm1
644643
vmovdqu64 [rdi + 64], zmm2
645644
vmovdqu64 [rdi + 128], zmm3
646645
vmovdqu64 [rdi + 192], zmm4
646+
vmovdqu64 [rdi + 256], zmm5
647+
vmovdqu64 [rdi + 320], zmm6
648+
vmovdqu64 [rdi + 384], zmm7
649+
vmovdqu64 [rdi + 448], zmm8
647650

648-
; Back 256 bytes
649-
vpxord zmm5, zmm0, [r11]
650-
vpxord zmm6, zmm0, [r11 + 64]
651-
vpxord zmm7, zmm0, [r11 + 128]
652-
vpxord zmm8, zmm0, [r11 + 192]
653-
vmovdqu64 [r11], zmm5
654-
vmovdqu64 [r11 + 64], zmm6
655-
vmovdqu64 [r11 + 128], zmm7
656-
vmovdqu64 [r11 + 192], zmm8
657-
658-
add rdi, 256
659-
sub r11, 256
651+
add rdi, 512
660652
dec rax
661-
jnz .u_dual_512
653+
jnz .u_512_512
662654

663-
; Remaining middle bytes: (r11 + 256) - rdi
664-
lea rcx, [r11 + 256]
665-
sub rcx, rdi
655+
and rcx, 511
666656

667657
.u_512_tail:
668658
; Handle remaining 0-511 bytes — full 64-byte chunks, then opmask tail

0 commit comments

Comments (0)