Skip to content

Commit 43c7a3e

Browse files
mvogttech and claude committed
perf: deepen AVX-512 cached loop to 8x unroll (512 bytes/iter)
Double the unroll from 4x zmm (256B) to 8x zmm (512B) per iteration, using zmm1-zmm8. Halves loop overhead and gives the reorder buffer more independent operations to schedule. Prefetch distance increased to 2048 bytes to match the wider stride. Tail path unchanged — handles 0-511 byte remainder via 64-byte chunks + opmask. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 5fcdd8b commit 43c7a3e

1 file changed

Lines changed: 43 additions & 18 deletions

File tree

src/ws_mask_asm.asm

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -90,41 +90,53 @@ ws_mask:
9090
cmp rcx, (1 << 23) ; >= 8MB → NT path
9191
jae .m_nt512
9292

93-
; 4x unrolled: 256 bytes/iter
93+
; 8x unrolled: 512 bytes/iter
9494
mov rax, rcx
95-
shr rax, 8
95+
shr rax, 9
9696
test rax, rax
9797
jz .m_512_tail
9898

9999
align 32
100-
.m_512_256:
101-
prefetcht0 [rdi + 1024]
100+
.m_512_512:
101+
prefetcht0 [rdi + 2048]
102102
vmovdqu64 zmm1, [rdi]
103103
vmovdqu64 zmm2, [rdi + 64]
104104
vmovdqu64 zmm3, [rdi + 128]
105105
vmovdqu64 zmm4, [rdi + 192]
106+
vmovdqu64 zmm5, [rdi + 256]
107+
vmovdqu64 zmm6, [rdi + 320]
108+
vmovdqu64 zmm7, [rdi + 384]
109+
vmovdqu64 zmm8, [rdi + 448]
106110
vpxord zmm1, zmm1, zmm0
107111
vpxord zmm2, zmm2, zmm0
108112
vpxord zmm3, zmm3, zmm0
109113
vpxord zmm4, zmm4, zmm0
114+
vpxord zmm5, zmm5, zmm0
115+
vpxord zmm6, zmm6, zmm0
116+
vpxord zmm7, zmm7, zmm0
117+
vpxord zmm8, zmm8, zmm0
110118
vmovdqu64 [rdx], zmm1
111119
vmovdqu64 [rdx + 64], zmm2
112120
vmovdqu64 [rdx + 128], zmm3
113121
vmovdqu64 [rdx + 192], zmm4
114-
add rdi, 256
115-
add rdx, 256
122+
vmovdqu64 [rdx + 256], zmm5
123+
vmovdqu64 [rdx + 320], zmm6
124+
vmovdqu64 [rdx + 384], zmm7
125+
vmovdqu64 [rdx + 448], zmm8
126+
add rdi, 512
127+
add rdx, 512
116128
dec rax
117-
jnz .m_512_256
129+
jnz .m_512_512
118130

119-
and rcx, 255
131+
and rcx, 511
120132

121133
.m_512_tail:
122-
; Handle remaining 0-255 bytes — full 64-byte chunks, then opmask tail
134+
; Handle remaining 0-511 bytes — full 64-byte chunks, then opmask tail
123135
test rcx, rcx
124136
jz .m_512_done
125137

126138
mov rax, rcx
127-
shr rax, 6 ; full 64-byte chunks (0-3)
139+
shr rax, 6 ; full 64-byte chunks (0-7)
128140
jz .m_512_final
129141

130142
.m_512_full64:
@@ -462,38 +474,51 @@ ws_unmask:
462474
cmp rcx, (1 << 23) ; >= 8MB → NT path
463475
jae .u_nt512
464476

477+
; 8x unrolled: 512 bytes/iter
465478
mov rax, rcx
466-
shr rax, 8
479+
shr rax, 9
467480
test rax, rax
468481
jz .u_512_tail
469482

470483
align 32
471-
.u_512_256:
472-
prefetcht0 [rdi + 1024]
484+
.u_512_512:
485+
prefetcht0 [rdi + 2048]
473486
vmovdqu64 zmm1, [rdi]
474487
vmovdqu64 zmm2, [rdi + 64]
475488
vmovdqu64 zmm3, [rdi + 128]
476489
vmovdqu64 zmm4, [rdi + 192]
490+
vmovdqu64 zmm5, [rdi + 256]
491+
vmovdqu64 zmm6, [rdi + 320]
492+
vmovdqu64 zmm7, [rdi + 384]
493+
vmovdqu64 zmm8, [rdi + 448]
477494
vpxord zmm1, zmm1, zmm0
478495
vpxord zmm2, zmm2, zmm0
479496
vpxord zmm3, zmm3, zmm0
480497
vpxord zmm4, zmm4, zmm0
498+
vpxord zmm5, zmm5, zmm0
499+
vpxord zmm6, zmm6, zmm0
500+
vpxord zmm7, zmm7, zmm0
501+
vpxord zmm8, zmm8, zmm0
481502
vmovdqu64 [rdi], zmm1
482503
vmovdqu64 [rdi + 64], zmm2
483504
vmovdqu64 [rdi + 128], zmm3
484505
vmovdqu64 [rdi + 192], zmm4
485-
add rdi, 256
506+
vmovdqu64 [rdi + 256], zmm5
507+
vmovdqu64 [rdi + 320], zmm6
508+
vmovdqu64 [rdi + 384], zmm7
509+
vmovdqu64 [rdi + 448], zmm8
510+
add rdi, 512
486511
dec rax
487-
jnz .u_512_256
488-
and rcx, 255
512+
jnz .u_512_512
513+
and rcx, 511
489514

490515
.u_512_tail:
491-
; Handle remaining 0-255 bytes — full 64-byte chunks, then opmask tail
516+
; Handle remaining 0-511 bytes — full 64-byte chunks, then opmask tail
492517
test rcx, rcx
493518
jz .u_512_done
494519

495520
mov rax, rcx
496-
shr rax, 6 ; full 64-byte chunks (0-3)
521+
shr rax, 6 ; full 64-byte chunks (0-7)
497522
jz .u_512_final
498523

499524
.u_512_full64:

0 commit comments

Comments (0)