Skip to content

Commit e0269c4

Browse files
mvogttech and claude committed
fix: use rotated mask register in scalar byte tails
The scalar tails (.m_bytes, .u_bytes, .gf_bytes) read the mask for the second and third tail bytes from [rsi+1] and [rsi+2] (the original mask pointer) instead of from the rotated r8d register. After the AVX2 alignment prologues rotate the mask for non-multiple-of-4 offsets, those memory reads produce wrong XOR bytes. Fix all three tails to advance r8d via `ror r8d, 8` before each subsequent byte and use r8b consistently.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8d632cf commit e0269c4

1 file changed

Lines changed: 12 additions & 8 deletions

File tree

src/ws_mask_asm.asm

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -500,13 +500,15 @@ ws_mask:
500500
mov [rdx], al
501501
dec rcx
502502
jz .m_ret
503+
ror r8d, 8
503504
mov al, [rdi + 1]
504-
xor al, byte [rsi + 1]
505+
xor al, r8b
505506
mov [rdx + 1], al
506507
dec rcx
507508
jz .m_ret
509+
ror r8d, 8
508510
mov al, [rdi + 2]
509-
xor al, byte [rsi + 2]
511+
xor al, r8b
510512
mov [rdx + 2], al
511513
.m_ret:
512514
ret
@@ -928,12 +930,12 @@ ws_unmask:
928930
xor byte [rdi], r8b
929931
dec rcx
930932
jz .u_ret
931-
mov al, byte [rsi + 1]
932-
xor byte [rdi + 1], al
933+
ror r8d, 8
934+
xor byte [rdi + 1], r8b
933935
dec rcx
934936
jz .u_ret
935-
mov al, byte [rsi + 2]
936-
xor byte [rdi + 2], al
937+
ror r8d, 8
938+
xor byte [rdi + 2], r8b
937939
.u_ret:
938940
ret
939941

@@ -2098,13 +2100,15 @@ ws_mask_gfni:
20982100
mov [rdx], al
20992101
dec rcx
21002102
jz .gf_ret
2103+
ror r8d, 8
21012104
mov al, [rdi + 1]
2102-
xor al, byte [rsi + 1]
2105+
xor al, r8b
21032106
mov [rdx + 1], al
21042107
dec rcx
21052108
jz .gf_ret
2109+
ror r8d, 8
21062110
mov al, [rdi + 2]
2107-
xor al, byte [rsi + 2]
2111+
xor al, r8b
21082112
mov [rdx + 2], al
21092113
.gf_ret:
21102114
ret

0 commit comments

Comments
 (0)