2424 !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
2525/* simpasm: header-end */
2626
27- #define in %rsi
28- #define out %rdi
29- #define len %rdx
30- #define tab %rcx
31-
32- #define cnt %rax
33- #define pos %r8
34-
35- #define good %r11
36- #define pext_mask %r9
37- #define table_idx %r10
38-
39- #define bound %xmm0
40- #define temp0 %xmm1
41- #define temp1 %xmm6
42- #define vals %xmm2
43- #define shuffle_out_mask %xmm3
44- #define shuffle_in_mask %xmm4
45- #define and_mask %xmm5
27+ #define MLK_IN %rsi
28+ #define MLK_OUT %rdi
29+ #define MLK_LEN %rdx
30+ #define MLK_TAB %rcx
31+
32+ #define MLK_CNT %rax
33+ #define MLK_POS %r8
34+
35+ #define MLK_GOOD %r11
36+ #define MLK_PEXT_MASK %r9
37+ #define MLK_TABLE_IDX %r10
38+
39+ #define MLK_BOUND %xmm0
40+ #define MLK_TEMP0 %xmm1
41+ #define MLK_TEMP1 %xmm6
42+ #define MLK_VALS %xmm2
43+ #define MLK_SHUFFLE_OUT_MASK %xmm3
44+ #define MLK_SHUFFLE_IN_MASK %xmm4
45+ #define MLK_AND_MASK %xmm5
4646
4747// High level overview of the algorithm:
4848// For every 96 bits (12 bytes) of the input:
4949// 1. Split 96 bits into eight 12-bit integers where each integer
50- // occupies a corresponding 16-bit element of `vals ` xmm register,
51- // 2. Compute an 8-bit value `good ` such that
52- // good [i] = vals [i] < MLKEM_Q ? 1 : 0, for i in [0, 7],
53- // 3. Shuffle the elements in `vals ` such that all good elements
50+ // occupies a corresponding 16-bit element of `MLK_VALS ` xmm register,
51+ // 2. Compute an 8-bit value `MLK_GOOD ` such that
52+ // MLK_GOOD [i] = MLK_VALS [i] < MLKEM_Q ? 1 : 0, for i in [0, 7],
53+ // 3. Shuffle the elements in `MLK_VALS ` such that all good elements
5454// are ordered consecutively, and store them.
5555//
5656// Notes:
57- // - We exit early if we find the required number of good values,
57+ // - We exit early if we find the required number of good values,
5858// - We use the stack as a temporary storage and copy to the actual
5959// output buffer only in the end. This is because the algorithm
6060// can overwrite up to 14 bytes (we use 16B for alignment),
6868MLK_ASM_FN_SYMBOL(rej_uniform_asm)
6969 subq $MLK_STACK_SIZE, %rsp
7070
71- // Broadcast MLKEM_Q (3329) to all 16-bit elements of bound reg.
71+ // Broadcast MLKEM_Q (3329) to all 16-bit elements of MLK_BOUND reg.
7272 movq $0x0D010D010D010D01 , %rax
73- movq %rax , bound
74- pinsrq $1 , %rax , bound
73+ movq %rax , MLK_BOUND
74+ pinsrq $1 , %rax , MLK_BOUND
7575
76- // Broadcast 12-bit mask 0xFFF to all 16-bit elements of bound reg.
76+ // Broadcast 12-bit mask 0xFFF to all 16-bit elements of MLK_AND_MASK reg.
7777 movq $0x0FFF0FFF0FFF0FFF , %rax
78- movq %rax , and_mask
79- pinsrq $1 , %rax , and_mask
78+ movq %rax , MLK_AND_MASK
79+ pinsrq $1 , %rax , MLK_AND_MASK
8080
8181 // Load shuffle mask:
8282 // 0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11.
8383 movq $0x0504040302010100 , %rax
84- movq %rax , shuffle_in_mask
84+ movq %rax , MLK_SHUFFLE_IN_MASK
8585 movq $0x0B0A0A0908070706 , %rax
86- pinsrq $1 , %rax , shuffle_in_mask
86+ pinsrq $1 , %rax , MLK_SHUFFLE_IN_MASK
8787
88- movq $0 , cnt // cnt counts the number of good values we've found.
89- movq $0 , pos // pos is the current position in the input buffer.
90- movq $0x5555 , pext_mask // 0x5555 mask to extract every second bit.
88+ movq $0 , MLK_CNT // MLK_CNT counts the number of MLK_GOOD values we've found.
89+ movq $0 , MLK_POS // MLK_POS is the current position in the input buffer.
90+ movq $0x5555 , MLK_PEXT_MASK // 0x5555 mask to extract every second bit.
9191
9292rej_uniform_asm_loop_start:
9393 // 1. Split 96 bits into eight 12-bit integers where each integer occupies its own 16-bit element.
94- // We explain the algorithm by considering the lowest 64 bits of vals .
95- movdqu (in , pos ), vals
96- // vals : [ 63..48 | 47..32 | 31..16 | 15..0 ]
97- pshufb shuffle_in_mask, vals
98- // vals : [ 47..32 | 39..24 | 23..8 | 15..0 ]
99- movdqa vals, temp1
94+ // We explain the algorithm by considering the lowest 64 bits of MLK_VALS .
95+ movdqu (MLK_IN, MLK_POS ), MLK_VALS
96+ // MLK_VALS : [ 63..48 | 47..32 | 31..16 | 15..0 ]
97+ pshufb MLK_SHUFFLE_IN_MASK, MLK_VALS
98+ // MLK_VALS : [ 47..32 | 39..24 | 23..8 | 15..0 ]
99+ movdqa MLK_VALS, MLK_TEMP1
100100 // temp: [ 47..32 | 39..24 | 23..8 | 15..0 ]
101- psrlw $4 , temp1
101+ psrlw $4 , MLK_TEMP1
102102 // temp: [ 47..36 | 39..28 | 23..12 | 15..4 ]
103- pblendw $0xAA , temp1, vals
104- // vals : [ 47..36 | 39..24 | 23..12 | 15..0]
105- pand and_mask, vals
106- // vals : [ 47..36 | 35..24 | 23..12 | 12..0]
107-
108- // 2. Compute an 8-bit value `good ` such that
109- // good [i] = vals [i] < MLKEM_Q ? 1 : 0, for i in [0, 7],
110- movdqa bound , temp0
111- pcmpgtw vals, temp0
112- pmovmskb temp0, good
113- pext pext_mask, good, good
114-
115- // 3. Shuffle the elements in `vals ` such that all good elements
103+ pblendw $0xAA , MLK_TEMP1, MLK_VALS
104+ // MLK_VALS : [ 47..36 | 39..24 | 23..12 | 15..0]
105+ pand MLK_AND_MASK, MLK_VALS
106+ // MLK_VALS : [ 47..36 | 35..24 | 23..12 | 11..0]
107+
108+ // 2. Compute an 8-bit value `MLK_GOOD ` such that
109+ // MLK_GOOD [i] = MLK_VALS [i] < MLKEM_Q ? 1 : 0, for i in [0, 7],
110+ movdqa MLK_BOUND, MLK_TEMP0
111+ pcmpgtw MLK_VALS, MLK_TEMP0
112+ pmovmskb MLK_TEMP0, MLK_GOOD
113+ pext MLK_PEXT_MASK, MLK_GOOD, MLK_GOOD
114+
115+ // 3. Shuffle the elements in `MLK_VALS ` such that all MLK_GOOD elements
116116 // are ordered consecutively, and store them.
117- movq good, table_idx
118- shl $4 , table_idx
119- movdqu (tab, table_idx ), shuffle_out_mask
120- pshufb shuffle_out_mask, vals
121- movdqu vals , (%rsp , cnt , 2 )
117+ movq MLK_GOOD, MLK_TABLE_IDX
118+ shl $4 , MLK_TABLE_IDX
119+ movdqu (MLK_TAB, MLK_TABLE_IDX ), MLK_SHUFFLE_OUT_MASK
120+ pshufb MLK_SHUFFLE_OUT_MASK, MLK_VALS
121+ movdqu MLK_VALS , (%rsp , MLK_CNT , 2 )
122122
123123 // Update the counter and check if we are done.
124- popcnt good, good
125- addq good, cnt
124+ popcnt MLK_GOOD, MLK_GOOD
125+ addq MLK_GOOD, MLK_CNT
126126
127- cmpq $256 , cnt
127+ cmpq $256 , MLK_CNT
128128 jnb rej_uniform_asm_final_copy
129129
130- addq $12 , pos
131- cmpq pos, len
130+ addq $12 , MLK_POS
131+ cmpq MLK_POS, MLK_LEN
132132 ja rej_uniform_asm_loop_start
133133
134134rej_uniform_asm_final_copy:
135- // Copy up to 256 values to the output: min(cnt , 256).
135+ // Copy up to 256 values to the output: min(MLK_CNT , 256).
136136 mov $256 , %rcx
137- cmp $256 , cnt
138- cmova %rcx , cnt
137+ cmp $256 , MLK_CNT
138+ cmova %rcx , MLK_CNT
139139
140140 movq %rsp , %rsi
141- movq cnt , %rcx
141+ movq MLK_CNT , %rcx
142142 shlq $1 , %rcx
143143 rep movsb
144144
@@ -147,22 +147,22 @@ rej_uniform_asm_final_copy:
147147
148148/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
149149 * Don't modify by hand -- this is auto-generated by scripts/autogen. */
150- #undef in
151- #undef out
152- #undef len
153- #undef tab
154- #undef cnt
155- #undef pos
156- #undef good
157- #undef pext_mask
158- #undef table_idx
159- #undef bound
160- #undef temp0
161- #undef temp1
162- #undef vals
163- #undef shuffle_out_mask
164- #undef shuffle_in_mask
165- #undef and_mask
150+ #undef MLK_IN
151+ #undef MLK_OUT
152+ #undef MLK_LEN
153+ #undef MLK_TAB
154+ #undef MLK_CNT
155+ #undef MLK_POS
156+ #undef MLK_GOOD
157+ #undef MLK_PEXT_MASK
158+ #undef MLK_TABLE_IDX
159+ #undef MLK_BOUND
160+ #undef MLK_TEMP0
161+ #undef MLK_TEMP1
162+ #undef MLK_VALS
163+ #undef MLK_SHUFFLE_OUT_MASK
164+ #undef MLK_SHUFFLE_IN_MASK
165+ #undef MLK_AND_MASK
166166#undef MLK_STACK_SIZE
167167
168168/* simpasm: footer-start */
0 commit comments