Skip to content

Commit a7001df

Browse files
committed
Namespace rej_uniform_asm
- There are some defines in this file; namespace them with the MLK_ prefix. Signed-off-by: willieyz <willie.zhao@chelpis.com>
1 parent 7aa1636 commit a7001df

File tree

1 file changed

+85
-85
lines changed

1 file changed

+85
-85
lines changed

dev/x86_64/src/rej_uniform_asm.S

Lines changed: 85 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -24,37 +24,37 @@
2424
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
2525
/* simpasm: header-end */
2626

27-
#define in %rsi
28-
#define out %rdi
29-
#define len %rdx
30-
#define tab %rcx
31-
32-
#define cnt %rax
33-
#define pos %r8
34-
35-
#define good %r11
36-
#define pext_mask %r9
37-
#define table_idx %r10
38-
39-
#define bound %xmm0
40-
#define temp0 %xmm1
41-
#define temp1 %xmm6
42-
#define vals %xmm2
43-
#define shuffle_out_mask %xmm3
44-
#define shuffle_in_mask %xmm4
45-
#define and_mask %xmm5
27+
#define MLK_IN %rsi
28+
#define MLK_OUT %rdi
29+
#define MLK_LEN %rdx
30+
#define MLK_TAB %rcx
31+
32+
#define MLK_CNT %rax
33+
#define MLK_POS %r8
34+
35+
#define MLK_GOOD %r11
36+
#define MLK_PEXT_MASK %r9
37+
#define MLK_TABLE_IDX %r10
38+
39+
#define MLK_BOUND %xmm0
40+
#define MLK_TEMP0 %xmm1
41+
#define MLK_TEMP1 %xmm6
42+
#define MLK_VALS %xmm2
43+
#define MLK_SHUFFLE_OUT_MASK %xmm3
44+
#define MLK_SHUFFLE_IN_MASK %xmm4
45+
#define MLK_AND_MASK %xmm5
4646

4747
// High level overview of the algorithm:
4848
// For every 96 bits (12 bytes) of the input:
4949
// 1. Split 96 bits into eight 12-bit integers where each integer
50-
// occupies a corresponding 16-bit element of `vals` xmm register,
51-
// 2. Compute an 8-bit value `good` such that
52-
// good[i] = vals[i] < MLKEM_Q ? 1 : 0, for i in [0, 7],
53-
// 3. Shuffle the elements in `vals` such that all good elements
50+
// occupies a corresponding 16-bit element of `MLK_VALS` xmm register,
51+
// 2. Compute an 8-bit value `MLK_GOOD` such that
52+
// MLK_GOOD[i] = MLK_VALS[i] < MLKEM_Q ? 1 : 0, for i in [0, 7],
53+
// 3. Shuffle the elements in `MLK_VALS` such that all good elements
5454
// are ordered consecutively, and store them.
5555
//
5656
// Notes:
57-
// - We exit early if we find the required number of good values,
57+
// - We exit early if we find the required number of good values,
5858
// - We use the stack as a temporary storage and copy to the actual
5959
// output buffer only in the end. This is because the algorithm
6060
// can overwrite up to 14 bytes (we use 16B for alignment),
@@ -68,77 +68,77 @@
6868
MLK_ASM_FN_SYMBOL(rej_uniform_asm)
6969
subq $MLK_STACK_SIZE, %rsp
7070

71-
// Broadcast MLKEM_Q (3329) to all 16-bit elements of bound reg.
71+
// Broadcast MLKEM_Q (3329) to all 16-bit elements of MLK_BOUND reg.
7272
movq $0x0D010D010D010D01, %rax
73-
movq %rax, bound
74-
pinsrq $1, %rax, bound
73+
movq %rax, MLK_BOUND
74+
pinsrq $1, %rax, MLK_BOUND
7575

76-
// Broadcast 12-bit mask 0xFFF to all 16-bit elements of bound reg.
76+
// Broadcast 12-bit mask 0xFFF to all 16-bit elements of MLK_AND_MASK reg.
7777
movq $0x0FFF0FFF0FFF0FFF, %rax
78-
movq %rax, and_mask
79-
pinsrq $1, %rax, and_mask
78+
movq %rax, MLK_AND_MASK
79+
pinsrq $1, %rax, MLK_AND_MASK
8080

8181
// Load shuffle mask:
8282
// 0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11.
8383
movq $0x0504040302010100, %rax
84-
movq %rax, shuffle_in_mask
84+
movq %rax, MLK_SHUFFLE_IN_MASK
8585
movq $0x0B0A0A0908070706, %rax
86-
pinsrq $1, %rax, shuffle_in_mask
86+
pinsrq $1, %rax, MLK_SHUFFLE_IN_MASK
8787

88-
movq $0, cnt // cnt counts the number of good values we've found.
89-
movq $0, pos // pos is the current position in the input buffer.
90-
movq $0x5555, pext_mask // 0x5555 mask to extract every second bit.
88+
movq $0, MLK_CNT // MLK_CNT counts the number of MLK_GOOD values we've found.
89+
movq $0, MLK_POS // MLK_POS is the current position in the input buffer.
90+
movq $0x5555, MLK_PEXT_MASK // 0x5555 mask to extract every second bit.
9191

9292
rej_uniform_asm_loop_start:
9393
// 1. Split 96 bits into eight 12-bit integers, each occupying a 16-bit element.
94-
// We explain the algorithm by considering the lowest 64 bits of vals.
95-
movdqu (in, pos), vals
96-
// vals: [ 63..48 | 47..32 | 31..16 | 15..0 ]
97-
pshufb shuffle_in_mask, vals
98-
// vals: [ 47..32 | 39..24 | 23..8 | 15..0 ]
99-
movdqa vals, temp1
94+
// We explain the algorithm by considering the lowest 64 bits of MLK_VALS.
95+
movdqu (MLK_IN, MLK_POS), MLK_VALS
96+
// MLK_VALS: [ 63..48 | 47..32 | 31..16 | 15..0 ]
97+
pshufb MLK_SHUFFLE_IN_MASK, MLK_VALS
98+
// MLK_VALS: [ 47..32 | 39..24 | 23..8 | 15..0 ]
99+
movdqa MLK_VALS, MLK_TEMP1
100100
// temp: [ 47..32 | 39..24 | 23..8 | 15..0 ]
101-
psrlw $4, temp1
101+
psrlw $4, MLK_TEMP1
102102
// temp: [ 47..36 | 39..28 | 23..12 | 15..4 ]
103-
pblendw $0xAA, temp1, vals
104-
// vals: [ 47..36 | 39..24 | 23..12 | 15..0]
105-
pand and_mask, vals
106-
// vals: [ 47..36 | 35..24 | 23..12 | 12..0]
107-
108-
// 2. Compute an 8-bit value `good` such that
109-
// good[i] = vals[i] < MLKEM_Q ? 1 : 0, for i in [0, 7],
110-
movdqa bound, temp0
111-
pcmpgtw vals, temp0
112-
pmovmskb temp0, good
113-
pext pext_mask, good, good
114-
115-
// 3. Shuffle the elements in `vals` such that all good elements
103+
pblendw $0xAA, MLK_TEMP1, MLK_VALS
104+
// MLK_VALS: [ 47..36 | 39..24 | 23..12 | 15..0]
105+
pand MLK_AND_MASK, MLK_VALS
106+
// MLK_VALS: [ 47..36 | 35..24 | 23..12 | 11..0]
107+
108+
// 2. Compute an 8-bit value `MLK_GOOD` such that
109+
// MLK_GOOD[i] = MLK_VALS[i] < MLKEM_Q ? 1 : 0, for i in [0, 7],
110+
movdqa MLK_BOUND, MLK_TEMP0
111+
pcmpgtw MLK_VALS, MLK_TEMP0
112+
pmovmskb MLK_TEMP0, MLK_GOOD
113+
pext MLK_PEXT_MASK, MLK_GOOD, MLK_GOOD
114+
115+
// 3. Shuffle the elements in `MLK_VALS` such that all good elements
116116
// are ordered consecutively, and store them.
117-
movq good, table_idx
118-
shl $4, table_idx
119-
movdqu (tab, table_idx), shuffle_out_mask
120-
pshufb shuffle_out_mask, vals
121-
movdqu vals, (%rsp, cnt, 2)
117+
movq MLK_GOOD, MLK_TABLE_IDX
118+
shl $4, MLK_TABLE_IDX
119+
movdqu (MLK_TAB, MLK_TABLE_IDX), MLK_SHUFFLE_OUT_MASK
120+
pshufb MLK_SHUFFLE_OUT_MASK, MLK_VALS
121+
movdqu MLK_VALS, (%rsp, MLK_CNT, 2)
122122

123123
// Update the counter and check if we are done.
124-
popcnt good, good
125-
addq good, cnt
124+
popcnt MLK_GOOD, MLK_GOOD
125+
addq MLK_GOOD, MLK_CNT
126126

127-
cmpq $256, cnt
127+
cmpq $256, MLK_CNT
128128
jnb rej_uniform_asm_final_copy
129129

130-
addq $12, pos
131-
cmpq pos, len
130+
addq $12, MLK_POS
131+
cmpq MLK_POS, MLK_LEN
132132
ja rej_uniform_asm_loop_start
133133

134134
rej_uniform_asm_final_copy:
135-
// Copy up to 256 values to the output: min(cnt, 256).
135+
// Copy up to 256 values to the output: min(MLK_CNT, 256).
136136
mov $256, %rcx
137-
cmp $256, cnt
138-
cmova %rcx, cnt
137+
cmp $256, MLK_CNT
138+
cmova %rcx, MLK_CNT
139139

140140
movq %rsp, %rsi
141-
movq cnt, %rcx
141+
movq MLK_CNT, %rcx
142142
shlq $1, %rcx
143143
rep movsb
144144

@@ -147,22 +147,22 @@ rej_uniform_asm_final_copy:
147147

148148
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
149149
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
150-
#undef in
151-
#undef out
152-
#undef len
153-
#undef tab
154-
#undef cnt
155-
#undef pos
156-
#undef good
157-
#undef pext_mask
158-
#undef table_idx
159-
#undef bound
160-
#undef temp0
161-
#undef temp1
162-
#undef vals
163-
#undef shuffle_out_mask
164-
#undef shuffle_in_mask
165-
#undef and_mask
150+
#undef MLK_IN
151+
#undef MLK_OUT
152+
#undef MLK_LEN
153+
#undef MLK_TAB
154+
#undef MLK_CNT
155+
#undef MLK_POS
156+
#undef MLK_GOOD
157+
#undef MLK_PEXT_MASK
158+
#undef MLK_TABLE_IDX
159+
#undef MLK_BOUND
160+
#undef MLK_TEMP0
161+
#undef MLK_TEMP1
162+
#undef MLK_VALS
163+
#undef MLK_SHUFFLE_OUT_MASK
164+
#undef MLK_SHUFFLE_IN_MASK
165+
#undef MLK_AND_MASK
166166
#undef MLK_STACK_SIZE
167167

168168
/* simpasm: footer-start */

0 commit comments

Comments
 (0)