Skip to content

Commit 85a713b

Browse files
committed
Remove 2 dead instruction from AArch64 back-end poly_tomont()
1. Remove 2 dead instructions and re-assign registers 2. Re-apply SLOTHY 3. Re-autogen all files 4. Update HOL-Light bytecode for poly_tomont() Signed-off-by: Rod Chapman <rodchap@amazon.com>
1 parent 8dae40b commit 85a713b

5 files changed

Lines changed: 117 additions & 133 deletions

File tree

dev/aarch64_clean/src/poly_tomont_asm.S

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,8 @@
4343
factor .req v2
4444
factor_t .req v3
4545
modulus .req v4
46-
modulus_twisted .req v5
4746

48-
tmp0 .req v6
47+
tmp0 .req v5
4948

5049
.text
5150
.global MLK_ASM_NAMESPACE(poly_tomont_asm)
@@ -55,9 +54,6 @@ MLK_ASM_FN_SYMBOL(poly_tomont_asm)
5554
mov wtmp, #3329 // ML-KEM modulus
5655
dup modulus.8h, wtmp
5756

58-
mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
59-
dup modulus_twisted.8h, wtmp
60-
6157
mov wtmp, #-1044 // 2^16 % 3329
6258
dup factor.8h, wtmp
6359

@@ -100,7 +96,6 @@ poly_tomont_loop:
10096
.unreq factor
10197
.unreq factor_t
10298
.unreq modulus
103-
.unreq modulus_twisted
10499

105100
.unreq tmp0
106101

dev/aarch64_opt/src/poly_tomont_asm.S

Lines changed: 59 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,8 @@
4242
factor .req v2
4343
factor_t .req v3
4444
modulus .req v4
45-
modulus_twisted .req v5
4645

47-
tmp0 .req v6
46+
tmp0 .req v5
4847

4948
.text
5049
.global MLK_ASM_NAMESPACE(poly_tomont_asm)
@@ -54,9 +53,6 @@ MLK_ASM_FN_SYMBOL(poly_tomont_asm)
5453
mov wtmp, #3329 // ML-KEM modulus
5554
dup modulus.8h, wtmp
5655

57-
mov wtmp, #20159 // Barrett twist of 1 wrt 2^27
58-
dup modulus_twisted.8h, wtmp
59-
6056
mov wtmp, #-1044 // 2^16 % 3329
6157
dup factor.8h, wtmp
6258

@@ -71,48 +67,48 @@ MLK_ASM_FN_SYMBOL(poly_tomont_asm)
7167
// Cycle bound: 23.0
7268
// IPC bound: 0.70
7369
//
74-
// Wall time: 0.05s
75-
// User time: 0.05s
70+
// Wall time: 0.11s
71+
// User time: 0.11s
7672
//
7773
// ----- cycle (expected) ------>
7874
// 0 25
7975
// |------------------------|----
80-
ldr q18, [x0, #32] // *.............................
81-
ldr q0, [x0, #16] // *.............................
82-
ldr q16, [x0], #64 // .*............................
83-
sqrdmulh v23.8H, v0.8H, v3.8H // ....*.........................
84-
mul v26.8H, v0.8H, v2.8H // ......*.......................
85-
sqrdmulh v19.8H, v16.8H, v3.8H // ........*.....................
86-
mls v26.8H, v23.8H, v4.H[0] // ..........*...................
87-
mul v29.8H, v16.8H, v2.8H // ............*.................
88-
ldr q16, [x0, #-16] // ............*.................
89-
mls v29.8H, v19.8H, v4.H[0] // ..............*...............
90-
str q26, [x0, #-48] // ...............*..............
91-
sqrdmulh v26.8H, v18.8H, v3.8H // ................*.............
92-
mul v18.8H, v18.8H, v2.8H // ..................*...........
93-
str q29, [x0, #-64] // ...................*..........
76+
ldr q5, [x0, #16] // *.............................
77+
ldr q17, [x0], #64 // *.............................
78+
ldr q0, [x0, #-32] // .*............................
79+
ldr q16, [x0, #-16] // .*............................
80+
mul v29.8H, v5.8H, v2.8H // ....*.........................
81+
sqrdmulh v22.8H, v5.8H, v3.8H // ......*.......................
82+
mul v5.8H, v17.8H, v2.8H // ........*.....................
83+
sqrdmulh v28.8H, v17.8H, v3.8H // ..........*...................
84+
sqrdmulh v17.8H, v0.8H, v3.8H // ............*.................
85+
mls v29.8H, v22.8H, v4.H[0] // ..............*...............
86+
mls v5.8H, v28.8H, v4.H[0] // ................*.............
87+
mul v18.8H, v0.8H, v2.8H // ..................*...........
88+
str q29, [x0, #-48] // ....................*.........
9489
sqrdmulh v29.8H, v16.8H, v3.8H // ....................*.........
95-
mls v18.8H, v26.8H, v4.H[0] // ......................*.......
90+
mls v18.8H, v17.8H, v4.H[0] // ......................*.......
91+
str q5, [x0, #-64] // ......................*.......
9692

9793
// ------ cycle (expected) ------>
9894
// 0 25
9995
// |------------------------|-----
100-
// ldr q19, [x0, #16] // *..............................
101-
// ldr q23, [x0, #32] // *..............................
102-
// ldr q17, [x0], #64 // .*.............................
103-
// ldr q16, [x0, #-16] // ............*..................
104-
// sqrdmulh v28.8H, v19.8H, v3.8H // ....*..........................
105-
// mul v0.8H, v19.8H, v2.8H // ......*........................
106-
// sqrdmulh v24.8H, v23.8H, v3.8H // ................*..............
96+
// ldr q23, [x0, #32] // .*.............................
97+
// ldr q17, [x0], #64 // *..............................
98+
// ldr q16, [x0, #-16] // .*.............................
10799
// mul v18.8H, v23.8H, v2.8H // ..................*............
108-
// sqrdmulh v22.8H, v17.8H, v3.8H // ........*......................
109-
// mul v26.8H, v17.8H, v2.8H // ............*..................
110-
// mls v0.8H, v28.8H, v4.H[0] // ..........*....................
111-
// mls v26.8H, v22.8H, v4.H[0] // ..............*................
100+
// ldr q19, [x0, #-48] // *..............................
101+
// sqrdmulh v24.8H, v23.8H, v3.8H // ............*..................
102+
// sqrdmulh v28.8H, v19.8H, v3.8H // ......*........................
103+
// mul v0.8H, v19.8H, v2.8H // ....*..........................
104+
// sqrdmulh v22.8H, v17.8H, v3.8H // ..........*....................
105+
// mul v26.8H, v17.8H, v2.8H // ........*......................
106+
// mls v0.8H, v28.8H, v4.H[0] // ..............*................
107+
// mls v26.8H, v22.8H, v4.H[0] // ................*..............
112108
// sqrdmulh v29.8H, v16.8H, v3.8H // ....................*..........
113-
// str q0, [x0, #-48] // ...............*...............
109+
// str q0, [x0, #-48] // ....................*..........
114110
// mls v18.8H, v24.8H, v4.H[0] // ......................*........
115-
// str q26, [x0, #-64] // ...................*...........
111+
// str q26, [x0, #-64] // ......................*........
116112

117113
sub count, count, #1
118114
poly_tomont_loop:
@@ -123,24 +119,24 @@ poly_tomont_loop:
123119
// Cycle bound: 24.0
124120
// IPC bound: 0.83
125121
//
126-
// Wall time: 0.14s
127-
// User time: 0.14s
122+
// Wall time: 0.28s
123+
// User time: 0.28s
128124
//
129125
// ----- cycle (expected) ------>
130126
// 0 25
131127
// |------------------------|----
132-
ldr q19, [x0, #16] // *.............................
133128
mul v26.8H, v16.8H, v2.8H // l.............................
134-
ldr q23, [x0, #32] // .*............................
135-
ldr q17, [x0], #64 // ..*...........................
129+
ldr q23, [x0, #32] // *.............................
130+
ldr q17, [x0], #64 // .*............................
136131
mls v26.8H, v29.8H, v4.H[0] // ..l...........................
137-
ldr q16, [x0, #-16] // ...*..........................
138-
sqrdmulh v28.8H, v19.8H, v3.8H // ....*.........................
139-
str q18, [x0, #-96] // ....l.........................
140-
mul v0.8H, v19.8H, v2.8H // ......*.......................
132+
ldr q16, [x0, #-16] // ..*...........................
133+
str q18, [x0, #-96] // ...l..........................
134+
mul v18.8H, v23.8H, v2.8H // ....*.........................
135+
ldr q19, [x0, #-48] // ....*.........................
136+
sqrdmulh v24.8H, v23.8H, v3.8H // ......*.......................
141137
str q26, [x0, #-80] // .......l......................
142-
sqrdmulh v24.8H, v23.8H, v3.8H // ........*.....................
143-
mul v18.8H, v23.8H, v2.8H // ..........*...................
138+
sqrdmulh v28.8H, v19.8H, v3.8H // ........*.....................
139+
mul v0.8H, v19.8H, v2.8H // ..........*...................
144140
sqrdmulh v22.8H, v17.8H, v3.8H // ............*.................
145141
mul v26.8H, v17.8H, v2.8H // ..............*...............
146142
mls v0.8H, v28.8H, v4.H[0] // ................*.............
@@ -153,25 +149,25 @@ poly_tomont_loop:
153149
// ------ cycle (expected) ------->
154150
// 0 25
155151
// |------------------------|------
156-
// ldr q0, [x0], #64 // ..*.....................'.~.....
157-
// sqrdmulh v6.8h, v0.8h, v3.8h // ............*...........'.......
152+
// ldr q0, [x0], #64 // .*......................'~......
153+
// sqrdmulh v5.8h, v0.8h, v3.8h // ............*...........'.......
158154
// mul v1.8h, v0.8h, v2.8h // ..............*.........'.......
159-
// mls v1.8h, v6.8h, v4.h[0] // ..................*.....'.......
155+
// mls v1.8h, v5.8h, v4.h[0] // ..................*.....'.......
160156
// str q1, [x0, #-64] // .......................*'.......
161-
// ldr q0, [x0, #-48] // *.......................~.......
162-
// sqrdmulh v6.8h, v0.8h, v3.8h // ....*...................'...~...
163-
// mul v1.8h, v0.8h, v2.8h // ......*.................'.....~.
164-
// mls v1.8h, v6.8h, v4.h[0] // ................*.......'.......
165-
// str q1, [x0, #-48] // .....................*..'.......
166-
// ldr q0, [x0, #-32] // .*......................'~......
167-
// sqrdmulh v6.8h, v0.8h, v3.8h // ........*...............'.......
157+
// ldr q0, [x0, #-48] // ....*...................'...~...
158+
// sqrdmulh v5.8h, v0.8h, v3.8h // ........*...............'.......
168159
// mul v1.8h, v0.8h, v2.8h // ..........*.............'.......
169-
// mls v1.8h, v6.8h, v4.h[0] // ......................*.'.......
170-
// str q1, [x0, #-32] // ....~...................'...l...
171-
// ldr q0, [x0, #-16] // ...*....................'..~....
172-
// sqrdmulh v6.8h, v0.8h, v3.8h // ....................*...'.......
160+
// mls v1.8h, v5.8h, v4.h[0] // ................*.......'.......
161+
// str q1, [x0, #-48] // .....................*..'.......
162+
// ldr q0, [x0, #-32] // *.......................~.......
163+
// sqrdmulh v5.8h, v0.8h, v3.8h // ......*.................'.....~.
164+
// mul v1.8h, v0.8h, v2.8h // ....*...................'...~...
165+
// mls v1.8h, v5.8h, v4.h[0] // ......................*.'.......
166+
// str q1, [x0, #-32] // ...~....................'..l....
167+
// ldr q0, [x0, #-16] // ..*.....................'.~.....
168+
// sqrdmulh v5.8h, v0.8h, v3.8h // ....................*...'.......
173169
// mul v1.8h, v0.8h, v2.8h // ~.......................l.......
174-
// mls v1.8h, v6.8h, v4.h[0] // ..~.....................'.l.....
170+
// mls v1.8h, v5.8h, v4.h[0] // ..~.....................'.l.....
175171
// str q1, [x0, #-16] // .......~................'......l
176172

177173
sub count, count, 1
@@ -183,8 +179,8 @@ poly_tomont_loop:
183179
// Cycle bound: 8.0
184180
// IPC bound: 0.50
185181
//
186-
// Wall time: 0.01s
187-
// User time: 0.01s
182+
// Wall time: 0.02s
183+
// User time: 0.02s
188184
//
189185
// ----- cycle (expected) ------>
190186
// 0 25
@@ -217,7 +213,6 @@ poly_tomont_loop:
217213
.unreq factor
218214
.unreq factor_t
219215
.unreq modulus
220-
.unreq modulus_twisted
221216

222217
.unreq tmp0
223218

mlkem/src/native/aarch64/src/poly_tomont_asm.S

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,44 +34,42 @@ MLK_ASM_FN_SYMBOL(poly_tomont_asm)
3434
.cfi_startproc
3535
mov w2, #0xd01 // =3329
3636
dup v4.8h, w2
37-
mov w2, #0x4ebf // =20159
38-
dup v5.8h, w2
3937
mov w2, #-0x414 // =-1044
4038
dup v2.8h, w2
4139
mov w2, #-0x2824 // =-10276
4240
dup v3.8h, w2
4341
mov x1, #0x8 // =8
44-
ldr q18, [x0, #0x20]
45-
ldr q0, [x0, #0x10]
46-
ldr q16, [x0], #0x40
47-
sqrdmulh v23.8h, v0.8h, v3.8h
48-
mul v26.8h, v0.8h, v2.8h
49-
sqrdmulh v19.8h, v16.8h, v3.8h
50-
mls v26.8h, v23.8h, v4.h[0]
51-
mul v29.8h, v16.8h, v2.8h
42+
ldr q5, [x0, #0x10]
43+
ldr q17, [x0], #0x40
44+
ldur q0, [x0, #-0x20]
5245
ldur q16, [x0, #-0x10]
53-
mls v29.8h, v19.8h, v4.h[0]
54-
stur q26, [x0, #-0x30]
55-
sqrdmulh v26.8h, v18.8h, v3.8h
56-
mul v18.8h, v18.8h, v2.8h
57-
stur q29, [x0, #-0x40]
46+
mul v29.8h, v5.8h, v2.8h
47+
sqrdmulh v22.8h, v5.8h, v3.8h
48+
mul v5.8h, v17.8h, v2.8h
49+
sqrdmulh v28.8h, v17.8h, v3.8h
50+
sqrdmulh v17.8h, v0.8h, v3.8h
51+
mls v29.8h, v22.8h, v4.h[0]
52+
mls v5.8h, v28.8h, v4.h[0]
53+
mul v18.8h, v0.8h, v2.8h
54+
stur q29, [x0, #-0x30]
5855
sqrdmulh v29.8h, v16.8h, v3.8h
59-
mls v18.8h, v26.8h, v4.h[0]
56+
mls v18.8h, v17.8h, v4.h[0]
57+
stur q5, [x0, #-0x40]
6058
sub x1, x1, #0x1
6159

6260
Lpoly_tomont_loop:
63-
ldr q19, [x0, #0x10]
6461
mul v26.8h, v16.8h, v2.8h
6562
ldr q23, [x0, #0x20]
6663
ldr q17, [x0], #0x40
6764
mls v26.8h, v29.8h, v4.h[0]
6865
ldur q16, [x0, #-0x10]
69-
sqrdmulh v28.8h, v19.8h, v3.8h
7066
stur q18, [x0, #-0x60]
71-
mul v0.8h, v19.8h, v2.8h
72-
stur q26, [x0, #-0x50]
73-
sqrdmulh v24.8h, v23.8h, v3.8h
7467
mul v18.8h, v23.8h, v2.8h
68+
ldur q19, [x0, #-0x30]
69+
sqrdmulh v24.8h, v23.8h, v3.8h
70+
stur q26, [x0, #-0x50]
71+
sqrdmulh v28.8h, v19.8h, v3.8h
72+
mul v0.8h, v19.8h, v2.8h
7573
sqrdmulh v22.8h, v17.8h, v3.8h
7674
mul v26.8h, v17.8h, v2.8h
7775
mls v0.8h, v28.8h, v4.h[0]

proofs/hol_light/aarch64/mlkem/mlkem_poly_tomont.S

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -37,44 +37,42 @@ PQCP_MLKEM_NATIVE_MLKEM768_poly_tomont_asm:
3737
.cfi_startproc
3838
mov w2, #0xd01 // =3329
3939
dup v4.8h, w2
40-
mov w2, #0x4ebf // =20159
41-
dup v5.8h, w2
4240
mov w2, #-0x414 // =-1044
4341
dup v2.8h, w2
4442
mov w2, #-0x2824 // =-10276
4543
dup v3.8h, w2
4644
mov x1, #0x8 // =8
47-
ldr q18, [x0, #0x20]
48-
ldr q0, [x0, #0x10]
49-
ldr q16, [x0], #0x40
50-
sqrdmulh v23.8h, v0.8h, v3.8h
51-
mul v26.8h, v0.8h, v2.8h
52-
sqrdmulh v19.8h, v16.8h, v3.8h
53-
mls v26.8h, v23.8h, v4.h[0]
54-
mul v29.8h, v16.8h, v2.8h
45+
ldr q5, [x0, #0x10]
46+
ldr q17, [x0], #0x40
47+
ldur q0, [x0, #-0x20]
5548
ldur q16, [x0, #-0x10]
56-
mls v29.8h, v19.8h, v4.h[0]
57-
stur q26, [x0, #-0x30]
58-
sqrdmulh v26.8h, v18.8h, v3.8h
59-
mul v18.8h, v18.8h, v2.8h
60-
stur q29, [x0, #-0x40]
49+
mul v29.8h, v5.8h, v2.8h
50+
sqrdmulh v22.8h, v5.8h, v3.8h
51+
mul v5.8h, v17.8h, v2.8h
52+
sqrdmulh v28.8h, v17.8h, v3.8h
53+
sqrdmulh v17.8h, v0.8h, v3.8h
54+
mls v29.8h, v22.8h, v4.h[0]
55+
mls v5.8h, v28.8h, v4.h[0]
56+
mul v18.8h, v0.8h, v2.8h
57+
stur q29, [x0, #-0x30]
6158
sqrdmulh v29.8h, v16.8h, v3.8h
62-
mls v18.8h, v26.8h, v4.h[0]
59+
mls v18.8h, v17.8h, v4.h[0]
60+
stur q5, [x0, #-0x40]
6361
sub x1, x1, #0x1
6462

6563
Lpoly_tomont_loop:
66-
ldr q19, [x0, #0x10]
6764
mul v26.8h, v16.8h, v2.8h
6865
ldr q23, [x0, #0x20]
6966
ldr q17, [x0], #0x40
7067
mls v26.8h, v29.8h, v4.h[0]
7168
ldur q16, [x0, #-0x10]
72-
sqrdmulh v28.8h, v19.8h, v3.8h
7369
stur q18, [x0, #-0x60]
74-
mul v0.8h, v19.8h, v2.8h
75-
stur q26, [x0, #-0x50]
76-
sqrdmulh v24.8h, v23.8h, v3.8h
7770
mul v18.8h, v23.8h, v2.8h
71+
ldur q19, [x0, #-0x30]
72+
sqrdmulh v24.8h, v23.8h, v3.8h
73+
stur q26, [x0, #-0x50]
74+
sqrdmulh v28.8h, v19.8h, v3.8h
75+
mul v0.8h, v19.8h, v2.8h
7876
sqrdmulh v22.8h, v17.8h, v3.8h
7977
mul v26.8h, v17.8h, v2.8h
8078
mls v0.8h, v28.8h, v4.h[0]

0 commit comments

Comments
 (0)