Skip to content

Commit 2e6c6df

Browse files
andrea-caforio authored and nasahlpa committed
[crypto] Ed25519: SCA hardening of the first sign stage
The first sign stage consists of two scalar-point multiplications that involve both the secret scalars s and r. To protect these multiplications, they have to be performed over the arithmetic shares (s0, s1) and (r0, r1). To that end, this commit introduces a new scalar-point multiplication routine that operates directly on the shares. Signed-off-by: Andrea Caforio <andrea.caforio@lowrisc.org>
1 parent 02683bb commit 2e6c6df

5 files changed

Lines changed: 408 additions & 29 deletions

File tree

sw/otbn/crypto/ed25519.s

Lines changed: 232 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,16 @@
66
.globl ed25519_verify_var
77
.globl ed25519_sign_stage1
88
.globl ed25519_sign_stage2
9+
10+
/* Expose for testing only. */
911
.globl affine_encode
1012
.globl affine_decode_var
1113
.globl ext_scmul
14+
.globl ext_scmul_sca
1215
.globl ext_double
1316
.globl ext_add
17+
.globl ext_to_affine
18+
1419

1520
/**
1621
* This library contains an implementation of the Ed25519 signature scheme
@@ -100,6 +105,10 @@
100105
* clobbered flag groups: FG0
101106
*/
102107
ed25519_gen_public_key:
108+
/* Set up scalar arithmetic for the scalar reductions.
109+
MOD <= L.
110+
[w15:w14] <= mu. */
111+
jal x1, sc_init
103112

104113
/* Load the arithmetic shares (s0, s1) of the precomputed and clamped secret
105114
s and reduce them modulo L. */
@@ -112,7 +121,7 @@ ed25519_gen_public_key:
112121

113122
/* w2 <= [w17:w16] mod L = s0 mod L. */
114123
jal x1, sc_reduce
115-
bn.mov w28, w18
124+
bn.mov w2, w18
116125

117126
/* Clear w16 and w17 with randomness before loading the second share s1. */
118127
bn.wsrr w16, URND
@@ -126,9 +135,7 @@ ed25519_gen_public_key:
126135

127136
/* w4 <= [w17:w16] mod L = s1 mod L. */
128137
jal x1, sc_reduce
129-
130-
/* TODO: Remove once everything is in place. */
131-
bn.subm w28, w28, w18
138+
bn.mov w4, w18
132139

133140
/* Set up for field arithmetic in preparation for scalar multiplication.
134141
MOD <= p
@@ -161,9 +168,9 @@ ed25519_gen_public_key:
161168
[w9:w6] <= extended(B) = (B.X, B.Y, B.Z, B.T) */
162169
jal x1, affine_to_ext
163170

164-
/* Compute the public key point A = [s]B.
165-
[w13:w10] <= w28 * [w9:w6] = [s]B */
166-
jal x1, ext_scmul
171+
/* Compute the public key point A = [s0 - s1]B.
172+
[w13:w10] <= (w2 - w4) * [w9:w6] = [s0 - s1]B */
173+
jal x1, ext_scmul_sca
167174

168175
/* Convert A to affine coordinates.
169176
w10 <= A.x, w11 <= A.y */
@@ -477,6 +484,20 @@ ed25519_verify_var:
477484
* clobbered flag groups: FG0
478485
*/
479486
ed25519_sign_stage1:
487+
/* Calculate and write encoded public key A (A_) to DMEM.
488+
dmem[ed25519_public_key] <= A_
489+
490+
Furthermore, set up for field arithmetic in preparation for scalar multiplication.
491+
MOD <= p
492+
w19 <= 19
493+
494+
And initialize curve parameter d.
495+
w30 <= dmem[d] = (-121665/121666) mod p
496+
497+
Lastly, load B in extended coordinates.
498+
[w9:w6] <= extended(B) = (B.X, B.Y, B.Z, B.T)*/
499+
jal x1, ed25519_gen_public_key
500+
480501
/* Set up for scalar arithmetic.
481502
[w15:w14] <= mu
482503
MOD <= L */
@@ -492,9 +513,9 @@ ed25519_sign_stage1:
492513
bn.lid x2++, 32(x3)
493514
bn.lid x2++, 64(x3)
494515

495-
/* w5 <= [w22:w20] mod L = r0 mod L. */
516+
/* w2 <= [w22:w20] mod L = r0 mod L. */
496517
jal x1, sc_reduce_768
497-
bn.mov w5, w18
518+
bn.mov w2, w18
498519

499520
/* Overwrite w20-w22 with randomness before loading the second share r1. */
500521
bn.wsrr w20, URND
@@ -508,30 +529,19 @@ ed25519_sign_stage1:
508529
bn.lid x2++, 32(x3)
509530
bn.lid x2++, 64(x3)
510531

511-
/* w18 <= [w22:w20] mod L = r1 mod L. */
532+
/* w4 <= [w22:w20] mod L = r1 mod L. */
512533
jal x1, sc_reduce_768
534+
bn.mov w4, w18
513535

514-
/* TODO remove this once everything is in place. */
515-
bn.subm w5, w5, w18
516-
517-
/* Calculate and write encoded public key A (A_) to DMEM.
518-
dmem[ed25519_public_key] <= A_
519-
520-
Furthermore, set up for field arithmetic in preparation for scalar multiplication.
536+
/* Set up for field arithmetic in preparation for scalar multiplication.
521537
MOD <= p
522-
w19 <= 19
523-
524-
And initialize curve parameter d.
525-
w30 <= dmem[d] = (-121665/121666) mod p
526-
527-
Lastly, load B in extended coordinates.
528-
[w9:w6] <= extended(B) = (B.X, B.Y, B.Z, B.T)*/
529-
jal x1, ed25519_gen_public_key
538+
w19 <= 19 */
539+
jal x1, fe_init
530540

531-
/* Compute the signature point R = [r]B.
532-
[w13:w10] <= w5 * [w9:w6] = [r]B */
533-
bn.mov w28, w5
534-
jal x1, ext_scmul
541+
/* Compute the signature point R = [r0 - r1]B.
542+
[w13:w10] <= (w2 - w4) * [w9:w6] = [r0 - r1]B */
543+
/* bn.mov w28, w5 */
544+
jal x1, ext_scmul_sca
535545

536546
/* Convert R to affine coordinates.
537547
w10 <= R.x, w11 <= R.y */
@@ -1246,6 +1256,199 @@ ext_scmul:
12461256
[w13:w10] = P = a * (X1, Y1, Z1, T1) */
12471257
ret
12481258

1259+
/**
 * Scalar-point multiplication with an arithmetically masked scalar.
 *
 * Returns (X2, Y2, Z2, T2) = (s0 - s1) * (X1, Y1, Z1, T1) = (s0 - s1) * P.
 *
 * This routine calculates both terms s0 * (X1, Y1, Z1, T1) and
 * -s1 * (X1, Y1, Z1, T1) in parallel with the following double-and-add-always
 * algorithm:
 *
 *   Q = (0, 1, 1, 0)
 *
 *   for i = bitlen(s0) - 1 to 0:
 *     Q = 2 * Q
 *     A = s0[i] ? P : -P
 *     B = Q + A
 *     Q = (s0[i] ^ s1[i]) ? B : Q
 *
 *   return Q
 *
 * Before entering the loop, each share is blinded with an independent random
 * multiple of L via `sc_blind`, so the loop iterates over 382-bit shares.
 * Both shares are kept left-aligned in a 512-bit register pair such that the
 * current bit s0[i] / s1[i] is always the MSB of w3 / w5 and can be read
 * through the M flag.
 *
 * This routine runs in constant time.
 *
 * Flags: Flags have no meaning beyond the scope of this subroutine.
 *
 * @param[in]  w2:  s0, first scalar share, s0 < L
 * @param[in]  w4:  s1, second scalar share, s1 < L
 * @param[in]  w6:  input X1 (X1 < p)
 * @param[in]  w7:  input Y1 (Y1 < p)
 * @param[in]  w8:  input Z1 (Z1 < p)
 * @param[in]  w9:  input T1 (T1 < p)
 * @param[in]  w19: constant, w19 = 19
 * @param[in]  w29: constant, w29 = (2*d) mod p, d = (-121665/121666) mod p
 * @param[in]  w31: all-zero
 * @param[in]  MOD: p, modulus = 2^255 - 19
 * @param[out] w10: output X2
 * @param[out] w11: output Y2
 * @param[out] w12: output Z2
 * @param[out] w13: output T2
 *
 * Clobbered registers: w2 to w18, w20 to w27
 *                      (w31 is rewritten, but always with zero)
 * Clobbered flag groups: FG0, FG1
 */
ext_scmul_sca:
  /* Initialize the intermediate result Q to the origin point.
       [w13:w10] <= (0, 1, 1, 0) */
  bn.mov w10, w31
  bn.addi w11, w31, 1
  bn.addi w12, w31, 1
  bn.mov w13, w31

  /*
   * Blind both 253-bit scalar shares and expand them to 382-bit shares. Such
   * large blinding factors are necessary to protect them against powerful SCA
   * attacks.
   */

  /* [w3:w2] <= w2 + k * L = s0 + k * L. */
  bn.mov w20, w2
  jal x1, sc_blind
  bn.mov w2, w16
  bn.mov w3, w17

  /* [w5:w4] <= w4 + k' * L = s1 + k' * L (k' is a fresh blinding factor). */
  bn.mov w20, w4
  jal x1, sc_blind
  bn.mov w4, w16
  bn.mov w5, w17

  /* Move the blinded 382-bit share s0 to the MSB position, i.e., shift
     [w3:w2] left by 512 - 382 = 130 bits. */
  bn.rshi w3, w3, w2 >> 126
  bn.rshi w2, w2, w31 >> 126

  /* Dummy instruction between the handling of the two shares.
     NOTE(review): presumably inserted so that s0 and s1 are not processed by
     directly consecutive instructions - confirm with the author. */
  bn.xor w31, w31, w31 /* dummy */

  /* Move the blinded 382-bit share s1 to the MSB position. */
  bn.rshi w5, w5, w4 >> 126
  bn.rshi w4, w4, w31 >> 126

  /* Iterate over all scalar bits starting at the MSB.
     The loop body consists of exactly 54 instructions; keep the second
     `loopi` operand in sync when editing the body. */
  loopi 382, 54
    /* Compute Q = 2 * Q.
         [w13:w10] <= [w13:w10] + [w13:w10] = 2 * Q */
    jal x1, ext_double

    /* Save the value 2 * Q for later.
         [w17:w14] <= [w13:w10] = 2 * Q. */
    bn.mov w14, w10
    bn.mov w15, w11
    bn.mov w16, w12
    bn.mov w17, w13

    /*
     * First selection: A = s0[i] ? P : -P.
     *
     * It is important that the destination register of `bn.sel` is different
     * from the source register to avoid leaking the secret flag bit.
     */

    /* Randomize the destination registers. */
    bn.wsrr w10, URND
    bn.wsrr w11, URND
    bn.wsrr w12, URND
    bn.wsrr w13, URND

    /* Negate P.
         [w23:w20] <= (-P.X, P.Y, P.Z, -P.T). */
    bn.subm w20, w31, w6
    bn.mov w21, w7
    bn.mov w22, w8
    bn.subm w23, w31, w9

    /* Isolate the bit s0[381] which is s0[i].
         FG0.M <= MSB(w3) = s0[i]. */
    bn.addi w3, w3, 0, FG0

    /* Perform the selection A = s0[i] ? P : -P.
         [w13:w10] <= s0[i] ? P : -P. */
    bn.sel w10, w6, w20, FG0.M
    bn.sel w11, w7, w21, FG0.M
    bn.sel w12, w8, w22, FG0.M
    bn.sel w13, w9, w23, FG0.M

    /* Clear the flag (FG0). */
    bn.sub w31, w31, w31, FG0

    /* Compute the addition B = Q + A.
         [w13:w10] <= [w13:w10] + [w17:w14] = A + Q. */
    jal x1, ext_add

    /*
     * Second selection: Q = (s0[i] ^ s1[i]) ? B : Q.
     * The XOR (s0[i] ^ s1[i]) can be computed with 3 successive selections
     * such that
     *   (s0[i] ^ s1[i]) ? B : Q
     *     = s1[i] ? (s0[i] ? Q : B) : (s0[i] ? B : Q).
     */

    /* Randomize the destination registers. */
    bn.wsrr w20, URND
    bn.wsrr w21, URND
    bn.wsrr w22, URND
    bn.wsrr w23, URND
    bn.wsrr w24, URND
    bn.wsrr w25, URND
    bn.wsrr w26, URND
    bn.wsrr w27, URND

    /* Isolate the bit s0[381] which is s0[i].
         FG0.M <= MSB(w3) = s0[i]. */
    bn.addi w3, w3, 0, FG0

    /* [w23:w20] <= s0[i] ? B : Q. */
    bn.sel w20, w10, w14, FG0.M
    bn.sel w21, w11, w15, FG0.M
    bn.sel w22, w12, w16, FG0.M
    bn.sel w23, w13, w17, FG0.M

    /* [w27:w24] <= s0[i] ? Q : B. */
    bn.sel w24, w14, w10, FG0.M
    bn.sel w25, w15, w11, FG0.M
    bn.sel w26, w16, w12, FG0.M
    bn.sel w27, w17, w13, FG0.M

    /* Clear the flag (FG0). */
    bn.sub w31, w31, w31, FG0

    /* Randomize the destination registers. */
    bn.wsrr w10, URND
    bn.wsrr w11, URND
    bn.wsrr w12, URND
    bn.wsrr w13, URND

    /* Isolate the bit s1[381] which is s1[i].
         FG1.M <= MSB(w5) = s1[i]. */
    bn.addi w5, w5, 0, FG1

    /* [w13:w10] <= s1[i] ? (s0[i] ? Q : B) : (s0[i] ? B : Q)
                  = (s0[i] ^ s1[i]) ? B : Q. */
    bn.sel w10, w24, w20, FG1.M
    bn.sel w11, w25, w21, FG1.M
    bn.sel w12, w26, w22, FG1.M
    bn.sel w13, w27, w23, FG1.M

    /* Clear the flag. The secret bit s1[i] lives in FG1 (FG0 was already
       cleared above), so FG1 is the flag group that has to be wiped here. */
    bn.sub w31, w31, w31, FG1

    /* Shift both scalars one position to the left and pad with randomness. */
    bn.wsrr w20, URND

    bn.rshi w3, w3, w2 >> 255
    bn.rshi w2, w2, w20 >> 255

    bn.xor w31, w31, w31 /* dummy */

    bn.rshi w5, w5, w4 >> 255
    bn.rshi w4, w4, w20 >> 255
    /* End of loop */

  ret
1451+
12491452
/**
12501453
* Add a point to itself in extended twisted Edwards coordinates.
12511454
*

sw/otbn/crypto/ed25519_scalar.s

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
.globl sc_reduce
77
.globl sc_reduce_768
88
.globl sc_mul
9+
.globl sc_blind
910

1011
/**
1112
* This library contains arithmetic for the scalar field of the Ed25519
@@ -325,6 +326,55 @@ sc_mul:
325326

326327
ret
327328

329+
/**
330+
* Blind a scalar with a multiple of the curve order L.
331+
*
332+
* Given a scalar s < L, this routine adds a random multiple k * L of the group
333+
* order L to the scalar and returns s + k * L. In accordance with Schindler's and
334+
* Wiemers' recommendation [1], the blinding factor k is 128 bits. Since the
335+
* curve order L is 253 bits, by choosing a 128-bit factor k we can guarantee
336+
* that the blinded scalar s + k * L is at most 382 bits.
337+
*
338+
* [1] https://csrc.nist.gov/csrc/media/events/workshop-on-elliptic-curve-cryptography-standards/documents/papers/session6-schindler-werner.pdf
339+
*
340+
* @param[in] w20: s, the scalar to be blinded (s < L).
341+
* @param[in] w31: all-zero.
342+
* @param[out] [w17:w16]: The blinded scalar s + k * L.
343+
*
344+
* Clobbered registers: x2, x3, w21, w22 (plus the accumulator used by bn.mulqacc).
345+
* Clobbered flag groups: FG0.
346+
*/
347+
sc_blind:
348+
/* w21 <= L. (x2 = 21 selects w21 as the destination of the indirect load.) */
349+
li x2, 21
350+
la x3, ed25519_scalar_L
351+
bn.lid x2, 0(x3)
352+
353+
/* Load a 128-bit random blinding factor k (the upper half of a URND word).
354+
w22 <= URND >> 128. */
355+
bn.wsrr w22, URND
356+
bn.rshi w22, w31, w22 >> 128
357+
358+
/* Calculate k * L, i.e., 128-bit x 253-bit multiplication resulting in a
359+
381-bit value. Only the two low 64-bit limbs of w22 are non-zero.
360+
[w17:w16] <= w22 * w21 = k * L. */
361+
bn.mulqacc.z w21.0, w22.0, 0
362+
bn.mulqacc w21.0, w22.1, 64
363+
bn.mulqacc.so w16.L, w21.1, w22.0, 64
364+
bn.mulqacc w21.1, w22.1, 0
365+
bn.mulqacc w21.2, w22.0, 0
366+
bn.mulqacc w21.2, w22.1, 64
367+
bn.mulqacc.so w16.U, w21.3, w22.0, 64
368+
bn.mulqacc.wo w17, w21.3, w22.1, 0
369+
370+
/* Add the 381-bit blinding value to the 253-bit scalar resulting in a
371+
382-bit blinded scalar avoiding any overflow.
372+
[w17:w16] <= [w17:w16] + w20 = k * L + s. */
373+
bn.add w16, w16, w20
374+
bn.addc w17, w17, w31
375+
376+
ret
377+
328378
.data
329379

330380
/* Modulus L = 2^252+27742317777372353535851937790883648493 */

0 commit comments

Comments
 (0)