@@ -1021,6 +1021,7 @@ static int mldsa_squeeze256(wc_Shake* shake256, const byte* in,
10211021 * @param [in] eta Range specifier of each value.
10221022 * @param [out] p Buffer to encode into.
10231023 */
1024+
10241025static void mldsa_vec_encode_eta_bits_c (const sword32 * s , byte d , byte eta ,
10251026 byte * p )
10261027{
@@ -1047,9 +1048,9 @@ static void mldsa_vec_encode_eta_bits_c(const sword32* s, byte d, byte eta,
10471048 byte s7 = (byte )(2 - s [j + 7 ]);
10481049
10491050 /* Pack 8 3-bit values into 3 bytes. */
1050- p [0 ] = ( byte ) ((s0 >> 0 ) | (s1 << 3 ) | (s2 << 6 ));
1051- p [1 ] = ( byte ) ((s2 >> 2 ) | (s3 << 1 ) | (s4 << 4 ) | (s5 << 7 ));
1052- p [2 ] = ( byte ) ((s5 >> 1 ) | (s6 << 2 ) | (s7 << 5 ));
1051+ p [0 ] = WC_OCTET ((s0 >> 0 ) | (s1 << 3 ) | (s2 << 6 ));
1052+ p [1 ] = WC_OCTET ((s2 >> 2 ) | (s3 << 1 ) | (s4 << 4 ) | (s5 << 7 ));
1053+ p [2 ] = WC_OCTET ((s5 >> 1 ) | (s6 << 2 ) | (s7 << 5 ));
10531054 /* Move to next place to encode into. */
10541055 p += MLDSA_ETA_2_BITS ;
10551056 }
@@ -1159,15 +1160,18 @@ static void mldsa_decode_eta_2_bits_c(const byte* p, sword32* s)
11591160 * 3 bits to encode each number.
11601161 * 8 numbers from 3 bytes. (8 * 3 bits = 3 * 8 bits) */
11611162 for (j = 0 ; j < MLDSA_N ; j += 8 ) {
1162- /* Get 3 bits and put in range of -2..2. */
1163- s [j + 0 ] = 2 - ((p [0 ] >> 0 ) & 0x7 );
1164- s [j + 1 ] = 2 - ((p [0 ] >> 3 ) & 0x7 );
1165- s [j + 2 ] = 2 - ((p [0 ] >> 6 ) | ((p [1 ] << 2 ) & 0x7 ));
1166- s [j + 3 ] = 2 - ((p [1 ] >> 1 ) & 0x7 );
1167- s [j + 4 ] = 2 - ((p [1 ] >> 4 ) & 0x7 );
1168- s [j + 5 ] = 2 - ((p [1 ] >> 7 ) | ((p [2 ] << 1 ) & 0x7 ));
1169- s [j + 6 ] = 2 - ((p [2 ] >> 2 ) & 0x7 );
1170- s [j + 7 ] = 2 - ((p [2 ] >> 5 ) & 0x7 );
1163+ /* Get 3 bits and put in range of -2..2.
1164+ * Cast to signed 32-bit before the subtract: where a byte/word16 field
1165+ * is as wide as int (e.g. 16-bit int), it promotes to unsigned int, so a
1166+ * negative result would zero-extend instead of sign-extend into sword32. */
1167+ s [j + 0 ] = 2 - (sword32 )((p [0 ] >> 0 ) & 0x7 );
1168+ s [j + 1 ] = 2 - (sword32 )((p [0 ] >> 3 ) & 0x7 );
1169+ s [j + 2 ] = 2 - (sword32 )((p [0 ] >> 6 ) | ((p [1 ] << 2 ) & 0x7 ));
1170+ s [j + 3 ] = 2 - (sword32 )((p [1 ] >> 1 ) & 0x7 );
1171+ s [j + 4 ] = 2 - (sword32 )((p [1 ] >> 4 ) & 0x7 );
1172+ s [j + 5 ] = 2 - (sword32 )((p [1 ] >> 7 ) | ((p [2 ] << 1 ) & 0x7 ));
1173+ s [j + 6 ] = 2 - (sword32 )((p [2 ] >> 2 ) & 0x7 );
1174+ s [j + 7 ] = 2 - (sword32 )((p [2 ] >> 5 ) & 0x7 );
11711175 /* Move to next place to decode from. */
11721176 p += MLDSA_ETA_2_BITS ;
11731177 }
@@ -1221,24 +1225,24 @@ static void mldsa_decode_eta_4_bits_c(const byte* p, sword32* s)
12211225 * 4 bits to encode each number.
12221226 * 2 numbers from 1 byte. (2 * 4 bits = 1 * 8 bits) */
12231227 for (j = 0 ; j < MLDSA_N / 2 ; j ++ ) {
1224- /* Get 4 bits and put in range of -4..4. */
1225- s [j * 2 + 0 ] = 4 - (p [j ] & 0xf );
1226- s [j * 2 + 1 ] = 4 - (p [j ] >> 4 );
1228+ /* Get 4 bits and put in range of -4..4. (sword32 cast: see eta-2.) */
1229+ s [j * 2 + 0 ] = 4 - (sword32 )( p [j ] & 0xf );
1230+ s [j * 2 + 1 ] = 4 - (sword32 )( p [j ] >> 4 );
12271231 }
12281232#else
12291233 /* Step 6 or 9.
12301234 * 4 bits to encode each number.
12311235 * 8 numbers from 4 bytes. (8 * 4 bits = 4 * 8 bits) */
12321236 for (j = 0 ; j < MLDSA_N / 2 ; j += 4 ) {
1233- /* Get 4 bits and put in range of -4..4. */
1234- s [j * 2 + 0 ] = 4 - (p [j + 0 ] & 0xf );
1235- s [j * 2 + 1 ] = 4 - (p [j + 0 ] >> 4 );
1236- s [j * 2 + 2 ] = 4 - (p [j + 1 ] & 0xf );
1237- s [j * 2 + 3 ] = 4 - (p [j + 1 ] >> 4 );
1238- s [j * 2 + 4 ] = 4 - (p [j + 2 ] & 0xf );
1239- s [j * 2 + 5 ] = 4 - (p [j + 2 ] >> 4 );
1240- s [j * 2 + 6 ] = 4 - (p [j + 3 ] & 0xf );
1241- s [j * 2 + 7 ] = 4 - (p [j + 3 ] >> 4 );
1237+ /* Get 4 bits and put in range of -4..4. (sword32 cast: see eta-2.) */
1238+ s [j * 2 + 0 ] = 4 - (sword32 )( p [j + 0 ] & 0xf );
1239+ s [j * 2 + 1 ] = 4 - (sword32 )( p [j + 0 ] >> 4 );
1240+ s [j * 2 + 2 ] = 4 - (sword32 )( p [j + 1 ] & 0xf );
1241+ s [j * 2 + 3 ] = 4 - (sword32 )( p [j + 1 ] >> 4 );
1242+ s [j * 2 + 4 ] = 4 - (sword32 )( p [j + 2 ] & 0xf );
1243+ s [j * 2 + 5 ] = 4 - (sword32 )( p [j + 2 ] >> 4 );
1244+ s [j * 2 + 6 ] = 4 - (sword32 )( p [j + 3 ] & 0xf );
1245+ s [j * 2 + 7 ] = 4 - (sword32 )( p [j + 3 ] >> 4 );
12421246 }
12431247#endif /* WOLFSSL_MLDSA_SMALL */
12441248}
@@ -1378,21 +1382,21 @@ static void mldsa_vec_encode_t0_t1_c(const sword32* t, byte d, byte* t0,
13781382 MLDSA_D );
13791383 /* Take 8 values of t and take bottom bits and make positive. */
13801384 word16 n0_0 = (word16 )(MLDSA_D_MAX_HALF -
1381- (t [j + 0 ] - (n1_0 << MLDSA_D )));
1385+ (t [j + 0 ] - (( sword32 ) n1_0 << MLDSA_D )));
13821386 word16 n0_1 = (word16 )(MLDSA_D_MAX_HALF -
1383- (t [j + 1 ] - (n1_1 << MLDSA_D )));
1387+ (t [j + 1 ] - (( sword32 ) n1_1 << MLDSA_D )));
13841388 word16 n0_2 = (word16 )(MLDSA_D_MAX_HALF -
1385- (t [j + 2 ] - (n1_2 << MLDSA_D )));
1389+ (t [j + 2 ] - (( sword32 ) n1_2 << MLDSA_D )));
13861390 word16 n0_3 = (word16 )(MLDSA_D_MAX_HALF -
1387- (t [j + 3 ] - (n1_3 << MLDSA_D )));
1391+ (t [j + 3 ] - (( sword32 ) n1_3 << MLDSA_D )));
13881392 word16 n0_4 = (word16 )(MLDSA_D_MAX_HALF -
1389- (t [j + 4 ] - (n1_4 << MLDSA_D )));
1393+ (t [j + 4 ] - (( sword32 ) n1_4 << MLDSA_D )));
13901394 word16 n0_5 = (word16 )(MLDSA_D_MAX_HALF -
1391- (t [j + 5 ] - (n1_5 << MLDSA_D )));
1395+ (t [j + 5 ] - (( sword32 ) n1_5 << MLDSA_D )));
13921396 word16 n0_6 = (word16 )(MLDSA_D_MAX_HALF -
1393- (t [j + 6 ] - (n1_6 << MLDSA_D )));
1397+ (t [j + 6 ] - (( sword32 ) n1_6 << MLDSA_D )));
13941398 word16 n0_7 = (word16 )(MLDSA_D_MAX_HALF -
1395- (t [j + 7 ] - (n1_7 << MLDSA_D )));
1399+ (t [j + 7 ] - (( sword32 ) n1_7 << MLDSA_D )));
13961400
13971401 /* 13 bits per number.
13981402 * 8 numbers become 13 bytes. (8 * 13 bits = 13 * 8 bits) */
@@ -1406,20 +1410,20 @@ static void mldsa_vec_encode_t0_t1_c(const sword32* t, byte d, byte* t0,
14061410 tp [2 ] = (n0_4 >> 12 ) | ((word32 )n0_5 << 1 ) |
14071411 ((word32 )n0_6 << 14 ) | ((word32 )n0_7 << 27 );
14081412 #else
1409- t0 [ 0 ] = ( byte )( (n0_0 << 0 ));
1410- t0 [ 1 ] = ( byte ) ((n0_0 >> 8 ) | (n0_1 << 5 ));
1411- t0 [ 2 ] = ( byte ) ((n0_1 >> 3 ) );
1412- t0 [ 3 ] = ( byte ) ((n0_1 >> 11 ) | (n0_2 << 2 ));
1413- t0 [ 4 ] = ( byte ) ((n0_2 >> 6 ) | (n0_3 << 7 ));
1414- t0 [ 5 ] = ( byte ) ((n0_3 >> 1 ) );
1415- t0 [ 6 ] = ( byte ) ((n0_3 >> 9 ) | (n0_4 << 4 ));
1416- t0 [ 7 ] = ( byte ) ((n0_4 >> 4 ) );
1417- t0 [ 8 ] = ( byte ) ((n0_4 >> 12 ) | (n0_5 << 1 ));
1418- t0 [ 9 ] = ( byte ) ((n0_5 >> 7 ) | (n0_6 << 6 ));
1419- t0 [10 ] = ( byte ) ((n0_6 >> 2 ) );
1420- t0 [11 ] = ( byte ) ((n0_6 >> 10 ) | (n0_7 << 3 ));
1413+ t0 [ 0 ] = WC_OCTET ( (n0_0 << 0 ));
1414+ t0 [ 1 ] = WC_OCTET ((n0_0 >> 8 ) | (n0_1 << 5 ));
1415+ t0 [ 2 ] = WC_OCTET ((n0_1 >> 3 ) );
1416+ t0 [ 3 ] = WC_OCTET ((n0_1 >> 11 ) | (n0_2 << 2 ));
1417+ t0 [ 4 ] = WC_OCTET ((n0_2 >> 6 ) | (n0_3 << 7 ));
1418+ t0 [ 5 ] = WC_OCTET ((n0_3 >> 1 ) );
1419+ t0 [ 6 ] = WC_OCTET ((n0_3 >> 9 ) | (n0_4 << 4 ));
1420+ t0 [ 7 ] = WC_OCTET ((n0_4 >> 4 ) );
1421+ t0 [ 8 ] = WC_OCTET ((n0_4 >> 12 ) | (n0_5 << 1 ));
1422+ t0 [ 9 ] = WC_OCTET ((n0_5 >> 7 ) | (n0_6 << 6 ));
1423+ t0 [10 ] = WC_OCTET ((n0_6 >> 2 ) );
1424+ t0 [11 ] = WC_OCTET ((n0_6 >> 10 ) | (n0_7 << 3 ));
14211425 #endif
1422- t0 [12 ] = ( byte ) ((n0_7 >> 5 ) );
1426+ t0 [12 ] = WC_OCTET ((n0_7 >> 5 ) );
14231427
14241428 /* 10 bits per number.
14251429 * 8 numbers become 10 bytes. (8 * 10 bits = 10 * 8 bits) */
@@ -1430,17 +1434,17 @@ static void mldsa_vec_encode_t0_t1_c(const sword32* t, byte d, byte* t0,
14301434 tp [1 ] = (n1_3 >> 2 ) | ((word32 )n1_4 << 8 ) |
14311435 ((word32 )n1_5 << 18 ) | ((word32 )n1_6 << 28 );
14321436 #else
1433- t1 [0 ] = ( byte )( (n1_0 << 0 ));
1434- t1 [1 ] = ( byte ) ((n1_0 >> 8 ) | (n1_1 << 2 ));
1435- t1 [2 ] = ( byte ) ((n1_1 >> 6 ) | (n1_2 << 4 ));
1436- t1 [3 ] = ( byte ) ((n1_2 >> 4 ) | (n1_3 << 6 ));
1437- t1 [4 ] = ( byte ) ((n1_3 >> 2 ) );
1438- t1 [5 ] = ( byte )( (n1_4 << 0 ));
1439- t1 [6 ] = ( byte ) ((n1_4 >> 8 ) | (n1_5 << 2 ));
1440- t1 [7 ] = ( byte ) ((n1_5 >> 6 ) | (n1_6 << 4 ));
1437+ t1 [0 ] = WC_OCTET ( (n1_0 << 0 ));
1438+ t1 [1 ] = WC_OCTET ((n1_0 >> 8 ) | (n1_1 << 2 ));
1439+ t1 [2 ] = WC_OCTET ((n1_1 >> 6 ) | (n1_2 << 4 ));
1440+ t1 [3 ] = WC_OCTET ((n1_2 >> 4 ) | (n1_3 << 6 ));
1441+ t1 [4 ] = WC_OCTET ((n1_3 >> 2 ) );
1442+ t1 [5 ] = WC_OCTET ( (n1_4 << 0 ));
1443+ t1 [6 ] = WC_OCTET ((n1_4 >> 8 ) | (n1_5 << 2 ));
1444+ t1 [7 ] = WC_OCTET ((n1_5 >> 6 ) | (n1_6 << 4 ));
14411445 #endif
1442- t1 [8 ] = ( byte ) ((n1_6 >> 4 ) | (n1_7 << 6 ));
1443- t1 [9 ] = ( byte ) ((n1_7 >> 2 ) );
1446+ t1 [8 ] = WC_OCTET ((n1_6 >> 4 ) | (n1_7 << 6 ));
1447+ t1 [9 ] = WC_OCTET ((n1_7 >> 2 ) );
14441448
14451449 /* Move to next place to encode bottom bits to. */
14461450 t0 += MLDSA_D ;
@@ -1526,25 +1530,27 @@ static void mldsa_decode_t0_c(const byte* t0, sword32* t)
15261530 t [j + 7 ] = MLDSA_D_MAX_HALF - (sword32 )
15271531 (( t32_2 >> 27 ) | ((word32 )t0 [12 ] ) << 5 );
15281532#else
1529- t [j + 0 ] = MLDSA_D_MAX_HALF -
1533+ /* sword32 cast on the unpacked field: see eta-2 decode - the subtract
1534+ * must be signed/32-bit so a negative t0 sign-extends correctly. */
1535+ t [j + 0 ] = MLDSA_D_MAX_HALF - (sword32 )
15301536 ((t0 [ 0 ] ) | (((word16 )(t0 [ 1 ] & 0x1f )) << 8 ));
1531- t [j + 1 ] = MLDSA_D_MAX_HALF -
1537+ t [j + 1 ] = MLDSA_D_MAX_HALF - ( sword32 )
15321538 ((t0 [ 1 ] >> 5 ) | (((word16 )(t0 [ 2 ] )) << 3 ) |
15331539 (((word16 )(t0 [ 3 ] & 0x03 )) << 11 ));
1534- t [j + 2 ] = MLDSA_D_MAX_HALF -
1540+ t [j + 2 ] = MLDSA_D_MAX_HALF - ( sword32 )
15351541 ((t0 [ 3 ] >> 2 ) | (((word16 )(t0 [ 4 ] & 0x7f )) << 6 ));
1536- t [j + 3 ] = MLDSA_D_MAX_HALF -
1542+ t [j + 3 ] = MLDSA_D_MAX_HALF - ( sword32 )
15371543 ((t0 [ 4 ] >> 7 ) | (((word16 )(t0 [ 5 ] )) << 1 ) |
15381544 (((word16 )(t0 [ 6 ] & 0x0f )) << 9 ));
1539- t [j + 4 ] = MLDSA_D_MAX_HALF -
1545+ t [j + 4 ] = MLDSA_D_MAX_HALF - ( sword32 )
15401546 ((t0 [ 6 ] >> 4 ) | (((word16 )(t0 [ 7 ] )) << 4 ) |
15411547 (((word16 )(t0 [ 8 ] & 0x01 )) << 12 ));
1542- t [j + 5 ] = MLDSA_D_MAX_HALF -
1548+ t [j + 5 ] = MLDSA_D_MAX_HALF - ( sword32 )
15431549 ((t0 [ 8 ] >> 1 ) | (((word16 )(t0 [ 9 ] & 0x3f )) << 7 ));
1544- t [j + 6 ] = MLDSA_D_MAX_HALF -
1550+ t [j + 6 ] = MLDSA_D_MAX_HALF - ( sword32 )
15451551 ((t0 [ 9 ] >> 6 ) | (((word16 )(t0 [10 ] )) << 2 ) |
15461552 (((word16 )(t0 [11 ] & 0x07 )) << 10 ));
1547- t [j + 7 ] = MLDSA_D_MAX_HALF -
1553+ t [j + 7 ] = MLDSA_D_MAX_HALF - ( sword32 )
15481554 ((t0 [11 ] >> 3 ) | (((word16 )(t0 [12 ] )) << 5 ));
15491555#endif
15501556 /* Move to next place to decode from. */
@@ -1771,16 +1777,16 @@ static void mldsa_encode_gamma1_17_bits_c(const sword32* z, byte* s)
17711777 s32p [1 ] = (z1 >> 14 ) | (z2 << 4 ) | (z3 << 22 );
17721778 #endif
17731779#else
1774- s [0 ] = ( byte ) ( z0 );
1775- s [1 ] = ( byte ) ( z0 >> 8 );
1776- s [2 ] = ( byte ) ((z0 >> 16 ) | (z1 << 2 ));
1777- s [3 ] = ( byte ) ( z1 >> 6 );
1778- s [4 ] = ( byte ) ((z1 >> 14 ) | (z2 << 4 ));
1779- s [5 ] = ( byte ) ( z2 >> 4 );
1780- s [6 ] = ( byte ) ((z2 >> 12 ) | (z3 << 6 ));
1781- s [7 ] = ( byte ) ( z3 >> 2 );
1782- #endif
1783- s [8 ] = ( byte ) ( z3 >> 10 );
1780+ s [0 ] = WC_OCTET ( z0 );
1781+ s [1 ] = WC_OCTET ( z0 >> 8 );
1782+ s [2 ] = WC_OCTET ((z0 >> 16 ) | (z1 << 2 ));
1783+ s [3 ] = WC_OCTET ( z1 >> 6 );
1784+ s [4 ] = WC_OCTET ((z1 >> 14 ) | (z2 << 4 ));
1785+ s [5 ] = WC_OCTET ( z2 >> 4 );
1786+ s [6 ] = WC_OCTET ((z2 >> 12 ) | (z3 << 6 ));
1787+ s [7 ] = WC_OCTET ( z3 >> 2 );
1788+ #endif
1789+ s [8 ] = WC_OCTET ( z3 >> 10 );
17841790 /* Move to next place to encode to. */
17851791 s += MLDSA_GAMMA1_17_ENC_BITS / 2 ;
17861792 }
@@ -1842,16 +1848,16 @@ static void mldsa_encode_gamma1_19_bits_c(const sword32* z, byte* s)
18421848 #endif
18431849 s16p [4 ] = (word16 )((z3 >> 4 ) );
18441850#else
1845- s [0 ] = ( byte ) z0 ;
1846- s [1 ] = ( byte ) (z0 >> 8 ) ;
1847- s [2 ] = ( byte ) ((z0 >> 16 ) | (z1 << 4 ));
1848- s [3 ] = ( byte ) (z1 >> 4 ) ;
1849- s [4 ] = ( byte ) (z1 >> 12 ) ;
1850- s [5 ] = ( byte ) z2 ;
1851- s [6 ] = ( byte ) (z2 >> 8 ) ;
1852- s [7 ] = ( byte ) ((z2 >> 16 ) | (z3 << 4 ));
1853- s [8 ] = ( byte ) (z3 >> 4 ) ;
1854- s [9 ] = ( byte ) (z3 >> 12 ) ;
1851+ s [0 ] = WC_OCTET ( z0 ) ;
1852+ s [1 ] = WC_OCTET ( (z0 >> 8 ) ) ;
1853+ s [2 ] = WC_OCTET ((z0 >> 16 ) | (z1 << 4 ));
1854+ s [3 ] = WC_OCTET ( (z1 >> 4 ) ) ;
1855+ s [4 ] = WC_OCTET ( (z1 >> 12 ) ) ;
1856+ s [5 ] = WC_OCTET ( z2 ) ;
1857+ s [6 ] = WC_OCTET ( (z2 >> 8 ) ) ;
1858+ s [7 ] = WC_OCTET ((z2 >> 16 ) | (z3 << 4 ));
1859+ s [8 ] = WC_OCTET ( (z3 >> 4 ) ) ;
1860+ s [9 ] = WC_OCTET ( (z3 >> 12 ) ) ;
18551861#endif
18561862 /* Move to next place to encode to. */
18571863 s += MLDSA_GAMMA1_19_ENC_BITS / 2 ;
@@ -2244,6 +2250,9 @@ static void mldsa_decode_gamma1(const byte* s, int bits, sword32* z)
22442250 * @param [in] bits Number of bits used in encoding - GAMMA1 bits.
22452251 * @param [out] z Vector of polynomials.
22462252 */
2253+ #ifndef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
2254+ /* The smallest-mem verify streams z one polynomial at a time with
2255+ * mldsa_decode_gamma1() directly, so the whole-vector wrapper is unused. */
22472256static void mldsa_vec_decode_gamma1 (const byte * x , byte l , int bits ,
22482257 sword32 * z )
22492258{
@@ -2258,6 +2267,7 @@ static void mldsa_vec_decode_gamma1(const byte* x, byte l, int bits,
22582267 z += MLDSA_N ;
22592268 }
22602269}
2270+ #endif /* !WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM */
22612271#endif
22622272
22632273#if !defined(WOLFSSL_MLDSA_NO_SIGN ) || !defined(WOLFSSL_MLDSA_NO_VERIFY )
@@ -4624,8 +4634,12 @@ static int mldsa_sample_in_ball_ex(int level, wc_Shake* shake256,
46244634
46254635 /* Step 8: Move value from random index to current index. */
46264636 c [i ] = c [j ];
4627- /* Step 9: Set value at random index to +/- 1. */
4628- c [j ] = 1 - ((((signs [s >> 3 ]) >> (s & 0x7 )) & 0x1 ) << 1 );
4637+ /* Step 9: Set value at random index to +/- 1.
4638+ * Cast to sword32 before the subtract: where a byte is as wide as int,
4639+ * signs[] promotes to unsigned and -1 would widen as 0x0000ffff.
4640+ * Matches the USE_INTEL_SPEEDUP path. */
4641+ c [j ] = (sword32 )1 -
4642+ (sword32 )((((signs [s >> 3 ]) >> (s & 0x7 )) & 0x1 ) << 1 );
46294643 /* Next sign bit index. */
46304644 s ++ ;
46314645 }
@@ -9771,6 +9785,10 @@ static int mldsa_verify_with_mu(wc_MlDsaKey* key, const byte* mu,
97719785 byte o ;
97729786 byte * encW1 ;
97739787 byte * seed = commit_calc ;
9788+ #ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
9789+ /* Bytes of encoded z per polynomial - z is streamed one poly at a time. */
9790+ word32 zStride = (word32 )(MLDSA_N / 8 ) * (word32 )(params -> gamma1_bits + 1 );
9791+ #endif
97749792
97759793 /* Ensure the signature is the right size for the parameters. */
97769794 if (sigLen != params -> sigSz ) {
@@ -9825,15 +9843,33 @@ static int mldsa_verify_with_mu(wc_MlDsaKey* key, const byte* mu,
98259843#endif
98269844
98279845 if (ret == 0 ) {
9828- /* Step 2: Decode z from signature. */
9829- mldsa_vec_decode_gamma1 (ze , params -> l , params -> gamma1_bits , z );
98309846 /* Step 13: Check z is valid - values are low enough. */
98319847 hi = ((sword32 )1 << params -> gamma1_bits ) - params -> beta ;
9848+ #ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
9849+ {
9850+ /* Step 2/13: Stream z one polynomial at a time for the range check;
9851+ * the per-poly NTT happens inside the matrix loop below. */
9852+ const byte * zp = ze ;
9853+ unsigned int zi ;
9854+
9855+ valid = 1 ;
9856+ for (zi = 0 ; valid && (zi < params -> l ); zi ++ ) {
9857+ mldsa_decode_gamma1 (zp , params -> gamma1_bits , z );
9858+ valid = mldsa_check_low (z , hi );
9859+ zp += zStride ;
9860+ }
9861+ }
9862+ #else
9863+ /* Step 2: Decode z from signature. */
9864+ mldsa_vec_decode_gamma1 (ze , params -> l , params -> gamma1_bits , z );
98329865 valid = mldsa_vec_check_low (z , params -> l , hi );
9866+ #endif
98339867 }
98349868 if ((ret == 0 ) && valid ) {
9869+ #ifndef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
98359870 /* Step 10: NTT(z) */
98369871 mldsa_vec_ntt_full (z , params -> l );
9872+ #endif
98379873
98389874 /* Step 9: Compute c from first 256 bits of commit. */
98399875#ifdef WOLFSSL_MLDSA_VERIFY_NO_MALLOC
@@ -9907,6 +9943,14 @@ static int mldsa_verify_with_mu(wc_MlDsaKey* key, const byte* mu,
99079943 for (s = 0 ; (ret == 0 ) && (s < params -> l ); s ++ ) {
99089944 /* Put s into buffer to be hashed. */
99099945 seed [MLDSA_PUB_SEED_SZ + 0 ] = (byte )s ;
9946+ #ifdef WOLFSSL_MLDSA_VERIFY_SMALLEST_MEM
9947+ /* Step 2/10: Decode and NTT this z polynomial on demand (z is
9948+ * not kept as a whole vector in this mode). */
9949+ mldsa_decode_gamma1 (ze + (word32 )s * zStride ,
9950+ params -> gamma1_bits , z );
9951+ mldsa_ntt_full (z );
9952+ zt = z ;
9953+ #endif
99109954 /* Step 3: Create polynomial from hashing seed. */
99119955 #ifdef WOLFSSL_MLDSA_VERIFY_NO_MALLOC
99129956 ret = mldsa_rej_ntt_poly_ex (& key -> shake , seed , a , key -> h );
0 commit comments