@@ -30,6 +30,30 @@ inline uint64_t ROL64(uint64_t val, int offset) {
3030 return (val << offset) | (val >> (64 - offset));
3131}
3232
33+ // Load/store 64-bit lanes in little-endian byte order.
34+ // The Keccak state uses LE lane encoding (FIPS 202 Section 1, B.1).
35+ // These helpers ensure correctness on both LE and BE platforms.
// Reads eight bytes starting at src and returns them as one 64-bit value,
// with src[0] as the least-significant byte and src[7] as the most-significant.
// Built from byte shifts only, so the result does not depend on host byte order.
inline uint64_t LoadLE64(const uint8_t* src) {
  uint64_t lane = 0;
  // Fold from the top byte downward: each step makes room for the next
  // lower byte, so src[0] ends up in bits 0..7.
  for (int idx = 7; idx >= 0; --idx) {
    lane = (lane << 8) | static_cast<uint64_t>(src[idx]);
  }
  return lane;
}
45+
// Writes val into the eight bytes starting at dst, least-significant byte
// first: dst[0] receives bits 0..7 and dst[7] receives bits 56..63.
// Built from shifts only, so the output does not depend on host byte order.
inline void StoreLE64(uint8_t* dst, uint64_t val) {
  for (int idx = 0; idx < 8; ++idx) {
    // The narrowing cast keeps just the low eight bits of the shifted value.
    dst[idx] = static_cast<uint8_t>(val >> (8 * idx));
  }
}
56+
3357static const unsigned char rhotates[5 ][5 ] = {
3458 {0 , 1 , 62 , 28 , 27 },
3559 {36 , 44 , 6 , 55 , 20 },
@@ -116,41 +140,49 @@ void TurboSHAKE(const uint8_t* input,
116140 uint8_t * output,
117141 size_t output_len) {
118142 uint64_t A[5 ][5 ] = {};
119- uint8_t * state = reinterpret_cast <uint8_t *>(A);
143+ // Both rates (168, 136) are multiples of 8
144+ size_t lane_count = rate / 8 ;
120145
121146 size_t offset = 0 ;
122147
123148 // Absorb complete blocks from input
124149 while (offset + rate <= input_len) {
125- for (size_t i = 0 ; i < rate ; i++) {
126- state[i] ^= input[ offset + i] ;
150+ for (size_t i = 0 ; i < lane_count ; i++) {
151+ A[i / 5 ][i % 5 ] ^= LoadLE64 ( input + offset + i * 8 ) ;
127152 }
128153 KeccakP1600_12 (A);
129154 offset += rate;
130155 }
131156
132157 // Absorb last (partial) block: remaining input bytes + domain_sep + padding
133158 size_t remaining = input_len - offset;
134-
135- // XOR remaining input bytes
136- for (size_t i = 0 ; i < remaining; i++) {
137- state[i] ^= input[offset + i];
159+ uint8_t pad[168 ] = {}; // sized for max rate (TurboSHAKE128)
160+ if (remaining > 0 ) {
161+ memcpy (pad, input + offset, remaining);
138162 }
163+ pad[remaining] ^= domain_sep;
164+ pad[rate - 1 ] ^= 0x80 ;
139165
140- // XOR domain separation byte
141- state[remaining] ^= domain_sep;
142-
143- // XOR pad10*1 final bit at end of rate block
144- state[rate - 1 ] ^= 0x80 ;
145-
166+ for (size_t i = 0 ; i < lane_count; i++) {
167+ A[i / 5 ][i % 5 ] ^= LoadLE64 (pad + i * 8 );
168+ }
146169 KeccakP1600_12 (A);
147170
148171 // Squeeze output
149172 size_t out_offset = 0 ;
150173 while (out_offset < output_len) {
151174 size_t block = output_len - out_offset;
152175 if (block > rate) block = rate;
153- memcpy (output + out_offset, state, block);
176+ size_t full_lanes = block / 8 ;
177+ for (size_t i = 0 ; i < full_lanes; i++) {
178+ StoreLE64 (output + out_offset + i * 8 , A[i / 5 ][i % 5 ]);
179+ }
180+ size_t rem = block % 8 ;
181+ if (rem > 0 ) {
182+ uint8_t tmp[8 ];
183+ StoreLE64 (tmp, A[full_lanes / 5 ][full_lanes % 5 ]);
184+ memcpy (output + out_offset + full_lanes * 8 , tmp, rem);
185+ }
154186 out_offset += block;
155187 if (out_offset < output_len) {
156188 KeccakP1600_12 (A);
0 commit comments