|
12 | 12 |
|
13 | 13 | #include "fips202_native_armv81m.h" |
14 | 14 |
|
15 | | -/* |
16 | | - * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. |
17 | | - * TODO: Replace with optimized MVE assembly implementations |
18 | | - * (as a part of XORBytes and ExtractBytes) |
19 | | - */ |
20 | | - |
/*
 * Compact the even-indexed bits of a 64-bit word into its low 32 bits.
 *
 * Classic bit-interleaving gather: start by isolating the even positions,
 * then repeatedly fold pairs together with doubling shifts (1, 2, 4, 8, 16)
 * and mask down to the surviving positions.
 */
static uint32_t bitinterleave_even(uint64_t x)
{
    static const uint64_t fold_masks[5] = {
        0x3333333333333333ULL, 0x0f0f0f0f0f0f0f0fULL,
        0x00ff00ff00ff00ffULL, 0x0000ffff0000ffffULL,
        0x00000000ffffffffULL,
    };
    uint64_t acc = x & 0x5555555555555555ULL;

    for (unsigned step = 0; step < 5; step++)
    {
        acc = (acc | (acc >> (1u << step))) & fold_masks[step];
    }
    return (uint32_t)acc;
}
33 | | - |
/*
 * Compact the odd-indexed bits of a 64-bit word into its low 32 bits.
 * Shifting right by one moves every odd position onto an even one, so the
 * even-bit gather does the rest.
 */
static uint32_t bitinterleave_odd(uint64_t x)
{
    uint64_t shifted = x >> 1;
    return bitinterleave_even(shifted);
}
39 | | - |
/*
 * Scatter the low 32 bits of x across the even bit positions of a 64-bit
 * word (the inverse of bitinterleave_even). Halving shifts (16, 8, 4, 2, 1)
 * progressively pull the bits apart, masking after each step.
 */
static uint64_t spread_even(uint32_t x)
{
    static const uint64_t spread_masks[5] = {
        0x0000ffff0000ffffULL, 0x00ff00ff00ff00ffULL,
        0x0f0f0f0f0f0f0f0fULL, 0x3333333333333333ULL,
        0x5555555555555555ULL,
    };
    uint64_t acc = x;

    for (unsigned step = 0; step < 5; step++)
    {
        acc = (acc | (acc << (16u >> step))) & spread_masks[step];
    }
    return acc;
}
51 | | - |
/*
 * Rebuild an interleaved 64-bit lane from its two 32-bit halves: `even`
 * supplies the even bit positions, `odd` the odd ones.
 */
static uint64_t bitdeinterleave(uint32_t even, uint32_t odd)
{
    uint64_t even_bits = spread_even(even);
    uint64_t odd_bits = spread_even(odd) << 1;
    return even_bits | odd_bits;
}
57 | 15 |
|
58 | 16 | /* |
59 | | - * TEMPORARY: Naive C interleaving functions. |
60 | | - * These will be replaced with optimized MVE assembly implementations. |
| 17 | + * Keccak-f1600 x4 permutation (on bit-interleaved state) |
| 18 | + * State is expected to already be in bit-interleaved format. |
61 | 19 | */ |
/*
 * Pack four separate 25-lane Keccak states into the 4-way bit-interleaved
 * layout consumed by the MVE assembly core: the first 100 32-bit words of
 * state_4x hold the even-bit halves (4 states side by side per lane), the
 * next 100 hold the odd-bit halves.
 *
 * NOTE(review): state_4x is accessed as uint32_t[] — this relies on the
 * asm-defined 32-bit word layout inside the uint64_t buffer; confirm
 * against the assembly's expectations on big-endian targets.
 */
static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0,
                             const uint64_t *state1, const uint64_t *state2,
                             const uint64_t *state3)
{
    const uint64_t *src[4] = {state0, state1, state2, state3};
    uint32_t *even_half = (uint32_t *)state_4x;
    uint32_t *odd_half = even_half + 100;

    for (size_t lane = 0; lane < 25; lane++)
    {
        for (size_t s = 0; s < 4; s++)
        {
            even_half[lane * 4 + s] = bitinterleave_even(src[s][lane]);
            odd_half[lane * 4 + s] = bitinterleave_odd(src[s][lane]);
        }
    }
}
82 | | - |
/*
 * Unpack the 4-way bit-interleaved buffer back into four separate 25-lane
 * Keccak states (inverse of interleave_4fold): even-bit halves live in the
 * first 100 32-bit words of state_4x, odd-bit halves in the next 100.
 */
static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0,
                               uint64_t *state1, uint64_t *state2,
                               uint64_t *state3)
{
    uint64_t *dst[4] = {state0, state1, state2, state3};
    const uint32_t *even_half = (const uint32_t *)state_4x;
    const uint32_t *odd_half = even_half + 100;

    for (size_t lane = 0; lane < 25; lane++)
    {
        for (size_t s = 0; s < 4; s++)
        {
            dst[s][lane] = bitdeinterleave(even_half[lane * 4 + s],
                                           odd_half[lane * 4 + s]);
        }
    }
}
98 | | - |
#define mld_keccak_f1600_x4_native_impl \
  MLD_NAMESPACE(keccak_f1600_x4_native_impl)
/*
 * 4-way Keccak-f1600 permutation, MVE backend entry point.
 *
 * state: 4 x 25 = 100 uint64_t lanes; per the note above it must already
 * be in bit-interleaved format, so the assembly core runs on it directly
 * with no conversion on either side.
 *
 * state_tmp is working storage handed to the assembly routine
 * (NOTE(review): presumably scratch for the permutation — confirm against
 * the asm contract); it is wiped afterwards since it may have held
 * secret-dependent intermediate data.
 */
int mld_keccak_f1600_x4_native_impl(uint64_t *state)
{
  MLD_ALIGN uint64_t state_tmp[100];
  mld_keccak_f1600_x4_mve_asm(state, state_tmp,
                              mld_keccakf1600_round_constants);
  /* Scrub the temporary buffer before it leaves scope. */
  mld_zeroize(state_tmp, sizeof(state_tmp));
  return MLD_NATIVE_FUNC_SUCCESS;
}
125 | 30 |
|
|
0 commit comments