From a61e9cd681103b2353d3b2a69c7b60a206c0a696 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 19 Feb 2026 10:28:51 +0100 Subject: [PATCH 1/3] Unit Tests: Extend Keccak x4 test to cover xor_bytes and extract_bytes Replace test_keccakf1600x4_permute with test_keccakf1600x4_xor_permute_extract that tests the full x4 Keccak flow (xor_bytes, permute, extract_bytes) against the x1 C reference implementation. Testing through the public interface rather than comparing internal state directly allows verifying backends that use custom state representations (e.g., bit-interleaved) without requiring state conversion functions. The test uses random offsets and lengths for both xor_bytes and extract_bytes, and verifies each of the 4 lanes independently against the x1 reference. Also reduce functional test iterations for M55 baremetal platform. Signed-off-by: Matthias J. Kannwischer --- test/baremetal/platform/m55-an547/platform.mk | 3 +- test/src/test_unit.c | 62 +++++++++++++++---- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/test/baremetal/platform/m55-an547/platform.mk b/test/baremetal/platform/m55-an547/platform.mk index 6e9918123..6109d1354 100644 --- a/test/baremetal/platform/m55-an547/platform.mk +++ b/test/baremetal/platform/m55-an547/platform.mk @@ -10,8 +10,9 @@ CC=gcc # Use PMU cycle counting by default CYCLES ?= PMU -# Reduce iterations for benchmarking +# Reduce iterations for benchmarking and functional tests CFLAGS += -DMLD_BENCHMARK_NTESTS=3 -DMLD_BENCHMARK_NITERATIONS=2 -DMLD_BENCHMARK_NWARMUP=3 +CFLAGS += -DNTESTS_FUNC=100 # Explicitly include experimental Armv8.1-M + MVE backend # Remove this once backend is finalized and enabled by default. 
diff --git a/test/src/test_unit.c b/test/src/test_unit.c index 6acb1a81b..998fdc3e7 100644 --- a/test/src/test_unit.c +++ b/test/src/test_unit.c @@ -47,7 +47,7 @@ void mld_polyvecl_pointwise_acc_montgomery_c(mld_poly *w, const mld_polyvecl *u, void mld_polyz_unpack_c(mld_poly *r, const uint8_t a[MLDSA_POLYZ_PACKEDBYTES]); void mld_keccakf1600_permute_c(uint64_t *state); -#if defined(MLD_USE_FIPS202_X1_NATIVE) || defined(MLD_USE_FIPS202_X4_NATIVE) +#if defined(MLD_USE_FIPS202_X1_NATIVE) static void print_u64_array(const char *label, const uint64_t *array, size_t len) { @@ -95,8 +95,7 @@ static int compare_u64_arrays(const uint64_t *a, const uint64_t *b, } return 1; } - -#endif /* MLD_USE_FIPS202_X1_NATIVE || MLD_USE_FIPS202_X4_NATIVE */ +#endif /* MLD_USE_FIPS202_X1_NATIVE */ #if defined(MLD_USE_NATIVE_NTT) || defined(MLD_USE_NATIVE_INTT) || \ defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) || \ @@ -665,32 +664,69 @@ static int test_keccakf1600_permute(void) } #endif /* MLD_USE_FIPS202_X1_NATIVE */ +/* + * Test that x4 Keccak (xor_bytes, permute, extract_bytes) produces + * the same results as the x1 C reference. 
+ */ #ifdef MLD_USE_FIPS202_X4_NATIVE -static int test_keccakf1600x4_permute(void) +#define MAX_RATE 136 + +static int test_keccakf1600x4_xor_permute_extract(void) { uint64_t state_x4[MLD_KECCAK_LANES * MLD_KECCAK_WAY]; - uint64_t state_x1[MLD_KECCAK_LANES * MLD_KECCAK_WAY]; + uint64_t state_x1[MLD_KECCAK_LANES]; + unsigned char output_x4[MLD_KECCAK_WAY][MAX_RATE]; + unsigned char output_x1[MAX_RATE]; + unsigned char input[MLD_KECCAK_WAY][MAX_RATE]; + uint8_t xor_offset, xor_length, ext_offset, ext_length; int i, j; for (i = 0; i < NUM_RANDOM_TESTS; i++) { - randombytes((uint8_t *)state_x4, sizeof(state_x4)); - memcpy(state_x1, state_x4, sizeof(state_x4)); + /* Generate random offset and length for xor_bytes */ + randombytes(&xor_offset, 1); + randombytes(&xor_length, 1); + xor_offset = xor_offset % MAX_RATE; + xor_length = (uint8_t)(1 + (xor_length % (MAX_RATE - xor_offset))); + + /* Generate random offset and length for extract_bytes */ + randombytes(&ext_offset, 1); + randombytes(&ext_length, 1); + ext_offset = ext_offset % MAX_RATE; + ext_length = (uint8_t)(1 + (ext_length % (MAX_RATE - ext_offset))); + + /* Generate different random input for each lane */ + for (j = 0; j < MLD_KECCAK_WAY; j++) + { + randombytes(input[j], xor_length); + } + /* Run x4 implementation */ + memset(state_x4, 0, sizeof(state_x4)); + mld_keccakf1600x4_xor_bytes(state_x4, input[0], input[1], input[2], + input[3], xor_offset, xor_length); mld_keccakf1600x4_permute(state_x4); + mld_keccakf1600x4_extract_bytes(state_x4, output_x4[0], output_x4[1], + output_x4[2], output_x4[3], ext_offset, + ext_length); + /* Compare each lane against x1 C reference */ for (j = 0; j < MLD_KECCAK_WAY; j++) { - mld_keccakf1600_permute_c(state_x1 + j * MLD_KECCAK_LANES); - } + memset(state_x1, 0, sizeof(state_x1)); + mld_keccakf1600_xor_bytes(state_x1, input[j], xor_offset, xor_length); + mld_keccakf1600_permute_c(state_x1); + mld_keccakf1600_extract_bytes(state_x1, output_x1, ext_offset, + ext_length); - 
CHECK(compare_u64_arrays(state_x4, state_x1, - MLD_KECCAK_LANES * MLD_KECCAK_WAY, - "keccakf1600x4_permute")); + CHECK(memcmp(output_x4[j], output_x1, ext_length) == 0); + } } return 0; } + +#undef MAX_RATE #endif /* MLD_USE_FIPS202_X4_NATIVE */ static int test_backend_units(void) @@ -745,7 +781,7 @@ static int test_backend_units(void) #endif #ifdef MLD_USE_FIPS202_X4_NATIVE - CHECK(test_keccakf1600x4_permute() == 0); + CHECK(test_keccakf1600x4_xor_permute_extract() == 0); #endif return 0; From 1601cb6ae694f004be9de0e8a1cc2b9caccd4c5d Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 19 Feb 2026 10:28:56 +0100 Subject: [PATCH 2/3] FIPS202: Add native x4 XOR/extract bytes interface Extend the FIPS202 native backend API to support implementing XORBytes and ExtractBytes steps in native code. This is essential for backends using custom state representations (e.g., bit-interleaved state), where these functions handle conversion to/from the internal format on-the-fly. In such cases, they also account for a significant amount of processing time. New flags: - MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE: Backend provides native XOR bytes - MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE: Backend provides native extract bytes When set, backends provide native implementations for: - mld_keccakf1600_xor_bytes_x4_native: XOR input data into state - mld_keccakf1600_extract_bytes_x4_native: Extract output from state Signed-off-by: Matthias J. Kannwischer --- mldsa/src/fips202/native/api.h | 60 ++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/mldsa/src/fips202/native/api.h b/mldsa/src/fips202/native/api.h index b30135aa6..3b5b61afc 100644 --- a/mldsa/src/fips202/native/api.h +++ b/mldsa/src/fips202/native/api.h @@ -66,4 +66,64 @@ __contract__( ); #endif /* MLD_USE_FIPS202_X4_NATIVE */ +/* + * Native x4 XOR bytes and extract bytes interface. 
+ * + * These functions allow backends to provide optimized implementations for + * XORing input data into the state and extracting output data from the state. + * This is particularly useful for backends that use a different internal state + * representation (e.g., bit-interleaved), as conversion can happen during + * XOR/extract rather than before/after each permutation. + * + * NOTE: We assume that the custom representation of the zero state is the + * all-zero state. + * + * MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE: Backend provides native XOR bytes + * MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE: Backend provides native extract + * bytes + */ + +#if defined(MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE) +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_xor_bytes_x4_native( + uint64_t *state, const unsigned char *data0, const unsigned char *data1, + const unsigned char *data2, const unsigned char *data3, unsigned offset, + unsigned length) +__contract__( + requires(0 <= offset && offset <= 25 * sizeof(uint64_t) && + 0 <= length && length <= 25 * sizeof(uint64_t) - offset) + requires(memory_no_alias(state, sizeof(uint64_t) * 25 * 4)) + requires(memory_no_alias(data0, length)) + requires((data0 == data1 && + data0 == data2 && + data0 == data3) || + (memory_no_alias(data1, length) && + memory_no_alias(data2, length) && + memory_no_alias(data3, length))) + assigns(memory_slice(state, sizeof(uint64_t) * 25 * 4)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged_u64(state, 25 * 4))); +#endif /* MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE */ + +#if defined(MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE) +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_extract_bytes_x4_native( + uint64_t *state, unsigned char *data0, unsigned char *data1, + unsigned char *data2, unsigned char *data3, unsigned offset, + unsigned length) +__contract__( + requires(0 <= 
offset && offset <= 25 * sizeof(uint64_t) && + 0 <= length && length <= 25 * sizeof(uint64_t) - offset) + requires(memory_no_alias(state, sizeof(uint64_t) * 25 * 4)) + requires(memory_no_alias(data0, length)) + requires(memory_no_alias(data1, length)) + requires(memory_no_alias(data2, length)) + requires(memory_no_alias(data3, length)) + assigns(memory_slice(data0, length)) + assigns(memory_slice(data1, length)) + assigns(memory_slice(data2, length)) + assigns(memory_slice(data3, length)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS)); +#endif /* MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE */ + #endif /* !MLD_FIPS202_NATIVE_API_H */ From 3823f1f35f690136e46e0691035cf27917f324a9 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 19 Feb 2026 10:29:04 +0100 Subject: [PATCH 3/3] Armv8.1-M: Add native Keccak x4 XORBytes and ExtractBytes Add native MVE implementations of XORBytes and ExtractBytes that perform bit-interleaving/deinterleaving on-the-fly, enabling use of a bit-interleaved state representation without temporary conversions in the permutation. This improves performance by: - Reducing the number of bit-interleaving operations - Accelerating bit-interleaving using MVE vector instructions The backend uses bit-interleaved state representation where each 64-bit lane is split into even and odd 32-bit halves for efficient 32-bit MVE processing. Co-Authored-By: Brendan Moran Signed-off-by: Matthias J. 
Kannwischer --- dev/fips202/armv81m/mve.h | 48 +++ .../armv81m/src/fips202_native_armv81m.h | 16 + dev/fips202/armv81m/src/keccak_f1600_x4_mve.S | 36 +- dev/fips202/armv81m/src/keccak_f1600_x4_mve.c | 105 +----- .../armv81m/src/state_extract_bytes_x4_mve.S | 333 +++++++++++++++++ .../armv81m/src/state_xor_bytes_x4_mve.S | 349 ++++++++++++++++++ mldsa/mldsa_native.c | 6 + mldsa/mldsa_native_asm.S | 8 + mldsa/src/fips202/keccakf1600.c | 58 ++- mldsa/src/fips202/native/armv81m/mve.h | 48 +++ .../armv81m/src/fips202_native_armv81m.h | 16 + .../native/armv81m/src/keccak_f1600_x4_mve.S | 34 +- .../native/armv81m/src/keccak_f1600_x4_mve.c | 105 +----- .../armv81m/src/state_extract_bytes_x4_mve.S | 290 +++++++++++++++ .../armv81m/src/state_xor_bytes_x4_mve.S | 314 ++++++++++++++++ proofs/cbmc/dummy_backend_fips202_x4.h | 2 + .../Makefile | 37 ++ ...ccakf1600x4_extract_bytes_native_harness.c | 16 + .../keccakf1600x4_xor_bytes_native/Makefile | 37 ++ .../keccakf1600x4_xor_bytes_native_harness.c | 16 + scripts/check-magic | 4 +- scripts/simpasm | 2 +- 22 files changed, 1663 insertions(+), 217 deletions(-) create mode 100644 dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S create mode 100644 dev/fips202/armv81m/src/state_xor_bytes_x4_mve.S create mode 100644 mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S create mode 100644 mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S create mode 100644 proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile create mode 100644 proofs/cbmc/keccakf1600x4_extract_bytes_native/keccakf1600x4_extract_bytes_native_harness.c create mode 100644 proofs/cbmc/keccakf1600x4_xor_bytes_native/Makefile create mode 100644 proofs/cbmc/keccakf1600x4_xor_bytes_native/keccakf1600x4_xor_bytes_native_harness.c diff --git a/dev/fips202/armv81m/mve.h b/dev/fips202/armv81m/mve.h index a2bf121de..03ff5798c 100644 --- a/dev/fips202/armv81m/mve.h +++ b/dev/fips202/armv81m/mve.h @@ -11,12 +11,18 @@ /* Part of backend API */ #define 
MLD_USE_FIPS202_X4_NATIVE +#define MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE +#define MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE /* Guard for assembly file */ #define MLD_FIPS202_ARMV81M_NEED_X4 #if !defined(__ASSEMBLER__) #include "../api.h" +/* + * Native x4 permutation + * State is kept in bit-interleaved format. + */ #define mld_keccak_f1600_x4_native_impl \ MLD_NAMESPACE(keccak_f1600_x4_native_impl) int mld_keccak_f1600_x4_native_impl(uint64_t *state); @@ -27,6 +33,48 @@ static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state) return mld_keccak_f1600_x4_native_impl(state); } +/* + * Native x4 XOR bytes (with on-the-fly bit interleaving) + */ +#define mld_keccak_f1600_x4_state_xor_bytes \ + MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +void mld_keccak_f1600_x4_state_xor_bytes(void *state, const uint8_t *data0, + const uint8_t *data1, + const uint8_t *data2, + const uint8_t *data3, unsigned offset, + unsigned length); + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_xor_bytes_x4_native( + uint64_t *state, const uint8_t *data0, const uint8_t *data1, + const uint8_t *data2, const uint8_t *data3, unsigned offset, + unsigned length) +{ + mld_keccak_f1600_x4_state_xor_bytes(state, data0, data1, data2, data3, offset, + length); + return MLD_NATIVE_FUNC_SUCCESS; +} + +/* + * Native x4 extract bytes (with on-the-fly bit de-interleaving) + */ +#define mld_keccak_f1600_x4_state_extract_bytes \ + MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +void mld_keccak_f1600_x4_state_extract_bytes(void *state, uint8_t *data0, + uint8_t *data1, uint8_t *data2, + uint8_t *data3, unsigned offset, + unsigned length); + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_extract_bytes_x4_native( + uint64_t *state, uint8_t *data0, uint8_t *data1, uint8_t *data2, + uint8_t *data3, unsigned offset, unsigned length) +{ + mld_keccak_f1600_x4_state_extract_bytes(state, data0, data1, data2, data3, + offset, length); + return 
MLD_NATIVE_FUNC_SUCCESS; +} + #endif /* !__ASSEMBLER__ */ #endif /* !MLD_DEV_FIPS202_ARMV81M_MVE_H */ diff --git a/dev/fips202/armv81m/src/fips202_native_armv81m.h b/dev/fips202/armv81m/src/fips202_native_armv81m.h index ac8d9e29d..4ed3b90f0 100644 --- a/dev/fips202/armv81m/src/fips202_native_armv81m.h +++ b/dev/fips202/armv81m/src/fips202_native_armv81m.h @@ -17,4 +17,20 @@ extern const uint32_t mld_keccakf1600_round_constants[48]; void mld_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100], const uint32_t rc[48]); +#define mld_keccak_f1600_x4_state_xor_bytes_asm \ + MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +void mld_keccak_f1600_x4_state_xor_bytes_asm(void *state, const uint8_t *d0, + const uint8_t *d1, + const uint8_t *d2, + const uint8_t *d3, unsigned offset, + unsigned length); + +#define mld_keccak_f1600_x4_state_extract_bytes_asm \ + MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +void mld_keccak_f1600_x4_state_extract_bytes_asm(void *state, uint8_t *data0, + uint8_t *data1, uint8_t *data2, + uint8_t *data3, + unsigned offset, + unsigned length); + #endif /* !MLD_DEV_FIPS202_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */ diff --git a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S index 50c06595f..96033ac0e 100644 --- a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S +++ b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S @@ -7,7 +7,7 @@ /*yaml Name: keccak_f1600_x4_mve_asm - Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state + Description: Armv8.1-M MVE implementation of batched (x4) Keccak-f[1600] permutation using bit-interleaved state Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc) ABI: r0: @@ -15,7 +15,7 @@ size_bytes: 800 permissions: read/write c_parameter: void *state - description: Four bit-interleaved Keccak states (low halves followed by high halves) + description: 
Bit-interleaved state for 4 Keccak instances (even halves followed by odd halves) r1: type: buffer size_bytes: 800 @@ -33,6 +33,36 @@ description: register preservation (44) + SIMD registers (64) + temporary storage (128) */ +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. +// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). +// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... 
+// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd + #include "../../../../common.h" #if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) @@ -426,7 +456,7 @@ qA20_l .req q2 .endm .text -.balign 8 +.balign 4 .type MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm), %function .global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm) MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm) diff --git a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c index e74fd8913..e26f1bf22 100644 --- a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c +++ b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c @@ -12,114 +12,19 @@ #include "fips202_native_armv81m.h" -/* - * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. - * TODO: Replace with optimized MVE assembly implementations - * (as a part of XORBytes and ExtractBytes) - */ - -/* Extract even-indexed bits from 64-bit value into lower 32 bits */ -static uint32_t bitinterleave_even(uint64_t x) -{ - uint64_t t; - t = x & 0x5555555555555555ULL; - t = (t | (t >> 1)) & 0x3333333333333333ULL; - t = (t | (t >> 2)) & 0x0f0f0f0f0f0f0f0fULL; - t = (t | (t >> 4)) & 0x00ff00ff00ff00ffULL; - t = (t | (t >> 8)) & 0x0000ffff0000ffffULL; - t = (t | (t >> 16)) & 0x00000000ffffffffULL; - return (uint32_t)t; -} - -/* Extract odd-indexed bits from 64-bit value into lower 32 bits */ -static uint32_t bitinterleave_odd(uint64_t x) -{ - return bitinterleave_even(x >> 1); -} - -/* Spread 32-bit value across even bit positions of 64-bit result */ -static uint64_t spread_even(uint32_t x) -{ - uint64_t t = x; - t = (t | (t << 16)) & 0x0000ffff0000ffffULL; - t = (t | (t << 8)) & 0x00ff00ff00ff00ffULL; - t = (t | (t << 4)) & 0x0f0f0f0f0f0f0f0fULL; - t = (t | (t << 2)) & 0x3333333333333333ULL; - t = (t | (t << 1)) & 0x5555555555555555ULL; - return t; -} - -/* Combine even and odd 32-bit halves into interleaved 64-bit value */ -static uint64_t bitdeinterleave(uint32_t even, uint32_t 
odd) -{ - return spread_even(even) | (spread_even(odd) << 1); -} /* - * TEMPORARY: Naive C interleaving functions. - * These will be replaced with optimized MVE assembly implementations. + * Keccak-f1600 x4 permutation (on bit-interleaved state) + * State is expected to already be in bit-interleaved format. */ -static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0, - const uint64_t *state1, const uint64_t *state2, - const uint64_t *state3) -{ - uint32_t *state_4xl = (uint32_t *)state_4x; - uint32_t *state_4xh = (uint32_t *)state_4x + 100; - - for (size_t i = 0; i < 25; i++) - { - state_4xl[i * 4 + 0] = bitinterleave_even(state0[i]); - state_4xl[i * 4 + 1] = bitinterleave_even(state1[i]); - state_4xl[i * 4 + 2] = bitinterleave_even(state2[i]); - state_4xl[i * 4 + 3] = bitinterleave_even(state3[i]); - - state_4xh[i * 4 + 0] = bitinterleave_odd(state0[i]); - state_4xh[i * 4 + 1] = bitinterleave_odd(state1[i]); - state_4xh[i * 4 + 2] = bitinterleave_odd(state2[i]); - state_4xh[i * 4 + 3] = bitinterleave_odd(state3[i]); - } -} - -static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0, - uint64_t *state1, uint64_t *state2, - uint64_t *state3) -{ - uint32_t *state_4xl = (uint32_t *)state_4x; - uint32_t *state_4xh = (uint32_t *)state_4x + 100; - - for (size_t i = 0; i < 25; i++) - { - state0[i] = bitdeinterleave(state_4xl[i * 4 + 0], state_4xh[i * 4 + 0]); - state1[i] = bitdeinterleave(state_4xl[i * 4 + 1], state_4xh[i * 4 + 1]); - state2[i] = bitdeinterleave(state_4xl[i * 4 + 2], state_4xh[i * 4 + 2]); - state3[i] = bitdeinterleave(state_4xl[i * 4 + 3], state_4xh[i * 4 + 3]); - } -} - #define mld_keccak_f1600_x4_native_impl \ MLD_NAMESPACE(keccak_f1600_x4_native_impl) int mld_keccak_f1600_x4_native_impl(uint64_t *state) { - /* - * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. 
- * TODO: Replace with optimized MVE assembly implementations - * (as a part of XORBytes and ExtractBytes) - */ - MLD_ALIGN uint64_t state_4x[100]; - MLD_ALIGN uint64_t state_4x_tmp[100]; - - /* Interleave the 4 states into bit-interleaved format */ - interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); - - /* Run the permutation */ - mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp, + MLD_ALIGN uint64_t state_tmp[100]; + mld_keccak_f1600_x4_mve_asm(state, state_tmp, mld_keccakf1600_round_constants); - - /* Deinterleave back to 4 separate states */ - deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); - - mld_zeroize(state_4x, sizeof(state_4x)); - mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp)); + mld_zeroize(state_tmp, sizeof(state_tmp)); return MLD_NATIVE_FUNC_SUCCESS; } diff --git a/dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S b/dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S new file mode 100644 index 000000000..f45f168ca --- /dev/null +++ b/dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S @@ -0,0 +1,333 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2026 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// --------------------------------------------------------------------------- +// Overview +// --------------------------------------------------------------------------- +// MVE/Helium implementation of KeccakF1600x4_StateExtractBytes +// (inverse of state_xor_bytes_x4_mve.S). +// +// void KeccakF1600x4_StateExtractBytes(state, d0, d1, d2, d3, offset, length) +// +// Reads 'length' bytes from the bit-interleaved Keccak state starting at +// byte 'offset', recombines the even and odd halves of each lane back +// into plain bytes, and writes them to four output buffers (d0..d3). 
+// +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. +// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). +// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... +// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd +// +// --------------------------------------------------------------------------- +// Three-phase structure +// --------------------------------------------------------------------------- +// Prologue -- if offset is not 8-byte aligned, extract +// min(length, 8-(offset%8)) bytes via predicated byte stores. +// Main -- process full 8-byte groups: load even/odd lane pair, +// de-interleave, scatter-store to output buffers. +// Tail -- extract remaining <8 bytes via predicated byte stores. 
+ +#include "../../../../common.h" +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.thumb +.syntax unified +.text + +// --------------------------------------------------------------------------- +// deinterleave_even: inverse of the even-bit compaction. Inflates the even +// half stored in \e back into byte positions. +// Inputs: \e (compacted even bits) +// Outputs: \e (even bits expanded to byte positions, odd bits are garbage) +// Clobbers: \tmp +// --------------------------------------------------------------------------- +.macro deinterleave_even e, tmp + // Inflate e +------+-----------+-----------+-----------+ + // Single-element annotation follows. e: | X | X | l[0],l[1] | l[2],l[3] | + vsli.u32 \e, \e, #8 // e: | X | l[0],l[1] | l[2],l[3] | l[2],l[3] | + vsli.u16 \e, \e, #4 // e: | l[0] | l[1],l[1] | l[3],l[2] | l[3],l[3] | + // +------+-----------+-----------+-----------+ + // Now, expand the lower bits e: | XXXX3210 | tmp: | XXXXXXXX | + vsli.u8 \e, \e, #1 // e: | XXX32100 | tmp: | XXXXXXXX | (start re-packing low bits) + vshr.u8 \tmp, \e, #3 // e: | XXX32100 | tmp: | XXXXXX32 | + vsli.u8 \e, \tmp, #4 // e: | XX322100 | tmp: | XXXXXX32 | (assemble nibbles) + vshr.u8 \tmp, \e, #5 // e: | XX322100 | tmp: | XXXXXXX3 | + vsli.u8 \e, \tmp, #6 // e: | X3322100 | tmp: | XXXXXXX3 | (finalize byte compaction) +.endm + +// --------------------------------------------------------------------------- +// from_bit_interleaving_4x: reconstruct byte vectors from bit-interleaved +// even/odd halves. 
+// Inputs: \qe = even half, \qo = odd half +// (per 32b lane: lo16 = bytes 0..3, hi16 = bytes 4..7) +// Outputs: \qe = [d0l, d1l, d2l, d3l] (low 4 bytes per instance) +// \qo = [d0h, d1h, d2h, d3h] (high 4 bytes per instance) +// Clobbers: \rtmp, \qt0, \qt1, \qt2 +// --------------------------------------------------------------------------- +.macro from_bit_interleaving_4x qe, qo, qt0, qt1, qt2, rtmp + // +------+------+------+------+------+------+------+------+ + // qe: | E0l | E0u | E1l | E1u | E2l | E2u | E3l | E3u | + // qo: | O0l | O0u | O1l | O1u | O2l | O2u | O3l | O3u | + // +------+------+------+------+------+------+------+------+ + // Clone and byte-swap to get upper halves into position + vrev32.u16 \qt0, \qe + vrev32.u16 \qt1, \qo + // De-interleave lower evens / lower odds + deinterleave_even \qe, \qt2 + deinterleave_even \qo, \qt2 + // qe and qo now hold valid even-position bits but garbage in odd positions. + // Build mask 0x55..55 (01010101b) to isolate even-bit positions, then + // shift the odd half left by 1 and OR to reconstruct the original bytes. + mov \rtmp, #0x55 + vdup.u8 \qt2, \rtmp + vand.u32 \qe, \qe, \qt2 + vand.u32 \qo, \qo, \qt2 + vshl.u32 \qo, \qo, #1 + vorr \qe, \qe, \qo // qe = low bytes reconstructed + // De-interleave upper evens / upper odds + deinterleave_even \qt0, \qo + deinterleave_even \qt1, \qo + vand.u32 \qo, \qt0, \qt2 // reuse mask still in qt2 + vand.u32 \qt1, \qt1, \qt2 + vshl.u32 \qt1, \qt1, #1 + vorr \qo, \qo, \qt1 // qo = high bytes reconstructed +.endm + +// --------------------------------------------------------------------------- +// transpose_lanes_to_streams: rearrange two lane-ordered vectors into four +// per-instance vectors (inverse of transpose_streams_to_lanes). +// q0 = [d0l, d1l, d2l, d3l] -> q0 = [d0l, d0h, ?, ?] +// q1 = [d0h, d1h, d2h, d3h] -> q1 = [d1l, d1h, ?, ?] +// q2 = [d2l, d2h, ?, ?] +// q3 = [d3l, d3h, ?, ?] 
+// Clobbers: q2, q3, p0, r0 +// +// Vectors: || q0 || q1 || q2 || q3 || +// Elements: || d0l | d1l | d2l | d3l || d0h | d1h | d2h | d3h || || || +// --------------------------------------------------------------------------- +.macro transpose_lanes_to_streams + vrev64.u32 q2, q0 // || d0l | d1l | d2l | d3l || d0h | d1h | d2h | d3h || d1l | d0l | d3l | d2l || || + vrev64.u32 q3, q1 // || d0l | d1l | d2l | d3l || d0h | d1h | d2h | d3h || d1l | d0l | d3l | d2l || d1h | d0h | d3h | d2h || + mov r0, #0x0F0F + vmsr p0, r0 + vpsel q0, q0, q3 // || d0l | d0h | d2l | d2h || d0h | d1h | d2h | d3h || d1l | d0l | d3l | d2l || d1h | d0h | d3h | d2h || + vpsel q1, q2, q1 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d1l | d0l | d3l | d2l || d1h | d0h | d3h | d2h || + vmov d4, d1 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d2l | d2h | d3l | d2l || d1h | d0h | d3h | d2h || + vmov d6, d3 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d2l | d2h | d3l | d2l || d3l | d3h | d3h | d2h || +.endm + + +// --------------------------------------------------------------------------- +// void keccak_f1600_x4_state_extract_bytes_asm(void *state, +// unsigned char *data0, +// unsigned char *data1, +// unsigned char *data2, +// unsigned char *data3, +// unsigned offset, +// unsigned length) +// +// AAPCS: r0=state, r1=d0, r2=d1, r3=d2, stack: d3, offset, length +// --------------------------------------------------------------------------- +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +.type MLD_ASM_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm), %function +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_state_extract_bytes_asm) + .equ stack_offset, ((12-4+2)*4+(15-8+1)*8) + push {r4-r12, lr} + vpush {d8-d15} + + state .req r0 + dp0 .req r1 + dp1 .req r2 + dp2 .req r3 + dp3 .req r4 + off .req r5 + length .req r6 + rSO .req r7 + rSE .req r8 + lane_offset_bytes .req r9 + off_full .req r10 + mask .req r11 + tmp .req r12 + nB .req lr + + qP .req 
q7 + qd0 .req q0 + qd1 .req q1 + qd2 .req q2 + qd3 .req q3 + + ldr dp3, [sp, #stack_offset+0] + ldr off_full, [sp, #stack_offset+4] + ldr length, [sp, #stack_offset+8] + + cmp length, #0 + beq keccak_f1600_x4_state_extract_bytes_asm_exit + + and off, off_full, #7 + bic lane_offset_bytes, off_full, #7 + + add rSE, state, lane_offset_bytes, lsl #1 + add rSO, rSE, #400 + + // ----------------------------------------------------------------------- + // PROLOGUE: extract min(len, 8-offset%8) bytes from the unaligned lane + // ----------------------------------------------------------------------- + cmp off, #0 + beq keccak_f1600_x4_state_extract_bytes_asm_pre_main + + // Load even/odd halves of one lane from state (post-increment rSE/rSO by 16) + vldrw.u32 qd0, [rSE], #16 + vldrw.u32 qd1, [rSO], #16 + + // De-interleave (clobbers r0, q2, q3, q4) + from_bit_interleaving_4x q0, q1, q2, q3, q4, r0 + + // Transpose from per-lane to per-instance layout + transpose_lanes_to_streams + + // nB = min(length, 8 - off) + rsb nB, off, #8 + cmp length, nB + it ls + movls nB, length + + // Build predicate: nB active bytes shifted left by 'off' + vctp.8 nB + vmrs mask, p0 + lsl mask, mask, off + vmsr p0, mask + + // Subtract offset from data pointers so predicate window aligns + subs dp0, dp0, off + subs dp1, dp1, off + subs dp2, dp2, off + subs dp3, dp3, off + + // Predicated byte stores (post-increment by 4) + vpstttt + vstrbt.u8 qd0, [dp0], #4 + vstrbt.u8 qd1, [dp1], #4 + vstrbt.u8 qd2, [dp2], #4 + vstrbt.u8 qd3, [dp3], #4 + + subs length, length, nB + cmp length, #0 + beq keccak_f1600_x4_state_extract_bytes_asm_exit + + // Build qP from updated scalar pointers + vmov qP[2], qP[0], dp0, dp2 + vmov qP[3], qP[1], dp1, dp3 + b keccak_f1600_x4_state_extract_bytes_asm_main_body + +keccak_f1600_x4_state_extract_bytes_asm_pre_main: + vmov qP[2], qP[0], dp0, dp2 + vmov qP[3], qP[1], dp1, dp3 + mov tmp, #4 + vsub.u32 qP, qP, tmp + + // 
----------------------------------------------------------------------- + // MAIN: process full 8-byte lanes + // ----------------------------------------------------------------------- +keccak_f1600_x4_state_extract_bytes_asm_main_body: + lsr lr, length, #3 + wls lr, lr, keccak_f1600_x4_state_extract_bytes_asm_main_loop_end +keccak_f1600_x4_state_extract_bytes_asm_main_loop_start: + vldrw.u32 qd0, [rSE], #16 + vldrw.u32 qd1, [rSO], #16 + + // De-interleave (clobbers r0, q2, q3, q4) + from_bit_interleaving_4x q0, q1, q2, q3, q4, r0 + + // Scatter-store 8 bytes per instance (two u32 stores with pre-increment writeback) + vstrw.u32 qd0, [qP, #4]! + vstrw.u32 qd1, [qP, #4]! + + le lr, keccak_f1600_x4_state_extract_bytes_asm_main_loop_start +keccak_f1600_x4_state_extract_bytes_asm_main_loop_end: + + // ----------------------------------------------------------------------- + // TAIL: extract <8 remaining bytes at lane offset 0 + // ----------------------------------------------------------------------- + ands length, length, #7 + beq keccak_f1600_x4_state_extract_bytes_asm_exit + + // Recover scalar pointers from qP + mov tmp, #4 + vadd.u32 qP, qP, tmp + vmov dp0, dp2, qP[2], qP[0] + vmov dp1, dp3, qP[3], qP[1] + + vldrw.u32 qd0, [rSE], #16 + vldrw.u32 qd1, [rSO], #16 + + // De-interleave (clobbers r0, q2, q3, q4) + from_bit_interleaving_4x q0, q1, q2, q3, q4, r0 + + // Transpose from per-lane to per-instance layout + transpose_lanes_to_streams + + // Predicated byte stores for remaining bytes + vctp.8 length + vpstttt + vstrbt.u8 qd0, [dp0], #4 + vstrbt.u8 qd1, [dp1], #4 + vstrbt.u8 qd2, [dp2], #4 + vstrbt.u8 qd3, [dp3], #4 + +keccak_f1600_x4_state_extract_bytes_asm_exit: + vpop {d8-d15} + pop {r4-r12, pc} + .unreq state + .unreq dp0 + .unreq dp1 + .unreq dp2 + .unreq dp3 + .unreq off + .unreq length + .unreq rSO + .unreq rSE + .unreq lane_offset_bytes + .unreq off_full + .unreq mask + .unreq tmp + .unreq nB + .unreq qP + .unreq qd0 + .unreq qd1 + .unreq qd2 + .unreq qd3 + +/* 
simpasm: footer-start */ +#endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/fips202/armv81m/src/state_xor_bytes_x4_mve.S b/dev/fips202/armv81m/src/state_xor_bytes_x4_mve.S new file mode 100644 index 000000000..cf343ee0e --- /dev/null +++ b/dev/fips202/armv81m/src/state_xor_bytes_x4_mve.S @@ -0,0 +1,349 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2026 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// --------------------------------------------------------------------------- +// Overview +// --------------------------------------------------------------------------- +// MVE/Helium implementation of KeccakF1600x4_StateXORBytes. +// +// void KeccakF1600x4_StateXORBytes(state, d0, d1, d2, d3, offset, length) +// +// Reads 'length' plain bytes from each of four input buffers (d0..d3), +// splits every byte into its even and odd bits (bit-interleaving), and +// XORs the result into the Keccak state starting at byte 'offset'. +// +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. 
+// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). +// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... +// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd +// +// --------------------------------------------------------------------------- +// Three-phase structure +// --------------------------------------------------------------------------- +// Prologue -- if offset is not 8-byte aligned, absorb +// min(length, 8-(offset%8)) bytes via predicated byte loads. +// Main -- process full 8-byte groups via word-level gather loads, +// bit-interleave, then VEOR into even/odd state halves. +// Tail -- absorb remaining <8 bytes via predicated byte loads. + +#include "../../../../common.h" +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.thumb +.syntax unified +.text + +// --------------------------------------------------------------------------- +// Interleave macros +// --------------------------------------------------------------------------- + +// interleave_odds: in-place SWAR bit permutation that compacts odd-numbered +// bits of each byte/halfword/word in \t toward the upper half, preparing the +// odd half of the bit-interleaved representation. 
+// Inputs: \t (data) +// Outputs: \t (odd bits compacted; per 32b lane: lo16=bytes 0..3, hi16=4..7) +// Clobbers: \u +.macro interleave_odds t, u + vshl.u8 \u, \t, #2 // u = t[5..0],00 + vsri.u8 \t, \u, #1 // t = t[7],u[6..0] => t = t[7],t[5..0],0 + vshl.u8 \u, \t, #3 // stage 2 across nibbles + vsri.u8 \t, \u, #2 + vshl.u8 \u, \t, #4 // stage 3 across bytes + vsri.u8 \t, \u, #3 + vshl.u16 \u, \t, #8 // widen within halfwords + vsri.u8 \t, \u, #4 + vshl.u32 \u, \t, #16 // widen within words + vsri.u16 \t, \u, #8 +.endm + +// interleave_evens: in-place SWAR bit permutation that compacts even-numbered +// bits of each byte/halfword/word in \t toward the lower half, preparing the +// even half of the bit-interleaved representation. +// Inputs: \t (data) +// Outputs: \t (even bits compacted; per 32b lane: lo16=bytes 0..3, hi16=4..7) +// Clobbers: \u +.macro interleave_evens t, u + vshr.u8 \u, \t, #2 // stage 1 within bytes + vsli.u8 \t, \u, #1 // t = ((t >> 1) & 0x7E7E7E7E) | (t & 0x01010101) + vshr.u8 \u, \t, #3 // stage 2 within nibbles + vsli.u8 \t, \u, #2 // t = ((t >> 2) & 0x1C1C1C1C) | (t & 0x03030303) + vshr.u8 \u, \t, #4 // stage 3 across bytes + vsli.u8 \t, \u, #3 // t = ((t >> 3) & 0x08080808) | (t & 0x07070707) + vshr.u16 \u, \t, #8 // widen within halfwords + vsli.u8 \t, \u, #4 // t = ((t >> 4) & 0x00F000F0) | (t & 0x000F000F) + vshr.u32 \u, \t, #16 // widen within words + vsli.u16 \t, \u, #8 // t = ((t >> 8) & 0x0000FF00) | (t & 0x000000FF) +.endm + +// --------------------------------------------------------------------------- +// to_bit_interleaving_4x: split \qe/\qo (low/high 4 bytes per instance) +// into even and odd halves packed as: +// \qe = even half (lo16: bytes 0..3, hi16: bytes 4..7) +// \qo = odd half (lo16: bytes 0..3, hi16: bytes 4..7) +// Inputs: \qe = [d0l, d1l, d2l, d3l], \qo = [d0h, d1h, d2h, d3h] +// Outputs: \qe (even half), \qo (odd half) +// Clobbers: \qt0, \qt1, \qt2 +// 
--------------------------------------------------------------------------- +.macro to_bit_interleaving_4x qe, qo, qt0, qt1, qt2 + vmov \qt0, \qe + vmov \qt1, \qo + interleave_evens \qe, \qt2 // pack even bits in qe (low 16: d?l, high 16: d?h) + interleave_evens \qt1, \qt2 // pack even bits from the high-half vector + vsli.32 \qe, \qt1, #16 // merge: qe = [even(lo16), even(hi16)] + interleave_odds \qt0, \qt2 // pack odd bits from original qe + interleave_odds \qo, \qt2 // pack odd bits from original qo + vsri.32 \qo, \qt0, #16 // merge: qo = [odd(lo16), odd(hi16)] +.endm + +// --------------------------------------------------------------------------- +// transpose_streams_to_lanes: rearrange four per-instance vectors (q0..q3, +// each holding 8 bytes in its low 64 bits) into two vectors: +// q0 = [d0l, d1l, d2l, d3l] (low 4 bytes of each instance) +// q1 = [d0h, d1h, d2h, d3h] (high 4 bytes of each instance) +// Clobbers: q2, q3, p0, r0 +// +// Vectors: || q0 || q1 || q2 || q3 || +// Elements: || d0l | d0h | 0 | 0 || d1l | d1h | 0 | 0 || d2l | d2h | 0 | 0 || d3l | d3h | 0 | 0 || +// --------------------------------------------------------------------------- +.macro transpose_streams_to_lanes + vmov d1, d4 // || d0l | d0h | d2l | d2h || d1l | d1h | 0 | 0 || d2l | d2h | 0 | 0 || d3l | d3h | 0 | 0 || + vmov d3, d6 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d2l | d2h | 0 | 0 || d3l | d3h | 0 | 0 || + vrev64.u32 q2, q0 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d0h | d0l | d2h | d2l || d3l | d3h | 0 | 0 || + vrev64.u32 q3, q1 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d0h | d0l | d2h | d2l || d1h | d1l | d3h | d3l || + mov r0, #0x0F0F // predicate: select lower 4 bytes within each 64b half + vmsr p0, r0 + vpsel q0, q0, q3 // q0 = [d0l, d1l, d2l, d3l] + vpsel q1, q2, q1 // q1 = [d0h, d1h, d2h, d3h] +.endm + +// --------------------------------------------------------------------------- +// xor_lane_and_store_postinc: XOR one lane 
into state with post-increment. +// rSE / rSO are current pointers to the even / odd state halves. +// --------------------------------------------------------------------------- +.macro xor_lane_and_store_postinc qE, qO, qS0, qS1, rSE, rSO + vldrw.u32 \qS0, [\rSE] // load 16B from even half + vldrw.u32 \qS1, [\rSO] // load 16B from odd half + veor.u32 \qS0, \qS0, \qE + veor.u32 \qS1, \qS1, \qO + vstrw.u32 \qS0, [\rSE], #16 // post-inc by 16 bytes + vstrw.u32 \qS1, [\rSO], #16 +.endm + + +// --------------------------------------------------------------------------- +// void keccak_f1600_x4_state_xor_bytes_asm(void *state, +// const unsigned char *data0, +// const unsigned char *data1, +// const unsigned char *data2, +// const unsigned char *data3, +// unsigned offset, unsigned length) +// +// AAPCS: r0=state, r1=d0, r2=d1, r3=d2, stack: d3, offset, length +// --------------------------------------------------------------------------- + +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +.type MLD_ASM_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm), %function +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_state_xor_bytes_asm) + .equ stack_offset, ((12-4+2)*4+(15-8+1)*8) + push {r4-r12, lr} + vpush {d8-d15} + + state .req r0 + dp0 .req r1 + dp1 .req r2 + dp2 .req r3 + dp3 .req r4 + off .req r5 + length .req r6 + rSO .req r7 + rSE .req r8 + lane_offset_bytes .req r9 + off_full .req r10 + mask .req r11 + tmp .req r0 + nB .req lr + + qP .req q7 + + qd0 .req q0 + qd1 .req q1 + qd2 .req q2 + qd3 .req q3 + + qS0 .req q4 + qS1 .req q5 + + ldr dp3, [sp, #stack_offset+0] + ldr off_full, [sp, #stack_offset+4] + ldr length, [sp, #stack_offset+8] + + cmp length, #0 + beq keccak_f1600_x4_state_xor_bytes_asm_exit + + and off, off_full, #7 + bic lane_offset_bytes, off_full, #7 + + add rSE, state, lane_offset_bytes, lsl #1 + add rSO, rSE, #400 + + // ----------------------------------------------------------------------- + // PROLOGUE: absorb min(len, 8-offset%8) bytes 
at the unaligned position + // ----------------------------------------------------------------------- + cmp off, #0 + beq keccak_f1600_x4_state_xor_bytes_asm_pre_main + + // Subtract offset from data pointers so predicate window aligns + subs dp0, dp0, off + subs dp1, dp1, off + subs dp2, dp2, off + subs dp3, dp3, off + + // nB = min(length, 8 - off) + rsb nB, off, #8 + cmp length, nB + it ls + movls nB, length + subs length, length, nB + + // Build predicate: nB active bytes shifted left by 'off' + vctp.8 nB + vmrs mask, p0 + lsl mask, mask, off + vmsr p0, mask + + // Predicated byte loads (4 bytes per instance, post-increment by 4) + vpstttt + vldrbt.u8 qd0, [dp0], #4 + vldrbt.u8 qd1, [dp1], #4 + vldrbt.u8 qd2, [dp2], #4 + vldrbt.u8 qd3, [dp3], #4 + + // Transpose from per-instance layout to per-lane layout + transpose_streams_to_lanes + + // Bit-interleave (clobbers q2, q3, q4) + to_bit_interleaving_4x q0, q1, q2, q3, q4 + + // XOR into state (post-increments rSE/rSO by 16) + xor_lane_and_store_postinc q0, q1, qS0, qS1, rSE, rSO + + // Build qP = {d0,d1,d2,d3} as u32 lanes for gather loads + vmov qP[2], qP[0], dp0, dp2 + vmov qP[3], qP[1], dp1, dp3 + cmp length, #0 + beq keccak_f1600_x4_state_xor_bytes_asm_exit + b keccak_f1600_x4_state_xor_bytes_asm_main_body + +keccak_f1600_x4_state_xor_bytes_asm_pre_main: + vmov qP[2], qP[0], dp0, dp2 + vmov qP[3], qP[1], dp1, dp3 + mov tmp, #4 + vsub.u32 qP, qP, tmp // pre-bias so first [qP,#4]! lands at original ptr + +keccak_f1600_x4_state_xor_bytes_asm_main_body: + // ----------------------------------------------------------------------- + // MAIN: process full 8-byte lanes + // ----------------------------------------------------------------------- + lsr lr, length, #3 + wls lr, lr, keccak_f1600_x4_state_xor_bytes_asm_main_loop_end +keccak_f1600_x4_state_xor_bytes_asm_main_loop_start: + // Gather 8 bytes per instance (two u32 loads with pre-increment writeback) + vldrw.u32 qd0, [qP, #4]! + vldrw.u32 qd1, [qP, #4]! 
+ + // Bit-interleave (clobbers q2, q3, q4) + to_bit_interleaving_4x q0, q1, q2, q3, q4 + + // XOR into state (post-increments rSE/rSO by 16) + xor_lane_and_store_postinc q0, q1, qS0, qS1, rSE, rSO + + le lr, keccak_f1600_x4_state_xor_bytes_asm_main_loop_start +keccak_f1600_x4_state_xor_bytes_asm_main_loop_end: + + // ----------------------------------------------------------------------- + // TAIL: absorb <8 remaining bytes at lane offset 0 + // ----------------------------------------------------------------------- + ands length, length, #7 + beq keccak_f1600_x4_state_xor_bytes_asm_exit + + // Recover scalar pointers from qP + mov tmp, #4 + vadd.u32 qP, qP, tmp + vmov dp0, dp2, qP[2], qP[0] + vmov dp1, dp3, qP[3], qP[1] + + vctp.8 length + vpstttt + vldrbt.u8 qd0, [dp0] + vldrbt.u8 qd1, [dp1] + vldrbt.u8 qd2, [dp2] + vldrbt.u8 qd3, [dp3] + + // Transpose from per-instance layout to per-lane layout + transpose_streams_to_lanes + + // Bit-interleave (clobbers q2, q3, q4) + to_bit_interleaving_4x q0, q1, q2, q3, q4 + + // XOR into state (post-increments rSE/rSO by 16) + xor_lane_and_store_postinc qd0, qd1, qS0, qS1, rSE, rSO + +keccak_f1600_x4_state_xor_bytes_asm_exit: + vpop {d8-d15} + pop {r4-r12, pc} + .unreq state + .unreq dp0 + .unreq dp1 + .unreq dp2 + .unreq dp3 + .unreq off + .unreq length + .unreq rSO + .unreq rSE + .unreq lane_offset_bytes + .unreq off_full + .unreq mask + .unreq tmp + .unreq nB + .unreq qP + .unreq qd0 + .unreq qd1 + .unreq qd2 + .unreq qd3 + .unreq qS0 + .unreq qS1 + +/* simpasm: footer-start */ +#endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index 4e02f0d7b..e4c74a4a4 100644 --- a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -586,11 +586,17 @@ #undef MLD_FIPS202_ARMV81M_NEED_X4 #undef MLD_FIPS202_NATIVE_ARMV81M #undef MLD_FIPS202_NATIVE_ARMV81M_MVE_H +#undef MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE #undef MLD_USE_FIPS202_X4_NATIVE +#undef 
MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE #undef mld_keccak_f1600_x4_native_impl +#undef mld_keccak_f1600_x4_state_extract_bytes +#undef mld_keccak_f1600_x4_state_xor_bytes /* mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h */ #undef MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H #undef mld_keccak_f1600_x4_mve_asm +#undef mld_keccak_f1600_x4_state_extract_bytes_asm +#undef mld_keccak_f1600_x4_state_xor_bytes_asm #undef mld_keccakf1600_round_constants #endif /* MLD_SYS_ARMV81M_MVE */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S index cee9460ab..270e6009f 100644 --- a/mldsa/mldsa_native_asm.S +++ b/mldsa/mldsa_native_asm.S @@ -101,6 +101,8 @@ #endif #if defined(MLD_SYS_ARMV81M_MVE) #include "src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S" +#include "src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S" +#include "src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S" #endif #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ @@ -589,11 +591,17 @@ #undef MLD_FIPS202_ARMV81M_NEED_X4 #undef MLD_FIPS202_NATIVE_ARMV81M #undef MLD_FIPS202_NATIVE_ARMV81M_MVE_H +#undef MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE #undef MLD_USE_FIPS202_X4_NATIVE +#undef MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE #undef mld_keccak_f1600_x4_native_impl +#undef mld_keccak_f1600_x4_state_extract_bytes +#undef mld_keccak_f1600_x4_state_xor_bytes /* mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h */ #undef MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H #undef mld_keccak_f1600_x4_mve_asm +#undef mld_keccak_f1600_x4_state_extract_bytes_asm +#undef mld_keccak_f1600_x4_state_xor_bytes_asm #undef mld_keccakf1600_round_constants #endif /* MLD_SYS_ARMV81M_MVE */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ diff --git a/mldsa/src/fips202/keccakf1600.c b/mldsa/src/fips202/keccakf1600.c index 0aec7b30c..83f7aebbb 100644 --- a/mldsa/src/fips202/keccakf1600.c +++ 
b/mldsa/src/fips202/keccakf1600.c @@ -80,11 +80,12 @@ void mld_keccakf1600_xor_bytes(uint64_t *state, const unsigned char *data, #endif /* !MLD_SYS_LITTLE_ENDIAN */ } -MLD_INTERNAL_API -void mld_keccakf1600x4_extract_bytes(uint64_t *state, unsigned char *data0, - unsigned char *data1, unsigned char *data2, - unsigned char *data3, unsigned offset, - unsigned length) +static void mld_keccakf1600x4_extract_bytes_c(uint64_t *state, + unsigned char *data0, + unsigned char *data1, + unsigned char *data2, + unsigned char *data3, + unsigned offset, unsigned length) { mld_keccakf1600_extract_bytes(state + MLD_KECCAK_LANES * 0, data0, offset, length); @@ -97,11 +98,29 @@ void mld_keccakf1600x4_extract_bytes(uint64_t *state, unsigned char *data0, } MLD_INTERNAL_API -void mld_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned char *data0, - const unsigned char *data1, - const unsigned char *data2, - const unsigned char *data3, unsigned offset, - unsigned length) +void mld_keccakf1600x4_extract_bytes(uint64_t *state, unsigned char *data0, + unsigned char *data1, unsigned char *data2, + unsigned char *data3, unsigned offset, + unsigned length) +{ +#if defined(MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE) + if (mld_keccakf1600_extract_bytes_x4_native(state, data0, data1, data2, data3, + offset, length) == + MLD_NATIVE_FUNC_SUCCESS) + { + return; + } +#endif /* MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE */ + mld_keccakf1600x4_extract_bytes_c(state, data0, data1, data2, data3, offset, + length); +} + +static void mld_keccakf1600x4_xor_bytes_c(uint64_t *state, + const unsigned char *data0, + const unsigned char *data1, + const unsigned char *data2, + const unsigned char *data3, + unsigned offset, unsigned length) { mld_keccakf1600_xor_bytes(state + MLD_KECCAK_LANES * 0, data0, offset, length); @@ -113,6 +132,25 @@ void mld_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned char *data0, length); } +MLD_INTERNAL_API +void mld_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned 
char *data0, + const unsigned char *data1, + const unsigned char *data2, + const unsigned char *data3, unsigned offset, + unsigned length) +{ +#if defined(MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE) + if (mld_keccakf1600_xor_bytes_x4_native(state, data0, data1, data2, data3, + offset, + length) == MLD_NATIVE_FUNC_SUCCESS) + { + return; + } +#endif /* MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE */ + mld_keccakf1600x4_xor_bytes_c(state, data0, data1, data2, data3, offset, + length); +} + MLD_INTERNAL_API void mld_keccakf1600x4_permute(uint64_t *state) { diff --git a/mldsa/src/fips202/native/armv81m/mve.h b/mldsa/src/fips202/native/armv81m/mve.h index d0ab2be78..3d5e6f7db 100644 --- a/mldsa/src/fips202/native/armv81m/mve.h +++ b/mldsa/src/fips202/native/armv81m/mve.h @@ -11,12 +11,18 @@ /* Part of backend API */ #define MLD_USE_FIPS202_X4_NATIVE +#define MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE +#define MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE /* Guard for assembly file */ #define MLD_FIPS202_ARMV81M_NEED_X4 #if !defined(__ASSEMBLER__) #include "../api.h" +/* + * Native x4 permutation + * State is kept in bit-interleaved format. 
+ */ #define mld_keccak_f1600_x4_native_impl \ MLD_NAMESPACE(keccak_f1600_x4_native_impl) int mld_keccak_f1600_x4_native_impl(uint64_t *state); @@ -27,6 +33,48 @@ static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state) return mld_keccak_f1600_x4_native_impl(state); } +/* + * Native x4 XOR bytes (with on-the-fly bit interleaving) + */ +#define mld_keccak_f1600_x4_state_xor_bytes \ + MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +void mld_keccak_f1600_x4_state_xor_bytes(void *state, const uint8_t *data0, + const uint8_t *data1, + const uint8_t *data2, + const uint8_t *data3, unsigned offset, + unsigned length); + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_xor_bytes_x4_native( + uint64_t *state, const uint8_t *data0, const uint8_t *data1, + const uint8_t *data2, const uint8_t *data3, unsigned offset, + unsigned length) +{ + mld_keccak_f1600_x4_state_xor_bytes(state, data0, data1, data2, data3, offset, + length); + return MLD_NATIVE_FUNC_SUCCESS; +} + +/* + * Native x4 extract bytes (with on-the-fly bit de-interleaving) + */ +#define mld_keccak_f1600_x4_state_extract_bytes \ + MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +void mld_keccak_f1600_x4_state_extract_bytes(void *state, uint8_t *data0, + uint8_t *data1, uint8_t *data2, + uint8_t *data3, unsigned offset, + unsigned length); + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_extract_bytes_x4_native( + uint64_t *state, uint8_t *data0, uint8_t *data1, uint8_t *data2, + uint8_t *data3, unsigned offset, unsigned length) +{ + mld_keccak_f1600_x4_state_extract_bytes(state, data0, data1, data2, data3, + offset, length); + return MLD_NATIVE_FUNC_SUCCESS; +} + #endif /* !__ASSEMBLER__ */ #endif /* !MLD_FIPS202_NATIVE_ARMV81M_MVE_H */ diff --git a/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h b/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h index dee44842d..779fd9304 100644 --- 
a/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h +++ b/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h @@ -17,4 +17,20 @@ extern const uint32_t mld_keccakf1600_round_constants[48]; void mld_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100], const uint32_t rc[48]); +#define mld_keccak_f1600_x4_state_xor_bytes_asm \ + MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +void mld_keccak_f1600_x4_state_xor_bytes_asm(void *state, const uint8_t *d0, + const uint8_t *d1, + const uint8_t *d2, + const uint8_t *d3, unsigned offset, + unsigned length); + +#define mld_keccak_f1600_x4_state_extract_bytes_asm \ + MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +void mld_keccak_f1600_x4_state_extract_bytes_asm(void *state, uint8_t *data0, + uint8_t *data1, uint8_t *data2, + uint8_t *data3, + unsigned offset, + unsigned length); + #endif /* !MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */ diff --git a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S index 557c9136c..4aca4a354 100644 --- a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +++ b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S @@ -7,7 +7,7 @@ /*yaml Name: keccak_f1600_x4_mve_asm - Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state + Description: Armv8.1-M MVE implementation of batched (x4) Keccak-f[1600] permutation using bit-interleaved state Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc) ABI: r0: @@ -15,7 +15,7 @@ size_bytes: 800 permissions: read/write c_parameter: void *state - description: Four bit-interleaved Keccak states (low halves followed by high halves) + description: Bit-interleaved state for 4 Keccak instances (even halves followed by odd halves) r1: type: buffer size_bytes: 800 @@ -33,6 +33,36 @@ description: register preservation (44) + SIMD 
registers (64) + temporary storage (128) */ +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. +// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). +// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... 
+// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd + #include "../../../../common.h" #if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) diff --git a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c index e74fd8913..e26f1bf22 100644 --- a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +++ b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c @@ -12,114 +12,19 @@ #include "fips202_native_armv81m.h" -/* - * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. - * TODO: Replace with optimized MVE assembly implementations - * (as a part of XORBytes and ExtractBytes) - */ - -/* Extract even-indexed bits from 64-bit value into lower 32 bits */ -static uint32_t bitinterleave_even(uint64_t x) -{ - uint64_t t; - t = x & 0x5555555555555555ULL; - t = (t | (t >> 1)) & 0x3333333333333333ULL; - t = (t | (t >> 2)) & 0x0f0f0f0f0f0f0f0fULL; - t = (t | (t >> 4)) & 0x00ff00ff00ff00ffULL; - t = (t | (t >> 8)) & 0x0000ffff0000ffffULL; - t = (t | (t >> 16)) & 0x00000000ffffffffULL; - return (uint32_t)t; -} - -/* Extract odd-indexed bits from 64-bit value into lower 32 bits */ -static uint32_t bitinterleave_odd(uint64_t x) -{ - return bitinterleave_even(x >> 1); -} - -/* Spread 32-bit value across even bit positions of 64-bit result */ -static uint64_t spread_even(uint32_t x) -{ - uint64_t t = x; - t = (t | (t << 16)) & 0x0000ffff0000ffffULL; - t = (t | (t << 8)) & 0x00ff00ff00ff00ffULL; - t = (t | (t << 4)) & 0x0f0f0f0f0f0f0f0fULL; - t = (t | (t << 2)) & 0x3333333333333333ULL; - t = (t | (t << 1)) & 0x5555555555555555ULL; - return t; -} - -/* Combine even and odd 32-bit halves into interleaved 64-bit value */ -static uint64_t bitdeinterleave(uint32_t even, uint32_t odd) -{ - return spread_even(even) | (spread_even(odd) << 1); -} /* - * TEMPORARY: Naive C interleaving functions. 
- * These will be replaced with optimized MVE assembly implementations. + * Keccak-f1600 x4 permutation (on bit-interleaved state) + * State is expected to already be in bit-interleaved format. */ -static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0, - const uint64_t *state1, const uint64_t *state2, - const uint64_t *state3) -{ - uint32_t *state_4xl = (uint32_t *)state_4x; - uint32_t *state_4xh = (uint32_t *)state_4x + 100; - - for (size_t i = 0; i < 25; i++) - { - state_4xl[i * 4 + 0] = bitinterleave_even(state0[i]); - state_4xl[i * 4 + 1] = bitinterleave_even(state1[i]); - state_4xl[i * 4 + 2] = bitinterleave_even(state2[i]); - state_4xl[i * 4 + 3] = bitinterleave_even(state3[i]); - - state_4xh[i * 4 + 0] = bitinterleave_odd(state0[i]); - state_4xh[i * 4 + 1] = bitinterleave_odd(state1[i]); - state_4xh[i * 4 + 2] = bitinterleave_odd(state2[i]); - state_4xh[i * 4 + 3] = bitinterleave_odd(state3[i]); - } -} - -static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0, - uint64_t *state1, uint64_t *state2, - uint64_t *state3) -{ - uint32_t *state_4xl = (uint32_t *)state_4x; - uint32_t *state_4xh = (uint32_t *)state_4x + 100; - - for (size_t i = 0; i < 25; i++) - { - state0[i] = bitdeinterleave(state_4xl[i * 4 + 0], state_4xh[i * 4 + 0]); - state1[i] = bitdeinterleave(state_4xl[i * 4 + 1], state_4xh[i * 4 + 1]); - state2[i] = bitdeinterleave(state_4xl[i * 4 + 2], state_4xh[i * 4 + 2]); - state3[i] = bitdeinterleave(state_4xl[i * 4 + 3], state_4xh[i * 4 + 3]); - } -} - #define mld_keccak_f1600_x4_native_impl \ MLD_NAMESPACE(keccak_f1600_x4_native_impl) int mld_keccak_f1600_x4_native_impl(uint64_t *state) { - /* - * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. 
- * TODO: Replace with optimized MVE assembly implementations - * (as a part of XORBytes and ExtractBytes) - */ - MLD_ALIGN uint64_t state_4x[100]; - MLD_ALIGN uint64_t state_4x_tmp[100]; - - /* Interleave the 4 states into bit-interleaved format */ - interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); - - /* Run the permutation */ - mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp, + MLD_ALIGN uint64_t state_tmp[100]; + mld_keccak_f1600_x4_mve_asm(state, state_tmp, mld_keccakf1600_round_constants); - - /* Deinterleave back to 4 separate states */ - deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); - - mld_zeroize(state_4x, sizeof(state_4x)); - mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp)); + mld_zeroize(state_tmp, sizeof(state_tmp)); return MLD_NATIVE_FUNC_SUCCESS; } diff --git a/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S b/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S new file mode 100644 index 000000000..8a2bd9dc7 --- /dev/null +++ b/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S @@ -0,0 +1,290 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2026 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// --------------------------------------------------------------------------- +// Overview +// --------------------------------------------------------------------------- +// MVE/Helium implementation of KeccakF1600x4_StateExtractBytes +// (inverse of state_xor_bytes_x4_mve.S). +// +// void KeccakF1600x4_StateExtractBytes(state, d0, d1, d2, d3, offset, length) +// +// Reads 'length' bytes from the bit-interleaved Keccak state starting at +// byte 'offset', recombines the even and odd halves of each lane back +// into plain bytes, and writes them to four output buffers (d0..d3). 
+// +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. +// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). +// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... +// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd +// +// --------------------------------------------------------------------------- +// Three-phase structure +// --------------------------------------------------------------------------- +// Prologue -- if offset is not 8-byte aligned, extract +// min(length, 8-(offset%8)) bytes via predicated byte stores. +// Main -- process full 8-byte groups: load even/odd lane pair, +// de-interleave, scatter-store to output buffers. +// Tail -- extract remaining <8 bytes via predicated byte stores. 
+ +#include "../../../../common.h" +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S using scripts/simpasm. Do not modify it directly. + */ + +.thumb +.syntax unified + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_state_extract_bytes_asm) + + push.w {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + vpush {d8, d9, d10, d11, d12, d13, d14, d15} + ldr r4, [sp, #0x68] + ldr.w r10, [sp, #0x6c] + ldr r6, [sp, #0x70] + cmp r6, #0x0 + beq.w keccak_f1600_x4_state_extract_bytes_asm_exit @ imm = #0x2ea + and r5, r10, #0x7 + bic r9, r10, #0x7 + add.w r8, r0, r9, lsl #1 + add.w r7, r8, #0x190 + cmp r5, #0x0 + beq.w keccak_f1600_x4_state_extract_bytes_asm_pre_main @ imm = #0x112 + vldrw.u32 q0, [r8], #16 + vldrw.u32 q1, [r7], #16 + vrev32.16 q2, q0 + vrev32.16 q3, q1 + vsli.32 q0, q0, #0x8 + vsli.16 q0, q0, #0x4 + vsli.8 q0, q0, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x4 + vshr.u8 q4, q0, #0x5 + vsli.8 q0, q4, #0x6 + vsli.32 q1, q1, #0x8 + vsli.16 q1, q1, #0x4 + vsli.8 q1, q1, #0x1 + vshr.u8 q4, q1, #0x3 + vsli.8 q1, q4, #0x4 + vshr.u8 q4, q1, #0x5 + vsli.8 q1, q4, #0x6 + mov.w r0, #0x55 + vdup.8 q4, r0 + vand q0, q0, q4 + vand q1, q1, q4 + vshl.i32 q1, q1, #0x1 + vorr q0, q0, q1 + vsli.32 q2, q2, #0x8 + vsli.16 q2, q2, #0x4 + vsli.8 q2, q2, #0x1 + vshr.u8 q1, q2, #0x3 + vsli.8 q2, q1, #0x4 + vshr.u8 q1, q2, #0x5 + vsli.8 q2, q1, #0x6 + vsli.32 q3, q3, #0x8 + vsli.16 q3, q3, #0x4 + vsli.8 q3, q3, #0x1 + vshr.u8 q1, q3, #0x3 + vsli.8 q3, q1, #0x4 + vshr.u8 q1, q3, #0x5 + vsli.8 q3, q1, #0x6 + vand q1, q2, q4 + vand q3, q3, q4 + vshl.i32 q3, q3, #0x1 + vorr q1, q1, q3 + vrev64.32 q2, q0 + vrev64.32 q3, q1 + movw r0, #0xf0f + vmsr p0, r0 + vpsel q0, q0, q3 + vpsel q1, q2, q1 + vmov.f64 d4, d1 + vmov.f64 d6, d3 + rsb.w lr, r5, #0x8 
+ cmp r6, lr + it ls + movls lr, r6 + vctp.8 lr + vmrs r11, p0 + lsl.w r11, r11, r5 + vmsr p0, r11 + subs r1, r1, r5 + subs r2, r2, r5 + subs r3, r3, r5 + subs r4, r4, r5 + vpstttt + vstrbt.8 q0, [r1], #4 + vstrbt.8 q1, [r2], #4 + vstrbt.8 q2, [r3], #4 + vstrbt.8 q3, [r4], #4 + subs.w r6, r6, lr + cmp r6, #0x0 + beq.w keccak_f1600_x4_state_extract_bytes_asm_exit @ imm = #0x1cc + vmov q7[2], q7[0], r1, r3 + vmov q7[3], q7[1], r2, r4 + b keccak_f1600_x4_state_extract_bytes_asm_main_body @ imm = #0xe + +keccak_f1600_x4_state_extract_bytes_asm_pre_main: + vmov q7[2], q7[0], r1, r3 + vmov q7[3], q7[1], r2, r4 + mov.w r12, #0x4 + vsub.i32 q7, q7, r12 + +keccak_f1600_x4_state_extract_bytes_asm_main_body: + lsr.w lr, r6, #0x3 + wls lr, lr, keccak_f1600_x4_state_extract_bytes_asm_main_loop_end @ imm = #0xb4 + +keccak_f1600_x4_state_extract_bytes_asm_main_loop_start: + vldrw.u32 q0, [r8], #16 + vldrw.u32 q1, [r7], #16 + vrev32.16 q2, q0 + vrev32.16 q3, q1 + vsli.32 q0, q0, #0x8 + vsli.16 q0, q0, #0x4 + vsli.8 q0, q0, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x4 + vshr.u8 q4, q0, #0x5 + vsli.8 q0, q4, #0x6 + vsli.32 q1, q1, #0x8 + vsli.16 q1, q1, #0x4 + vsli.8 q1, q1, #0x1 + vshr.u8 q4, q1, #0x3 + vsli.8 q1, q4, #0x4 + vshr.u8 q4, q1, #0x5 + vsli.8 q1, q4, #0x6 + mov.w r0, #0x55 + vdup.8 q4, r0 + vand q0, q0, q4 + vand q1, q1, q4 + vshl.i32 q1, q1, #0x1 + vorr q0, q0, q1 + vsli.32 q2, q2, #0x8 + vsli.16 q2, q2, #0x4 + vsli.8 q2, q2, #0x1 + vshr.u8 q1, q2, #0x3 + vsli.8 q2, q1, #0x4 + vshr.u8 q1, q2, #0x5 + vsli.8 q2, q1, #0x6 + vsli.32 q3, q3, #0x8 + vsli.16 q3, q3, #0x4 + vsli.8 q3, q3, #0x1 + vshr.u8 q1, q3, #0x3 + vsli.8 q3, q1, #0x4 + vshr.u8 q1, q3, #0x5 + vsli.8 q3, q1, #0x6 + vand q1, q2, q4 + vand q3, q3, q4 + vshl.i32 q3, q3, #0x1 + vorr q1, q1, q3 + vstrw.32 q0, [q7, #4]! + vstrw.32 q1, [q7, #4]! 
+ le lr, keccak_f1600_x4_state_extract_bytes_asm_main_loop_start @ imm = #-0xb4 + +keccak_f1600_x4_state_extract_bytes_asm_main_loop_end: + ands r6, r6, #0x7 + beq keccak_f1600_x4_state_extract_bytes_asm_exit @ imm = #0xee + mov.w r12, #0x4 + vadd.i32 q7, q7, r12 + vmov r1, r3, q7[2], q7[0] + vmov r2, r4, q7[3], q7[1] + vldrw.u32 q0, [r8], #16 + vldrw.u32 q1, [r7], #16 + vrev32.16 q2, q0 + vrev32.16 q3, q1 + vsli.32 q0, q0, #0x8 + vsli.16 q0, q0, #0x4 + vsli.8 q0, q0, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x4 + vshr.u8 q4, q0, #0x5 + vsli.8 q0, q4, #0x6 + vsli.32 q1, q1, #0x8 + vsli.16 q1, q1, #0x4 + vsli.8 q1, q1, #0x1 + vshr.u8 q4, q1, #0x3 + vsli.8 q1, q4, #0x4 + vshr.u8 q4, q1, #0x5 + vsli.8 q1, q4, #0x6 + mov.w r0, #0x55 + vdup.8 q4, r0 + vand q0, q0, q4 + vand q1, q1, q4 + vshl.i32 q1, q1, #0x1 + vorr q0, q0, q1 + vsli.32 q2, q2, #0x8 + vsli.16 q2, q2, #0x4 + vsli.8 q2, q2, #0x1 + vshr.u8 q1, q2, #0x3 + vsli.8 q2, q1, #0x4 + vshr.u8 q1, q2, #0x5 + vsli.8 q2, q1, #0x6 + vsli.32 q3, q3, #0x8 + vsli.16 q3, q3, #0x4 + vsli.8 q3, q3, #0x1 + vshr.u8 q1, q3, #0x3 + vsli.8 q3, q1, #0x4 + vshr.u8 q1, q3, #0x5 + vsli.8 q3, q1, #0x6 + vand q1, q2, q4 + vand q3, q3, q4 + vshl.i32 q3, q3, #0x1 + vorr q1, q1, q3 + vrev64.32 q2, q0 + vrev64.32 q3, q1 + movw r0, #0xf0f + vmsr p0, r0 + vpsel q0, q0, q3 + vpsel q1, q2, q1 + vmov.f64 d4, d1 + vmov.f64 d6, d3 + vctp.8 r6 + vpstttt + vstrbt.8 q0, [r1], #4 + vstrbt.8 q1, [r2], #4 + vstrbt.8 q2, [r3], #4 + vstrbt.8 q3, [r4], #4 + +keccak_f1600_x4_state_extract_bytes_asm_exit: + vpop {d8, d9, d10, d11, d12, d13, d14, d15} + pop.w {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + +MLD_ASM_FN_SIZE(keccak_f1600_x4_state_extract_bytes_asm) + +#endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S b/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S new file mode 100644 index 000000000..00907fb4d --- /dev/null +++ 
b/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S @@ -0,0 +1,314 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2026 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// --------------------------------------------------------------------------- +// Overview +// --------------------------------------------------------------------------- +// MVE/Helium implementation of KeccakF1600x4_StateXORBytes. +// +// void KeccakF1600x4_StateXORBytes(state, d0, d1, d2, d3, offset, length) +// +// Reads 'length' plain bytes from each of four input buffers (d0..d3), +// splits every byte into its even and odd bits (bit-interleaving), and +// XORs the result into the Keccak state starting at byte 'offset'. +// +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. +// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). 
+// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... +// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd +// +// --------------------------------------------------------------------------- +// Three-phase structure +// --------------------------------------------------------------------------- +// Prologue -- if offset is not 8-byte aligned, absorb +// min(length, 8-(offset%8)) bytes via predicated byte loads. +// Main -- process full 8-byte groups via word-level gather loads, +// bit-interleave, then VEOR into even/odd state halves. +// Tail -- absorb remaining <8 bytes via predicated byte loads. + +#include "../../../../common.h" +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/fips202/armv81m/src/state_xor_bytes_x4_mve.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.thumb +.syntax unified + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_state_xor_bytes_asm) + + push.w {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + vpush {d8, d9, d10, d11, d12, d13, d14, d15} + ldr r4, [sp, #0x68] + ldr.w r10, [sp, #0x6c] + ldr r6, [sp, #0x70] + cmp r6, #0x0 + beq.w keccak_f1600_x4_state_xor_bytes_asm_exit @ imm = #0x34c + and r5, r10, #0x7 + bic r9, r10, #0x7 + add.w r8, r0, r9, lsl #1 + add.w r7, r8, #0x190 + cmp r5, #0x0 + beq.w keccak_f1600_x4_state_xor_bytes_asm_pre_main @ imm = #0x132 + subs r1, r1, r5 + subs r2, r2, r5 + subs r3, r3, r5 + subs r4, r4, r5 + rsb.w lr, r5, #0x8 + cmp r6, lr + it ls + movls lr, r6 + subs.w r6, r6, lr + vctp.8 lr + vmrs r11, p0 + lsl.w r11, r11, r5 + vmsr p0, r11 + vpstttt + vldrbt.u8 q0, [r1], #4 + vldrbt.u8 q1, [r2], #4 + vldrbt.u8 q2, [r3], #4 + vldrbt.u8 q3, [r4], #4 + vmov.f64 d1, d4 + vmov.f64 d3, d6 + vrev64.32 q2, q0 + vrev64.32 q3, q1 + movw r0, #0xf0f + vmsr p0, r0 + vpsel q0, q0, q3 + vpsel q1, q2, q1 + vmov q2, q0 + vmov q3, q1 + vshr.u8 q4, q0, #0x2 + vsli.8 q0, q4, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x2 + vshr.u8 q4, q0, #0x4 + vsli.8 q0, q4, #0x3 + vshr.u16 q4, q0, #0x8 + vsli.8 q0, q4, #0x4 + vshr.u32 q4, q0, #0x10 + vsli.16 q0, q4, #0x8 + vshr.u8 q4, q3, #0x2 + vsli.8 q3, q4, #0x1 + vshr.u8 q4, q3, #0x3 + vsli.8 q3, q4, #0x2 + vshr.u8 q4, q3, #0x4 + vsli.8 q3, q4, #0x3 + vshr.u16 q4, q3, #0x8 + vsli.8 q3, q4, #0x4 + vshr.u32 q4, q3, #0x10 + vsli.16 q3, q4, #0x8 + vsli.32 q0, q3, #0x10 + vshl.i8 q4, q2, #0x2 + vsri.8 q2, q4, #0x1 + vshl.i8 q4, q2, #0x3 + vsri.8 q2, q4, #0x2 + vshl.i8 q4, q2, #0x4 + vsri.8 q2, q4, #0x3 + vshl.i16 q4, q2, #0x8 + vsri.8 q2, q4, #0x4 + vshl.i32 q4, q2, #0x10 + vsri.16 q2, q4, #0x8 + vshl.i8 q4, q1, #0x2 + vsri.8 q1, q4, #0x1 + vshl.i8 q4, q1, #0x3 + vsri.8 q1, q4, #0x2 + vshl.i8 q4, q1, #0x4 + vsri.8 q1, q4, #0x3 + vshl.i16 q4, q1, #0x8 + vsri.8 q1, q4, #0x4 + vshl.i32 q4, q1, 
#0x10 + vsri.16 q1, q4, #0x8 + vsri.32 q1, q2, #0x10 + vldrw.u32 q4, [r8] + vldrw.u32 q5, [r7] + veor q4, q4, q0 + veor q5, q5, q1 + vstrw.32 q4, [r8], #16 + vstrw.32 q5, [r7], #16 + vmov q7[2], q7[0], r1, r3 + vmov q7[3], q7[1], r2, r4 + cmp r6, #0x0 + beq.w keccak_f1600_x4_state_xor_bytes_asm_exit @ imm = #0x206 + b keccak_f1600_x4_state_xor_bytes_asm_main_body @ imm = #0xe + +keccak_f1600_x4_state_xor_bytes_asm_pre_main: + vmov q7[2], q7[0], r1, r3 + vmov q7[3], q7[1], r2, r4 + mov.w r0, #0x4 + vsub.i32 q7, q7, r0 + +keccak_f1600_x4_state_xor_bytes_asm_main_body: + lsr.w lr, r6, #0x3 + wls lr, lr, keccak_f1600_x4_state_xor_bytes_asm_main_loop_end @ imm = #0xd4 + +keccak_f1600_x4_state_xor_bytes_asm_main_loop_start: + vldrw.u32 q0, [q7, #4]! + vldrw.u32 q1, [q7, #4]! + vmov q2, q0 + vmov q3, q1 + vshr.u8 q4, q0, #0x2 + vsli.8 q0, q4, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x2 + vshr.u8 q4, q0, #0x4 + vsli.8 q0, q4, #0x3 + vshr.u16 q4, q0, #0x8 + vsli.8 q0, q4, #0x4 + vshr.u32 q4, q0, #0x10 + vsli.16 q0, q4, #0x8 + vshr.u8 q4, q3, #0x2 + vsli.8 q3, q4, #0x1 + vshr.u8 q4, q3, #0x3 + vsli.8 q3, q4, #0x2 + vshr.u8 q4, q3, #0x4 + vsli.8 q3, q4, #0x3 + vshr.u16 q4, q3, #0x8 + vsli.8 q3, q4, #0x4 + vshr.u32 q4, q3, #0x10 + vsli.16 q3, q4, #0x8 + vsli.32 q0, q3, #0x10 + vshl.i8 q4, q2, #0x2 + vsri.8 q2, q4, #0x1 + vshl.i8 q4, q2, #0x3 + vsri.8 q2, q4, #0x2 + vshl.i8 q4, q2, #0x4 + vsri.8 q2, q4, #0x3 + vshl.i16 q4, q2, #0x8 + vsri.8 q2, q4, #0x4 + vshl.i32 q4, q2, #0x10 + vsri.16 q2, q4, #0x8 + vshl.i8 q4, q1, #0x2 + vsri.8 q1, q4, #0x1 + vshl.i8 q4, q1, #0x3 + vsri.8 q1, q4, #0x2 + vshl.i8 q4, q1, #0x4 + vsri.8 q1, q4, #0x3 + vshl.i16 q4, q1, #0x8 + vsri.8 q1, q4, #0x4 + vshl.i32 q4, q1, #0x10 + vsri.16 q1, q4, #0x8 + vsri.32 q1, q2, #0x10 + vldrw.u32 q4, [r8] + vldrw.u32 q5, [r7] + veor q4, q4, q0 + veor q5, q5, q1 + vstrw.32 q4, [r8], #16 + vstrw.32 q5, [r7], #16 + le lr, keccak_f1600_x4_state_xor_bytes_asm_main_loop_start @ imm = #-0xd4 + 
+keccak_f1600_x4_state_xor_bytes_asm_main_loop_end: + ands r6, r6, #0x7 + beq.w keccak_f1600_x4_state_xor_bytes_asm_exit @ imm = #0x110 + mov.w r0, #0x4 + vadd.i32 q7, q7, r0 + vmov r1, r3, q7[2], q7[0] + vmov r2, r4, q7[3], q7[1] + vctp.8 r6 + vpstttt + vldrbt.u8 q0, [r1] + vldrbt.u8 q1, [r2] + vldrbt.u8 q2, [r3] + vldrbt.u8 q3, [r4] + vmov.f64 d1, d4 + vmov.f64 d3, d6 + vrev64.32 q2, q0 + vrev64.32 q3, q1 + movw r0, #0xf0f + vmsr p0, r0 + vpsel q0, q0, q3 + vpsel q1, q2, q1 + vmov q2, q0 + vmov q3, q1 + vshr.u8 q4, q0, #0x2 + vsli.8 q0, q4, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x2 + vshr.u8 q4, q0, #0x4 + vsli.8 q0, q4, #0x3 + vshr.u16 q4, q0, #0x8 + vsli.8 q0, q4, #0x4 + vshr.u32 q4, q0, #0x10 + vsli.16 q0, q4, #0x8 + vshr.u8 q4, q3, #0x2 + vsli.8 q3, q4, #0x1 + vshr.u8 q4, q3, #0x3 + vsli.8 q3, q4, #0x2 + vshr.u8 q4, q3, #0x4 + vsli.8 q3, q4, #0x3 + vshr.u16 q4, q3, #0x8 + vsli.8 q3, q4, #0x4 + vshr.u32 q4, q3, #0x10 + vsli.16 q3, q4, #0x8 + vsli.32 q0, q3, #0x10 + vshl.i8 q4, q2, #0x2 + vsri.8 q2, q4, #0x1 + vshl.i8 q4, q2, #0x3 + vsri.8 q2, q4, #0x2 + vshl.i8 q4, q2, #0x4 + vsri.8 q2, q4, #0x3 + vshl.i16 q4, q2, #0x8 + vsri.8 q2, q4, #0x4 + vshl.i32 q4, q2, #0x10 + vsri.16 q2, q4, #0x8 + vshl.i8 q4, q1, #0x2 + vsri.8 q1, q4, #0x1 + vshl.i8 q4, q1, #0x3 + vsri.8 q1, q4, #0x2 + vshl.i8 q4, q1, #0x4 + vsri.8 q1, q4, #0x3 + vshl.i16 q4, q1, #0x8 + vsri.8 q1, q4, #0x4 + vshl.i32 q4, q1, #0x10 + vsri.16 q1, q4, #0x8 + vsri.32 q1, q2, #0x10 + vldrw.u32 q4, [r8] + vldrw.u32 q5, [r7] + veor q4, q4, q0 + veor q5, q5, q1 + vstrw.32 q4, [r8], #16 + vstrw.32 q5, [r7], #16 + +keccak_f1600_x4_state_xor_bytes_asm_exit: + vpop {d8, d9, d10, d11, d12, d13, d14, d15} + pop.w {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + nop + +MLD_ASM_FN_SIZE(keccak_f1600_x4_state_xor_bytes_asm) + +#endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/proofs/cbmc/dummy_backend_fips202_x4.h b/proofs/cbmc/dummy_backend_fips202_x4.h index 
d84df09c5..b42cbf34d 100644 --- a/proofs/cbmc/dummy_backend_fips202_x4.h +++ b/proofs/cbmc/dummy_backend_fips202_x4.h @@ -8,6 +8,8 @@ #define MLD_USE_FIPS202_X4_NATIVE +#define MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE +#define MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE #include "../../mldsa/src/fips202/native/api.h" diff --git a/proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile b/proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile new file mode 100644 index 000000000..770458826 --- /dev/null +++ b/proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile @@ -0,0 +1,37 @@ +# Copyright (c) The mldsa-native project authors +# Copyright (c) The mlkem-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +include ../Makefile_params.common + +HARNESS_ENTRY = harness +HARNESS_FILE = keccakf1600x4_extract_bytes_native_harness + +# This should be a unique identifier for this proof, and will appear on the +# Litani dashboard. It can be human-readable and contain spaces if you wish. +PROOF_UID = keccakf1600x4_extract_bytes_native + +DEFINES += -DMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 -DMLD_CONFIG_FIPS202_BACKEND_FILE="\"dummy_backend_fips202_x4.h\"" +INCLUDES += + +REMOVE_FUNCTION_BODY += +UNWINDSET += + +PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c +PROJECT_SOURCES += $(SRCDIR)/mldsa/src/fips202/keccakf1600.c + +CHECK_FUNCTION_CONTRACTS=mld_keccakf1600x4_extract_bytes +USE_FUNCTION_CONTRACTS=mld_keccakf1600_extract_bytes_x4_native +APPLY_LOOP_CONTRACTS=on +USE_DYNAMIC_FRAMES=1 + +# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead +EXTERNAL_SAT_SOLVER= +CBMCFLAGS=--bitwuzla + +FUNCTION_NAME = keccakf1600x4_extract_bytes_native + +# This function is large enough to need... 
// Copyright (c) The mlkem-native project authors
// Copyright (c) The mldsa-native project authors
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT-0

/*
 * CBMC proof harness for mld_keccakf1600x4_extract_bytes.
 *
 * Every input is deliberately left nondeterministic: CBMC explores all
 * values admitted by the function contract, which is checked against the
 * implementation (CHECK_FUNCTION_CONTRACTS in the accompanying Makefile),
 * while the native backend hook is assumed via its own contract
 * (USE_FUNCTION_CONTRACTS=mld_keccakf1600_extract_bytes_x4_native).
 */

#include <stdint.h>
/* NOTE(review): the original `#include` directive lost its argument
 * (angle-bracketed text stripped in transit). The project header that
 * declares mld_keccakf1600x4_extract_bytes -- presumably
 * fips202/keccakf1600.h, found via the Makefile's include paths -- must
 * be restored here; <stdint.h> alone only covers uint64_t. TODO confirm
 * the exact header name against the sibling x1 harnesses. */

void harness(void)
{
  uint64_t *state;                              /* nondet state pointer */
  unsigned char *data0, *data1, *data2, *data3; /* four output lanes */
  unsigned offset;                              /* nondet byte offset */
  unsigned length;                              /* nondet byte count */
  mld_keccakf1600x4_extract_bytes(state, data0, data1, data2, data3, offset,
                                  length);
}
+PROOF_UID = keccakf1600x4_xor_bytes_native + +DEFINES += -DMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 -DMLD_CONFIG_FIPS202_BACKEND_FILE="\"dummy_backend_fips202_x4.h\"" +INCLUDES += + +REMOVE_FUNCTION_BODY += +UNWINDSET += + +PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c +PROJECT_SOURCES += $(SRCDIR)/mldsa/src/fips202/keccakf1600.c + +CHECK_FUNCTION_CONTRACTS=mld_keccakf1600x4_xor_bytes +USE_FUNCTION_CONTRACTS=mld_keccakf1600_xor_bytes_x4_native +APPLY_LOOP_CONTRACTS=on +USE_DYNAMIC_FRAMES=1 + +# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead +EXTERNAL_SAT_SOLVER= +CBMCFLAGS=--bitwuzla + +FUNCTION_NAME = keccakf1600x4_xor_bytes_native + +# This function is large enough to need... +CBMC_OBJECT_BITS = 8 + +include ../Makefile.common diff --git a/proofs/cbmc/keccakf1600x4_xor_bytes_native/keccakf1600x4_xor_bytes_native_harness.c b/proofs/cbmc/keccakf1600x4_xor_bytes_native/keccakf1600x4_xor_bytes_native_harness.c new file mode 100644 index 000000000..605f127a0 --- /dev/null +++ b/proofs/cbmc/keccakf1600x4_xor_bytes_native/keccakf1600x4_xor_bytes_native_harness.c @@ -0,0 +1,16 @@ +// Copyright (c) The mlkem-native project authors +// Copyright (c) The mldsa-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// Copyright (c) The mlkem-native project authors
// Copyright (c) The mldsa-native project authors
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT-0

/*
 * CBMC proof harness for mld_keccakf1600x4_xor_bytes.
 *
 * Every input is deliberately left nondeterministic: CBMC explores all
 * values admitted by the function contract, which is checked against the
 * implementation (CHECK_FUNCTION_CONTRACTS in the accompanying Makefile),
 * while the native backend hook is assumed via its own contract
 * (USE_FUNCTION_CONTRACTS=mld_keccakf1600_xor_bytes_x4_native).
 */

#include <stdint.h>
/* NOTE(review): the original `#include` directive lost its argument
 * (angle-bracketed text stripped in transit). The project header that
 * declares mld_keccakf1600x4_xor_bytes -- presumably
 * fips202/keccakf1600.h, found via the Makefile's include paths -- must
 * be restored here; <stdint.h> alone only covers uint64_t. TODO confirm
 * the exact header name against the sibling x1 harnesses. */

void harness(void)
{
  uint64_t *state;                 /* nondet state pointer */
  const unsigned char *data0, *data1, *data2, *data3; /* four input lanes */
  unsigned offset;                 /* nondet byte offset */
  unsigned length;                 /* nondet byte count */
  mld_keccakf1600x4_xor_bytes(state, data0, data1, data2, data3, offset,
                              length);
}