Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions dev/fips202/armv81m/mve.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,18 @@

/* Part of backend API */
#define MLD_USE_FIPS202_X4_NATIVE
#define MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE
#define MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE
/* Guard for assembly file */
#define MLD_FIPS202_ARMV81M_NEED_X4

#if !defined(__ASSEMBLER__)
#include "../api.h"

/*
* Native x4 permutation
* State is kept in bit-interleaved format.
*/
#define mld_keccak_f1600_x4_native_impl \
MLD_NAMESPACE(keccak_f1600_x4_native_impl)
int mld_keccak_f1600_x4_native_impl(uint64_t *state);
Expand All @@ -27,6 +33,48 @@ static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
return mld_keccak_f1600_x4_native_impl(state);
}

/*
 * Native x4 XOR bytes (with on-the-fly bit interleaving)
 *
 * The C name below is an alias for the symbol
 * MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm); the routine itself is
 * defined outside this header.
 *
 * Arguments:
 *   state        - batched (x4) Keccak state, updated in place.
 *                  NOTE(review): presumably the same bit-interleaved layout
 *                  that the x4 permutation operates on -- confirm.
 *   data0..data3 - four read-only input byte buffers.
 *                  NOTE(review): presumably one per Keccak instance -- confirm.
 *   offset       - NOTE(review): presumably the byte position within each
 *                  instance's state at which XORing starts -- confirm against
 *                  the implementation.
 *   length       - NOTE(review): presumably the number of bytes consumed from
 *                  each of data0..data3 -- confirm against the implementation.
 */
#define mld_keccak_f1600_x4_state_xor_bytes \
MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm)
void mld_keccak_f1600_x4_state_xor_bytes(void *state, const uint8_t *data0,
const uint8_t *data1,
const uint8_t *data2,
const uint8_t *data3, unsigned offset,
unsigned length);

/*
 * Backend entry point for the x4 XOR-bytes operation.
 *
 * Hands every argument, unchanged and in the same order, to
 * mld_keccak_f1600_x4_state_xor_bytes(). That routine returns void, so this
 * wrapper has no failure to report and always yields MLD_NATIVE_FUNC_SUCCESS.
 */
MLD_MUST_CHECK_RETURN_VALUE
static MLD_INLINE int mld_keccakf1600_xor_bytes_x4_native(
    uint64_t *state,
    const uint8_t *data0,
    const uint8_t *data1,
    const uint8_t *data2,
    const uint8_t *data3,
    unsigned offset,
    unsigned length)
{
  mld_keccak_f1600_x4_state_xor_bytes(state,
                                      data0, data1, data2, data3,
                                      offset, length);

  return MLD_NATIVE_FUNC_SUCCESS;
}

/*
 * Native x4 extract bytes (with on-the-fly bit de-interleaving)
 *
 * The C name below is an alias for the symbol
 * MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm); the routine itself
 * is defined outside this header.
 *
 * Arguments:
 *   state        - batched (x4) Keccak state.
 *                  NOTE(review): presumably only read by this routine, even
 *                  though the pointer is not const-qualified -- confirm.
 *   data0..data3 - four writable output byte buffers.
 *                  NOTE(review): presumably one per Keccak instance -- confirm.
 *   offset       - NOTE(review): presumably the byte position within each
 *                  instance's state at which extraction starts -- confirm
 *                  against the implementation.
 *   length       - NOTE(review): presumably the number of bytes written to
 *                  each of data0..data3 -- confirm against the implementation.
 */
#define mld_keccak_f1600_x4_state_extract_bytes \
MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm)
void mld_keccak_f1600_x4_state_extract_bytes(void *state, uint8_t *data0,
uint8_t *data1, uint8_t *data2,
uint8_t *data3, unsigned offset,
unsigned length);

/*
 * Backend entry point for the x4 extract-bytes operation.
 *
 * Hands every argument, unchanged and in the same order, to
 * mld_keccak_f1600_x4_state_extract_bytes(). That routine returns void, so
 * this wrapper has no failure to report and always yields
 * MLD_NATIVE_FUNC_SUCCESS.
 */
MLD_MUST_CHECK_RETURN_VALUE
static MLD_INLINE int mld_keccakf1600_extract_bytes_x4_native(
    uint64_t *state,
    uint8_t *data0,
    uint8_t *data1,
    uint8_t *data2,
    uint8_t *data3,
    unsigned offset,
    unsigned length)
{
  mld_keccak_f1600_x4_state_extract_bytes(state,
                                          data0, data1, data2, data3,
                                          offset, length);

  return MLD_NATIVE_FUNC_SUCCESS;
}

#endif /* !__ASSEMBLER__ */

#endif /* !MLD_DEV_FIPS202_ARMV81M_MVE_H */
16 changes: 16 additions & 0 deletions dev/fips202/armv81m/src/fips202_native_armv81m.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,20 @@ extern const uint32_t mld_keccakf1600_round_constants[48];
void mld_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100],
const uint32_t rc[48]);

/*
 * XOR bytes into the batched (x4) Keccak state.
 *
 * Only the prototype lives here; the routine is defined elsewhere (the `_asm`
 * suffix suggests an assembly implementation).
 *
 * Arguments:
 *   state        - batched (x4) Keccak state, updated in place.
 *   data0..data3 - four read-only input byte buffers.
 *                  NOTE(review): presumably one per Keccak instance -- confirm.
 *   offset       - NOTE(review): presumably the starting byte position within
 *                  each instance's state -- confirm against the implementation.
 *   length       - NOTE(review): presumably the number of bytes consumed from
 *                  each input buffer -- confirm against the implementation.
 */
#define mld_keccak_f1600_x4_state_xor_bytes_asm \
  MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm)
void mld_keccak_f1600_x4_state_xor_bytes_asm(void *state, const uint8_t *data0,
                                             const uint8_t *data1,
                                             const uint8_t *data2,
                                             const uint8_t *data3,
                                             unsigned offset,
                                             unsigned length);

/*
 * Extract bytes from the batched (x4) Keccak state.
 *
 * Only the prototype lives here; the routine is defined elsewhere (the `_asm`
 * suffix suggests an assembly implementation).
 *
 * Arguments:
 *   state        - batched (x4) Keccak state.
 *   data0..data3 - four writable output byte buffers.
 *                  NOTE(review): presumably one per Keccak instance -- confirm.
 *   offset       - NOTE(review): presumably the starting byte position within
 *                  each instance's state -- confirm against the implementation.
 *   length       - NOTE(review): presumably the number of bytes written to
 *                  each output buffer -- confirm against the implementation.
 */
#define mld_keccak_f1600_x4_state_extract_bytes_asm \
MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm)
void mld_keccak_f1600_x4_state_extract_bytes_asm(void *state, uint8_t *data0,
uint8_t *data1, uint8_t *data2,
uint8_t *data3,
unsigned offset,
unsigned length);

#endif /* !MLD_DEV_FIPS202_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */
36 changes: 33 additions & 3 deletions dev/fips202/armv81m/src/keccak_f1600_x4_mve.S
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@

/*yaml
Name: keccak_f1600_x4_mve_asm
Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state
Description: Armv8.1-M MVE implementation of batched (x4) Keccak-f[1600] permutation using bit-interleaved state
Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc)
ABI:
r0:
type: buffer
size_bytes: 800
permissions: read/write
c_parameter: void *state
description: Four bit-interleaved Keccak states (low halves followed by high halves)
description: Bit-interleaved state for 4 Keccak instances (even halves followed by odd halves)
r1:
type: buffer
size_bytes: 800
Expand All @@ -33,6 +33,36 @@
description: register preservation (44) + SIMD registers (64) + temporary storage (128)
*/

// ---------------------------------------------------------------------------
// Bit-interleaving background
// ---------------------------------------------------------------------------
// Each 64-bit Keccak lane is stored as two 32-bit words:
// even half -- bits 0, 2, 4, ..., 62 of the lane
// odd half -- bits 1, 3, 5, ..., 63 of the lane
// This representation allows 64-bit lane rotations (used in the Keccak
// round function) to be implemented as pairs of 32-bit rotations.
//
// Batched (x4) processing:
// Four Keccak instances are processed as a batch. Their states are
// stored interleaved in a single 800-byte buffer: first the even
// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes).
// Within each 16-byte row, the four u32 words correspond to
// instances 0..3 of the same lane, enabling SIMD-parallel operations
// across all four instances.
//
// State memory layout (25 lanes x 4 instances x 2 halves):
// S[i][l]_even/odd = even/odd half of lane l, instance i (u32)
// Each row is 16 bytes (one Q-register).
// Offset Contents
// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even
// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even
// ...
// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even
// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd
// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd
// ...
// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd

#include "../../../../common.h"
#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
Expand Down Expand Up @@ -426,7 +456,7 @@ qA20_l .req q2
.endm

.text
.balign 8
.balign 4
.type MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm), %function
.global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm)
MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm)
Expand Down
105 changes: 5 additions & 100 deletions dev/fips202/armv81m/src/keccak_f1600_x4_mve.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,114 +12,19 @@

#include "fips202_native_armv81m.h"

/*
* TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
* TODO: Replace with optimized MVE assembly implementations
* (as a part of XORBytes and ExtractBytes)
*/

/*
 * Gather the even-indexed bits (0, 2, ..., 62) of `x` into a 32-bit word:
 * bit 2*k of the input ends up at bit k of the result.
 *
 * Each fold step ORs the value with a right-shifted copy of itself and masks
 * the result, doubling the width of the contiguous bit groups (1 -> 2 -> 4 ->
 * 8 -> 16 -> 32 bits) until all 32 selected bits sit in the low word.
 */
static uint32_t bitinterleave_even(uint64_t x)
{
  uint64_t acc = x & 0x5555555555555555ULL; /* keep even-indexed bits only */

  acc |= acc >> 1;
  acc &= 0x3333333333333333ULL;

  acc |= acc >> 2;
  acc &= 0x0f0f0f0f0f0f0f0fULL;

  acc |= acc >> 4;
  acc &= 0x00ff00ff00ff00ffULL;

  acc |= acc >> 8;
  acc &= 0x0000ffff0000ffffULL;

  acc |= acc >> 16;
  acc &= 0x00000000ffffffffULL;

  return (uint32_t)acc;
}

/*
 * Gather the odd-indexed bits (1, 3, ..., 63) of `x` into a 32-bit word:
 * bit 2*k+1 of the input ends up at bit k of the result.
 */
static uint32_t bitinterleave_odd(uint64_t x)
{
  /* A right shift by one moves every odd-indexed bit onto an even index. */
  const uint64_t shifted = x >> 1;

  return bitinterleave_even(shifted);
}

/*
 * Scatter the 32 bits of `x` onto the even bit positions of a 64-bit word:
 * bit k of the input ends up at bit 2*k of the result; all odd-indexed
 * result bits are zero.
 *
 * Each step ORs the value with a left-shifted copy of itself and masks the
 * result, halving the width of the contiguous bit groups (32 -> 16 -> 8 ->
 * 4 -> 2 -> 1 bits).
 */
static uint64_t spread_even(uint32_t x)
{
  uint64_t acc = x;

  acc |= acc << 16;
  acc &= 0x0000ffff0000ffffULL;

  acc |= acc << 8;
  acc &= 0x00ff00ff00ff00ffULL;

  acc |= acc << 4;
  acc &= 0x0f0f0f0f0f0f0f0fULL;

  acc |= acc << 2;
  acc &= 0x3333333333333333ULL;

  acc |= acc << 1;
  acc &= 0x5555555555555555ULL;

  return acc;
}

/*
 * Rebuild a 64-bit word from its two bit-interleaved halves: bit k of `even`
 * becomes bit 2*k of the result and bit k of `odd` becomes bit 2*k+1.
 */
static uint64_t bitdeinterleave(uint32_t even, uint32_t odd)
{
  const uint64_t even_bits = spread_even(even);
  const uint64_t odd_bits = spread_even(odd) << 1;

  return even_bits | odd_bits;
}

/*
* TEMPORARY: Naive C interleaving functions.
* These will be replaced with optimized MVE assembly implementations.
* Keccak-f1600 x4 permutation (on bit-interleaved state)
* State is expected to already be in bit-interleaved format.
*/
/*
 * Convert four separate 25-lane states into one batched, bit-interleaved
 * buffer.
 *
 * state_4x is written as 200 32-bit words: words 0..99 receive the even
 * halves and words 100..199 the odd halves. Within each half, word
 * 4 * lane + inst holds the half-lane of instance `inst` (state0..state3).
 */
static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0,
                             const uint64_t *state1, const uint64_t *state2,
                             const uint64_t *state3)
{
  const uint64_t *const src[4] = {state0, state1, state2, state3};
  uint32_t *const even_words = (uint32_t *)state_4x;
  uint32_t *const odd_words = even_words + 100;

  for (size_t lane = 0; lane < 25; lane++)
  {
    uint32_t *const even_row = even_words + 4 * lane;
    uint32_t *const odd_row = odd_words + 4 * lane;

    /* Even halves of this lane for instances 0..3 */
    for (size_t inst = 0; inst < 4; inst++)
    {
      even_row[inst] = bitinterleave_even(src[inst][lane]);
    }

    /* Odd halves of this lane for instances 0..3 */
    for (size_t inst = 0; inst < 4; inst++)
    {
      odd_row[inst] = bitinterleave_odd(src[inst][lane]);
    }
  }
}

/*
 * Convert one batched, bit-interleaved buffer back into four separate
 * 25-lane states.
 *
 * state_4x is read as 200 32-bit words: words 0..99 hold the even halves and
 * words 100..199 the odd halves. Within each half, word 4 * lane + inst
 * belongs to instance `inst`, whose recombined lane is stored to
 * state0..state3 respectively.
 */
static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0,
                               uint64_t *state1, uint64_t *state2,
                               uint64_t *state3)
{
  uint64_t *const dst[4] = {state0, state1, state2, state3};
  const uint32_t *const even_words = (const uint32_t *)state_4x;
  const uint32_t *const odd_words = even_words + 100;

  for (size_t lane = 0; lane < 25; lane++)
  {
    const uint32_t *const even_row = even_words + 4 * lane;
    const uint32_t *const odd_row = odd_words + 4 * lane;

    for (size_t inst = 0; inst < 4; inst++)
    {
      dst[inst][lane] = bitdeinterleave(even_row[inst], odd_row[inst]);
    }
  }
}

#define mld_keccak_f1600_x4_native_impl \
MLD_NAMESPACE(keccak_f1600_x4_native_impl)
int mld_keccak_f1600_x4_native_impl(uint64_t *state)
{
/*
* TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
* TODO: Replace with optimized MVE assembly implementations
* (as a part of XORBytes and ExtractBytes)
*/
MLD_ALIGN uint64_t state_4x[100];
MLD_ALIGN uint64_t state_4x_tmp[100];

/* Interleave the 4 states into bit-interleaved format */
interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]);

/* Run the permutation */
mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp,
MLD_ALIGN uint64_t state_tmp[100];
mld_keccak_f1600_x4_mve_asm(state, state_tmp,
mld_keccakf1600_round_constants);

/* Deinterleave back to 4 separate states */
deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]);

mld_zeroize(state_4x, sizeof(state_4x));
mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp));
mld_zeroize(state_tmp, sizeof(state_tmp));
return MLD_NATIVE_FUNC_SUCCESS;
}

Expand Down
Loading
Loading