pq-code-package
diff --git a/dev/fips202/armv81m/mve.h b/dev/fips202/armv81m/mve.h
Lines changed: 48 additions & 0 deletions
diff --git a/dev/fips202/armv81m/src/fips202_native_armv81m.h b/dev/fips202/armv81m/src/fips202_native_armv81m.h
Lines changed: 16 additions & 0 deletions
diff --git a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S
Lines changed: 33 additions & 3 deletions
diff --git a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c
Lines changed: 5 additions & 100 deletions
@@ -11,12 +11,18 @@
 
 /* Part of backend API */
 #define MLD_USE_FIPS202_X4_NATIVE
+#define MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE
+#define MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE
 /* Guard for assembly file */
 #define MLD_FIPS202_ARMV81M_NEED_X4
 
 #if !defined(__ASSEMBLER__)
 #include "../api.h"
 
+/*
+ * Native x4 permutation
+ * State is kept in bit-interleaved format.
+ */
 #define mld_keccak_f1600_x4_native_impl \
   MLD_NAMESPACE(keccak_f1600_x4_native_impl)
 int mld_keccak_f1600_x4_native_impl(uint64_t *state);
@@ -27,6 +33,48 @@ static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
   return mld_keccak_f1600_x4_native_impl(state);
 }
 
+/* Native x4 XOR bytes (with on-the-fly bit interleaving). The C name below
+ * is a macro for MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm); the
+ * inline wrapper forwards its arguments and always reports success. */
+#define mld_keccak_f1600_x4_state_xor_bytes \
+  MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm)
+void mld_keccak_f1600_x4_state_xor_bytes(void *state, const uint8_t *data0,
+                                         const uint8_t *data1,
+                                         const uint8_t *data2,
+                                         const uint8_t *data3, unsigned offset,
+                                         unsigned length);
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_keccakf1600_xor_bytes_x4_native(
+    uint64_t *state, const uint8_t *data0, const uint8_t *data1,
+    const uint8_t *data2, const uint8_t *data3, unsigned offset,
+    unsigned length)
+{
+  mld_keccak_f1600_x4_state_xor_bytes(state, data0, data1, data2, data3, offset,
+                                      length);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
+/* Native x4 extract bytes (with on-the-fly bit de-interleaving). The C name
+ * below is a macro for MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm);
+ * the inline wrapper forwards its arguments and always reports success. */
+#define mld_keccak_f1600_x4_state_extract_bytes \
+  MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm)
+void mld_keccak_f1600_x4_state_extract_bytes(void *state, uint8_t *data0,
+                                             uint8_t *data1, uint8_t *data2,
+                                             uint8_t *data3, unsigned offset,
+                                             unsigned length);
+
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_keccakf1600_extract_bytes_x4_native(
+    uint64_t *state, uint8_t *data0, uint8_t *data1, uint8_t *data2,
+    uint8_t *data3, unsigned offset, unsigned length)
+{
+  mld_keccak_f1600_x4_state_extract_bytes(state, data0, data1, data2, data3,
+                                          offset, length);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_DEV_FIPS202_ARMV81M_MVE_H */
@@ -17,4 +17,20 @@ extern const uint32_t mld_keccakf1600_round_constants[48];
 void mld_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100],
                                  const uint32_t rc[48]);
 
+#define mld_keccak_f1600_x4_state_xor_bytes_asm \
+  MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm)
+void mld_keccak_f1600_x4_state_xor_bytes_asm(void *state, const uint8_t *d0,
+                                             const uint8_t *d1,
+                                             const uint8_t *d2,
+                                             const uint8_t *d3, unsigned offset,
+                                             unsigned length);
+
+#define mld_keccak_f1600_x4_state_extract_bytes_asm \
+  MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm)
+void mld_keccak_f1600_x4_state_extract_bytes_asm(void *state, uint8_t *data0,
+                                                 uint8_t *data1, uint8_t *data2,
+                                                 uint8_t *data3,
+                                                 unsigned offset,
+                                                 unsigned length);
+
 #endif /* !MLD_DEV_FIPS202_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */
@@ -7,15 +7,15 @@
 
 /*yaml
   Name: keccak_f1600_x4_mve_asm
-  Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state
+  Description: Armv8.1-M MVE implementation of batched (x4) Keccak-f[1600] permutation using bit-interleaved state
   Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc)
   ABI:
     r0:
       type: buffer
       size_bytes: 800
       permissions: read/write
       c_parameter: void *state
-      description: Four bit-interleaved Keccak states (low halves followed by high halves)
+      description: Bit-interleaved state for 4 Keccak instances (even halves followed by odd halves)
     r1:
       type: buffer
       size_bytes: 800
@@ -33,6 +33,36 @@
     description: register preservation (44) + SIMD registers (64) + temporary storage (128)
 */
 
+// ---------------------------------------------------------------------------
+// Bit-interleaving background
+// ---------------------------------------------------------------------------
+// Each 64-bit Keccak lane is stored as two 32-bit words:
+//   even half -- bits 0, 2, 4, ..., 62 of the lane
+//   odd half  -- bits 1, 3, 5, ..., 63 of the lane
+// This representation allows 64-bit lane rotations (used in the Keccak
+// round function) to be implemented as pairs of 32-bit rotations.
+//
+// Batched (x4) processing:
+//   Four Keccak instances are processed as a batch.  Their states are
+//   stored interleaved in a single 800-byte buffer: first the even
+//   halves of all 25 lanes (400 bytes), then the odd halves (400 bytes).
+//   Within each 16-byte row, the four u32 words correspond to
+//   instances 0..3 of the same lane, enabling SIMD-parallel operations
+//   across all four instances.
+//
+// State memory layout (25 lanes x 4 instances x 2 halves):
+//   S[i][l]_even/odd = even/odd half of lane l, instance i  (u32)
+//   Each row is 16 bytes (one Q-register).
+//   Offset  Contents
+//     0     S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even
+//    16     S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even
+//    ...
+//   384     S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even
+//   400     S[0][ 0]_odd,  S[1][ 0]_odd,  S[2][ 0]_odd,  S[3][ 0]_odd
+//   416     S[0][ 1]_odd,  S[1][ 1]_odd,  S[2][ 1]_odd,  S[3][ 1]_odd
+//    ...
+//   784     S[0][24]_odd,  S[1][24]_odd,  S[2][24]_odd,  S[3][24]_odd
+
 #include "../../../../common.h"
 #if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \
     !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
@@ -426,7 +456,7 @@ qA20_l .req q2
 .endm
 
 .text
-.balign 8
+.balign 4
 .type MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm), %function
 .global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm)
 MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm)
 
@@ -12,114 +12,19 @@
 
 #include "fips202_native_armv81m.h"
 
-/*
- * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
- * TODO: Replace with optimized MVE assembly implementations
- * (as a part of XORBytes and ExtractBytes)
- */
-
-/* Extract even-indexed bits from 64-bit value into lower 32 bits */
-static uint32_t bitinterleave_even(uint64_t x)
-{
-  uint64_t t;
-  t = x & 0x5555555555555555ULL;
-  t = (t | (t >> 1)) & 0x3333333333333333ULL;
-  t = (t | (t >> 2)) & 0x0f0f0f0f0f0f0f0fULL;
-  t = (t | (t >> 4)) & 0x00ff00ff00ff00ffULL;
-  t = (t | (t >> 8)) & 0x0000ffff0000ffffULL;
-  t = (t | (t >> 16)) & 0x00000000ffffffffULL;
-  return (uint32_t)t;
-}
-
-/* Extract odd-indexed bits from 64-bit value into lower 32 bits */
-static uint32_t bitinterleave_odd(uint64_t x)
-{
-  return bitinterleave_even(x >> 1);
-}
-
-/* Spread 32-bit value across even bit positions of 64-bit result */
-static uint64_t spread_even(uint32_t x)
-{
-  uint64_t t = x;
-  t = (t | (t << 16)) & 0x0000ffff0000ffffULL;
-  t = (t | (t << 8)) & 0x00ff00ff00ff00ffULL;
-  t = (t | (t << 4)) & 0x0f0f0f0f0f0f0f0fULL;
-  t = (t | (t << 2)) & 0x3333333333333333ULL;
-  t = (t | (t << 1)) & 0x5555555555555555ULL;
-  return t;
-}
-
-/* Combine even and odd 32-bit halves into interleaved 64-bit value */
-static uint64_t bitdeinterleave(uint32_t even, uint32_t odd)
-{
-  return spread_even(even) | (spread_even(odd) << 1);
-}
 
 /*
- * TEMPORARY: Naive C interleaving functions.
- * These will be replaced with optimized MVE assembly implementations.
+ * Keccak-f1600 x4 permutation on a state that is already bit-interleaved.
+ * The scratch buffer handed to the assembly routine is zeroized afterwards.
  */
-static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0,
-                             const uint64_t *state1, const uint64_t *state2,
-                             const uint64_t *state3)
-{
-  uint32_t *state_4xl = (uint32_t *)state_4x;
-  uint32_t *state_4xh = (uint32_t *)state_4x + 100;
-
-  for (size_t i = 0; i < 25; i++)
-  {
-    state_4xl[i * 4 + 0] = bitinterleave_even(state0[i]);
-    state_4xl[i * 4 + 1] = bitinterleave_even(state1[i]);
-    state_4xl[i * 4 + 2] = bitinterleave_even(state2[i]);
-    state_4xl[i * 4 + 3] = bitinterleave_even(state3[i]);
-
-    state_4xh[i * 4 + 0] = bitinterleave_odd(state0[i]);
-    state_4xh[i * 4 + 1] = bitinterleave_odd(state1[i]);
-    state_4xh[i * 4 + 2] = bitinterleave_odd(state2[i]);
-    state_4xh[i * 4 + 3] = bitinterleave_odd(state3[i]);
-  }
-}
-
-static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0,
-                               uint64_t *state1, uint64_t *state2,
-                               uint64_t *state3)
-{
-  uint32_t *state_4xl = (uint32_t *)state_4x;
-  uint32_t *state_4xh = (uint32_t *)state_4x + 100;
-
-  for (size_t i = 0; i < 25; i++)
-  {
-    state0[i] = bitdeinterleave(state_4xl[i * 4 + 0], state_4xh[i * 4 + 0]);
-    state1[i] = bitdeinterleave(state_4xl[i * 4 + 1], state_4xh[i * 4 + 1]);
-    state2[i] = bitdeinterleave(state_4xl[i * 4 + 2], state_4xh[i * 4 + 2]);
-    state3[i] = bitdeinterleave(state_4xl[i * 4 + 3], state_4xh[i * 4 + 3]);
-  }
-}
-
 #define mld_keccak_f1600_x4_native_impl \
   MLD_NAMESPACE(keccak_f1600_x4_native_impl)
 int mld_keccak_f1600_x4_native_impl(uint64_t *state)
 {
-  /*
-   * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
-   * TODO: Replace with optimized MVE assembly implementations
-   * (as a part of XORBytes and ExtractBytes)
-   */
-  MLD_ALIGN uint64_t state_4x[100];
-  MLD_ALIGN uint64_t state_4x_tmp[100];
-
-  /* Interleave the 4 states into bit-interleaved format */
-  interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]);
-
-  /* Run the permutation */
-  mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp,
+  MLD_ALIGN uint64_t state_tmp[100];
+  mld_keccak_f1600_x4_mve_asm(state, state_tmp,
                               mld_keccakf1600_round_constants);
-
-  /* Deinterleave back to 4 separate states */
-  deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]);
-
-  mld_zeroize(state_4x, sizeof(state_4x));
-  mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp));
+  mld_zeroize(state_tmp, sizeof(state_tmp));
   return MLD_NATIVE_FUNC_SUCCESS;
 }