From a61e9cd681103b2353d3b2a69c7b60a206c0a696 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 19 Feb 2026 10:28:51 +0100 Subject: [PATCH 1/3] Unit Tests: Extend Keccak x4 test to cover xor_bytes and extract_bytes Replace test_keccakf1600x4_permute with test_keccakf1600x4_xor_permute_extract that tests the full x4 Keccak flow (xor_bytes, permute, extract_bytes) against the x1 C reference implementation. Testing through the public interface rather than comparing internal state directly allows verifying backends that use custom state representations (e.g., bit-interleaved) without requiring state conversion functions. The test uses random offsets and lengths for both xor_bytes and extract_bytes, and verifies each of the 4 lanes independently against the x1 reference. Also reduce functional test iterations for M55 baremetal platform. Signed-off-by: Matthias J. Kannwischer --- test/baremetal/platform/m55-an547/platform.mk | 3 +- test/src/test_unit.c | 62 +++++++++++++++---- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/test/baremetal/platform/m55-an547/platform.mk b/test/baremetal/platform/m55-an547/platform.mk index 6e9918123..6109d1354 100644 --- a/test/baremetal/platform/m55-an547/platform.mk +++ b/test/baremetal/platform/m55-an547/platform.mk @@ -10,8 +10,9 @@ CC=gcc # Use PMU cycle counting by default CYCLES ?= PMU -# Reduce iterations for benchmarking +# Reduce iterations for benchmarking and functional tests CFLAGS += -DMLD_BENCHMARK_NTESTS=3 -DMLD_BENCHMARK_NITERATIONS=2 -DMLD_BENCHMARK_NWARMUP=3 +CFLAGS += -DNTESTS_FUNC=100 # Explicitly include experimental Armv8.1-M + MVE backend # Remove this once backend is finalized and enabled by default. 
diff --git a/test/src/test_unit.c b/test/src/test_unit.c index 6acb1a81b..998fdc3e7 100644 --- a/test/src/test_unit.c +++ b/test/src/test_unit.c @@ -47,7 +47,7 @@ void mld_polyvecl_pointwise_acc_montgomery_c(mld_poly *w, const mld_polyvecl *u, void mld_polyz_unpack_c(mld_poly *r, const uint8_t a[MLDSA_POLYZ_PACKEDBYTES]); void mld_keccakf1600_permute_c(uint64_t *state); -#if defined(MLD_USE_FIPS202_X1_NATIVE) || defined(MLD_USE_FIPS202_X4_NATIVE) +#if defined(MLD_USE_FIPS202_X1_NATIVE) static void print_u64_array(const char *label, const uint64_t *array, size_t len) { @@ -95,8 +95,7 @@ static int compare_u64_arrays(const uint64_t *a, const uint64_t *b, } return 1; } - -#endif /* MLD_USE_FIPS202_X1_NATIVE || MLD_USE_FIPS202_X4_NATIVE */ +#endif /* MLD_USE_FIPS202_X1_NATIVE */ #if defined(MLD_USE_NATIVE_NTT) || defined(MLD_USE_NATIVE_INTT) || \ defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) || \ @@ -665,32 +664,69 @@ static int test_keccakf1600_permute(void) } #endif /* MLD_USE_FIPS202_X1_NATIVE */ +/* + * Test that x4 Keccak (xor_bytes, permute, extract_bytes) produces + * the same results as the x1 C reference. 
+ */ #ifdef MLD_USE_FIPS202_X4_NATIVE -static int test_keccakf1600x4_permute(void) +#define MAX_RATE 136 + +static int test_keccakf1600x4_xor_permute_extract(void) { uint64_t state_x4[MLD_KECCAK_LANES * MLD_KECCAK_WAY]; - uint64_t state_x1[MLD_KECCAK_LANES * MLD_KECCAK_WAY]; + uint64_t state_x1[MLD_KECCAK_LANES]; + unsigned char output_x4[MLD_KECCAK_WAY][MAX_RATE]; + unsigned char output_x1[MAX_RATE]; + unsigned char input[MLD_KECCAK_WAY][MAX_RATE]; + uint8_t xor_offset, xor_length, ext_offset, ext_length; int i, j; for (i = 0; i < NUM_RANDOM_TESTS; i++) { - randombytes((uint8_t *)state_x4, sizeof(state_x4)); - memcpy(state_x1, state_x4, sizeof(state_x4)); + /* Generate random offset and length for xor_bytes */ + randombytes(&xor_offset, 1); + randombytes(&xor_length, 1); + xor_offset = xor_offset % MAX_RATE; + xor_length = (uint8_t)(1 + (xor_length % (MAX_RATE - xor_offset))); + + /* Generate random offset and length for extract_bytes */ + randombytes(&ext_offset, 1); + randombytes(&ext_length, 1); + ext_offset = ext_offset % MAX_RATE; + ext_length = (uint8_t)(1 + (ext_length % (MAX_RATE - ext_offset))); + + /* Generate different random input for each lane */ + for (j = 0; j < MLD_KECCAK_WAY; j++) + { + randombytes(input[j], xor_length); + } + /* Run x4 implementation */ + memset(state_x4, 0, sizeof(state_x4)); + mld_keccakf1600x4_xor_bytes(state_x4, input[0], input[1], input[2], + input[3], xor_offset, xor_length); mld_keccakf1600x4_permute(state_x4); + mld_keccakf1600x4_extract_bytes(state_x4, output_x4[0], output_x4[1], + output_x4[2], output_x4[3], ext_offset, + ext_length); + /* Compare each lane against x1 C reference */ for (j = 0; j < MLD_KECCAK_WAY; j++) { - mld_keccakf1600_permute_c(state_x1 + j * MLD_KECCAK_LANES); - } + memset(state_x1, 0, sizeof(state_x1)); + mld_keccakf1600_xor_bytes(state_x1, input[j], xor_offset, xor_length); + mld_keccakf1600_permute_c(state_x1); + mld_keccakf1600_extract_bytes(state_x1, output_x1, ext_offset, + ext_length); - 
CHECK(compare_u64_arrays(state_x4, state_x1, - MLD_KECCAK_LANES * MLD_KECCAK_WAY, - "keccakf1600x4_permute")); + CHECK(memcmp(output_x4[j], output_x1, ext_length) == 0); + } } return 0; } + +#undef MAX_RATE #endif /* MLD_USE_FIPS202_X4_NATIVE */ static int test_backend_units(void) @@ -745,7 +781,7 @@ static int test_backend_units(void) #endif #ifdef MLD_USE_FIPS202_X4_NATIVE - CHECK(test_keccakf1600x4_permute() == 0); + CHECK(test_keccakf1600x4_xor_permute_extract() == 0); #endif return 0; From 1601cb6ae694f004be9de0e8a1cc2b9caccd4c5d Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 19 Feb 2026 10:28:56 +0100 Subject: [PATCH 2/3] FIPS202: Add native x4 XOR/extract bytes interface Extend the FIPS202 native backend API to support implementing XORBytes and ExtractBytes steps in native code. This is essential for backends using custom state representations (e.g., bit-interleaved state), where these functions handle conversion to/from the internal format on-the-fly. In such cases, they also account for a significant amount of processing time. New flags: - MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE: Backend provides native XOR bytes - MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE: Backend provides native extract bytes When set, backends provide native implementations for: - mld_keccakf1600_xor_bytes_x4_native: XOR input data into state - mld_keccakf1600_extract_bytes_x4_native: Extract output from state Signed-off-by: Matthias J. Kannwischer --- mldsa/src/fips202/native/api.h | 60 ++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/mldsa/src/fips202/native/api.h b/mldsa/src/fips202/native/api.h index b30135aa6..3b5b61afc 100644 --- a/mldsa/src/fips202/native/api.h +++ b/mldsa/src/fips202/native/api.h @@ -66,4 +66,64 @@ __contract__( ); #endif /* MLD_USE_FIPS202_X4_NATIVE */ +/* + * Native x4 XOR bytes and extract bytes interface. 
+ * + * These functions allow backends to provide optimized implementations for + * XORing input data into the state and extracting output data from the state. + * This is particularly useful for backends that use a different internal state + * representation (e.g., bit-interleaved), as conversion can happen during + * XOR/extract rather than before/after each permutation. + * + * NOTE: We assume that the custom representation of the zero state is the + * all-zero state. + * + * MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE: Backend provides native XOR bytes + * MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE: Backend provides native extract + * bytes + */ + +#if defined(MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE) +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_xor_bytes_x4_native( + uint64_t *state, const unsigned char *data0, const unsigned char *data1, + const unsigned char *data2, const unsigned char *data3, unsigned offset, + unsigned length) +__contract__( + requires(0 <= offset && offset <= 25 * sizeof(uint64_t) && + 0 <= length && length <= 25 * sizeof(uint64_t) - offset) + requires(memory_no_alias(state, sizeof(uint64_t) * 25 * 4)) + requires(memory_no_alias(data0, length)) + requires((data0 == data1 && + data0 == data2 && + data0 == data3) || + (memory_no_alias(data1, length) && + memory_no_alias(data2, length) && + memory_no_alias(data3, length))) + assigns(memory_slice(state, sizeof(uint64_t) * 25 * 4)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged_u64(state, 25 * 4))); +#endif /* MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE */ + +#if defined(MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE) +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_extract_bytes_x4_native( + uint64_t *state, unsigned char *data0, unsigned char *data1, + unsigned char *data2, unsigned char *data3, unsigned offset, + unsigned length) +__contract__( + requires(0 <= 
offset && offset <= 25 * sizeof(uint64_t) && + 0 <= length && length <= 25 * sizeof(uint64_t) - offset) + requires(memory_no_alias(state, sizeof(uint64_t) * 25 * 4)) + requires(memory_no_alias(data0, length)) + requires(memory_no_alias(data1, length)) + requires(memory_no_alias(data2, length)) + requires(memory_no_alias(data3, length)) + assigns(memory_slice(data0, length)) + assigns(memory_slice(data1, length)) + assigns(memory_slice(data2, length)) + assigns(memory_slice(data3, length)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS)); +#endif /* MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE */ + #endif /* !MLD_FIPS202_NATIVE_API_H */ From 3823f1f35f690136e46e0691035cf27917f324a9 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 19 Feb 2026 10:29:04 +0100 Subject: [PATCH 3/3] Armv8.1-M: Add native Keccak x4 XORBytes and ExtractBytes Add native MVE implementations of XORBytes and ExtractBytes that perform bit-interleaving/deinterleaving on-the-fly, enabling use of a bit-interleaved state representation without temporary conversions in the permutation. This improves performance by: - Reducing the number of bit-interleaving operations - Accelerating bit-interleaving using MVE vector instructions The backend uses bit-interleaved state representation where each 64-bit lane is split into even and odd 32-bit halves for efficient 32-bit MVE processing. Co-Authored-By: Brendan Moran Signed-off-by: Matthias J. 
Kannwischer --- dev/fips202/armv81m/mve.h | 48 +++ .../armv81m/src/fips202_native_armv81m.h | 16 + dev/fips202/armv81m/src/keccak_f1600_x4_mve.S | 36 +- dev/fips202/armv81m/src/keccak_f1600_x4_mve.c | 105 +----- .../armv81m/src/state_extract_bytes_x4_mve.S | 333 +++++++++++++++++ .../armv81m/src/state_xor_bytes_x4_mve.S | 349 ++++++++++++++++++ mldsa/mldsa_native.c | 6 + mldsa/mldsa_native_asm.S | 8 + mldsa/src/fips202/keccakf1600.c | 58 ++- mldsa/src/fips202/native/armv81m/mve.h | 48 +++ .../armv81m/src/fips202_native_armv81m.h | 16 + .../native/armv81m/src/keccak_f1600_x4_mve.S | 34 +- .../native/armv81m/src/keccak_f1600_x4_mve.c | 105 +----- .../armv81m/src/state_extract_bytes_x4_mve.S | 290 +++++++++++++++ .../armv81m/src/state_xor_bytes_x4_mve.S | 314 ++++++++++++++++ proofs/cbmc/dummy_backend_fips202_x4.h | 2 + .../Makefile | 37 ++ ...ccakf1600x4_extract_bytes_native_harness.c | 16 + .../keccakf1600x4_xor_bytes_native/Makefile | 37 ++ .../keccakf1600x4_xor_bytes_native_harness.c | 16 + scripts/check-magic | 4 +- scripts/simpasm | 2 +- 22 files changed, 1663 insertions(+), 217 deletions(-) create mode 100644 dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S create mode 100644 dev/fips202/armv81m/src/state_xor_bytes_x4_mve.S create mode 100644 mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S create mode 100644 mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S create mode 100644 proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile create mode 100644 proofs/cbmc/keccakf1600x4_extract_bytes_native/keccakf1600x4_extract_bytes_native_harness.c create mode 100644 proofs/cbmc/keccakf1600x4_xor_bytes_native/Makefile create mode 100644 proofs/cbmc/keccakf1600x4_xor_bytes_native/keccakf1600x4_xor_bytes_native_harness.c diff --git a/dev/fips202/armv81m/mve.h b/dev/fips202/armv81m/mve.h index a2bf121de..03ff5798c 100644 --- a/dev/fips202/armv81m/mve.h +++ b/dev/fips202/armv81m/mve.h @@ -11,12 +11,18 @@ /* Part of backend API */ #define 
MLD_USE_FIPS202_X4_NATIVE +#define MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE +#define MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE /* Guard for assembly file */ #define MLD_FIPS202_ARMV81M_NEED_X4 #if !defined(__ASSEMBLER__) #include "../api.h" +/* + * Native x4 permutation + * State is kept in bit-interleaved format. + */ #define mld_keccak_f1600_x4_native_impl \ MLD_NAMESPACE(keccak_f1600_x4_native_impl) int mld_keccak_f1600_x4_native_impl(uint64_t *state); @@ -27,6 +33,48 @@ static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state) return mld_keccak_f1600_x4_native_impl(state); } +/* + * Native x4 XOR bytes (with on-the-fly bit interleaving) + */ +#define mld_keccak_f1600_x4_state_xor_bytes \ + MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +void mld_keccak_f1600_x4_state_xor_bytes(void *state, const uint8_t *data0, + const uint8_t *data1, + const uint8_t *data2, + const uint8_t *data3, unsigned offset, + unsigned length); + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_xor_bytes_x4_native( + uint64_t *state, const uint8_t *data0, const uint8_t *data1, + const uint8_t *data2, const uint8_t *data3, unsigned offset, + unsigned length) +{ + mld_keccak_f1600_x4_state_xor_bytes(state, data0, data1, data2, data3, offset, + length); + return MLD_NATIVE_FUNC_SUCCESS; +} + +/* + * Native x4 extract bytes (with on-the-fly bit de-interleaving) + */ +#define mld_keccak_f1600_x4_state_extract_bytes \ + MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +void mld_keccak_f1600_x4_state_extract_bytes(void *state, uint8_t *data0, + uint8_t *data1, uint8_t *data2, + uint8_t *data3, unsigned offset, + unsigned length); + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_extract_bytes_x4_native( + uint64_t *state, uint8_t *data0, uint8_t *data1, uint8_t *data2, + uint8_t *data3, unsigned offset, unsigned length) +{ + mld_keccak_f1600_x4_state_extract_bytes(state, data0, data1, data2, data3, + offset, length); + return 
MLD_NATIVE_FUNC_SUCCESS; +} + #endif /* !__ASSEMBLER__ */ #endif /* !MLD_DEV_FIPS202_ARMV81M_MVE_H */ diff --git a/dev/fips202/armv81m/src/fips202_native_armv81m.h b/dev/fips202/armv81m/src/fips202_native_armv81m.h index ac8d9e29d..4ed3b90f0 100644 --- a/dev/fips202/armv81m/src/fips202_native_armv81m.h +++ b/dev/fips202/armv81m/src/fips202_native_armv81m.h @@ -17,4 +17,20 @@ extern const uint32_t mld_keccakf1600_round_constants[48]; void mld_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100], const uint32_t rc[48]); +#define mld_keccak_f1600_x4_state_xor_bytes_asm \ + MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +void mld_keccak_f1600_x4_state_xor_bytes_asm(void *state, const uint8_t *d0, + const uint8_t *d1, + const uint8_t *d2, + const uint8_t *d3, unsigned offset, + unsigned length); + +#define mld_keccak_f1600_x4_state_extract_bytes_asm \ + MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +void mld_keccak_f1600_x4_state_extract_bytes_asm(void *state, uint8_t *data0, + uint8_t *data1, uint8_t *data2, + uint8_t *data3, + unsigned offset, + unsigned length); + #endif /* !MLD_DEV_FIPS202_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */ diff --git a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S index 50c06595f..96033ac0e 100644 --- a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S +++ b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.S @@ -7,7 +7,7 @@ /*yaml Name: keccak_f1600_x4_mve_asm - Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state + Description: Armv8.1-M MVE implementation of batched (x4) Keccak-f[1600] permutation using bit-interleaved state Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc) ABI: r0: @@ -15,7 +15,7 @@ size_bytes: 800 permissions: read/write c_parameter: void *state - description: Four bit-interleaved Keccak states (low halves followed by high halves) + description: 
Bit-interleaved state for 4 Keccak instances (even halves followed by odd halves) r1: type: buffer size_bytes: 800 @@ -33,6 +33,36 @@ description: register preservation (44) + SIMD registers (64) + temporary storage (128) */ +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. +// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). +// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... 
+// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd + #include "../../../../common.h" #if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) @@ -426,7 +456,7 @@ qA20_l .req q2 .endm .text -.balign 8 +.balign 4 .type MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm), %function .global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm) MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm) diff --git a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c index e74fd8913..e26f1bf22 100644 --- a/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c +++ b/dev/fips202/armv81m/src/keccak_f1600_x4_mve.c @@ -12,114 +12,19 @@ #include "fips202_native_armv81m.h" -/* - * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. - * TODO: Replace with optimized MVE assembly implementations - * (as a part of XORBytes and ExtractBytes) - */ - -/* Extract even-indexed bits from 64-bit value into lower 32 bits */ -static uint32_t bitinterleave_even(uint64_t x) -{ - uint64_t t; - t = x & 0x5555555555555555ULL; - t = (t | (t >> 1)) & 0x3333333333333333ULL; - t = (t | (t >> 2)) & 0x0f0f0f0f0f0f0f0fULL; - t = (t | (t >> 4)) & 0x00ff00ff00ff00ffULL; - t = (t | (t >> 8)) & 0x0000ffff0000ffffULL; - t = (t | (t >> 16)) & 0x00000000ffffffffULL; - return (uint32_t)t; -} - -/* Extract odd-indexed bits from 64-bit value into lower 32 bits */ -static uint32_t bitinterleave_odd(uint64_t x) -{ - return bitinterleave_even(x >> 1); -} - -/* Spread 32-bit value across even bit positions of 64-bit result */ -static uint64_t spread_even(uint32_t x) -{ - uint64_t t = x; - t = (t | (t << 16)) & 0x0000ffff0000ffffULL; - t = (t | (t << 8)) & 0x00ff00ff00ff00ffULL; - t = (t | (t << 4)) & 0x0f0f0f0f0f0f0f0fULL; - t = (t | (t << 2)) & 0x3333333333333333ULL; - t = (t | (t << 1)) & 0x5555555555555555ULL; - return t; -} - -/* Combine even and odd 32-bit halves into interleaved 64-bit value */ -static uint64_t bitdeinterleave(uint32_t even, uint32_t 
odd) -{ - return spread_even(even) | (spread_even(odd) << 1); -} /* - * TEMPORARY: Naive C interleaving functions. - * These will be replaced with optimized MVE assembly implementations. + * Keccak-f1600 x4 permutation (on bit-interleaved state) + * State is expected to already be in bit-interleaved format. */ -static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0, - const uint64_t *state1, const uint64_t *state2, - const uint64_t *state3) -{ - uint32_t *state_4xl = (uint32_t *)state_4x; - uint32_t *state_4xh = (uint32_t *)state_4x + 100; - - for (size_t i = 0; i < 25; i++) - { - state_4xl[i * 4 + 0] = bitinterleave_even(state0[i]); - state_4xl[i * 4 + 1] = bitinterleave_even(state1[i]); - state_4xl[i * 4 + 2] = bitinterleave_even(state2[i]); - state_4xl[i * 4 + 3] = bitinterleave_even(state3[i]); - - state_4xh[i * 4 + 0] = bitinterleave_odd(state0[i]); - state_4xh[i * 4 + 1] = bitinterleave_odd(state1[i]); - state_4xh[i * 4 + 2] = bitinterleave_odd(state2[i]); - state_4xh[i * 4 + 3] = bitinterleave_odd(state3[i]); - } -} - -static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0, - uint64_t *state1, uint64_t *state2, - uint64_t *state3) -{ - uint32_t *state_4xl = (uint32_t *)state_4x; - uint32_t *state_4xh = (uint32_t *)state_4x + 100; - - for (size_t i = 0; i < 25; i++) - { - state0[i] = bitdeinterleave(state_4xl[i * 4 + 0], state_4xh[i * 4 + 0]); - state1[i] = bitdeinterleave(state_4xl[i * 4 + 1], state_4xh[i * 4 + 1]); - state2[i] = bitdeinterleave(state_4xl[i * 4 + 2], state_4xh[i * 4 + 2]); - state3[i] = bitdeinterleave(state_4xl[i * 4 + 3], state_4xh[i * 4 + 3]); - } -} - #define mld_keccak_f1600_x4_native_impl \ MLD_NAMESPACE(keccak_f1600_x4_native_impl) int mld_keccak_f1600_x4_native_impl(uint64_t *state) { - /* - * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. 
- * TODO: Replace with optimized MVE assembly implementations - * (as a part of XORBytes and ExtractBytes) - */ - MLD_ALIGN uint64_t state_4x[100]; - MLD_ALIGN uint64_t state_4x_tmp[100]; - - /* Interleave the 4 states into bit-interleaved format */ - interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); - - /* Run the permutation */ - mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp, + MLD_ALIGN uint64_t state_tmp[100]; + mld_keccak_f1600_x4_mve_asm(state, state_tmp, mld_keccakf1600_round_constants); - - /* Deinterleave back to 4 separate states */ - deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); - - mld_zeroize(state_4x, sizeof(state_4x)); - mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp)); + mld_zeroize(state_tmp, sizeof(state_tmp)); return MLD_NATIVE_FUNC_SUCCESS; } diff --git a/dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S b/dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S new file mode 100644 index 000000000..f45f168ca --- /dev/null +++ b/dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S @@ -0,0 +1,333 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2026 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// --------------------------------------------------------------------------- +// Overview +// --------------------------------------------------------------------------- +// MVE/Helium implementation of KeccakF1600x4_StateExtractBytes +// (inverse of state_xor_bytes_x4_mve.S). +// +// void KeccakF1600x4_StateExtractBytes(state, d0, d1, d2, d3, offset, length) +// +// Reads 'length' bytes from the bit-interleaved Keccak state starting at +// byte 'offset', recombines the even and odd halves of each lane back +// into plain bytes, and writes them to four output buffers (d0..d3). 
+// +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. +// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). +// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... +// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd +// +// --------------------------------------------------------------------------- +// Three-phase structure +// --------------------------------------------------------------------------- +// Prologue -- if offset is not 8-byte aligned, extract +// min(length, 8-(offset%8)) bytes via predicated byte stores. +// Main -- process full 8-byte groups: load even/odd lane pair, +// de-interleave, scatter-store to output buffers. +// Tail -- extract remaining <8 bytes via predicated byte stores. 
+ +#include "../../../../common.h" +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.thumb +.syntax unified +.text + +// --------------------------------------------------------------------------- +// deinterleave_even: inverse of the even-bit compaction. Inflates the even +// half stored in \e back into byte positions. +// Inputs: \e (compacted even bits) +// Outputs: \e (even bits expanded to byte positions, odd bits are garbage) +// Clobbers: \tmp +// --------------------------------------------------------------------------- +.macro deinterleave_even e, tmp + // Inflate e +------+-----------+-----------+-----------+ + // Single-element annotation follows. e: | X | X | l[0],l[1] | l[2],l[3] | + vsli.u32 \e, \e, #8 // e: | X | l[0],l[1] | l[2],l[3] | l[2],l[3] | + vsli.u16 \e, \e, #4 // e: | l[0] | l[1],l[1] | l[3],l[2] | l[3],l[3] | + // +------+-----------+-----------+-----------+ + // Now, expand the lower bits e: | XXXX3210 | tmp: | XXXXXXXX | + vsli.u8 \e, \e, #1 // e: | XXX32100 | tmp: | XXXXXXXX | (start re-packing low bits) + vshr.u8 \tmp, \e, #3 // e: | XXX32100 | tmp: | XXXXXX32 | + vsli.u8 \e, \tmp, #4 // e: | XX322100 | tmp: | XXXXXX32 | (assemble nibbles) + vshr.u8 \tmp, \e, #5 // e: | XX322100 | tmp: | XXXXXXX3 | + vsli.u8 \e, \tmp, #6 // e: | X3322100 | tmp: | XXXXXXX3 | (finalize byte compaction) +.endm + +// --------------------------------------------------------------------------- +// from_bit_interleaving_4x: reconstruct byte vectors from bit-interleaved +// even/odd halves. 
+// Inputs: \qe = even half, \qo = odd half +// (per 32b lane: lo16 = bytes 0..3, hi16 = bytes 4..7) +// Outputs: \qe = [d0l, d1l, d2l, d3l] (low 4 bytes per instance) +// \qo = [d0h, d1h, d2h, d3h] (high 4 bytes per instance) +// Clobbers: \rtmp, \qt0, \qt1, \qt2 +// --------------------------------------------------------------------------- +.macro from_bit_interleaving_4x qe, qo, qt0, qt1, qt2, rtmp + // +------+------+------+------+------+------+------+------+ + // qe: | E0l | E0u | E1l | E1u | E2l | E2u | E3l | E3u | + // qo: | O0l | O0u | O1l | O1u | O2l | O2u | O3l | O3u | + // +------+------+------+------+------+------+------+------+ + // Clone and byte-swap to get upper halves into position + vrev32.u16 \qt0, \qe + vrev32.u16 \qt1, \qo + // De-interleave lower evens / lower odds + deinterleave_even \qe, \qt2 + deinterleave_even \qo, \qt2 + // qe and qo now hold valid even-position bits but garbage in odd positions. + // Build mask 0x55..55 (01010101b) to isolate even-bit positions, then + // shift the odd half left by 1 and OR to reconstruct the original bytes. + mov \rtmp, #0x55 + vdup.u8 \qt2, \rtmp + vand.u32 \qe, \qe, \qt2 + vand.u32 \qo, \qo, \qt2 + vshl.u32 \qo, \qo, #1 + vorr \qe, \qe, \qo // qe = low bytes reconstructed + // De-interleave upper evens / upper odds + deinterleave_even \qt0, \qo + deinterleave_even \qt1, \qo + vand.u32 \qo, \qt0, \qt2 // reuse mask still in qt2 + vand.u32 \qt1, \qt1, \qt2 + vshl.u32 \qt1, \qt1, #1 + vorr \qo, \qo, \qt1 // qo = high bytes reconstructed +.endm + +// --------------------------------------------------------------------------- +// transpose_lanes_to_streams: rearrange two lane-ordered vectors into four +// per-instance vectors (inverse of transpose_streams_to_lanes). +// q0 = [d0l, d1l, d2l, d3l] -> q0 = [d0l, d0h, ?, ?] +// q1 = [d0h, d1h, d2h, d3h] -> q1 = [d1l, d1h, ?, ?] +// q2 = [d2l, d2h, ?, ?] +// q3 = [d3l, d3h, ?, ?] 
+// Clobbers: q2, q3, p0, r0 +// +// Vectors: || q0 || q1 || q2 || q3 || +// Elements: || d0l | d1l | d2l | d3l || d0h | d1h | d2h | d3h || || || +// --------------------------------------------------------------------------- +.macro transpose_lanes_to_streams + vrev64.u32 q2, q0 // || d0l | d1l | d2l | d3l || d0h | d1h | d2h | d3h || d1l | d0l | d3l | d2l || || + vrev64.u32 q3, q1 // || d0l | d1l | d2l | d3l || d0h | d1h | d2h | d3h || d1l | d0l | d3l | d2l || d1h | d0h | d3h | d2h || + mov r0, #0x0F0F + vmsr p0, r0 + vpsel q0, q0, q3 // || d0l | d0h | d2l | d2h || d0h | d1h | d2h | d3h || d1l | d0l | d3l | d2l || d1h | d0h | d3h | d2h || + vpsel q1, q2, q1 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d1l | d0l | d3l | d2l || d1h | d0h | d3h | d2h || + vmov d4, d1 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d2l | d2h | d3l | d2l || d1h | d0h | d3h | d2h || + vmov d6, d3 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d2l | d2h | d3l | d2l || d3l | d3h | d3h | d2h || +.endm + + +// --------------------------------------------------------------------------- +// void keccak_f1600_x4_state_extract_bytes_asm(void *state, +// unsigned char *data0, +// unsigned char *data1, +// unsigned char *data2, +// unsigned char *data3, +// unsigned offset, +// unsigned length) +// +// AAPCS: r0=state, r1=d0, r2=d1, r3=d2, stack: d3, offset, length +// --------------------------------------------------------------------------- +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +.type MLD_ASM_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm), %function +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_state_extract_bytes_asm) + .equ stack_offset, ((12-4+2)*4+(15-8+1)*8) + push {r4-r12, lr} + vpush {d8-d15} + + state .req r0 + dp0 .req r1 + dp1 .req r2 + dp2 .req r3 + dp3 .req r4 + off .req r5 + length .req r6 + rSO .req r7 + rSE .req r8 + lane_offset_bytes .req r9 + off_full .req r10 + mask .req r11 + tmp .req r12 + nB .req lr + + qP .req 
q7 + qd0 .req q0 + qd1 .req q1 + qd2 .req q2 + qd3 .req q3 + + ldr dp3, [sp, #stack_offset+0] + ldr off_full, [sp, #stack_offset+4] + ldr length, [sp, #stack_offset+8] + + cmp length, #0 + beq keccak_f1600_x4_state_extract_bytes_asm_exit + + and off, off_full, #7 + bic lane_offset_bytes, off_full, #7 + + add rSE, state, lane_offset_bytes, lsl #1 + add rSO, rSE, #400 + + // ----------------------------------------------------------------------- + // PROLOGUE: extract min(len, 8-offset%8) bytes from the unaligned lane + // ----------------------------------------------------------------------- + cmp off, #0 + beq keccak_f1600_x4_state_extract_bytes_asm_pre_main + + // Load even/odd halves of one lane from state (post-increment rSE/rSO by 16) + vldrw.u32 qd0, [rSE], #16 + vldrw.u32 qd1, [rSO], #16 + + // De-interleave (clobbers r0, q2, q3, q4) + from_bit_interleaving_4x q0, q1, q2, q3, q4, r0 + + // Transpose from per-lane to per-instance layout + transpose_lanes_to_streams + + // nB = min(length, 8 - off) + rsb nB, off, #8 + cmp length, nB + it ls + movls nB, length + + // Build predicate: nB active bytes shifted left by 'off' + vctp.8 nB + vmrs mask, p0 + lsl mask, mask, off + vmsr p0, mask + + // Subtract offset from data pointers so predicate window aligns + subs dp0, dp0, off + subs dp1, dp1, off + subs dp2, dp2, off + subs dp3, dp3, off + + // Predicated byte stores (post-increment by 4) + vpstttt + vstrbt.u8 qd0, [dp0], #4 + vstrbt.u8 qd1, [dp1], #4 + vstrbt.u8 qd2, [dp2], #4 + vstrbt.u8 qd3, [dp3], #4 + + subs length, length, nB + cmp length, #0 + beq keccak_f1600_x4_state_extract_bytes_asm_exit + + // Build qP from updated scalar pointers + vmov qP[2], qP[0], dp0, dp2 + vmov qP[3], qP[1], dp1, dp3 + b keccak_f1600_x4_state_extract_bytes_asm_main_body + +keccak_f1600_x4_state_extract_bytes_asm_pre_main: + vmov qP[2], qP[0], dp0, dp2 + vmov qP[3], qP[1], dp1, dp3 + mov tmp, #4 + vsub.u32 qP, qP, tmp + + // 
----------------------------------------------------------------------- + // MAIN: process full 8-byte lanes + // ----------------------------------------------------------------------- +keccak_f1600_x4_state_extract_bytes_asm_main_body: + lsr lr, length, #3 + wls lr, lr, keccak_f1600_x4_state_extract_bytes_asm_main_loop_end +keccak_f1600_x4_state_extract_bytes_asm_main_loop_start: + vldrw.u32 qd0, [rSE], #16 + vldrw.u32 qd1, [rSO], #16 + + // De-interleave (clobbers r0, q2, q3, q4) + from_bit_interleaving_4x q0, q1, q2, q3, q4, r0 + + // Scatter-store 8 bytes per instance (two u32 stores with pre-increment writeback) + vstrw.u32 qd0, [qP, #4]! + vstrw.u32 qd1, [qP, #4]! + + le lr, keccak_f1600_x4_state_extract_bytes_asm_main_loop_start +keccak_f1600_x4_state_extract_bytes_asm_main_loop_end: + + // ----------------------------------------------------------------------- + // TAIL: extract <8 remaining bytes at lane offset 0 + // ----------------------------------------------------------------------- + ands length, length, #7 + beq keccak_f1600_x4_state_extract_bytes_asm_exit + + // Recover scalar pointers from qP + mov tmp, #4 + vadd.u32 qP, qP, tmp + vmov dp0, dp2, qP[2], qP[0] + vmov dp1, dp3, qP[3], qP[1] + + vldrw.u32 qd0, [rSE], #16 + vldrw.u32 qd1, [rSO], #16 + + // De-interleave (clobbers r0, q2, q3, q4) + from_bit_interleaving_4x q0, q1, q2, q3, q4, r0 + + // Transpose from per-lane to per-instance layout + transpose_lanes_to_streams + + // Predicated byte stores for remaining bytes + vctp.8 length + vpstttt + vstrbt.u8 qd0, [dp0], #4 + vstrbt.u8 qd1, [dp1], #4 + vstrbt.u8 qd2, [dp2], #4 + vstrbt.u8 qd3, [dp3], #4 + +keccak_f1600_x4_state_extract_bytes_asm_exit: + vpop {d8-d15} + pop {r4-r12, pc} + .unreq state + .unreq dp0 + .unreq dp1 + .unreq dp2 + .unreq dp3 + .unreq off + .unreq length + .unreq rSO + .unreq rSE + .unreq lane_offset_bytes + .unreq off_full + .unreq mask + .unreq tmp + .unreq nB + .unreq qP + .unreq qd0 + .unreq qd1 + .unreq qd2 + .unreq qd3 + +/* 
simpasm: footer-start */ +#endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/fips202/armv81m/src/state_xor_bytes_x4_mve.S b/dev/fips202/armv81m/src/state_xor_bytes_x4_mve.S new file mode 100644 index 000000000..cf343ee0e --- /dev/null +++ b/dev/fips202/armv81m/src/state_xor_bytes_x4_mve.S @@ -0,0 +1,349 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2026 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// --------------------------------------------------------------------------- +// Overview +// --------------------------------------------------------------------------- +// MVE/Helium implementation of KeccakF1600x4_StateXORBytes. +// +// void KeccakF1600x4_StateXORBytes(state, d0, d1, d2, d3, offset, length) +// +// Reads 'length' plain bytes from each of four input buffers (d0..d3), +// splits every byte into its even and odd bits (bit-interleaving), and +// XORs the result into the Keccak state starting at byte 'offset'. +// +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. 
+// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). +// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... +// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd +// +// --------------------------------------------------------------------------- +// Three-phase structure +// --------------------------------------------------------------------------- +// Prologue -- if offset is not 8-byte aligned, absorb +// min(length, 8-(offset%8)) bytes via predicated byte loads. +// Main -- process full 8-byte groups via word-level gather loads, +// bit-interleave, then VEOR into even/odd state halves. +// Tail -- absorb remaining <8 bytes via predicated byte loads. + +#include "../../../../common.h" +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.thumb +.syntax unified +.text + +// --------------------------------------------------------------------------- +// Interleave macros +// --------------------------------------------------------------------------- + +// interleave_odds: in-place SWAR bit permutation that compacts odd-numbered +// bits of each byte/halfword/word in \t toward the upper half, preparing the +// odd half of the bit-interleaved representation. 
+// Inputs: \t (data) +// Outputs: \t (odd bits compacted; per 32b lane: lo16=bytes 0..3, hi16=4..7) +// Clobbers: \u +.macro interleave_odds t, u + vshl.u8 \u, \t, #2 // u = t[5..0],00 + vsri.u8 \t, \u, #1 // t = t[7],u[6..0] => t = t[7],t[5..0],0 + vshl.u8 \u, \t, #3 // stage 2 across nibbles + vsri.u8 \t, \u, #2 + vshl.u8 \u, \t, #4 // stage 3 across bytes + vsri.u8 \t, \u, #3 + vshl.u16 \u, \t, #8 // widen within halfwords + vsri.u8 \t, \u, #4 + vshl.u32 \u, \t, #16 // widen within words + vsri.u16 \t, \u, #8 +.endm + +// interleave_evens: in-place SWAR bit permutation that compacts even-numbered +// bits of each byte/halfword/word in \t toward the lower half, preparing the +// even half of the bit-interleaved representation. +// Inputs: \t (data) +// Outputs: \t (even bits compacted; per 32b lane: lo16=bytes 0..3, hi16=4..7) +// Clobbers: \u +.macro interleave_evens t, u + vshr.u8 \u, \t, #2 // stage 1 within bytes + vsli.u8 \t, \u, #1 // t = ((t >> 1) & 0x7E7E7E7E) | (t & 0x01010101) + vshr.u8 \u, \t, #3 // stage 2 within nibbles + vsli.u8 \t, \u, #2 // t = ((t >> 2) & 0x1C1C1C1C) | (t & 0x03030303) + vshr.u8 \u, \t, #4 // stage 3 across bytes + vsli.u8 \t, \u, #3 // t = ((t >> 3) & 0x08080808) | (t & 0x07070707) + vshr.u16 \u, \t, #8 // widen within halfwords + vsli.u8 \t, \u, #4 // t = ((t >> 4) & 0x00F000F0) | (t & 0x000F000F) + vshr.u32 \u, \t, #16 // widen within words + vsli.u16 \t, \u, #8 // t = ((t >> 8) & 0x0000FF00) | (t & 0x000000FF) +.endm + +// --------------------------------------------------------------------------- +// to_bit_interleaving_4x: split \qe/\qo (low/high 4 bytes per instance) +// into even and odd halves packed as: +// \qe = even half (lo16: bytes 0..3, hi16: bytes 4..7) +// \qo = odd half (lo16: bytes 0..3, hi16: bytes 4..7) +// Inputs: \qe = [d0l, d1l, d2l, d3l], \qo = [d0h, d1h, d2h, d3h] +// Outputs: \qe (even half), \qo (odd half) +// Clobbers: \qt0, \qt1, \qt2 +// 
--------------------------------------------------------------------------- +.macro to_bit_interleaving_4x qe, qo, qt0, qt1, qt2 + vmov \qt0, \qe + vmov \qt1, \qo + interleave_evens \qe, \qt2 // pack even bits in qe (low 16: d?l, high 16: d?h) + interleave_evens \qt1, \qt2 // pack even bits from the high-half vector + vsli.32 \qe, \qt1, #16 // merge: qe = [even(lo16), even(hi16)] + interleave_odds \qt0, \qt2 // pack odd bits from original qe + interleave_odds \qo, \qt2 // pack odd bits from original qo + vsri.32 \qo, \qt0, #16 // merge: qo = [odd(lo16), odd(hi16)] +.endm + +// --------------------------------------------------------------------------- +// transpose_streams_to_lanes: rearrange four per-instance vectors (q0..q3, +// each holding 8 bytes in its low 64 bits) into two vectors: +// q0 = [d0l, d1l, d2l, d3l] (low 4 bytes of each instance) +// q1 = [d0h, d1h, d2h, d3h] (high 4 bytes of each instance) +// Clobbers: q2, q3, p0, r0 +// +// Vectors: || q0 || q1 || q2 || q3 || +// Elements: || d0l | d0h | 0 | 0 || d1l | d1h | 0 | 0 || d2l | d2h | 0 | 0 || d3l | d3h | 0 | 0 || +// --------------------------------------------------------------------------- +.macro transpose_streams_to_lanes + vmov d1, d4 // || d0l | d0h | d2l | d2h || d1l | d1h | 0 | 0 || d2l | d2h | 0 | 0 || d3l | d3h | 0 | 0 || + vmov d3, d6 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d2l | d2h | 0 | 0 || d3l | d3h | 0 | 0 || + vrev64.u32 q2, q0 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d0h | d0l | d2h | d2l || d3l | d3h | 0 | 0 || + vrev64.u32 q3, q1 // || d0l | d0h | d2l | d2h || d1l | d1h | d3l | d3h || d0h | d0l | d2h | d2l || d1h | d1l | d3h | d3l || + mov r0, #0x0F0F // predicate: select lower 4 bytes within each 64b half + vmsr p0, r0 + vpsel q0, q0, q3 // q0 = [d0l, d1l, d2l, d3l] + vpsel q1, q2, q1 // q1 = [d0h, d1h, d2h, d3h] +.endm + +// --------------------------------------------------------------------------- +// xor_lane_and_store_postinc: XOR one lane 
into state with post-increment. +// rSE / rSO are current pointers to the even / odd state halves. +// --------------------------------------------------------------------------- +.macro xor_lane_and_store_postinc qE, qO, qS0, qS1, rSE, rSO + vldrw.u32 \qS0, [\rSE] // load 16B from even half + vldrw.u32 \qS1, [\rSO] // load 16B from odd half + veor.u32 \qS0, \qS0, \qE + veor.u32 \qS1, \qS1, \qO + vstrw.u32 \qS0, [\rSE], #16 // post-inc by 16 bytes + vstrw.u32 \qS1, [\rSO], #16 +.endm + + +// --------------------------------------------------------------------------- +// void keccak_f1600_x4_state_xor_bytes_asm(void *state, +// const unsigned char *data0, +// const unsigned char *data1, +// const unsigned char *data2, +// const unsigned char *data3, +// unsigned offset, unsigned length) +// +// AAPCS: r0=state, r1=d0, r2=d1, r3=d2, stack: d3, offset, length +// --------------------------------------------------------------------------- + +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +.type MLD_ASM_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm), %function +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_state_xor_bytes_asm) + .equ stack_offset, ((12-4+2)*4+(15-8+1)*8) + push {r4-r12, lr} + vpush {d8-d15} + + state .req r0 + dp0 .req r1 + dp1 .req r2 + dp2 .req r3 + dp3 .req r4 + off .req r5 + length .req r6 + rSO .req r7 + rSE .req r8 + lane_offset_bytes .req r9 + off_full .req r10 + mask .req r11 + tmp .req r0 + nB .req lr + + qP .req q7 + + qd0 .req q0 + qd1 .req q1 + qd2 .req q2 + qd3 .req q3 + + qS0 .req q4 + qS1 .req q5 + + ldr dp3, [sp, #stack_offset+0] + ldr off_full, [sp, #stack_offset+4] + ldr length, [sp, #stack_offset+8] + + cmp length, #0 + beq keccak_f1600_x4_state_xor_bytes_asm_exit + + and off, off_full, #7 + bic lane_offset_bytes, off_full, #7 + + add rSE, state, lane_offset_bytes, lsl #1 + add rSO, rSE, #400 + + // ----------------------------------------------------------------------- + // PROLOGUE: absorb min(len, 8-offset%8) bytes 
at the unaligned position + // ----------------------------------------------------------------------- + cmp off, #0 + beq keccak_f1600_x4_state_xor_bytes_asm_pre_main + + // Subtract offset from data pointers so predicate window aligns + subs dp0, dp0, off + subs dp1, dp1, off + subs dp2, dp2, off + subs dp3, dp3, off + + // nB = min(length, 8 - off) + rsb nB, off, #8 + cmp length, nB + it ls + movls nB, length + subs length, length, nB + + // Build predicate: nB active bytes shifted left by 'off' + vctp.8 nB + vmrs mask, p0 + lsl mask, mask, off + vmsr p0, mask + + // Predicated byte loads (4 bytes per instance, post-increment by 4) + vpstttt + vldrbt.u8 qd0, [dp0], #4 + vldrbt.u8 qd1, [dp1], #4 + vldrbt.u8 qd2, [dp2], #4 + vldrbt.u8 qd3, [dp3], #4 + + // Transpose from per-instance layout to per-lane layout + transpose_streams_to_lanes + + // Bit-interleave (clobbers q2, q3, q4) + to_bit_interleaving_4x q0, q1, q2, q3, q4 + + // XOR into state (post-increments rSE/rSO by 16) + xor_lane_and_store_postinc q0, q1, qS0, qS1, rSE, rSO + + // Build qP = {d0,d1,d2,d3} as u32 lanes for gather loads + vmov qP[2], qP[0], dp0, dp2 + vmov qP[3], qP[1], dp1, dp3 + cmp length, #0 + beq keccak_f1600_x4_state_xor_bytes_asm_exit + b keccak_f1600_x4_state_xor_bytes_asm_main_body + +keccak_f1600_x4_state_xor_bytes_asm_pre_main: + vmov qP[2], qP[0], dp0, dp2 + vmov qP[3], qP[1], dp1, dp3 + mov tmp, #4 + vsub.u32 qP, qP, tmp // pre-bias so first [qP,#4]! lands at original ptr + +keccak_f1600_x4_state_xor_bytes_asm_main_body: + // ----------------------------------------------------------------------- + // MAIN: process full 8-byte lanes + // ----------------------------------------------------------------------- + lsr lr, length, #3 + wls lr, lr, keccak_f1600_x4_state_xor_bytes_asm_main_loop_end +keccak_f1600_x4_state_xor_bytes_asm_main_loop_start: + // Gather 8 bytes per instance (two u32 loads with pre-increment writeback) + vldrw.u32 qd0, [qP, #4]! + vldrw.u32 qd1, [qP, #4]! 
+ + // Bit-interleave (clobbers q2, q3, q4) + to_bit_interleaving_4x q0, q1, q2, q3, q4 + + // XOR into state (post-increments rSE/rSO by 16) + xor_lane_and_store_postinc q0, q1, qS0, qS1, rSE, rSO + + le lr, keccak_f1600_x4_state_xor_bytes_asm_main_loop_start +keccak_f1600_x4_state_xor_bytes_asm_main_loop_end: + + // ----------------------------------------------------------------------- + // TAIL: absorb <8 remaining bytes at lane offset 0 + // ----------------------------------------------------------------------- + ands length, length, #7 + beq keccak_f1600_x4_state_xor_bytes_asm_exit + + // Recover scalar pointers from qP + mov tmp, #4 + vadd.u32 qP, qP, tmp + vmov dp0, dp2, qP[2], qP[0] + vmov dp1, dp3, qP[3], qP[1] + + vctp.8 length + vpstttt + vldrbt.u8 qd0, [dp0] + vldrbt.u8 qd1, [dp1] + vldrbt.u8 qd2, [dp2] + vldrbt.u8 qd3, [dp3] + + // Transpose from per-instance layout to per-lane layout + transpose_streams_to_lanes + + // Bit-interleave (clobbers q2, q3, q4) + to_bit_interleaving_4x q0, q1, q2, q3, q4 + + // XOR into state (post-increments rSE/rSO by 16) + xor_lane_and_store_postinc qd0, qd1, qS0, qS1, rSE, rSO + +keccak_f1600_x4_state_xor_bytes_asm_exit: + vpop {d8-d15} + pop {r4-r12, pc} + .unreq state + .unreq dp0 + .unreq dp1 + .unreq dp2 + .unreq dp3 + .unreq off + .unreq length + .unreq rSO + .unreq rSE + .unreq lane_offset_bytes + .unreq off_full + .unreq mask + .unreq tmp + .unreq nB + .unreq qP + .unreq qd0 + .unreq qd1 + .unreq qd2 + .unreq qd3 + .unreq qS0 + .unreq qS1 + +/* simpasm: footer-start */ +#endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index 4e02f0d7b..e4c74a4a4 100644 --- a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -586,11 +586,17 @@ #undef MLD_FIPS202_ARMV81M_NEED_X4 #undef MLD_FIPS202_NATIVE_ARMV81M #undef MLD_FIPS202_NATIVE_ARMV81M_MVE_H +#undef MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE #undef MLD_USE_FIPS202_X4_NATIVE +#undef 
MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE #undef mld_keccak_f1600_x4_native_impl +#undef mld_keccak_f1600_x4_state_extract_bytes +#undef mld_keccak_f1600_x4_state_xor_bytes /* mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h */ #undef MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H #undef mld_keccak_f1600_x4_mve_asm +#undef mld_keccak_f1600_x4_state_extract_bytes_asm +#undef mld_keccak_f1600_x4_state_xor_bytes_asm #undef mld_keccakf1600_round_constants #endif /* MLD_SYS_ARMV81M_MVE */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S index cee9460ab..270e6009f 100644 --- a/mldsa/mldsa_native_asm.S +++ b/mldsa/mldsa_native_asm.S @@ -101,6 +101,8 @@ #endif #if defined(MLD_SYS_ARMV81M_MVE) #include "src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S" +#include "src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S" +#include "src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S" #endif #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ @@ -589,11 +591,17 @@ #undef MLD_FIPS202_ARMV81M_NEED_X4 #undef MLD_FIPS202_NATIVE_ARMV81M #undef MLD_FIPS202_NATIVE_ARMV81M_MVE_H +#undef MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE #undef MLD_USE_FIPS202_X4_NATIVE +#undef MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE #undef mld_keccak_f1600_x4_native_impl +#undef mld_keccak_f1600_x4_state_extract_bytes +#undef mld_keccak_f1600_x4_state_xor_bytes /* mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h */ #undef MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H #undef mld_keccak_f1600_x4_mve_asm +#undef mld_keccak_f1600_x4_state_extract_bytes_asm +#undef mld_keccak_f1600_x4_state_xor_bytes_asm #undef mld_keccakf1600_round_constants #endif /* MLD_SYS_ARMV81M_MVE */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 */ diff --git a/mldsa/src/fips202/keccakf1600.c b/mldsa/src/fips202/keccakf1600.c index 0aec7b30c..83f7aebbb 100644 --- a/mldsa/src/fips202/keccakf1600.c +++ 
b/mldsa/src/fips202/keccakf1600.c @@ -80,11 +80,12 @@ void mld_keccakf1600_xor_bytes(uint64_t *state, const unsigned char *data, #endif /* !MLD_SYS_LITTLE_ENDIAN */ } -MLD_INTERNAL_API -void mld_keccakf1600x4_extract_bytes(uint64_t *state, unsigned char *data0, - unsigned char *data1, unsigned char *data2, - unsigned char *data3, unsigned offset, - unsigned length) +static void mld_keccakf1600x4_extract_bytes_c(uint64_t *state, + unsigned char *data0, + unsigned char *data1, + unsigned char *data2, + unsigned char *data3, + unsigned offset, unsigned length) { mld_keccakf1600_extract_bytes(state + MLD_KECCAK_LANES * 0, data0, offset, length); @@ -97,11 +98,29 @@ void mld_keccakf1600x4_extract_bytes(uint64_t *state, unsigned char *data0, } MLD_INTERNAL_API -void mld_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned char *data0, - const unsigned char *data1, - const unsigned char *data2, - const unsigned char *data3, unsigned offset, - unsigned length) +void mld_keccakf1600x4_extract_bytes(uint64_t *state, unsigned char *data0, + unsigned char *data1, unsigned char *data2, + unsigned char *data3, unsigned offset, + unsigned length) +{ +#if defined(MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE) + if (mld_keccakf1600_extract_bytes_x4_native(state, data0, data1, data2, data3, + offset, length) == + MLD_NATIVE_FUNC_SUCCESS) + { + return; + } +#endif /* MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE */ + mld_keccakf1600x4_extract_bytes_c(state, data0, data1, data2, data3, offset, + length); +} + +static void mld_keccakf1600x4_xor_bytes_c(uint64_t *state, + const unsigned char *data0, + const unsigned char *data1, + const unsigned char *data2, + const unsigned char *data3, + unsigned offset, unsigned length) { mld_keccakf1600_xor_bytes(state + MLD_KECCAK_LANES * 0, data0, offset, length); @@ -113,6 +132,25 @@ void mld_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned char *data0, length); } +MLD_INTERNAL_API +void mld_keccakf1600x4_xor_bytes(uint64_t *state, const unsigned 
char *data0, + const unsigned char *data1, + const unsigned char *data2, + const unsigned char *data3, unsigned offset, + unsigned length) +{ +#if defined(MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE) + if (mld_keccakf1600_xor_bytes_x4_native(state, data0, data1, data2, data3, + offset, + length) == MLD_NATIVE_FUNC_SUCCESS) + { + return; + } +#endif /* MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE */ + mld_keccakf1600x4_xor_bytes_c(state, data0, data1, data2, data3, offset, + length); +} + MLD_INTERNAL_API void mld_keccakf1600x4_permute(uint64_t *state) { diff --git a/mldsa/src/fips202/native/armv81m/mve.h b/mldsa/src/fips202/native/armv81m/mve.h index d0ab2be78..3d5e6f7db 100644 --- a/mldsa/src/fips202/native/armv81m/mve.h +++ b/mldsa/src/fips202/native/armv81m/mve.h @@ -11,12 +11,18 @@ /* Part of backend API */ #define MLD_USE_FIPS202_X4_NATIVE +#define MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE +#define MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE /* Guard for assembly file */ #define MLD_FIPS202_ARMV81M_NEED_X4 #if !defined(__ASSEMBLER__) #include "../api.h" +/* + * Native x4 permutation + * State is kept in bit-interleaved format. 
+ */ #define mld_keccak_f1600_x4_native_impl \ MLD_NAMESPACE(keccak_f1600_x4_native_impl) int mld_keccak_f1600_x4_native_impl(uint64_t *state); @@ -27,6 +33,48 @@ static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state) return mld_keccak_f1600_x4_native_impl(state); } +/* + * Native x4 XOR bytes (with on-the-fly bit interleaving) + */ +#define mld_keccak_f1600_x4_state_xor_bytes \ + MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +void mld_keccak_f1600_x4_state_xor_bytes(void *state, const uint8_t *data0, + const uint8_t *data1, + const uint8_t *data2, + const uint8_t *data3, unsigned offset, + unsigned length); + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_xor_bytes_x4_native( + uint64_t *state, const uint8_t *data0, const uint8_t *data1, + const uint8_t *data2, const uint8_t *data3, unsigned offset, + unsigned length) +{ + mld_keccak_f1600_x4_state_xor_bytes(state, data0, data1, data2, data3, offset, + length); + return MLD_NATIVE_FUNC_SUCCESS; +} + +/* + * Native x4 extract bytes (with on-the-fly bit de-interleaving) + */ +#define mld_keccak_f1600_x4_state_extract_bytes \ + MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +void mld_keccak_f1600_x4_state_extract_bytes(void *state, uint8_t *data0, + uint8_t *data1, uint8_t *data2, + uint8_t *data3, unsigned offset, + unsigned length); + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_keccakf1600_extract_bytes_x4_native( + uint64_t *state, uint8_t *data0, uint8_t *data1, uint8_t *data2, + uint8_t *data3, unsigned offset, unsigned length) +{ + mld_keccak_f1600_x4_state_extract_bytes(state, data0, data1, data2, data3, + offset, length); + return MLD_NATIVE_FUNC_SUCCESS; +} + #endif /* !__ASSEMBLER__ */ #endif /* !MLD_FIPS202_NATIVE_ARMV81M_MVE_H */ diff --git a/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h b/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h index dee44842d..779fd9304 100644 --- 
a/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h +++ b/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h @@ -17,4 +17,20 @@ extern const uint32_t mld_keccakf1600_round_constants[48]; void mld_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100], const uint32_t rc[48]); +#define mld_keccak_f1600_x4_state_xor_bytes_asm \ + MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +void mld_keccak_f1600_x4_state_xor_bytes_asm(void *state, const uint8_t *d0, + const uint8_t *d1, + const uint8_t *d2, + const uint8_t *d3, unsigned offset, + unsigned length); + +#define mld_keccak_f1600_x4_state_extract_bytes_asm \ + MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +void mld_keccak_f1600_x4_state_extract_bytes_asm(void *state, uint8_t *data0, + uint8_t *data1, uint8_t *data2, + uint8_t *data3, + unsigned offset, + unsigned length); + #endif /* !MLD_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */ diff --git a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S index 557c9136c..4aca4a354 100644 --- a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +++ b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S @@ -7,7 +7,7 @@ /*yaml Name: keccak_f1600_x4_mve_asm - Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state + Description: Armv8.1-M MVE implementation of batched (x4) Keccak-f[1600] permutation using bit-interleaved state Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc) ABI: r0: @@ -15,7 +15,7 @@ size_bytes: 800 permissions: read/write c_parameter: void *state - description: Four bit-interleaved Keccak states (low halves followed by high halves) + description: Bit-interleaved state for 4 Keccak instances (even halves followed by odd halves) r1: type: buffer size_bytes: 800 @@ -33,6 +33,36 @@ description: register preservation (44) + SIMD 
registers (64) + temporary storage (128) */ +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. +// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). +// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... 
+// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd + #include "../../../../common.h" #if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) diff --git a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c index e74fd8913..e26f1bf22 100644 --- a/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +++ b/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c @@ -12,114 +12,19 @@ #include "fips202_native_armv81m.h" -/* - * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. - * TODO: Replace with optimized MVE assembly implementations - * (as a part of XORBytes and ExtractBytes) - */ - -/* Extract even-indexed bits from 64-bit value into lower 32 bits */ -static uint32_t bitinterleave_even(uint64_t x) -{ - uint64_t t; - t = x & 0x5555555555555555ULL; - t = (t | (t >> 1)) & 0x3333333333333333ULL; - t = (t | (t >> 2)) & 0x0f0f0f0f0f0f0f0fULL; - t = (t | (t >> 4)) & 0x00ff00ff00ff00ffULL; - t = (t | (t >> 8)) & 0x0000ffff0000ffffULL; - t = (t | (t >> 16)) & 0x00000000ffffffffULL; - return (uint32_t)t; -} - -/* Extract odd-indexed bits from 64-bit value into lower 32 bits */ -static uint32_t bitinterleave_odd(uint64_t x) -{ - return bitinterleave_even(x >> 1); -} - -/* Spread 32-bit value across even bit positions of 64-bit result */ -static uint64_t spread_even(uint32_t x) -{ - uint64_t t = x; - t = (t | (t << 16)) & 0x0000ffff0000ffffULL; - t = (t | (t << 8)) & 0x00ff00ff00ff00ffULL; - t = (t | (t << 4)) & 0x0f0f0f0f0f0f0f0fULL; - t = (t | (t << 2)) & 0x3333333333333333ULL; - t = (t | (t << 1)) & 0x5555555555555555ULL; - return t; -} - -/* Combine even and odd 32-bit halves into interleaved 64-bit value */ -static uint64_t bitdeinterleave(uint32_t even, uint32_t odd) -{ - return spread_even(even) | (spread_even(odd) << 1); -} /* - * TEMPORARY: Naive C interleaving functions. 
- * These will be replaced with optimized MVE assembly implementations. + * Keccak-f1600 x4 permutation (on bit-interleaved state) + * State is expected to already be in bit-interleaved format. */ -static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0, - const uint64_t *state1, const uint64_t *state2, - const uint64_t *state3) -{ - uint32_t *state_4xl = (uint32_t *)state_4x; - uint32_t *state_4xh = (uint32_t *)state_4x + 100; - - for (size_t i = 0; i < 25; i++) - { - state_4xl[i * 4 + 0] = bitinterleave_even(state0[i]); - state_4xl[i * 4 + 1] = bitinterleave_even(state1[i]); - state_4xl[i * 4 + 2] = bitinterleave_even(state2[i]); - state_4xl[i * 4 + 3] = bitinterleave_even(state3[i]); - - state_4xh[i * 4 + 0] = bitinterleave_odd(state0[i]); - state_4xh[i * 4 + 1] = bitinterleave_odd(state1[i]); - state_4xh[i * 4 + 2] = bitinterleave_odd(state2[i]); - state_4xh[i * 4 + 3] = bitinterleave_odd(state3[i]); - } -} - -static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0, - uint64_t *state1, uint64_t *state2, - uint64_t *state3) -{ - uint32_t *state_4xl = (uint32_t *)state_4x; - uint32_t *state_4xh = (uint32_t *)state_4x + 100; - - for (size_t i = 0; i < 25; i++) - { - state0[i] = bitdeinterleave(state_4xl[i * 4 + 0], state_4xh[i * 4 + 0]); - state1[i] = bitdeinterleave(state_4xl[i * 4 + 1], state_4xh[i * 4 + 1]); - state2[i] = bitdeinterleave(state_4xl[i * 4 + 2], state_4xh[i * 4 + 2]); - state3[i] = bitdeinterleave(state_4xl[i * 4 + 3], state_4xh[i * 4 + 3]); - } -} - #define mld_keccak_f1600_x4_native_impl \ MLD_NAMESPACE(keccak_f1600_x4_native_impl) int mld_keccak_f1600_x4_native_impl(uint64_t *state) { - /* - * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations. 
- * TODO: Replace with optimized MVE assembly implementations - * (as a part of XORBytes and ExtractBytes) - */ - MLD_ALIGN uint64_t state_4x[100]; - MLD_ALIGN uint64_t state_4x_tmp[100]; - - /* Interleave the 4 states into bit-interleaved format */ - interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); - - /* Run the permutation */ - mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp, + MLD_ALIGN uint64_t state_tmp[100]; + mld_keccak_f1600_x4_mve_asm(state, state_tmp, mld_keccakf1600_round_constants); - - /* Deinterleave back to 4 separate states */ - deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]); - - mld_zeroize(state_4x, sizeof(state_4x)); - mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp)); + mld_zeroize(state_tmp, sizeof(state_tmp)); return MLD_NATIVE_FUNC_SUCCESS; } diff --git a/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S b/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S new file mode 100644 index 000000000..8a2bd9dc7 --- /dev/null +++ b/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S @@ -0,0 +1,290 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2026 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// --------------------------------------------------------------------------- +// Overview +// --------------------------------------------------------------------------- +// MVE/Helium implementation of KeccakF1600x4_StateExtractBytes +// (inverse of state_xor_bytes_x4_mve.S). +// +// void KeccakF1600x4_StateExtractBytes(state, d0, d1, d2, d3, offset, length) +// +// Reads 'length' bytes from the bit-interleaved Keccak state starting at +// byte 'offset', recombines the even and odd halves of each lane back +// into plain bytes, and writes them to four output buffers (d0..d3). 
+// +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. +// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). +// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... +// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd +// +// --------------------------------------------------------------------------- +// Three-phase structure +// --------------------------------------------------------------------------- +// Prologue -- if offset is not 8-byte aligned, extract +// min(length, 8-(offset%8)) bytes via predicated byte stores. +// Main -- process full 8-byte groups: load even/odd lane pair, +// de-interleave, scatter-store to output buffers. +// Tail -- extract remaining <8 bytes via predicated byte stores. 
+ +#include "../../../../common.h" +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S using scripts/simpasm. Do not modify it directly. + */ + +.thumb +.syntax unified + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_state_extract_bytes_asm) + + push.w {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + vpush {d8, d9, d10, d11, d12, d13, d14, d15} + ldr r4, [sp, #0x68] + ldr.w r10, [sp, #0x6c] + ldr r6, [sp, #0x70] + cmp r6, #0x0 + beq.w keccak_f1600_x4_state_extract_bytes_asm_exit @ imm = #0x2ea + and r5, r10, #0x7 + bic r9, r10, #0x7 + add.w r8, r0, r9, lsl #1 + add.w r7, r8, #0x190 + cmp r5, #0x0 + beq.w keccak_f1600_x4_state_extract_bytes_asm_pre_main @ imm = #0x112 + vldrw.u32 q0, [r8], #16 + vldrw.u32 q1, [r7], #16 + vrev32.16 q2, q0 + vrev32.16 q3, q1 + vsli.32 q0, q0, #0x8 + vsli.16 q0, q0, #0x4 + vsli.8 q0, q0, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x4 + vshr.u8 q4, q0, #0x5 + vsli.8 q0, q4, #0x6 + vsli.32 q1, q1, #0x8 + vsli.16 q1, q1, #0x4 + vsli.8 q1, q1, #0x1 + vshr.u8 q4, q1, #0x3 + vsli.8 q1, q4, #0x4 + vshr.u8 q4, q1, #0x5 + vsli.8 q1, q4, #0x6 + mov.w r0, #0x55 + vdup.8 q4, r0 + vand q0, q0, q4 + vand q1, q1, q4 + vshl.i32 q1, q1, #0x1 + vorr q0, q0, q1 + vsli.32 q2, q2, #0x8 + vsli.16 q2, q2, #0x4 + vsli.8 q2, q2, #0x1 + vshr.u8 q1, q2, #0x3 + vsli.8 q2, q1, #0x4 + vshr.u8 q1, q2, #0x5 + vsli.8 q2, q1, #0x6 + vsli.32 q3, q3, #0x8 + vsli.16 q3, q3, #0x4 + vsli.8 q3, q3, #0x1 + vshr.u8 q1, q3, #0x3 + vsli.8 q3, q1, #0x4 + vshr.u8 q1, q3, #0x5 + vsli.8 q3, q1, #0x6 + vand q1, q2, q4 + vand q3, q3, q4 + vshl.i32 q3, q3, #0x1 + vorr q1, q1, q3 + vrev64.32 q2, q0 + vrev64.32 q3, q1 + movw r0, #0xf0f + vmsr p0, r0 + vpsel q0, q0, q3 + vpsel q1, q2, q1 + vmov.f64 d4, d1 + vmov.f64 d6, d3 + rsb.w lr, r5, #0x8 
+ cmp r6, lr + it ls + movls lr, r6 + vctp.8 lr + vmrs r11, p0 + lsl.w r11, r11, r5 + vmsr p0, r11 + subs r1, r1, r5 + subs r2, r2, r5 + subs r3, r3, r5 + subs r4, r4, r5 + vpstttt + vstrbt.8 q0, [r1], #4 + vstrbt.8 q1, [r2], #4 + vstrbt.8 q2, [r3], #4 + vstrbt.8 q3, [r4], #4 + subs.w r6, r6, lr + cmp r6, #0x0 + beq.w keccak_f1600_x4_state_extract_bytes_asm_exit @ imm = #0x1cc + vmov q7[2], q7[0], r1, r3 + vmov q7[3], q7[1], r2, r4 + b keccak_f1600_x4_state_extract_bytes_asm_main_body @ imm = #0xe + +keccak_f1600_x4_state_extract_bytes_asm_pre_main: + vmov q7[2], q7[0], r1, r3 + vmov q7[3], q7[1], r2, r4 + mov.w r12, #0x4 + vsub.i32 q7, q7, r12 + +keccak_f1600_x4_state_extract_bytes_asm_main_body: + lsr.w lr, r6, #0x3 + wls lr, lr, keccak_f1600_x4_state_extract_bytes_asm_main_loop_end @ imm = #0xb4 + +keccak_f1600_x4_state_extract_bytes_asm_main_loop_start: + vldrw.u32 q0, [r8], #16 + vldrw.u32 q1, [r7], #16 + vrev32.16 q2, q0 + vrev32.16 q3, q1 + vsli.32 q0, q0, #0x8 + vsli.16 q0, q0, #0x4 + vsli.8 q0, q0, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x4 + vshr.u8 q4, q0, #0x5 + vsli.8 q0, q4, #0x6 + vsli.32 q1, q1, #0x8 + vsli.16 q1, q1, #0x4 + vsli.8 q1, q1, #0x1 + vshr.u8 q4, q1, #0x3 + vsli.8 q1, q4, #0x4 + vshr.u8 q4, q1, #0x5 + vsli.8 q1, q4, #0x6 + mov.w r0, #0x55 + vdup.8 q4, r0 + vand q0, q0, q4 + vand q1, q1, q4 + vshl.i32 q1, q1, #0x1 + vorr q0, q0, q1 + vsli.32 q2, q2, #0x8 + vsli.16 q2, q2, #0x4 + vsli.8 q2, q2, #0x1 + vshr.u8 q1, q2, #0x3 + vsli.8 q2, q1, #0x4 + vshr.u8 q1, q2, #0x5 + vsli.8 q2, q1, #0x6 + vsli.32 q3, q3, #0x8 + vsli.16 q3, q3, #0x4 + vsli.8 q3, q3, #0x1 + vshr.u8 q1, q3, #0x3 + vsli.8 q3, q1, #0x4 + vshr.u8 q1, q3, #0x5 + vsli.8 q3, q1, #0x6 + vand q1, q2, q4 + vand q3, q3, q4 + vshl.i32 q3, q3, #0x1 + vorr q1, q1, q3 + vstrw.32 q0, [q7, #4]! + vstrw.32 q1, [q7, #4]! 
+ le lr, keccak_f1600_x4_state_extract_bytes_asm_main_loop_start @ imm = #-0xb4 + +keccak_f1600_x4_state_extract_bytes_asm_main_loop_end: + ands r6, r6, #0x7 + beq keccak_f1600_x4_state_extract_bytes_asm_exit @ imm = #0xee + mov.w r12, #0x4 + vadd.i32 q7, q7, r12 + vmov r1, r3, q7[2], q7[0] + vmov r2, r4, q7[3], q7[1] + vldrw.u32 q0, [r8], #16 + vldrw.u32 q1, [r7], #16 + vrev32.16 q2, q0 + vrev32.16 q3, q1 + vsli.32 q0, q0, #0x8 + vsli.16 q0, q0, #0x4 + vsli.8 q0, q0, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x4 + vshr.u8 q4, q0, #0x5 + vsli.8 q0, q4, #0x6 + vsli.32 q1, q1, #0x8 + vsli.16 q1, q1, #0x4 + vsli.8 q1, q1, #0x1 + vshr.u8 q4, q1, #0x3 + vsli.8 q1, q4, #0x4 + vshr.u8 q4, q1, #0x5 + vsli.8 q1, q4, #0x6 + mov.w r0, #0x55 + vdup.8 q4, r0 + vand q0, q0, q4 + vand q1, q1, q4 + vshl.i32 q1, q1, #0x1 + vorr q0, q0, q1 + vsli.32 q2, q2, #0x8 + vsli.16 q2, q2, #0x4 + vsli.8 q2, q2, #0x1 + vshr.u8 q1, q2, #0x3 + vsli.8 q2, q1, #0x4 + vshr.u8 q1, q2, #0x5 + vsli.8 q2, q1, #0x6 + vsli.32 q3, q3, #0x8 + vsli.16 q3, q3, #0x4 + vsli.8 q3, q3, #0x1 + vshr.u8 q1, q3, #0x3 + vsli.8 q3, q1, #0x4 + vshr.u8 q1, q3, #0x5 + vsli.8 q3, q1, #0x6 + vand q1, q2, q4 + vand q3, q3, q4 + vshl.i32 q3, q3, #0x1 + vorr q1, q1, q3 + vrev64.32 q2, q0 + vrev64.32 q3, q1 + movw r0, #0xf0f + vmsr p0, r0 + vpsel q0, q0, q3 + vpsel q1, q2, q1 + vmov.f64 d4, d1 + vmov.f64 d6, d3 + vctp.8 r6 + vpstttt + vstrbt.8 q0, [r1], #4 + vstrbt.8 q1, [r2], #4 + vstrbt.8 q2, [r3], #4 + vstrbt.8 q3, [r4], #4 + +keccak_f1600_x4_state_extract_bytes_asm_exit: + vpop {d8, d9, d10, d11, d12, d13, d14, d15} + pop.w {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + +MLD_ASM_FN_SIZE(keccak_f1600_x4_state_extract_bytes_asm) + +#endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S b/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S new file mode 100644 index 000000000..00907fb4d --- /dev/null +++ 
b/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S @@ -0,0 +1,314 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * Copyright (c) 2026 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// --------------------------------------------------------------------------- +// Overview +// --------------------------------------------------------------------------- +// MVE/Helium implementation of KeccakF1600x4_StateXORBytes. +// +// void KeccakF1600x4_StateXORBytes(state, d0, d1, d2, d3, offset, length) +// +// Reads 'length' plain bytes from each of four input buffers (d0..d3), +// splits every byte into its even and odd bits (bit-interleaving), and +// XORs the result into the Keccak state starting at byte 'offset'. +// +// --------------------------------------------------------------------------- +// Bit-interleaving background +// --------------------------------------------------------------------------- +// Each 64-bit Keccak lane is stored as two 32-bit words: +// even half -- bits 0, 2, 4, ..., 62 of the lane +// odd half -- bits 1, 3, 5, ..., 63 of the lane +// This representation allows 64-bit lane rotations (used in the Keccak +// round function) to be implemented as pairs of 32-bit rotations. +// +// Batched (x4) processing: +// Four Keccak instances are processed as a batch. Their states are +// stored interleaved in a single 800-byte buffer: first the even +// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes). +// Within each 16-byte row, the four u32 words correspond to +// instances 0..3 of the same lane, enabling SIMD-parallel operations +// across all four instances. +// +// State memory layout (25 lanes x 4 instances x 2 halves): +// S[i][l]_even/odd = even/odd half of lane l, instance i (u32) +// Each row is 16 bytes (one Q-register). 
+// Offset Contents +// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even +// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even +// ... +// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even +// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd +// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd +// ... +// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd +// +// --------------------------------------------------------------------------- +// Three-phase structure +// --------------------------------------------------------------------------- +// Prologue -- if offset is not 8-byte aligned, absorb +// min(length, 8-(offset%8)) bytes via predicated byte loads. +// Main -- process full 8-byte groups via word-level gather loads, +// bit-interleave, then VEOR into even/odd state halves. +// Tail -- absorb remaining <8 bytes via predicated byte loads. + +#include "../../../../common.h" +#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/fips202/armv81m/src/state_xor_bytes_x4_mve.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.thumb +.syntax unified + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm) +MLD_ASM_FN_SYMBOL(keccak_f1600_x4_state_xor_bytes_asm) + + push.w {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + vpush {d8, d9, d10, d11, d12, d13, d14, d15} + ldr r4, [sp, #0x68] + ldr.w r10, [sp, #0x6c] + ldr r6, [sp, #0x70] + cmp r6, #0x0 + beq.w keccak_f1600_x4_state_xor_bytes_asm_exit @ imm = #0x34c + and r5, r10, #0x7 + bic r9, r10, #0x7 + add.w r8, r0, r9, lsl #1 + add.w r7, r8, #0x190 + cmp r5, #0x0 + beq.w keccak_f1600_x4_state_xor_bytes_asm_pre_main @ imm = #0x132 + subs r1, r1, r5 + subs r2, r2, r5 + subs r3, r3, r5 + subs r4, r4, r5 + rsb.w lr, r5, #0x8 + cmp r6, lr + it ls + movls lr, r6 + subs.w r6, r6, lr + vctp.8 lr + vmrs r11, p0 + lsl.w r11, r11, r5 + vmsr p0, r11 + vpstttt + vldrbt.u8 q0, [r1], #4 + vldrbt.u8 q1, [r2], #4 + vldrbt.u8 q2, [r3], #4 + vldrbt.u8 q3, [r4], #4 + vmov.f64 d1, d4 + vmov.f64 d3, d6 + vrev64.32 q2, q0 + vrev64.32 q3, q1 + movw r0, #0xf0f + vmsr p0, r0 + vpsel q0, q0, q3 + vpsel q1, q2, q1 + vmov q2, q0 + vmov q3, q1 + vshr.u8 q4, q0, #0x2 + vsli.8 q0, q4, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x2 + vshr.u8 q4, q0, #0x4 + vsli.8 q0, q4, #0x3 + vshr.u16 q4, q0, #0x8 + vsli.8 q0, q4, #0x4 + vshr.u32 q4, q0, #0x10 + vsli.16 q0, q4, #0x8 + vshr.u8 q4, q3, #0x2 + vsli.8 q3, q4, #0x1 + vshr.u8 q4, q3, #0x3 + vsli.8 q3, q4, #0x2 + vshr.u8 q4, q3, #0x4 + vsli.8 q3, q4, #0x3 + vshr.u16 q4, q3, #0x8 + vsli.8 q3, q4, #0x4 + vshr.u32 q4, q3, #0x10 + vsli.16 q3, q4, #0x8 + vsli.32 q0, q3, #0x10 + vshl.i8 q4, q2, #0x2 + vsri.8 q2, q4, #0x1 + vshl.i8 q4, q2, #0x3 + vsri.8 q2, q4, #0x2 + vshl.i8 q4, q2, #0x4 + vsri.8 q2, q4, #0x3 + vshl.i16 q4, q2, #0x8 + vsri.8 q2, q4, #0x4 + vshl.i32 q4, q2, #0x10 + vsri.16 q2, q4, #0x8 + vshl.i8 q4, q1, #0x2 + vsri.8 q1, q4, #0x1 + vshl.i8 q4, q1, #0x3 + vsri.8 q1, q4, #0x2 + vshl.i8 q4, q1, #0x4 + vsri.8 q1, q4, #0x3 + vshl.i16 q4, q1, #0x8 + vsri.8 q1, q4, #0x4 + vshl.i32 q4, q1, 
#0x10 + vsri.16 q1, q4, #0x8 + vsri.32 q1, q2, #0x10 + vldrw.u32 q4, [r8] + vldrw.u32 q5, [r7] + veor q4, q4, q0 + veor q5, q5, q1 + vstrw.32 q4, [r8], #16 + vstrw.32 q5, [r7], #16 + vmov q7[2], q7[0], r1, r3 + vmov q7[3], q7[1], r2, r4 + cmp r6, #0x0 + beq.w keccak_f1600_x4_state_xor_bytes_asm_exit @ imm = #0x206 + b keccak_f1600_x4_state_xor_bytes_asm_main_body @ imm = #0xe + +keccak_f1600_x4_state_xor_bytes_asm_pre_main: + vmov q7[2], q7[0], r1, r3 + vmov q7[3], q7[1], r2, r4 + mov.w r0, #0x4 + vsub.i32 q7, q7, r0 + +keccak_f1600_x4_state_xor_bytes_asm_main_body: + lsr.w lr, r6, #0x3 + wls lr, lr, keccak_f1600_x4_state_xor_bytes_asm_main_loop_end @ imm = #0xd4 + +keccak_f1600_x4_state_xor_bytes_asm_main_loop_start: + vldrw.u32 q0, [q7, #4]! + vldrw.u32 q1, [q7, #4]! + vmov q2, q0 + vmov q3, q1 + vshr.u8 q4, q0, #0x2 + vsli.8 q0, q4, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x2 + vshr.u8 q4, q0, #0x4 + vsli.8 q0, q4, #0x3 + vshr.u16 q4, q0, #0x8 + vsli.8 q0, q4, #0x4 + vshr.u32 q4, q0, #0x10 + vsli.16 q0, q4, #0x8 + vshr.u8 q4, q3, #0x2 + vsli.8 q3, q4, #0x1 + vshr.u8 q4, q3, #0x3 + vsli.8 q3, q4, #0x2 + vshr.u8 q4, q3, #0x4 + vsli.8 q3, q4, #0x3 + vshr.u16 q4, q3, #0x8 + vsli.8 q3, q4, #0x4 + vshr.u32 q4, q3, #0x10 + vsli.16 q3, q4, #0x8 + vsli.32 q0, q3, #0x10 + vshl.i8 q4, q2, #0x2 + vsri.8 q2, q4, #0x1 + vshl.i8 q4, q2, #0x3 + vsri.8 q2, q4, #0x2 + vshl.i8 q4, q2, #0x4 + vsri.8 q2, q4, #0x3 + vshl.i16 q4, q2, #0x8 + vsri.8 q2, q4, #0x4 + vshl.i32 q4, q2, #0x10 + vsri.16 q2, q4, #0x8 + vshl.i8 q4, q1, #0x2 + vsri.8 q1, q4, #0x1 + vshl.i8 q4, q1, #0x3 + vsri.8 q1, q4, #0x2 + vshl.i8 q4, q1, #0x4 + vsri.8 q1, q4, #0x3 + vshl.i16 q4, q1, #0x8 + vsri.8 q1, q4, #0x4 + vshl.i32 q4, q1, #0x10 + vsri.16 q1, q4, #0x8 + vsri.32 q1, q2, #0x10 + vldrw.u32 q4, [r8] + vldrw.u32 q5, [r7] + veor q4, q4, q0 + veor q5, q5, q1 + vstrw.32 q4, [r8], #16 + vstrw.32 q5, [r7], #16 + le lr, keccak_f1600_x4_state_xor_bytes_asm_main_loop_start @ imm = #-0xd4 + 
+keccak_f1600_x4_state_xor_bytes_asm_main_loop_end: + ands r6, r6, #0x7 + beq.w keccak_f1600_x4_state_xor_bytes_asm_exit @ imm = #0x110 + mov.w r0, #0x4 + vadd.i32 q7, q7, r0 + vmov r1, r3, q7[2], q7[0] + vmov r2, r4, q7[3], q7[1] + vctp.8 r6 + vpstttt + vldrbt.u8 q0, [r1] + vldrbt.u8 q1, [r2] + vldrbt.u8 q2, [r3] + vldrbt.u8 q3, [r4] + vmov.f64 d1, d4 + vmov.f64 d3, d6 + vrev64.32 q2, q0 + vrev64.32 q3, q1 + movw r0, #0xf0f + vmsr p0, r0 + vpsel q0, q0, q3 + vpsel q1, q2, q1 + vmov q2, q0 + vmov q3, q1 + vshr.u8 q4, q0, #0x2 + vsli.8 q0, q4, #0x1 + vshr.u8 q4, q0, #0x3 + vsli.8 q0, q4, #0x2 + vshr.u8 q4, q0, #0x4 + vsli.8 q0, q4, #0x3 + vshr.u16 q4, q0, #0x8 + vsli.8 q0, q4, #0x4 + vshr.u32 q4, q0, #0x10 + vsli.16 q0, q4, #0x8 + vshr.u8 q4, q3, #0x2 + vsli.8 q3, q4, #0x1 + vshr.u8 q4, q3, #0x3 + vsli.8 q3, q4, #0x2 + vshr.u8 q4, q3, #0x4 + vsli.8 q3, q4, #0x3 + vshr.u16 q4, q3, #0x8 + vsli.8 q3, q4, #0x4 + vshr.u32 q4, q3, #0x10 + vsli.16 q3, q4, #0x8 + vsli.32 q0, q3, #0x10 + vshl.i8 q4, q2, #0x2 + vsri.8 q2, q4, #0x1 + vshl.i8 q4, q2, #0x3 + vsri.8 q2, q4, #0x2 + vshl.i8 q4, q2, #0x4 + vsri.8 q2, q4, #0x3 + vshl.i16 q4, q2, #0x8 + vsri.8 q2, q4, #0x4 + vshl.i32 q4, q2, #0x10 + vsri.16 q2, q4, #0x8 + vshl.i8 q4, q1, #0x2 + vsri.8 q1, q4, #0x1 + vshl.i8 q4, q1, #0x3 + vsri.8 q1, q4, #0x2 + vshl.i8 q4, q1, #0x4 + vsri.8 q1, q4, #0x3 + vshl.i16 q4, q1, #0x8 + vsri.8 q1, q4, #0x4 + vshl.i32 q4, q1, #0x10 + vsri.16 q1, q4, #0x8 + vsri.32 q1, q2, #0x10 + vldrw.u32 q4, [r8] + vldrw.u32 q5, [r7] + veor q4, q4, q0 + veor q5, q5, q1 + vstrw.32 q4, [r8], #16 + vstrw.32 q5, [r7], #16 + +keccak_f1600_x4_state_xor_bytes_asm_exit: + vpop {d8, d9, d10, d11, d12, d13, d14, d15} + pop.w {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + nop + +MLD_ASM_FN_SIZE(keccak_f1600_x4_state_xor_bytes_asm) + +#endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/proofs/cbmc/dummy_backend_fips202_x4.h b/proofs/cbmc/dummy_backend_fips202_x4.h index 
d84df09c5..b42cbf34d 100644 --- a/proofs/cbmc/dummy_backend_fips202_x4.h +++ b/proofs/cbmc/dummy_backend_fips202_x4.h @@ -8,6 +8,8 @@ #define MLD_USE_FIPS202_X4_NATIVE +#define MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE +#define MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE #include "../../mldsa/src/fips202/native/api.h" diff --git a/proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile b/proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile new file mode 100644 index 000000000..770458826 --- /dev/null +++ b/proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile @@ -0,0 +1,37 @@ +# Copyright (c) The mldsa-native project authors +# Copyright (c) The mlkem-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +include ../Makefile_params.common + +HARNESS_ENTRY = harness +HARNESS_FILE = keccakf1600x4_extract_bytes_native_harness + +# This should be a unique identifier for this proof, and will appear on the +# Litani dashboard. It can be human-readable and contain spaces if you wish. +PROOF_UID = keccakf1600x4_extract_bytes_native + +DEFINES += -DMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 -DMLD_CONFIG_FIPS202_BACKEND_FILE="\"dummy_backend_fips202_x4.h\"" +INCLUDES += + +REMOVE_FUNCTION_BODY += +UNWINDSET += + +PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c +PROJECT_SOURCES += $(SRCDIR)/mldsa/src/fips202/keccakf1600.c + +CHECK_FUNCTION_CONTRACTS=mld_keccakf1600x4_extract_bytes +USE_FUNCTION_CONTRACTS=mld_keccakf1600_extract_bytes_x4_native +APPLY_LOOP_CONTRACTS=on +USE_DYNAMIC_FRAMES=1 + +# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead +EXTERNAL_SAT_SOLVER= +CBMCFLAGS=--bitwuzla + +FUNCTION_NAME = keccakf1600x4_extract_bytes_native + +# This function is large enough to need... 
// Copyright (c) The mlkem-native project authors
// Copyright (c) The mldsa-native project authors
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT-0

/*
 * CBMC proof harness for mld_keccakf1600x4_extract_bytes.
 *
 * Every input is deliberately left nondeterministic: CBMC explores all
 * values admitted by the function contract, which is checked against the
 * implementation (CHECK_FUNCTION_CONTRACTS in the accompanying Makefile),
 * while the native backend hook is assumed via its own contract
 * (USE_FUNCTION_CONTRACTS=mld_keccakf1600_extract_bytes_x4_native).
 */

#include <stdint.h>
/* NOTE(review): the original `#include` directive lost its argument
 * (angle-bracketed text stripped in transit). The project header that
 * declares mld_keccakf1600x4_extract_bytes -- presumably
 * fips202/keccakf1600.h, found via the Makefile's include paths -- must
 * be restored here; <stdint.h> alone only covers uint64_t. TODO confirm
 * the exact header name against the sibling x1 harnesses. */

void harness(void)
{
  uint64_t *state;                              /* nondet state pointer */
  unsigned char *data0, *data1, *data2, *data3; /* four output lanes */
  unsigned offset;                              /* nondet byte offset */
  unsigned length;                              /* nondet byte count */
  mld_keccakf1600x4_extract_bytes(state, data0, data1, data2, data3, offset,
                                  length);
}
+PROOF_UID = keccakf1600x4_xor_bytes_native + +DEFINES += -DMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 -DMLD_CONFIG_FIPS202_BACKEND_FILE="\"dummy_backend_fips202_x4.h\"" +INCLUDES += + +REMOVE_FUNCTION_BODY += +UNWINDSET += + +PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c +PROJECT_SOURCES += $(SRCDIR)/mldsa/src/fips202/keccakf1600.c + +CHECK_FUNCTION_CONTRACTS=mld_keccakf1600x4_xor_bytes +USE_FUNCTION_CONTRACTS=mld_keccakf1600_xor_bytes_x4_native +APPLY_LOOP_CONTRACTS=on +USE_DYNAMIC_FRAMES=1 + +# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead +EXTERNAL_SAT_SOLVER= +CBMCFLAGS=--bitwuzla + +FUNCTION_NAME = keccakf1600x4_xor_bytes_native + +# This function is large enough to need... +CBMC_OBJECT_BITS = 8 + +include ../Makefile.common diff --git a/proofs/cbmc/keccakf1600x4_xor_bytes_native/keccakf1600x4_xor_bytes_native_harness.c b/proofs/cbmc/keccakf1600x4_xor_bytes_native/keccakf1600x4_xor_bytes_native_harness.c new file mode 100644 index 000000000..605f127a0 --- /dev/null +++ b/proofs/cbmc/keccakf1600x4_xor_bytes_native/keccakf1600x4_xor_bytes_native_harness.c @@ -0,0 +1,16 @@ +// Copyright (c) The mlkem-native project authors +// Copyright (c) The mldsa-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// Copyright (c) The mlkem-native project authors
// Copyright (c) The mldsa-native project authors
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT-0

/*
 * CBMC proof harness for mld_keccakf1600x4_xor_bytes.
 *
 * Every input is deliberately left nondeterministic: CBMC explores all
 * values admitted by the function contract, which is checked against the
 * implementation (CHECK_FUNCTION_CONTRACTS in the accompanying Makefile),
 * while the native backend hook is assumed via its own contract
 * (USE_FUNCTION_CONTRACTS=mld_keccakf1600_xor_bytes_x4_native).
 */

#include <stdint.h>
/* NOTE(review): the original `#include` directive lost its argument
 * (angle-bracketed text stripped in transit). The project header that
 * declares mld_keccakf1600x4_xor_bytes -- presumably
 * fips202/keccakf1600.h, found via the Makefile's include paths -- must
 * be restored here; <stdint.h> alone only covers uint64_t. TODO confirm
 * the exact header name against the sibling x1 harnesses. */

void harness(void)
{
  uint64_t *state;                 /* nondet state pointer */
  const unsigned char *data0, *data1, *data2, *data3; /* four input lanes */
  unsigned offset;                 /* nondet byte offset */
  unsigned length;                 /* nondet byte count */
  mld_keccakf1600x4_xor_bytes(state, data0, data1, data2, data3, offset,
                              length);
}