aegis-aead · georges-arm · Mar 13, 2026
diff --git a/build.zig b/build.zig
@@ -43,6 +43,7 @@ pub fn build(b: *std.Build) void {
         "src/aegis128x2/aegis128x2_altivec.c",
         "src/aegis128x2/aegis128x2_avx2.c",
         "src/aegis128x2/aegis128x2_neon_aes.c",
+        "src/aegis128x2/aegis128x2_neon_sha3.c",
         "src/aegis128x2/aegis128x2_soft.c",
         "src/aegis128x2/aegis128x2.c",
 
@@ -51,19 +52,22 @@ pub fn build(b: *std.Build) void {
         "src/aegis128x4/aegis128x4_avx2.c",
         "src/aegis128x4/aegis128x4_avx512.c",
         "src/aegis128x4/aegis128x4_neon_aes.c",
+        "src/aegis128x4/aegis128x4_neon_sha3.c",
         "src/aegis128x4/aegis128x4_soft.c",
         "src/aegis128x4/aegis128x4.c",
 
         "src/aegis256/aegis256_aesni.c",
         "src/aegis256/aegis256_altivec.c",
         "src/aegis256/aegis256_neon_aes.c",
+        "src/aegis256/aegis256_neon_sha3.c",
         "src/aegis256/aegis256_soft.c",
         "src/aegis256/aegis256.c",
 
         "src/aegis256x2/aegis256x2_aesni.c",
         "src/aegis256x2/aegis256x2_altivec.c",
         "src/aegis256x2/aegis256x2_avx2.c",
         "src/aegis256x2/aegis256x2_neon_aes.c",
+        "src/aegis256x2/aegis256x2_neon_sha3.c",
         "src/aegis256x2/aegis256x2_soft.c",
         "src/aegis256x2/aegis256x2.c",
 
@@ -72,6 +76,7 @@ pub fn build(b: *std.Build) void {
         "src/aegis256x4/aegis256x4_avx2.c",
         "src/aegis256x4/aegis256x4_avx512.c",
         "src/aegis256x4/aegis256x4_neon_aes.c",
+        "src/aegis256x4/aegis256x4_neon_sha3.c",
         "src/aegis256x4/aegis256x4_soft.c",
         "src/aegis256x4/aegis256x4.c",
 

diff --git a/src/aegis128x2/aegis128x2.c b/src/aegis128x2/aegis128x2.c
@@ -8,6 +8,7 @@
 #include "aegis128x2_altivec.h"
 #include "aegis128x2_avx2.h"
 #include "aegis128x2_neon_aes.h"
+#include "aegis128x2_neon_sha3.h"
 
 #ifndef HAS_HW_AES
 #    include "aegis128x2_soft.h"
@@ -217,6 +218,10 @@ aegis128x2_pick_best_implementation(void)
 #endif
 
 #if defined(__aarch64__) || defined(_M_ARM64)
+    if (aegis_runtime_has_neon_sha3()) {
+        implementation = &aegis128x2_neon_sha3_implementation;
+        return 0;
+    }
     if (aegis_runtime_has_neon_aes()) {
         implementation = &aegis128x2_neon_aes_implementation;
         return 0;

diff --git a/src/aegis128x2/aegis128x2_common.h b/src/aegis128x2/aegis128x2_common.h
@@ -1,6 +1,13 @@
 #define RATE      64
 #define ALIGNMENT 64
 
+// If not inverting state[3] and state[7], treat bitwise-NOT operations as
+// no-ops.
+#ifndef AES_INVERT_STATE37
+#    define AES_BLOCK_NOT(A) (A)
+#    define AES_BLOCK_XNOR(A, B) AES_BLOCK_XOR((A), (B))
+#endif
+
 typedef aes_block_t aegis_blocks[8];
 
 static inline void
@@ -44,11 +51,11 @@ aegis128x2_init(const uint8_t *key, const uint8_t *nonce, aes_block_t *const sta
     state[0] = AES_BLOCK_XOR(k, n);
     state[1] = c1;
     state[2] = c0;
-    state[3] = c1;
+    state[3] = AES_BLOCK_NOT(c1);
     state[4] = AES_BLOCK_XOR(k, n);
     state[5] = AES_BLOCK_XOR(k, c0);
     state[6] = AES_BLOCK_XOR(k, c1);
-    state[7] = AES_BLOCK_XOR(k, c0);
+    state[7] = AES_BLOCK_XNOR(k, c0);
     for (i = 0; i < 10; i++) {
         state[3] = AES_BLOCK_XOR(state[3], context);
         state[7] = AES_BLOCK_XOR(state[7], context);
@@ -73,21 +80,21 @@ aegis128x2_mac(uint8_t *mac, size_t maclen, uint64_t adlen, uint64_t mlen, aes_b
 
     if (maclen == 16) {
         tmp = AES_BLOCK_XOR(state[6], AES_BLOCK_XOR(state[5], state[4]));
-        tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[3], state[2]));
+        tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XNOR(state[3], state[2]));
         tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
         AES_BLOCK_STORE(mac_multi_0, tmp);
         for (i = 0; i < 16; i++) {
             mac[i] = mac_multi_0[i] ^ mac_multi_0[1 * 16 + i];
         }
     } else if (maclen == 32) {
-        tmp = AES_BLOCK_XOR(state[3], state[2]);
+        tmp = AES_BLOCK_XNOR(state[3], state[2]);
         tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
         AES_BLOCK_STORE(mac_multi_0, tmp);
         for (i = 0; i < 16; i++) {
             mac[i] = mac_multi_0[i] ^ mac_multi_0[1 * 16 + i];
         }
 
-        tmp = AES_BLOCK_XOR(state[7], state[6]);
+        tmp = AES_BLOCK_XNOR(state[7], state[6]);
         tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[5], state[4]));
         AES_BLOCK_STORE(mac_multi_1, tmp);
         for (i = 0; i < 16; i++) {
@@ -124,9 +131,9 @@ aegis128x2_squeeze_keystream(uint8_t *const dst, aes_block_t *const state)
     aes_block_t tmp0, tmp1;
 
     tmp0 = AES_BLOCK_XOR(state[6], state[1]);
-    tmp0 = AES_BLOCK_XOR(tmp0, AES_BLOCK_AND(state[2], state[3]));
+    tmp0 = AES_BLOCK_XOR(tmp0, AES_BLOCK_AND(state[2], AES_BLOCK_NOT(state[3])));
     tmp1 = AES_BLOCK_XOR(state[5], state[2]);
-    tmp1 = AES_BLOCK_XOR(tmp1, AES_BLOCK_AND(state[6], state[7]));
+    tmp1 = AES_BLOCK_XOR(tmp1, AES_BLOCK_AND(state[6], AES_BLOCK_NOT(state[7])));
     AES_BLOCK_STORE(dst, tmp0);
     AES_BLOCK_STORE(dst + AES_BLOCK_LENGTH, tmp1);
 }
@@ -143,8 +150,8 @@ aegis128x2_enc(uint8_t *const dst, const uint8_t *const src, aes_block_t *const
     tmp0 = AES_BLOCK_XOR(tmp0, state[1]);
     tmp1 = AES_BLOCK_XOR(msg1, state[5]);
     tmp1 = AES_BLOCK_XOR(tmp1, state[2]);
-    tmp0 = AES_BLOCK_XOR(tmp0, AES_BLOCK_AND(state[2], state[3]));
-    tmp1 = AES_BLOCK_XOR(tmp1, AES_BLOCK_AND(state[6], state[7]));
+    tmp0 = AES_BLOCK_XOR(tmp0, AES_BLOCK_AND(state[2], AES_BLOCK_NOT(state[3])));
+    tmp1 = AES_BLOCK_XOR(tmp1, AES_BLOCK_AND(state[6], AES_BLOCK_NOT(state[7])));
     AES_BLOCK_STORE(dst, tmp0);
     AES_BLOCK_STORE(dst + AES_BLOCK_LENGTH, tmp1);
 
@@ -162,8 +169,8 @@ aegis128x2_dec(uint8_t *const dst, const uint8_t *const src, aes_block_t *const
     msg0 = AES_BLOCK_XOR(msg0, state[1]);
     msg1 = AES_BLOCK_XOR(msg1, state[5]);
     msg1 = AES_BLOCK_XOR(msg1, state[2]);
-    msg0 = AES_BLOCK_XOR(msg0, AES_BLOCK_AND(state[2], state[3]));
-    msg1 = AES_BLOCK_XOR(msg1, AES_BLOCK_AND(state[6], state[7]));
+    msg0 = AES_BLOCK_XOR(msg0, AES_BLOCK_AND(state[2], AES_BLOCK_NOT(state[3])));
+    msg1 = AES_BLOCK_XOR(msg1, AES_BLOCK_AND(state[6], AES_BLOCK_NOT(state[7])));
     AES_BLOCK_STORE(dst, msg0);
     AES_BLOCK_STORE(dst + AES_BLOCK_LENGTH, msg1);
 
@@ -186,8 +193,8 @@ aegis128x2_declast(uint8_t *const dst, const uint8_t *const src, size_t len,
     msg0 = AES_BLOCK_XOR(msg0, state[1]);
     msg1 = AES_BLOCK_XOR(msg1, state[5]);
     msg1 = AES_BLOCK_XOR(msg1, state[2]);
-    msg0 = AES_BLOCK_XOR(msg0, AES_BLOCK_AND(state[2], state[3]));
-    msg1 = AES_BLOCK_XOR(msg1, AES_BLOCK_AND(state[6], state[7]));
+    msg0 = AES_BLOCK_XOR(msg0, AES_BLOCK_AND(state[2], AES_BLOCK_NOT(state[3])));
+    msg1 = AES_BLOCK_XOR(msg1, AES_BLOCK_AND(state[6], AES_BLOCK_NOT(state[7])));
     AES_BLOCK_STORE(pad, msg0);
     AES_BLOCK_STORE(pad + AES_BLOCK_LENGTH, msg1);
 
@@ -220,7 +227,7 @@ aegis128x2_mac_nr(uint8_t *mac, size_t maclen, uint64_t adlen, aes_block_t *stat
     if (maclen == 16) {
 #if AES_BLOCK_LENGTH > 16
         tmp = AES_BLOCK_XOR(state[6], AES_BLOCK_XOR(state[5], state[4]));
-        tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[3], state[2]));
+        tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XNOR(state[3], state[2]));
         tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
         AES_BLOCK_STORE(t, tmp);
         for (i = 0; i < d / 2; i++) {
@@ -235,16 +242,16 @@ aegis128x2_mac_nr(uint8_t *mac, size_t maclen, uint64_t adlen, aes_block_t *stat
         }
 #endif
         tmp = AES_BLOCK_XOR(state[6], AES_BLOCK_XOR(state[5], state[4]));
-        tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[3], state[2]));
+        tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XNOR(state[3], state[2]));
         tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
         AES_BLOCK_STORE(t, tmp);
         memcpy(mac, t, 16);
     } else if (maclen == 32) {
 #if AES_BLOCK_LENGTH > 16
-        tmp = AES_BLOCK_XOR(state[3], state[2]);
+        tmp = AES_BLOCK_XNOR(state[3], state[2]);
         tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
         AES_BLOCK_STORE(t, tmp);
-        tmp = AES_BLOCK_XOR(state[7], state[6]);
+        tmp = AES_BLOCK_XNOR(state[7], state[6]);
         tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[5], state[4]));
         AES_BLOCK_STORE(t + AES_BLOCK_LENGTH, tmp);
         for (i = 1; i < d; i++) {
@@ -258,11 +265,11 @@ aegis128x2_mac_nr(uint8_t *mac, size_t maclen, uint64_t adlen, aes_block_t *stat
             aegis128x2_update(state, tmp, tmp);
         }
 #endif
-        tmp = AES_BLOCK_XOR(state[3], state[2]);
+        tmp = AES_BLOCK_XNOR(state[3], state[2]);
         tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
         AES_BLOCK_STORE(t, tmp);
         memcpy(mac, t, 16);
-        tmp = AES_BLOCK_XOR(state[7], state[6]);
+        tmp = AES_BLOCK_XNOR(state[7], state[6]);
         tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[5], state[4]));
         AES_BLOCK_STORE(t, tmp);
         memcpy(mac + 16, t, 16);

diff --git a/src/aegis128x2/aegis128x2_neon_sha3.c b/src/aegis128x2/aegis128x2_neon_sha3.c
@@ -0,0 +1,181 @@
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#    include <stddef.h>
+#    include <stdint.h>
+
+#    include "../common/common.h"
+#    include "aegis128x2.h"
+#    include "aegis128x2_neon_sha3.h"
+
+#    ifndef __ARM_FEATURE_CRYPTO
+#        define __ARM_FEATURE_CRYPTO 1
+#    endif
+#    ifndef __ARM_FEATURE_AES
+#        define __ARM_FEATURE_AES 1
+#    endif
+#    ifndef __ARM_FEATURE_SHA3
+#        define __ARM_FEATURE_SHA3 1
+#    endif
+
+#    include <arm_neon.h>
+
+#    ifdef __clang__
+#        pragma clang attribute push(__attribute__((target("neon,crypto,aes,sha3"))), \
+                                     apply_to = function)
+#    elif defined(__GNUC__)
+#        if __GNUC__ < 14
+#            pragma GCC target("arch=armv8.2-a+simd+crypto+sha3")
+#        else
+#            pragma GCC target("+simd+crypto+sha3")
+#        endif
+#    endif
+
+#    define AES_BLOCK_LENGTH 32
+#    define AES_INVERT_STATE37 1
+
+typedef struct {
+    uint8x16_t b0;
+    uint8x16_t b1;
+} aes_block_t;
+
+static const uint8_t ones_arr[] = { 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU,
+                                    0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU,
+                                    0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU,
+                                    0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU };
+
+static inline uint8x16x2_t
+fresh_ones_pair(void)
+{
+    // The AESE instruction first operand register is both an input and output
+    // to the instruction. If the first operand is used in multiple places,
+    // this can therefore force compilers to emit MOV instructions to duplicate
+    // the value. In AES_ENC0 we use a zero register which would ordinarily
+    // have this problem since the compiler can merge the zero constants
+    // together and reuse them, however compilers typically treat this as a
+    // special case since materializing zero is a zero-cost instruction on many
+    // micro-architectures. For AES_ENC1 we cannot use this trick:
+    // materializing 0xFF is not usually zero-cost so compilers do not treat it
+    // specially, however since this region of the code is so heavy in vector
+    // arithmetic, inserting an additional load instruction here is
+    // effectively free.
+    uint8x16x2_t ret;
+    __asm volatile("ldp %q0, %q1, [%2]": "=w"(ret.val[0]), "=w"(ret.val[1]): "r"(ones_arr));
+    return ret;
+}
+
+static inline aes_block_t
+AES_BLOCK_NOT(const aes_block_t a)
+{
+    return (aes_block_t) { vmvnq_u8(a.b0), vmvnq_u8(a.b1) };
+}
+
+static inline aes_block_t
+AES_BLOCK_XOR(const aes_block_t a, const aes_block_t b)
+{
+    return (aes_block_t) { veorq_u8(a.b0, b.b0), veorq_u8(a.b1, b.b1) };
+}
+
+static inline aes_block_t
+AES_BLOCK_XNOR(const aes_block_t a, const aes_block_t b)
+{
+    const uint8x16_t ones = vmovq_n_u8(0xff);
+
+    return (aes_block_t) { veor3q_u8(a.b0, b.b0, ones), veor3q_u8(a.b1, b.b1, ones) };
+}
+
+static inline aes_block_t
+AES_BLOCK_XOR3(const aes_block_t a, const aes_block_t b, const aes_block_t c)
+{
+    return (aes_block_t) { veor3q_u8(a.b0, b.b0, c.b0), veor3q_u8(a.b1, b.b1, c.b1) };
+}
+
+static inline aes_block_t
+AES_BLOCK_AND(const aes_block_t a, const aes_block_t b)
+{
+    return (aes_block_t) { vandq_u8(a.b0, b.b0), vandq_u8(a.b1, b.b1) };
+}
+
+static inline aes_block_t
+AES_BLOCK_LOAD(const uint8_t *a)
+{
+    return (aes_block_t) { vld1q_u8(a), vld1q_u8(a + 16) };
+}
+
+static inline aes_block_t
+AES_BLOCK_LOAD_64x2(uint64_t a, uint64_t b)
+{
+    const uint8x16_t t = vreinterpretq_u8_u64(vsetq_lane_u64((a), vmovq_n_u64(b), 1));
+    return (aes_block_t) { t, t };
+}
+
+static inline void
+AES_BLOCK_STORE(uint8_t *a, const aes_block_t b)
+{
+    vst1q_u8(a, b.b0);
+    vst1q_u8(a + 16, b.b1);
+}
+
+static inline aes_block_t
+AES_ENC0(const aes_block_t a)
+{
+    return (aes_block_t) { vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b0)),
+                           vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b1)) };
+}
+
+static inline aes_block_t
+AES_ENC1(const aes_block_t a)
+{
+    uint8x16x2_t ones = fresh_ones_pair();
+    return (aes_block_t) { vaesmcq_u8(vaeseq_u8(ones.val[0], a.b0)),
+                           vaesmcq_u8(vaeseq_u8(ones.val[1], a.b1)) };
+}
+
+static inline aes_block_t
+AES_ENC(const aes_block_t a, const aes_block_t b)
+{
+    return AES_BLOCK_XOR(AES_ENC0(a), b);
+}
+
+static inline void
+aegis128x2_update(aes_block_t *const state, const aes_block_t d1, const aes_block_t d2)
+{
+    // Apply bitwise-NOT to state[3] and state[7] to allow us to use the Arm
+    // SHA3 BCAX instruction.
+    aes_block_t enc7 = AES_ENC1(state[7]);
+    aes_block_t enc3 = AES_ENC1(state[3]);
+
+    state[7] = AES_BLOCK_XOR(AES_ENC0(state[6]), state[7]);
+    state[6] = AES_BLOCK_XOR(AES_ENC0(state[5]), state[6]);
+    state[5] = AES_BLOCK_XOR(AES_ENC0(state[4]), state[5]);
+    state[4] = AES_BLOCK_XOR3(enc3, state[4], d2);
+    state[3] = AES_BLOCK_XOR(AES_ENC0(state[2]), state[3]);
+    state[2] = AES_BLOCK_XOR(AES_ENC0(state[1]), state[2]);
+    state[1] = AES_BLOCK_XOR(AES_ENC0(state[0]), state[1]);
+    state[0] = AES_BLOCK_XOR3(enc7, state[0], d1);
+}
+
+#    include "aegis128x2_common.h"
+
+struct aegis128x2_implementation aegis128x2_neon_sha3_implementation = {
+    .encrypt_detached        = encrypt_detached,
+    .decrypt_detached        = decrypt_detached,
+    .encrypt_unauthenticated = encrypt_unauthenticated,
+    .decrypt_unauthenticated = decrypt_unauthenticated,
+    .stream                  = stream,
+    .state_init              = state_init,
+    .state_encrypt_update    = state_encrypt_update,
+    .state_encrypt_final     = state_encrypt_final,
+    .state_decrypt_update    = state_decrypt_update,
+    .state_decrypt_final     = state_decrypt_final,
+    .state_mac_init          = state_mac_init,
+    .state_mac_update        = state_mac_update,
+    .state_mac_final         = state_mac_final,
+    .state_mac_reset         = state_mac_reset,
+    .state_mac_clone         = state_mac_clone,
+};
+
+#    ifdef __clang__
+#        pragma clang attribute pop
+#    endif
+
+#endif
diff --git a/src/aegis128x2/aegis128x2_neon_sha3.h b/src/aegis128x2/aegis128x2_neon_sha3.h
@@ -0,0 +1,9 @@
+#ifndef aegis128x2_neon_sha3_H
+#define aegis128x2_neon_sha3_H
+
+#include "../common/common.h"
+#include "implementations.h"
+
+extern struct aegis128x2_implementation aegis128x2_neon_sha3_implementation;
+
+#endif