Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ pub fn build(b: *std.Build) void {
"src/aegis128x2/aegis128x2_altivec.c",
"src/aegis128x2/aegis128x2_avx2.c",
"src/aegis128x2/aegis128x2_neon_aes.c",
"src/aegis128x2/aegis128x2_neon_sha3.c",
"src/aegis128x2/aegis128x2_soft.c",
"src/aegis128x2/aegis128x2.c",

Expand All @@ -51,19 +52,22 @@ pub fn build(b: *std.Build) void {
"src/aegis128x4/aegis128x4_avx2.c",
"src/aegis128x4/aegis128x4_avx512.c",
"src/aegis128x4/aegis128x4_neon_aes.c",
"src/aegis128x4/aegis128x4_neon_sha3.c",
"src/aegis128x4/aegis128x4_soft.c",
"src/aegis128x4/aegis128x4.c",

"src/aegis256/aegis256_aesni.c",
"src/aegis256/aegis256_altivec.c",
"src/aegis256/aegis256_neon_aes.c",
"src/aegis256/aegis256_neon_sha3.c",
"src/aegis256/aegis256_soft.c",
"src/aegis256/aegis256.c",

"src/aegis256x2/aegis256x2_aesni.c",
"src/aegis256x2/aegis256x2_altivec.c",
"src/aegis256x2/aegis256x2_avx2.c",
"src/aegis256x2/aegis256x2_neon_aes.c",
"src/aegis256x2/aegis256x2_neon_sha3.c",
"src/aegis256x2/aegis256x2_soft.c",
"src/aegis256x2/aegis256x2.c",

Expand All @@ -72,6 +76,7 @@ pub fn build(b: *std.Build) void {
"src/aegis256x4/aegis256x4_avx2.c",
"src/aegis256x4/aegis256x4_avx512.c",
"src/aegis256x4/aegis256x4_neon_aes.c",
"src/aegis256x4/aegis256x4_neon_sha3.c",
"src/aegis256x4/aegis256x4_soft.c",
"src/aegis256x4/aegis256x4.c",

Expand Down
5 changes: 5 additions & 0 deletions src/aegis128x2/aegis128x2.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "aegis128x2_altivec.h"
#include "aegis128x2_avx2.h"
#include "aegis128x2_neon_aes.h"
#include "aegis128x2_neon_sha3.h"

#ifndef HAS_HW_AES
# include "aegis128x2_soft.h"
Expand Down Expand Up @@ -217,6 +218,10 @@ aegis128x2_pick_best_implementation(void)
#endif

#if defined(__aarch64__) || defined(_M_ARM64)
if (aegis_runtime_has_neon_sha3()) {
implementation = &aegis128x2_neon_sha3_implementation;
return 0;
}
if (aegis_runtime_has_neon_aes()) {
implementation = &aegis128x2_neon_aes_implementation;
return 0;
Expand Down
45 changes: 26 additions & 19 deletions src/aegis128x2/aegis128x2_common.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
#define RATE 64
#define ALIGNMENT 64

// If not inverting state[3] and state[7], treat bitwise-NOT operations as
// no-ops.
#ifndef AES_INVERT_STATE37
# define AES_BLOCK_NOT(A) (A)
# define AES_BLOCK_XNOR(A, B) AES_BLOCK_XOR((A), (B))
#endif

typedef aes_block_t aegis_blocks[8];

static inline void
Expand Down Expand Up @@ -44,11 +51,11 @@ aegis128x2_init(const uint8_t *key, const uint8_t *nonce, aes_block_t *const sta
state[0] = AES_BLOCK_XOR(k, n);
state[1] = c1;
state[2] = c0;
state[3] = c1;
state[3] = AES_BLOCK_NOT(c1);
state[4] = AES_BLOCK_XOR(k, n);
state[5] = AES_BLOCK_XOR(k, c0);
state[6] = AES_BLOCK_XOR(k, c1);
state[7] = AES_BLOCK_XOR(k, c0);
state[7] = AES_BLOCK_XNOR(k, c0);
for (i = 0; i < 10; i++) {
state[3] = AES_BLOCK_XOR(state[3], context);
state[7] = AES_BLOCK_XOR(state[7], context);
Expand All @@ -73,21 +80,21 @@ aegis128x2_mac(uint8_t *mac, size_t maclen, uint64_t adlen, uint64_t mlen, aes_b

if (maclen == 16) {
tmp = AES_BLOCK_XOR(state[6], AES_BLOCK_XOR(state[5], state[4]));
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[3], state[2]));
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XNOR(state[3], state[2]));
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
AES_BLOCK_STORE(mac_multi_0, tmp);
for (i = 0; i < 16; i++) {
mac[i] = mac_multi_0[i] ^ mac_multi_0[1 * 16 + i];
}
} else if (maclen == 32) {
tmp = AES_BLOCK_XOR(state[3], state[2]);
tmp = AES_BLOCK_XNOR(state[3], state[2]);
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
AES_BLOCK_STORE(mac_multi_0, tmp);
for (i = 0; i < 16; i++) {
mac[i] = mac_multi_0[i] ^ mac_multi_0[1 * 16 + i];
}

tmp = AES_BLOCK_XOR(state[7], state[6]);
tmp = AES_BLOCK_XNOR(state[7], state[6]);
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[5], state[4]));
AES_BLOCK_STORE(mac_multi_1, tmp);
for (i = 0; i < 16; i++) {
Expand Down Expand Up @@ -124,9 +131,9 @@ aegis128x2_squeeze_keystream(uint8_t *const dst, aes_block_t *const state)
aes_block_t tmp0, tmp1;

tmp0 = AES_BLOCK_XOR(state[6], state[1]);
tmp0 = AES_BLOCK_XOR(tmp0, AES_BLOCK_AND(state[2], state[3]));
tmp0 = AES_BLOCK_XOR(tmp0, AES_BLOCK_AND(state[2], AES_BLOCK_NOT(state[3])));
tmp1 = AES_BLOCK_XOR(state[5], state[2]);
tmp1 = AES_BLOCK_XOR(tmp1, AES_BLOCK_AND(state[6], state[7]));
tmp1 = AES_BLOCK_XOR(tmp1, AES_BLOCK_AND(state[6], AES_BLOCK_NOT(state[7])));
AES_BLOCK_STORE(dst, tmp0);
AES_BLOCK_STORE(dst + AES_BLOCK_LENGTH, tmp1);
}
Expand All @@ -143,8 +150,8 @@ aegis128x2_enc(uint8_t *const dst, const uint8_t *const src, aes_block_t *const
tmp0 = AES_BLOCK_XOR(tmp0, state[1]);
tmp1 = AES_BLOCK_XOR(msg1, state[5]);
tmp1 = AES_BLOCK_XOR(tmp1, state[2]);
tmp0 = AES_BLOCK_XOR(tmp0, AES_BLOCK_AND(state[2], state[3]));
tmp1 = AES_BLOCK_XOR(tmp1, AES_BLOCK_AND(state[6], state[7]));
tmp0 = AES_BLOCK_XOR(tmp0, AES_BLOCK_AND(state[2], AES_BLOCK_NOT(state[3])));
tmp1 = AES_BLOCK_XOR(tmp1, AES_BLOCK_AND(state[6], AES_BLOCK_NOT(state[7])));
AES_BLOCK_STORE(dst, tmp0);
AES_BLOCK_STORE(dst + AES_BLOCK_LENGTH, tmp1);

Expand All @@ -162,8 +169,8 @@ aegis128x2_dec(uint8_t *const dst, const uint8_t *const src, aes_block_t *const
msg0 = AES_BLOCK_XOR(msg0, state[1]);
msg1 = AES_BLOCK_XOR(msg1, state[5]);
msg1 = AES_BLOCK_XOR(msg1, state[2]);
msg0 = AES_BLOCK_XOR(msg0, AES_BLOCK_AND(state[2], state[3]));
msg1 = AES_BLOCK_XOR(msg1, AES_BLOCK_AND(state[6], state[7]));
msg0 = AES_BLOCK_XOR(msg0, AES_BLOCK_AND(state[2], AES_BLOCK_NOT(state[3])));
msg1 = AES_BLOCK_XOR(msg1, AES_BLOCK_AND(state[6], AES_BLOCK_NOT(state[7])));
AES_BLOCK_STORE(dst, msg0);
AES_BLOCK_STORE(dst + AES_BLOCK_LENGTH, msg1);

Expand All @@ -186,8 +193,8 @@ aegis128x2_declast(uint8_t *const dst, const uint8_t *const src, size_t len,
msg0 = AES_BLOCK_XOR(msg0, state[1]);
msg1 = AES_BLOCK_XOR(msg1, state[5]);
msg1 = AES_BLOCK_XOR(msg1, state[2]);
msg0 = AES_BLOCK_XOR(msg0, AES_BLOCK_AND(state[2], state[3]));
msg1 = AES_BLOCK_XOR(msg1, AES_BLOCK_AND(state[6], state[7]));
msg0 = AES_BLOCK_XOR(msg0, AES_BLOCK_AND(state[2], AES_BLOCK_NOT(state[3])));
msg1 = AES_BLOCK_XOR(msg1, AES_BLOCK_AND(state[6], AES_BLOCK_NOT(state[7])));
AES_BLOCK_STORE(pad, msg0);
AES_BLOCK_STORE(pad + AES_BLOCK_LENGTH, msg1);

Expand Down Expand Up @@ -220,7 +227,7 @@ aegis128x2_mac_nr(uint8_t *mac, size_t maclen, uint64_t adlen, aes_block_t *stat
if (maclen == 16) {
#if AES_BLOCK_LENGTH > 16
tmp = AES_BLOCK_XOR(state[6], AES_BLOCK_XOR(state[5], state[4]));
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[3], state[2]));
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XNOR(state[3], state[2]));
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
AES_BLOCK_STORE(t, tmp);
for (i = 0; i < d / 2; i++) {
Expand All @@ -235,16 +242,16 @@ aegis128x2_mac_nr(uint8_t *mac, size_t maclen, uint64_t adlen, aes_block_t *stat
}
#endif
tmp = AES_BLOCK_XOR(state[6], AES_BLOCK_XOR(state[5], state[4]));
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[3], state[2]));
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XNOR(state[3], state[2]));
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
AES_BLOCK_STORE(t, tmp);
memcpy(mac, t, 16);
} else if (maclen == 32) {
#if AES_BLOCK_LENGTH > 16
tmp = AES_BLOCK_XOR(state[3], state[2]);
tmp = AES_BLOCK_XNOR(state[3], state[2]);
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
AES_BLOCK_STORE(t, tmp);
tmp = AES_BLOCK_XOR(state[7], state[6]);
tmp = AES_BLOCK_XNOR(state[7], state[6]);
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[5], state[4]));
AES_BLOCK_STORE(t + AES_BLOCK_LENGTH, tmp);
for (i = 1; i < d; i++) {
Expand All @@ -258,11 +265,11 @@ aegis128x2_mac_nr(uint8_t *mac, size_t maclen, uint64_t adlen, aes_block_t *stat
aegis128x2_update(state, tmp, tmp);
}
#endif
tmp = AES_BLOCK_XOR(state[3], state[2]);
tmp = AES_BLOCK_XNOR(state[3], state[2]);
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[1], state[0]));
AES_BLOCK_STORE(t, tmp);
memcpy(mac, t, 16);
tmp = AES_BLOCK_XOR(state[7], state[6]);
tmp = AES_BLOCK_XNOR(state[7], state[6]);
tmp = AES_BLOCK_XOR(tmp, AES_BLOCK_XOR(state[5], state[4]));
AES_BLOCK_STORE(t, tmp);
memcpy(mac + 16, t, 16);
Expand Down
181 changes: 181 additions & 0 deletions src/aegis128x2/aegis128x2_neon_sha3.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#if defined(__aarch64__) || defined(_M_ARM64)

# include <stddef.h>
# include <stdint.h>

# include "../common/common.h"
# include "aegis128x2.h"
# include "aegis128x2_neon_sha3.h"

# ifndef __ARM_FEATURE_CRYPTO
# define __ARM_FEATURE_CRYPTO 1
# endif
# ifndef __ARM_FEATURE_AES
# define __ARM_FEATURE_AES 1
# endif
# ifndef __ARM_FEATURE_SHA3
# define __ARM_FEATURE_SHA3 1
# endif

# include <arm_neon.h>

# ifdef __clang__
# pragma clang attribute push(__attribute__((target("neon,crypto,aes,sha3"))), \
apply_to = function)
# elif defined(__GNUC__)
# if __GNUC__ < 14
# pragma GCC target("arch=armv8.2-a+simd+crypto+sha3")
# else
# pragma GCC target("+simd+crypto+sha3")
# endif
# endif

# define AES_BLOCK_LENGTH 32
# define AES_INVERT_STATE37 1

typedef struct {
uint8x16_t b0;
uint8x16_t b1;
} aes_block_t;

static const uint8_t ones_arr[] = { 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU,
0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU,
0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU,
0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU, 0xffU };

static inline uint8x16x2_t
fresh_ones_pair(void)
{
// The AESE instruction first operand register is both an input and output
// to the instruction. If the first operand is used in multiple places,
// this can therefore force compilers to emit MOV instructions to duplicate
// the value. In AES_ENC0 we use a zero register which would ordinarily
// have this problem since the compiler can merge the zero constants
// together and reuse them, however compilers typically treat this as a
// special case since materializing zero is a zero-cost instruction on many
// micro-architectures. For AES_ENC1 we cannot use this trick:
// materializing 0xFF is not usually zero-cost so compilers do not treat it
// specially, however since this region of the code is so heavy in vector
// arithmetic, inserting an additional load instruction here is
// effectively free.
uint8x16x2_t ret;
__asm volatile("ldp %q0, %q1, [%2]": "=w"(ret.val[0]), "=w"(ret.val[1]): "r"(ones_arr));
return ret;
}

static inline aes_block_t
AES_BLOCK_NOT(const aes_block_t a)
{
return (aes_block_t) { vmvnq_u8(a.b0), vmvnq_u8(a.b1) };
}

static inline aes_block_t
AES_BLOCK_XOR(const aes_block_t a, const aes_block_t b)
{
return (aes_block_t) { veorq_u8(a.b0, b.b0), veorq_u8(a.b1, b.b1) };
}

static inline aes_block_t
AES_BLOCK_XNOR(const aes_block_t a, const aes_block_t b)
{
const uint8x16_t ones = vmovq_n_u8(0xff);

return (aes_block_t) { veor3q_u8(a.b0, b.b0, ones), veor3q_u8(a.b1, b.b1, ones) };
}

static inline aes_block_t
AES_BLOCK_XOR3(const aes_block_t a, const aes_block_t b, const aes_block_t c)
{
return (aes_block_t) { veor3q_u8(a.b0, b.b0, c.b0), veor3q_u8(a.b1, b.b1, c.b1) };
}

static inline aes_block_t
AES_BLOCK_AND(const aes_block_t a, const aes_block_t b)
{
return (aes_block_t) { vandq_u8(a.b0, b.b0), vandq_u8(a.b1, b.b1) };
}

static inline aes_block_t
AES_BLOCK_LOAD(const uint8_t *a)
{
return (aes_block_t) { vld1q_u8(a), vld1q_u8(a + 16) };
}

static inline aes_block_t
AES_BLOCK_LOAD_64x2(uint64_t a, uint64_t b)
{
const uint8x16_t t = vreinterpretq_u8_u64(vsetq_lane_u64((a), vmovq_n_u64(b), 1));
return (aes_block_t) { t, t };
}

static inline void
AES_BLOCK_STORE(uint8_t *a, const aes_block_t b)
{
vst1q_u8(a, b.b0);
vst1q_u8(a + 16, b.b1);
}

static inline aes_block_t
AES_ENC0(const aes_block_t a)
{
return (aes_block_t) { vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b0)),
vaesmcq_u8(vaeseq_u8(vmovq_n_u8(0), a.b1)) };
}

static inline aes_block_t
AES_ENC1(const aes_block_t a)
{
uint8x16x2_t ones = fresh_ones_pair();
return (aes_block_t) { vaesmcq_u8(vaeseq_u8(ones.val[0], a.b0)),
vaesmcq_u8(vaeseq_u8(ones.val[1], a.b1)) };
}

static inline aes_block_t
AES_ENC(const aes_block_t a, const aes_block_t b)
{
return AES_BLOCK_XOR(AES_ENC0(a), b);
}

static inline void
aegis128x2_update(aes_block_t *const state, const aes_block_t d1, const aes_block_t d2)
{
// Apply bitwise-NOT to state[3] and state[7] to allow us to use the Arm
// SHA3 BCAX instruction.
aes_block_t enc7 = AES_ENC1(state[7]);
aes_block_t enc3 = AES_ENC1(state[3]);

state[7] = AES_BLOCK_XOR(AES_ENC0(state[6]), state[7]);
state[6] = AES_BLOCK_XOR(AES_ENC0(state[5]), state[6]);
state[5] = AES_BLOCK_XOR(AES_ENC0(state[4]), state[5]);
state[4] = AES_BLOCK_XOR3(enc3, state[4], d2);
state[3] = AES_BLOCK_XOR(AES_ENC0(state[2]), state[3]);
state[2] = AES_BLOCK_XOR(AES_ENC0(state[1]), state[2]);
state[1] = AES_BLOCK_XOR(AES_ENC0(state[0]), state[1]);
state[0] = AES_BLOCK_XOR3(enc7, state[0], d1);
}

# include "aegis128x2_common.h"

struct aegis128x2_implementation aegis128x2_neon_sha3_implementation = {
.encrypt_detached = encrypt_detached,
.decrypt_detached = decrypt_detached,
.encrypt_unauthenticated = encrypt_unauthenticated,
.decrypt_unauthenticated = decrypt_unauthenticated,
.stream = stream,
.state_init = state_init,
.state_encrypt_update = state_encrypt_update,
.state_encrypt_final = state_encrypt_final,
.state_decrypt_update = state_decrypt_update,
.state_decrypt_final = state_decrypt_final,
.state_mac_init = state_mac_init,
.state_mac_update = state_mac_update,
.state_mac_final = state_mac_final,
.state_mac_reset = state_mac_reset,
.state_mac_clone = state_mac_clone,
};

# ifdef __clang__
# pragma clang attribute pop
# endif

#endif
9 changes: 9 additions & 0 deletions src/aegis128x2/aegis128x2_neon_sha3.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#ifndef aegis128x2_neon_sha3_H
#define aegis128x2_neon_sha3_H

#include "../common/common.h"
#include "implementations.h"

extern struct aegis128x2_implementation aegis128x2_neon_sha3_implementation;

#endif
Loading
Loading