From 0a81791f7a6a771c5602aecf5898280543b87590 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 8 Sep 2025 11:53:55 -0400 Subject: [PATCH 01/22] Added optimized ppc64le support functions for ML-KEM. The supported native functions include: 1. MLK_USE_NATIVE_NTT (ntt_ppc.S) 2. MLK_USE_NATIVE_INTT (intt_ppc.S) 3. MLK_USE_NATIVE_POLY_REDUCE (reduce.S) 4. MLK_USE_NATIVE_POLY_TOMONT (poly_tomont.S) And other interface functions and headers. Signed-off-by: Danny Tsen --- dev/ppc64le/README.md | 6 + dev/ppc64le/meta.h | 49 ++ dev/ppc64le/src/arith_native_ppc64le.h | 23 + dev/ppc64le/src/intt_ppc.S | 773 ++++++++++++++++++ dev/ppc64le/src/ntt_ppc.S | 498 +++++++++++ dev/ppc64le/src/poly_tomont.S | 163 ++++ dev/ppc64le/src/reduce.S | 225 +++++ integration/liboqs/ML-KEM-1024_META.yml | 141 ++-- integration/liboqs/ML-KEM-512_META.yml | 141 ++-- integration/liboqs/ML-KEM-768_META.yml | 141 ++-- integration/liboqs/config_ppc64le.h | 266 ++++++ mlkem/src/native/meta.h | 4 + mlkem/src/native/ppc64le/README.md | 6 + mlkem/src/native/ppc64le/meta.h | 49 ++ .../native/ppc64le/src/arith_native_ppc64le.h | 23 + mlkem/src/native/ppc64le/src/intt_ppc.S | 773 ++++++++++++++++++ mlkem/src/native/ppc64le/src/ntt_ppc.S | 498 +++++++++++ mlkem/src/native/ppc64le/src/poly_tomont.S | 163 ++++ mlkem/src/native/ppc64le/src/reduce.S | 225 +++++ test/mk/auto.mk | 132 +-- test/mk/components.mk | 10 +- 21 files changed, 3970 insertions(+), 339 deletions(-) create mode 100644 dev/ppc64le/README.md create mode 100644 dev/ppc64le/meta.h create mode 100644 dev/ppc64le/src/arith_native_ppc64le.h create mode 100644 dev/ppc64le/src/intt_ppc.S create mode 100644 dev/ppc64le/src/ntt_ppc.S create mode 100644 dev/ppc64le/src/poly_tomont.S create mode 100644 dev/ppc64le/src/reduce.S create mode 100644 integration/liboqs/config_ppc64le.h create mode 100644 mlkem/src/native/ppc64le/README.md create mode 100644 mlkem/src/native/ppc64le/meta.h create mode 100644 mlkem/src/native/ppc64le/src/arith_native_ppc64le.h 
create mode 100644 mlkem/src/native/ppc64le/src/intt_ppc.S create mode 100644 mlkem/src/native/ppc64le/src/ntt_ppc.S create mode 100644 mlkem/src/native/ppc64le/src/poly_tomont.S create mode 100644 mlkem/src/native/ppc64le/src/reduce.S diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md new file mode 100644 index 0000000000..5125a40eae --- /dev/null +++ b/dev/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h new file mode 100644 index 0000000000..bee788976b --- /dev/null +++ b/dev/ppc64le/meta.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. 
*/ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { + mlk_ntt_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { + mlk_intt_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { + mlk_reduce_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { + mlk_poly_tomont_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* MLK_NATIVE_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..57f0b8f8ce --- /dev/null +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *); + +#endif /* MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S new file mode 100644 
index 0000000000..feb78b984e --- /dev/null +++ b/dev/ppc64le/src/intt_ppc.S @@ -0,0 +1,773 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" + +.machine "any" +.text + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +# Montgomery reduce constatnts +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro Load_4Coeffs start next step + mr 9, \start # j + add 10, 4, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + xxpermdi 32+21, 32+21, 32+21, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, 
V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +#----------------------------------- +# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 
3, 18 + stxvx \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) + stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) +.endm + +.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 + xxmrglw 32+12, \_vs0, 10 + xxmrghw 32+11, \_vs0, 10 + xxpermdi 10, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs1, 11 + xxmrghw 32+15, \_vs1, 11 + xxpermdi 11, 32+16, 32+15, 3 + xxmrglw 32+12, \_vs2, 12 + xxmrghw 32+11, \_vs2, 12 + xxpermdi 12, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs3, 13 + xxmrghw 32+15, \_vs3, 13 + xxpermdi 13, 32+16, 32+15, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 + xxpermdi 10, 10, \_vs0, 3 + xxpermdi 11, 11, \_vs1, 3 + xxpermdi 12, 12, \_vs2, 3 + xxpermdi 13, 13, \_vs3, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +# intt +# t = r[j]; +# r[j] = barrett_reduce(t + r[j + len]); +# r[j + len] = r[j + len] - t; +# r[j + len] = fqmul(zeta, r[j + len]); + +# +# mlk_intt_ppc(r) +# +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # init vectors and constants + # Setup for Montgomery reduce + addis 8,2,.nmkq@toc@ha + addi 
8,8,.nmkq@toc@l + lxv 0, 0(8) + + lxv 32+V_QINV, 16(8) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 + xxlor 3, 32+3, 32+3 + xxlor 4, 32+4, 32+4 + + # Setup for Barrett reduce + addis 8,2,.mkq@toc@ha + addi 8,8,.mkq@toc@l + addis 9,2,.C20159@toc@ha + addi 9,9,.C20159@toc@l + addis 10,2,.C25@toc@ha + addi 10,10,.C25@toc@l + + lxv 6, 0(8) # V_MKQ + lxv 32+0, 0(9) # V20159 + lxv 7, 0(10) # V_25 + + #xxspltiw 8, 26 # for power9 and above + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 + + # zetas array + #addis 14,2,.izeta63@toc@ha + #addi 14,14,.izeta63@toc@l + +.align 4 +__Len2: + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addis 14,2,.izeta127@toc@ha + addi 14,14,.izeta127@toc@l + li 4, 4 + li 15, 4 + mtctr 15 + li 5, 0 +__Loop2: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz __Loop2 + +.align 4 +__Len4: + # + # 2. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 + addis 14,2,.izeta63@toc@ha + addi 14,14,.izeta63@toc@l + li 5, 0 + li 4, 8 + li 15, 4 # loops + mtctr 15 +__Loop4: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz __Loop4 + +.align 4 +__Len8: + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + #addi 14, 14, 512 + li 4, 16 + li 5, 0 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len16: + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + #addi 14, 14, 768 + li 5, 0 + li 4, 32 + + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 16 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 256 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 272 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len32: + # + # 5. 
len = 32, start = 0, 64, 128, 192 + #addi 14, 14, 896 + li 5, 0 + li 4, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len64: + # + # 6. 
len = 64, start = 0, 128 + #addi 14, 14, 960 + li 5, 0 + li 4, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len128: + # 7. 
len = 128, start = 0 + # + #addi 14, 14, 992 + li 5, 0 # start + li 4, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # + # Montgomery reduce loops with constant 1441 + # + addis 10,2,.C1441@toc@ha + addi 10,10,.C1441@toc@l + lvx V1441, 0, 10 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 
32+18, 32+23, 32+28 + +__intt_out: + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +# MLKEM_Q +.mkq: +.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 + +.C20159: +.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 + +# 0x2000000 +.C25: +.long 33554432, 33554432, 33554432, 33554432 + +.C1441: +.short 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441 + +.align 4 +.izeta127: +.short 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522 +.short -1460, -1460, -1460, -1460, 958, 958, 958, 958 +.short 991, 991, 991, 991, 996, 996, 996, 996 +.short -308, -308, -308, -308, -108, -108, -108, -108 +.short 478, 478, 478, 478, -870, -870, -870, -870 +.short -854, -854, -854, -854, -1510, -1510, -1510, -1510 +.short 794, 794, 794, 794, -1278, -1278, -1278, -1278 +.short -1530, -1530, -1530, -1530, -1185, -1185, -1185, -1185 +.short -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187 +.short 220, 220, 220, 220, -874, -874, -874, -874 +.short -1335, -1335, -1335, -1335, 1218, 1218, 1218, 1218 +.short -136, -136, -136, -136, -1215, -1215, -1215, -1215 +.short 384, 384, 384, 384, -1465, -1465, -1465, -1465 +.short -1285, -1285, -1285, -1285, 1322, 1322, 1322, 1322 +.short 610, 610, 610, 610, 603, 603, 603, 603 +.short 1097, 1097, 1097, 1097, 817, 817, 817, 817 +.short -75, -75, -75, -75, -156, -156, -156, -156 +.short 329, 329, 329, 329, 418, 418, 418, 418 +.short 349, 349, 349, 349, -872, -872, -872, -872 +.short 644, 644, 644, 644, -1590, 
-1590, -1590, -1590 +.short 1119, 1119, 1119, 1119, -602, -602, -602, -602 +.short 1483, 1483, 1483, 1483, -777, -777, -777, -777 +.short -147, -147, -147, -147, 1159, 1159, 1159, 1159 +.short 778, 778, 778, 778, -246, -246, -246, -246 +.short 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574 +.short -460, -460, -460, -460, -291, -291, -291, -291 +.short -235, -235, -235, -235, 177, 177, 177, 177 +.short 587, 587, 587, 587, 422, 422, 422, 422 +.short 105, 105, 105, 105, 1550, 1550, 1550, 1550 +.short 871, 871, 871, 871, -1251, -1251, -1251, -1251 +.short 843, 843, 843, 843, 555, 555, 555, 555 +.short 430, 430, 430, 430, -1103, -1103, -1103, -1103 +.izeta63: +.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 +.short 677, 677, 677, 677, 677, 677, 677, 677 +.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 +.short 448, 448, 448, 448, 448, 448, 448, 448 +.short -725, -725, -725, -725, -725, -725, -725, -725 +.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 +.short 961, 961, 961, 961, 961, 961, 961, 961 +.short -398, -398, -398, -398, -398, -398, -398, -398 +.short -951, -951, -951, -951, -951, -951, -951, -951 +.short -247, -247, -247, -247, -247, -247, -247, -247 +.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 +.short 107, 107, 107, 107, 107, 107, 107, 107 +.short 830, 830, 830, 830, 830, 830, 830, 830 +.short -271, -271, -271, -271, -271, -271, -271, -271 +.short -90, -90, -90, -90, -90, -90, -90, -90 +.short -853, -853, -853, -853, -853, -853, -853, -853 +.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 +.short 126, 126, 126, 126, 126, 126, 126, 126 +.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 +.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 +.short -666, -666, -666, -666, -666, -666, -666, -666 +.short -320, -320, -320, -320, -320, -320, -320, -320 +.short -8, -8, -8, -8, -8, -8, -8, -8 +.short 516, 516, 516, 516, 516, 516, 516, 516 +.short -1544, -1544, -1544, -1544, -1544, 
-1544, -1544, -1544 +.short -282, -282, -282, -282, -282, -282, -282, -282 +.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 +.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 +.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 +.short -552, -552, -552, -552, -552, -552, -552, -552 +.short 652, 652, 652, 652, 652, 652, 652, 652 +.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 +.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 +.short -205, -205, -205, -205, -205, -205, -205, -205 +.short 411, 411, 411, 411, 411, 411, 411, 411 +.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 +.short 608, 608, 608, 608, 608, 608, 608, 608 +.short 732, 732, 732, 732, 732, 732, 732, 732 +.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 +.short -681, -681, -681, -681, -681, -681, -681, -681 +.short -130, -130, -130, -130, -130, -130, -130, -130 +.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 +.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 +.short -829, -829, -829, -829, -829, -829, -829, -829 +.short 383, 383, 383, 383, 383, 383, 383, 383 +.short 264, 264, 264, 264, 264, 264, 264, 264 +.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 +.short 573, 573, 573, 573, 573, 573, 573, 573 +.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 +.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 +.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 +.short 962, 962, 962, 962, 962, 962, 962, 962 +.short 182, 182, 182, 182, 182, 182, 182, 182 +.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 +.short 622, 622, 622, 622, 622, 622, 622, 622 +.short -171, -171, -171, -171, -171, -171, -171, -171 +.short 202, 202, 202, 202, 202, 202, 202, 202 +.short 287, 287, 287, 287, 287, 287, 287, 287 +.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 +.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 +.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 +.short -359, -359, -359, -359, 
-359, -359, -359, -359 +.short -758, -758, -758, -758, -758, -758, -758, -758 diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S new file mode 100644 index 0000000000..172fef9cc8 --- /dev/null +++ b/dev/ppc64le/src/ntt_ppc.S @@ -0,0 +1,498 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + mr 9, \start + add 10, 4, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 + + # fqmul = zeta * coefficient + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 1 + vsrah 28, 30, 4 # >> 1 + + lxvd2x 
32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 + + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t +.endm + +.macro Write_One + stxvx 32+15, 3, 9 + stxvx 32+16, 3, 10 + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 +.endm + +.macro Write_Two + xxpermdi 32+17, 32+16, 32+15, 3 + xxpermdi 32+22, 32+21, 32+20, 3 + xxpermdi 32+27, 32+26, 32+25, 3 + xxpermdi 32+29, 32+31, 32+30, 3 + + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Write_Three + xxmrglw 32+14, 32+16, 32+15 + xxmrghw 32+13, 32+16, 32+15 + xxpermdi 32+17, 32+13, 32+14, 3 + xxmrglw 32+19, 32+21, 32+20 + xxmrghw 32+18, 32+21, 32+20 + xxpermdi 32+22, 32+18, 32+19, 3 + xxmrglw 32+14, 32+26, 32+25 + xxmrghw 32+13, 32+26, 32+25 + xxpermdi 32+27, 32+13, 32+14, 3 + xxmrglw 32+24, 32+31, 32+30 + xxmrghw 32+23, 32+31, 32+30 + xxpermdi 32+29, 32+23, 32+24, 3 + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +# +# mlk_ntt_ppc(int16_t *r) +# +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 
32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # get MLKEM_Q + addis 8,2,.nmkq@toc@ha + addi 8,8,.nmkq@toc@l + lvx V_NMKQ,0,8 + + # zetas array + addis 14,2,.K1@toc@ha + addi 14,14,.K1@toc@l + + vxor 3, 3, 3 + vspltish 4, 1 + lxv 32+V_QINV, 16(8) + +.align 4 +__Len128: + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 4, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len64: + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 4, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len32: + # + # 3. len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 4, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 64 + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 128 + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 192 + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len16: + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 4, 32 + Load_next_4zetas + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 +__Len8: + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 4, 16 + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + li 5, 0 + li 4, 8 +.align 4 +__Len4: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + bdnz __Len4 + + # + # 7. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252
+	# k += 64
+	# Update zetas vectors, each vector has 2 zetas
+	addis 14,2,.K64@toc@ha
+	addi 14,14,.K64@toc@l
+
+	li 15, 4
+	mtctr 15
+	li 5, 0
+	li 4, 4
+.align 4
+__Len2:
+	Load_next_4zetas
+	MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3
+	Write_Three
+	addi 5, 5, 64
+
+	Load_next_4zetas
+	MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3
+	Write_Three
+	addi 5, 5, 64
+
+	bdnz __Len2
+
+__ntt_out:
+	lxv 32+20, 128(1)
+	lxv 32+21, 144(1)
+	lxv 32+22, 160(1)
+	lxv 32+23, 176(1)
+	lxv 32+24, 192(1)
+	lxv 32+25, 208(1)
+	lxv 32+26, 224(1)
+	lxv 32+27, 240(1)
+	lxv 32+28, 256(1)
+	lxv 32+29, 272(1)
+	lxv 32+30, 288(1)
+	lxv 32+31, 304(1)
+	ld 14, 56(1)
+	ld 15, 64(1)
+	ld 16, 72(1)
+
+	ld 17, 80(1)
+	ld 18, 88(1)
+	ld 19, 96(1)
+	ld 20, 104(1)
+	ld 21, 112(1)
+
+	mtlr 0
+	addi 1, 1, 352
+	blr
+
+.data
+.align 4
+# -MLKEM_Q
+.nmkq:
+.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329
+# QINV
+.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327
+
+# zetas
+.K1:
+.short -758, -758, -758, -758, -758, -758, -758, -758
+.short -359, -359, -359, -359, -359, -359, -359, -359
+.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517
+.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493
+.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422
+.short 287, 287, 287, 287, 287, 287, 287, 287
+.short 202, 202, 202, 202, 202, 202, 202, 202
+.short -171, -171, -171, -171, -171, -171, -171, -171
+.short 622, 622, 622, 622, 622, 622, 622, 622
+.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577
+.short 182, 182, 182, 182, 182, 182, 182, 182
+.short 962, 962, 962, 962, 962, 962, 962, 962
+.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202
+.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474
+.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468
+.short 573, 573, 573, 573, 573, 573, 573, 573
+.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325
+.short 264, 264, 264, 264, 264,
264, 264, 264 +.short 383, 383, 383, 383, 383, 383, 383, 383 +.short -829, -829, -829, -829, -829, -829, -829, -829 +.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 +.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 +.short -130, -130, -130, -130, -130, -130, -130, -130 +.short -681, -681, -681, -681, -681, -681, -681, -681 +.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 +.short 732, 732, 732, 732, 732, 732, 732, 732 +.short 608, 608, 608, 608, 608, 608, 608, 608 +.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 +.short 411, 411, 411, 411, 411, 411, 411, 411 +.short -205, -205, -205, -205, -205, -205, -205, -205 +.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 +.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 +.short 652, 652, 652, 652, 652, 652, 652, 652 +.short -552, -552, -552, -552, -552, -552, -552, -552 +.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 +.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 +.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 +.short -282, -282, -282, -282, -282, -282, -282, -282 +.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 +.short 516, 516, 516, 516, 516, 516, 516, 516 +.short -8, -8, -8, -8, -8, -8, -8, -8 +.short -320, -320, -320, -320, -320, -320, -320, -320 +.short -666, -666, -666, -666, -666, -666, -666, -666 +.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 +.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 +.short 126, 126, 126, 126, 126, 126, 126, 126 +.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 +.short -853, -853, -853, -853, -853, -853, -853, -853 +.short -90, -90, -90, -90, -90, -90, -90, -90 +.short -271, -271, -271, -271, -271, -271, -271, -271 +.short 830, 830, 830, 830, 830, 830, 830, 830 +.short 107, 107, 107, 107, 107, 107, 107, 107 +.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 +.short -247, -247, -247, -247, -247, -247, -247, -247 +.short -951, -951, -951, -951, -951, 
-951, -951, -951 +.short -398, -398, -398, -398, -398, -398, -398, -398 +.short 961, 961, 961, 961, 961, 961, 961, 961 +.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 +.short -725, -725, -725, -725, -725, -725, -725, -725 +.short 448, 448, 448, 448, 448, 448, 448, 448 +.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 +.short 677, 677, 677, 677, 677, 677, 677, 677 +.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 +.K64: +.short -1103, -1103, -1103, -1103, 430, 430, 430, 430 +.short 555, 555, 555, 555, 843, 843, 843, 843 +.short -1251, -1251, -1251, -1251, 871, 871, 871, 871 +.short 1550, 1550, 1550, 1550, 105, 105, 105, 105 +.short 422, 422, 422, 422, 587, 587, 587, 587 +.short 177, 177, 177, 177, -235, -235, -235, -235 +.short -291, -291, -291, -291, -460, -460, -460, -460 +.short 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653 +.short -246, -246, -246, -246, 778, 778, 778, 778 +.short 1159, 1159, 1159, 1159, -147, -147, -147, -147 +.short -777, -777, -777, -777, 1483, 1483, 1483, 1483 +.short -602, -602, -602, -602, 1119, 1119, 1119, 1119 +.short -1590, -1590, -1590, -1590, 644, 644, 644, 644 +.short -872, -872, -872, -872, 349, 349, 349, 349 +.short 418, 418, 418, 418, 329, 329, 329, 329 +.short -156, -156, -156, -156, -75, -75, -75, -75 +.short 817, 817, 817, 817, 1097, 1097, 1097, 1097 +.short 603, 603, 603, 603, 610, 610, 610, 610 +.short 1322, 1322, 1322, 1322, -1285, -1285, -1285, -1285 +.short -1465, -1465, -1465, -1465, 384, 384, 384, 384 +.short -1215, -1215, -1215, -1215, -136, -136, -136, -136 +.short 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335 +.short -874, -874, -874, -874, 220, 220, 220, 220 +.short -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659 +.short -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530 +.short -1278, -1278, -1278, -1278, 794, 794, 794, 794 +.short -1510, -1510, -1510, -1510, -854, -854, -854, -854 +.short -870, -870, -870, -870, 478, 478, 478, 478 +.short -108, -108, -108, 
-108, -308, -308, -308, -308 +.short 996, 996, 996, 996, 991, 991, 991, 991 +.short 958, 958, 958, 958, -1460, -1460, -1460, -1460 +.short 1522, 1522, 1522, 1522, 1628, 1628, 1628, 1628 diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S new file mode 100644 index 0000000000..c07f25c5a8 --- /dev/null +++ b/dev/ppc64le/src/poly_tomont.S @@ -0,0 +1,163 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +# Poly_tomont: Inplace conversion of all coefficients of a polynomial +# from normal domain to Montgomery domain +# +# Arguments:*r: pointer to input/output polynomial +# + +#include "../../../common.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_v0, _v1, _v2, _v3) +# +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 
+ stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + + addis 9,2,.nmkq@toc@ha + addi 9,9,.nmkq@toc@l + addis 10,2,.C1353@toc@ha + addi 10,10,.C1353@toc@l + + lxv 32+V_NMKQ,0(9) + lxv 32+V_QINV,16(9) + lxv 32+V1353,0(10) + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +.C1353: +.short 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353 + diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S new file mode 100644 index 0000000000..ee8e1fdca1 --- /dev/null +++ b/dev/ppc64le/src/reduce.S @@ -0,0 +1,225 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+#
+#===================================================================================
+# Written by Danny Tsen
+#
+
+#
+# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial;
+# see the C reference implementation for details of the Barrett reduction
+#
+# Arguments: *r: pointer to input/output polynomial
+#
+
+#include "../../../common.h"
+
+# Barrett reduce constants
+#define V20159 0
+#define V_25 1
+#define V_26 2
+#define V_MKQ 3
+
+.machine "any"
+.text
+
+.macro BREDUCE_4X _v0 _v1 _v2 _v3
+	lxvd2x 32+8, 0, 3
+	lxvd2x 32+12, 14, 3
+	lxvd2x 32+16, 15, 3
+	lxvd2x 32+20, 16, 3
+	addi 3, 3, 64
+	vmulosh 6, 8, V20159
+	vmulesh 5, 8, V20159
+	vmulosh 11, 12, V20159
+	vmulesh 10, 12, V20159
+	vmulosh 15, 16, V20159
+	vmulesh 14, 16, V20159
+	vmulosh 19, 20, V20159
+	vmulesh 18, 20, V20159
+	xxmrglw 32+4, 32+5, 32+6
+	xxmrghw 32+5, 32+5, 32+6
+	xxmrglw 32+9, 32+10, 32+11
+	xxmrghw 32+10, 32+10, 32+11
+	xxmrglw 32+13, 32+14, 32+15
+	xxmrghw 32+14, 32+14, 32+15
+	xxmrglw 32+17, 32+18, 32+19
+	xxmrghw 32+18, 32+18, 32+19
+	vadduwm 4, 4, V_25
+	vadduwm 5, 5, V_25
+	vadduwm 9, 9, V_25
+	vadduwm 10, 10, V_25
+	vadduwm 13, 13, V_25
+	vadduwm 14, 14, V_25
+	vadduwm 17, 17, V_25
+	vadduwm 18, 18, V_25
+	vsraw 4, 4, V_26
+	vsraw 5, 5, V_26
+	vsraw 9, 9, V_26
+	vsraw 10, 10, V_26
+	vsraw 13, 13, V_26
+	vsraw 14, 14, V_26
+	vsraw 17, 17, V_26
+	vsraw 18, 18, V_26
+	vpkuwum 4, 5, 4
+	vsubuhm 4, 7, 4
+	vpkuwum 9, 10, 9
+	vsubuhm 9, 7, 9
+	vpkuwum 13, 14, 13
+	vsubuhm 13, 7, 13
+	vpkuwum 17, 18, 17
+	vsubuhm 17, 7, 17
+	vmladduhm \_v0, 4, V_MKQ, 8
+	vmladduhm \_v1, 9, V_MKQ, 12
+	vmladduhm \_v2, 13, V_MKQ, 16
+	vmladduhm \_v3, 17, V_MKQ, 20
+.endm
+
+.macro Write_8X
+	stxvd2x 32+21, 4, 3
+	stxvd2x 32+22, 5, 3
+	stxvd2x 32+23, 6, 3
+	stxvd2x 32+24, 7, 3
+	stxvd2x 32+4, 8, 3
+	stxvd2x 32+9, 9, 3
+	stxvd2x 32+13, 10, 3
+	stxvd2x 32+17, 11, 3
+.endm
+
+#
+# Conditional addition to get unsigned canonical representative
+#
+.macro To_unsigned_16
+	lxv 32+12, 0(3)
+	lxv 32+13, 16(3)
+	lxv 32+14,
32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + addis 8,2,.mkq@toc@ha + addi 8,8,.mkq@toc@l + addis 9,2,.C20159@toc@ha + addi 9,9,.C20159@toc@l + addis 10,2,.C25@toc@ha + addi 10,10,.C25@toc@l + + vxor 7, 7, 7 + + lxv 32+V_MKQ, 0(8) + lxv 32+V20159, 0(9) + lxv 32+V_25, 0(10) + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # +.align 4 + addi 3, 3, -512 + xxspltib 32+9 ,0 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 + blr + +.align 4 +.data +# MLKEM_Q +.mkq: +.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 + +.C20159: +.short 20159, 20159, 20159, 
20159, 20159, 20159, 20159, 20159 + +# 0x2000000 +.C25: +.long 33554432, 33554432, 33554432, 33554432 diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 7d8e50d4c6..766c936e28 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -9,83 +9,74 @@ length-ciphertext: 1568 length-secret-key: 3168 length-shared-secret: 32 length-keypair-seed: 64 -length-encaps-seed: 32 nistkat-sha256: f580d851e5fb27e6876e5e203fa18be4cdbfd49e05d48fec3d3992c8f43a13e6 testvectors-sha256: ff1a854b9b6761a70c65ccae85246fe0596a949e72eae0866a8a2a2d4ea54b10 principal-submitters: -- Peter Schwabe + - Peter Schwabe auxiliary-submitters: -- Roberto Avanzi -- Joppe Bos -- Léo Ducas -- Eike Kiltz -- Tancrède Lepoint -- Vadim Lyubashevsky -- John M. Schanck -- Gregor Seiler -- Damien Stehlé + - Roberto Avanzi + - Joppe Bos + - Léo Ducas + - Eike Kiltz + - Tancrède Lepoint + - Vadim Lyubashevsky + - John M. Schanck + - Gregor Seiler + - Damien Stehlé implementations: -- name: ref - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc -- name: x86_64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 - supported_platforms: - - architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt -- name: aarch64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd + - name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + - name: x86_64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - 
architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt + - name: aarch64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd + - name: ppc64le + version: FIPS203 + folder_name: . 
+    compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h"
+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair
+    signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand
+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc
+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec
+    sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
+    supported_platforms:
+      - architecture: ppc64le
+        operating_systems:
+          - Linux
diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml
index aa88537d3f..9d2c7633af 100644
--- a/integration/liboqs/ML-KEM-512_META.yml
+++ b/integration/liboqs/ML-KEM-512_META.yml
@@ -9,83 +9,74 @@ length-ciphertext: 768
 length-secret-key: 1632
 length-shared-secret: 32
 length-keypair-seed: 64
-length-encaps-seed: 32
 nistkat-sha256: c70041a761e01cd6426fa60e9fd6a4412c2be817386c8d0f3334898082512782
 testvectors-sha256: 6730bb552c22d9d2176ffb5568e48eb30952cf1f065073ec5f9724f6a3c6ea85
 principal-submitters:
-- Peter Schwabe
+  - Peter Schwabe
 auxiliary-submitters:
-- Roberto Avanzi
-- Joppe Bos
-- Léo Ducas
-- Eike Kiltz
-- Tancrède Lepoint
-- Vadim Lyubashevsky
-- John M. Schanck
-- Gregor Seiler
-- Damien Stehlé
+  - Roberto Avanzi
+  - Joppe Bos
+  - Léo Ducas
+  - Eike Kiltz
+  - Tancrède Lepoint
+  - Vadim Lyubashevsky
+  - John M. Schanck
+  - Gregor Seiler
+  - Damien Stehlé
 implementations:
-- name: ref
-  version: FIPS203
-  folder_name: .
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc -- name: x86_64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 - supported_platforms: - - architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt -- name: aarch64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd + - name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + - name: x86_64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - 
architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt + - name: aarch64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd + - name: ppc64le + version: FIPS203 + folder_name: . 
+    compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h"
+    signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair
+    signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand
+    signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc
+    signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec
+    sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
+    supported_platforms:
+      - architecture: ppc64le
+        operating_systems:
+          - Linux
diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml
index 254d67478a..e230f3ba6c 100644
--- a/integration/liboqs/ML-KEM-768_META.yml
+++ b/integration/liboqs/ML-KEM-768_META.yml
@@ -9,83 +9,74 @@ length-ciphertext: 1088
 length-secret-key: 2400
 length-shared-secret: 32
 length-keypair-seed: 64
-length-encaps-seed: 32
 nistkat-sha256: 5352539586b6c3df58be6158a6250aeff402bd73060b0a3de68850ac074c17c3
 testvectors-sha256: 667c8ca2ca93729c0df6ff24588460bad1bbdbfb64ece0fe8563852a7ff348c6
 principal-submitters:
-- Peter Schwabe
+  - Peter Schwabe
 auxiliary-submitters:
-- Roberto Avanzi
-- Joppe Bos
-- Léo Ducas
-- Eike Kiltz
-- Tancrède Lepoint
-- Vadim Lyubashevsky
-- John M. Schanck
-- Gregor Seiler
-- Damien Stehlé
+  - Roberto Avanzi
+  - Joppe Bos
+  - Léo Ducas
+  - Eike Kiltz
+  - Tancrède Lepoint
+  - Vadim Lyubashevsky
+  - John M. Schanck
+  - Gregor Seiler
+  - Damien Stehlé
 implementations:
-- name: ref
-  version: FIPS203
-  folder_name: .
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc -- name: x86_64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 - supported_platforms: - - architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt -- name: aarch64 - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc - signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc_derand - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd + - name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + - name: x86_64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - 
architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt + - name: aarch64 + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd + - name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h new file mode 100644 index 0000000000..2fa1cdbcf6 --- /dev/null +++ b/integration/liboqs/config_ppc64le.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H +#define MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - 
MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * - MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * are set, level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether an native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be use is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for your the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. 
+ * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202_CUSTOM_HEADER \ + "../../integration/liboqs/fips202_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). 
+ * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER \ + "../../integration/liboqs/fips202x4_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. + * The default implementation uses SecureZeroMemory on Windows + * and a memset + compiler barrier otherwise. If neither of those + * is available on the target platform, compilation will fail, + * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide + * a custom implementation of `mlk_zeroize()`. + * + * WARNING: + * The explicit stack zeroization conducted by mlkem-native + * reduces the likelihood of data leaking on the stack, but + * does not eliminate it! The C standard makes no guarantee about + * where a compiler allocates structures and whether/where it makes + * copies of them. Also, in addition to entire structures, there + * may also be potentially exploitable leakage of individual values + * on the stack. + * + * If you need bullet-proof zeroization of the stack, you need to + * consider additional measures instead of of what this feature + * provides. In this case, you can set mlk_zeroize to a no-op. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_ZEROIZE + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) + { + ... your implementation ... 
+ } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES + * + * Description: mlkem-native does not provide a secure randombytes + * implementation. Such an implementation has to provided by the + * consumer. + * + * If this option is not set, mlkem-native expects a function + * void randombytes(uint8_t *out, size_t outlen). + * + * Set this option and define `mlk_randombytes` if you want to + * use a custom method to sample randombytes with a different name + * or signature. + * + *****************************************************************************/ +#define MLK_CONFIG_CUSTOM_RANDOMBYTES +#if !defined(__ASSEMBLER__) +#include +#include +#include "../../mlkem/src/sys.h" +static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) +{ + OQS_randombytes(ptr, len); +} +#endif /* !__ASSEMBLER__ */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. If this option is set and + * the target platform is not Windows, you MUST set + * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization + * function. + * + * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 and + * and MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no + *native backends will be used. 
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_ASM */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT + * + * Description: Compliance with @[FIPS140_3_IG, p.87] requires a + * Pairwise Consistency Test (PCT) to be carried out on a freshly + * generated keypair before it can be exported. + * + * Set this option if such a check should be implemented. + * In this case, crypto_kem_keypair_derand and crypto_kem_keypair + * will return a non-zero error code if the PCT failed. + * + * NOTE: This feature will drastically lower the performance of + * key generation. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/* Enable valgrind-based assertions in mlkem-native through macro + * from libOQS. 
*/ +#if !defined(__ASSEMBLER__) +#include +#if defined(OQS_ENABLE_TEST_CONSTANT_TIME) +#define MLK_CONFIG_CT_TESTING_ENABLED +#endif +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H */ diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index f2b9b848b7..7fdcd6fcfa 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -18,4 +18,8 @@ #include "x86_64/meta.h" #endif +#ifdef MLK_SYS_PPC64LE +#include "ppc64le/meta.h" +#endif /* MLK_SYS_PPC64LE */ + #endif /* !MLK_NATIVE_META_H */ diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md new file mode 100644 index 0000000000..5125a40eae --- /dev/null +++ b/mlkem/src/native/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h new file mode 100644 index 0000000000..bee788976b --- /dev/null +++ b/mlkem/src/native/ppc64le/meta.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. 
*/ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { + mlk_ntt_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { + mlk_intt_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { + mlk_reduce_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { + mlk_poly_tomont_ppc(data); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..57f0b8f8ce --- /dev/null +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *); + +#endif /* MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git 
a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S new file mode 100644 index 0000000000..feb78b984e --- /dev/null +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -0,0 +1,773 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" + +.machine "any" +.text + +# Barrett reduce constants +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +# Montgomery reduce constants +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 + +.macro Load_4Coeffs start next step + mr 9, \start # j + add 10, 4, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+8, 3, 10 # r[j+len] + lxvd2x 32+12, 3, 17 # r[j+len] + lxvd2x 32+16, 3, 19 # r[j+len] + lxvd2x 32+20, 3, 21 # r[j+len] + xxpermdi 32+8, 32+8, 32+8, 2 + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+16, 32+16, 32+16, 2 + xxpermdi 32+20, 32+20, 32+20, 2 + + lxvd2x 32+21, 3, 9 + lxvd2x 32+22, 3, 16 + lxvd2x 32+23, 3, 18 + lxvd2x 32+24, 3, 20 + xxpermdi 32+21, 32+21, 32+21, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+24, 32+24, 32+24, 2 + + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + vxor 7, 7, 7 + xxlor 32+3, 6, 6 # V_MKQ + xxlor 32+1, 7, 7 # V_25 + xxlor 32+2, 8, 8 # V_26 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159
+ vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +#----------------------------------- +# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah \_vo0, 15, 4 # >> 1 + vsrah \_vo1, 20, 4 # >> 1 + vsrah \_vo2, 25, 4 # >> 1 + vsrah \_vo3, 30, 4 # >> 1 +.endm + +.macro Set_mont_consts + xxlor 32+5, 0, 0 # V_NMKQ + xxlor 32+2, 2, 2 # V_QINV + xxlor 32+3, 3, 3 # 0 + xxlor 32+4, 4, 4 # 1 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + 
addi 14, 14, 64 +.endm + +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 9 + stxvx \_vs1, 3, 16 + stxvx \_vs2, 3, 18 + stxvx \_vs3, 3, 20 +.endm + +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvx \_vs0, 3, 10 + stxvx \_vs1, 3, 17 + stxvx \_vs2, 3, 19 + stxvx \_vs3, 3, 21 +.endm + +.macro Reload_4coeffs + lxv 32+25, 0(3) + lxv 32+26, 16(3) + lxv 32+30, 32(3) + lxv 32+31, 48(3) + addi 3, 3, 64 +.endm + +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + stxv \_vs0, -128(3) + stxv \_vs1, -112(3) + stxv \_vs2, -96(3) + stxv \_vs3, -80(3) + stxv \_vs4, -64(3) + stxv \_vs5, -48(3) + stxv \_vs6, -32(3) + stxv \_vs7, -16(3) +.endm + +.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 + xxmrglw 32+12, \_vs0, 10 + xxmrghw 32+11, \_vs0, 10 + xxpermdi 10, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs1, 11 + xxmrghw 32+15, \_vs1, 11 + xxpermdi 11, 32+16, 32+15, 3 + xxmrglw 32+12, \_vs2, 12 + xxmrghw 32+11, \_vs2, 12 + xxpermdi 12, 32+12, 32+11, 3 + xxmrglw 32+16, \_vs3, 13 + xxmrghw 32+15, \_vs3, 13 + xxpermdi 13, 32+16, 32+15, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 + xxpermdi 10, 10, \_vs0, 3 + xxpermdi 11, 11, \_vs1, 3 + xxpermdi 12, 12, \_vs2, 3 + xxpermdi 13, 13, \_vs3, 3 + stxvd2x 10, 3, 9 + stxvd2x 11, 3, 16 + stxvd2x 12, 3, 18 + stxvd2x 13, 3, 20 +.endm + +# intt +# t = r[j]; +# r[j] = barrett_reduce(t + r[j + len]); +# r[j + len] = r[j + len] - t; +# r[j + len] = fqmul(zeta, r[j + len]); + +# +# mlk_intt_ppc(r) +# +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 
32+31, 304(1) + + # init vectors and constants + # Setup for Montgomery reduce + addis 8,2,.nmkq@toc@ha + addi 8,8,.nmkq@toc@l + lxv 0, 0(8) + + lxv 32+V_QINV, 16(8) # QINV + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 + xxlor 3, 32+3, 32+3 + xxlor 4, 32+4, 32+4 + + # Setup for Barrett reduce + addis 8,2,.mkq@toc@ha + addi 8,8,.mkq@toc@l + addis 9,2,.C20159@toc@ha + addi 9,9,.C20159@toc@l + addis 10,2,.C25@toc@ha + addi 10,10,.C25@toc@l + + lxv 6, 0(8) # V_MKQ + lxv 32+0, 0(9) # V20159 + lxv 7, 0(10) # V_25 + + #xxspltiw 8, 26 # for power9 and above + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 + + # zetas array + #addis 14,2,.izeta63@toc@ha + #addi 14,14,.izeta63@toc@l + +.align 4 +__Len2: + # + # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # Update zetas vectors, each vector has 2 zetas + addis 14,2,.izeta127@toc@ha + addi 14,14,.izeta127@toc@l + li 4, 4 + li 15, 4 + mtctr 15 + li 5, 0 +__Loop2: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len2_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz __Loop2 + +.align 4 +__Len4: + # + # 2. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 + addis 14,2,.izeta63@toc@ha + addi 14,14,.izeta63@toc@l + li 5, 0 + li 4, 8 + li 15, 4 # loops + mtctr 15 +__Loop4: + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_Len4_4C 32+13, 32+18, 32+23, 32+28 + addi 5, 5, 64 + bdnz __Loop4 + +.align 4 +__Len8: + # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + #addi 14, 14, 512 + li 4, 16 + li 5, 0 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 32, 32 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len16: + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + #addi 14, 14, 768 + li 5, 0 + li 4, 32 + + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 16 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 256 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + li 5, 272 + Load_4Coeffs 5, 64, 64 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + addi 14, 14, -64 + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len32: + # + # 5. 
len = 32, start = 0, 64, 128, 192 + #addi 14, 14, 896 + li 5, 0 + li 4, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 384 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len64: + # + # 6. 
len = 64, start = 0, 128 + #addi 14, 14, 960 + li 5, 0 + li 4, 128 + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 256 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 320 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lxv 32+10, -16(14) + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 +__Len128: + # 7. 
len = 128, start = 0 + # + #addi 14, 14, 992 + li 5, 0 # start + li 4, 256 # len * 2 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 64 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 128 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + li 5, 192 + + Load_4Coeffs 5, 16, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + +.align 4 + # + # Montgomery reduce loops with constant 1441 + # + addis 10,2,.C1441@toc@ha + addi 10,10,.C1441@toc@l + lvx V1441, 0, 10 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 
32+18, 32+23, 32+28 + +__intt_out: + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +# MLKEM_Q +.mkq: +.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 + +.C20159: +.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 + +# 0x2000000 +.C25: +.long 33554432, 33554432, 33554432, 33554432 + +.C1441: +.short 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441 + +.align 4 +.izeta127: +.short 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522 +.short -1460, -1460, -1460, -1460, 958, 958, 958, 958 +.short 991, 991, 991, 991, 996, 996, 996, 996 +.short -308, -308, -308, -308, -108, -108, -108, -108 +.short 478, 478, 478, 478, -870, -870, -870, -870 +.short -854, -854, -854, -854, -1510, -1510, -1510, -1510 +.short 794, 794, 794, 794, -1278, -1278, -1278, -1278 +.short -1530, -1530, -1530, -1530, -1185, -1185, -1185, -1185 +.short -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187 +.short 220, 220, 220, 220, -874, -874, -874, -874 +.short -1335, -1335, -1335, -1335, 1218, 1218, 1218, 1218 +.short -136, -136, -136, -136, -1215, -1215, -1215, -1215 +.short 384, 384, 384, 384, -1465, -1465, -1465, -1465 +.short -1285, -1285, -1285, -1285, 1322, 1322, 1322, 1322 +.short 610, 610, 610, 610, 603, 603, 603, 603 +.short 1097, 1097, 1097, 1097, 817, 817, 817, 817 +.short -75, -75, -75, -75, -156, -156, -156, -156 +.short 329, 329, 329, 329, 418, 418, 418, 418 +.short 349, 349, 349, 349, -872, -872, -872, -872 +.short 644, 644, 644, 644, -1590, 
-1590, -1590, -1590 +.short 1119, 1119, 1119, 1119, -602, -602, -602, -602 +.short 1483, 1483, 1483, 1483, -777, -777, -777, -777 +.short -147, -147, -147, -147, 1159, 1159, 1159, 1159 +.short 778, 778, 778, 778, -246, -246, -246, -246 +.short 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574 +.short -460, -460, -460, -460, -291, -291, -291, -291 +.short -235, -235, -235, -235, 177, 177, 177, 177 +.short 587, 587, 587, 587, 422, 422, 422, 422 +.short 105, 105, 105, 105, 1550, 1550, 1550, 1550 +.short 871, 871, 871, 871, -1251, -1251, -1251, -1251 +.short 843, 843, 843, 843, 555, 555, 555, 555 +.short 430, 430, 430, 430, -1103, -1103, -1103, -1103 +.izeta63: +.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 +.short 677, 677, 677, 677, 677, 677, 677, 677 +.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 +.short 448, 448, 448, 448, 448, 448, 448, 448 +.short -725, -725, -725, -725, -725, -725, -725, -725 +.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 +.short 961, 961, 961, 961, 961, 961, 961, 961 +.short -398, -398, -398, -398, -398, -398, -398, -398 +.short -951, -951, -951, -951, -951, -951, -951, -951 +.short -247, -247, -247, -247, -247, -247, -247, -247 +.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 +.short 107, 107, 107, 107, 107, 107, 107, 107 +.short 830, 830, 830, 830, 830, 830, 830, 830 +.short -271, -271, -271, -271, -271, -271, -271, -271 +.short -90, -90, -90, -90, -90, -90, -90, -90 +.short -853, -853, -853, -853, -853, -853, -853, -853 +.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 +.short 126, 126, 126, 126, 126, 126, 126, 126 +.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 +.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 +.short -666, -666, -666, -666, -666, -666, -666, -666 +.short -320, -320, -320, -320, -320, -320, -320, -320 +.short -8, -8, -8, -8, -8, -8, -8, -8 +.short 516, 516, 516, 516, 516, 516, 516, 516 +.short -1544, -1544, -1544, -1544, -1544, 
-1544, -1544, -1544 +.short -282, -282, -282, -282, -282, -282, -282, -282 +.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 +.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 +.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 +.short -552, -552, -552, -552, -552, -552, -552, -552 +.short 652, 652, 652, 652, 652, 652, 652, 652 +.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 +.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 +.short -205, -205, -205, -205, -205, -205, -205, -205 +.short 411, 411, 411, 411, 411, 411, 411, 411 +.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 +.short 608, 608, 608, 608, 608, 608, 608, 608 +.short 732, 732, 732, 732, 732, 732, 732, 732 +.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 +.short -681, -681, -681, -681, -681, -681, -681, -681 +.short -130, -130, -130, -130, -130, -130, -130, -130 +.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 +.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 +.short -829, -829, -829, -829, -829, -829, -829, -829 +.short 383, 383, 383, 383, 383, 383, 383, 383 +.short 264, 264, 264, 264, 264, 264, 264, 264 +.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 +.short 573, 573, 573, 573, 573, 573, 573, 573 +.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 +.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 +.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 +.short 962, 962, 962, 962, 962, 962, 962, 962 +.short 182, 182, 182, 182, 182, 182, 182, 182 +.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 +.short 622, 622, 622, 622, 622, 622, 622, 622 +.short -171, -171, -171, -171, -171, -171, -171, -171 +.short 202, 202, 202, 202, 202, 202, 202, 202 +.short 287, 287, 287, 287, 287, 287, 287, 287 +.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 +.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 +.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 +.short -359, -359, -359, -359, 
-359, -359, -359, -359 +.short -758, -758, -758, -758, -758, -758, -758, -758 diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S new file mode 100644 index 0000000000..172fef9cc8 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -0,0 +1,498 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +#include "../../../common.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + mr 9, \start + add 10, 4, 9 # J + len*2 + addi 16, 9, \next + addi 17, 10, \step + addi 18, 16, \next + addi 19, 17, \step + addi 20, 18, \next + addi 21, 19, \step + lxvd2x 32+13, 3, 10 # r[j+len] + lxvd2x 32+18, 3, 17 # r[j+len] + lxvd2x 32+23, 3, 19 # r[j+len] + lxvd2x 32+28, 3, 21 # r[j+len] + xxpermdi 32+13, 32+13, 32+13, 2 + xxpermdi 32+18, 32+18, 32+18, 2 + xxpermdi 32+23, 32+23, 32+23, 2 + xxpermdi 32+28, 32+28, 32+28, 2 + + # fqmul = zeta * coefficient + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + vsrah 13, 15, 4 # >> 1 + vsrah 18, 20, 4 # >> 1 + vsrah 23, 25, 4 # >> 
1 + vsrah 28, 30, 4 # >> 1 + + lxvd2x 32+12, 3, 9 # r[j] + lxvd2x 32+17, 3, 16 # r[j] + lxvd2x 32+22, 3, 18 # r[j] + lxvd2x 32+27, 3, 20 # r[j] + xxpermdi 32+12, 32+12, 32+12, 2 + xxpermdi 32+17, 32+17, 32+17, 2 + xxpermdi 32+22, 32+22, 32+22, 2 + xxpermdi 32+27, 32+27, 32+27, 2 + + vsubuhm 16, 12, 13 # r - t + vadduhm 15, 13, 12 # r + t + vsubuhm 21, 17, 18 # r - t + vadduhm 20, 18, 17 # r + t + vsubuhm 26, 22, 23 # r - t + vadduhm 25, 23, 22 # r + t + vsubuhm 31, 27, 28 # r - t + vadduhm 30, 28, 27 # r + t +.endm + +.macro Write_One + stxvx 32+15, 3, 9 + stxvx 32+16, 3, 10 + stxvx 32+20, 3, 16 + stxvx 32+21, 3, 17 + stxvx 32+25, 3, 18 + stxvx 32+26, 3, 19 + stxvx 32+30, 3, 20 + stxvx 32+31, 3, 21 +.endm + +.macro Write_Two + xxpermdi 32+17, 32+16, 32+15, 3 + xxpermdi 32+22, 32+21, 32+20, 3 + xxpermdi 32+27, 32+26, 32+25, 3 + xxpermdi 32+29, 32+31, 32+30, 3 + + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Write_Three + xxmrglw 32+14, 32+16, 32+15 + xxmrghw 32+13, 32+16, 32+15 + xxpermdi 32+17, 32+13, 32+14, 3 + xxmrglw 32+19, 32+21, 32+20 + xxmrghw 32+18, 32+21, 32+20 + xxpermdi 32+22, 32+18, 32+19, 3 + xxmrglw 32+14, 32+26, 32+25 + xxmrghw 32+13, 32+26, 32+25 + xxpermdi 32+27, 32+13, 32+14, 3 + xxmrglw 32+24, 32+31, 32+30 + xxmrghw 32+23, 32+31, 32+30 + xxpermdi 32+29, 32+23, 32+24, 3 + stxvx 32+17, 3, 9 + stxvx 32+22, 3, 16 + stxvx 32+27, 3, 18 + stxvx 32+29, 3, 20 +.endm + +.macro Load_next_4zetas + lxv 32+V_Z0, 0(14) + lxv 32+V_Z1, 16(14) + lxv 32+V_Z2, 32(14) + lxv 32+V_Z3, 48(14) + addi 14, 14, 64 +.endm + +# +# mlk_ntt_ppc(int16_t *r) +# +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 
224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + stxv 32+31, 304(1) + + # get MLKEM_Q + addis 8,2,.nmkq@toc@ha + addi 8,8,.nmkq@toc@l + lvx V_NMKQ,0,8 + + # zetas array + addis 14,2,.K1@toc@ha + addi 14,14,.K1@toc@l + + vxor 3, 3, 3 + vspltish 4, 1 + lxv 32+V_QINV, 16(8) + +.align 4 +__Len128: + # + # Compute coefficients of the NTT based on the following loop. + # for (len = 128; len ≥ 2; len = len/2) + # + # 1. len = 128, start = 0 + # + li 5, 0 # start + li 4, 256 # len * 2 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 128 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 192 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len64: + # + # 2. len = 64, start = 0, 128 + # k += 2 + li 5, 0 + li 4, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 64 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + li 5, 320 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len32: + # + # 3. len = 32, start = 0, 64, 128, 192 + # k += 4 + li 5, 0 + li 4, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 64 + li 5, 128 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 128 + li 5, 256 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + #li 5, 192 + li 5, 384 + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + Write_One + +.align 4 +__Len16: + # + # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 + # k += 8 + li 5, 0 + li 4, 32 + Load_next_4zetas + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 16 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + Load_next_4zetas + li 5, 256 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 272 + MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + +.align 4 +__Len8: + # + # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + # k += 16 + li 5, 0 + li 4, 16 + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 128 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 256 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + li 5, 384 + + Load_next_4zetas + MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + Write_One + + # + # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + # k += 32 + li 15, 4 # loops + mtctr 15 + li 5, 0 + li 4, 8 +.align 4 +__Len4: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Two + addi 5, 5, 64 + + bdnz __Len4 + + # + # 7. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 + # k += 64 + # Update zetas vectors, each vector has 2 zetas + addis 14,2,.K64@toc@ha + addi 14,14,.K64@toc@l + + li 15, 4 + mtctr 15 + li 5, 0 + li 4, 4 +.align 4 +__Len2: + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + Load_next_4zetas + MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 + Write_Three + addi 5, 5, 64 + + bdnz __Len2 + +__ntt_out: + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + lxv 32+31, 304(1) + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +# zetas +.K1: +.short -758, -758, -758, -758, -758, -758, -758, -758 +.short -359, -359, -359, -359, -359, -359, -359, -359 +.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 +.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 +.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 +.short 287, 287, 287, 287, 287, 287, 287, 287 +.short 202, 202, 202, 202, 202, 202, 202, 202 +.short -171, -171, -171, -171, -171, -171, -171, -171 +.short 622, 622, 622, 622, 622, 622, 622, 622 +.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 +.short 182, 182, 182, 182, 182, 182, 182, 182 +.short 962, 962, 962, 962, 962, 962, 962, 962 +.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 +.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 +.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 +.short 573, 573, 573, 573, 573, 573, 573, 573 +.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 +.short 264, 264, 264, 264, 264, 
264, 264, 264 +.short 383, 383, 383, 383, 383, 383, 383, 383 +.short -829, -829, -829, -829, -829, -829, -829, -829 +.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 +.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 +.short -130, -130, -130, -130, -130, -130, -130, -130 +.short -681, -681, -681, -681, -681, -681, -681, -681 +.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 +.short 732, 732, 732, 732, 732, 732, 732, 732 +.short 608, 608, 608, 608, 608, 608, 608, 608 +.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 +.short 411, 411, 411, 411, 411, 411, 411, 411 +.short -205, -205, -205, -205, -205, -205, -205, -205 +.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 +.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 +.short 652, 652, 652, 652, 652, 652, 652, 652 +.short -552, -552, -552, -552, -552, -552, -552, -552 +.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 +.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 +.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 +.short -282, -282, -282, -282, -282, -282, -282, -282 +.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 +.short 516, 516, 516, 516, 516, 516, 516, 516 +.short -8, -8, -8, -8, -8, -8, -8, -8 +.short -320, -320, -320, -320, -320, -320, -320, -320 +.short -666, -666, -666, -666, -666, -666, -666, -666 +.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 +.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 +.short 126, 126, 126, 126, 126, 126, 126, 126 +.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 +.short -853, -853, -853, -853, -853, -853, -853, -853 +.short -90, -90, -90, -90, -90, -90, -90, -90 +.short -271, -271, -271, -271, -271, -271, -271, -271 +.short 830, 830, 830, 830, 830, 830, 830, 830 +.short 107, 107, 107, 107, 107, 107, 107, 107 +.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 +.short -247, -247, -247, -247, -247, -247, -247, -247 +.short -951, -951, -951, -951, -951, 
-951, -951, -951 +.short -398, -398, -398, -398, -398, -398, -398, -398 +.short 961, 961, 961, 961, 961, 961, 961, 961 +.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 +.short -725, -725, -725, -725, -725, -725, -725, -725 +.short 448, 448, 448, 448, 448, 448, 448, 448 +.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 +.short 677, 677, 677, 677, 677, 677, 677, 677 +.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 +.K64: +.short -1103, -1103, -1103, -1103, 430, 430, 430, 430 +.short 555, 555, 555, 555, 843, 843, 843, 843 +.short -1251, -1251, -1251, -1251, 871, 871, 871, 871 +.short 1550, 1550, 1550, 1550, 105, 105, 105, 105 +.short 422, 422, 422, 422, 587, 587, 587, 587 +.short 177, 177, 177, 177, -235, -235, -235, -235 +.short -291, -291, -291, -291, -460, -460, -460, -460 +.short 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653 +.short -246, -246, -246, -246, 778, 778, 778, 778 +.short 1159, 1159, 1159, 1159, -147, -147, -147, -147 +.short -777, -777, -777, -777, 1483, 1483, 1483, 1483 +.short -602, -602, -602, -602, 1119, 1119, 1119, 1119 +.short -1590, -1590, -1590, -1590, 644, 644, 644, 644 +.short -872, -872, -872, -872, 349, 349, 349, 349 +.short 418, 418, 418, 418, 329, 329, 329, 329 +.short -156, -156, -156, -156, -75, -75, -75, -75 +.short 817, 817, 817, 817, 1097, 1097, 1097, 1097 +.short 603, 603, 603, 603, 610, 610, 610, 610 +.short 1322, 1322, 1322, 1322, -1285, -1285, -1285, -1285 +.short -1465, -1465, -1465, -1465, 384, 384, 384, 384 +.short -1215, -1215, -1215, -1215, -136, -136, -136, -136 +.short 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335 +.short -874, -874, -874, -874, 220, 220, 220, 220 +.short -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659 +.short -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530 +.short -1278, -1278, -1278, -1278, 794, 794, 794, 794 +.short -1510, -1510, -1510, -1510, -854, -854, -854, -854 +.short -870, -870, -870, -870, 478, 478, 478, 478 +.short -108, -108, -108, 
-108, -308, -308, -308, -308 +.short 996, 996, 996, 996, 991, 991, 991, 991 +.short 958, 958, 958, 958, -1460, -1460, -1460, -1460 +.short 1522, 1522, 1522, 1522, 1628, 1628, 1628, 1628 diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S new file mode 100644 index 0000000000..c07f25c5a8 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -0,0 +1,163 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. +# +#=================================================================================== +# Written by Danny Tsen +# + +# Poly_tomont: Inplace conversion of all coefficients of a polynomial +# from normal domain to Montgomery domain +# +# Arguments:*r: pointer to input/output polynomial +# + +#include "../../../common.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_v0, _v1, _v2, _v3) +# +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 # >> 1 + vsrah \_v1, 20, 4 # >> 1 + vsrah \_v2, 25, 4 # >> 1 + vsrah \_v3, 9, 4 # >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + 
stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + stxv 32+25, 208(1) + stxv 32+26, 224(1) + stxv 32+27, 240(1) + stxv 32+28, 256(1) + stxv 32+29, 272(1) + stxv 32+30, 288(1) + + addis 9,2,.nmkq@toc@ha + addi 9,9,.nmkq@toc@l + addis 10,2,.C1353@toc@ha + addi 10,10,.C1353@toc@l + + lxv 32+V_NMKQ,0(9) + lxv 32+V_QINV,16(9) + lxv 32+V1353,0(10) + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + lxv 32+25, 208(1) + lxv 32+26, 224(1) + lxv 32+27, 240(1) + lxv 32+28, 256(1) + lxv 32+29, 272(1) + lxv 32+30, 288(1) + mtlr 0 + addi 1, 1, 320 + blr + +.data +.align 4 +# -MLKEM_Q +.nmkq: +.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 +# QINV +.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 + +.C1353: +.short 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353 + diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S new file mode 100644 index 0000000000..ee8e1fdca1 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -0,0 +1,225 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +# +# Copyright 2025- IBM Corp. 
+# +#=================================================================================== +# Written by Danny Tsen +# + +# +# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial +# for details of the Barrett reduction +# +# Arguments: *r: pointer to input/output polynomial +# + +#include "../../../common.h" + +# Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 +.endm + +# +# Conditional addition to get unsigned canonical representative +# +.macro To_unsigned_16 + lxv 32+12, 0(3) + lxv 32+13, 16(3) + lxv 32+14, 
32(3) + lxv 32+15, 48(3) + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxv 32+3, -32(3) + stxv 32+2, -16(3) + stxv 32+1, -64(3) + stxv 32+0, -48(3) +.endm + +.align 4 +.globl MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + stxv 32+20, 128(1) + stxv 32+21, 144(1) + stxv 32+22, 160(1) + stxv 32+23, 176(1) + stxv 32+24, 192(1) + + addis 8,2,.mkq@toc@ha + addi 8,8,.mkq@toc@l + addis 9,2,.C20159@toc@ha + addi 9,9,.C20159@toc@l + addis 10,2,.C25@toc@ha + addi 10,10,.C25@toc@l + + vxor 7, 7, 7 + + lxv 32+V_MKQ, 0(8) + lxv 32+V20159, 0(9) + lxv 32+V_25, 0(10) + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + # + # To unsigned canonical + # +.align 4 + addi 3, 3, -512 + xxspltib 32+9 ,0 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + lxv 32+20, 128(1) + lxv 32+21, 144(1) + lxv 32+22, 160(1) + lxv 32+23, 176(1) + lxv 32+24, 192(1) + mtlr 0 + addi 1, 1, 224 + blr + +.align 4 +.data +# MLKEM_Q +.mkq: +.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 + +.C20159: +.short 20159, 20159, 20159, 
20159, 20159, 20159, 20159, 20159 + +# 0x2000000 +.C25: +.long 33554432, 33554432, 33554432, 33554432 diff --git a/test/mk/auto.mk b/test/mk/auto.mk index bcbf3ac1c0..b66eb724b2 100644 --- a/test/mk/auto.mk +++ b/test/mk/auto.mk @@ -1,113 +1,33 @@ # SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT # -# Automatically detect system architecture and set preprocessor flags accordingly -# This file detects host CPU capabilities and combines them with compiler support -# to enable optimal compilation flags. +# Automatically detect system architecture and set preprocessor etc accordingly -ifndef _AUTO_MK -_AUTO_MK := - -# Helper function to check if host CPU supports a feature -# Usage: $(call check_host_feature,feature_pattern,source_command) -define check_host_feature -$(shell $(2) 2>/dev/null | grep -q "$(1)" && echo 1 || echo 0) -endef - -# x86_64 architecture detection -ifeq ($(ARCH),x86_64) - -# Host CPU feature detection for x86_64 +# Native compilation +ifeq ($(CROSS_PREFIX),) ifeq ($(HOST_PLATFORM),Linux-x86_64) -# Linux: Use /proc/cpuinfo -MK_HOST_SUPPORTS_AVX2 := $(call check_host_feature,avx2,cat /proc/cpuinfo) -MK_HOST_SUPPORTS_SSE2 := $(call check_host_feature,sse2,cat /proc/cpuinfo) -MK_HOST_SUPPORTS_BMI2 := $(call check_host_feature,bmi2,cat /proc/cpuinfo) -else ifeq ($(HOST_PLATFORM),Darwin-x86_64) -# macOS: Use sysctl -MK_HOST_SUPPORTS_AVX2 := $(call check_host_feature,AVX2,sysctl -n machdep.cpu.leaf7_features) -MK_HOST_SUPPORTS_SSE2 := $(call check_host_feature,SSE2,sysctl -n machdep.cpu.features) -MK_HOST_SUPPORTS_BMI2 := $(call check_host_feature,BMI2,sysctl -n machdep.cpu.leaf7_features) -else ifneq ($(CROSS_PREFIX),) -# Cross-compilation: assume all features are supported -MK_HOST_SUPPORTS_AVX2 := 1 -MK_HOST_SUPPORTS_SSE2 := 1 -MK_HOST_SUPPORTS_BMI2 := 1 -else -# Other platforms: assume no support -MK_HOST_SUPPORTS_AVX2 := 0 -MK_HOST_SUPPORTS_SSE2 := 0 -MK_HOST_SUPPORTS_BMI2 := 0 -endif # HOST_PLATFORM x86_64 - -endif # x86_64 - -# AArch64 
architecture detection -ifeq ($(ARCH),aarch64) - -# Host CPU feature detection for AArch64 -ifeq ($(HOST_PLATFORM),Linux-aarch64) -# Linux: Use /proc/cpuinfo (look for sha3 in Features line) -MK_HOST_SUPPORTS_SHA3 := $(call check_host_feature,sha3,cat /proc/cpuinfo) + CFLAGS += -mavx2 -mbmi2 -mpopcnt -maes + CFLAGS += -DMLK_FORCE_X86_64 +else ifeq ($(HOST_PLATFORM),Linux-aarch64) + CFLAGS += -DMLK_FORCE_AARCH64 else ifeq ($(HOST_PLATFORM),Darwin-arm64) -# macOS: Use sysctl to check for SHA3 support -MK_HOST_SUPPORTS_SHA3 := $(call check_host_feature,1,sysctl -n hw.optional.armv8_2_sha3) -else ifneq ($(CROSS_PREFIX),) -# Cross-compilation: assume all features are supported -MK_HOST_SUPPORTS_SHA3 := 1 -else -# Other platforms: assume no support -MK_HOST_SUPPORTS_SHA3 := 0 -endif # HOST_PLATFORM aarch64 - -endif # aarch64 - -# Only apply CFLAGS modifications if AUTO=1 -ifeq ($(AUTO),1) - -# x86_64 CFLAGS configuration -ifeq ($(ARCH),x86_64) -CFLAGS += -DMLK_FORCE_X86_64 - -# Add flags only if both compiler and host support the feature -ifeq ($(MK_COMPILER_SUPPORTS_AVX2)$(MK_HOST_SUPPORTS_AVX2),11) -CFLAGS += -mavx2 + CFLAGS += -DMLK_FORCE_AARCH64 +else ifeq ($(HOST_PLATFORM),Linux-ppc64le) + CFLAGS += -DMLK_FORCE_PPC64LE endif - -ifeq ($(MK_COMPILER_SUPPORTS_BMI2)$(MK_HOST_SUPPORTS_BMI2),11) -CFLAGS += -mbmi2 +# Cross compilation +else ifneq ($(findstring x86_64, $(CROSS_PREFIX)),) + CFLAGS += -mavx2 -mbmi2 -mpopcnt -maes + CFLAGS += -DMLK_FORCE_X86_64 +else ifneq ($(findstring aarch64_be, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_AARCH64_EB +else ifneq ($(findstring aarch64, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_AARCH64 +else ifneq ($(findstring riscv64, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_RISCV64 +else ifneq ($(findstring riscv32, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_RISCV32 +else ifneq ($(findstring powerpc64le, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_PPC64LE +else ifneq ($(findstring ppc64le, $(CROSS_PREFIX)),) + CFLAGS += -DMLK_FORCE_PPC64LE 
endif -endif # x86_64 - -# AArch64 CFLAGS configuration -ifeq ($(ARCH),aarch64) -CFLAGS += -DMLK_FORCE_AARCH64 - -# Add SHA3 flags only if both compiler and host support it -ifeq ($(MK_COMPILER_SUPPORTS_SHA3)$(MK_HOST_SUPPORTS_SHA3),11) -CFLAGS += -march=armv8.4-a+sha3 -endif -endif # aarch64 - -# AArch64 Big Endian CFLAGS configuration -ifeq ($(ARCH),aarch64_be) -CFLAGS += -DMLK_FORCE_AARCH64_EB -endif # aarch64_be - -# RISC-V 64-bit CFLAGS configuration -ifeq ($(ARCH),riscv64) -CFLAGS += -DMLK_FORCE_RISCV64 -endif # riscv64 - -# RISC-V 32-bit CFLAGS configuration -ifeq ($(ARCH),riscv32) -CFLAGS += -DMLK_FORCE_RISCV32 -endif # riscv32 - -# PowerPC 64-bit Little Endian CFLAGS configuration -ifeq ($(ARCH),powerpc64le) -CFLAGS += -DMLK_FORCE_PPC64LE -endif # powerpc64le - -endif # AUTO=1 - -endif # _AUTO_MK diff --git a/test/mk/components.mk b/test/mk/components.mk index fabe5b4129..77f9f32126 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -8,10 +8,11 @@ endif SOURCES += $(wildcard mlkem/src/*.c) ifeq ($(OPT),1) SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) + SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif -ALL_TESTS = test_mlkem acvp_mlkem bench_mlkem bench_components_mlkem gen_KAT test_stack +ALL_TESTS = test_mlkem acvp_mlkem bench_mlkem bench_components_mlkem gen_KAT MLKEM512_DIR = $(BUILD_DIR)/mlkem512 MLKEM768_DIR = $(BUILD_DIR)/mlkem768 @@ -24,9 +25,6 @@ $(MLKEM768_OBJS): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=768 MLKEM1024_OBJS = $(call MAKE_OBJS,$(MLKEM1024_DIR),$(SOURCES) $(FIPS202_SRCS)) $(MLKEM1024_OBJS): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=1024 - - - $(BUILD_DIR)/libmlkem512.a: $(MLKEM512_OBJS) $(BUILD_DIR)/libmlkem768.a: $(MLKEM768_OBJS) $(BUILD_DIR)/libmlkem1024.a: $(MLKEM1024_OBJS) @@ -40,10 +38,6 @@ $(MLKEM512_DIR)/bin/bench_components_mlkem512: CFLAGS += -Itest/hal 
$(MLKEM768_DIR)/bin/bench_components_mlkem768: CFLAGS += -Itest/hal $(MLKEM1024_DIR)/bin/bench_components_mlkem1024: CFLAGS += -Itest/hal -$(MLKEM512_DIR)/bin/test_stack512: CFLAGS += -Imlkem/src -fstack-usage -$(MLKEM768_DIR)/bin/test_stack768: CFLAGS += -Imlkem/src -fstack-usage -$(MLKEM1024_DIR)/bin/test_stack1024: CFLAGS += -Imlkem/src -fstack-usage - $(MLKEM512_DIR)/bin/bench_mlkem512: $(MLKEM512_DIR)/test/hal/hal.c.o $(MLKEM768_DIR)/bin/bench_mlkem768: $(MLKEM768_DIR)/test/hal/hal.c.o $(MLKEM1024_DIR)/bin/bench_mlkem1024: $(MLKEM1024_DIR)/test/hal/hal.c.o From 68ee31ca5b07faf45967db681b516b0ff53243bf Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Mon, 8 Sep 2025 09:41:46 +0800 Subject: [PATCH 02/22] Document nix 2.18 requirement; disable nix setup test for nix 2.6 Since August 2025, nixpkgs requires a nix version of at least nix 2.18. Consequently, our nix setup tests using nix 2.6.1 and Ubuntu 22 (nix 2.6.0) break. This comment documents that at least nix 2.18 is required, updates the nix test to 2.18.0, and (temporarily) disables the Ubuntu 22 test. Signed-off-by: Matthias J. 
Kannwischer Signed-off-by: Danny Tsen --- .github/workflows/nix.yml | 11 +++++++---- CONTRIBUTING.md | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml index 3de61b9ea3..baba540137 100644 --- a/.github/workflows/nix.yml +++ b/.github/workflows/nix.yml @@ -86,12 +86,15 @@ jobs: fail-fast: false matrix: target: + # nixpkgs requires 2.18 since August 2025, see + # https://github.com/NixOS/nixpkgs/pull/428076 + # TODO: Re-enable tests on Ubuntu 22 once nix has been updated to >= 2.18 + # - runner: ubuntu-22.04 + # container: + # install: 'apt' - runner: ubuntu-latest - container: nixos/nix:2.6.1 + container: nixos/nix:2.18.0 install: 'native' - - runner: ubuntu-22.04 - container: - install: 'apt' - runner: ubuntu-24.04 container: install: 'apt' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index faed987434..1530147389 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,7 +9,7 @@ any of the open issues. Here are some things to get you started. We specify the development environment for mlkem-native using `nix`. If you want to help develop mlkem-native, please use `nix`. We recommend using the latest Nix version provided by the [nix installer -script](https://nixos.org/download/), but we currently support all Nix versions >= 2.6. +script](https://nixos.org/download/), but we currently support all Nix versions >= 2.18. All the development and build dependencies are specified in [flake.nix](flake.nix). To execute a bash shell, run ```bash From 1f41a2eb54396207c74310d2af02005d898d62ce Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Sat, 2 Aug 2025 20:47:50 +0800 Subject: [PATCH 03/22] Add clang_21 to compiler and constant-time tests Signed-off-by: Matthias J. 
Kannwischer Signed-off-by: Danny Tsen --- .github/workflows/ci.yml | 7 +++++++ .github/workflows/ct-tests.yml | 3 ++- flake.lock | 6 +++--- flake.nix | 3 +++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 12aeff977c..2e1ee84e3b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -257,6 +257,13 @@ jobs: c23: True opt: all examples: true + - name: clang-21 + shell: ci_clang21 + darwin: True + c17: True + c23: True + opt: all + examples: true # CPU flags are not correctly passed to the zig assembler # https://github.com/ziglang/zig/issues/23576 # We therefore only test the C backend diff --git a/.github/workflows/ct-tests.yml b/.github/workflows/ct-tests.yml index 24bde4b2c1..0789e5bdbf 100644 --- a/.github/workflows/ct-tests.yml +++ b/.github/workflows/ct-tests.yml @@ -26,6 +26,7 @@ jobs: - ci_valgrind-varlat_clang18 - ci_valgrind-varlat_clang19 - ci_valgrind-varlat_clang20 + - ci_valgrind-varlat_clang21 - ci_valgrind-varlat_gcc48 - ci_valgrind-varlat_gcc49 - ci_valgrind-varlat_gcc7 @@ -62,7 +63,7 @@ jobs: valgrind_flags: --variable-latency-errors=yes - name: Build and run test (-Ofast) # -Ofast got deprecated in clang19; -O3 -ffast-math should be used instead - if: ${{ matrix.nix-shell != 'ci_valgrind-varlat_clang19' && matrix.nix-shell != 'ci_valgrind-varlat_clang20' }} + if: ${{ matrix.nix-shell != 'ci_valgrind-varlat_clang19' && matrix.nix-shell != 'ci_valgrind-varlat_clang20' && matrix.nix-shell != 'ci_valgrind-varlat_clang21'}} uses: ./.github/actions/ct-test with: cflags: -Ofast -DMLK_CONFIG_KEYGEN_PCT diff --git a/flake.lock b/flake.lock index b9f3f45e24..a4ac8a8d1a 100644 --- a/flake.lock +++ b/flake.lock @@ -54,11 +54,11 @@ }, "nixpkgs-unstable": { "locked": { - "lastModified": 1753939845, - "narHash": "sha256-K2ViRJfdVGE8tpJejs8Qpvvejks1+A4GQej/lBk5y7I=", + "lastModified": 1757068644, + "narHash": "sha256-NOrUtIhTkIIumj1E/Rsv1J37Yi3xGStISEo8tZm3KW4=", "owner": 
"NixOS", "repo": "nixpkgs", - "rev": "94def634a20494ee057c76998843c015909d6311", + "rev": "8eb28adfa3dc4de28e792e3bf49fcf9007ca8ac9", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index bcb9252dd7..625fe2f9eb 100644 --- a/flake.nix +++ b/flake.nix @@ -50,6 +50,7 @@ gcc48 = pkgs-2405.gcc48; gcc49 = pkgs-2405.gcc49; gcc7 = pkgs-2405.gcc7; + clang_21 = pkgs-unstable.clang_21; }) ]; }; @@ -128,6 +129,7 @@ devShells.ci_clang18 = util.mkShellWithCC' pkgs.clang_18; devShells.ci_clang19 = util.mkShellWithCC' pkgs.clang_19; devShells.ci_clang20 = util.mkShellWithCC' pkgs.clang_20; + devShells.ci_clang21 = util.mkShellWithCC' pkgs.clang_21; devShells.ci_zig0_12 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_12); devShells.ci_zig0_13 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_13); @@ -150,6 +152,7 @@ devShells.ci_valgrind-varlat_clang18 = util.mkShellWithCC_valgrind' pkgs.clang_18; devShells.ci_valgrind-varlat_clang19 = util.mkShellWithCC_valgrind' pkgs.clang_19; devShells.ci_valgrind-varlat_clang20 = util.mkShellWithCC_valgrind' pkgs.clang_20; + devShells.ci_valgrind-varlat_clang21 = util.mkShellWithCC_valgrind' pkgs.clang_21; devShells.ci_valgrind-varlat_gcc48 = util.mkShellWithCC_valgrind' pkgs.gcc48; devShells.ci_valgrind-varlat_gcc49 = util.mkShellWithCC_valgrind' pkgs.gcc49; devShells.ci_valgrind-varlat_gcc7 = util.mkShellWithCC_valgrind' pkgs.gcc7; From d4e8c286e654082d3b5e2c0a2bea1e288bf29b59 Mon Sep 17 00:00:00 2001 From: willieyz Date: Tue, 12 Aug 2025 18:22:07 +0800 Subject: [PATCH 04/22] bitwuzla: update the bitwuzla version from 0.7.0 to 0.8.2 in nix Signed-off-by: willieyz Signed-off-by: Danny Tsen --- flake.nix | 4 ++-- nix/cbmc/default.nix | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.nix b/flake.nix index 625fe2f9eb..8166f51086 100644 --- a/flake.nix +++ b/flake.nix @@ -25,7 +25,7 @@ util = pkgs.callPackage ./nix/util.nix { # Keep those around in case we want to switch to unstable versions cbmc = 
pkgs-unstable.cbmc; - bitwuzla = pkgs.bitwuzla; + bitwuzla = pkgs-unstable.bitwuzla; z3 = pkgs.z3; }; zigWrapCC = zig: pkgs.symlinkJoin { @@ -170,7 +170,7 @@ util = pkgs.callPackage ./nix/util.nix { inherit pkgs; cbmc = pkgs-unstable.cbmc; - bitwuzla = pkgs.bitwuzla; + bitwuzla = pkgs-unstable.bitwuzla; z3 = pkgs.z3; }; in diff --git a/nix/cbmc/default.nix b/nix/cbmc/default.nix index d9a602284d..4fd886b6ae 100644 --- a/nix/cbmc/default.nix +++ b/nix/cbmc/default.nix @@ -62,7 +62,7 @@ buildEnv { inherit cadical#2.1.3 - bitwuzla# 0.7.0 + bitwuzla# 0.8.2 ninja; # 1.12.1 }; } From 4e33b285c086e1a9a8346092cd2f9f97f65f8173 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Tue, 9 Sep 2025 18:35:34 +0800 Subject: [PATCH 05/22] Add compiler test for zig 0.15 https://github.com/ziglang/zig/releases/tag/0.15.1 Signed-off-by: Matthias J. Kannwischer Signed-off-by: Danny Tsen --- .github/workflows/ci.yml | 7 +++++++ flake.nix | 2 ++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2e1ee84e3b..9f6f6fd395 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -291,6 +291,13 @@ jobs: c23: True examples: False opt: no_opt + - name: zig-0.15 + shell: ci_zig0_15 + darwin: True + c17: True + c23: True + examples: False + opt: no_opt runs-on: ${{ matrix.target.runner }} steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 diff --git a/flake.nix b/flake.nix index 8166f51086..9e63a64f55 100644 --- a/flake.nix +++ b/flake.nix @@ -51,6 +51,7 @@ gcc49 = pkgs-2405.gcc49; gcc7 = pkgs-2405.gcc7; clang_21 = pkgs-unstable.clang_21; + zig_0_15 = pkgs-unstable.zig_0_15; }) ]; }; @@ -134,6 +135,7 @@ devShells.ci_zig0_12 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_12); devShells.ci_zig0_13 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_13); devShells.ci_zig0_14 = util.mkShellWithCC' (zigWrapCC pkgs.zig); + devShells.ci_zig0_15 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_15); 
devShells.ci_gcc48 = util.mkShellWithCC' pkgs.gcc48; devShells.ci_gcc49 = util.mkShellWithCC' pkgs.gcc49; From aa3b87b67580e44a8e1671a80a8f058cc9c92c6a Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Tue, 9 Sep 2025 13:35:04 -0400 Subject: [PATCH 06/22] Fixed auto.mk, components.mk and YML files. Signed-off-by: Danny Tsen --- integration/liboqs/ML-KEM-1024_META.yml | 154 ++++++++++++++---------- integration/liboqs/ML-KEM-512_META.yml | 154 ++++++++++++++---------- integration/liboqs/ML-KEM-768_META.yml | 154 ++++++++++++++---------- test/mk/auto.mk | 132 ++++++++++++++++---- test/mk/components.mk | 9 +- 5 files changed, 378 insertions(+), 225 deletions(-) diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 766c936e28..c3ffce4e64 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -9,74 +9,96 @@ length-ciphertext: 1568 length-secret-key: 3168 length-shared-secret: 32 length-keypair-seed: 64 +length-encaps-seed: 32 nistkat-sha256: f580d851e5fb27e6876e5e203fa18be4cdbfd49e05d48fec3d3992c8f43a13e6 testvectors-sha256: ff1a854b9b6761a70c65ccae85246fe0596a949e72eae0866a8a2a2d4ea54b10 principal-submitters: - - Peter Schwabe +- Peter Schwabe auxiliary-submitters: - - Roberto Avanzi - - Joppe Bos - - Léo Ducas - - Eike Kiltz - - Tancrède Lepoint - - Vadim Lyubashevsky - - John M. Schanck - - Gregor Seiler - - Damien Stehlé +- Roberto Avanzi +- Joppe Bos +- Léo Ducas +- Eike Kiltz +- Tancrède Lepoint +- Vadim Lyubashevsky +- John M. Schanck +- Gregor Seiler +- Damien Stehlé implementations: - - name: ref - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - - name: x86_64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - 
architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt - - name: aarch64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd - - name: ppc64le - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: ppc64le - operating_systems: - - Linux +- name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc +- name: x86_64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt +- name: aarch64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index 9d2c7633af..c5fb05e60c 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -9,74 +9,96 @@ length-ciphertext: 768 length-secret-key: 1632 length-shared-secret: 32 length-keypair-seed: 64 +length-encaps-seed: 32 nistkat-sha256: c70041a761e01cd6426fa60e9fd6a4412c2be817386c8d0f3334898082512782 testvectors-sha256: 6730bb552c22d9d2176ffb5568e48eb30952cf1f065073ec5f9724f6a3c6ea85 principal-submitters: - - Peter Schwabe +- Peter Schwabe auxiliary-submitters: - - Roberto Avanzi - - Joppe Bos - - Léo Ducas - - Eike Kiltz - - Tancrède Lepoint - - Vadim Lyubashevsky - - John M. Schanck - - Gregor Seiler - - Damien Stehlé +- Roberto Avanzi +- Joppe Bos +- Léo Ducas +- Eike Kiltz +- Tancrède Lepoint +- Vadim Lyubashevsky +- John M. Schanck +- Gregor Seiler +- Damien Stehlé implementations: - - name: ref - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - - name: x86_64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - 
architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt - - name: aarch64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd - - name: ppc64le - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: ppc64le - operating_systems: - - Linux +- name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc +- name: x86_64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt +- name: aarch64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index e230f3ba6c..80b05ba45a 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -9,74 +9,96 @@ length-ciphertext: 1088 length-secret-key: 2400 length-shared-secret: 32 length-keypair-seed: 64 +length-encaps-seed: 32 nistkat-sha256: 5352539586b6c3df58be6158a6250aeff402bd73060b0a3de68850ac074c17c3 testvectors-sha256: 667c8ca2ca93729c0df6ff24588460bad1bbdbfb64ece0fe8563852a7ff348c6 principal-submitters: - - Peter Schwabe +- Peter Schwabe auxiliary-submitters: - - Roberto Avanzi - - Joppe Bos - - Léo Ducas - - Eike Kiltz - - Tancrède Lepoint - - Vadim Lyubashevsky - - John M. Schanck - - Gregor Seiler - - Damien Stehlé +- Roberto Avanzi +- Joppe Bos +- Léo Ducas +- Eike Kiltz +- Tancrède Lepoint +- Vadim Lyubashevsky +- John M. Schanck +- Gregor Seiler +- Damien Stehlé implementations: - - name: ref - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec - sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - - name: x86_64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_dec - sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - 
architecture: x86_64 - operating_systems: - - Linux - - Darwin - required_flags: - - avx2 - - bmi2 - - popcnt - - name: aarch64 - version: FIPS203 - folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_dec - sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: arm_8 - operating_systems: - - Linux - - Darwin - required_flags: - - asimd - - name: ppc64le - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="....//integration/liboqs/config_ppc64le.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc - supported_platforms: - - architecture: ppc64le - operating_systems: - - Linux +- name: ref + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_c.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec + sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc +- name: x86_64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_x86_64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_dec + sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/x86_64 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt +- name: aarch64 + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_aarch64.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc + signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc_derand + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_dec + sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/aarch64 + supported_platforms: + - architecture: arm_8 + operating_systems: + - Linux + - Darwin + required_flags: + - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/ppc64le mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/test/mk/auto.mk b/test/mk/auto.mk index b66eb724b2..bcbf3ac1c0 100644 --- a/test/mk/auto.mk +++ b/test/mk/auto.mk @@ -1,33 +1,113 @@ # SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT # -# Automatically detect system architecture and set preprocessor etc accordingly +# Automatically detect system architecture and set preprocessor flags accordingly +# This file detects host CPU capabilities and combines them with compiler support +# to enable optimal compilation flags. 
-# Native compilation -ifeq ($(CROSS_PREFIX),) +ifndef _AUTO_MK +_AUTO_MK := + +# Helper function to check if host CPU supports a feature +# Usage: $(call check_host_feature,feature_pattern,source_command) +define check_host_feature +$(shell $(2) 2>/dev/null | grep -q "$(1)" && echo 1 || echo 0) +endef + +# x86_64 architecture detection +ifeq ($(ARCH),x86_64) + +# Host CPU feature detection for x86_64 ifeq ($(HOST_PLATFORM),Linux-x86_64) - CFLAGS += -mavx2 -mbmi2 -mpopcnt -maes - CFLAGS += -DMLK_FORCE_X86_64 -else ifeq ($(HOST_PLATFORM),Linux-aarch64) - CFLAGS += -DMLK_FORCE_AARCH64 +# Linux: Use /proc/cpuinfo +MK_HOST_SUPPORTS_AVX2 := $(call check_host_feature,avx2,cat /proc/cpuinfo) +MK_HOST_SUPPORTS_SSE2 := $(call check_host_feature,sse2,cat /proc/cpuinfo) +MK_HOST_SUPPORTS_BMI2 := $(call check_host_feature,bmi2,cat /proc/cpuinfo) +else ifeq ($(HOST_PLATFORM),Darwin-x86_64) +# macOS: Use sysctl +MK_HOST_SUPPORTS_AVX2 := $(call check_host_feature,AVX2,sysctl -n machdep.cpu.leaf7_features) +MK_HOST_SUPPORTS_SSE2 := $(call check_host_feature,SSE2,sysctl -n machdep.cpu.features) +MK_HOST_SUPPORTS_BMI2 := $(call check_host_feature,BMI2,sysctl -n machdep.cpu.leaf7_features) +else ifneq ($(CROSS_PREFIX),) +# Cross-compilation: assume all features are supported +MK_HOST_SUPPORTS_AVX2 := 1 +MK_HOST_SUPPORTS_SSE2 := 1 +MK_HOST_SUPPORTS_BMI2 := 1 +else +# Other platforms: assume no support +MK_HOST_SUPPORTS_AVX2 := 0 +MK_HOST_SUPPORTS_SSE2 := 0 +MK_HOST_SUPPORTS_BMI2 := 0 +endif # HOST_PLATFORM x86_64 + +endif # x86_64 + +# AArch64 architecture detection +ifeq ($(ARCH),aarch64) + +# Host CPU feature detection for AArch64 +ifeq ($(HOST_PLATFORM),Linux-aarch64) +# Linux: Use /proc/cpuinfo (look for sha3 in Features line) +MK_HOST_SUPPORTS_SHA3 := $(call check_host_feature,sha3,cat /proc/cpuinfo) else ifeq ($(HOST_PLATFORM),Darwin-arm64) - CFLAGS += -DMLK_FORCE_AARCH64 -else ifeq ($(HOST_PLATFORM),Linux-ppc64le) - CFLAGS += -DMLK_FORCE_PPC64LE +# macOS: Use sysctl to check 
for SHA3 support +MK_HOST_SUPPORTS_SHA3 := $(call check_host_feature,1,sysctl -n hw.optional.armv8_2_sha3) +else ifneq ($(CROSS_PREFIX),) +# Cross-compilation: assume all features are supported +MK_HOST_SUPPORTS_SHA3 := 1 +else +# Other platforms: assume no support +MK_HOST_SUPPORTS_SHA3 := 0 +endif # HOST_PLATFORM aarch64 + +endif # aarch64 + +# Only apply CFLAGS modifications if AUTO=1 +ifeq ($(AUTO),1) + +# x86_64 CFLAGS configuration +ifeq ($(ARCH),x86_64) +CFLAGS += -DMLK_FORCE_X86_64 + +# Add flags only if both compiler and host support the feature +ifeq ($(MK_COMPILER_SUPPORTS_AVX2)$(MK_HOST_SUPPORTS_AVX2),11) +CFLAGS += -mavx2 endif -# Cross compilation -else ifneq ($(findstring x86_64, $(CROSS_PREFIX)),) - CFLAGS += -mavx2 -mbmi2 -mpopcnt -maes - CFLAGS += -DMLK_FORCE_X86_64 -else ifneq ($(findstring aarch64_be, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_AARCH64_EB -else ifneq ($(findstring aarch64, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_AARCH64 -else ifneq ($(findstring riscv64, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_RISCV64 -else ifneq ($(findstring riscv32, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_RISCV32 -else ifneq ($(findstring powerpc64le, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_PPC64LE -else ifneq ($(findstring ppc64le, $(CROSS_PREFIX)),) - CFLAGS += -DMLK_FORCE_PPC64LE + +ifeq ($(MK_COMPILER_SUPPORTS_BMI2)$(MK_HOST_SUPPORTS_BMI2),11) +CFLAGS += -mbmi2 endif +endif # x86_64 + +# AArch64 CFLAGS configuration +ifeq ($(ARCH),aarch64) +CFLAGS += -DMLK_FORCE_AARCH64 + +# Add SHA3 flags only if both compiler and host support it +ifeq ($(MK_COMPILER_SUPPORTS_SHA3)$(MK_HOST_SUPPORTS_SHA3),11) +CFLAGS += -march=armv8.4-a+sha3 +endif +endif # aarch64 + +# AArch64 Big Endian CFLAGS configuration +ifeq ($(ARCH),aarch64_be) +CFLAGS += -DMLK_FORCE_AARCH64_EB +endif # aarch64_be + +# RISC-V 64-bit CFLAGS configuration +ifeq ($(ARCH),riscv64) +CFLAGS += -DMLK_FORCE_RISCV64 +endif # riscv64 + +# RISC-V 32-bit CFLAGS configuration +ifeq 
($(ARCH),riscv32) +CFLAGS += -DMLK_FORCE_RISCV32 +endif # riscv32 + +# PowerPC 64-bit Little Endian CFLAGS configuration +ifeq ($(ARCH),powerpc64le) +CFLAGS += -DMLK_FORCE_PPC64LE +endif # powerpc64le + +endif # AUTO=1 + +endif # _AUTO_MK diff --git a/test/mk/components.mk b/test/mk/components.mk index 77f9f32126..f3b1f959d5 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -12,7 +12,7 @@ ifeq ($(OPT),1) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif -ALL_TESTS = test_mlkem acvp_mlkem bench_mlkem bench_components_mlkem gen_KAT +ALL_TESTS = test_mlkem acvp_mlkem bench_mlkem bench_components_mlkem gen_KAT test_stack MLKEM512_DIR = $(BUILD_DIR)/mlkem512 MLKEM768_DIR = $(BUILD_DIR)/mlkem768 @@ -25,6 +25,9 @@ $(MLKEM768_OBJS): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=768 MLKEM1024_OBJS = $(call MAKE_OBJS,$(MLKEM1024_DIR),$(SOURCES) $(FIPS202_SRCS)) $(MLKEM1024_OBJS): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=1024 + + + $(BUILD_DIR)/libmlkem512.a: $(MLKEM512_OBJS) $(BUILD_DIR)/libmlkem768.a: $(MLKEM768_OBJS) $(BUILD_DIR)/libmlkem1024.a: $(MLKEM1024_OBJS) @@ -38,6 +41,10 @@ $(MLKEM512_DIR)/bin/bench_components_mlkem512: CFLAGS += -Itest/hal $(MLKEM768_DIR)/bin/bench_components_mlkem768: CFLAGS += -Itest/hal $(MLKEM1024_DIR)/bin/bench_components_mlkem1024: CFLAGS += -Itest/hal +$(MLKEM512_DIR)/bin/test_stack512: CFLAGS += -Imlkem/src -fstack-usage +$(MLKEM768_DIR)/bin/test_stack768: CFLAGS += -Imlkem/src -fstack-usage +$(MLKEM1024_DIR)/bin/test_stack1024: CFLAGS += -Imlkem/src -fstack-usage + $(MLKEM512_DIR)/bin/bench_mlkem512: $(MLKEM512_DIR)/test/hal/hal.c.o $(MLKEM768_DIR)/bin/bench_mlkem768: $(MLKEM768_DIR)/test/hal/hal.c.o $(MLKEM1024_DIR)/bin/bench_mlkem1024: $(MLKEM1024_DIR)/test/hal/hal.c.o From 5f71ef645f13d401afbd4ecb63320020742fa590 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Thu, 18 Sep 2025 10:52:25 -0500 Subject: [PATCH 07/22] Fixed format and styling by using autogen but no simpasm was run. 
Did some more comments on value bounds in ASM files. Used constants array instead of embedded data in assembly files. Autogen was run under nix env on Mac. Tests were run under HW p10. [05:52] danny@ltcden12-lp4 mlkem_dev % ./scripts/tests func INFO > Functional Test Compile (native no_opt): make func OPT=0 AUTO=1 -j8 INFO > Functional Test ML-KEM-512 (native no_opt): make run_func_512 -j8 INFO > Functional Test ML-KEM-768 (native no_opt): make run_func_768 -j8 INFO > Functional Test ML-KEM-1024 (native no_opt): make run_func_1024 -j8 INFO > Functional Test Compile (native opt): make func OPT=1 AUTO=1 -j8 INFO > Functional Test ML-KEM-512 (native opt): make run_func_512 -j8 INFO > Functional Test ML-KEM-768 (native opt): make run_func_768 -j8 INFO > Functional Test ML-KEM-1024 (native opt): make run_func_1024 -j8 All good! Signed-off-by: Danny Tsen --- BIBLIOGRAPHY.md | 1 + dev/ppc64le/meta.h | 34 +-- dev/ppc64le/src/arith_native_ppc64le.h | 15 +- dev/ppc64le/src/consts.c | 155 +++++++++++ dev/ppc64le/src/consts.h | 26 ++ dev/ppc64le/src/intt_ppc.S | 239 +++++------------ dev/ppc64le/src/ntt_ppc.S | 188 ++++--------- dev/ppc64le/src/poly_tomont.S | 36 ++- dev/ppc64le/src/reduce.S | 48 ++-- integration/liboqs/ML-KEM-1024_META.yml | 8 +- integration/liboqs/ML-KEM-512_META.yml | 8 +- integration/liboqs/ML-KEM-768_META.yml | 8 +- mlkem/mlkem_native.S | 27 ++ mlkem/mlkem_native.c | 27 ++ mlkem/src/native/meta.h | 2 +- mlkem/src/native/ppc64le/meta.h | 30 ++- .../native/ppc64le/src/arith_native_ppc64le.h | 11 +- mlkem/src/native/ppc64le/src/consts.c | 155 +++++++++++ mlkem/src/native/ppc64le/src/consts.h | 26 ++ mlkem/src/native/ppc64le/src/intt_ppc.S | 252 ++++++------------ mlkem/src/native/ppc64le/src/ntt_ppc.S | 192 ++++--------- mlkem/src/native/ppc64le/src/poly_tomont.S | 38 +-- mlkem/src/native/ppc64le/src/reduce.S | 55 ++-- 23 files changed, 831 insertions(+), 750 deletions(-) create mode 100644 dev/ppc64le/src/consts.c create mode 100644 dev/ppc64le/src/consts.h 
create mode 100644 mlkem/src/native/ppc64le/src/consts.c create mode 100644 mlkem/src/native/ppc64le/src/consts.h diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index ba4ff97185..e8c0bca7b4 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -28,6 +28,7 @@ source code and documentation. * Referenced from: - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) + - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) - [integration/liboqs/config_x86_64.h](integration/liboqs/config_x86_64.h) - [mlkem/src/config.h](mlkem/src/config.h) - [mlkem/src/kem.c](mlkem/src/kem.c) diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index bee788976b..34f8cbec66 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -3,8 +3,8 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -#ifndef MLK_NATIVE_PPC64LE_META_H -#define MLK_NATIVE_PPC64LE_META_H +#ifndef MLK_DEV_PPC64LE_META_H +#define MLK_DEV_PPC64LE_META_H /* Identifier for this backend so that source and assembly files * in the build can be appropriately guarded. 
*/ @@ -25,25 +25,29 @@ #include "../api.h" #include "src/arith_native_ppc64le.h" -static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { - mlk_ntt_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { - mlk_intt_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { - mlk_reduce_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { - mlk_poly_tomont_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } #endif /* !__ASSEMBLER__ */ -#endif /* MLK_NATIVE_PPC64LE_META_H */ +#endif /* !MLK_DEV_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h index 57f0b8f8ce..1c75346689 100644 --- a/dev/ppc64le/src/arith_native_ppc64le.h +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -2,22 +2,23 @@ * Copyright (c) 2024-2025 The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 */ -#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H -#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H #include #include "../../../common.h" +#include "consts.h" #define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) -void mlk_ntt_ppc(int16_t *); +void mlk_ntt_ppc(int16_t *, const int16_t *); #define 
mlk_intt_ppc MLK_NAMESPACE(intt_ppc) -void mlk_intt_ppc(int16_t *); +void mlk_intt_ppc(int16_t *, const int16_t *); #define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) -void mlk_reduce_ppc(int16_t *r); +void mlk_reduce_ppc(int16_t *r, const int16_t *); #define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) -void mlk_poly_tomont_ppc(int16_t *); +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); -#endif /* MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ +#endif /* !MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c new file mode 100644 index 0000000000..4c2fbdf61a --- /dev/null +++ b/dev/ppc64le/src/consts.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 
182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 
107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, 
-308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 
448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 
264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h new file mode 100644 index 0000000000..d424601ac1 --- /dev/null +++ b/dev/ppc64le/src/consts.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_SRC_CONSTS_H +#define MLK_DEV_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_NTT_OFFSET64 1104 +#define IZETA_NTT_OFFSET127 1616 +#define IZETA_NTT_OFFSET63 2128 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index feb78b984e..1f4b48e42e 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -11,13 +11,18 @@ # #include 
"../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" .machine "any" .text # Barrett reduce constatnts #define V20159 0 -#define V_25 1 +#define V_25 1 #define V_26 2 #define V_MKQ 3 @@ -29,11 +34,11 @@ #define V_Z2 9 #define V_Z3 10 #define V_ZETA 10 -#define V1441 10 +#define V1441 10 .macro Load_4Coeffs start next step mr 9, \start # j - add 10, 4, 9 # J + len*2 + add 10, 7, 9 # J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next @@ -73,6 +78,8 @@ xxlor 32+3, 6, 6 # V_MKQ xxlor 32+1, 7, 7 # V_25 xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. vmulosh 6, 8, V20159 vmulesh 5, 8, V20159 vmulosh 11, 12, V20159 @@ -97,6 +104,8 @@ vadduwm 14, 14, V_25 vadduwm 17, 17, V_25 vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value vsraw 4, 4, V_26 vsraw 5, 5, V_26 vsraw 9, 9, V_26 @@ -113,6 +122,8 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. 
vmladduhm \_v0, 4, V_MKQ, 8 vmladduhm \_v1, 9, V_MKQ, 12 vmladduhm \_v2, 13, V_MKQ, 16 @@ -123,11 +134,13 @@ # MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + # Modular multification bond by 2^16 * q in abs value vmladduhm 15, 25, \_vz0, 3 vmladduhm 20, 26, \_vz1, 3 vmladduhm 27, 30, \_vz2, 3 vmladduhm 28, 31, \_vz3, 3 + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 25, \_vz0, 3 vmhraddshs 19, 26, \_vz1, 3 vmhraddshs 24, 30, \_vz2, 3 @@ -265,50 +278,41 @@ MLK_ASM_FN_SYMBOL(intt_ppc) # init vectors and constants # Setup for Montgomery reduce - addis 8,2,.nmkq@toc@ha - addi 8,8,.nmkq@toc@l - lxv 0, 0(8) + lxv 0, 0(4) - lxv 32+V_QINV, 16(8) # QINV + lxv 32+V_QINV, QINV_OFFSET(4) # QINV xxlxor 32+3, 32+3, 32+3 vspltish 4, 1 - xxlor 2, 32+2, 32+2 - xxlor 3, 32+3, 32+3 - xxlor 4, 32+4, 32+4 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 # Setup for Barrett reduce - addis 8,2,.mkq@toc@ha - addi 8,8,.mkq@toc@l - addis 9,2,.C20159@toc@ha - addi 9,9,.C20159@toc@l - addis 10,2,.C25@toc@ha - addi 10,10,.C25@toc@l - - lxv 6, 0(8) # V_MKQ - lxv 32+0, 0(9) # V20159 - lxv 7, 0(10) # V_25 + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # V_25 #xxspltiw 8, 26 # for power9 and above vspltisw 8, 13 vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 - # zetas array - #addis 14,2,.izeta63@toc@ha - #addi 14,14,.izeta63@toc@l + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 .align 4 -__Len2: +#__Len2: # # 1. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas - addis 14,2,.izeta127@toc@ha - addi 14,14,.izeta127@toc@l - li 4, 4 + addi 14, 4, IZETA_NTT_OFFSET127 + li 7, 4 li 15, 4 mtctr 15 li 5, 0 -__Loop2: +intt_ppc__Loop2: Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 xxlor 10, 32+4, 32+4 @@ -333,19 +337,18 @@ __Loop2: MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 Write_Len2_4C 32+13, 32+18, 32+23, 32+28 addi 5, 5, 64 - bdnz __Loop2 + bdnz intt_ppc__Loop2 .align 4 -__Len4: +#__Len4: # # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addis 14,2,.izeta63@toc@ha - addi 14,14,.izeta63@toc@l + addi 14, 4, IZETA_NTT_OFFSET63 li 5, 0 - li 4, 8 + li 7, 8 li 15, 4 # loops mtctr 15 -__Loop4: +intt_ppc__Loop4: Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 xxlor 10, 32+4, 32+4 @@ -369,13 +372,13 @@ __Loop4: MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 Write_Len4_4C 32+13, 32+18, 32+23, 32+28 addi 5, 5, 64 - bdnz __Loop4 + bdnz intt_ppc__Loop4 .align 4 -__Len8: +#__Len8: # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 #addi 14, 14, 512 - li 4, 16 + li 7, 16 li 5, 0 Load_4Coeffs 5, 32, 32 @@ -414,12 +417,12 @@ __Len8: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len16: +#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 #addi 14, 14, 768 li 5, 0 - li 4, 32 + li 7, 32 Load_4Coeffs 5, 64, 64 BREDUCE_4X 4, 9, 13, 17 @@ -458,12 +461,12 @@ __Len16: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len32: +#__Len32: # # 5. len = 32, start = 0, 64, 128, 192 #addi 14, 14, 896 li 5, 0 - li 4, 64 + li 7, 64 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 @@ -505,12 +508,12 @@ __Len32: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len64: +#__Len64: # # 6. len = 64, start = 0, 128 #addi 14, 14, 960 li 5, 0 - li 4, 128 + li 7, 128 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 Write_B4C 32+4, 32+9, 32+13, 32+17 @@ -549,12 +552,12 @@ __Len64: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len128: +#__Len128: # 7. 
len = 128, start = 0 # #addi 14, 14, 992 li 5, 0 # start - li 4, 256 # len * 2 + li 7, 256 # len * 2 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 @@ -596,9 +599,8 @@ __Len128: # # Montgomery reduce loops with constant 1441 # - addis 10,2,.C1441@toc@ha - addi 10,10,.C1441@toc@l - lvx V1441, 0, 10 + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 Reload_4coeffs MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 @@ -624,7 +626,6 @@ __Len128: MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 -__intt_out: lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) @@ -651,123 +652,21 @@ __intt_out: addi 1, 1, 352 blr -.data -.align 4 -# -MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 - -# MLKEM_Q -.mkq: -.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 - -.C20159: -.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 - -# 0x2000000 -.C25: -.long 33554432, 33554432, 33554432, 33554432 - -.C1441: -.short 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441 - -.align 4 -.izeta127: -.short 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522 -.short -1460, -1460, -1460, -1460, 958, 958, 958, 958 -.short 991, 991, 991, 991, 996, 996, 996, 996 -.short -308, -308, -308, -308, -108, -108, -108, -108 -.short 478, 478, 478, 478, -870, -870, -870, -870 -.short -854, -854, -854, -854, -1510, -1510, -1510, -1510 -.short 794, 794, 794, 794, -1278, -1278, -1278, -1278 -.short -1530, -1530, -1530, -1530, -1185, -1185, -1185, -1185 -.short -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187 -.short 220, 220, 220, 220, -874, -874, -874, -874 -.short -1335, -1335, -1335, -1335, 1218, 1218, 1218, 1218 -.short -136, -136, -136, -136, -1215, -1215, -1215, -1215 -.short 384, 384, 384, 384, -1465, -1465, -1465, -1465 -.short -1285, -1285, -1285, -1285, 1322, 1322, 1322, 1322 -.short 610, 610, 610, 610, 603, 603, 603, 603 -.short 
1097, 1097, 1097, 1097, 817, 817, 817, 817 -.short -75, -75, -75, -75, -156, -156, -156, -156 -.short 329, 329, 329, 329, 418, 418, 418, 418 -.short 349, 349, 349, 349, -872, -872, -872, -872 -.short 644, 644, 644, 644, -1590, -1590, -1590, -1590 -.short 1119, 1119, 1119, 1119, -602, -602, -602, -602 -.short 1483, 1483, 1483, 1483, -777, -777, -777, -777 -.short -147, -147, -147, -147, 1159, 1159, 1159, 1159 -.short 778, 778, 778, 778, -246, -246, -246, -246 -.short 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574 -.short -460, -460, -460, -460, -291, -291, -291, -291 -.short -235, -235, -235, -235, 177, 177, 177, 177 -.short 587, 587, 587, 587, 422, 422, 422, 422 -.short 105, 105, 105, 105, 1550, 1550, 1550, 1550 -.short 871, 871, 871, 871, -1251, -1251, -1251, -1251 -.short 843, 843, 843, 843, 555, 555, 555, 555 -.short 430, 430, 430, 430, -1103, -1103, -1103, -1103 -.izeta63: -.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 -.short 677, 677, 677, 677, 677, 677, 677, 677 -.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 -.short 448, 448, 448, 448, 448, 448, 448, 448 -.short -725, -725, -725, -725, -725, -725, -725, -725 -.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 -.short 961, 961, 961, 961, 961, 961, 961, 961 -.short -398, -398, -398, -398, -398, -398, -398, -398 -.short -951, -951, -951, -951, -951, -951, -951, -951 -.short -247, -247, -247, -247, -247, -247, -247, -247 -.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 -.short 107, 107, 107, 107, 107, 107, 107, 107 -.short 830, 830, 830, 830, 830, 830, 830, 830 -.short -271, -271, -271, -271, -271, -271, -271, -271 -.short -90, -90, -90, -90, -90, -90, -90, -90 -.short -853, -853, -853, -853, -853, -853, -853, -853 -.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 -.short 126, 126, 126, 126, 126, 126, 126, 126 -.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 -.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 -.short -666, 
-666, -666, -666, -666, -666, -666, -666 -.short -320, -320, -320, -320, -320, -320, -320, -320 -.short -8, -8, -8, -8, -8, -8, -8, -8 -.short 516, 516, 516, 516, 516, 516, 516, 516 -.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 -.short -282, -282, -282, -282, -282, -282, -282, -282 -.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 -.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 -.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 -.short -552, -552, -552, -552, -552, -552, -552, -552 -.short 652, 652, 652, 652, 652, 652, 652, 652 -.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 -.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 -.short -205, -205, -205, -205, -205, -205, -205, -205 -.short 411, 411, 411, 411, 411, 411, 411, 411 -.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 -.short 608, 608, 608, 608, 608, 608, 608, 608 -.short 732, 732, 732, 732, 732, 732, 732, 732 -.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 -.short -681, -681, -681, -681, -681, -681, -681, -681 -.short -130, -130, -130, -130, -130, -130, -130, -130 -.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 -.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 -.short -829, -829, -829, -829, -829, -829, -829, -829 -.short 383, 383, 383, 383, 383, 383, 383, 383 -.short 264, 264, 264, 264, 264, 264, 264, 264 -.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 -.short 573, 573, 573, 573, 573, 573, 573, 573 -.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 -.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 -.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 -.short 962, 962, 962, 962, 962, 962, 962, 962 -.short 182, 182, 182, 182, 182, 182, 182, 182 -.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 -.short 622, 622, 622, 622, 622, 622, 622, 622 -.short -171, -171, -171, -171, -171, -171, -171, -171 -.short 202, 202, 202, 202, 202, 202, 202, 202 -.short 287, 287, 287, 
287, 287, 287, 287, 287 -.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 -.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 -.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 -.short -359, -359, -359, -359, -359, -359, -359, -359 -.short -758, -758, -758, -758, -758, -758, -758, -758 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 172fef9cc8..5bc1c34b85 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -11,14 +11,19 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ -#define V_QINV 2 -#define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 -#define V_ZETA 10 +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 .machine "any" .text @@ -33,7 +38,7 @@ # .macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 mr 9, \start - add 10, 4, 9 # J + len*2 + add 10, 7, 9 # J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next @@ -50,11 +55,13 @@ xxpermdi 32+28, 32+28, 32+28, 2 # fqmul = zeta * coefficient + # Modular multification bond by 2^16 * q in abs value vmladduhm 15, 13, \_vz0, 3 vmladduhm 20, 18, \_vz1, 3 vmladduhm 25, 23, \_vz2, 3 vmladduhm 30, 28, \_vz3, 3 + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 13, \_vz0, 3 vmhraddshs 19, 18, \_vz1, 3 vmhraddshs 24, 23, \_vz2, 3 @@ -84,6 +91,9 @@ xxpermdi 
32+22, 32+22, 32+22, 2 xxpermdi 32+27, 32+27, 32+27, 2 + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. + # Finally to complete the final update of the results with add/sub vsubuhm 16, 12, 13 # r - t vadduhm 15, 13, 12 # r + t vsubuhm 21, 17, 18 # r - t @@ -175,20 +185,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) stxv 32+31, 304(1) # get MLKEM_Q - addis 8,2,.nmkq@toc@ha - addi 8,8,.nmkq@toc@l - lvx V_NMKQ,0,8 + lvx V_NMKQ,0,4 # zetas array - addis 14,2,.K1@toc@ha - addi 14,14,.K1@toc@l + addi 14, 4, ZETA_NTT_OFFSET vxor 3, 3, 3 vspltish 4, 1 - lxv 32+V_QINV, 16(8) + + lxv 32+V_QINV, QINV_OFFSET(4) .align 4 -__Len128: +#__Len128: # # Compute coefficients of the NTT based on the following loop. # for (len = 128; len ≥ 2; len = len/2) @@ -196,7 +204,7 @@ __Len128: # 1. len = 128, start = 0 # li 5, 0 # start - li 4, 256 # len * 2 + li 7, 256 # len * 2 lvx V_ZETA, 0, 14 addi 14, 14, 16 @@ -213,12 +221,12 @@ __Len128: Write_One .align 4 -__Len64: +#__Len64: # # 2. len = 64, start = 0, 128 # k += 2 li 5, 0 - li 4, 128 + li 7, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -237,12 +245,12 @@ __Len64: Write_One .align 4 -__Len32: +#__Len32: # # 3. len = 32, start = 0, 64, 128, 192 # k += 4 li 5, 0 - li 4, 64 + li 7, 64 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -270,12 +278,12 @@ __Len32: Write_One .align 4 -__Len16: +#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 # k += 8 li 5, 0 - li 4, 32 + li 7, 32 Load_next_4zetas MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One @@ -292,12 +300,12 @@ __Len16: Write_One .align 4 -__Len8: +#__Len8: # # 5. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 # k += 16 li 5, 0 - li 4, 16 + li 7, 16 Load_next_4zetas MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One @@ -323,9 +331,9 @@ __Len8: li 15, 4 # loops mtctr 15 li 5, 0 - li 4, 8 + li 7, 8 .align 4 -__Len4: +ntt_ppc__Len4: Load_next_4zetas MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 Write_Two @@ -336,21 +344,21 @@ __Len4: Write_Two addi 5, 5, 64 - bdnz __Len4 + bdnz ntt_ppc__Len4 # # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # k += 64 # Update zetas vectors, each vector has 2 zetas - addis 14,2,.K64@toc@ha - addi 14,14,.K64@toc@l + + addi 14, 4, ZETA_NTT_OFFSET64 li 15, 4 mtctr 15 li 5, 0 - li 4, 4 + li 7, 4 .align 4 -__Len2: +ntt_ppc__Len2: Load_next_4zetas MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 Write_Three @@ -361,9 +369,8 @@ __Len2: Write_Three addi 5, 5, 64 - bdnz __Len2 + bdnz ntt_ppc__Len2 -__ntt_out: lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) @@ -390,109 +397,12 @@ __ntt_out: addi 1, 1, 352 blr -.data -.align 4 -# -MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 - -# zetas -.K1: -.short -758, -758, -758, -758, -758, -758, -758, -758 -.short -359, -359, -359, -359, -359, -359, -359, -359 -.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 -.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 -.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 -.short 287, 287, 287, 287, 287, 287, 287, 287 -.short 202, 202, 202, 202, 202, 202, 202, 202 -.short -171, -171, -171, -171, -171, -171, -171, -171 -.short 622, 622, 622, 622, 622, 622, 622, 622 -.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 -.short 182, 182, 182, 182, 182, 182, 182, 182 -.short 962, 962, 962, 962, 962, 962, 962, 962 -.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 -.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 -.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 -.short 
573, 573, 573, 573, 573, 573, 573, 573 -.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 -.short 264, 264, 264, 264, 264, 264, 264, 264 -.short 383, 383, 383, 383, 383, 383, 383, 383 -.short -829, -829, -829, -829, -829, -829, -829, -829 -.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 -.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 -.short -130, -130, -130, -130, -130, -130, -130, -130 -.short -681, -681, -681, -681, -681, -681, -681, -681 -.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 -.short 732, 732, 732, 732, 732, 732, 732, 732 -.short 608, 608, 608, 608, 608, 608, 608, 608 -.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 -.short 411, 411, 411, 411, 411, 411, 411, 411 -.short -205, -205, -205, -205, -205, -205, -205, -205 -.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 -.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 -.short 652, 652, 652, 652, 652, 652, 652, 652 -.short -552, -552, -552, -552, -552, -552, -552, -552 -.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 -.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 -.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 -.short -282, -282, -282, -282, -282, -282, -282, -282 -.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 -.short 516, 516, 516, 516, 516, 516, 516, 516 -.short -8, -8, -8, -8, -8, -8, -8, -8 -.short -320, -320, -320, -320, -320, -320, -320, -320 -.short -666, -666, -666, -666, -666, -666, -666, -666 -.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 -.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 -.short 126, 126, 126, 126, 126, 126, 126, 126 -.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 -.short -853, -853, -853, -853, -853, -853, -853, -853 -.short -90, -90, -90, -90, -90, -90, -90, -90 -.short -271, -271, -271, -271, -271, -271, -271, -271 -.short 830, 830, 830, 830, 830, 830, 830, 830 -.short 107, 107, 107, 107, 107, 107, 107, 107 -.short -1421, 
-1421, -1421, -1421, -1421, -1421, -1421, -1421 -.short -247, -247, -247, -247, -247, -247, -247, -247 -.short -951, -951, -951, -951, -951, -951, -951, -951 -.short -398, -398, -398, -398, -398, -398, -398, -398 -.short 961, 961, 961, 961, 961, 961, 961, 961 -.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 -.short -725, -725, -725, -725, -725, -725, -725, -725 -.short 448, 448, 448, 448, 448, 448, 448, 448 -.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 -.short 677, 677, 677, 677, 677, 677, 677, 677 -.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 -.K64: -.short -1103, -1103, -1103, -1103, 430, 430, 430, 430 -.short 555, 555, 555, 555, 843, 843, 843, 843 -.short -1251, -1251, -1251, -1251, 871, 871, 871, 871 -.short 1550, 1550, 1550, 1550, 105, 105, 105, 105 -.short 422, 422, 422, 422, 587, 587, 587, 587 -.short 177, 177, 177, 177, -235, -235, -235, -235 -.short -291, -291, -291, -291, -460, -460, -460, -460 -.short 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653 -.short -246, -246, -246, -246, 778, 778, 778, 778 -.short 1159, 1159, 1159, 1159, -147, -147, -147, -147 -.short -777, -777, -777, -777, 1483, 1483, 1483, 1483 -.short -602, -602, -602, -602, 1119, 1119, 1119, 1119 -.short -1590, -1590, -1590, -1590, 644, 644, 644, 644 -.short -872, -872, -872, -872, 349, 349, 349, 349 -.short 418, 418, 418, 418, 329, 329, 329, 329 -.short -156, -156, -156, -156, -75, -75, -75, -75 -.short 817, 817, 817, 817, 1097, 1097, 1097, 1097 -.short 603, 603, 603, 603, 610, 610, 610, 610 -.short 1322, 1322, 1322, 1322, -1285, -1285, -1285, -1285 -.short -1465, -1465, -1465, -1465, 384, 384, 384, 384 -.short -1215, -1215, -1215, -1215, -136, -136, -136, -136 -.short 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335 -.short -874, -874, -874, -874, 220, 220, 220, 220 -.short -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659 -.short -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530 -.short -1278, -1278, -1278, -1278, 794, 794, 794, 
794 -.short -1510, -1510, -1510, -1510, -854, -854, -854, -854 -.short -870, -870, -870, -870, 478, 478, 478, 478 -.short -108, -108, -108, -108, -308, -308, -308, -308 -.short 996, 996, 996, 996, 991, 991, 991, 991 -.short 958, 958, 958, 958, -1460, -1460, -1460, -1460 -.short 1522, 1522, 1522, 1522, 1628, 1628, 1628, 1628 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index c07f25c5a8..b7b010aaf1 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -17,8 +17,13 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ -#define V1353 0 +#include "consts.h" + +#define V1353 0 #define V_QINV 2 #define V_NMKQ 5 @@ -98,14 +103,9 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) stxv 32+29, 272(1) stxv 32+30, 288(1) - addis 9,2,.nmkq@toc@ha - addi 9,9,.nmkq@toc@l - addis 10,2,.C1353@toc@ha - addi 10,10,.C1353@toc@l - - lxv 32+V_NMKQ,0(9) - lxv 32+V_QINV,16(9) - lxv 32+V1353,0(10) + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) vxor 3, 3, 3 vspltish 4, 1 @@ -150,14 +150,12 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) addi 1, 1, 320 blr -.data -.align 4 -# -MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 - -.C1353: -.short 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V1353 +#undef V_QINV +#undef V_NMKQ +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index ee8e1fdca1..dfb6343929 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -18,10 +18,15 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" # Barrett reduce constatnts #define V20159 0 -#define V_25 1 +#define V_25 1 #define V_26 2 #define V_MKQ 3 @@ -136,18 +141,16 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) stxv 32+23, 176(1) stxv 32+24, 192(1) - addis 8,2,.mkq@toc@ha - addi 8,8,.mkq@toc@l - addis 9,2,.C20159@toc@ha - addi 9,9,.C20159@toc@l - addis 10,2,.C25@toc@ha - addi 10,10,.C25@toc@l - vxor 7, 7, 7 - lxv 32+V_MKQ, 0(8) - lxv 32+V20159, 0(9) - lxv 32+V_25, 0(10) + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 li 4, -128 li 5, -112 @@ -162,9 +165,6 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) li 15, 32 li 16, 48 - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - BREDUCE_4X 21, 22, 23, 24 BREDUCE_4X 4, 9, 13, 17 Write_8X @@ -211,15 +211,13 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) addi 1, 1, 224 blr -.align 4 -.data -# MLKEM_Q -.mkq: -.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 - -.C20159: -.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ -# 0x2000000 -.C25: -.long 33554432, 33554432, 33554432, 33554432 +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index c3ffce4e64..9c7fe672ab 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -97,7 +97,13 @@ implementations: signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le supported_platforms: - architecture: ppc64le operating_systems: diff --git a/integration/liboqs/ML-KEM-512_META.yml 
b/integration/liboqs/ML-KEM-512_META.yml index c5fb05e60c..f46dbfdbf1 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -97,7 +97,13 @@ implementations: signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le supported_platforms: - architecture: ppc64le operating_systems: diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 80b05ba45a..1b01c4d426 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -97,7 +97,13 @@ implementations: signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand signature_enc: 
PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le supported_platforms: - architecture: ppc64le operating_systems: diff --git a/mlkem/mlkem_native.S b/mlkem/mlkem_native.S index b74591221f..a129407856 100644 --- a/mlkem/mlkem_native.S +++ b/mlkem/mlkem_native.S @@ -457,6 +457,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc 
+#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 51bc1e33e6..18501942ed 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -444,6 +444,33 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef IZETA_NTT_OFFSET127 +#undef IZETA_NTT_OFFSET63 +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_NTT_OFFSET +#undef ZETA_NTT_OFFSET64 +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index 7fdcd6fcfa..e391883231 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -20,6 +20,6 @@ #ifdef MLK_SYS_PPC64LE #include "ppc64le/meta.h" -#endif /* MLK_SYS_PPC64LE */ +#endif #endif /* !MLK_NATIVE_META_H */ diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h 
index bee788976b..54b3ddd9c6 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -25,25 +25,29 @@ #include "../api.h" #include "src/arith_native_ppc64le.h" -static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { - mlk_ntt_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { - mlk_intt_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { - mlk_reduce_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } -static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { - mlk_poly_tomont_ppc(data); - return MLK_NATIVE_FUNC_SUCCESS; +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; } #endif /* !__ASSEMBLER__ */ -#endif /* MLK_NATIVE_PPC64LE_META_H */ +#endif /* !MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h index 57f0b8f8ce..dbcee3e3ee 100644 --- a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -7,17 +7,18 @@ #include #include "../../../common.h" +#include "consts.h" #define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) -void mlk_ntt_ppc(int16_t *); +void mlk_ntt_ppc(int16_t *, const int16_t *); #define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) -void mlk_intt_ppc(int16_t *); +void mlk_intt_ppc(int16_t *, const int16_t *); #define mlk_reduce_ppc 
MLK_NAMESPACE(reduce_ppc) -void mlk_reduce_ppc(int16_t *r); +void mlk_reduce_ppc(int16_t *r, const int16_t *); #define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) -void mlk_poly_tomont_ppc(int16_t *); +void mlk_poly_tomont_ppc(int16_t *, const int16_t *); -#endif /* MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ +#endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c new file mode 100644 index 0000000000..4c2fbdf61a --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { + /* -Q */ + -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, + /* QINV */ + -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327, + /* Q */ + 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329, + /* const 20159 for reduce.S and intt */ + 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159, + /* const 1441 for intt */ + 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441, + /* for poly_tomont.S */ + 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353, + /* zetas */ + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, 
-1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, + 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, + -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, + 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, + 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, + -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, + 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, + -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, + -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, + -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, + -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, + -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, + 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, + -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, 
-247, -247, + -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, + -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, + 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, + -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, + -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, + 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, + /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ + -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, + 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, + 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, + 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, + -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, + -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, + -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, + -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, + 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, + 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, + 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, + 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, + 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, + 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, + 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, + -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, + -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, + -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, + -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, + 958, 958, -1460, -1460, 
-1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, + 1628, 1628, + /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ + 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, + 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, + -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, + -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, + 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, + -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, + 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, + 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, + 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, + 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, + 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, + 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, + -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, + 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, + -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, + -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, + -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, + 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, + 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, + 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, + -1103, + /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ + -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, + 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, + -725, -725, -1508, -1508, -1508, -1508, -1508, 
-1508, -1508, -1508, 961, + 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, + -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, + -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, + -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, + 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, + -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, + -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, + 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, + -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, + -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, + 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, + 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, + 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, + -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, + -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, + 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, + 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, + -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, + -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, + 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, + -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, + 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 
-1325, 573, + 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, + -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, + 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, + -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, + 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, + -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, + -359, -758, -758, -758, -758, -758, -758, -758, -758}; + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h new file mode 100644 index 0000000000..49f519d0c3 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#define MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_NTT_OFFSET64 1104 +#define IZETA_NTT_OFFSET127 1616 +#define IZETA_NTT_OFFSET63 2128 + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index feb78b984e..1a4975ba0e 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -11,13 +11,17 @@ # #include 
"../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" .machine "any" .text # Barrett reduce constatnts #define V20159 0 -#define V_25 1 +#define V_25 1 #define V_26 2 #define V_MKQ 3 @@ -29,11 +33,11 @@ #define V_Z2 9 #define V_Z3 10 #define V_ZETA 10 -#define V1441 10 +#define V1441 10 .macro Load_4Coeffs start next step mr 9, \start # j - add 10, 4, 9 # J + len*2 + add 10, 7, 9 # J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next @@ -73,6 +77,8 @@ xxlor 32+3, 6, 6 # V_MKQ xxlor 32+1, 7, 7 # V_25 xxlor 32+2, 8, 8 # V_26 + # Multify Odd/Even signed halfword; + # Results word bound by 2^32 in abs value. vmulosh 6, 8, V20159 vmulesh 5, 8, V20159 vmulosh 11, 12, V20159 @@ -97,6 +103,8 @@ vadduwm 14, 14, V_25 vadduwm 17, 17, V_25 vadduwm 18, 18, V_25 + # Right shift and pack lower halfword, + # results bond to 2^16 in abs value vsraw 4, 4, V_26 vsraw 5, 5, V_26 vsraw 9, 9, V_26 @@ -113,6 +121,8 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 + # Modulo multify-Low unsigned halfword; + # results bond to 2^16 * q in abs value. 
vmladduhm \_v0, 4, V_MKQ, 8 vmladduhm \_v1, 9, V_MKQ, 12 vmladduhm \_v2, 13, V_MKQ, 16 @@ -123,11 +133,13 @@ # MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + # Modular multification bond by 2^16 * q in abs value vmladduhm 15, 25, \_vz0, 3 vmladduhm 20, 26, \_vz1, 3 vmladduhm 27, 30, \_vz2, 3 vmladduhm 28, 31, \_vz3, 3 + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 25, \_vz0, 3 vmhraddshs 19, 26, \_vz1, 3 vmhraddshs 24, 30, \_vz2, 3 @@ -265,50 +277,41 @@ MLK_ASM_FN_SYMBOL(intt_ppc) # init vectors and constants # Setup for Montgomery reduce - addis 8,2,.nmkq@toc@ha - addi 8,8,.nmkq@toc@l - lxv 0, 0(8) + lxv 0, 0(4) - lxv 32+V_QINV, 16(8) # QINV + lxv 32+V_QINV, QINV_OFFSET(4) # QINV xxlxor 32+3, 32+3, 32+3 vspltish 4, 1 - xxlor 2, 32+2, 32+2 - xxlor 3, 32+3, 32+3 - xxlor 4, 32+4, 32+4 + xxlor 2, 32+2, 32+2 # QINV + xxlor 3, 32+3, 32+3 # 0 + xxlor 4, 32+4, 32+4 # 1 # Setup for Barrett reduce - addis 8,2,.mkq@toc@ha - addi 8,8,.mkq@toc@l - addis 9,2,.C20159@toc@ha - addi 9,9,.C20159@toc@l - addis 10,2,.C25@toc@ha - addi 10,10,.C25@toc@l - - lxv 6, 0(8) # V_MKQ - lxv 32+0, 0(9) # V20159 - lxv 7, 0(10) # V_25 + lxv 6, Q_OFFSET(4) # V_MKQ + lxv 32+V20159, C20159_OFFSET(4) # V20159 + lxv 7, 0(4) # V_25 #xxspltiw 8, 26 # for power9 and above vspltisw 8, 13 vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 + xxlor 8, 32+8, 32+8 # V_26 store at vs8 - # zetas array - #addis 14,2,.izeta63@toc@ha - #addi 14,14,.izeta63@toc@l + vspltisw 9, 1 + vsubuwm 10, 8, 9 # 25 + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 # V_25 syore at vs7 .align 4 -__Len2: +#__Len2: # # 1. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas - addis 14,2,.izeta127@toc@ha - addi 14,14,.izeta127@toc@l - li 4, 4 + addi 14, 4, IZETA_NTT_OFFSET127 + li 7, 4 li 15, 4 mtctr 15 li 5, 0 -__Loop2: +intt_ppc__Loop2: Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 xxlor 10, 32+4, 32+4 @@ -333,19 +336,18 @@ __Loop2: MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 Write_Len2_4C 32+13, 32+18, 32+23, 32+28 addi 5, 5, 64 - bdnz __Loop2 + bdnz intt_ppc__Loop2 .align 4 -__Len4: +#__Len4: # # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addis 14,2,.izeta63@toc@ha - addi 14,14,.izeta63@toc@l + addi 14, 4, IZETA_NTT_OFFSET63 li 5, 0 - li 4, 8 + li 7, 8 li 15, 4 # loops mtctr 15 -__Loop4: +intt_ppc__Loop4: Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 xxlor 10, 32+4, 32+4 @@ -369,13 +371,13 @@ __Loop4: MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 Write_Len4_4C 32+13, 32+18, 32+23, 32+28 addi 5, 5, 64 - bdnz __Loop4 + bdnz intt_ppc__Loop4 .align 4 -__Len8: +#__Len8: # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 #addi 14, 14, 512 - li 4, 16 + li 7, 16 li 5, 0 Load_4Coeffs 5, 32, 32 @@ -414,12 +416,12 @@ __Len8: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len16: +#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 #addi 14, 14, 768 li 5, 0 - li 4, 32 + li 7, 32 Load_4Coeffs 5, 64, 64 BREDUCE_4X 4, 9, 13, 17 @@ -458,12 +460,12 @@ __Len16: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len32: +#__Len32: # # 5. len = 32, start = 0, 64, 128, 192 #addi 14, 14, 896 li 5, 0 - li 4, 64 + li 7, 64 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 @@ -505,12 +507,12 @@ __Len32: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len64: +#__Len64: # # 6. len = 64, start = 0, 128 #addi 14, 14, 960 li 5, 0 - li 4, 128 + li 7, 128 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 Write_B4C 32+4, 32+9, 32+13, 32+17 @@ -549,12 +551,12 @@ __Len64: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -__Len128: +#__Len128: # 7. 
len = 128, start = 0 # #addi 14, 14, 992 li 5, 0 # start - li 4, 256 # len * 2 + li 7, 256 # len * 2 Load_4Coeffs 5, 16, 16 BREDUCE_4X 4, 9, 13, 17 @@ -596,9 +598,8 @@ __Len128: # # Montgomery reduce loops with constant 1441 # - addis 10,2,.C1441@toc@ha - addi 10,10,.C1441@toc@l - lvx V1441, 0, 10 + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 Reload_4coeffs MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 @@ -624,7 +625,6 @@ __Len128: MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 -__intt_out: lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) @@ -651,123 +651,35 @@ __intt_out: addi 1, 1, 352 blr -.data -.align 4 -# -MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 - -# MLKEM_Q -.mkq: -.short 3329, 3329, 3329, 3329, 3329, 3329, 3329, 3329 - -.C20159: -.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 - -# 0x2000000 -.C25: -.long 33554432, 33554432, 33554432, 33554432 - -.C1441: -.short 1441, 1441, 1441, 1441, 1441, 1441, 1441, 1441 - -.align 4 -.izeta127: -.short 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522 -.short -1460, -1460, -1460, -1460, 958, 958, 958, 958 -.short 991, 991, 991, 991, 996, 996, 996, 996 -.short -308, -308, -308, -308, -108, -108, -108, -108 -.short 478, 478, 478, 478, -870, -870, -870, -870 -.short -854, -854, -854, -854, -1510, -1510, -1510, -1510 -.short 794, 794, 794, 794, -1278, -1278, -1278, -1278 -.short -1530, -1530, -1530, -1530, -1185, -1185, -1185, -1185 -.short -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187 -.short 220, 220, 220, 220, -874, -874, -874, -874 -.short -1335, -1335, -1335, -1335, 1218, 1218, 1218, 1218 -.short -136, -136, -136, -136, -1215, -1215, -1215, -1215 -.short 384, 384, 384, 384, -1465, -1465, -1465, -1465 -.short -1285, -1285, -1285, -1285, 1322, 1322, 1322, 1322 -.short 610, 610, 610, 610, 603, 603, 603, 603 -.short 
1097, 1097, 1097, 1097, 817, 817, 817, 817 -.short -75, -75, -75, -75, -156, -156, -156, -156 -.short 329, 329, 329, 329, 418, 418, 418, 418 -.short 349, 349, 349, 349, -872, -872, -872, -872 -.short 644, 644, 644, 644, -1590, -1590, -1590, -1590 -.short 1119, 1119, 1119, 1119, -602, -602, -602, -602 -.short 1483, 1483, 1483, 1483, -777, -777, -777, -777 -.short -147, -147, -147, -147, 1159, 1159, 1159, 1159 -.short 778, 778, 778, 778, -246, -246, -246, -246 -.short 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574 -.short -460, -460, -460, -460, -291, -291, -291, -291 -.short -235, -235, -235, -235, 177, 177, 177, 177 -.short 587, 587, 587, 587, 422, 422, 422, 422 -.short 105, 105, 105, 105, 1550, 1550, 1550, 1550 -.short 871, 871, 871, 871, -1251, -1251, -1251, -1251 -.short 843, 843, 843, 843, 555, 555, 555, 555 -.short 430, 430, 430, 430, -1103, -1103, -1103, -1103 -.izeta63: -.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 -.short 677, 677, 677, 677, 677, 677, 677, 677 -.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 -.short 448, 448, 448, 448, 448, 448, 448, 448 -.short -725, -725, -725, -725, -725, -725, -725, -725 -.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 -.short 961, 961, 961, 961, 961, 961, 961, 961 -.short -398, -398, -398, -398, -398, -398, -398, -398 -.short -951, -951, -951, -951, -951, -951, -951, -951 -.short -247, -247, -247, -247, -247, -247, -247, -247 -.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 -.short 107, 107, 107, 107, 107, 107, 107, 107 -.short 830, 830, 830, 830, 830, 830, 830, 830 -.short -271, -271, -271, -271, -271, -271, -271, -271 -.short -90, -90, -90, -90, -90, -90, -90, -90 -.short -853, -853, -853, -853, -853, -853, -853, -853 -.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 -.short 126, 126, 126, 126, 126, 126, 126, 126 -.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 -.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 -.short -666, 
-666, -666, -666, -666, -666, -666, -666 -.short -320, -320, -320, -320, -320, -320, -320, -320 -.short -8, -8, -8, -8, -8, -8, -8, -8 -.short 516, 516, 516, 516, 516, 516, 516, 516 -.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 -.short -282, -282, -282, -282, -282, -282, -282, -282 -.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 -.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 -.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 -.short -552, -552, -552, -552, -552, -552, -552, -552 -.short 652, 652, 652, 652, 652, 652, 652, 652 -.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 -.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 -.short -205, -205, -205, -205, -205, -205, -205, -205 -.short 411, 411, 411, 411, 411, 411, 411, 411 -.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 -.short 608, 608, 608, 608, 608, 608, 608, 608 -.short 732, 732, 732, 732, 732, 732, 732, 732 -.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 -.short -681, -681, -681, -681, -681, -681, -681, -681 -.short -130, -130, -130, -130, -130, -130, -130, -130 -.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 -.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 -.short -829, -829, -829, -829, -829, -829, -829, -829 -.short 383, 383, 383, 383, 383, 383, 383, 383 -.short 264, 264, 264, 264, 264, 264, 264, 264 -.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 -.short 573, 573, 573, 573, 573, 573, 573, 573 -.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 -.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 -.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 -.short 962, 962, 962, 962, 962, 962, 962, 962 -.short 182, 182, 182, 182, 182, 182, 182, 182 -.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 -.short 622, 622, 622, 622, 622, 622, 622, 622 -.short -171, -171, -171, -171, -171, -171, -171, -171 -.short 202, 202, 202, 202, 202, 202, 202, 202 -.short 287, 287, 287, 
287, 287, 287, 287, 287 -.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 -.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 -.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 -.short -359, -359, -359, -359, -359, -359, -359, -359 -.short -758, -758, -758, -758, -758, -758, -758, -758 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 172fef9cc8..e9a8df81f6 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -11,14 +11,18 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#define V_QINV 2 -#define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 -#define V_ZETA 10 +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 .machine "any" .text @@ -33,7 +37,7 @@ # .macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 mr 9, \start - add 10, 4, 9 # J + len*2 + add 10, 7, 9 # J + len*2 addi 16, 9, \next addi 17, 10, \step addi 18, 16, \next @@ -50,11 +54,13 @@ xxpermdi 32+28, 32+28, 32+28, 2 # fqmul = zeta * coefficient + # Modular multification bond by 
2^16 * q in abs value vmladduhm 15, 13, \_vz0, 3 vmladduhm 20, 18, \_vz1, 3 vmladduhm 25, 23, \_vz2, 3 vmladduhm 30, 28, \_vz3, 3 + # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value vmhraddshs 14, 13, \_vz0, 3 vmhraddshs 19, 18, \_vz1, 3 vmhraddshs 24, 23, \_vz2, 3 @@ -84,6 +90,9 @@ xxpermdi 32+22, 32+22, 32+22, 2 xxpermdi 32+27, 32+27, 32+27, 2 + # Since the result of the Montgomery multiplication is bounded + # by q in absolute value. + # Finally to complete the final update of the results with add/sub vsubuhm 16, 12, 13 # r - t vadduhm 15, 13, 12 # r + t vsubuhm 21, 17, 18 # r - t @@ -175,20 +184,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) stxv 32+31, 304(1) # get MLKEM_Q - addis 8,2,.nmkq@toc@ha - addi 8,8,.nmkq@toc@l - lvx V_NMKQ,0,8 + lvx V_NMKQ,0,4 # zetas array - addis 14,2,.K1@toc@ha - addi 14,14,.K1@toc@l + addi 14, 4, ZETA_NTT_OFFSET vxor 3, 3, 3 vspltish 4, 1 - lxv 32+V_QINV, 16(8) + + lxv 32+V_QINV, QINV_OFFSET(4) .align 4 -__Len128: +#__Len128: # # Compute coefficients of the NTT based on the following loop. # for (len = 128; len ≥ 2; len = len/2) @@ -196,7 +203,7 @@ __Len128: # 1. len = 128, start = 0 # li 5, 0 # start - li 4, 256 # len * 2 + li 7, 256 # len * 2 lvx V_ZETA, 0, 14 addi 14, 14, 16 @@ -213,12 +220,12 @@ __Len128: Write_One .align 4 -__Len64: +#__Len64: # # 2. len = 64, start = 0, 128 # k += 2 li 5, 0 - li 4, 128 + li 7, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -237,12 +244,12 @@ __Len64: Write_One .align 4 -__Len32: +#__Len32: # # 3. len = 32, start = 0, 64, 128, 192 # k += 4 li 5, 0 - li 4, 64 + li 7, 64 lvx V_ZETA, 0, 14 addi 14, 14, 16 MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -270,12 +277,12 @@ __Len32: Write_One .align 4 -__Len16: +#__Len16: # # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 # k += 8 li 5, 0 - li 4, 32 + li 7, 32 Load_next_4zetas MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One @@ -292,12 +299,12 @@ __Len16: Write_One .align 4 -__Len8: +#__Len8: # # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 # k += 16 li 5, 0 - li 4, 16 + li 7, 16 Load_next_4zetas MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One @@ -323,9 +330,9 @@ __Len8: li 15, 4 # loops mtctr 15 li 5, 0 - li 4, 8 + li 7, 8 .align 4 -__Len4: +ntt_ppc__Len4: Load_next_4zetas MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 Write_Two @@ -336,21 +343,21 @@ __Len4: Write_Two addi 5, 5, 64 - bdnz __Len4 + bdnz ntt_ppc__Len4 # # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # k += 64 # Update zetas vectors, each vector has 2 zetas - addis 14,2,.K64@toc@ha - addi 14,14,.K64@toc@l + + addi 14, 4, ZETA_NTT_OFFSET64 li 15, 4 mtctr 15 li 5, 0 - li 4, 4 + li 7, 4 .align 4 -__Len2: +ntt_ppc__Len2: Load_next_4zetas MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 Write_Three @@ -361,9 +368,8 @@ __Len2: Write_Three addi 5, 5, 64 - bdnz __Len2 + bdnz ntt_ppc__Len2 -__ntt_out: lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) @@ -390,109 +396,17 @@ __ntt_out: addi 1, 1, 352 blr -.data -.align 4 -# -MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 - -# zetas -.K1: -.short -758, -758, -758, -758, -758, -758, -758, -758 -.short -359, -359, -359, -359, -359, -359, -359, -359 -.short -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517 -.short 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493 -.short 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422 -.short 287, 287, 287, 287, 287, 287, 287, 287 -.short 202, 202, 202, 202, 202, 202, 202, 202 -.short -171, -171, -171, -171, -171, -171, -171, -171 -.short 622, 622, 622, 622, 622, 622, 622, 622 -.short 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577 -.short 182, 182, 182, 182, 182, 182, 182, 182 
-.short 962, 962, 962, 962, 962, 962, 962, 962 -.short -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202 -.short -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474 -.short 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468 -.short 573, 573, 573, 573, 573, 573, 573, 573 -.short -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325 -.short 264, 264, 264, 264, 264, 264, 264, 264 -.short 383, 383, 383, 383, 383, 383, 383, 383 -.short -829, -829, -829, -829, -829, -829, -829, -829 -.short 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458 -.short -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602 -.short -130, -130, -130, -130, -130, -130, -130, -130 -.short -681, -681, -681, -681, -681, -681, -681, -681 -.short 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017 -.short 732, 732, 732, 732, 732, 732, 732, 732 -.short 608, 608, 608, 608, 608, 608, 608, 608 -.short -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542 -.short 411, 411, 411, 411, 411, 411, 411, 411 -.short -205, -205, -205, -205, -205, -205, -205, -205 -.short -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571 -.short 1223, 1223, 1223, 1223, 1223, 1223, 1223, 1223 -.short 652, 652, 652, 652, 652, 652, 652, 652 -.short -552, -552, -552, -552, -552, -552, -552, -552 -.short 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015 -.short -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293 -.short 1491, 1491, 1491, 1491, 1491, 1491, 1491, 1491 -.short -282, -282, -282, -282, -282, -282, -282, -282 -.short -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544 -.short 516, 516, 516, 516, 516, 516, 516, 516 -.short -8, -8, -8, -8, -8, -8, -8, -8 -.short -320, -320, -320, -320, -320, -320, -320, -320 -.short -666, -666, -666, -666, -666, -666, -666, -666 -.short -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618 -.short -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162 -.short 126, 126, 126, 126, 126, 126, 126, 126 -.short 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469 -.short -853, -853, -853, -853, 
-853, -853, -853, -853 -.short -90, -90, -90, -90, -90, -90, -90, -90 -.short -271, -271, -271, -271, -271, -271, -271, -271 -.short 830, 830, 830, 830, 830, 830, 830, 830 -.short 107, 107, 107, 107, 107, 107, 107, 107 -.short -1421, -1421, -1421, -1421, -1421, -1421, -1421, -1421 -.short -247, -247, -247, -247, -247, -247, -247, -247 -.short -951, -951, -951, -951, -951, -951, -951, -951 -.short -398, -398, -398, -398, -398, -398, -398, -398 -.short 961, 961, 961, 961, 961, 961, 961, 961 -.short -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508 -.short -725, -725, -725, -725, -725, -725, -725, -725 -.short 448, 448, 448, 448, 448, 448, 448, 448 -.short -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065 -.short 677, 677, 677, 677, 677, 677, 677, 677 -.short -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275 -.K64: -.short -1103, -1103, -1103, -1103, 430, 430, 430, 430 -.short 555, 555, 555, 555, 843, 843, 843, 843 -.short -1251, -1251, -1251, -1251, 871, 871, 871, 871 -.short 1550, 1550, 1550, 1550, 105, 105, 105, 105 -.short 422, 422, 422, 422, 587, 587, 587, 587 -.short 177, 177, 177, 177, -235, -235, -235, -235 -.short -291, -291, -291, -291, -460, -460, -460, -460 -.short 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653 -.short -246, -246, -246, -246, 778, 778, 778, 778 -.short 1159, 1159, 1159, 1159, -147, -147, -147, -147 -.short -777, -777, -777, -777, 1483, 1483, 1483, 1483 -.short -602, -602, -602, -602, 1119, 1119, 1119, 1119 -.short -1590, -1590, -1590, -1590, 644, 644, 644, 644 -.short -872, -872, -872, -872, 349, 349, 349, 349 -.short 418, 418, 418, 418, 329, 329, 329, 329 -.short -156, -156, -156, -156, -75, -75, -75, -75 -.short 817, 817, 817, 817, 1097, 1097, 1097, 1097 -.short 603, 603, 603, 603, 610, 610, 610, 610 -.short 1322, 1322, 1322, 1322, -1285, -1285, -1285, -1285 -.short -1465, -1465, -1465, -1465, 384, 384, 384, 384 -.short -1215, -1215, -1215, -1215, -136, -136, -136, -136 -.short 1218, 1218, 1218, 1218, -1335, -1335, -1335, 
-1335 -.short -874, -874, -874, -874, 220, 220, 220, 220 -.short -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659 -.short -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530 -.short -1278, -1278, -1278, -1278, 794, 794, 794, 794 -.short -1510, -1510, -1510, -1510, -854, -854, -854, -854 -.short -870, -870, -870, -870, 478, 478, 478, 478 -.short -108, -108, -108, -108, -308, -308, -308, -308 -.short 996, 996, 996, 996, 991, 991, 991, 991 -.short 958, 958, 958, 958, -1460, -1460, -1460, -1460 -.short 1522, 1522, 1522, 1522, 1628, 1628, 1628, 1628 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_ZETA diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index c07f25c5a8..eb770a631c 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -17,8 +17,12 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) -#define V1353 0 +#include "consts.h" + +#define V1353 0 #define V_QINV 2 #define V_NMKQ 5 @@ -98,14 +102,9 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) stxv 32+29, 272(1) stxv 32+30, 288(1) - addis 9,2,.nmkq@toc@ha - addi 9,9,.nmkq@toc@l - addis 10,2,.C1353@toc@ha - addi 10,10,.C1353@toc@l - - lxv 32+V_NMKQ,0(9) - lxv 32+V_QINV,16(9) - lxv 32+V1353,0(10) + lxv 32+V_NMKQ, NQ_OFFSET(4) + lxv 32+V_QINV, QINV_OFFSET(4) + lxv 32+V1353, C1353_OFFSET(4) vxor 3, 3, 3 vspltish 4, 1 @@ -150,14 +149,17 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) addi 1, 1, 320 blr -.data -.align 4 -# 
-MLKEM_Q -.nmkq: -.short -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329 -# QINV -.short -3327, -3327, -3327, -3327, -3327, -3327, -3327, -3327 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ -.C1353: -.short 1353, 1353, 1353, 1353, 1353, 1353, 1353, 1353 +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index ee8e1fdca1..5584109557 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -18,10 +18,14 @@ # #include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" # Barrett reduce constatnts #define V20159 0 -#define V_25 1 +#define V_25 1 #define V_26 2 #define V_MKQ 3 @@ -136,18 +140,16 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) stxv 32+23, 176(1) stxv 32+24, 192(1) - addis 8,2,.mkq@toc@ha - addi 8,8,.mkq@toc@l - addis 9,2,.C20159@toc@ha - addi 9,9,.C20159@toc@l - addis 10,2,.C25@toc@ha - addi 10,10,.C25@toc@l - vxor 7, 7, 7 - lxv 32+V_MKQ, 0(8) - lxv 32+V20159, 0(9) - lxv 32+V_25, 0(10) + lxv 32+V_MKQ, Q_OFFSET(4) + lxv 32+V20159, C20159_OFFSET(4) + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 li 4, -128 li 5, -112 @@ -162,9 +164,6 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) li 15, 32 li 16, 48 - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - BREDUCE_4X 21, 22, 23, 24 BREDUCE_4X 4, 9, 13, 17 Write_8X @@ -211,15 +210,19 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) addi 1, 1, 224 blr -.align 4 -.data -# MLKEM_Q -.mkq: -.short 3329, 3329, 
3329, 3329, 3329, 3329, 3329, 3329 - -.C20159: -.short 20159, 20159, 20159, 20159, 20159, 20159, 20159, 20159 - -# 0x2000000 -.C25: -.long 33554432, 33554432, 33554432, 33554432 +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ From f3a7d3cb890a4efa169fe1b7316b69e7752c7b01 Mon Sep 17 00:00:00 2001 From: willieyz Date: Wed, 10 Sep 2025 16:51:26 +0800 Subject: [PATCH 08/22] Add MLK_CONFIG_NO_RANDOMIZED_API with default not set Signed-off-by: willieyz Signed-off-by: Danny Tsen --- mlkem/src/config.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mlkem/src/config.h b/mlkem/src/config.h index 53b9366471..8047ec0d4d 100644 --- a/mlkem/src/config.h +++ b/mlkem/src/config.h @@ -451,6 +451,24 @@ *****************************************************************************/ /* #define MLK_CONFIG_NO_ASM */ +/****************************************************************************** + * Name: MLK_CONFIG_NO_RANDOMIZED_API + * + * Description: If this option is set, mlkem-native will be built without the + * randomized API functions (crypto_kem_keypair and + * crypto_kem_enc). + *. This allows users to build mlkem-native without providing a + * randombytes() implementation if they only need the + * deterministic API + * (crypto_kem_keypair_derand, crypto_kem_enc_derand, + * crypto_kem_dec). + * + * NOTE: This option is incompatible with MLK_CONFIG_KEYGEN_PCT + * as the current PCT implementation requires crypto_kem_enc(). 
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_RANDOMIZED_API */ + /****************************************************************************** * Name: MLK_CONFIG_KEYGEN_PCT * From 25d3218cfa1fb62648b8ce82c5d3c2a2fbfc52cb Mon Sep 17 00:00:00 2001 From: willieyz Date: Wed, 10 Sep 2025 17:55:41 +0800 Subject: [PATCH 09/22] Guard the `crypto_kem_keypair` and `crypto_kem_enc` from: - `kem.c` - `randombytes.h` - `mlkem_native.h` using `MLK_CONFIG_NO_RANDOMIZED_API` Also, add a check in `common.h` to ensure `MLK_CONFIG_NO_RANDOMIZED_API` is not used together with `MLK_CONFIG_KEYGEN_PCT` Signed-off-by: willieyz Signed-off-by: Danny Tsen --- mlkem/mlkem_native.h | 4 ++++ mlkem/src/common.h | 4 ++++ mlkem/src/kem.c | 4 ++++ mlkem/src/randombytes.h | 3 ++- 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/mlkem/mlkem_native.h b/mlkem/mlkem_native.h index dec00e6845..e6d3862224 100644 --- a/mlkem/mlkem_native.h +++ b/mlkem/mlkem_native.h @@ -155,6 +155,7 @@ int MLK_API_NAMESPACE(keypair_derand)( uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)], const uint8_t coins[2 * MLKEM_SYMBYTES]); +#if !defined(MLK_CONFIG_NO_RANDOMIZED_API) /************************************************* * Name: crypto_kem_keypair * @@ -176,6 +177,7 @@ MLK_API_MUST_CHECK_RETURN_VALUE int MLK_API_NAMESPACE(keypair)( uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)], uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]); +#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */ /************************************************* * Name: crypto_kem_enc_derand @@ -206,6 +208,7 @@ int MLK_API_NAMESPACE(enc_derand)( const uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)], const uint8_t coins[MLKEM_SYMBYTES]); +#if !defined(MLK_CONFIG_NO_RANDOMIZED_API) /************************************************* * Name: crypto_kem_enc * @@ -231,6 +234,7 @@ int MLK_API_NAMESPACE(enc)( uint8_t 
ct[MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_API_PARAMETER_SET)], uint8_t ss[MLKEM_BYTES], const uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]); +#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */ /************************************************* * Name: crypto_kem_dec diff --git a/mlkem/src/common.h b/mlkem/src/common.h index 652ef6c7e3..d03f3db7d7 100644 --- a/mlkem/src/common.h +++ b/mlkem/src/common.h @@ -99,6 +99,10 @@ #error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLK_CONFIG_FIPS202_BACKEND_FILE is not. #endif +#if defined(MLK_CONFIG_NO_RANDOMIZED_API) && defined(MLK_CONFIG_KEYGEN_PCT) +#error Bad configuration: MLK_CONFIG_NO_RANDOMIZED_API is incompatible with MLK_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_kem_enc() +#endif + #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH) #include MLK_CONFIG_ARITH_BACKEND_FILE /* Include to enforce consistency of API and implementation, diff --git a/mlkem/src/kem.c b/mlkem/src/kem.c index 65099d8471..01430e2c21 100644 --- a/mlkem/src/kem.c +++ b/mlkem/src/kem.c @@ -199,6 +199,7 @@ int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES], return 0; } +#if !defined(MLK_CONFIG_NO_RANDOMIZED_API) /* Reference: `crypto_kem_keypair()` in the reference implementation @[REF] * - We zeroize the stack buffer */ MLK_EXTERNAL_API @@ -219,6 +220,7 @@ int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES], mlk_zeroize(coins, sizeof(coins)); return res; } +#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */ /* Reference: `crypto_kem_enc_derand()` in the reference implementation @[REF] * - We include public key check @@ -258,6 +260,7 @@ int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES], return 0; } +#if !defined(MLK_CONFIG_NO_RANDOMIZED_API) /* Reference: `crypto_kem_enc()` in the reference implementation @[REF] * - We include stack buffer zeroization */ MLK_EXTERNAL_API @@ -278,6 +281,7 @@ int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES], 
mlk_zeroize(coins, sizeof(coins)); return res; } +#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */ /* Reference: `crypto_kem_dec()` in the reference implementation @[REF] * - We include secret key check diff --git a/mlkem/src/randombytes.h b/mlkem/src/randombytes.h index 132d920afb..1927afce27 100644 --- a/mlkem/src/randombytes.h +++ b/mlkem/src/randombytes.h @@ -11,6 +11,7 @@ #include "cbmc.h" #include "common.h" +#if !defined(MLK_CONFIG_NO_RANDOMIZED_API) #if !defined(MLK_CONFIG_CUSTOM_RANDOMBYTES) void randombytes(uint8_t *out, size_t outlen); static MLK_INLINE void mlk_randombytes(uint8_t *out, size_t outlen) @@ -18,5 +19,5 @@ __contract__( requires(memory_no_alias(out, outlen)) assigns(memory_slice(out, outlen))) { randombytes(out, outlen); } #endif /* !MLK_CONFIG_CUSTOM_RANDOMBYTES */ - +#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */ #endif /* !MLK_RANDOMBYTES_H */ From b4d8771a875b28dfd20faf8fb0667e521a72f05a Mon Sep 17 00:00:00 2001 From: willieyz Date: Wed, 10 Sep 2025 19:47:44 +0800 Subject: [PATCH 10/22] Add new example `basic_deterministic` This commit: - Adds the `basic_deterministic` example demonstrating deterministic API usage (without a `randombytes()` implementation). - Uses only the `crypto_kem_*_derand` functions, no `randombytes()` required. - Updates the expected key outputs for deterministic entropy inputs (Alice: all 0 input, Bob: all 1 input). 
Signed-off-by: willieyz Signed-off-by: Danny Tsen --- .github/workflows/base.yml | 3 + BIBLIOGRAPHY.md | 1 + Makefile | 1 + examples/README.md | 3 + examples/basic_deterministic/.gitignore | 3 + examples/basic_deterministic/Makefile | 96 ++++ examples/basic_deterministic/README.md | 17 + examples/basic_deterministic/main.c | 109 ++++ .../custom_no_randomized_config.h | 531 ++++++++++++++++++ .../basic_deterministic/mlkem_native/mlkem | 1 + scripts/tests | 9 + 11 files changed, 774 insertions(+) create mode 100644 examples/basic_deterministic/.gitignore create mode 100644 examples/basic_deterministic/Makefile create mode 100644 examples/basic_deterministic/README.md create mode 100644 examples/basic_deterministic/main.c create mode 100644 examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h create mode 120000 examples/basic_deterministic/mlkem_native/mlkem diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 63e91a6a54..435f2fe5ec 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -227,6 +227,9 @@ jobs: - name: basic run: | CFLAGS="-O0" make run -C examples/basic + - name: basic_deterministic + run: | + CFLAGS="-O0" make run -C examples/basic_deterministic - name: bring_your_own_fips202 run: | CFLAGS="-O0" make run -C examples/bring_your_own_fips202 diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index e8c0bca7b4..d75d368ef1 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -26,6 +26,7 @@ source code and documentation. 
- National Institute of Standards and Technology * URL: https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements * Referenced from: + - [examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h](examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h) - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) diff --git a/Makefile b/Makefile index 058f0a7f4b..2c4aa6438c 100644 --- a/Makefile +++ b/Makefile @@ -218,6 +218,7 @@ clean: -make clean -C examples/bring_your_own_fips202 >/dev/null -make clean -C examples/custom_backend >/dev/null -make clean -C examples/basic >/dev/null + -make clean -C examples/basic_deterministic >/dev/null -make clean -C examples/monolithic_build >/dev/null -make clean -C examples/monolithic_build_native >/dev/null -make clean -C examples/monolithic_build_multilevel >/dev/null diff --git a/examples/README.md b/examples/README.md index cccfdeb906..65957ebc30 100644 --- a/examples/README.md +++ b/examples/README.md @@ -8,6 +8,9 @@ This directory contains minimal examples demonstrating how you can use mlkem-nat See [basic](basic) for a basic example of how to build a single instance of mlkem-native. +## Basic_deterministic + +See [basic_deterministic](basic_deterministic) for a basic example of how to build a single instance of mlkem-native without `randombytes()` implementation. This allows users to build mlkem-native using only the deterministic API when randomized functions are not required. 
## Multi-level build (C only) See [multilevel_build](multilevel_build) for an example of how to build one instance of mlkem-native per security level, diff --git a/examples/basic_deterministic/.gitignore b/examples/basic_deterministic/.gitignore new file mode 100644 index 0000000000..eb98a94f12 --- /dev/null +++ b/examples/basic_deterministic/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +build diff --git a/examples/basic_deterministic/Makefile b/examples/basic_deterministic/Makefile new file mode 100644 index 0000000000..38ac80d55d --- /dev/null +++ b/examples/basic_deterministic/Makefile @@ -0,0 +1,96 @@ +# (SPDX-License-Identifier: CC-BY-4.0) + +.PHONY: build run clean size +.DEFAULT_GOAL := all + +# Append cross-prefix for cross compilation +# Remove or ignore for native builds +CC ?= gcc +SIZE ?= size +# When called from the root Makefile, CROSS_PREFIX has already been added here +ifeq (,$(findstring $(CROSS_PREFIX),$(CC))) +CC := $(CROSS_PREFIX)$(CC) +endif + +ifeq (,$(findstring $(CROSS_PREFIX),$(SIZE))) +SIZE := $(CROSS_PREFIX)$(SIZE) +endif + +# Part A: +# +# mlkem-native source and header files +# +# If you are not concerned about minimizing for a specific backend, +# you can just include _all_ source files into your build. 
+MLK_SOURCE=$(wildcard \ + mlkem_native/mlkem/src/*.c \ + mlkem_native/mlkem/src/**/*.c \ + mlkem_native/mlkem/src/**/**/*.c \ + mlkem_native/mlkem/src/**/**/**/*.c) + +# Part B: +# +# Your application source code +APP_SOURCE=$(wildcard *.c) + +ALL_SOURCE=$(MLK_SOURCE) $(RNG_SOURCE) $(APP_SOURCE) + +BUILD_DIR=build +BIN=test_binary + +CFLAGS := \ + -Wall \ + -Wextra \ + -Werror \ + -Wmissing-prototypes \ + -Wshadow \ + -Werror \ + -Wpointer-arith \ + -Wredundant-decls \ + -Wno-long-long \ + -Wno-unknown-pragmas \ + -Wno-unused-command-line-argument \ + -fomit-frame-pointer \ + -std=c99 \ + -pedantic \ + -MMD \ + -O3 \ + -Imlkem_native \ + $(CFLAGS) + +CFLAGS += -DMLK_CONFIG_NAMESPACE_PREFIX=mlkem +CFLAGS += -DMLK_CONFIG_FILE="\"custom_no_randomized_config.h\"" + +BINARY_NAME_FULL_512=$(BUILD_DIR)/$(BIN)512 +BINARY_NAME_FULL_768=$(BUILD_DIR)/$(BIN)768 +BINARY_NAME_FULL_1024=$(BUILD_DIR)/$(BIN)1024 +BINARIES_FULL=$(BINARY_NAME_FULL_512) $(BINARY_NAME_FULL_768) $(BINARY_NAME_FULL_1024) + +$(BINARY_NAME_FULL_512): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=512 +$(BINARY_NAME_FULL_768): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=768 +$(BINARY_NAME_FULL_1024): CFLAGS += -DMLK_CONFIG_PARAMETER_SET=1024 + +$(BINARIES_FULL): $(ALL_SOURCE) + echo "$@" + mkdir -p $(BUILD_DIR) + $(CC) $(CFLAGS) $^ -o $@ + +all: build size + +build: $(BINARIES_FULL) + +run: $(BINARIES_FULL) + $(EXEC_WRAPPER) ./$(BINARY_NAME_FULL_512) + $(EXEC_WRAPPER) ./$(BINARY_NAME_FULL_768) + $(EXEC_WRAPPER) ./$(BINARY_NAME_FULL_1024) + +size: build + @echo "=== Size info for $(BINARY_NAME_FULL_512) ===" + @$(SIZE) $(BINARY_NAME_FULL_512) + @echo "=== Size info for $(BINARY_NAME_FULL_768) ===" + @$(SIZE) $(BINARY_NAME_FULL_768) + @echo "=== Size info for $(BINARY_NAME_FULL_1024) ===" + @$(SIZE) $(BINARY_NAME_FULL_1024) + +clean: + rm -rf $(BUILD_DIR) diff --git a/examples/basic_deterministic/README.md b/examples/basic_deterministic/README.md new file mode 100644 index 0000000000..465722080a --- /dev/null +++ 
b/examples/basic_deterministic/README.md @@ -0,0 +1,17 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# Building mlkem-native + +This directory contains a minimal example showing how to build **mlkem-native** for use cases only requiring the deterministic key generation and encapsulation APIs (`crypto_kem_keypair_derand` and `crypto_kem_enc_derand`). In that case, no implementation of `randombytes()` has to be provided. + +## Components + +An application using mlkem-native as-is needs to include the following components: + +1. mlkem-native source tree, including [`mlkem/src/`](../../mlkem/src) and [`mlkem/src/fips202/`](../../mlkem/src/fips202). +2. The application source code + + +## Usage + +Build this example with `make build`, run with `make run`. diff --git a/examples/basic_deterministic/main.c b/examples/basic_deterministic/main.c new file mode 100644 index 0000000000..05f23bc8a6 --- /dev/null +++ b/examples/basic_deterministic/main.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include +#include + +/* Import public mlkem-native API + * + * This requires specifying the parameter set and namespace prefix + * used for the build. + */ +#define MLK_CONFIG_API_PARAMETER_SET MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_API_NAMESPACE_PREFIX mlkem +#include "mlkem_native/mlkem/mlkem_native.h" + +/* No randombytes needed for deterministic API */ + +#define CHECK(x) \ + do \ + { \ + int rc; \ + rc = (x); \ + if (!rc) \ + { \ + fprintf(stderr, "ERROR (%s,%d)\n", __FILE__, __LINE__); \ + return 1; \ + } \ + } while (0) + +int main(void) +{ + uint8_t pk[CRYPTO_PUBLICKEYBYTES]; + uint8_t sk[CRYPTO_SECRETKEYBYTES]; + uint8_t ct[CRYPTO_CIPHERTEXTBYTES]; + uint8_t key_a[CRYPTO_BYTES]; + uint8_t key_b[CRYPTO_BYTES]; + uint8_t alice_en[2 * MLKEM_SYMBYTES] = {0}; + uint8_t bob_en[MLKEM_SYMBYTES] = {1}; + + + /* The PCT modifies the PRNG state, so the KAT tests don't work. 
+ * We run KAT tests only for disabled PCT. + * Expected keys are generated using deterministic entropy: + * keypair uses all-zero entropy {0}, enc uses all-one entropy {1} */ +#if !defined(MLK_CONFIG_KEYGEN_PCT) +#if MLK_CONFIG_PARAMETER_SET == 512 + const uint8_t expected_key[] = { + 0x5f, 0x5f, 0x8c, 0xf5, 0x7c, 0x34, 0xd4, 0x68, 0x06, 0xa2, 0xe9, + 0xc9, 0x28, 0xba, 0x10, 0x5a, 0x46, 0xf2, 0x67, 0x1a, 0xc7, 0x81, + 0xdf, 0xf1, 0x4a, 0xbb, 0x27, 0xea, 0x46, 0x06, 0x46, 0x3c}; +#elif MLK_CONFIG_PARAMETER_SET == 768 + const uint8_t expected_key[] = { + 0x85, 0x21, 0xab, 0xc8, 0x14, 0xc7, 0x67, 0x70, 0x4f, 0xa6, 0x25, + 0xd9, 0x35, 0x95, 0xd0, 0x03, 0x79, 0xa8, 0xb3, 0x70, 0x35, 0x2c, + 0xa4, 0xba, 0xb3, 0xa6, 0x82, 0x46, 0x63, 0x0d, 0xb0, 0x8b}; +#elif MLK_CONFIG_PARAMETER_SET == 1024 + const uint8_t expected_key[] = { + 0x30, 0x4d, 0xbe, 0x54, 0xd6, 0x6f, 0x80, 0x66, 0xc6, 0xa8, 0x1c, + 0x6b, 0x36, 0xc4, 0x48, 0x9b, 0xf9, 0xe6, 0x05, 0x79, 0x83, 0x3c, + 0x4e, 0xdc, 0x8a, 0xc7, 0x92, 0xe5, 0x73, 0x0d, 0xdd, 0x85}; +#endif /* MLK_CONFIG_PARAMETER_SET == 1024 */ +#endif /* !MLK_CONFIG_KEYGEN_PCT */ + + /* No randombytes_reset() needed for deterministic API */ + + printf("Generating keypair ... "); + + /* Alice generates a public key using deterministic API with all-zero entropy + */ + CHECK(crypto_kem_keypair_derand(pk, sk, alice_en) == 0); + + printf("DONE\n"); + printf("Encaps... "); + + /* Bob derives a secret key and creates a response using deterministic API + * with all-one entropy */ + CHECK(crypto_kem_enc_derand(ct, key_b, pk, bob_en) == 0); + + printf("DONE\n"); + printf("Decaps... "); + + /* Alice uses Bobs response to get her shared key */ + CHECK(crypto_kem_dec(key_a, ct, sk) == 0); + + printf("DONE\n"); + printf("Compare... 
"); + + CHECK(memcmp(key_a, key_b, CRYPTO_BYTES) == 0); + + printf("Shared secret: "); + { + size_t i; + for (i = 0; i < sizeof(key_a); i++) + { + printf("%02x", key_a[i]); + } + } + printf("\n"); + + /* Check against hardcoded result to make sure that + * we integrated custom FIPS202 correctly */ + CHECK(memcmp(key_a, expected_key, CRYPTO_BYTES) == 0); + + + printf("OK\n"); + return 0; +} diff --git a/examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h b/examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h new file mode 100644 index 0000000000..f1d4180ae5 --- /dev/null +++ b/examples/basic_deterministic/mlkem_native/custom_no_randomized_config.h @@ -0,0 +1,531 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_CONFIG_H +#define MLK_CONFIG_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * - MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS. 
+ * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_FILE + * + * Description: If defined, this is a header that will be included instead + * of this default configuration file mlkem/src/config.h. + * + * When you need to build mlkem-native in multiple configurations, + * using varying MLK_CONFIG_FILE can be more convenient + * then configuring everything through CFLAGS. + * + * To use, MLK_CONFIG_FILE _must_ be defined prior + * to the inclusion of any mlkem-native headers. For example, + * it can be set by passing `-DMLK_CONFIG_FILE="..."` + * on the command line. + * + *****************************************************************************/ +/* #define MLK_CONFIG_FILE "config.h" */ + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * are set, level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if !defined(MLK_CONFIG_NAMESPACE_PREFIX) +#define MLK_CONFIG_NAMESPACE_PREFIX MLK_DEFAULT_NAMESPACE_PREFIX +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_MULTILEVEL_WITH_SHARED + * + * Description: This is for multi-level builds of mlkem-native only. If you + * need only a single parameter set, keep this unset. 
+ * + * If this is set, all MLK_CONFIG_PARAMETER_SET-independent + * code will be included in the build, including code needed only + * for other parameter sets. + * + * Example: mlk_poly_cbd3 is only needed for + * MLK_CONFIG_PARAMETER_SET == 512. Yet, if this option is set + * for a build with MLK_CONFIG_PARAMETER_SET == 768/1024, it + * would be included. + * + * To build mlkem-native with support for all parameter sets, + * build it three times -- once per parameter set -- and set the + * option MLK_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of + * them, and MLK_CONFIG_MULTILEVEL_NO_SHARED for the others. + * + * See examples/multilevel_build for an example. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +/* #define MLK_CONFIG_MULTILEVEL_WITH_SHARED */ + +/****************************************************************************** + * Name: MLK_CONFIG_MULTILEVEL_NO_SHARED + * + * Description: This is for multi-level builds of mlkem-native only. If you + * need only a single parameter set, keep this unset. + * + * If this is set, no MLK_CONFIG_PARAMETER_SET-independent code + * will be included in the build. + * + * To build mlkem-native with support for all parameter sets, + * build it three times -- once per parameter set -- and set the + * option MLK_CONFIG_MULTILEVEL_WITH_SHARED for exactly one of + * them, and MLK_CONFIG_MULTILEVEL_NO_SHARED for the others. + * + * See examples/multilevel_build for an example. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +/* #define MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/****************************************************************************** + * Name: MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS + * + * Description: This is only relevant for single compilation unit (SCU) + * builds of mlkem-native. 
In this case, it determines whether + * directives defined in parameter-set-independent headers should + * be #undef'ined or not at the of the SCU file. This is needed + * in multilevel builds. + * + * See examples/multilevel_build_native for an example. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +/* #define MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */ + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether an native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be use is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for your the target architecture + * will be used. If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if !defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH) +/* #define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. 
+ *
+ *****************************************************************************/
+#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH) && \
+ !defined(MLK_CONFIG_ARITH_BACKEND_FILE)
+#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h"
+#endif
+
+/******************************************************************************
+ * Name: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202
+ *
+ * Description: Determines whether a native FIPS202 backend should be used.
+ *
+ * The FIPS202 backend covers 1x/2x/4x-fold Keccak-f1600, which is
+ * the performance bottleneck of SHA3 and SHAKE.
+ *
+ * If this option is unset, the C backend will be used.
+ *
+ * If this option is set, the FIPS202 backend to be used is
+ * determined by MLK_CONFIG_FIPS202_BACKEND_FILE: If the latter is
+ * unset, the default backend for your target architecture
+ * will be used. If set, it must be the name of a backend metadata
+ * file.
+ *
+ * This can also be set using CFLAGS.
+ *
+ *****************************************************************************/
+#if !defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)
+/* #define MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 */
+#endif
+
+/******************************************************************************
+ * Name: MLK_CONFIG_FIPS202_BACKEND_FILE
+ *
+ * Description: The FIPS-202 backend to use.
+ *
+ * If MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, this option
+ * must either be undefined or the filename of a FIPS202 backend.
+ * If unset, the default backend will be used.
+ *
+ * This can be set using CFLAGS.
+ * + *****************************************************************************/ +#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) && \ + !defined(MLK_CONFIG_FIPS202_BACKEND_FILE) +#define MLK_CONFIG_FIPS202_BACKEND_FILE "fips202/native/auto.h" +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* #define MLK_CONFIG_FIPS202_CUSTOM_HEADER "SOME_FILE.h" */ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* #define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER "SOME_FILE.h" */ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. 
+ * The default implementation uses SecureZeroMemory on Windows
+ * and a memset + compiler barrier otherwise. If neither of those
+ * is available on the target platform, compilation will fail,
+ * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide
+ * a custom implementation of `mlk_zeroize()`.
+ *
+ * WARNING:
+ * The explicit stack zeroization conducted by mlkem-native
+ * reduces the likelihood of data leaking on the stack, but
+ * does not eliminate it! The C standard makes no guarantee about
+ * where a compiler allocates structures and whether/where it makes
+ * copies of them. Also, in addition to entire structures, there
+ * may also be potentially exploitable leakage of individual values
+ * on the stack.
+ *
+ * If you need bullet-proof zeroization of the stack, you need to
+ * consider additional measures instead of what this feature
+ * provides. In this case, you can set mlk_zeroize to a no-op.
+ *
+ *****************************************************************************/
+/* #define MLK_CONFIG_CUSTOM_ZEROIZE
+ #if !defined(__ASSEMBLER__)
+ #include
+ #include "sys.h"
+ static MLK_INLINE void mlk_zeroize(void *ptr, size_t len)
+ {
+ ... your implementation ...
+ }
+ #endif
+*/
+
+/******************************************************************************
+ * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES
+ *
+ * Description: mlkem-native does not provide a secure randombytes
+ * implementation. Such an implementation has to be provided by the
+ * consumer.
+ *
+ * If this option is not set, mlkem-native expects a function
+ * void randombytes(uint8_t *out, size_t outlen).
+ *
+ * Set this option and define `mlk_randombytes` if you want to
+ * use a custom method to sample randombytes with a different name
+ * or signature.
+ *
+ *****************************************************************************/
+/* #define MLK_CONFIG_CUSTOM_RANDOMBYTES
+ #if !defined(__ASSEMBLER__)
+ #include
+ #include "sys.h"
+ static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
+ {
+ ... your implementation ...
+ }
+ #endif
+*/
+
+/******************************************************************************
+ * Name: MLK_CONFIG_CUSTOM_CAPABILITY_FUNC
+ *
+ * Description: mlkem-native backends may rely on specific hardware features.
+ * Those backends will only be included in an mlkem-native build
+ * if support for the respective features is enabled at
+ * compile-time. However, when building for a heterogeneous set
+ * of CPUs to run the resulting binary/library on, feature
+ * detection at _runtime_ is needed to decide whether a backend
+ * can be used or not.
+ *
+ * Set this option and define `mlk_sys_check_capability` if you
+ * want to use a custom method to dispatch between implementations.
+ *
+ * If this option is not set, mlkem-native uses compile-time
+ * feature detection only to decide which backend to use.
+ *
+ * If you compile mlkem-native on a system with different
+ * capabilities than the system that the resulting binary/library
+ * will be run on, you must use this option.
+ *
+ *****************************************************************************/
+/* #define MLK_CONFIG_CUSTOM_CAPABILITY_FUNC
+ static MLK_INLINE int mlk_sys_check_capability(mlk_sys_cap cap)
+ __contract__(
+ ensures(return_value == 0 || return_value == 1)
+ )
+ {
+ ... your implementation ...
+ }
+*/
+
+/******************************************************************************
+ * Name: MLK_CONFIG_CUSTOM_MEMCPY
+ *
+ * Description: Set this option and define `mlk_memcpy` if you want to
+ * use a custom method to copy memory instead of the standard
+ * library memcpy function.
+ * + * The custom implementation must have the same signature and + * behavior as the standard memcpy function: + * void *mlk_memcpy(void *dest, const void *src, size_t n) + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_MEMCPY + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void *mlk_memcpy(void *dest, const void *src, size_t n) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_MEMSET + * + * Description: Set this option and define `mlk_memset` if you want to + * use a custom method to set memory instead of the standard + * library memset function. + * + * The custom implementation must have the same signature and + * behavior as the standard memset function: + * void *mlk_memset(void *s, int c, size_t n) + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_MEMSET + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void *mlk_memset(void *s, int c, size_t n) + { + ... your implementation ... + } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_INTERNAL_API_QUALIFIER + * + * Description: If set, this option provides an additional function + * qualifier to be added to declarations of internal API. + * + * The primary use case for this option are single-CU builds, + * in which case this option can be set to `static`. 
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_INTERNAL_API_QUALIFIER */ + +/****************************************************************************** + * Name: MLK_CONFIG_EXTERNAL_API_QUALIFIER + * + * Description: If set, this option provides an additional function + * qualifier to be added to declarations of mlkem-native's + * public API. + * + * The primary use case for this option are single-CU builds + * where the public API exposed by mlkem-native is wrapped by + * another API in the consuming application. In this case, + * even mlkem-native's public API can be marked `static`. + * + *****************************************************************************/ +/* #define MLK_CONFIG_EXTERNAL_API_QUALIFIER */ + +/****************************************************************************** + * Name: MLK_CONFIG_CT_TESTING_ENABLED + * + * Description: If set, mlkem-native annotates data as secret / public using + * valgrind's annotations VALGRIND_MAKE_MEM_UNDEFINED and + * VALGRIND_MAKE_MEM_DEFINED, enabling various checks for secret- + * dependent control flow of variable time execution (depending + * on the exact version of valgrind installed). + * + *****************************************************************************/ +/* #define MLK_CONFIG_CT_TESTING_ENABLED */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. 
If this option is set and
+ * the target platform is not Windows, you MUST set
+ * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization
+ * function.
+ *
+ * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202
+ * and MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no
+ * native backends will be used.
+ *
+ *****************************************************************************/
+/* #define MLK_CONFIG_NO_ASM */
+
+/******************************************************************************
+ * Name: MLK_CONFIG_NO_RANDOMIZED_API
+ *
+ * Description: If this option is set, mlkem-native will be built without the
+ * randomized API functions (crypto_kem_keypair and
+ * crypto_kem_enc).
+ * This allows users to build mlkem-native without providing a
+ * randombytes() implementation if they only need the
+ * deterministic API
+ * (crypto_kem_keypair_derand, crypto_kem_enc_derand,
+ * crypto_kem_dec).
+ *
+ * NOTE: This option is incompatible with MLK_CONFIG_KEYGEN_PCT
+ * as the current PCT implementation requires crypto_kem_enc().
+ *
+ *****************************************************************************/
+#define MLK_CONFIG_NO_RANDOMIZED_API
+
+/******************************************************************************
+ * Name: MLK_CONFIG_KEYGEN_PCT
+ *
+ * Description: Compliance with @[FIPS140_3_IG, p.87] requires a
+ * Pairwise Consistency Test (PCT) to be carried out on a freshly
+ * generated keypair before it can be exported.
+ *
+ * Set this option if such a check should be implemented.
+ * In this case, crypto_kem_keypair_derand and crypto_kem_keypair
+ * will return a non-zero error code if the PCT failed.
+ *
+ * NOTE: This feature will drastically lower the performance of
+ * key generation.
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/************************* Config internals ********************************/ + +/* Default namespace + * + * Don't change this. If you need a different namespace, re-define + * MLK_CONFIG_NAMESPACE_PREFIX above instead, and remove the following. 
+ * + * The default MLKEM namespace is + * + * PQCP_MLKEM_NATIVE_MLKEM_ + * + * e.g., PQCP_MLKEM_NATIVE_MLKEM512_ + */ + +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512 +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768 +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_DEFAULT_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024 +#endif + +#endif /* !MLK_CONFIG_H */ diff --git a/examples/basic_deterministic/mlkem_native/mlkem b/examples/basic_deterministic/mlkem_native/mlkem new file mode 120000 index 0000000000..f4ec7bdb2d --- /dev/null +++ b/examples/basic_deterministic/mlkem_native/mlkem @@ -0,0 +1 @@ +../../../mlkem \ No newline at end of file diff --git a/scripts/tests b/scripts/tests index 40b35da7ff..37b85cc179 100755 --- a/scripts/tests +++ b/scripts/tests @@ -208,6 +208,7 @@ class TEST_TYPES(Enum): MONOLITHIC_BUILD_NATIVE = 14 STACK = 15 SIZE = 16 + BASIC_DETERMINISTIC = 17 def is_benchmark(self): return self in [TEST_TYPES.BENCH, TEST_TYPES.BENCH_COMPONENTS] @@ -227,6 +228,7 @@ class TEST_TYPES(Enum): TEST_TYPES.MONOLITHIC_BUILD_MULTILEVEL_NATIVE, TEST_TYPES.MULTILEVEL_BUILD, TEST_TYPES.MULTILEVEL_BUILD_NATIVE, + TEST_TYPES.BASIC_DETERMINISTIC, ] @staticmethod @@ -260,6 +262,8 @@ class TEST_TYPES(Enum): return "Example (Custom Backend)" if self == TEST_TYPES.BASIC: return "Example (mlkem-native as code package)" + if self == TEST_TYPES.BASIC_DETERMINISTIC: + return "Example (mlkem-native as code package without randombytes() implementation)" if self == TEST_TYPES.MONOLITHIC_BUILD: return "Example (monobuild)" if self == TEST_TYPES.MONOLITHIC_BUILD_NATIVE: @@ -282,6 +286,8 @@ class TEST_TYPES(Enum): return "examples/custom_backend" if self == TEST_TYPES.BASIC: return "examples/basic" + if self == TEST_TYPES.BASIC_DETERMINISTIC: + return "examples/basic_deterministic" if self == TEST_TYPES.MONOLITHIC_BUILD: return "examples/monolithic_build" if self == 
TEST_TYPES.MONOLITHIC_BUILD_NATIVE: @@ -315,6 +321,8 @@ class TEST_TYPES(Enum): return "" if self == TEST_TYPES.BASIC: return "" + if self == TEST_TYPES.BASIC_DETERMINISTIC: + return "" if self == TEST_TYPES.MONOLITHIC_BUILD: return "" if self == TEST_TYPES.MONOLITHIC_BUILD_NATIVE: @@ -1104,6 +1112,7 @@ def cli(): "bring_your_own_fips202", "custom_backend", "basic", + "basic_deterministic", "monolithic_build", "monolithic_build_native", "monolithic_build_multilevel", From 95d003e6500f33bc09ccba24130831cf23d4950b Mon Sep 17 00:00:00 2001 From: willieyz Date: Thu, 11 Sep 2025 13:33:26 +0800 Subject: [PATCH 11/22] CBMC: Increase the CBMC_OBJECT_BITS of `matvec_mul` to 12 - During latest change about adding derandomized config guard, the CBMC proof for `matvec_mul` failed due to SMT-solver return unknown, increadse the CBMC_OBJECT_BITS to fixed it. Signed-off-by: willieyz Signed-off-by: Danny Tsen --- proofs/cbmc/matvec_mul/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proofs/cbmc/matvec_mul/Makefile b/proofs/cbmc/matvec_mul/Makefile index 65223be1c2..65fc7e878b 100644 --- a/proofs/cbmc/matvec_mul/Makefile +++ b/proofs/cbmc/matvec_mul/Makefile @@ -47,7 +47,7 @@ FUNCTION_NAME = mlk_matvec_mul # EXPENSIVE = true # This function is large enough to need... -CBMC_OBJECT_BITS = 10 +CBMC_OBJECT_BITS = 12 # If you require access to a file-local ("static") function or object to conduct # your proof, set the following (and do not include the original source file From 111fbd9592c3e4a7c974544ff96f3e582ddfa788 Mon Sep 17 00:00:00 2001 From: willieyz Date: Thu, 11 Sep 2025 11:03:19 +0800 Subject: [PATCH 12/22] Add `--exclude example` args for "PCT enabled" CI testing - Adds an option in the tests script to exclude specific examples. - Needed because basic_deterministic is incompatible with MLK_CONFIG_KEYGEN_PCT. - Allows CI to run all examples while skipping incompatible ones. 
Signed-off-by: willieyz Signed-off-by: Danny Tsen --- .github/actions/config-variations/action.yml | 2 ++ scripts/tests | 25 ++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/.github/actions/config-variations/action.yml b/.github/actions/config-variations/action.yml index c824d5d5f3..15e0689f98 100644 --- a/.github/actions/config-variations/action.yml +++ b/.github/actions/config-variations/action.yml @@ -28,6 +28,8 @@ runs: kat: true acvp: true opt: ${{ inputs.opt }} + examples: true + extra_args: "--exclude-example basic_deterministic" - name: "PCT enabled + broken" if: ${{ inputs.tests == 'all' || contains(inputs.tests, 'pct-enabled-broken') }} shell: bash diff --git a/scripts/tests b/scripts/tests index 37b85cc179..c1fa2e5ba4 100755 --- a/scripts/tests +++ b/scripts/tests @@ -636,6 +636,12 @@ class Tests: l = TEST_TYPES.examples() else: l = list(map(TEST_TYPES.from_string, self.args.l)) + + # Filter out excluded examples + if hasattr(self.args, "exclude_example") and self.args.exclude_example: + excluded = [TEST_TYPES.from_string(ex) for ex in self.args.exclude_example] + l = [e for e in l if e not in excluded] + for e in l: self._compile_schemes(e, None) self._run_scheme(e, None, None) @@ -1075,6 +1081,25 @@ def cli(): help="Do not run examples", ) + all_parser.add_argument( + "--exclude-example", + help="Exclude specific examples from running (can be used multiple times)", + choices=[ + "bring_your_own_fips202", + "custom_backend", + "basic", + "basic_deterministic", + "monolithic_build", + "monolithic_build_native", + "monolithic_build_multilevel", + "monolithic_build_multilevel_native", + "multilevel_build", + "multilevel_build_native", + ], + action="append", + default=[], + ) + stack_group = all_parser.add_mutually_exclusive_group() stack_group.add_argument( "--stack", From f768bb5394667fdd8c37a0befbbec53cfb8a06cd Mon Sep 17 00:00:00 2001 From: Rod Chapman Date: Tue, 16 Sep 2025 20:57:17 +0100 Subject: [PATCH 13/22] Introduce explicit 
upper bounds on lengths of input and output buffers where appropriate. Force CBMC to use --malloc-fail-assert for all proofs to remove assumption on buffer lengths. Update autogenerated files following this change. Update Proof Guide with notes on max buffer size Signed-off-by: Rod Chapman Signed-off-by: Danny Tsen --- mlkem/src/cbmc.h | 14 +++++++++++++- mlkem/src/fips202/fips202.c | 2 ++ mlkem/src/fips202/fips202.h | 5 +++++ mlkem/src/fips202/fips202x4.c | 4 +++- mlkem/src/fips202/fips202x4.h | 4 +++- mlkem/src/verify.h | 6 ++++-- proofs/cbmc/Makefile.common | 2 +- proofs/cbmc/proof_guide.md | 35 +++++++++++++++++++++++++++++++++++ 8 files changed, 66 insertions(+), 6 deletions(-) diff --git a/mlkem/src/cbmc.h b/mlkem/src/cbmc.h index da1f7f2d99..7cbd780367 100644 --- a/mlkem/src/cbmc.h +++ b/mlkem/src/cbmc.h @@ -8,7 +8,6 @@ /*************************************************** * Basic replacements for __CPROVER_XXX contracts ***************************************************/ - #ifndef CBMC #define __contract__(x) @@ -16,6 +15,8 @@ #else /* !CBMC */ +#include + #define __contract__(x) x #define __loop__(x) x @@ -59,6 +60,17 @@ #define readable(...) __CPROVER_r_ok(__VA_ARGS__) #define writeable(...) __CPROVER_w_ok(__VA_ARGS__) +/* Maximum supported buffer size + * + * Larger buffers may be supported, but due to internal modeling constraints + * in CBMC, the proofs of memory- and type-safety won't be able to run. + * + * If you find yourself in need for a buffer size larger than this, + * please contact the maintainers, so we can prioritize work to relax + * this somewhat artificial bound. 
+ */ +#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12) + /* * History variables * https://diffblue.github.io/cbmc/contracts-history-variables.html diff --git a/mlkem/src/fips202/fips202.c b/mlkem/src/fips202/fips202.c index 8113574397..06eda6f702 100644 --- a/mlkem/src/fips202/fips202.c +++ b/mlkem/src/fips202/fips202.c @@ -60,6 +60,7 @@ static void mlk_keccak_absorb_once(uint64_t *s, uint32_t r, const uint8_t *m, size_t mlen, uint8_t p) __contract__( + requires(mlen <= MLK_MAX_BUFFER_SIZE) requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES) requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES)) requires(memory_no_alias(m, mlen)) @@ -153,6 +154,7 @@ __contract__( static void mlk_keccak_squeeze_once(uint8_t *h, size_t outlen, uint64_t *s, uint32_t r) __contract__( + requires(outlen <= MLK_MAX_BUFFER_SIZE) requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES) requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES)) requires(memory_no_alias(h, outlen)) diff --git a/mlkem/src/fips202/fips202.h b/mlkem/src/fips202/fips202.h index fe27b341fa..2334718e74 100644 --- a/mlkem/src/fips202/fips202.h +++ b/mlkem/src/fips202/fips202.h @@ -47,6 +47,7 @@ typedef struct void mlk_shake128_absorb_once(mlk_shake128ctx *state, const uint8_t *input, size_t inlen) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(state, sizeof(mlk_shake128ctx))) requires(memory_no_alias(input, inlen)) assigns(memory_slice(state, sizeof(mlk_shake128ctx))) @@ -96,6 +97,8 @@ void mlk_shake128_release(mlk_shake128ctx *state); void mlk_shake256(uint8_t *output, size_t outlen, const uint8_t *input, size_t inlen) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) + requires(outlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(input, inlen)) requires(memory_no_alias(output, outlen)) assigns(memory_slice(output, outlen)) @@ -116,6 +119,7 @@ __contract__( **************************************************/ void mlk_sha3_256(uint8_t *output, const uint8_t *input, 
size_t inlen) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(input, inlen)) requires(memory_no_alias(output, SHA3_256_HASHBYTES)) assigns(memory_slice(output, SHA3_256_HASHBYTES)) @@ -136,6 +140,7 @@ __contract__( **************************************************/ void mlk_sha3_512(uint8_t *output, const uint8_t *input, size_t inlen) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(input, inlen)) requires(memory_no_alias(output, SHA3_512_HASHBYTES)) assigns(memory_slice(output, SHA3_512_HASHBYTES)) diff --git a/mlkem/src/fips202/fips202x4.c b/mlkem/src/fips202/fips202x4.c index 34cfd0aa40..5608a2b8a8 100644 --- a/mlkem/src/fips202/fips202x4.c +++ b/mlkem/src/fips202/fips202x4.c @@ -28,6 +28,7 @@ static void mlk_keccak_absorb_once_x4(uint64_t *s, uint32_t r, const uint8_t *in2, const uint8_t *in3, size_t inlen, uint8_t p) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY)) requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES) requires(memory_no_alias(in0, inlen)) @@ -78,7 +79,8 @@ static void mlk_keccak_squeezeblocks_x4(uint8_t *out0, uint8_t *out1, size_t nblocks, uint64_t *s, uint32_t r) __contract__( requires(r <= sizeof(uint64_t) * MLK_KECCAK_LANES) - requires(nblocks <= 8 /* somewhat arbitrary bound */) + requires(r == SHAKE128_RATE || r == SHAKE256_RATE) + requires(nblocks <= (MLK_MAX_BUFFER_SIZE / SHAKE256_RATE)) requires(memory_no_alias(s, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY)) requires(memory_no_alias(out0, nblocks * r)) requires(memory_no_alias(out1, nblocks * r)) diff --git a/mlkem/src/fips202/fips202x4.h b/mlkem/src/fips202/fips202x4.h index 76741d2e31..d4f285e234 100644 --- a/mlkem/src/fips202/fips202x4.h +++ b/mlkem/src/fips202/fips202x4.h @@ -25,6 +25,7 @@ void mlk_shake128x4_absorb_once(mlk_shake128x4ctx *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t 
*in3, size_t inlen) __contract__( + requires(inlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(state, sizeof(mlk_shake128x4ctx))) requires(memory_no_alias(in0, inlen)) requires(memory_no_alias(in1, inlen)) @@ -62,7 +63,8 @@ void mlk_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, uint8_t *in0, uint8_t *in1, uint8_t *in2, uint8_t *in3, size_t inlen) __contract__( - requires(outlen <= 8 * SHAKE256_RATE /* somewhat arbitrary bound */) + requires(inlen <= MLK_MAX_BUFFER_SIZE) + requires(outlen <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(in0, inlen)) requires(memory_no_alias(in1, inlen)) requires(memory_no_alias(in2, inlen)) diff --git a/mlkem/src/verify.h b/mlkem/src/verify.h index 89eac76786..c51495248e 100644 --- a/mlkem/src/verify.h +++ b/mlkem/src/verify.h @@ -318,7 +318,8 @@ __contract__(ensures(return_value == (cond ? a : b))) * * Arguments: const uint8_t *a: pointer to first byte array * const uint8_t *b: pointer to second byte array - * size_t len: length of the byte arrays + * size_t len: length of the byte arrays, upper-bounded + * to INT_MAX to control proof complexity * * Returns 0 if the byte arrays are equal, a non-zero value otherwise * @@ -338,9 +339,9 @@ __contract__(ensures(return_value == (cond ? 
a : b))) static MLK_INLINE uint8_t mlk_ct_memcmp(const uint8_t *a, const uint8_t *b, const size_t len) __contract__( + requires(len <= INT_MAX) requires(memory_no_alias(a, len)) requires(memory_no_alias(b, len)) - requires(len <= INT_MAX) ensures((return_value == 0) == forall(i, 0, len, (a[i] == b[i])))) { uint8_t r = 0, s = 0; @@ -391,6 +392,7 @@ __contract__( static MLK_INLINE void mlk_ct_cmov_zero(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) __contract__( + requires(len <= MLK_MAX_BUFFER_SIZE) requires(memory_no_alias(r, len)) requires(memory_no_alias(x, len)) assigns(memory_slice(r, len))) diff --git a/proofs/cbmc/Makefile.common b/proofs/cbmc/Makefile.common index 0bfb3f0b5b..cf9d61aedf 100644 --- a/proofs/cbmc/Makefile.common +++ b/proofs/cbmc/Makefile.common @@ -246,7 +246,7 @@ endif # * an entire project when added to Makefile-project-defines # * a specific proof when added to the harness Makefile -CBMC_FLAG_MALLOC_MAY_FAIL ?= # set to --no-malloc-may-fail to disable +CBMC_FLAG_MALLOC_MAY_FAIL ?= --malloc-fail-assert CBMC_FLAG_BOUNDS_CHECK ?= # set to --no-bounds-check to disable CBMC_FLAG_CONVERSION_CHECK ?= --conversion-check CBMC_FLAG_DIV_BY_ZERO_CHECK ?= # set to --no-div-by-zero-check to disable diff --git a/proofs/cbmc/proof_guide.md b/proofs/cbmc/proof_guide.md index 4253fecaef..e61a4b38c8 100644 --- a/proofs/cbmc/proof_guide.md +++ b/proofs/cbmc/proof_guide.md @@ -73,6 +73,41 @@ for some struct `foo`, you cannot pass `&foo[0]`, `&foo[1]` as arguments to a fu `memory_no_alias(...)` for both, because `&foo[0]`, `&foo[1]` point to the same object. In mlkem-native, we sometimes work around this by manually splitting statically-sized arrays into multiple separate objects. 
+
+### Maximum buffer sizes
+
+CBMC assumes that allocated objects are less than `__CPROVER_max_malloc_size`
+which is an internal constant defined to be `SIZE_MAX >> (OBJECT_BITS + 1)`
+for that particular run of CBMC, where `SIZE_MAX` is an implementation-defined
+constant (declared in `stdint.h`) and `OBJECT_BITS` is a command-line parameter
+with value typically in the range 8 .. 12.
+
+See the [memory bounds checking](https://diffblue.github.io/cbmc/memory-bounds-checking.html)
+section of the CBMC manual for more details.
+
+Pragmatically, `SIZE_MAX` will either be `2**64-1` or `2**32-1` depending on the
+host platform, and we choose the largest value of `OBJECT_BITS` that is used
+for all proofs in this repository.
+
+This matters where a function takes a formal parameter `p` of some pointer type
+`t` and a `len` parameter of type `size_t` that denotes the number of elements
+pointed to by `p`, and those parameters are subject to a
+`memory_no_alias(p, len * sizeof(t))` contract.
+
+In such cases, len must be explicitly bounded to be less than or equal to
+MLK_MAX_BUFFER_SIZE which might be defined in `cbmc.h` as:
+```c
+#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12)
+```
+and used, for example, as follows:
+```c
+void f(t *p, size_t len)
+__contract__(
+ requires(len * sizeof(t) <= MLK_MAX_BUFFER_SIZE)
+ requires(memory_no_alias(p, len * sizeof(t)))
+);
+```
+
### Memory footprint

The most common way to specify memory footprint in `assigns(...)` clauses is via `memory_slice(ptr, len)`. This asserts
From 52b9c78bbc6c4571c4d3580eb624cc0a20376c9c Mon Sep 17 00:00:00 2001
From: Danny Tsen
Date: Mon, 22 Sep 2025 03:17:14 -0400
Subject: [PATCH 14/22] Removed unused comment lines.
Signed-off-by: Danny Tsen --- dev/ppc64le/src/intt_ppc.S | 7 ------- dev/ppc64le/src/ntt_ppc.S | 5 ----- mlkem/src/native/ppc64le/src/intt_ppc.S | 7 ------- mlkem/src/native/ppc64le/src/ntt_ppc.S | 5 ----- 4 files changed, 24 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 1f4b48e42e..b3ffe2f312 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -303,7 +303,6 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 7, 32+9, 32+9 # V_25 syore at vs7 .align 4 -#__Len2: # # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas @@ -340,7 +339,6 @@ intt_ppc__Loop2: bdnz intt_ppc__Loop2 .align 4 -#__Len4: # # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 addi 14, 4, IZETA_NTT_OFFSET63 @@ -375,7 +373,6 @@ intt_ppc__Loop4: bdnz intt_ppc__Loop4 .align 4 -#__Len8: # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 #addi 14, 14, 512 li 7, 16 @@ -417,7 +414,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 #addi 14, 14, 768 @@ -461,7 +457,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len32: # # 5. len = 32, start = 0, 64, 128, 192 #addi 14, 14, 896 @@ -508,7 +503,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len64: # # 6. len = 64, start = 0, 128 #addi 14, 14, 960 @@ -552,7 +546,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len128: # 7. len = 128, start = 0 # #addi 14, 14, 992 diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 5bc1c34b85..0c98581c53 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -196,7 +196,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) lxv 32+V_QINV, QINV_OFFSET(4) .align 4 -#__Len128: # # Compute coefficients of the NTT based on the following loop. # for (len = 128; len ≥ 2; len = len/2) @@ -221,7 +220,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len64: # # 2. 
len = 64, start = 0, 128 # k += 2 @@ -245,7 +243,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len32: # # 3. len = 32, start = 0, 64, 128, 192 # k += 4 @@ -278,7 +275,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 # k += 8 @@ -300,7 +296,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len8: # # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 # k += 16 diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 1a4975ba0e..163c3f806f 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -302,7 +302,6 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 7, 32+9, 32+9 # V_25 syore at vs7 .align 4 -#__Len2: # # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas @@ -339,7 +338,6 @@ intt_ppc__Loop2: bdnz intt_ppc__Loop2 .align 4 -#__Len4: # # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 addi 14, 4, IZETA_NTT_OFFSET63 @@ -374,7 +372,6 @@ intt_ppc__Loop4: bdnz intt_ppc__Loop4 .align 4 -#__Len8: # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 #addi 14, 14, 512 li 7, 16 @@ -416,7 +413,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 #addi 14, 14, 768 @@ -460,7 +456,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len32: # # 5. len = 32, start = 0, 64, 128, 192 #addi 14, 14, 896 @@ -507,7 +502,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len64: # # 6. len = 64, start = 0, 128 #addi 14, 14, 960 @@ -551,7 +545,6 @@ intt_ppc__Loop4: Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 -#__Len128: # 7. 
len = 128, start = 0 # #addi 14, 14, 992 diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index e9a8df81f6..83f42f9b8a 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -195,7 +195,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) lxv 32+V_QINV, QINV_OFFSET(4) .align 4 -#__Len128: # # Compute coefficients of the NTT based on the following loop. # for (len = 128; len ≥ 2; len = len/2) @@ -220,7 +219,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len64: # # 2. len = 64, start = 0, 128 # k += 2 @@ -244,7 +242,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len32: # # 3. len = 32, start = 0, 64, 128, 192 # k += 4 @@ -277,7 +274,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len16: # # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 # k += 8 @@ -299,7 +295,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) Write_One .align 4 -#__Len8: # # 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 # k += 16 From 61abb93fe5b2ab573691c60b75e078617b0268b2 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 22 Sep 2025 07:00:04 -0400 Subject: [PATCH 15/22] Removed un-wanted comment. Removed non-p8 instruction, xxspltib. 
Signed-off-by: Danny Tsen --- dev/ppc64le/src/intt_ppc.S | 1 - dev/ppc64le/src/reduce.S | 2 +- mlkem/src/native/ppc64le/src/intt_ppc.S | 1 - mlkem/src/native/ppc64le/src/reduce.S | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index b3ffe2f312..95bf370b8b 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -292,7 +292,6 @@ MLK_ASM_FN_SYMBOL(intt_ppc) lxv 32+V20159, C20159_OFFSET(4) # V20159 lxv 7, 0(4) # V_25 - #xxspltiw 8, 26 # for power9 and above vspltisw 8, 13 vadduwm 8, 8, 8 xxlor 8, 32+8, 32+8 # V_26 store at vs8 diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index dfb6343929..603e0d38b0 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -186,7 +186,7 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) # .align 4 addi 3, 3, -512 - xxspltib 32+9 ,0 + vxor 9, 9, 9 vspltish 10, 15 vmr 11, V_MKQ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 163c3f806f..817c8c2997 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -291,7 +291,6 @@ MLK_ASM_FN_SYMBOL(intt_ppc) lxv 32+V20159, C20159_OFFSET(4) # V20159 lxv 7, 0(4) # V_25 - #xxspltiw 8, 26 # for power9 and above vspltisw 8, 13 vadduwm 8, 8, 8 xxlor 8, 32+8, 32+8 # V_26 store at vs8 diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index 5584109557..f9681c4568 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -185,7 +185,7 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) # .align 4 addi 3, 3, -512 - xxspltib 32+9 ,0 + vxor 9, 9, 9 vspltish 10, 15 vmr 11, V_MKQ From 33858ecc8820c61f69a4ffa6a395df5cc22da2fb Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Wed, 8 Oct 2025 01:34:49 -0400 Subject: [PATCH 16/22] Fixed failure in INTT unit tests. Re-arranged zeta array for NTT/INTT for Len 2 and 4. 
Signed-off-by: Danny Tsen --- dev/ppc64le/src/consts.c | 191 ++++++------ dev/ppc64le/src/consts.h | 4 +- dev/ppc64le/src/intt_ppc.S | 371 ++++++++++++----------- dev/ppc64le/src/ntt_ppc.S | 225 +++++++++----- mlkem/src/native/ppc64le/src/consts.c | 191 ++++++------ mlkem/src/native/ppc64le/src/consts.h | 4 +- mlkem/src/native/ppc64le/src/intt_ppc.S | 386 ++++++++++++------------ mlkem/src/native/ppc64le/src/ntt_ppc.S | 231 ++++++++------ 8 files changed, 836 insertions(+), 767 deletions(-) diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c index 4c2fbdf61a..fa0f7097f5 100644 --- a/dev/ppc64le/src/consts.c +++ b/dev/ppc64le/src/consts.c @@ -3,6 +3,11 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ +#include +#include +#include +#include + #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -10,7 +15,7 @@ #include "consts.h" -MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { +MLK_ALIGN const int16_t mlk_ppc_qdata[1072] = { /* -Q */ -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, /* QINV */ @@ -44,112 +49,84 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, - -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, - 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, - -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, - 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, - 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, - -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, - 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, - -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, - -666, -666, -666, -1618, -1618, 
-1618, -1618, -1618, -1618, -1618, -1618, - -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, - 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, - -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, - -90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, - 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, - -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, - -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, - -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, - 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, - -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, - -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, - 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, - /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ - -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, - 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, - 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, - 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, - -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, - -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, - -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, - -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, - 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, - 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, - 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, - 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, - 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, - 
1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, - 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, - -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, - -1278, 794, 794, 794, 794, -1510, -1510, -1510, -1510, -854, -854, -854, - -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, - -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, - 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, - 1628, 1628, - /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ - 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, - 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, - -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, - -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, - 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, - -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, - 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, - 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, - 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, - 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, - 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, - 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, - -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, - 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, - -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, - -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, - -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, - 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, - 1550, 1550, 1550, 1550, 
871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, - 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, - -1103, - /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ - -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 677, - 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, - 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, - -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, - 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, - -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, - -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, - -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, - 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, - -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, - -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, - 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, - -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, - -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, - -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, - 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, - -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, - 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, - 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, - -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, - 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, - -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, - 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - -1542, 608, 608, 608, 608, 608, 608, 608, 
608, 732, 732, 732, 732, 732, 732, - 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, - -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, - -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, - 1458, 1458, 1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, - -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, - 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, - 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, - -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, - 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, - -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, - 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, - -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, - -359, -758, -758, -758, -758, -758, -758, -758, -758}; + -1571, -1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 1223, 1223, 1223, 1223, 652, 652, 652, 652, -552, -552, -552, -552, 1015, + 1015, 1015, 1015, -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -282, + -282, -282, -282, -1544, -1544, -1544, -1544, 516, 516, 516, 516, -8, -8, + -8, -8, -320, -320, -320, -320, -666, -666, -666, -666, -1618, -1618, -1618, + -1618, -1162, -1162, -1162, -1162, 126, 126, 126, 126, 1469, 1469, 1469, + 1469, -853, -853, -853, -853, -90, -90, -90, -90, -271, -271, -271, -271, + 830, 830, 830, 830, 107, 107, 107, 107, -1421, -1421, -1421, -1421, -247, + -247, -247, -247, -951, -951, -951, -951, -398, -398, -398, -398, 961, 961, + 961, 961, -1508, -1508, -1508, -1508, -725, -725, -725, -725, 448, 
448, 448, + 448, -1065, -1065, -1065, -1065, 677, 677, 677, 677, -1275, -1275, -1275, + -1275, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 
555, + /* For intt Len=4 */ + -1275, -1275, -1275, -1275, 677, 677, 677, 677, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, -725, -725, -725, -725, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, -398, -398, -398, -398, -951, -951, -951, -951, -247, -247, + -247, -247, -1421, -1421, -1421, -1421, 107, 107, 107, 107, 830, 830, 830, + 830, -271, -271, -271, -271, -90, -90, -90, -90, -853, -853, -853, -853, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -666, -666, -666, -666, -320, -320, -320, -320, + -8, -8, -8, -8, 516, 516, 516, 516, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, 1015, 1015, + 1015, 1015, -552, -552, -552, -552, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 
202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758}; #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index d424601ac1..b5e66983fe 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -14,9 +14,7 @@ #define C1441_OFFSET 64 #define C1353_OFFSET 80 #define ZETA_NTT_OFFSET 96 -#define ZETA_NTT_OFFSET64 1104 -#define IZETA_NTT_OFFSET127 1616 -#define IZETA_NTT_OFFSET63 2128 +#define ZETA_INTT_OFFSET 1104 #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 95bf370b8b..5c7b3dba67 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -36,6 +36,17 @@ #define V_ZETA 10 #define V1441 10 +.macro Compute_4Coeffs + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + .macro Load_4Coeffs start next step mr 9, \start # j add 10, 7, 9 # J + len*2 @@ -63,14 +74,64 @@ xxpermdi 32+23, 32+23, 32+23, 2 xxpermdi 32+24, 32+24, 32+24, 2 - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + Compute_4Coeffs +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, 
aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 10, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 11, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+8, 11, 10, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+21, 11, 10, 3 # rj0 - rj4, rj8 - rj11 + lxv 10, 32(5) + lxv 11, 48(5) + xxpermdi 32+12, 11, 10, 0 + xxpermdi 32+22, 11, 10, 3 + lxv 10, 64(5) + lxv 11, 80(5) + xxpermdi 32+16, 11, 10, 0 + xxpermdi 32+23, 11, 10, 3 + lxv 10, 96(5) + lxv 11, 112(5) + xxpermdi 32+20, 11, 10, 0 + xxpermdi 32+24, 11, 10, 3 .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 @@ -131,7 +192,7 @@ .endm #----------------------------------- -# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 # Modular multification bond by 2^16 * q in abs value @@ -210,34 +271,88 @@ stxv \_vs7, -16(3) .endm -.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 - xxmrglw 32+12, \_vs0, 10 - xxmrghw 32+11, \_vs0, 10 - xxpermdi 10, 32+12, 32+11, 3 - xxmrglw 32+16, \_vs1, 11 - xxmrghw 32+15, \_vs1, 11 - xxpermdi 11, 
32+16, 32+15, 3 - xxmrglw 32+12, \_vs2, 12 - xxmrghw 32+11, \_vs2, 12 - xxpermdi 12, 32+12, 32+11, 3 - xxmrglw 32+16, \_vs3, 13 - xxmrghw 32+15, \_vs3, 13 - xxpermdi 13, 32+16, 32+15, 3 - stxvd2x 10, 3, 9 - stxvd2x 11, 3, 16 - stxvd2x 12, 3, 18 - stxvd2x 13, 3, 20 +.macro PermWriteL44 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+13, 32+14, 3 + xxpermdi 32+11, 32+13, 32+14, 0 + xxpermdi 32+12, 32+18, 32+19, 3 + xxpermdi 32+13, 32+18, 32+19, 0 + xxpermdi 32+14, 32+23, 32+24, 3 + xxpermdi 32+15, 32+23, 32+24, 0 + xxpermdi 32+16, 32+28, 32+29, 3 + xxpermdi 32+17, 32+28, 32+29, 0 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro PermWriteL24 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 .endm -.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 - xxpermdi 10, 10, \_vs0, 3 - xxpermdi 11, 11, \_vs1, 3 - xxpermdi 12, 12, \_vs2, 3 - xxpermdi 13, 13, \_vs3, 3 - 
stxvd2x 10, 3, 9 - stxvd2x 11, 3, 16 - stxvd2x 12, 3, 18 - stxvd2x 13, 3, 20 +.macro INTT_REDUCE_4X start next step + Load_4Coeffs \start, \next, \step + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .endm # intt @@ -301,164 +416,85 @@ MLK_ASM_FN_SYMBOL(intt_ppc) vslw 9, 9, 10 xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + li 15, 4 # loops + mtctr 15 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + bdnz intt_ppc__Loopf + + addi 3, 3, -512 + .align 4 # # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas - addi 14, 4, IZETA_NTT_OFFSET127 + addi 14, 4, ZETA_INTT_OFFSET li 7, 4 li 15, 4 mtctr 15 - li 5, 0 + mr 5, 3 intt_ppc__Loop2: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 + INTT_REDUCE_L24 + addi 5, 5, 128 bdnz intt_ppc__Loop2 .align 4 # # 2. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addi 14, 4, IZETA_NTT_OFFSET63 - li 5, 0 + mr 5, 3 li 7, 8 li 15, 4 # loops mtctr 15 intt_ppc__Loop4: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 + INTT_REDUCE_L44 + addi 5, 5, 128 bdnz intt_ppc__Loop4 .align 4 # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - #addi 14, 14, 512 li 7, 16 li 5, 0 + li 15, 4 # loops + mtctr 15 - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 +intt_ppc__Loop8: + INTT_REDUCE_4X 5, 32, 32 + addi 5, 5, 128 + bdnz intt_ppc__Loop8 .align 4 # # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 - #addi 14, 14, 768 li 5, 0 li 7, 32 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 + li 5, 16 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 li 5, 256 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 li 5, 272 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 .align 4 # # 5. len = 32, start = 0, 64, 128, 192 - #addi 14, 14, 896 li 5, 0 li 7, 64 @@ -504,7 +540,6 @@ intt_ppc__Loop4: .align 4 # # 6. len = 64, start = 0, 128 - #addi 14, 14, 960 li 5, 0 li 7, 128 Load_4Coeffs 5, 16, 16 @@ -547,7 +582,6 @@ intt_ppc__Loop4: .align 4 # 7. 
len = 128, start = 0 # - #addi 14, 14, 992 li 5, 0 # start li 7, 256 # len * 2 @@ -587,37 +621,6 @@ intt_ppc__Loop4: MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 -.align 4 - # - # Montgomery reduce loops with constant 1441 - # - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 0c98581c53..435e5bb52e 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -28,15 +28,7 @@ .machine "any" .text -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) -# -.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 +.macro Load_4Coeffs start next step mr 9, \start add 10, 7, 9 # J + len*2 addi 16, 9, \next @@ -53,7 +45,74 @@ xxpermdi 32+18, 32+18, 32+18, 2 xxpermdi 32+23, 32+23, 32+23, 2 xxpermdi 32+28, 32+28, 32+28, 2 +.endm +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, 
ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 1, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 2, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+13, 2, 1, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 3 # rj0 - rj4, rj8 - rj11 + lxv 3, 32(5) + lxv 4, 48(5) + xxpermdi 32+18, 4, 3, 0 + xxpermdi 32+17, 4, 3, 3 + lxv 1, 64(5) + lxv 2, 80(5) + xxpermdi 32+23, 2, 1, 0 + xxpermdi 32+22, 2, 1, 3 + lxv 3, 96(5) + lxv 4, 112(5) + xxpermdi 32+28, 4, 3, 0 + xxpermdi 32+27, 4, 3, 3 +.endm + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 # fqmul = zeta * coefficient # Modular multification bond by 2^16 * q in abs value vmladduhm 15, 13, \_vz0, 3 @@ -82,6 +141,9 @@ vsrah 23, 25, 4 # >> 1 vsrah 28, 30, 4 # >> 1 +.endm + +.macro Load_4Aj lxvd2x 32+12, 3, 9 # r[j] lxvd2x 32+17, 3, 16 # r[j] lxvd2x 32+22, 3, 18 # r[j] @@ -90,7 +152,9 @@ xxpermdi 32+17, 32+17, 32+17, 2 xxpermdi 32+22, 32+22, 32+22, 2 xxpermdi 32+27, 32+27, 32+27, 2 +.endm +.macro Compute_4Coeffs # 
Since the result of the Montgomery multiplication is bounded # by q in absolute value. # Finally to complete the final update of the results with add/sub @@ -104,6 +168,13 @@ vadduhm 30, 28, 27 # r + t .endm +.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next, \step + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Aj + Compute_4Coeffs +.endm + .macro Write_One stxvx 32+15, 3, 9 stxvx 32+16, 3, 10 @@ -115,35 +186,44 @@ stxvx 32+31, 3, 21 .endm -.macro Write_Two - xxpermdi 32+17, 32+16, 32+15, 3 - xxpermdi 32+22, 32+21, 32+20, 3 - xxpermdi 32+27, 32+26, 32+25, 3 - xxpermdi 32+29, 32+31, 32+30, 3 - - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi 0, 32+16, 32+15, 3 + xxpermdi 1, 32+16, 32+15, 0 + xxpermdi 2, 32+21, 32+20, 3 + xxpermdi 3, 32+21, 32+20, 0 + xxpermdi 4, 32+26, 32+25, 3 + xxpermdi 5, 32+26, 32+25, 0 + xxpermdi 6, 32+31, 32+30, 3 + xxpermdi 7, 32+31, 32+30, 0 + stxv 0, 0(5) + stxv 1, 16(5) + stxv 2, 32(5) + stxv 3, 48(5) + stxv 4, 64(5) + stxv 5, 80(5) + stxv 6, 96(5) + stxv 7, 112(5) .endm -.macro Write_Three - xxmrglw 32+14, 32+16, 32+15 - xxmrghw 32+13, 32+16, 32+15 - xxpermdi 32+17, 32+13, 32+14, 3 - xxmrglw 32+19, 32+21, 32+20 - xxmrghw 32+18, 32+21, 32+20 - xxpermdi 32+22, 32+18, 32+19, 3 - xxmrglw 32+14, 32+26, 32+25 - xxmrghw 32+13, 32+26, 32+25 - xxpermdi 32+27, 32+13, 32+14, 3 - xxmrglw 32+24, 32+31, 32+30 - xxmrghw 32+23, 32+31, 32+30 - xxpermdi 32+29, 32+23, 32+24, 3 - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 +.macro PermWriteL24 + Compute_4Coeffs + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) .endm .macro 
Load_next_4zetas @@ -207,16 +287,16 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 128 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 192 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -227,19 +307,19 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 7, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 256 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 320 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -250,28 +330,25 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 7, 64 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 64 li 5, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 128 li 5, 256 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 192 li 5, 384 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + 
NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -281,18 +358,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 5, 0 li 7, 32 Load_next_4zetas - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 16 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One Load_next_4zetas li 5, 256 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 272 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One .align 4 @@ -302,22 +379,22 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 5, 0 li 7, 16 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 128 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 256 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 384 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One # @@ -325,19 +402,15 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) # k += 32 li 15, 4 # loops mtctr 15 - li 5, 0 + mr 5, 3 li 7, 8 .align 4 ntt_ppc__Len4: Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 bdnz ntt_ppc__Len4 @@ -346,23 +419,17 @@ ntt_ppc__Len4: # k += 64 # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_NTT_OFFSET64 - li 15, 4 mtctr 15 - li 5, 0 + mr 5, 3 li 7, 4 .align 4 ntt_ppc__Len2: Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 
5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 bdnz ntt_ppc__Len2 diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c index 4c2fbdf61a..fa0f7097f5 100644 --- a/mlkem/src/native/ppc64le/src/consts.c +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -3,6 +3,11 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ +#include +#include +#include +#include + #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -10,7 +15,7 @@ #include "consts.h" -MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { +MLK_ALIGN const int16_t mlk_ppc_qdata[1072] = { /* -Q */ -3329, -3329, -3329, -3329, -3329, -3329, -3329, -3329, /* QINV */ @@ -44,112 +49,84 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[1568] = { 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, - -1571, -1571, -1571, -1571, -1571, -1571, 1223, 1223, 1223, 1223, 1223, - 1223, 1223, 1223, 652, 652, 652, 652, 652, 652, 652, 652, -552, -552, -552, - -552, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1015, 1015, 1015, - 1015, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1491, 1491, - 1491, 1491, 1491, 1491, 1491, 1491, -282, -282, -282, -282, -282, -282, - -282, -282, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, 516, - 516, 516, 516, 516, 516, 516, 516, -8, -8, -8, -8, -8, -8, -8, -8, -320, - -320, -320, -320, -320, -320, -320, -320, -666, -666, -666, -666, -666, - -666, -666, -666, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, - -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, 126, 126, 126, 126, - 126, 126, 126, 126, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, -853, - -853, -853, -853, -853, -853, -853, -853, -90, -90, -90, -90, -90, -90, -90, - 
-90, -271, -271, -271, -271, -271, -271, -271, -271, 830, 830, 830, 830, - 830, 830, 830, 830, 107, 107, 107, 107, 107, 107, 107, 107, -1421, -1421, - -1421, -1421, -1421, -1421, -1421, -1421, -247, -247, -247, -247, -247, - -247, -247, -247, -951, -951, -951, -951, -951, -951, -951, -951, -398, - -398, -398, -398, -398, -398, -398, -398, 961, 961, 961, 961, 961, 961, 961, - 961, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -725, -725, - -725, -725, -725, -725, -725, -725, 448, 448, 448, 448, 448, 448, 448, 448, - -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, 677, 677, 677, 677, - 677, 677, 677, 677, -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, - /* For intt Len=2, offset IZETA_NTT_OFFSET64 */ - -1103, -1103, -1103, -1103, 430, 430, 430, 430, 555, 555, 555, 555, 843, - 843, 843, 843, -1251, -1251, -1251, -1251, 871, 871, 871, 871, 1550, 1550, - 1550, 1550, 105, 105, 105, 105, 422, 422, 422, 422, 587, 587, 587, 587, 177, - 177, 177, 177, -235, -235, -235, -235, -291, -291, -291, -291, -460, -460, - -460, -460, 1574, 1574, 1574, 1574, 1653, 1653, 1653, 1653, -246, -246, - -246, -246, 778, 778, 778, 778, 1159, 1159, 1159, 1159, -147, -147, -147, - -147, -777, -777, -777, -777, 1483, 1483, 1483, 1483, -602, -602, -602, - -602, 1119, 1119, 1119, 1119, -1590, -1590, -1590, -1590, 644, 644, 644, - 644, -872, -872, -872, -872, 349, 349, 349, 349, 418, 418, 418, 418, 329, - 329, 329, 329, -156, -156, -156, -156, -75, -75, -75, -75, 817, 817, 817, - 817, 1097, 1097, 1097, 1097, 603, 603, 603, 603, 610, 610, 610, 610, 1322, - 1322, 1322, 1322, -1285, -1285, -1285, -1285, -1465, -1465, -1465, -1465, - 384, 384, 384, 384, -1215, -1215, -1215, -1215, -136, -136, -136, -136, - 1218, 1218, 1218, 1218, -1335, -1335, -1335, -1335, -874, -874, -874, -874, - 220, 220, 220, 220, -1187, -1187, -1187, -1187, -1659, -1659, -1659, -1659, - -1185, -1185, -1185, -1185, -1530, -1530, -1530, -1530, -1278, -1278, -1278, - -1278, 794, 794, 794, 794, -1510, -1510, 
-1510, -1510, -854, -854, -854, - -854, -870, -870, -870, -870, 478, 478, 478, 478, -108, -108, -108, -108, - -308, -308, -308, -308, 996, 996, 996, 996, 991, 991, 991, 991, 958, 958, - 958, 958, -1460, -1460, -1460, -1460, 1522, 1522, 1522, 1522, 1628, 1628, - 1628, 1628, - /* For intt Len=2, offset IZETA_NTT_OFFSET127 */ - 1628, 1628, 1628, 1628, 1522, 1522, 1522, 1522, -1460, -1460, -1460, -1460, - 958, 958, 958, 958, 991, 991, 991, 991, 996, 996, 996, 996, -308, -308, - -308, -308, -108, -108, -108, -108, 478, 478, 478, 478, -870, -870, -870, - -870, -854, -854, -854, -854, -1510, -1510, -1510, -1510, 794, 794, 794, - 794, -1278, -1278, -1278, -1278, -1530, -1530, -1530, -1530, -1185, -1185, - -1185, -1185, -1659, -1659, -1659, -1659, -1187, -1187, -1187, -1187, 220, - 220, 220, 220, -874, -874, -874, -874, -1335, -1335, -1335, -1335, 1218, - 1218, 1218, 1218, -136, -136, -136, -136, -1215, -1215, -1215, -1215, 384, - 384, 384, 384, -1465, -1465, -1465, -1465, -1285, -1285, -1285, -1285, 1322, - 1322, 1322, 1322, 610, 610, 610, 610, 603, 603, 603, 603, 1097, 1097, 1097, - 1097, 817, 817, 817, 817, -75, -75, -75, -75, -156, -156, -156, -156, 329, - 329, 329, 329, 418, 418, 418, 418, 349, 349, 349, 349, -872, -872, -872, - -872, 644, 644, 644, 644, -1590, -1590, -1590, -1590, 1119, 1119, 1119, - 1119, -602, -602, -602, -602, 1483, 1483, 1483, 1483, -777, -777, -777, - -777, -147, -147, -147, -147, 1159, 1159, 1159, 1159, 778, 778, 778, 778, - -246, -246, -246, -246, 1653, 1653, 1653, 1653, 1574, 1574, 1574, 1574, - -460, -460, -460, -460, -291, -291, -291, -291, -235, -235, -235, -235, 177, - 177, 177, 177, 587, 587, 587, 587, 422, 422, 422, 422, 105, 105, 105, 105, - 1550, 1550, 1550, 1550, 871, 871, 871, 871, -1251, -1251, -1251, -1251, 843, - 843, 843, 843, 555, 555, 555, 555, 430, 430, 430, 430, -1103, -1103, -1103, - -1103, - /* For intt Len=4 and others, offset IZETA_NTT_OFFSET63 */ - -1275, -1275, -1275, -1275, -1275, -1275, -1275, -1275, 677, 677, 677, 
677, - 677, 677, 677, 677, -1065, -1065, -1065, -1065, -1065, -1065, -1065, -1065, - 448, 448, 448, 448, 448, 448, 448, 448, -725, -725, -725, -725, -725, -725, - -725, -725, -1508, -1508, -1508, -1508, -1508, -1508, -1508, -1508, 961, - 961, 961, 961, 961, 961, 961, 961, -398, -398, -398, -398, -398, -398, -398, - -398, -951, -951, -951, -951, -951, -951, -951, -951, -247, -247, -247, - -247, -247, -247, -247, -247, -1421, -1421, -1421, -1421, -1421, -1421, - -1421, -1421, 107, 107, 107, 107, 107, 107, 107, 107, 830, 830, 830, 830, - 830, 830, 830, 830, -271, -271, -271, -271, -271, -271, -271, -271, -90, - -90, -90, -90, -90, -90, -90, -90, -853, -853, -853, -853, -853, -853, -853, - -853, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 1469, 126, 126, 126, 126, - 126, 126, 126, 126, -1162, -1162, -1162, -1162, -1162, -1162, -1162, -1162, - -1618, -1618, -1618, -1618, -1618, -1618, -1618, -1618, -666, -666, -666, - -666, -666, -666, -666, -666, -320, -320, -320, -320, -320, -320, -320, - -320, -8, -8, -8, -8, -8, -8, -8, -8, 516, 516, 516, 516, 516, 516, 516, - 516, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -1544, -282, -282, - -282, -282, -282, -282, -282, -282, 1491, 1491, 1491, 1491, 1491, 1491, - 1491, 1491, -1293, -1293, -1293, -1293, -1293, -1293, -1293, -1293, 1015, - 1015, 1015, 1015, 1015, 1015, 1015, 1015, -552, -552, -552, -552, -552, - -552, -552, -552, 652, 652, 652, 652, 652, 652, 652, 652, 1223, 1223, 1223, - 1223, 1223, 1223, 1223, 1223, -1571, -1571, -1571, -1571, -1571, -1571, - -1571, -1571, -205, -205, -205, -205, -205, -205, -205, -205, 411, 411, 411, - 411, 411, 411, 411, 411, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - -1542, 608, 608, 608, 608, 608, 608, 608, 608, 732, 732, 732, 732, 732, 732, - 732, 732, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, -681, -681, -681, - -681, -681, -681, -681, -681, -130, -130, -130, -130, -130, -130, -130, - -130, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, 1458, 1458, - 1458, 1458, 
1458, 1458, 1458, 1458, -829, -829, -829, -829, -829, -829, - -829, -829, 383, 383, 383, 383, 383, 383, 383, 383, 264, 264, 264, 264, 264, - 264, 264, 264, -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 573, - 573, 573, 573, 573, 573, 573, 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - 1468, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1202, -1202, - -1202, -1202, -1202, -1202, -1202, -1202, 962, 962, 962, 962, 962, 962, 962, - 962, 182, 182, 182, 182, 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 622, 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, - -171, -171, -171, -171, -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, - 287, 287, 287, 287, 287, 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 1422, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, - -1517, -1517, -1517, -1517, -1517, -359, -359, -359, -359, -359, -359, -359, - -359, -758, -758, -758, -758, -758, -758, -758, -758}; + -1571, -1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 1223, 1223, 1223, 1223, 652, 652, 652, 652, -552, -552, -552, -552, 1015, + 1015, 1015, 1015, -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -282, + -282, -282, -282, -1544, -1544, -1544, -1544, 516, 516, 516, 516, -8, -8, + -8, -8, -320, -320, -320, -320, -666, -666, -666, -666, -1618, -1618, -1618, + -1618, -1162, -1162, -1162, -1162, 126, 126, 126, 126, 1469, 1469, 1469, + 1469, -853, -853, -853, -853, -90, -90, -90, -90, -271, -271, -271, -271, + 830, 830, 830, 830, 107, 107, 107, 107, -1421, -1421, -1421, -1421, -247, + -247, -247, -247, -951, -951, -951, -951, -398, -398, -398, -398, 961, 961, + 961, 961, -1508, -1508, -1508, -1508, -725, -725, -725, -725, 448, 448, 448, + 448, -1065, -1065, -1065, -1065, 677, 677, 677, 677, -1275, -1275, -1275, + -1275, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 
555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, + /* For intt Len=4 */ + -1275, -1275, -1275, -1275, 677, 677, 677, 677, -1065, -1065, -1065, -1065, + 448, 448, 448, 448, -725, -725, -725, -725, -1508, -1508, -1508, -1508, 961, + 961, 961, 961, -398, -398, -398, -398, -951, -951, -951, -951, -247, -247, + -247, 
-247, -1421, -1421, -1421, -1421, 107, 107, 107, 107, 830, 830, 830, + 830, -271, -271, -271, -271, -90, -90, -90, -90, -853, -853, -853, -853, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, -1162, -1162, -1162, -1162, + -1618, -1618, -1618, -1618, -666, -666, -666, -666, -320, -320, -320, -320, + -8, -8, -8, -8, 516, 516, 516, 516, -1544, -1544, -1544, -1544, -282, -282, + -282, -282, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, 1015, 1015, + 1015, 1015, -552, -552, -552, -552, 652, 652, 652, 652, 1223, 1223, 1223, + 1223, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, 
-359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758}; #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h index 49f519d0c3..df5d163f78 100644 --- a/mlkem/src/native/ppc64le/src/consts.h +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -14,9 +14,7 @@ #define C1441_OFFSET 64 #define C1353_OFFSET 80 #define ZETA_NTT_OFFSET 96 -#define ZETA_NTT_OFFSET64 1104 -#define IZETA_NTT_OFFSET127 1616 -#define IZETA_NTT_OFFSET63 2128 +#define ZETA_INTT_OFFSET 1104 #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 817c8c2997..65df15b996 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -35,6 +35,17 @@ #define V_ZETA 10 #define V1441 10 +.macro Compute_4Coeffs + vsubuhm 25, 8, 21 # r[j+len] - t + vsubuhm 26, 12, 22 # r[j+len] - t + vsubuhm 30, 16, 23 # r[j+len] - t + vsubuhm 31, 20, 24 # r[j+len] - t + vadduhm 8, 8, 21 # r[j+len] + t + vadduhm 12, 12, 22 # r[j+len] + t + vadduhm 16, 16, 23 # r[j+len] + t + vadduhm 20, 20, 24 # r[j+len] + t +.endm + .macro Load_4Coeffs start next step mr 9, \start # j add 10, 7, 9 # J + len*2 @@ -62,14 +73,64 @@ xxpermdi 32+23, 32+23, 32+23, 2 xxpermdi 32+24, 32+24, 32+24, 2 - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + Compute_4Coeffs +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, 
ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 +.endm + +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 10, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 11, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+8, 11, 10, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+21, 11, 10, 3 # rj0 - rj4, rj8 - rj11 + lxv 10, 32(5) + lxv 11, 48(5) + xxpermdi 32+12, 11, 10, 0 + xxpermdi 32+22, 11, 10, 3 + lxv 10, 64(5) + lxv 11, 80(5) + xxpermdi 32+16, 11, 10, 0 + xxpermdi 32+23, 11, 10, 3 + lxv 10, 96(5) + lxv 11, 112(5) + xxpermdi 32+20, 11, 10, 0 + xxpermdi 32+24, 11, 10, 3 .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 @@ -130,7 +191,7 @@ .endm #----------------------------------- -# MREDUCE_4X(len, start, _vz0, _vz1, _vz2, _vz3) +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) # .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 # Modular multification bond by 2^16 * q in abs value @@ -209,34 +270,88 @@ stxv \_vs7, -16(3) .endm -.macro Write_Len2_4C _vs0 _vs1 _vs2 _vs3 - xxmrglw 32+12, \_vs0, 10 - xxmrghw 32+11, \_vs0, 10 - xxpermdi 10, 32+12, 32+11, 3 - xxmrglw 32+16, \_vs1, 11 - xxmrghw 32+15, \_vs1, 11 - xxpermdi 11, 32+16, 32+15, 3 - xxmrglw 32+12, \_vs2, 12 - xxmrghw 32+11, \_vs2, 12 - xxpermdi 12, 32+12, 32+11, 3 - xxmrglw 32+16, \_vs3, 13 - xxmrghw 32+15, \_vs3, 13 - xxpermdi 13, 
32+16, 32+15, 3 - stxvd2x 10, 3, 9 - stxvd2x 11, 3, 16 - stxvd2x 12, 3, 18 - stxvd2x 13, 3, 20 +.macro PermWriteL44 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+13, 32+14, 3 + xxpermdi 32+11, 32+13, 32+14, 0 + xxpermdi 32+12, 32+18, 32+19, 3 + xxpermdi 32+13, 32+18, 32+19, 0 + xxpermdi 32+14, 32+23, 32+24, 3 + xxpermdi 32+15, 32+23, 32+24, 0 + xxpermdi 32+16, 32+28, 32+29, 3 + xxpermdi 32+17, 32+28, 32+29, 0 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro PermWriteL24 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) +.endm + +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL24 +.endm + +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 .endm -.macro Write_Len4_4C _vs0 _vs1 _vs2 _vs3 - xxpermdi 10, 10, \_vs0, 3 - xxpermdi 11, 11, \_vs1, 3 - xxpermdi 12, 12, \_vs2, 3 - xxpermdi 13, 13, \_vs3, 3 - stxvd2x 10, 3, 9 - stxvd2x 11, 3, 16 - stxvd2x 12, 3, 18 - stxvd2x 13, 3, 20 +.macro INTT_REDUCE_4X start next step + Load_4Coeffs \start, \next, \step + BREDUCE_4X 4, 9, 13, 
17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .endm # intt @@ -300,164 +415,85 @@ MLK_ASM_FN_SYMBOL(intt_ppc) vslw 9, 9, 10 xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + # + # Montgomery reduce loops with constant 1441 + # + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + li 15, 4 # loops + mtctr 15 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + bdnz intt_ppc__Loopf + + addi 3, 3, -512 + .align 4 # # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 # Update zetas vectors, each vector has 2 zetas - addi 14, 4, IZETA_NTT_OFFSET127 + addi 14, 4, ZETA_INTT_OFFSET li 7, 4 li 15, 4 mtctr 15 - li 5, 0 + mr 5, 3 intt_ppc__Loop2: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len2_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 + INTT_REDUCE_L24 + addi 5, 5, 128 bdnz intt_ppc__Loop2 .align 4 # # 2. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 - addi 14, 4, IZETA_NTT_OFFSET63 - li 5, 0 + mr 5, 3 li 7, 8 li 15, 4 # loops mtctr 15 intt_ppc__Loop4: - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_Len4_4C 32+13, 32+18, 32+23, 32+28 - addi 5, 5, 64 + INTT_REDUCE_L44 + addi 5, 5, 128 bdnz intt_ppc__Loop4 .align 4 # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - #addi 14, 14, 512 li 7, 16 li 5, 0 + li 15, 4 # loops + mtctr 15 - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 32, 32 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 +intt_ppc__Loop8: + INTT_REDUCE_4X 5, 32, 32 + addi 5, 5, 128 + bdnz intt_ppc__Loop8 .align 4 # # 4. 
len = 16, start = 0, 32, 64,,...160, 192, 224 - #addi 14, 14, 768 li 5, 0 li 7, 32 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 + li 5, 16 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 li 5, 256 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 li 5, 272 - Load_4Coeffs 5, 64, 64 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts addi 14, 14, -64 - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + INTT_REDUCE_4X 5, 64, 64 .align 4 # # 5. len = 32, start = 0, 64, 128, 192 - #addi 14, 14, 896 li 5, 0 li 7, 64 @@ -503,7 +539,6 @@ intt_ppc__Loop4: .align 4 # # 6. len = 64, start = 0, 128 - #addi 14, 14, 960 li 5, 0 li 7, 128 Load_4Coeffs 5, 16, 16 @@ -546,7 +581,6 @@ intt_ppc__Loop4: .align 4 # 7. 
len = 128, start = 0 # - #addi 14, 14, 992 li 5, 0 # start li 7, 256 # len * 2 @@ -586,37 +620,6 @@ intt_ppc__Loop4: MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 -.align 4 - # - # Montgomery reduce loops with constant 1441 - # - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 - Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - lxv 32+20, 128(1) lxv 32+21, 144(1) lxv 32+22, 160(1) @@ -660,18 +663,3 @@ intt_ppc__Loop4: #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ -#undef V20159 -#undef V_25 -#undef V_26 -#undef V_MKQ -#undef V_QINV -#undef V_NMKQ -#undef V_Z0 -#undef V_Z1 -#undef V_Z2 -#undef V_Z3 -#undef V_ZETA -#undef V1441 diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 83f42f9b8a..70e7bf7104 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -27,15 +27,7 @@ .machine "any" .text -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(start, _vz0, _vz1, _vz2, _vz3) -# -.macro MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 +.macro Load_4Coeffs start next step mr 9, \start add 10, 7, 9 # J + len*2 addi 16, 9, \next @@ -52,7 +44,74 @@ xxpermdi 32+18, 32+18, 32+18, 2 xxpermdi 32+23, 32+23, 32+23, 2 xxpermdi 32+28, 32+28, 32+28, 2 +.endm + +# +# Load Coeffients and setup vectors +# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 +# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 +# +# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 +# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 +# +.macro Load_L24Coeffs + lxv 32+25, 0(5) # a[j], r[j+len] + lxv 32+26, 16(5) # a[j], r[j+len] + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxv 32+25, 32(5) # a[j], r[j+len] + lxv 32+26, 48(5) # a[j], r[j+len] + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxv 32+25, 64(5) # a[j], r[j+len] + lxv 32+26, 80(5) # a[j], r[j+len] + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxv 32+25, 96(5) # a[j], r[j+len] + lxv 32+26, 112(5) # a[j], r[j+len] + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 +.endm +# +# Permute +# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 +# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 +# +# to +# rjlen4 - rjlen7, rjlen12 - rjlen15 +# rj0 - rj4, rj8 - rj11 +# +.macro Load_L44Coeffs + lxv 1, 0(5) # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxv 2, 16(5) # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, 
rjlen14, rjlen15 + xxpermdi 32+13, 2, 1, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 3 # rj0 - rj4, rj8 - rj11 + lxv 3, 32(5) + lxv 4, 48(5) + xxpermdi 32+18, 4, 3, 0 + xxpermdi 32+17, 4, 3, 3 + lxv 1, 64(5) + lxv 2, 80(5) + xxpermdi 32+23, 2, 1, 0 + xxpermdi 32+22, 2, 1, 3 + lxv 3, 96(5) + lxv 4, 112(5) + xxpermdi 32+28, 4, 3, 0 + xxpermdi 32+27, 4, 3, 3 +.endm + +# +# montgomery_reduce +# t = a * QINV +# t = (a - (int32_t)t*_MLKEM_Q) >> 16 +# +#----------------------------------- +# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) +# +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 # fqmul = zeta * coefficient # Modular multification bond by 2^16 * q in abs value vmladduhm 15, 13, \_vz0, 3 @@ -81,6 +140,9 @@ vsrah 23, 25, 4 # >> 1 vsrah 28, 30, 4 # >> 1 +.endm + +.macro Load_4Aj lxvd2x 32+12, 3, 9 # r[j] lxvd2x 32+17, 3, 16 # r[j] lxvd2x 32+22, 3, 18 # r[j] @@ -89,7 +151,9 @@ xxpermdi 32+17, 32+17, 32+17, 2 xxpermdi 32+22, 32+22, 32+22, 2 xxpermdi 32+27, 32+27, 32+27, 2 +.endm +.macro Compute_4Coeffs # Since the result of the Montgomery multiplication is bounded # by q in absolute value. 
# Finally to complete the final update of the results with add/sub @@ -103,6 +167,13 @@ vadduhm 30, 28, 27 # r + t .endm +.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next, \step + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Aj + Compute_4Coeffs +.endm + .macro Write_One stxvx 32+15, 3, 9 stxvx 32+16, 3, 10 @@ -114,35 +185,44 @@ stxvx 32+31, 3, 21 .endm -.macro Write_Two - xxpermdi 32+17, 32+16, 32+15, 3 - xxpermdi 32+22, 32+21, 32+20, 3 - xxpermdi 32+27, 32+26, 32+25, 3 - xxpermdi 32+29, 32+31, 32+30, 3 - - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi 0, 32+16, 32+15, 3 + xxpermdi 1, 32+16, 32+15, 0 + xxpermdi 2, 32+21, 32+20, 3 + xxpermdi 3, 32+21, 32+20, 0 + xxpermdi 4, 32+26, 32+25, 3 + xxpermdi 5, 32+26, 32+25, 0 + xxpermdi 6, 32+31, 32+30, 3 + xxpermdi 7, 32+31, 32+30, 0 + stxv 0, 0(5) + stxv 1, 16(5) + stxv 2, 32(5) + stxv 3, 48(5) + stxv 4, 64(5) + stxv 5, 80(5) + stxv 6, 96(5) + stxv 7, 112(5) .endm -.macro Write_Three - xxmrglw 32+14, 32+16, 32+15 - xxmrghw 32+13, 32+16, 32+15 - xxpermdi 32+17, 32+13, 32+14, 3 - xxmrglw 32+19, 32+21, 32+20 - xxmrghw 32+18, 32+21, 32+20 - xxpermdi 32+22, 32+18, 32+19, 3 - xxmrglw 32+14, 32+26, 32+25 - xxmrghw 32+13, 32+26, 32+25 - xxpermdi 32+27, 32+13, 32+14, 3 - xxmrglw 32+24, 32+31, 32+30 - xxmrghw 32+23, 32+31, 32+30 - xxpermdi 32+29, 32+23, 32+24, 3 - stxvx 32+17, 3, 9 - stxvx 32+22, 3, 16 - stxvx 32+27, 3, 18 - stxvx 32+29, 3, 20 +.macro PermWriteL24 + Compute_4Coeffs + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxv 32+10, 0(5) + stxv 32+11, 16(5) + stxv 32+12, 32(5) + stxv 32+13, 48(5) + stxv 32+14, 64(5) + stxv 32+15, 80(5) + stxv 32+16, 96(5) + stxv 32+17, 112(5) .endm .macro Load_next_4zetas @@ -206,16 +286,16 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) lvx V_ZETA, 0, 14 addi 14, 14, 16 - 
MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 128 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 192 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -226,19 +306,19 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 7, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 64 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 256 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One li 5, 320 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -249,28 +329,25 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 7, 64 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 64 li 5, 128 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 128 li 5, 256 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One - #li 5, 192 li 5, 384 lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA Write_One .align 4 @@ -280,18 +357,18 @@ 
MLK_ASM_FN_SYMBOL(ntt_ppc) li 5, 0 li 7, 32 Load_next_4zetas - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 16 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One Load_next_4zetas li 5, 256 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 272 - MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 Write_One .align 4 @@ -301,22 +378,22 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 5, 0 li 7, 16 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 128 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 256 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One li 5, 384 Load_next_4zetas - MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 Write_One # @@ -324,19 +401,15 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) # k += 32 li 15, 4 # loops mtctr 15 - li 5, 0 + mr 5, 3 li 7, 8 .align 4 ntt_ppc__Len4: Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Two - addi 5, 5, 64 + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 bdnz ntt_ppc__Len4 @@ -345,23 +418,17 @@ ntt_ppc__Len4: # k += 64 # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_NTT_OFFSET64 - li 15, 4 mtctr 15 - li 5, 0 + mr 5, 3 li 7, 4 .align 4 ntt_ppc__Len2: Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 - - Load_next_4zetas - MREDUCE_4X 5, 16, 16, V_Z0, V_Z1, V_Z2, V_Z3 - Write_Three - addi 5, 5, 64 + Load_L24Coeffs + MREDUCE_4X 
V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 bdnz ntt_ppc__Len2 @@ -399,9 +466,3 @@ ntt_ppc__Len2: #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ - -/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. - * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V_QINV -#undef V_NMKQ -#undef V_ZETA From 7b09ee4b818f87dc047b3c0880d6d48f17601682 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Fri, 17 Oct 2025 15:28:42 -0400 Subject: [PATCH 17/22] Fixed illegal instructions used for p8 arch, lxv and stxv. Also fixed instruction byte orerding mismatch for p8 and p9/10, lxvx/stxvx and lxvd2x/stxvd2x. Used lxvd2x and stxvd2x for consistant byte ordering. Signed-off-by: Danny Tsen --- dev/ppc64le/src/intt_ppc.S | 334 ++++++++++++--------- dev/ppc64le/src/ntt_ppc.S | 279 +++++++++-------- dev/ppc64le/src/poly_tomont.S | 83 +++-- dev/ppc64le/src/reduce.S | 59 ++-- mlkem/src/native/ppc64le/src/intt_ppc.S | 334 ++++++++++++--------- mlkem/src/native/ppc64le/src/ntt_ppc.S | 279 +++++++++-------- mlkem/src/native/ppc64le/src/poly_tomont.S | 77 +++-- mlkem/src/native/ppc64le/src/reduce.S | 52 ++-- 8 files changed, 883 insertions(+), 614 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 5c7b3dba67..4fc49edcd6 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -36,6 +36,81 @@ #define V_ZETA 10 #define V1441 10 +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 
32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + .macro Compute_4Coeffs vsubuhm 25, 8, 21 # r[j+len] - t vsubuhm 26, 12, 22 # r[j+len] - t @@ -60,19 +135,11 @@ lxvd2x 32+12, 3, 17 # r[j+len] lxvd2x 32+16, 3, 19 # r[j+len] lxvd2x 32+20, 3, 21 # r[j+len] - xxpermdi 32+8, 32+8, 32+8, 2 - xxpermdi 32+12, 32+12, 32+12, 2 - xxpermdi 32+16, 32+16, 32+16, 2 - xxpermdi 32+20, 32+20, 32+20, 2 lxvd2x 32+21, 3, 9 lxvd2x 32+22, 3, 16 lxvd2x 32+23, 3, 18 lxvd2x 32+24, 3, 20 - xxpermdi 32+21, 32+21, 32+21, 2 - xxpermdi 32+22, 32+22, 32+22, 2 - xxpermdi 32+23, 32+23, 32+23, 2 - xxpermdi 32+24, 32+24, 32+24, 2 Compute_4Coeffs .endm @@ -86,20 +153,20 @@ # a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 # .macro Load_L24Coeffs - lxv 32+25, 0(5) # a[j], r[j+len] - lxv 32+26, 16(5) # a[j], r[j+len] + lxvd2x 32+25, 0, 5 # a[j], r[j+len] + lxvd2x 32+26, 10, 5 # a[j], r[j+len] vmrgew 8, 25, 26 vmrgow 21, 25, 26 - lxv 32+25, 32(5) # a[j], r[j+len] - lxv 32+26, 48(5) # a[j], r[j+len] + lxvd2x 32+25, 11, 5 # a[j], r[j+len] + lxvd2x 32+26, 12, 5 # a[j], r[j+len] vmrgew 12, 25, 26 vmrgow 22, 25, 26 - lxv 32+25, 64(5) # a[j], r[j+len] - lxv 32+26, 80(5) # a[j], r[j+len] + lxvd2x 32+25, 15, 5 # a[j], r[j+len] + lxvd2x 32+26, 16, 5 # a[j], r[j+len] vmrgew 16, 25, 26 vmrgow 23, 25, 26 - lxv 32+25, 96(5) # a[j], r[j+len] - lxv 32+26, 112(5) # a[j], r[j+len] + lxvd2x 32+25, 17, 5 # a[j], 
r[j+len] + lxvd2x 32+26, 18, 5 # a[j], r[j+len] vmrgew 20, 25, 26 vmrgow 24, 25, 26 .endm @@ -114,24 +181,24 @@ # rj0 - rj4, rj8 - rj11 # .macro Load_L44Coeffs - lxv 10, 0(5) # rj0, rj1, rj2, rj3, - # rjlen4, rjlen5, rjlen6, rjlen7 - lxv 11, 16(5) # rj8, rj9, rj10, rj11 - # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+8, 11, 10, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+21, 11, 10, 3 # rj0 - rj4, rj8 - rj11 - lxv 10, 32(5) - lxv 11, 48(5) - xxpermdi 32+12, 11, 10, 0 - xxpermdi 32+22, 11, 10, 3 - lxv 10, 64(5) - lxv 11, 80(5) - xxpermdi 32+16, 11, 10, 0 - xxpermdi 32+23, 11, 10, 3 - lxv 10, 96(5) - lxv 11, 112(5) - xxpermdi 32+20, 11, 10, 0 - xxpermdi 32+24, 11, 10, 3 + lxvd2x 10, 0, 5 # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxvd2x 11, 10, 5 # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+8, 11, 10, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+21, 11, 10, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxpermdi 32+12, 11, 10, 3 + xxpermdi 32+22, 11, 10, 0 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxpermdi 32+16, 11, 10, 3 + xxpermdi 32+23, 11, 10, 0 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxpermdi 32+20, 11, 10, 3 + xxpermdi 32+24, 11, 10, 0 .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 @@ -231,44 +298,56 @@ .endm .macro Load_next_4zetas - lxv 32+V_Z0, 0(14) - lxv 32+V_Z1, 16(14) - lxv 32+V_Z2, 32(14) - lxv 32+V_Z3, 48(14) - addi 14, 14, 64 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 8, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 +.endm + +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvx \_vs0, 3, 9 - stxvx \_vs1, 3, 16 - stxvx \_vs2, 3, 18 - stxvx \_vs3, 3, 20 + stxvd2x \_vs0, 3, 9 + stxvd2x \_vs1, 3, 16 + stxvd2x \_vs2, 3, 18 + stxvd2x \_vs3, 3, 20 
.endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvx \_vs0, 3, 10 - stxvx \_vs1, 3, 17 - stxvx \_vs2, 3, 19 - stxvx \_vs3, 3, 21 + stxvd2x \_vs0, 3, 10 + stxvd2x \_vs1, 3, 17 + stxvd2x \_vs2, 3, 19 + stxvd2x \_vs3, 3, 21 .endm .macro Reload_4coeffs - lxv 32+25, 0(3) - lxv 32+26, 16(3) - lxv 32+30, 32(3) - lxv 32+31, 48(3) + lxvd2x 32+25, 0, 3 + lxvd2x 32+26, 10, 3 + lxvd2x 32+30, 11, 3 + lxvd2x 32+31, 12, 3 addi 3, 3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - stxv \_vs0, -128(3) - stxv \_vs1, -112(3) - stxv \_vs2, -96(3) - stxv \_vs3, -80(3) - stxv \_vs4, -64(3) - stxv \_vs5, -48(3) - stxv \_vs6, -32(3) - stxv \_vs7, -16(3) + addi 3, 3, -128 + stxvd2x \_vs0, 0, 3 + stxvd2x \_vs1, 10, 3 + stxvd2x \_vs2, 11, 3 + stxvd2x \_vs3, 12, 3 + stxvd2x \_vs4, 15, 3 + stxvd2x \_vs5, 16, 3 + stxvd2x \_vs6, 17, 3 + stxvd2x \_vs7, 18, 3 + addi 3, 3, 128 .endm .macro PermWriteL44 @@ -276,22 +355,22 @@ xxlor 32+19, 11, 11 xxlor 32+24, 12, 12 xxlor 32+29, 13, 13 - xxpermdi 32+10, 32+13, 32+14, 3 - xxpermdi 32+11, 32+13, 32+14, 0 - xxpermdi 32+12, 32+18, 32+19, 3 - xxpermdi 32+13, 32+18, 32+19, 0 - xxpermdi 32+14, 32+23, 32+24, 3 - xxpermdi 32+15, 32+23, 32+24, 0 - xxpermdi 32+16, 32+28, 32+29, 3 - xxpermdi 32+17, 32+28, 32+29, 0 - stxv 32+10, 0(5) - stxv 32+11, 16(5) - stxv 32+12, 32(5) - stxv 32+13, 48(5) - stxv 32+14, 64(5) - stxv 32+15, 80(5) - stxv 32+16, 96(5) - stxv 32+17, 112(5) + xxpermdi 32+10, 32+14, 32+13, 3 + xxpermdi 32+11, 32+14, 32+13, 0 + xxpermdi 32+12, 32+19, 32+18, 3 + xxpermdi 32+13, 32+19, 32+18, 0 + xxpermdi 32+14, 32+24, 32+23, 3 + xxpermdi 32+15, 32+24, 32+23, 0 + xxpermdi 32+16, 32+29, 32+28, 3 + xxpermdi 32+17, 32+29, 32+28, 0 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro PermWriteL24 @@ -307,14 +386,14 @@ vmrgow 15, 23, 24 vmrgew 16, 28, 29 vmrgow 17, 28, 29 - stxv 32+10, 0(5) - stxv 
32+11, 16(5) - stxv 32+12, 32(5) - stxv 32+13, 48(5) - stxv 32+14, 64(5) - stxv 32+15, 80(5) - stxv 32+16, 96(5) - stxv 32+17, 112(5) + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro INTT_REDUCE_L24 @@ -341,6 +420,7 @@ xxlor 13, 32+17, 32+17 Set_mont_consts Load_next_4zetas + Perm_4zetas MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 PermWriteL44 .endm @@ -368,34 +448,14 @@ .align 4 MLK_ASM_FN_SYMBOL(intt_ppc) - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - stxv 32+31, 304(1) + SAVE_REGS # init vectors and constants # Setup for Montgomery reduce - lxv 0, 0(4) + lxvx 0, 0, 4 - lxv 32+V_QINV, QINV_OFFSET(4) # QINV + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, 4 # QINV xxlxor 32+3, 32+3, 32+3 vspltish 4, 1 xxlor 2, 32+2, 32+2 # QINV @@ -403,9 +463,10 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 4, 32+4, 32+4 # 1 # Setup for Barrett reduce - lxv 6, Q_OFFSET(4) # V_MKQ - lxv 32+V20159, C20159_OFFSET(4) # V20159 - lxv 7, 0(4) # V_25 + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, 4 # V_MKQ + lxvx 32+V20159, 11, 4 # V20159 vspltisw 8, 13 vadduwm 8, 8, 8 @@ -416,13 +477,21 @@ MLK_ASM_FN_SYMBOL(intt_ppc) vslw 9, 9, 10 xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + # # Montgomery reduce loops with constant 1441 # addi 14, 4, C1441_OFFSET lvx V1441, 0, 14 - li 15, 4 # loops - mtctr 15 + li 8, 4 # loops + mtctr 8 Set_mont_consts intt_ppc__Loopf: @@ -441,8 +510,8 @@ intt_ppc__Loopf: # Update zetas vectors, each vector 
has 2 zetas addi 14, 4, ZETA_INTT_OFFSET li 7, 4 - li 15, 4 - mtctr 15 + li 8, 4 + mtctr 8 mr 5, 3 intt_ppc__Loop2: INTT_REDUCE_L24 @@ -454,8 +523,8 @@ intt_ppc__Loop2: # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 mr 5, 3 li 7, 8 - li 15, 4 # loops - mtctr 15 + li 8, 4 # loops + mtctr 8 intt_ppc__Loop4: INTT_REDUCE_L44 addi 5, 5, 128 @@ -547,7 +616,6 @@ intt_ppc__Loop8: Write_B4C 32+4, 32+9, 32+13, 32+17 Set_mont_consts lvx V_ZETA, 0, 14 - addi 14, 14, 16 MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 64 @@ -556,7 +624,8 @@ intt_ppc__Loop8: BREDUCE_4X 4, 9, 13, 17 Write_B4C 32+4, 32+9, 32+13, 32+17 Set_mont_consts - lxv 32+10, -16(14) + lvx V_ZETA, 0, 14 + addi 14, 14, 16 MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 256 @@ -566,7 +635,6 @@ intt_ppc__Loop8: Write_B4C 32+4, 32+9, 32+13, 32+17 Set_mont_consts lvx V_ZETA, 0, 14 - addi 14, 14, 16 MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 320 @@ -575,7 +643,8 @@ intt_ppc__Loop8: BREDUCE_4X 4, 9, 13, 17 Write_B4C 32+4, 32+9, 32+13, 32+17 Set_mont_consts - lxv 32+10, -16(14) + lvx V_ZETA, 0, 14 + addi 14, 14, 16 MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 @@ -621,30 +690,7 @@ intt_ppc__Loop8: MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - lxv 32+31, 304(1) - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + RESTORE_REGS blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 435e5bb52e..c8dba7b27e 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -28,6 +28,81 @@ .machine "any" .text +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + .macro Load_4Coeffs start next step mr 9, \start add 10, 7, 9 # J + len*2 @@ -41,10 +116,6 @@ lxvd2x 32+18, 3, 17 # r[j+len] lxvd2x 32+23, 3, 19 # r[j+len] lxvd2x 32+28, 3, 21 # r[j+len] - xxpermdi 32+13, 32+13, 32+13, 2 - xxpermdi 32+18, 32+18, 32+18, 2 - xxpermdi 32+23, 32+23, 32+23, 2 - xxpermdi 32+28, 32+28, 32+28, 2 .endm # @@ -56,20 +127,20 @@ # a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 # .macro Load_L24Coeffs - lxv 32+25, 0(5) # a[j], r[j+len] - lxv 32+26, 16(5) # a[j], r[j+len] + lxvd2x 32+25, 0, 5 # a[j], r[j+len] + lxvd2x 32+26, 10, 5 # a[j], r[j+len] vmrgew 13, 25, 26 
vmrgow 12, 25, 26 - lxv 32+25, 32(5) # a[j], r[j+len] - lxv 32+26, 48(5) # a[j], r[j+len] + lxvd2x 32+25, 11, 5 # a[j], r[j+len] + lxvd2x 32+26, 12, 5 # a[j], r[j+len] vmrgew 18, 25, 26 vmrgow 17, 25, 26 - lxv 32+25, 64(5) # a[j], r[j+len] - lxv 32+26, 80(5) # a[j], r[j+len] + lxvd2x 32+25, 15, 5 # a[j], r[j+len] + lxvd2x 32+26, 16, 5 # a[j], r[j+len] vmrgew 23, 25, 26 vmrgow 22, 25, 26 - lxv 32+25, 96(5) # a[j], r[j+len] - lxv 32+26, 112(5) # a[j], r[j+len] + lxvd2x 32+25, 17, 5 # a[j], r[j+len] + lxvd2x 32+26, 18, 5 # a[j], r[j+len] vmrgew 28, 25, 26 vmrgow 27, 25, 26 .endm @@ -84,24 +155,24 @@ # rj0 - rj4, rj8 - rj11 # .macro Load_L44Coeffs - lxv 1, 0(5) # rj0, rj1, rj2, rj3, + lxvd2x 1, 0, 5 # rj0, rj1, rj2, rj3, # rjlen4, rjlen5, rjlen6, rjlen7 - lxv 2, 16(5) # rj8, rj9, rj10, rj11 + lxvd2x 2, 10, 5 # rj8, rj9, rj10, rj11 # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+13, 2, 1, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+12, 2, 1, 3 # rj0 - rj4, rj8 - rj11 - lxv 3, 32(5) - lxv 4, 48(5) - xxpermdi 32+18, 4, 3, 0 - xxpermdi 32+17, 4, 3, 3 - lxv 1, 64(5) - lxv 2, 80(5) - xxpermdi 32+23, 2, 1, 0 - xxpermdi 32+22, 2, 1, 3 - lxv 3, 96(5) - lxv 4, 112(5) - xxpermdi 32+28, 4, 3, 0 - xxpermdi 32+27, 4, 3, 3 + xxpermdi 32+13, 2, 1, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxpermdi 32+18, 4, 3, 3 + xxpermdi 32+17, 4, 3, 0 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxpermdi 32+23, 2, 1, 3 + xxpermdi 32+22, 2, 1, 0 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxpermdi 32+28, 4, 3, 3 + xxpermdi 32+27, 4, 3, 0 .endm # @@ -148,10 +219,6 @@ lxvd2x 32+17, 3, 16 # r[j] lxvd2x 32+22, 3, 18 # r[j] lxvd2x 32+27, 3, 20 # r[j] - xxpermdi 32+12, 32+12, 32+12, 2 - xxpermdi 32+17, 32+17, 32+17, 2 - xxpermdi 32+22, 32+22, 32+22, 2 - xxpermdi 32+27, 32+27, 32+27, 2 .endm .macro Compute_4Coeffs @@ -176,34 +243,34 @@ .endm .macro Write_One - stxvx 32+15, 3, 9 - stxvx 32+16, 3, 10 - stxvx 32+20, 3, 16 - 
stxvx 32+21, 3, 17 - stxvx 32+25, 3, 18 - stxvx 32+26, 3, 19 - stxvx 32+30, 3, 20 - stxvx 32+31, 3, 21 + stxvd2x 32+15, 3, 9 + stxvd2x 32+16, 3, 10 + stxvd2x 32+20, 3, 16 + stxvd2x 32+21, 3, 17 + stxvd2x 32+25, 3, 18 + stxvd2x 32+26, 3, 19 + stxvd2x 32+30, 3, 20 + stxvd2x 32+31, 3, 21 .endm .macro PermWriteL44 Compute_4Coeffs - xxpermdi 0, 32+16, 32+15, 3 - xxpermdi 1, 32+16, 32+15, 0 - xxpermdi 2, 32+21, 32+20, 3 - xxpermdi 3, 32+21, 32+20, 0 - xxpermdi 4, 32+26, 32+25, 3 - xxpermdi 5, 32+26, 32+25, 0 - xxpermdi 6, 32+31, 32+30, 3 - xxpermdi 7, 32+31, 32+30, 0 - stxv 0, 0(5) - stxv 1, 16(5) - stxv 2, 32(5) - stxv 3, 48(5) - stxv 4, 64(5) - stxv 5, 80(5) - stxv 6, 96(5) - stxv 7, 112(5) + xxpermdi 0, 32+15, 32+16, 3 + xxpermdi 1, 32+15, 32+16, 0 + xxpermdi 2, 32+20, 32+21, 3 + xxpermdi 3, 32+20, 32+21, 0 + xxpermdi 4, 32+25, 32+26, 3 + xxpermdi 5, 32+25, 32+26, 0 + xxpermdi 6, 32+30, 32+31, 3 + xxpermdi 7, 32+30, 32+31, 0 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 .endm .macro PermWriteL24 @@ -216,24 +283,34 @@ vmrgow 15, 26, 25 vmrgew 16, 31, 30 vmrgow 17, 31, 30 - stxv 32+10, 0(5) - stxv 32+11, 16(5) - stxv 32+12, 32(5) - stxv 32+13, 48(5) - stxv 32+14, 64(5) - stxv 32+15, 80(5) - stxv 32+16, 96(5) - stxv 32+17, 112(5) + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro Load_next_4zetas - lxv 32+V_Z0, 0(14) - lxv 32+V_Z1, 16(14) - lxv 32+V_Z2, 32(14) - lxv 32+V_Z3, 48(14) + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 10, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 addi 14, 14, 64 .endm +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + # # 
mlk_ntt_ppc(int16_t *r) # @@ -241,28 +318,7 @@ .align 4 MLK_ASM_FN_SYMBOL(ntt_ppc) - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - stxv 32+31, 304(1) + SAVE_REGS # get MLKEM_Q lvx V_NMKQ,0,4 @@ -273,7 +329,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) vxor 3, 3, 3 vspltish 4, 1 - lxv 32+V_QINV, QINV_OFFSET(4) + li 10, QINV_OFFSET + lvx V_QINV, 10, 4 .align 4 # @@ -404,9 +461,19 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) mtctr 15 mr 5, 3 li 7, 8 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + .align 4 ntt_ppc__Len4: Load_next_4zetas + Perm_4zetas Load_L44Coeffs MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL44 @@ -419,10 +486,11 @@ ntt_ppc__Len4: # k += 64 # Update zetas vectors, each vector has 2 zetas - li 15, 4 - mtctr 15 + li 8, 4 + mtctr 8 mr 5, 3 li 7, 4 + .align 4 ntt_ppc__Len2: Load_next_4zetas @@ -433,30 +501,7 @@ ntt_ppc__Len2: bdnz ntt_ppc__Len2 - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - lxv 32+31, 304(1) - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + RESTORE_REGS blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index b7b010aaf1..72c6310f28 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -91,21 +91,35 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) stdu 1, -320(1) mflr 0 - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - - lxv 32+V_NMKQ, NQ_OFFSET(4) - lxv 32+V_QINV, QINV_OFFSET(4) - lxv 32+V1353, C1353_OFFSET(4) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 vxor 3, 3, 3 vspltish 4, 1 @@ -135,17 +149,28 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) MREDUCE_4X 13, 18, 23, 7 Write_8X - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 mtlr 0 addi 1, 1, 320 blr @@ -159,3 +184,9 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) /* simpasm: footer-start */ #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit 
(SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index 603e0d38b0..b7c6235b9a 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -100,10 +100,10 @@ # Conditional addition to get unsigned canonical representative # .macro To_unsigned_16 - lxv 32+12, 0(3) - lxv 32+13, 16(3) - lxv 32+14, 32(3) - lxv 32+15, 48(3) + lxvd2x 32+12, 0, 3 + lxvd2x 32+13, 14, 3 + lxvd2x 32+14, 15, 3 + lxvd2x 32+15, 16, 3 addi 3, 3, 64 vsrh 1, 12, 10 vsrh 0, 13, 10 @@ -121,10 +121,10 @@ xxsel 32+0, 32+8,32+13, 32+0 xxsel 32+3, 32+5,32+14, 32+3 xxsel 32+2, 32+6,32+15, 32+2 - stxv 32+3, -32(3) - stxv 32+2, -16(3) - stxv 32+1, -64(3) - stxv 32+0, -48(3) + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 .endm .align 4 @@ -135,16 +135,23 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) std 14, 96(1) std 15, 104(1) std 16, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 vxor 7, 7, 7 - lxv 32+V_MKQ, Q_OFFSET(4) - lxv 32+V20159, C20159_OFFSET(4) + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 vspltisw V_26, 13 vadduwm V_26, V_26, V_26 @@ -202,11 +209,16 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) ld 14, 96(1) ld 15, 104(1) ld 16, 112(1) - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 mtlr 0 addi 1, 1, 224 blr @@ -221,3 +233,10 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) /* simpasm: footer-start */ #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ 
!MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 65df15b996..07663c4950 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -35,6 +35,81 @@ #define V_ZETA 10 #define V1441 10 +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + .macro Compute_4Coeffs vsubuhm 25, 8, 21 # r[j+len] - t vsubuhm 26, 12, 22 # r[j+len] - t @@ -59,19 +134,11 @@ lxvd2x 32+12, 3, 17 # r[j+len] lxvd2x 32+16, 3, 19 # r[j+len] lxvd2x 32+20, 3, 21 # r[j+len] - xxpermdi 32+8, 32+8, 32+8, 2 - xxpermdi 32+12, 32+12, 32+12, 2 - xxpermdi 32+16, 32+16, 32+16, 2 - 
xxpermdi 32+20, 32+20, 32+20, 2 lxvd2x 32+21, 3, 9 lxvd2x 32+22, 3, 16 lxvd2x 32+23, 3, 18 lxvd2x 32+24, 3, 20 - xxpermdi 32+21, 32+21, 32+21, 2 - xxpermdi 32+22, 32+22, 32+22, 2 - xxpermdi 32+23, 32+23, 32+23, 2 - xxpermdi 32+24, 32+24, 32+24, 2 Compute_4Coeffs .endm @@ -85,20 +152,20 @@ # a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 # .macro Load_L24Coeffs - lxv 32+25, 0(5) # a[j], r[j+len] - lxv 32+26, 16(5) # a[j], r[j+len] + lxvd2x 32+25, 0, 5 # a[j], r[j+len] + lxvd2x 32+26, 10, 5 # a[j], r[j+len] vmrgew 8, 25, 26 vmrgow 21, 25, 26 - lxv 32+25, 32(5) # a[j], r[j+len] - lxv 32+26, 48(5) # a[j], r[j+len] + lxvd2x 32+25, 11, 5 # a[j], r[j+len] + lxvd2x 32+26, 12, 5 # a[j], r[j+len] vmrgew 12, 25, 26 vmrgow 22, 25, 26 - lxv 32+25, 64(5) # a[j], r[j+len] - lxv 32+26, 80(5) # a[j], r[j+len] + lxvd2x 32+25, 15, 5 # a[j], r[j+len] + lxvd2x 32+26, 16, 5 # a[j], r[j+len] vmrgew 16, 25, 26 vmrgow 23, 25, 26 - lxv 32+25, 96(5) # a[j], r[j+len] - lxv 32+26, 112(5) # a[j], r[j+len] + lxvd2x 32+25, 17, 5 # a[j], r[j+len] + lxvd2x 32+26, 18, 5 # a[j], r[j+len] vmrgew 20, 25, 26 vmrgow 24, 25, 26 .endm @@ -113,24 +180,24 @@ # rj0 - rj4, rj8 - rj11 # .macro Load_L44Coeffs - lxv 10, 0(5) # rj0, rj1, rj2, rj3, - # rjlen4, rjlen5, rjlen6, rjlen7 - lxv 11, 16(5) # rj8, rj9, rj10, rj11 - # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+8, 11, 10, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+21, 11, 10, 3 # rj0 - rj4, rj8 - rj11 - lxv 10, 32(5) - lxv 11, 48(5) - xxpermdi 32+12, 11, 10, 0 - xxpermdi 32+22, 11, 10, 3 - lxv 10, 64(5) - lxv 11, 80(5) - xxpermdi 32+16, 11, 10, 0 - xxpermdi 32+23, 11, 10, 3 - lxv 10, 96(5) - lxv 11, 112(5) - xxpermdi 32+20, 11, 10, 0 - xxpermdi 32+24, 11, 10, 3 + lxvd2x 10, 0, 5 # rj0, rj1, rj2, rj3, + # rjlen4, rjlen5, rjlen6, rjlen7 + lxvd2x 11, 10, 5 # rj8, rj9, rj10, rj11 + # rjlen12, rjlen13, rjlen14, rjlen15 + xxpermdi 32+8, 11, 10, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+21, 11, 10, 0 # rj0 - 
rj4, rj8 - rj11 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxpermdi 32+12, 11, 10, 3 + xxpermdi 32+22, 11, 10, 0 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxpermdi 32+16, 11, 10, 3 + xxpermdi 32+23, 11, 10, 0 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxpermdi 32+20, 11, 10, 3 + xxpermdi 32+24, 11, 10, 0 .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 @@ -230,44 +297,56 @@ .endm .macro Load_next_4zetas - lxv 32+V_Z0, 0(14) - lxv 32+V_Z1, 16(14) - lxv 32+V_Z2, 32(14) - lxv 32+V_Z3, 48(14) - addi 14, 14, 64 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 8, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 +.endm + +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvx \_vs0, 3, 9 - stxvx \_vs1, 3, 16 - stxvx \_vs2, 3, 18 - stxvx \_vs3, 3, 20 + stxvd2x \_vs0, 3, 9 + stxvd2x \_vs1, 3, 16 + stxvd2x \_vs2, 3, 18 + stxvd2x \_vs3, 3, 20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvx \_vs0, 3, 10 - stxvx \_vs1, 3, 17 - stxvx \_vs2, 3, 19 - stxvx \_vs3, 3, 21 + stxvd2x \_vs0, 3, 10 + stxvd2x \_vs1, 3, 17 + stxvd2x \_vs2, 3, 19 + stxvd2x \_vs3, 3, 21 .endm .macro Reload_4coeffs - lxv 32+25, 0(3) - lxv 32+26, 16(3) - lxv 32+30, 32(3) - lxv 32+31, 48(3) + lxvd2x 32+25, 0, 3 + lxvd2x 32+26, 10, 3 + lxvd2x 32+30, 11, 3 + lxvd2x 32+31, 12, 3 addi 3, 3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - stxv \_vs0, -128(3) - stxv \_vs1, -112(3) - stxv \_vs2, -96(3) - stxv \_vs3, -80(3) - stxv \_vs4, -64(3) - stxv \_vs5, -48(3) - stxv \_vs6, -32(3) - stxv \_vs7, -16(3) + addi 3, 3, -128 + stxvd2x \_vs0, 0, 3 + stxvd2x \_vs1, 10, 3 + stxvd2x \_vs2, 11, 3 + stxvd2x \_vs3, 12, 3 + stxvd2x \_vs4, 15, 3 + stxvd2x \_vs5, 16, 3 + stxvd2x \_vs6, 17, 3 + stxvd2x \_vs7, 18, 3 + addi 3, 3, 128 .endm .macro PermWriteL44 @@ -275,22 +354,22 @@ xxlor 32+19, 11, 11 xxlor 
32+24, 12, 12 xxlor 32+29, 13, 13 - xxpermdi 32+10, 32+13, 32+14, 3 - xxpermdi 32+11, 32+13, 32+14, 0 - xxpermdi 32+12, 32+18, 32+19, 3 - xxpermdi 32+13, 32+18, 32+19, 0 - xxpermdi 32+14, 32+23, 32+24, 3 - xxpermdi 32+15, 32+23, 32+24, 0 - xxpermdi 32+16, 32+28, 32+29, 3 - xxpermdi 32+17, 32+28, 32+29, 0 - stxv 32+10, 0(5) - stxv 32+11, 16(5) - stxv 32+12, 32(5) - stxv 32+13, 48(5) - stxv 32+14, 64(5) - stxv 32+15, 80(5) - stxv 32+16, 96(5) - stxv 32+17, 112(5) + xxpermdi 32+10, 32+14, 32+13, 3 + xxpermdi 32+11, 32+14, 32+13, 0 + xxpermdi 32+12, 32+19, 32+18, 3 + xxpermdi 32+13, 32+19, 32+18, 0 + xxpermdi 32+14, 32+24, 32+23, 3 + xxpermdi 32+15, 32+24, 32+23, 0 + xxpermdi 32+16, 32+29, 32+28, 3 + xxpermdi 32+17, 32+29, 32+28, 0 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro PermWriteL24 @@ -306,14 +385,14 @@ vmrgow 15, 23, 24 vmrgew 16, 28, 29 vmrgow 17, 28, 29 - stxv 32+10, 0(5) - stxv 32+11, 16(5) - stxv 32+12, 32(5) - stxv 32+13, 48(5) - stxv 32+14, 64(5) - stxv 32+15, 80(5) - stxv 32+16, 96(5) - stxv 32+17, 112(5) + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro INTT_REDUCE_L24 @@ -340,6 +419,7 @@ xxlor 13, 32+17, 32+17 Set_mont_consts Load_next_4zetas + Perm_4zetas MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 PermWriteL44 .endm @@ -367,34 +447,14 @@ .align 4 MLK_ASM_FN_SYMBOL(intt_ppc) - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 
288(1) - stxv 32+31, 304(1) + SAVE_REGS # init vectors and constants # Setup for Montgomery reduce - lxv 0, 0(4) + lxvx 0, 0, 4 - lxv 32+V_QINV, QINV_OFFSET(4) # QINV + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, 4 # QINV xxlxor 32+3, 32+3, 32+3 vspltish 4, 1 xxlor 2, 32+2, 32+2 # QINV @@ -402,9 +462,10 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 4, 32+4, 32+4 # 1 # Setup for Barrett reduce - lxv 6, Q_OFFSET(4) # V_MKQ - lxv 32+V20159, C20159_OFFSET(4) # V20159 - lxv 7, 0(4) # V_25 + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, 4 # V_MKQ + lxvx 32+V20159, 11, 4 # V20159 vspltisw 8, 13 vadduwm 8, 8, 8 @@ -415,13 +476,21 @@ MLK_ASM_FN_SYMBOL(intt_ppc) vslw 9, 9, 10 xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + # # Montgomery reduce loops with constant 1441 # addi 14, 4, C1441_OFFSET lvx V1441, 0, 14 - li 15, 4 # loops - mtctr 15 + li 8, 4 # loops + mtctr 8 Set_mont_consts intt_ppc__Loopf: @@ -440,8 +509,8 @@ intt_ppc__Loopf: # Update zetas vectors, each vector has 2 zetas addi 14, 4, ZETA_INTT_OFFSET li 7, 4 - li 15, 4 - mtctr 15 + li 8, 4 + mtctr 8 mr 5, 3 intt_ppc__Loop2: INTT_REDUCE_L24 @@ -453,8 +522,8 @@ intt_ppc__Loop2: # 2. 
len = 4, start = 0, 8, 16, 24,...232, 240, 248 mr 5, 3 li 7, 8 - li 15, 4 # loops - mtctr 15 + li 8, 4 # loops + mtctr 8 intt_ppc__Loop4: INTT_REDUCE_L44 addi 5, 5, 128 @@ -546,7 +615,6 @@ intt_ppc__Loop8: Write_B4C 32+4, 32+9, 32+13, 32+17 Set_mont_consts lvx V_ZETA, 0, 14 - addi 14, 14, 16 MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 64 @@ -555,7 +623,8 @@ intt_ppc__Loop8: BREDUCE_4X 4, 9, 13, 17 Write_B4C 32+4, 32+9, 32+13, 32+17 Set_mont_consts - lxv 32+10, -16(14) + lvx V_ZETA, 0, 14 + addi 14, 14, 16 MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 256 @@ -565,7 +634,6 @@ intt_ppc__Loop8: Write_B4C 32+4, 32+9, 32+13, 32+17 Set_mont_consts lvx V_ZETA, 0, 14 - addi 14, 14, 16 MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 li 5, 320 @@ -574,7 +642,8 @@ intt_ppc__Loop8: BREDUCE_4X 4, 9, 13, 17 Write_B4C 32+4, 32+9, 32+13, 32+17 Set_mont_consts - lxv 32+10, -16(14) + lvx V_ZETA, 0, 14 + addi 14, 14, 16 MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 @@ -620,30 +689,7 @@ intt_ppc__Loop8: MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 Write_M4C 32+13, 32+18, 32+23, 32+28 - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - lxv 32+31, 304(1) - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + RESTORE_REGS blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 70e7bf7104..dbe7c82fa5 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -27,6 +27,81 @@ .machine "any" .text +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + .macro Load_4Coeffs start next step mr 9, \start add 10, 7, 9 # J + len*2 @@ -40,10 +115,6 @@ lxvd2x 32+18, 3, 17 # r[j+len] lxvd2x 32+23, 3, 19 # r[j+len] lxvd2x 32+28, 3, 21 # r[j+len] - xxpermdi 32+13, 32+13, 32+13, 2 - xxpermdi 32+18, 32+18, 32+18, 2 - xxpermdi 32+23, 32+23, 32+23, 2 - xxpermdi 32+28, 32+28, 32+28, 2 .endm # @@ -55,20 +126,20 @@ # a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 # .macro Load_L24Coeffs - lxv 32+25, 0(5) # a[j], r[j+len] - lxv 32+26, 16(5) # a[j], r[j+len] + lxvd2x 32+25, 0, 5 # a[j], r[j+len] + lxvd2x 
32+26, 10, 5 # a[j], r[j+len] vmrgew 13, 25, 26 vmrgow 12, 25, 26 - lxv 32+25, 32(5) # a[j], r[j+len] - lxv 32+26, 48(5) # a[j], r[j+len] + lxvd2x 32+25, 11, 5 # a[j], r[j+len] + lxvd2x 32+26, 12, 5 # a[j], r[j+len] vmrgew 18, 25, 26 vmrgow 17, 25, 26 - lxv 32+25, 64(5) # a[j], r[j+len] - lxv 32+26, 80(5) # a[j], r[j+len] + lxvd2x 32+25, 15, 5 # a[j], r[j+len] + lxvd2x 32+26, 16, 5 # a[j], r[j+len] vmrgew 23, 25, 26 vmrgow 22, 25, 26 - lxv 32+25, 96(5) # a[j], r[j+len] - lxv 32+26, 112(5) # a[j], r[j+len] + lxvd2x 32+25, 17, 5 # a[j], r[j+len] + lxvd2x 32+26, 18, 5 # a[j], r[j+len] vmrgew 28, 25, 26 vmrgow 27, 25, 26 .endm @@ -83,24 +154,24 @@ # rj0 - rj4, rj8 - rj11 # .macro Load_L44Coeffs - lxv 1, 0(5) # rj0, rj1, rj2, rj3, + lxvd2x 1, 0, 5 # rj0, rj1, rj2, rj3, # rjlen4, rjlen5, rjlen6, rjlen7 - lxv 2, 16(5) # rj8, rj9, rj10, rj11 + lxvd2x 2, 10, 5 # rj8, rj9, rj10, rj11 # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+13, 2, 1, 0 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+12, 2, 1, 3 # rj0 - rj4, rj8 - rj11 - lxv 3, 32(5) - lxv 4, 48(5) - xxpermdi 32+18, 4, 3, 0 - xxpermdi 32+17, 4, 3, 3 - lxv 1, 64(5) - lxv 2, 80(5) - xxpermdi 32+23, 2, 1, 0 - xxpermdi 32+22, 2, 1, 3 - lxv 3, 96(5) - lxv 4, 112(5) - xxpermdi 32+28, 4, 3, 0 - xxpermdi 32+27, 4, 3, 3 + xxpermdi 32+13, 2, 1, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 + xxpermdi 32+12, 2, 1, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxpermdi 32+18, 4, 3, 3 + xxpermdi 32+17, 4, 3, 0 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxpermdi 32+23, 2, 1, 3 + xxpermdi 32+22, 2, 1, 0 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxpermdi 32+28, 4, 3, 3 + xxpermdi 32+27, 4, 3, 0 .endm # @@ -147,10 +218,6 @@ lxvd2x 32+17, 3, 16 # r[j] lxvd2x 32+22, 3, 18 # r[j] lxvd2x 32+27, 3, 20 # r[j] - xxpermdi 32+12, 32+12, 32+12, 2 - xxpermdi 32+17, 32+17, 32+17, 2 - xxpermdi 32+22, 32+22, 32+22, 2 - xxpermdi 32+27, 32+27, 32+27, 2 .endm .macro Compute_4Coeffs @@ -175,34 +242,34 @@ .endm .macro Write_One - stxvx 32+15, 3, 
9 - stxvx 32+16, 3, 10 - stxvx 32+20, 3, 16 - stxvx 32+21, 3, 17 - stxvx 32+25, 3, 18 - stxvx 32+26, 3, 19 - stxvx 32+30, 3, 20 - stxvx 32+31, 3, 21 + stxvd2x 32+15, 3, 9 + stxvd2x 32+16, 3, 10 + stxvd2x 32+20, 3, 16 + stxvd2x 32+21, 3, 17 + stxvd2x 32+25, 3, 18 + stxvd2x 32+26, 3, 19 + stxvd2x 32+30, 3, 20 + stxvd2x 32+31, 3, 21 .endm .macro PermWriteL44 Compute_4Coeffs - xxpermdi 0, 32+16, 32+15, 3 - xxpermdi 1, 32+16, 32+15, 0 - xxpermdi 2, 32+21, 32+20, 3 - xxpermdi 3, 32+21, 32+20, 0 - xxpermdi 4, 32+26, 32+25, 3 - xxpermdi 5, 32+26, 32+25, 0 - xxpermdi 6, 32+31, 32+30, 3 - xxpermdi 7, 32+31, 32+30, 0 - stxv 0, 0(5) - stxv 1, 16(5) - stxv 2, 32(5) - stxv 3, 48(5) - stxv 4, 64(5) - stxv 5, 80(5) - stxv 6, 96(5) - stxv 7, 112(5) + xxpermdi 0, 32+15, 32+16, 3 + xxpermdi 1, 32+15, 32+16, 0 + xxpermdi 2, 32+20, 32+21, 3 + xxpermdi 3, 32+20, 32+21, 0 + xxpermdi 4, 32+25, 32+26, 3 + xxpermdi 5, 32+25, 32+26, 0 + xxpermdi 6, 32+30, 32+31, 3 + xxpermdi 7, 32+30, 32+31, 0 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 .endm .macro PermWriteL24 @@ -215,24 +282,34 @@ vmrgow 15, 26, 25 vmrgew 16, 31, 30 vmrgow 17, 31, 30 - stxv 32+10, 0(5) - stxv 32+11, 16(5) - stxv 32+12, 32(5) - stxv 32+13, 48(5) - stxv 32+14, 64(5) - stxv 32+15, 80(5) - stxv 32+16, 96(5) - stxv 32+17, 112(5) + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro Load_next_4zetas - lxv 32+V_Z0, 0(14) - lxv 32+V_Z1, 16(14) - lxv 32+V_Z2, 32(14) - lxv 32+V_Z3, 48(14) + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 10, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 addi 14, 14, 64 .endm +.macro Perm_4zetas + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + 
xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + # # mlk_ntt_ppc(int16_t *r) # @@ -240,28 +317,7 @@ .align 4 MLK_ASM_FN_SYMBOL(ntt_ppc) - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - stxv 32+31, 304(1) + SAVE_REGS # get MLKEM_Q lvx V_NMKQ,0,4 @@ -272,7 +328,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) vxor 3, 3, 3 vspltish 4, 1 - lxv 32+V_QINV, QINV_OFFSET(4) + li 10, QINV_OFFSET + lvx V_QINV, 10, 4 .align 4 # @@ -403,9 +460,19 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) mtctr 15 mr 5, 3 li 7, 8 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + .align 4 ntt_ppc__Len4: Load_next_4zetas + Perm_4zetas Load_L44Coeffs MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL44 @@ -418,10 +485,11 @@ ntt_ppc__Len4: # k += 64 # Update zetas vectors, each vector has 2 zetas - li 15, 4 - mtctr 15 + li 8, 4 + mtctr 8 mr 5, 3 li 7, 4 + .align 4 ntt_ppc__Len2: Load_next_4zetas @@ -432,30 +500,7 @@ ntt_ppc__Len2: bdnz ntt_ppc__Len2 - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) - lxv 32+31, 304(1) - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + RESTORE_REGS blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index eb770a631c..765ef91763 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -90,21 +90,35 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) stdu 1, -320(1) mflr 0 - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) - stxv 32+25, 208(1) - stxv 32+26, 224(1) - stxv 32+27, 240(1) - stxv 32+28, 256(1) - stxv 32+29, 272(1) - stxv 32+30, 288(1) - - lxv 32+V_NMKQ, NQ_OFFSET(4) - lxv 32+V_QINV, QINV_OFFSET(4) - lxv 32+V1353, C1353_OFFSET(4) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 vxor 3, 3, 3 vspltish 4, 1 @@ -134,17 +148,28 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) MREDUCE_4X 13, 18, 23, 7 Write_8X - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) - lxv 32+25, 208(1) - lxv 32+26, 224(1) - lxv 32+27, 240(1) - lxv 32+28, 256(1) - lxv 32+29, 272(1) - lxv 32+30, 288(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 mtlr 0 addi 1, 1, 320 blr diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index f9681c4568..40c7a4cef5 100644 --- 
a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -99,10 +99,10 @@ # Conditional addition to get unsigned canonical representative # .macro To_unsigned_16 - lxv 32+12, 0(3) - lxv 32+13, 16(3) - lxv 32+14, 32(3) - lxv 32+15, 48(3) + lxvd2x 32+12, 0, 3 + lxvd2x 32+13, 14, 3 + lxvd2x 32+14, 15, 3 + lxvd2x 32+15, 16, 3 addi 3, 3, 64 vsrh 1, 12, 10 vsrh 0, 13, 10 @@ -120,10 +120,10 @@ xxsel 32+0, 32+8,32+13, 32+0 xxsel 32+3, 32+5,32+14, 32+3 xxsel 32+2, 32+6,32+15, 32+2 - stxv 32+3, -32(3) - stxv 32+2, -16(3) - stxv 32+1, -64(3) - stxv 32+0, -48(3) + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 .endm .align 4 @@ -134,16 +134,23 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) std 14, 96(1) std 15, 104(1) std 16, 112(1) - stxv 32+20, 128(1) - stxv 32+21, 144(1) - stxv 32+22, 160(1) - stxv 32+23, 176(1) - stxv 32+24, 192(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 vxor 7, 7, 7 - lxv 32+V_MKQ, Q_OFFSET(4) - lxv 32+V20159, C20159_OFFSET(4) + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 vspltisw V_26, 13 vadduwm V_26, V_26, V_26 @@ -201,11 +208,16 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) ld 14, 96(1) ld 15, 104(1) ld 16, 112(1) - lxv 32+20, 128(1) - lxv 32+21, 144(1) - lxv 32+22, 160(1) - lxv 32+23, 176(1) - lxv 32+24, 192(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 mtlr 0 addi 1, 1, 224 blr From 39cb1e2cb9d982715a737a2830b310ba1615d0be Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Wed, 29 Oct 2025 15:17:51 -0400 Subject: [PATCH 18/22] This commit fixed comments and added more comments and others, 1. De-tabified. 2. Merged next and step used in macro. 3. Used immediate values for offsets used in macro. 4. 
More comments explaining the operation and contents. 5. Changed the comment style. In this commit, numeric register identifiers have not been fixed yet. will do that next. Signed-off-by: Danny Tsen --- dev/ppc64le/src/intt_ppc.S | 1062 +++++++++++--------- dev/ppc64le/src/ntt_ppc.S | 820 ++++++++------- dev/ppc64le/src/poly_tomont.S | 296 +++--- dev/ppc64le/src/reduce.S | 370 +++---- mlkem/src/native/ppc64le/src/intt_ppc.S | 1062 +++++++++++--------- mlkem/src/native/ppc64le/src/ntt_ppc.S | 820 ++++++++------- mlkem/src/native/ppc64le/src/poly_tomont.S | 296 +++--- mlkem/src/native/ppc64le/src/reduce.S | 370 +++---- 8 files changed, 2672 insertions(+), 2424 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 4fc49edcd6..85ba00482b 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -3,12 +3,12 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -20,13 +20,13 @@ .machine "any" .text -# Barrett reduce constatnts +/* Barrett reduce constatnts */ #define V20159 0 #define V_25 1 #define V_26 2 #define V_MKQ 3 -# Montgomery reduce constatnts +/* Montgomery reduce constatnts */ #define V_QINV 2 #define V_NMKQ 5 #define V_Z0 7 @@ -37,156 +37,221 @@ #define V1441 10 .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - 
lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 .endm +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V8, V12, V16, V20 + * final r[j]: V21, V22, V23, V24 + */ .macro Compute_4Coeffs - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. 
+ r[j+len] = r[j] - t + */ + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 .endm -.macro Load_4Coeffs start next step - mr 9, \start # j - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+8, 3, 10 # r[j+len] - lxvd2x 32+12, 3, 17 # r[j+len] - lxvd2x 32+16, 3, 19 # r[j+len] - lxvd2x 32+20, 3, 21 # r[j+len] - - lxvd2x 32+21, 3, 9 - lxvd2x 32+22, 3, 16 - lxvd2x 32+23, 3, 18 - lxvd2x 32+24, 3, 20 - - Compute_4Coeffs +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. + * + * register used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li 9, \start /* first offset to j */ + add 10, 7, 9 /* J + len*2 */ + addi 16, 9, \next + addi 17, 10, \next + addi 18, 16, \next + addi 19, 17, \next + addi 20, 18, \next + addi 21, 19, \next .endm -# -# Load Coeffients and setup vectors -# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 -# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 -# -# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 -# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 -# +/* + * Load coefficient vectors for r[j] (r) and r[j+len] (r'): + * Load coefficient in r' vectors from offset, R10, R17, R19 and R21 + * Load coefficient in r vectors from offset, R9, R16, R18 and 
R20 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + */ +.macro Load_4Rjp + lxvd2x 32+8, 3, 10 /* V8: vector r'0 */ + lxvd2x 32+12, 3, 17 /* V12: vector for r'1 */ + lxvd2x 32+16, 3, 19 /* V16: vector for r'2 */ + lxvd2x 32+20, 3, 21 /* V20: vector for r'3 */ + + lxvd2x 32+21, 3, 9 /* V21: vector r0 */ + lxvd2x 32+22, 3, 16 /* V22: vector r1 */ + lxvd2x 32+23, 3, 18 /* V23: vector r2 */ + lxvd2x 32+24, 3, 20 /* V24: vector r3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp + Compute_4Coeffs +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + * + * In order to do the coefficient computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 # a[j], r[j+len] - lxvd2x 32+26, 10, 5 # a[j], r[j+len] + lxvd2x 32+25, 0, 5 + lxvd2x 32+26, 10, 5 vmrgew 8, 25, 26 vmrgow 21, 25, 26 - lxvd2x 32+25, 11, 5 # a[j], r[j+len] - lxvd2x 32+26, 12, 5 # a[j], r[j+len] + lxvd2x 32+25, 11, 5 + lxvd2x 32+26, 12, 5 vmrgew 12, 25, 26 vmrgow 22, 25, 26 - lxvd2x 32+25, 15, 5 # a[j], r[j+len] - lxvd2x 32+26, 16, 5 # a[j], r[j+len] + lxvd2x 32+25, 15, 5 + lxvd2x 32+26, 16, 5 vmrgew 16, 25, 26 vmrgow 23, 25, 26 - lxvd2x 32+25, 17, 5 # a[j], r[j+len] - lxvd2x 32+26, 18, 5 # a[j], r[j+len] + lxvd2x 32+25, 17, 5 + lxvd2x 32+26, 18, 5 vmrgew 20, 25, 26 vmrgow 24, 25, 26 .endm -# -# Permute -# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 -# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 -# -# to -# rjlen4 - rjlen7, rjlen12 - rjlen15 -# rj0 - rj4, rj8 - rj11 -# +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ .macro Load_L44Coeffs - lxvd2x 10, 0, 5 # rj0, rj1, rj2, rj3, - # rjlen4, rjlen5, rjlen6, rjlen7 - lxvd2x 11, 10, 5 # rj8, rj9, rj10, rj11 - # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+8, 11, 10, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+21, 11, 10, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxpermdi 32+8, 11, 10, 3 + xxpermdi 32+21, 11, 10, 0 lxvd2x 10, 11, 5 lxvd2x 11, 12, 5 xxpermdi 32+12, 11, 10, 3 @@ -202,99 +267,107 @@ .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 - vxor 7, 7, 7 - xxlor 32+3, 6, 6 # V_MKQ - xxlor 32+1, 7, 7 # V_25 - xxlor 32+2, 8, 8 # V_26 - # Multify Odd/Even signed halfword; - # Results word bound by 2^32 in abs value. - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - # Right shift and pack lower halfword, - # results bond to 2^16 in abs value - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - # Modulo multify-Low unsigned halfword; - # results bond to 2^16 * q in abs value. 
- vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + /* Restore constant vectors + V_MKQ, V_25 and V_26 */ + vxor 7, 7, 7 + xxlor 32+3, 6, 6 + xxlor 32+1, 7, 7 + xxlor 32+2, 8, 8 + /* Multify Odd/Even signed halfword; + Results word bound by 2^32 in abs value. */ + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + /* Right shift and pack lower halfword, + results bond to 2^16 in abs value */ + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + /* Modulo multify-Low unsigned halfword; + results bond to 2^16 * q in abs value. 
*/ + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm -#----------------------------------- -# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) -# +/* + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) + */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 25, \_vz0, 3 - vmladduhm 20, 26, \_vz1, 3 - vmladduhm 27, 30, \_vz2, 3 - vmladduhm 28, 31, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 25, \_vz0, 3 - vmhraddshs 19, 26, \_vz1, 3 - vmhraddshs 24, 30, \_vz2, 3 - vmhraddshs 29, 31, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah \_vo0, 15, 4 # >> 1 - vsrah \_vo1, 20, 4 # >> 1 - vsrah \_vo2, 25, 4 # >> 1 - vsrah \_vo3, 30, 4 # >> 1 + /* Modular multification bond by 2^16 * q in abs value */ + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + /* Shift right 1 bit */ + vsrah \_vo0, 15, 4 + vsrah \_vo1, 20, 4 + vsrah \_vo2, 25, 4 + vsrah \_vo3, 30, 4 .endm +/* + * setup constant vectors for Montgmery multiplication + * V_NMKQ, V_QINV, Zero vector, One vector + */ 
.macro Set_mont_consts - xxlor 32+5, 0, 0 # V_NMKQ - xxlor 32+2, 2, 2 # V_QINV - xxlor 32+3, 3, 3 # 0 - xxlor 32+4, 4, 4 # 1 + xxlor 32+5, 0, 0 /* V_NMKQ */ + xxlor 32+2, 2, 2 /* V_QINV */ + xxlor 32+3, 3, 3 /* all 0 */ + xxlor 32+4, 4, 4 /* all 1 */ .endm .macro Load_next_4zetas @@ -308,6 +381,10 @@ addi 14, 14, 64 .endm +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ .macro Perm_4zetas xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 @@ -316,53 +393,57 @@ .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 9 - stxvd2x \_vs1, 3, 16 - stxvd2x \_vs2, 3, 18 - stxvd2x \_vs3, 3, 20 + stxvd2x \_vs0, 3, 9 + stxvd2x \_vs1, 3, 16 + stxvd2x \_vs2, 3, 18 + stxvd2x \_vs3, 3, 20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 10 - stxvd2x \_vs1, 3, 17 - stxvd2x \_vs2, 3, 19 - stxvd2x \_vs3, 3, 21 + stxvd2x \_vs0, 3, 10 + stxvd2x \_vs1, 3, 17 + stxvd2x \_vs2, 3, 19 + stxvd2x \_vs3, 3, 21 .endm .macro Reload_4coeffs - lxvd2x 32+25, 0, 3 - lxvd2x 32+26, 10, 3 - lxvd2x 32+30, 11, 3 - lxvd2x 32+31, 12, 3 - addi 3, 3, 64 + lxvd2x 32+25, 0, 3 + lxvd2x 32+26, 10, 3 + lxvd2x 32+30, 11, 3 + lxvd2x 32+31, 12, 3 + addi 3, 3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi 3, 3, -128 - stxvd2x \_vs0, 0, 3 - stxvd2x \_vs1, 10, 3 - stxvd2x \_vs2, 11, 3 - stxvd2x \_vs3, 12, 3 - stxvd2x \_vs4, 15, 3 - stxvd2x \_vs5, 16, 3 - stxvd2x \_vs6, 17, 3 - stxvd2x \_vs7, 18, 3 - addi 3, 3, 128 + addi 3, 3, -128 + stxvd2x \_vs0, 0, 3 + stxvd2x \_vs1, 10, 3 + stxvd2x \_vs2, 11, 3 + stxvd2x \_vs3, 12, 3 + stxvd2x \_vs4, 15, 3 + stxvd2x \_vs5, 16, 3 + stxvd2x \_vs6, 17, 3 + stxvd2x \_vs7, 18, 3 + addi 3, 3, 128 .endm +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. 
+ */ .macro PermWriteL44 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 - xxpermdi 32+10, 32+14, 32+13, 3 - xxpermdi 32+11, 32+14, 32+13, 0 - xxpermdi 32+12, 32+19, 32+18, 3 - xxpermdi 32+13, 32+19, 32+18, 0 - xxpermdi 32+14, 32+24, 32+23, 3 - xxpermdi 32+15, 32+24, 32+23, 0 - xxpermdi 32+16, 32+29, 32+28, 3 - xxpermdi 32+17, 32+29, 32+28, 0 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+14, 32+13, 3 + xxpermdi 32+11, 32+14, 32+13, 0 + xxpermdi 32+12, 32+19, 32+18, 3 + xxpermdi 32+13, 32+19, 32+18, 0 + xxpermdi 32+14, 32+24, 32+23, 3 + xxpermdi 32+15, 32+24, 32+23, 0 + xxpermdi 32+16, 32+29, 32+28, 3 + xxpermdi 32+17, 32+29, 32+28, 0 stxvd2x 32+10, 0, 5 stxvd2x 32+11, 10, 5 stxvd2x 32+12, 11, 5 @@ -373,11 +454,15 @@ stxvd2x 32+17, 18, 5 .endm +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. + */ .macro PermWriteL24 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 vmrgew 10, 13, 14 vmrgow 11, 13, 14 vmrgew 12, 18, 19 @@ -397,85 +482,87 @@ .endm .macro INTT_REDUCE_L24 - Load_L24Coeffs - Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - PermWriteL24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL24 .endm .macro INTT_REDUCE_L44 - Load_L44Coeffs - Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - 
Load_next_4zetas - Perm_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - PermWriteL44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + Perm_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 .endm -.macro INTT_REDUCE_4X start next step - Load_4Coeffs \start, \next, \step - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 +.macro INTT_REDUCE_4X start next + Load_4Coeffs \start, \next + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .endm -# intt -# t = r[j]; -# r[j] = barrett_reduce(t + r[j + len]); -# r[j + len] = r[j + len] - t; -# r[j + len] = fqmul(zeta, r[j + len]); +/* + * main operations for intt + * t = r[j]; + * r[j] = barrett_reduce(t + r[j + len]); + * r[j + len] = r[j + len] - t; + * r[j + len] = fqmul(zeta, r[j + len]); + */ -# -# mlk_intt_ppc(r) -# +/* + * mlk_intt_ppc(r) + */ .global MLK_ASM_NAMESPACE(intt_ppc) .align 4 MLK_ASM_FN_SYMBOL(intt_ppc) - SAVE_REGS + SAVE_REGS - # init vectors and constants - # Setup for Montgomery reduce - lxvx 0, 0, 4 + /* init vectors and constants + Setup for Montgomery reduce */ + lxvx 0, 0, 4 - li 10, QINV_OFFSET - lxvx 32+V_QINV, 10, 4 # QINV - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 # QINV - xxlor 3, 32+3, 32+3 # 0 - xxlor 4, 32+4, 32+4 # 1 + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, 4 + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 /* QINV */ + xxlor 3, 32+3, 32+3 /* 0 vector */ + xxlor 4, 32+4, 32+4 /* 1 vector */ - # Setup for Barrett reduce - li 10, Q_OFFSET - li 11, C20159_OFFSET - lxvx 6, 10, 4 # V_MKQ - lxvx 
32+V20159, 11, 4 # V20159 + /* Setup for Barrett reduce */ + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, 4 /* V_MKQ */ + lxvx 32+V20159, 11, 4 /* V20159 */ - vspltisw 8, 13 - vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 # V_26 store at vs8 + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 /* V_26 store at vs8 */ - vspltisw 9, 1 - vsubuwm 10, 8, 9 # 25 - vslw 9, 9, 10 - xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + vspltisw 9, 1 + vsubuwm 10, 8, 9 /* value 25 */ + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 /* V_25 syore at vs7 */ li 10, 16 li 11, 32 @@ -485,12 +572,12 @@ MLK_ASM_FN_SYMBOL(intt_ppc) li 17, 96 li 18, 112 - # - # Montgomery reduce loops with constant 1441 - # + /* + * Montgomery reduce loops with constant 1441 + */ addi 14, 4, C1441_OFFSET lvx V1441, 0, 14 - li 8, 4 # loops + li 8, 4 mtctr 8 Set_mont_consts @@ -505,193 +592,184 @@ intt_ppc__Loopf: addi 3, 3, -512 .align 4 - # - # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_INTT_OFFSET - li 7, 4 - li 8, 4 - mtctr 8 - mr 5, 3 + /* + * 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * Update zetas vectors, each vector has 2 zetas + * Load zeta array in 2-2-2-2 layout + */ + addi 14, 4, ZETA_INTT_OFFSET + li 7, 4 /* len * 2 */ + li 8, 4 + mtctr 8 + mr 5, 3 intt_ppc__Loop2: - INTT_REDUCE_L24 - addi 5, 5, 128 - bdnz intt_ppc__Loop2 + INTT_REDUCE_L24 + addi 5, 5, 128 + bdnz intt_ppc__Loop2 .align 4 - # - # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - mr 5, 3 - li 7, 8 - li 8, 4 # loops - mtctr 8 + /* + * 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * Load zeta array in 4-4 layout + */ + mr 5, 3 + li 7, 8 + li 8, 4 + mtctr 8 intt_ppc__Loop4: - INTT_REDUCE_L44 - addi 5, 5, 128 - bdnz intt_ppc__Loop4 + INTT_REDUCE_L44 + addi 5, 5, 128 + bdnz intt_ppc__Loop4 .align 4 - # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - li 7, 16 - li 5, 0 - li 15, 4 # loops - mtctr 15 + /* + * 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + */ + li 7, 16 -intt_ppc__Loop8: - INTT_REDUCE_4X 5, 32, 32 - addi 5, 5, 128 - bdnz intt_ppc__Loop8 + INTT_REDUCE_4X 0, 32 + INTT_REDUCE_4X 128, 32 + INTT_REDUCE_4X 256, 32 + INTT_REDUCE_4X 384, 32 .align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - li 5, 0 - li 7, 32 + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + */ + li 7, 32 - INTT_REDUCE_4X 5, 64, 64 + INTT_REDUCE_4X 0, 64 - li 5, 16 - addi 14, 14, -64 - INTT_REDUCE_4X 5, 64, 64 + addi 14, 14, -64 + INTT_REDUCE_4X 16, 64 - li 5, 256 - INTT_REDUCE_4X 5, 64, 64 + INTT_REDUCE_4X 256, 64 - li 5, 272 - addi 14, 14, -64 - INTT_REDUCE_4X 5, 64, 64 + addi 14, 14, -64 + INTT_REDUCE_4X 272, 64 .align 4 - # - # 5. len = 32, start = 0, 64, 128, 192 - li 5, 0 - li 7, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + /* + * 5. len = 32, start = 0, 64, 128, 192 + */ + li 7, 64 + + Load_4Coeffs 0, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 128, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 256, 16 + BREDUCE_4X 4, 9, 13, 17 + 
Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 384, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # - # 6. len = 64, start = 0, 128 - li 5, 0 - li 7, 128 - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 320 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + /* + * 6. 
len = 64, start = 0, 128 + */ + li 7, 128 + Load_4Coeffs 0, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 64, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 256, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 320, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # 7. len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - xxlor 9, 32+10, 32+10 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 192 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 
32+13, 32+18, 32+23, 32+28 - - RESTORE_REGS - blr + /* + * 7. len = 128, start = 0 + */ + li 7, 256 /* len*2 */ + + Load_4Coeffs 0, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 64, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 128, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 192, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + RESTORE_REGS + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index c8dba7b27e..d702973832 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -3,12 +3,12 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -19,148 +19,199 @@ #define V_QINV 2 #define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 #define V_ZETA 10 .machine "any" .text .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - 
ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. + * + * registers used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li 9, \start /* first offset to j */ + add 10, 7, 9 /* J + len*2 */ + addi 16, 9, \next + addi 17, 10, \next + addi 18, 16, \next + addi 19, 17, \next + addi 20, 18, \next + addi 21, 19, \next +.endm + +/* + * Load coefficient in r[j+len] (r') vectors from offset, R10, R17, R19 and R21 + * r[j+len]: V13, V18, V23, V28 + */ +.macro Load_4Rjp + lxvd2x 32+13, 3, 10 /* V13: vector r'0 */ + lxvd2x 32+18, 3, 17 /* V18: vector for r'1 */ + lxvd2x 32+23, 3, 19 /* V23: vector for r'2 */ + lxvd2x 32+28, 3, 21 /* V28: vector for r'3 */ .endm -.macro Load_4Coeffs start next step - mr 9, \start - add 10, 7, 9 # J + len*2 - addi 16, 9, 
\next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+13, 3, 10 # r[j+len] - lxvd2x 32+18, 3, 17 # r[j+len] - lxvd2x 32+23, 3, 19 # r[j+len] - lxvd2x 32+28, 3, 21 # r[j+len] +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp .endm -# -# Load Coeffients and setup vectors -# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 -# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 -# -# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 -# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 -# +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V13, V18, V23, V28 + * r[j]: V12, V17, V22, V27 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 # a[j], r[j+len] - lxvd2x 32+26, 10, 5 # a[j], r[j+len] + lxvd2x 32+25, 0, 5 + lxvd2x 32+26, 10, 5 vmrgew 13, 25, 26 vmrgow 12, 25, 26 - lxvd2x 32+25, 11, 5 # a[j], r[j+len] - lxvd2x 32+26, 12, 5 # a[j], r[j+len] + lxvd2x 32+25, 11, 5 + lxvd2x 32+26, 12, 5 vmrgew 18, 25, 26 vmrgow 17, 25, 26 - lxvd2x 32+25, 15, 5 # a[j], r[j+len] - lxvd2x 32+26, 16, 5 # a[j], r[j+len] + lxvd2x 32+25, 15, 5 + lxvd2x 32+26, 16, 5 vmrgew 23, 25, 26 vmrgow 22, 25, 26 - lxvd2x 32+25, 17, 5 # a[j], r[j+len] - lxvd2x 32+26, 18, 5 # a[j], r[j+len] + lxvd2x 32+25, 17, 5 + lxvd2x 32+26, 18, 5 vmrgew 28, 25, 26 vmrgow 27, 25, 26 .endm -# -# Permute -# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 -# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 -# -# to -# rjlen4 - rjlen7, rjlen12 - rjlen15 -# rj0 - rj4, rj8 - rj11 -# +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ .macro Load_L44Coeffs - lxvd2x 1, 0, 5 # rj0, rj1, rj2, rj3, - # rjlen4, rjlen5, rjlen6, rjlen7 - lxvd2x 2, 10, 5 # rj8, rj9, rj10, rj11 - # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+13, 2, 1, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+12, 2, 1, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxpermdi 32+13, 2, 1, 3 + xxpermdi 32+12, 2, 1, 0 lxvd2x 3, 11, 5 lxvd2x 4, 12, 5 xxpermdi 32+18, 4, 3, 3 @@ -175,104 +226,118 @@ xxpermdi 32+27, 4, 3, 0 .endm -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) -# +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) + */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 - # fqmul = zeta * coefficient - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 13, \_vz0, 3 - vmladduhm 20, 18, \_vz1, 3 - vmladduhm 25, 23, \_vz2, 3 - vmladduhm 30, 28, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 13, \_vz0, 3 - vmhraddshs 19, 18, \_vz1, 3 - vmhraddshs 24, 23, \_vz2, 3 - vmhraddshs 29, 28, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah 13, 15, 4 # >> 1 - vsrah 18, 20, 4 # >> 1 - vsrah 23, 25, 4 # >> 1 - vsrah 28, 30, 4 # >> 1 - + /* fqmul = zeta * coefficient + Modular multification bond by 2^16 * q in abs value */ + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + 
vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + /* Shift right 1 bit */ + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 .endm -.macro Load_4Aj - lxvd2x 32+12, 3, 9 # r[j] - lxvd2x 32+17, 3, 16 # r[j] - lxvd2x 32+22, 3, 18 # r[j] - lxvd2x 32+27, 3, 20 # r[j] +/* + * Load 4 r[j] (r) coefficient vectors: + * Load coefficient in vectors from offset, R9, R16, R18 and R20 + * r[j]: V12, V17, V22, V27 + */ +.macro Load_4Rj + lxvd2x 32+12, 3, 9 /* V12: vector r0 */ + lxvd2x 32+17, 3, 16 /* V17: vector r1 */ + lxvd2x 32+22, 3, 18 /* V22: vector r2 */ + lxvd2x 32+27, 3, 20 /* V27: vector r3 */ .endm +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V16, V21, V26, V31 + * final r[j]: V15, V20, V25, V30 + */ .macro Compute_4Coeffs - # Since the result of the Montgomery multiplication is bounded - # by q in absolute value. - # Finally to complete the final update of the results with add/sub - vsubuhm 16, 12, 13 # r - t - vadduhm 15, 13, 12 # r + t - vsubuhm 21, 17, 18 # r - t - vadduhm 20, 18, 17 # r + t - vsubuhm 26, 22, 23 # r - t - vadduhm 25, 23, 22 # r + t - vsubuhm 31, 27, 28 # r - t - vadduhm 30, 28, 27 # r + t -.endm - -.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 - Load_4Coeffs \start, \next, \step - MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 - Load_4Aj - Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. 
+ r[j+len] = r[j] - t + */ + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 .endm .macro Write_One - stxvd2x 32+15, 3, 9 - stxvd2x 32+16, 3, 10 - stxvd2x 32+20, 3, 16 - stxvd2x 32+21, 3, 17 - stxvd2x 32+25, 3, 18 - stxvd2x 32+26, 3, 19 - stxvd2x 32+30, 3, 20 - stxvd2x 32+31, 3, 21 + stxvd2x 32+15, 3, 9 + stxvd2x 32+16, 3, 10 + stxvd2x 32+20, 3, 16 + stxvd2x 32+21, 3, 17 + stxvd2x 32+25, 3, 18 + stxvd2x 32+26, 3, 19 + stxvd2x 32+30, 3, 20 + stxvd2x 32+31, 3, 21 .endm +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ .macro PermWriteL44 - Compute_4Coeffs - xxpermdi 0, 32+15, 32+16, 3 - xxpermdi 1, 32+15, 32+16, 0 - xxpermdi 2, 32+20, 32+21, 3 - xxpermdi 3, 32+20, 32+21, 0 - xxpermdi 4, 32+25, 32+26, 3 - xxpermdi 5, 32+25, 32+26, 0 - xxpermdi 6, 32+30, 32+31, 3 - xxpermdi 7, 32+30, 32+31, 0 - stxvd2x 0, 0, 5 - stxvd2x 1, 10, 5 - stxvd2x 2, 11, 5 - stxvd2x 3, 12, 5 - stxvd2x 4, 15, 5 - stxvd2x 5, 16, 5 - stxvd2x 6, 17, 5 - stxvd2x 7, 18, 5 + Compute_4Coeffs + xxpermdi 0, 32+15, 32+16, 3 + xxpermdi 1, 32+15, 32+16, 0 + xxpermdi 2, 32+20, 32+21, 3 + xxpermdi 3, 32+20, 32+21, 0 + xxpermdi 4, 32+25, 32+26, 3 + xxpermdi 5, 32+25, 32+26, 0 + xxpermdi 6, 32+30, 32+31, 3 + xxpermdi 7, 32+30, 32+31, 0 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 .endm +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
+ */ .macro PermWriteL24 Compute_4Coeffs vmrgew 10, 16, 15 @@ -283,226 +348,205 @@ vmrgow 15, 26, 25 vmrgew 16, 31, 30 vmrgow 17, 31, 30 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro Load_next_4zetas - li 10, 16 - li 11, 32 - li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 10, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 10, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 .endm +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ .macro Perm_4zetas - xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 - xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 - xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 - xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro NTT_MREDUCE_4X start next _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Rj + Compute_4Coeffs + Write_One .endm -# -# mlk_ntt_ppc(int16_t *r) -# +/* + * mlk_ntt_ppc(int16_t *r) + */ .global MLK_ASM_NAMESPACE(ntt_ppc) .align 4 MLK_ASM_FN_SYMBOL(ntt_ppc) - SAVE_REGS + SAVE_REGS - # get MLKEM_Q - lvx V_NMKQ,0,4 + /* load MLKEM_Q */ + lvx V_NMKQ,0,4 - # zetas array - addi 14, 4, ZETA_NTT_OFFSET + /* Register 14 as pointer to zetas array */ + addi 14, 4, ZETA_NTT_OFFSET - vxor 3, 3, 3 - vspltish 4, 1 + vxor 3, 3, 3 + vspltish 4, 1 - li 10, QINV_OFFSET - lvx V_QINV, 10, 4 + li 10, QINV_OFFSET + lvx V_QINV, 10, 4 .align 4 - # - # Compute coefficients of the NTT based on the 
following loop. - # for (len = 128; len ≥ 2; len = len/2) - # - # 1. len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 192 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + /* + * Compute coefficients of the NTT based on the following loop. + * for (len = 128; len ≥ 2; len = len/2) + * + * 1. len = 128, start = 0 + */ + li 7, 256 /* len * 2 */ + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 - # - # 2. len = 64, start = 0, 128 - # k += 2 - li 5, 0 - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 320 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + /* + * 2. len = 64, start = 0, 128 + * k += 2 + */ + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 - # - # 3. 
len = 32, start = 0, 64, 128, 192 - # k += 4 - li 5, 0 - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 384 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + /* + * 3. len = 32, start = 0, 64, 128, 192 + * k += 4 + */ + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - # k += 8 - li 5, 0 - li 7, 32 - Load_next_4zetas - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 16 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - Load_next_4zetas - li 5, 256 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 272 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + * k += 8 + */ + li 7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 .align 4 - # - # 5. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 - # k += 16 - li 5, 0 - li 7, 16 - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 128 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 256 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 384 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - # - # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - # k += 32 - li 15, 4 # loops - mtctr 15 - mr 5, 3 - li 7, 8 - - li 10, 16 - li 11, 32 - li 12, 48 - li 15, 64 - li 16, 80 - li 17, 96 - li 18, 112 + /* + * 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + * k += 16 + */ + li 7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 128, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 384, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + /* + * 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * k += 32 + * Load zeta vectors in 4-4 layout + */ + li 15, 4 + mtctr 15 + mr 5, 3 /* Let r5 points to coefficient array */ + li 7, 8 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 .align 4 ntt_ppc__Len4: - Load_next_4zetas - Perm_4zetas + Load_next_4zetas + Perm_4zetas Load_L44Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL44 - addi 5, 5, 128 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 - bdnz ntt_ppc__Len4 + bdnz ntt_ppc__Len4 - # - # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # k += 64 - # Update zetas vectors, each vector has 2 zetas + /* + * 7. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * k += 64 + * Load zeta vectors in 2-2-2-2 layout + */ - li 8, 4 - mtctr 8 - mr 5, 3 - li 7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 /* Let r5 points to coefficient array */ + li 7, 4 .align 4 ntt_ppc__Len2: - Load_next_4zetas + Load_next_4zetas Load_L24Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL24 - addi 5, 5, 128 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 - bdnz ntt_ppc__Len2 + bdnz ntt_ppc__Len2 - RESTORE_REGS - blr + RESTORE_REGS + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index 72c6310f28..354474d071 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -3,18 +3,19 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# - -# Poly_tomont: Inplace conversion of all coefficients of a polynomial -# from normal domain to Montgomery domain -# -# Arguments:*r: pointer to input/output polynomial -# +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +/* + * Poly_tomont: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments:*r: pointer to input/output polynomial + */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -30,150 +31,151 @@ .machine "any" .text -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(_v0, _v1, _v2, _v3) -# +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_v0, _v1, _v2, _v3) + */ .macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+13, 0, 3 - addi 3, 3, 16 - lxvd2x 32+18, 0, 3 - addi 3, 3, 16 - lxvd2x 32+23, 0, 3 - addi 3, 3, 16 - lxvd2x 32+7, 0, 3 - addi 3, 3, 16 - - vmladduhm 15, 13, V1353, 3 - vmladduhm 20, 18, V1353, 3 - vmladduhm 25, 23, V1353, 3 - vmladduhm 9, 7, V1353, 3 - - vmhraddshs 14, 13, V1353, 3 - vmhraddshs 19, 18, V1353, 3 - vmhraddshs 24, 23, V1353, 3 - vmhraddshs 8, 7, V1353, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 9, 9, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 9, 9, V_NMKQ, 8 - - vsrah \_v0, 15, 4 # >> 1 - vsrah \_v1, 20, 4 # >> 1 - vsrah \_v2, 25, 4 # >> 1 - vsrah \_v3, 9, 4 # >> 1 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, 
V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + /* Shift right 1 bit */ + vsrah \_v0, 15, 4 + vsrah \_v1, 20, 4 + vsrah \_v2, 25, 4 + vsrah \_v3, 9, 4 .endm .macro Write_8X - stxvd2x 32+27, 4, 3 - stxvd2x 32+28, 5, 3 - stxvd2x 32+29, 6, 3 - stxvd2x 32+30, 7, 3 - stxvd2x 32+13, 8, 3 - stxvd2x 32+18, 9, 3 - stxvd2x 32+23, 10, 3 - stxvd2x 32+7, 11, 3 + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu 1, -320(1) - mflr 0 - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - stxvx 32+25, 11, 1 - stxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - stxvx 32+27, 6, 1 - stxvx 32+28, 7, 1 - stxvx 32+29, 8, 1 - stxvx 32+30, 9, 1 - - li 6, NQ_OFFSET - li 7, QINV_OFFSET - li 8, C1353_OFFSET - lxvx 32+V_NMKQ, 6, 4 - lxvx 32+V_QINV, 7, 4 - lxvx 32+V1353, 8, 4 - - vxor 3, 3, 3 - vspltish 4, 1 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - lxvx 32+25, 11, 1 - lxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - lxvx 32+27, 6, 1 - lxvx 32+28, 7, 1 - lxvx 32+29, 8, 1 - lxvx 
32+30, 9, 1 - mtlr 0 - addi 1, 1, 320 - blr + stdu 1, -320(1) + mflr 0 + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 + mtlr 0 + addi 1, 1, 320 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index b7c6235b9a..084ae5959d 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -3,19 +3,19 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. 
-# -#=================================================================================== -# Written by Danny Tsen -# - -# -# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial -# for details of the Barrett reduction -# -# Arguments: *r: pointer to input/output polynomial -# +/* + * Copyright 2025- IBM Corp. + * + * =================================================================================== + * Written by Danny Tsen + */ + +/* + * poly_reduce: Applies Barrett reduction to all coefficients of a polynomial + * for details of the Barrett reduction + * + * Arguments: *r: pointer to input/output polynomial + */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -34,194 +34,194 @@ .text .macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+8, 0, 3 - lxvd2x 32+12, 14, 3 - lxvd2x 32+16, 15, 3 - lxvd2x 32+20, 16, 3 - addi 3, 3, 64 - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 
15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm .macro Write_8X - stxvd2x 32+21, 4, 3 - stxvd2x 32+22, 5, 3 - stxvd2x 32+23, 6, 3 - stxvd2x 32+24, 7, 3 - stxvd2x 32+4, 8, 3 - stxvd2x 32+9, 9, 3 - stxvd2x 32+13, 10, 3 - stxvd2x 32+17, 11, 3 + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 .endm -# -# Conditional addition to get unsigned canonical representative -# +/* + * Conditional addition to get unsigned canonical representative + */ .macro To_unsigned_16 lxvd2x 32+12, 0, 3 lxvd2x 32+13, 14, 3 lxvd2x 32+14, 15, 3 lxvd2x 32+15, 16, 3 - addi 3, 3, 64 - vsrh 1, 12, 10 - vsrh 0, 13, 10 - vsrh 3, 14, 10 - vsrh 2, 15, 10 - vadduhm 7, 12, 11 - vadduhm 8, 13, 11 - vadduhm 5, 14, 11 - vadduhm 6, 15, 11 - vcmpequh 1, 1, 9 - vcmpequh 0, 0, 9 - vcmpequh 3, 3, 9 - vcmpequh 2, 2, 9 - xxsel 32+1, 32+7,32+12, 32+1 - xxsel 
32+0, 32+8,32+13, 32+0 - xxsel 32+3, 32+5,32+14, 32+3 - xxsel 32+2, 32+6,32+15, 32+2 - stxvd2x 32+3, 10, 3 - stxvd2x 32+2, 11, 3 - stxvd2x 32+1, 8, 3 - stxvd2x 32+0, 9, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(reduce_ppc) MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu 1, -224(1) - mflr 0 - std 14, 96(1) - std 15, 104(1) - std 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - - vxor 7, 7, 7 - - li 6, Q_OFFSET - li 7, C20159_OFFSET - lxvx 32+V_MKQ, 6, 4 - lxvx 32+V20159, 7, 4 - - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - vspltisw 4, 1 - vsubuwm 5, V_26, 4 - vslw V_25, 4, 5 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - li 14, 16 - li 15, 32 - li 16, 48 - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - # - # To unsigned canonical - # + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + + vxor 7, 7, 7 + + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 
+ vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + /* + * To unsigned canonical + */ .align 4 - addi 3, 3, -512 - vxor 9, 9, 9 - vspltish 10, 15 - vmr 11, V_MKQ - - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - - ld 14, 96(1) - ld 15, 104(1) - ld 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - mtlr 0 - addi 1, 1, 224 - blr + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + mtlr 0 + addi 1, 1, 224 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 07663c4950..169272c444 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -3,12 +3,12 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. 
-# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. + * + * =================================================================================== + * Written by Danny Tsen + */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -19,13 +19,13 @@ .machine "any" .text -# Barrett reduce constatnts +/* Barrett reduce constatnts */ #define V20159 0 #define V_25 1 #define V_26 2 #define V_MKQ 3 -# Montgomery reduce constatnts +/* Montgomery reduce constatnts */ #define V_QINV 2 #define V_NMKQ 5 #define V_Z0 7 @@ -36,156 +36,221 @@ #define V1441 10 .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 
32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 .endm +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V8, V12, V16, V20 + * final r[j]: V21, V22, V23, V24 + */ .macro Compute_4Coeffs - vsubuhm 25, 8, 21 # r[j+len] - t - vsubuhm 26, 12, 22 # r[j+len] - t - vsubuhm 30, 16, 23 # r[j+len] - t - vsubuhm 31, 20, 24 # r[j+len] - t - vadduhm 8, 8, 21 # r[j+len] + t - vadduhm 12, 12, 22 # r[j+len] + t - vadduhm 16, 16, 23 # r[j+len] + t - vadduhm 20, 20, 24 # r[j+len] + t + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. 
+ r[j+len] = r[j] - t + */ + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 .endm -.macro Load_4Coeffs start next step - mr 9, \start # j - add 10, 7, 9 # J + len*2 - addi 16, 9, \next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+8, 3, 10 # r[j+len] - lxvd2x 32+12, 3, 17 # r[j+len] - lxvd2x 32+16, 3, 19 # r[j+len] - lxvd2x 32+20, 3, 21 # r[j+len] - - lxvd2x 32+21, 3, 9 - lxvd2x 32+22, 3, 16 - lxvd2x 32+23, 3, 18 - lxvd2x 32+24, 3, 20 - - Compute_4Coeffs +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. + * + * register used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li 9, \start /* first offset to j */ + add 10, 7, 9 /* J + len*2 */ + addi 16, 9, \next + addi 17, 10, \next + addi 18, 16, \next + addi 19, 17, \next + addi 20, 18, \next + addi 21, 19, \next .endm -# -# Load Coeffients and setup vectors -# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 -# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 -# -# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 -# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 -# +/* + * Load coefficient vectors for r[j] (r) and r[j+len] (r'): + * Load coefficient in r' vectors from offset, R10, R17, R19 and R21 + * Load coefficient in r vectors from offset, R9, R16, R18 and 
R20 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + */ +.macro Load_4Rjp + lxvd2x 32+8, 3, 10 /* V8: vector r'0 */ + lxvd2x 32+12, 3, 17 /* V12: vector for r'1 */ + lxvd2x 32+16, 3, 19 /* V16: vector for r'2 */ + lxvd2x 32+20, 3, 21 /* V20: vector for r'3 */ + + lxvd2x 32+21, 3, 9 /* V21: vector r0 */ + lxvd2x 32+22, 3, 16 /* V22: vector r1 */ + lxvd2x 32+23, 3, 18 /* V23: vector r2 */ + lxvd2x 32+24, 3, 20 /* V24: vector r3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp + Compute_4Coeffs +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + * + * In order to do the coefficient computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 # a[j], r[j+len] - lxvd2x 32+26, 10, 5 # a[j], r[j+len] + lxvd2x 32+25, 0, 5 + lxvd2x 32+26, 10, 5 vmrgew 8, 25, 26 vmrgow 21, 25, 26 - lxvd2x 32+25, 11, 5 # a[j], r[j+len] - lxvd2x 32+26, 12, 5 # a[j], r[j+len] + lxvd2x 32+25, 11, 5 + lxvd2x 32+26, 12, 5 vmrgew 12, 25, 26 vmrgow 22, 25, 26 - lxvd2x 32+25, 15, 5 # a[j], r[j+len] - lxvd2x 32+26, 16, 5 # a[j], r[j+len] + lxvd2x 32+25, 15, 5 + lxvd2x 32+26, 16, 5 vmrgew 16, 25, 26 vmrgow 23, 25, 26 - lxvd2x 32+25, 17, 5 # a[j], r[j+len] - lxvd2x 32+26, 18, 5 # a[j], r[j+len] + lxvd2x 32+25, 17, 5 + lxvd2x 32+26, 18, 5 vmrgew 20, 25, 26 vmrgow 24, 25, 26 .endm -# -# Permute -# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 -# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 -# -# to -# rjlen4 - rjlen7, rjlen12 - rjlen15 -# rj0 - rj4, rj8 - rj11 -# +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ .macro Load_L44Coeffs - lxvd2x 10, 0, 5 # rj0, rj1, rj2, rj3, - # rjlen4, rjlen5, rjlen6, rjlen7 - lxvd2x 11, 10, 5 # rj8, rj9, rj10, rj11 - # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+8, 11, 10, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+21, 11, 10, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxpermdi 32+8, 11, 10, 3 + xxpermdi 32+21, 11, 10, 0 lxvd2x 10, 11, 5 lxvd2x 11, 12, 5 xxpermdi 32+12, 11, 10, 3 @@ -201,99 +266,107 @@ .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 - vxor 7, 7, 7 - xxlor 32+3, 6, 6 # V_MKQ - xxlor 32+1, 7, 7 # V_25 - xxlor 32+2, 8, 8 # V_26 - # Multify Odd/Even signed halfword; - # Results word bound by 2^32 in abs value. - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - # Right shift and pack lower halfword, - # results bond to 2^16 in abs value - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - # Modulo multify-Low unsigned halfword; - # results bond to 2^16 * q in abs value. 
- vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + /* Restore constant vectors + V_MKQ, V_25 and V_26 */ + vxor 7, 7, 7 + xxlor 32+3, 6, 6 + xxlor 32+1, 7, 7 + xxlor 32+2, 8, 8 + /* Multify Odd/Even signed halfword; + Results word bound by 2^32 in abs value. */ + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + /* Right shift and pack lower halfword, + results bond to 2^16 in abs value */ + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + /* Modulo multify-Low unsigned halfword; + results bond to 2^16 * q in abs value. 
*/ + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm -#----------------------------------- -# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) -# +/* + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) + */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 25, \_vz0, 3 - vmladduhm 20, 26, \_vz1, 3 - vmladduhm 27, 30, \_vz2, 3 - vmladduhm 28, 31, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 25, \_vz0, 3 - vmhraddshs 19, 26, \_vz1, 3 - vmhraddshs 24, 30, \_vz2, 3 - vmhraddshs 29, 31, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah \_vo0, 15, 4 # >> 1 - vsrah \_vo1, 20, 4 # >> 1 - vsrah \_vo2, 25, 4 # >> 1 - vsrah \_vo3, 30, 4 # >> 1 + /* Modular multification bond by 2^16 * q in abs value */ + vmladduhm 15, 25, \_vz0, 3 + vmladduhm 20, 26, \_vz1, 3 + vmladduhm 27, 30, \_vz2, 3 + vmladduhm 28, 31, \_vz3, 3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs 14, 25, \_vz0, 3 + vmhraddshs 19, 26, \_vz1, 3 + vmhraddshs 24, 30, \_vz2, 3 + vmhraddshs 29, 31, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + /* Shift right 1 bit */ + vsrah \_vo0, 15, 4 + vsrah \_vo1, 20, 4 + vsrah \_vo2, 25, 4 + vsrah \_vo3, 30, 4 .endm +/* + * setup constant vectors for Montgmery multiplication + * V_NMKQ, V_QINV, Zero vector, One vector + */ 
.macro Set_mont_consts - xxlor 32+5, 0, 0 # V_NMKQ - xxlor 32+2, 2, 2 # V_QINV - xxlor 32+3, 3, 3 # 0 - xxlor 32+4, 4, 4 # 1 + xxlor 32+5, 0, 0 /* V_NMKQ */ + xxlor 32+2, 2, 2 /* V_QINV */ + xxlor 32+3, 3, 3 /* all 0 */ + xxlor 32+4, 4, 4 /* all 1 */ .endm .macro Load_next_4zetas @@ -307,6 +380,10 @@ addi 14, 14, 64 .endm +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ .macro Perm_4zetas xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 @@ -315,53 +392,57 @@ .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 9 - stxvd2x \_vs1, 3, 16 - stxvd2x \_vs2, 3, 18 - stxvd2x \_vs3, 3, 20 + stxvd2x \_vs0, 3, 9 + stxvd2x \_vs1, 3, 16 + stxvd2x \_vs2, 3, 18 + stxvd2x \_vs3, 3, 20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 10 - stxvd2x \_vs1, 3, 17 - stxvd2x \_vs2, 3, 19 - stxvd2x \_vs3, 3, 21 + stxvd2x \_vs0, 3, 10 + stxvd2x \_vs1, 3, 17 + stxvd2x \_vs2, 3, 19 + stxvd2x \_vs3, 3, 21 .endm .macro Reload_4coeffs - lxvd2x 32+25, 0, 3 - lxvd2x 32+26, 10, 3 - lxvd2x 32+30, 11, 3 - lxvd2x 32+31, 12, 3 - addi 3, 3, 64 + lxvd2x 32+25, 0, 3 + lxvd2x 32+26, 10, 3 + lxvd2x 32+30, 11, 3 + lxvd2x 32+31, 12, 3 + addi 3, 3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi 3, 3, -128 - stxvd2x \_vs0, 0, 3 - stxvd2x \_vs1, 10, 3 - stxvd2x \_vs2, 11, 3 - stxvd2x \_vs3, 12, 3 - stxvd2x \_vs4, 15, 3 - stxvd2x \_vs5, 16, 3 - stxvd2x \_vs6, 17, 3 - stxvd2x \_vs7, 18, 3 - addi 3, 3, 128 + addi 3, 3, -128 + stxvd2x \_vs0, 0, 3 + stxvd2x \_vs1, 10, 3 + stxvd2x \_vs2, 11, 3 + stxvd2x \_vs3, 12, 3 + stxvd2x \_vs4, 15, 3 + stxvd2x \_vs5, 16, 3 + stxvd2x \_vs6, 17, 3 + stxvd2x \_vs7, 18, 3 + addi 3, 3, 128 .endm +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. 
+ */ .macro PermWriteL44 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 - xxpermdi 32+10, 32+14, 32+13, 3 - xxpermdi 32+11, 32+14, 32+13, 0 - xxpermdi 32+12, 32+19, 32+18, 3 - xxpermdi 32+13, 32+19, 32+18, 0 - xxpermdi 32+14, 32+24, 32+23, 3 - xxpermdi 32+15, 32+24, 32+23, 0 - xxpermdi 32+16, 32+29, 32+28, 3 - xxpermdi 32+17, 32+29, 32+28, 0 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+14, 32+13, 3 + xxpermdi 32+11, 32+14, 32+13, 0 + xxpermdi 32+12, 32+19, 32+18, 3 + xxpermdi 32+13, 32+19, 32+18, 0 + xxpermdi 32+14, 32+24, 32+23, 3 + xxpermdi 32+15, 32+24, 32+23, 0 + xxpermdi 32+16, 32+29, 32+28, 3 + xxpermdi 32+17, 32+29, 32+28, 0 stxvd2x 32+10, 0, 5 stxvd2x 32+11, 10, 5 stxvd2x 32+12, 11, 5 @@ -372,11 +453,15 @@ stxvd2x 32+17, 18, 5 .endm +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. + */ .macro PermWriteL24 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 vmrgew 10, 13, 14 vmrgow 11, 13, 14 vmrgew 12, 18, 19 @@ -396,85 +481,87 @@ .endm .macro INTT_REDUCE_L24 - Load_L24Coeffs - Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - PermWriteL24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL24 .endm .macro INTT_REDUCE_L44 - Load_L44Coeffs - Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 - Set_mont_consts - 
Load_next_4zetas - Perm_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - PermWriteL44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X 4, 9, 13, 17 + xxlor 10, 32+4, 32+4 + xxlor 11, 32+9, 32+9 + xxlor 12, 32+13, 32+13 + xxlor 13, 32+17, 32+17 + Set_mont_consts + Load_next_4zetas + Perm_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + PermWriteL44 .endm -.macro INTT_REDUCE_4X start next step - Load_4Coeffs \start, \next, \step - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 +.macro INTT_REDUCE_4X start next + Load_4Coeffs \start, \next + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .endm -# intt -# t = r[j]; -# r[j] = barrett_reduce(t + r[j + len]); -# r[j + len] = r[j + len] - t; -# r[j + len] = fqmul(zeta, r[j + len]); +/* + * main operations for intt + * t = r[j]; + * r[j] = barrett_reduce(t + r[j + len]); + * r[j + len] = r[j + len] - t; + * r[j + len] = fqmul(zeta, r[j + len]); + */ -# -# mlk_intt_ppc(r) -# +/* + * mlk_intt_ppc(r) + */ .global MLK_ASM_NAMESPACE(intt_ppc) .align 4 MLK_ASM_FN_SYMBOL(intt_ppc) - SAVE_REGS + SAVE_REGS - # init vectors and constants - # Setup for Montgomery reduce - lxvx 0, 0, 4 + /* init vectors and constants + Setup for Montgomery reduce */ + lxvx 0, 0, 4 - li 10, QINV_OFFSET - lxvx 32+V_QINV, 10, 4 # QINV - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 # QINV - xxlor 3, 32+3, 32+3 # 0 - xxlor 4, 32+4, 32+4 # 1 + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, 4 + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 /* QINV */ + xxlor 3, 32+3, 32+3 /* 0 vector */ + xxlor 4, 32+4, 32+4 /* 1 vector */ - # Setup for Barrett reduce - li 10, Q_OFFSET - li 11, C20159_OFFSET - lxvx 6, 10, 4 # V_MKQ - lxvx 
32+V20159, 11, 4 # V20159 + /* Setup for Barrett reduce */ + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, 4 /* V_MKQ */ + lxvx 32+V20159, 11, 4 /* V20159 */ - vspltisw 8, 13 - vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 # V_26 store at vs8 + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 /* V_26 store at vs8 */ - vspltisw 9, 1 - vsubuwm 10, 8, 9 # 25 - vslw 9, 9, 10 - xxlor 7, 32+9, 32+9 # V_25 syore at vs7 + vspltisw 9, 1 + vsubuwm 10, 8, 9 /* value 25 */ + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 /* V_25 syore at vs7 */ li 10, 16 li 11, 32 @@ -484,12 +571,12 @@ MLK_ASM_FN_SYMBOL(intt_ppc) li 17, 96 li 18, 112 - # - # Montgomery reduce loops with constant 1441 - # + /* + * Montgomery reduce loops with constant 1441 + */ addi 14, 4, C1441_OFFSET lvx V1441, 0, 14 - li 8, 4 # loops + li 8, 4 mtctr 8 Set_mont_consts @@ -504,193 +591,184 @@ intt_ppc__Loopf: addi 3, 3, -512 .align 4 - # - # 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # Update zetas vectors, each vector has 2 zetas - addi 14, 4, ZETA_INTT_OFFSET - li 7, 4 - li 8, 4 - mtctr 8 - mr 5, 3 + /* + * 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * Update zetas vectors, each vector has 2 zetas + * Load zeta array in 2-2-2-2 layout + */ + addi 14, 4, ZETA_INTT_OFFSET + li 7, 4 /* len * 2 */ + li 8, 4 + mtctr 8 + mr 5, 3 intt_ppc__Loop2: - INTT_REDUCE_L24 - addi 5, 5, 128 - bdnz intt_ppc__Loop2 + INTT_REDUCE_L24 + addi 5, 5, 128 + bdnz intt_ppc__Loop2 .align 4 - # - # 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - mr 5, 3 - li 7, 8 - li 8, 4 # loops - mtctr 8 + /* + * 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * Load zeta array in 4-4 layout + */ + mr 5, 3 + li 7, 8 + li 8, 4 + mtctr 8 intt_ppc__Loop4: - INTT_REDUCE_L44 - addi 5, 5, 128 - bdnz intt_ppc__Loop4 + INTT_REDUCE_L44 + addi 5, 5, 128 + bdnz intt_ppc__Loop4 .align 4 - # 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - li 7, 16 - li 5, 0 - li 15, 4 # loops - mtctr 15 + /* + * 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + */ + li 7, 16 -intt_ppc__Loop8: - INTT_REDUCE_4X 5, 32, 32 - addi 5, 5, 128 - bdnz intt_ppc__Loop8 + INTT_REDUCE_4X 0, 32 + INTT_REDUCE_4X 128, 32 + INTT_REDUCE_4X 256, 32 + INTT_REDUCE_4X 384, 32 .align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - li 5, 0 - li 7, 32 + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + */ + li 7, 32 - INTT_REDUCE_4X 5, 64, 64 + INTT_REDUCE_4X 0, 64 - li 5, 16 - addi 14, 14, -64 - INTT_REDUCE_4X 5, 64, 64 + addi 14, 14, -64 + INTT_REDUCE_4X 16, 64 - li 5, 256 - INTT_REDUCE_4X 5, 64, 64 + INTT_REDUCE_4X 256, 64 - li 5, 272 - addi 14, 14, -64 - INTT_REDUCE_4X 5, 64, 64 + addi 14, 14, -64 + INTT_REDUCE_4X 272, 64 .align 4 - # - # 5. len = 32, start = 0, 64, 128, 192 - li 5, 0 - li 7, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + /* + * 5. len = 32, start = 0, 64, 128, 192 + */ + li 7, 64 + + Load_4Coeffs 0, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 128, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 256, 16 + BREDUCE_4X 4, 9, 13, 17 + 
Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 384 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 384, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # - # 6. len = 64, start = 0, 128 - li 5, 0 - li 7, 128 - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 256 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 320 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + /* + * 6. 
len = 64, start = 0, 128 + */ + li 7, 128 + Load_4Coeffs 0, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 64, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 256, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 320, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 .align 4 - # 7. len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - lvx V_ZETA, 0, 14 - xxlor 9, 32+10, 32+10 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 64 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 128 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 - li 5, 192 - - Load_4Coeffs 5, 16, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 - Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 
32+13, 32+18, 32+23, 32+28 - - RESTORE_REGS - blr + /* + * 7. len = 128, start = 0 + */ + li 7, 256 /* len*2 */ + + Load_4Coeffs 0, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + lvx V_ZETA, 0, 14 + xxlor 9, 32+10, 32+10 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 64, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 128, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + Load_4Coeffs 192, 16 + BREDUCE_4X 4, 9, 13, 17 + Write_B4C 32+4, 32+9, 32+13, 32+17 + Set_mont_consts + xxlor 32+10, 9, 9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 + Write_M4C 32+13, 32+18, 32+23, 32+28 + + RESTORE_REGS + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index dbe7c82fa5..9c837b0fb7 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -3,12 +3,12 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -18,148 +18,199 @@ #define V_QINV 2 #define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 #define V_ZETA 10 .machine "any" .text .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - 
ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coeeficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. + * + * registers used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li 9, \start /* first offset to j */ + add 10, 7, 9 /* J + len*2 */ + addi 16, 9, \next + addi 17, 10, \next + addi 18, 16, \next + addi 19, 17, \next + addi 20, 18, \next + addi 21, 19, \next +.endm + +/* + * Load coefficient in r[j+len] (r') vectors from offset, R10, R17, R19 and R21 + * r[j+len]: V13, V18, V23, V28 + */ +.macro Load_4Rjp + lxvd2x 32+13, 3, 10 /* V13: vector r'0 */ + lxvd2x 32+18, 3, 17 /* V18: vector for r'1 */ + lxvd2x 32+23, 3, 19 /* V23: vector for r'2 */ + lxvd2x 32+28, 3, 21 /* V28: vector for r'3 */ .endm -.macro Load_4Coeffs start next step - mr 9, \start - add 10, 7, 9 # J + len*2 - addi 16, 9, 
\next - addi 17, 10, \step - addi 18, 16, \next - addi 19, 17, \step - addi 20, 18, \next - addi 21, 19, \step - lxvd2x 32+13, 3, 10 # r[j+len] - lxvd2x 32+18, 3, 17 # r[j+len] - lxvd2x 32+23, 3, 19 # r[j+len] - lxvd2x 32+28, 3, 21 # r[j+len] +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp .endm -# -# Load Coeffients and setup vectors -# aj0, aj1, ajlen2, ajlen3, aj4, aj5, ajlen6, ajlen7 -# aj8, aj9, ajlen10, ajlen11, aj12, aj13, ajlen14, ajlen15 -# -# a[j]= aj0, aj1, aj8, aj9, aj4, aj5, aj12, aj13 -# a[j+len]= ajlen2, ajlen3, ajlen10, ajlen11, ajlen6, ajlen7, ajlen14, ajlen15 -# +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V13, V18, V23, V28 + * r[j]: V12, V17, V22, V27 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 # a[j], r[j+len] - lxvd2x 32+26, 10, 5 # a[j], r[j+len] + lxvd2x 32+25, 0, 5 + lxvd2x 32+26, 10, 5 vmrgew 13, 25, 26 vmrgow 12, 25, 26 - lxvd2x 32+25, 11, 5 # a[j], r[j+len] - lxvd2x 32+26, 12, 5 # a[j], r[j+len] + lxvd2x 32+25, 11, 5 + lxvd2x 32+26, 12, 5 vmrgew 18, 25, 26 vmrgow 17, 25, 26 - lxvd2x 32+25, 15, 5 # a[j], r[j+len] - lxvd2x 32+26, 16, 5 # a[j], r[j+len] + lxvd2x 32+25, 15, 5 + lxvd2x 32+26, 16, 5 vmrgew 23, 25, 26 vmrgow 22, 25, 26 - lxvd2x 32+25, 17, 5 # a[j], r[j+len] - lxvd2x 32+26, 18, 5 # a[j], r[j+len] + lxvd2x 32+25, 17, 5 + lxvd2x 32+26, 18, 5 vmrgew 28, 25, 26 vmrgow 27, 25, 26 .endm -# -# Permute -# rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 -# rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 -# -# to -# rjlen4 - rjlen7, rjlen12 - rjlen15 -# rj0 - rj4, rj8 - rj11 -# +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ .macro Load_L44Coeffs - lxvd2x 1, 0, 5 # rj0, rj1, rj2, rj3, - # rjlen4, rjlen5, rjlen6, rjlen7 - lxvd2x 2, 10, 5 # rj8, rj9, rj10, rj11 - # rjlen12, rjlen13, rjlen14, rjlen15 - xxpermdi 32+13, 2, 1, 3 # rjlen4 - rjlen7, rjlen12 - rjlen15 - xxpermdi 32+12, 2, 1, 0 # rj0 - rj4, rj8 - rj11 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxpermdi 32+13, 2, 1, 3 + xxpermdi 32+12, 2, 1, 0 lxvd2x 3, 11, 5 lxvd2x 4, 12, 5 xxpermdi 32+18, 4, 3, 3 @@ -174,104 +225,118 @@ xxpermdi 32+27, 4, 3, 0 .endm -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) -# +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) + */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 - # fqmul = zeta * coefficient - # Modular multification bond by 2^16 * q in abs value - vmladduhm 15, 13, \_vz0, 3 - vmladduhm 20, 18, \_vz1, 3 - vmladduhm 25, 23, \_vz2, 3 - vmladduhm 30, 28, \_vz3, 3 - - # Signed multiply-high-round; outputs are bound by 2^15 * q in abs value - vmhraddshs 14, 13, \_vz0, 3 - vmhraddshs 19, 18, \_vz1, 3 - vmhraddshs 24, 23, \_vz2, 3 - vmhraddshs 29, 28, \_vz3, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - vsrah 13, 15, 4 # >> 1 - vsrah 18, 20, 4 # >> 1 - vsrah 23, 25, 4 # >> 1 - vsrah 28, 30, 4 # >> 1 - + /* fqmul = zeta * coefficient + Modular multification bond by 2^16 * q in abs value */ + vmladduhm 15, 13, \_vz0, 3 + vmladduhm 20, 18, \_vz1, 3 + vmladduhm 25, 23, \_vz2, 3 + vmladduhm 30, 28, \_vz3, 3 + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs 14, 13, \_vz0, 3 + vmhraddshs 19, 18, \_vz1, 3 + vmhraddshs 24, 23, \_vz2, 3 + 
vmhraddshs 29, 28, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + /* Shift right 1 bit */ + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 .endm -.macro Load_4Aj - lxvd2x 32+12, 3, 9 # r[j] - lxvd2x 32+17, 3, 16 # r[j] - lxvd2x 32+22, 3, 18 # r[j] - lxvd2x 32+27, 3, 20 # r[j] +/* + * Load 4 r[j] (r) coefficient vectors: + * Load coefficient in vectors from offset, R9, R16, R18 and R20 + * r[j]: V12, V17, V22, V27 + */ +.macro Load_4Rj + lxvd2x 32+12, 3, 9 /* V12: vector r0 */ + lxvd2x 32+17, 3, 16 /* V17: vector r1 */ + lxvd2x 32+22, 3, 18 /* V22: vector r2 */ + lxvd2x 32+27, 3, 20 /* V27: vector r3 */ .endm +/* + * Compute final final r[j] and r[j+len] + * final r[j+len]: V16, V21, V26, V31 + * final r[j]: V15, V20, V25, V30 + */ .macro Compute_4Coeffs - # Since the result of the Montgomery multiplication is bounded - # by q in absolute value. - # Finally to complete the final update of the results with add/sub - vsubuhm 16, 12, 13 # r - t - vadduhm 15, 13, 12 # r + t - vsubuhm 21, 17, 18 # r - t - vadduhm 20, 18, 17 # r + t - vsubuhm 26, 22, 23 # r - t - vadduhm 25, 23, 22 # r + t - vsubuhm 31, 27, 28 # r - t - vadduhm 30, 28, 27 # r + t -.endm - -.macro NTT_MREDUCE_4X start next step _vz0 _vz1 _vz2 _vz3 - Load_4Coeffs \start, \next, \step - MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 - Load_4Aj - Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. 
+ r[j+len] = r[j] - t + */ + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 .endm .macro Write_One - stxvd2x 32+15, 3, 9 - stxvd2x 32+16, 3, 10 - stxvd2x 32+20, 3, 16 - stxvd2x 32+21, 3, 17 - stxvd2x 32+25, 3, 18 - stxvd2x 32+26, 3, 19 - stxvd2x 32+30, 3, 20 - stxvd2x 32+31, 3, 21 + stxvd2x 32+15, 3, 9 + stxvd2x 32+16, 3, 10 + stxvd2x 32+20, 3, 16 + stxvd2x 32+21, 3, 17 + stxvd2x 32+25, 3, 18 + stxvd2x 32+26, 3, 19 + stxvd2x 32+30, 3, 20 + stxvd2x 32+31, 3, 21 .endm +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ .macro PermWriteL44 - Compute_4Coeffs - xxpermdi 0, 32+15, 32+16, 3 - xxpermdi 1, 32+15, 32+16, 0 - xxpermdi 2, 32+20, 32+21, 3 - xxpermdi 3, 32+20, 32+21, 0 - xxpermdi 4, 32+25, 32+26, 3 - xxpermdi 5, 32+25, 32+26, 0 - xxpermdi 6, 32+30, 32+31, 3 - xxpermdi 7, 32+30, 32+31, 0 - stxvd2x 0, 0, 5 - stxvd2x 1, 10, 5 - stxvd2x 2, 11, 5 - stxvd2x 3, 12, 5 - stxvd2x 4, 15, 5 - stxvd2x 5, 16, 5 - stxvd2x 6, 17, 5 - stxvd2x 7, 18, 5 + Compute_4Coeffs + xxpermdi 0, 32+15, 32+16, 3 + xxpermdi 1, 32+15, 32+16, 0 + xxpermdi 2, 32+20, 32+21, 3 + xxpermdi 3, 32+20, 32+21, 0 + xxpermdi 4, 32+25, 32+26, 3 + xxpermdi 5, 32+25, 32+26, 0 + xxpermdi 6, 32+30, 32+31, 3 + xxpermdi 7, 32+30, 32+31, 0 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 .endm +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
+ */ .macro PermWriteL24 Compute_4Coeffs vmrgew 10, 16, 15 @@ -282,226 +347,205 @@ vmrgow 15, 26, 25 vmrgew 16, 31, 30 vmrgow 17, 31, 30 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro Load_next_4zetas - li 10, 16 - li 11, 32 - li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 10, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 10, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 .endm +/* + * Re-ordering of the 4-4 layout zetas. + * Swap double-words. + */ .macro Perm_4zetas - xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 - xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 - xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 - xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 + xxpermdi 32+V_Z0, 32+V_Z0, 32+V_Z0, 2 + xxpermdi 32+V_Z1, 32+V_Z1, 32+V_Z1, 2 + xxpermdi 32+V_Z2, 32+V_Z2, 32+V_Z2, 2 + xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 +.endm + +.macro NTT_MREDUCE_4X start next _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Rj + Compute_4Coeffs + Write_One .endm -# -# mlk_ntt_ppc(int16_t *r) -# +/* + * mlk_ntt_ppc(int16_t *r) + */ .global MLK_ASM_NAMESPACE(ntt_ppc) .align 4 MLK_ASM_FN_SYMBOL(ntt_ppc) - SAVE_REGS + SAVE_REGS - # get MLKEM_Q - lvx V_NMKQ,0,4 + /* load MLKEM_Q */ + lvx V_NMKQ,0,4 - # zetas array - addi 14, 4, ZETA_NTT_OFFSET + /* Register 14 as pointer to zetas array */ + addi 14, 4, ZETA_NTT_OFFSET - vxor 3, 3, 3 - vspltish 4, 1 + vxor 3, 3, 3 + vspltish 4, 1 - li 10, QINV_OFFSET - lvx V_QINV, 10, 4 + li 10, QINV_OFFSET + lvx V_QINV, 10, 4 .align 4 - # - # Compute coefficients of the NTT based on the 
following loop. - # for (len = 128; len ≥ 2; len = len/2) - # - # 1. len = 128, start = 0 - # - li 5, 0 # start - li 7, 256 # len * 2 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 192 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + /* + * Compute coefficients of the NTT based on the following loop. + * for (len = 128; len ≥ 2; len = len/2) + * + * 1. len = 128, start = 0 + */ + li 7, 256 /* len * 2 */ + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 - # - # 2. len = 64, start = 0, 128 - # k += 2 - li 5, 0 - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 64 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 320 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + /* + * 2. len = 64, start = 0, 128 + * k += 2 + */ + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 - # - # 3. 
len = 32, start = 0, 64, 128, 192 - # k += 4 - li 5, 0 - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 128 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 256 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One - li 5, 384 - - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - NTT_MREDUCE_4X 5, 16, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - Write_One + /* + * 3. len = 32, start = 0, 64, 128, 192 + * k += 4 + */ + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, 14 + addi 14, 14, 16 + NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 - # - # 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - # k += 8 - li 5, 0 - li 7, 32 - Load_next_4zetas - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 16 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - Load_next_4zetas - li 5, 256 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 272 - NTT_MREDUCE_4X 5, 64, 64, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One + /* + * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + * k += 8 + */ + li 7, 32 + Load_next_4zetas + NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 .align 4 - # - # 5. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 - # k += 16 - li 5, 0 - li 7, 16 - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 128 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 256 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - li 5, 384 - - Load_next_4zetas - NTT_MREDUCE_4X 5, 32, 32, V_Z0, V_Z1, V_Z2, V_Z3 - Write_One - - # - # 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - # k += 32 - li 15, 4 # loops - mtctr 15 - mr 5, 3 - li 7, 8 - - li 10, 16 - li 11, 32 - li 12, 48 - li 15, 64 - li 16, 80 - li 17, 96 - li 18, 112 + /* + * 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 + * k += 16 + */ + li 7, 16 + Load_next_4zetas + NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 128, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 384, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + /* + * 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 + * k += 32 + * Load zeta vectors in 4-4 layout + */ + li 15, 4 + mtctr 15 + mr 5, 3 /* Let r5 points to coefficient array */ + li 7, 8 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 .align 4 ntt_ppc__Len4: - Load_next_4zetas - Perm_4zetas + Load_next_4zetas + Perm_4zetas Load_L44Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL44 - addi 5, 5, 128 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi 5, 5, 128 - bdnz ntt_ppc__Len4 + bdnz ntt_ppc__Len4 - # - # 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - # k += 64 - # Update zetas vectors, each vector has 2 zetas + /* + * 7. 
len = 2, start = 0, 4, 8, 12,...244, 248, 252 + * k += 64 + * Load zeta vectors in 2-2-2-2 layout + */ - li 8, 4 - mtctr 8 - mr 5, 3 - li 7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 /* Let r5 points to coefficient array */ + li 7, 4 .align 4 ntt_ppc__Len2: - Load_next_4zetas + Load_next_4zetas Load_L24Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL24 - addi 5, 5, 128 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi 5, 5, 128 - bdnz ntt_ppc__Len2 + bdnz ntt_ppc__Len2 - RESTORE_REGS - blr + RESTORE_REGS + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index 765ef91763..4ca5771314 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -3,18 +3,19 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. -# -#=================================================================================== -# Written by Danny Tsen -# - -# Poly_tomont: Inplace conversion of all coefficients of a polynomial -# from normal domain to Montgomery domain -# -# Arguments:*r: pointer to input/output polynomial -# +/* + * Copyright 2025- IBM Corp. 
+ * + * =================================================================================== + * Written by Danny Tsen + */ + +/* + * Poly_tomont: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments:*r: pointer to input/output polynomial + */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -29,150 +30,151 @@ .machine "any" .text -# -# montgomery_reduce -# t = a * QINV -# t = (a - (int32_t)t*_MLKEM_Q) >> 16 -# -#----------------------------------- -# MREDUCE_4X(_v0, _v1, _v2, _v3) -# +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_v0, _v1, _v2, _v3) + */ .macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+13, 0, 3 - addi 3, 3, 16 - lxvd2x 32+18, 0, 3 - addi 3, 3, 16 - lxvd2x 32+23, 0, 3 - addi 3, 3, 16 - lxvd2x 32+7, 0, 3 - addi 3, 3, 16 - - vmladduhm 15, 13, V1353, 3 - vmladduhm 20, 18, V1353, 3 - vmladduhm 25, 23, V1353, 3 - vmladduhm 9, 7, V1353, 3 - - vmhraddshs 14, 13, V1353, 3 - vmhraddshs 19, 18, V1353, 3 - vmhraddshs 24, 23, V1353, 3 - vmhraddshs 8, 7, V1353, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 9, 9, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 9, 9, V_NMKQ, 8 - - vsrah \_v0, 15, 4 # >> 1 - vsrah \_v1, 20, 4 # >> 1 - vsrah \_v2, 25, 4 # >> 1 - vsrah \_v3, 9, 4 # >> 1 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, 
V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + /* Shift right 1 bit */ + vsrah \_v0, 15, 4 + vsrah \_v1, 20, 4 + vsrah \_v2, 25, 4 + vsrah \_v3, 9, 4 .endm .macro Write_8X - stxvd2x 32+27, 4, 3 - stxvd2x 32+28, 5, 3 - stxvd2x 32+29, 6, 3 - stxvd2x 32+30, 7, 3 - stxvd2x 32+13, 8, 3 - stxvd2x 32+18, 9, 3 - stxvd2x 32+23, 10, 3 - stxvd2x 32+7, 11, 3 + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu 1, -320(1) - mflr 0 - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - stxvx 32+25, 11, 1 - stxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - stxvx 32+27, 6, 1 - stxvx 32+28, 7, 1 - stxvx 32+29, 8, 1 - stxvx 32+30, 9, 1 - - li 6, NQ_OFFSET - li 7, QINV_OFFSET - li 8, C1353_OFFSET - lxvx 32+V_NMKQ, 6, 4 - lxvx 32+V_QINV, 7, 4 - lxvx 32+V1353, 8, 4 - - vxor 3, 3, 3 - vspltish 4, 1 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 - Write_8X - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - lxvx 32+25, 11, 1 - lxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - lxvx 32+27, 6, 1 - lxvx 32+28, 7, 1 - lxvx 32+29, 8, 1 - lxvx 
32+30, 9, 1 - mtlr 0 - addi 1, 1, 320 - blr + stdu 1, -320(1) + mflr 0 + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 + mtlr 0 + addi 1, 1, 320 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index 40c7a4cef5..3b6892d867 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -3,19 +3,19 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -# -# Copyright 2025- IBM Corp. 
-# -#=================================================================================== -# Written by Danny Tsen -# - -# -# poly_reduce: Applies Barrett reduction to all coefficients of a polynomial -# for details of the Barrett reduction -# -# Arguments: *r: pointer to input/output polynomial -# +/* + * Copyright 2025- IBM Corp. + * + * =================================================================================== + * Written by Danny Tsen + */ + +/* + * poly_reduce: Applies Barrett reduction to all coefficients of a polynomial + * for details of the Barrett reduction + * + * Arguments: *r: pointer to input/output polynomial + */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -33,194 +33,194 @@ .text .macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+8, 0, 3 - lxvd2x 32+12, 14, 3 - lxvd2x 32+16, 15, 3 - lxvd2x 32+20, 16, 3 - addi 3, 3, 64 - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 
15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm .macro Write_8X - stxvd2x 32+21, 4, 3 - stxvd2x 32+22, 5, 3 - stxvd2x 32+23, 6, 3 - stxvd2x 32+24, 7, 3 - stxvd2x 32+4, 8, 3 - stxvd2x 32+9, 9, 3 - stxvd2x 32+13, 10, 3 - stxvd2x 32+17, 11, 3 + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 .endm -# -# Conditional addition to get unsigned canonical representative -# +/* + * Conditional addition to get unsigned canonical representative + */ .macro To_unsigned_16 lxvd2x 32+12, 0, 3 lxvd2x 32+13, 14, 3 lxvd2x 32+14, 15, 3 lxvd2x 32+15, 16, 3 - addi 3, 3, 64 - vsrh 1, 12, 10 - vsrh 0, 13, 10 - vsrh 3, 14, 10 - vsrh 2, 15, 10 - vadduhm 7, 12, 11 - vadduhm 8, 13, 11 - vadduhm 5, 14, 11 - vadduhm 6, 15, 11 - vcmpequh 1, 1, 9 - vcmpequh 0, 0, 9 - vcmpequh 3, 3, 9 - vcmpequh 2, 2, 9 - xxsel 32+1, 32+7,32+12, 32+1 - xxsel 
32+0, 32+8,32+13, 32+0 - xxsel 32+3, 32+5,32+14, 32+3 - xxsel 32+2, 32+6,32+15, 32+2 - stxvd2x 32+3, 10, 3 - stxvd2x 32+2, 11, 3 - stxvd2x 32+1, 8, 3 - stxvd2x 32+0, 9, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(reduce_ppc) MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu 1, -224(1) - mflr 0 - std 14, 96(1) - std 15, 104(1) - std 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - - vxor 7, 7, 7 - - li 6, Q_OFFSET - li 7, C20159_OFFSET - lxvx 32+V_MKQ, 6, 4 - lxvx 32+V20159, 7, 4 - - vspltisw V_26, 13 - vadduwm V_26, V_26, V_26 - vspltisw 4, 1 - vsubuwm 5, V_26, 4 - vslw V_25, 4, 5 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - li 14, 16 - li 15, 32 - li 16, 48 - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 - Write_8X - - # - # To unsigned canonical - # + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + + vxor 7, 7, 7 + + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 
+ vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + /* + * To unsigned canonical + */ .align 4 - addi 3, 3, -512 - vxor 9, 9, 9 - vspltish 10, 15 - vmr 11, V_MKQ - - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - To_unsigned_16 - - ld 14, 96(1) - ld 15, 104(1) - ld 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - mtlr 0 - addi 1, 1, 224 - blr + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + mtlr 0 + addi 1, 1, 224 + blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ From beef1fbeb215bd8b7f7f24a3bfaa489c9f041c9a Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Tue, 11 Nov 2025 06:01:46 -0500 Subject: [PATCH 19/22] Addition of #define for all registers used. 
Signed-off-by: Danny Tsen --- dev/ppc64le/src/consts.h | 67 ++ dev/ppc64le/src/intt_ppc.S | 814 ++++++++++----------- dev/ppc64le/src/ntt_ppc.S | 472 ++++++------ dev/ppc64le/src/poly_tomont.S | 234 +++--- dev/ppc64le/src/reduce.S | 296 ++++---- mlkem/src/native/ppc64le/src/consts.h | 67 ++ mlkem/src/native/ppc64le/src/intt_ppc.S | 814 ++++++++++----------- mlkem/src/native/ppc64le/src/ntt_ppc.S | 472 ++++++------ mlkem/src/native/ppc64le/src/poly_tomont.S | 234 +++--- mlkem/src/native/ppc64le/src/reduce.S | 296 ++++---- 10 files changed, 1950 insertions(+), 1816 deletions(-) diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index b5e66983fe..96cf7cfc91 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -19,6 +19,73 @@ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) extern const int16_t mlk_ppc_qdata[]; +#else +#define r0 0 +#define r1 1 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 #endif #endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/intt_ppc.S 
b/dev/ppc64le/src/intt_ppc.S index 85ba00482b..d311138275 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -37,78 +37,78 @@ #define V1441 10 .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li r10, 128 + li r11, 144 + li r12, 160 + 
li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 .endm /* @@ -123,14 +123,14 @@ r[j] = r[j] + t. r[j+len] = r[j] - t */ - vsubuhm 25, 8, 21 - vsubuhm 26, 12, 22 - vsubuhm 30, 16, 23 - vsubuhm 31, 20, 24 - vadduhm 8, 8, 21 - vadduhm 12, 12, 22 - vadduhm 16, 16, 23 - vadduhm 20, 20, 24 + vsubuhm v25, v8, v21 + vsubuhm v26, v12, v22 + vsubuhm v30, v16, v23 + vsubuhm v31, v20, v24 + vadduhm v8, v8, v21 + vadduhm v12, v12, v22 + vadduhm v16, v16, v23 + vadduhm v20, v20, v24 .endm /* @@ -155,14 +155,14 @@ * */ .macro Init_Coeffs_offset start next - li 9, \start /* first offset to j */ - add 10, 7, 9 /* J + len*2 */ - addi 16, 9, \next - addi 17, 10, \next - addi 18, 16, \next - addi 19, 17, \next - addi 20, 18, \next - addi 21, 19, \next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next .endm /* @@ -174,15 +174,15 @@ * r[j]: V21, V22, V23, V24 */ .macro Load_4Rjp - lxvd2x 32+8, 3, 10 /* V8: vector r'0 */ - lxvd2x 32+12, 3, 17 /* V12: vector for r'1 */ - lxvd2x 32+16, 3, 19 /* V16: vector for r'2 */ - lxvd2x 32+20, 3, 21 /* V20: vector for r'3 */ - - lxvd2x 32+21, 3, 9 /* V21: vector r0 */ - lxvd2x 32+22, 3, 16 /* V22: vector r1 */ - lxvd2x 32+23, 3, 18 /* V23: vector r2 */ - lxvd2x 32+24, 3, 20 /* V24: vector r3 */ + lxvd2x 32+v8, r3, r10 /* V8: vector r'0 */ + lxvd2x 32+v12, r3, r17 /* V12: vector for 
r'1 */ + lxvd2x 32+v16, r3, r19 /* V16: vector for r'2 */ + lxvd2x 32+v20, r3, r21 /* V20: vector for r'3 */ + + lxvd2x 32+v21, r3, r9 /* V21: vector r0 */ + lxvd2x 32+v22, r3, r16 /* V22: vector r1 */ + lxvd2x 32+v23, r3, r18 /* V23: vector r2 */ + lxvd2x 32+v24, r3, r20 /* V24: vector r3 */ .endm /* @@ -214,22 +214,22 @@ * in the proper order to match the multiplication. */ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 - lxvd2x 32+26, 10, 5 - vmrgew 8, 25, 26 - vmrgow 21, 25, 26 - lxvd2x 32+25, 11, 5 - lxvd2x 32+26, 12, 5 - vmrgew 12, 25, 26 - vmrgow 22, 25, 26 - lxvd2x 32+25, 15, 5 - lxvd2x 32+26, 16, 5 - vmrgew 16, 25, 26 - vmrgow 23, 25, 26 - lxvd2x 32+25, 17, 5 - lxvd2x 32+26, 18, 5 - vmrgew 20, 25, 26 - vmrgow 24, 25, 26 + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v8, v25, v26 + vmrgow v21, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v12, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v16, v25, v26 + vmrgow v23, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v20, v25, v26 + vmrgow v24, v25, v26 .endm /* @@ -248,81 +248,81 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L44Coeffs - lxvd2x 10, 0, 5 - lxvd2x 11, 10, 5 - xxpermdi 32+8, 11, 10, 3 - xxpermdi 32+21, 11, 10, 0 - lxvd2x 10, 11, 5 - lxvd2x 11, 12, 5 - xxpermdi 32+12, 11, 10, 3 - xxpermdi 32+22, 11, 10, 0 - lxvd2x 10, 15, 5 - lxvd2x 11, 16, 5 - xxpermdi 32+16, 11, 10, 3 - xxpermdi 32+23, 11, 10, 0 - lxvd2x 10, 17, 5 - lxvd2x 11, 18, 5 - xxpermdi 32+20, 11, 10, 3 - xxpermdi 32+24, 11, 10, 0 + lxvd2x vs10, 0, r5 + lxvd2x vs11, r10, r5 + xxpermdi 32+v8, vs11, vs10, 3 + xxpermdi 32+v21, vs11, vs10, 0 + lxvd2x vs10, r11, r5 + lxvd2x vs11, r12, r5 + xxpermdi 32+v12, vs11, vs10, 3 + xxpermdi 32+v22, vs11, vs10, 0 + lxvd2x vs10, r15, r5 + lxvd2x vs11, r16, r5 + xxpermdi 32+v16, vs11, vs10, 3 + xxpermdi 32+v23, vs11, vs10, 0 + lxvd2x vs10, r17, r5 + lxvd2x vs11, r18, r5 + xxpermdi 32+v20, vs11, vs10, 3 + xxpermdi 32+v24, vs11, vs10, 0 .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 /* Restore constant vectors V_MKQ, V_25 and V_26 */ - vxor 7, 7, 7 - xxlor 32+3, 6, 6 - xxlor 32+1, 7, 7 - xxlor 32+2, 8, 8 + vxor v7, v7, v7 + xxlor 32+v3, vs6, vs6 + xxlor 32+v1, vs7, vs7 + xxlor 32+v2, vs8, vs8 /* Multify Odd/Even signed halfword; Results word bound by 2^32 in abs value. 
*/ - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 + vmulosh v6, v8, V20159 + vmulesh v5, v8, V20159 + vmulosh v11, v12, V20159 + vmulesh v10, v12, V20159 + vmulosh v15, v16, V20159 + vmulesh v14, v16, V20159 + vmulosh v19, v20, V20159 + vmulesh v18, v20, V20159 + xxmrglw 32+v4, 32+v5, 32+v6 + xxmrghw 32+v5, 32+v5, 32+v6 + xxmrglw 32+v9, 32+v10, 32+v11 + xxmrghw 32+v10, 32+v10, 32+v11 + xxmrglw 32+v13, 32+v14, 32+v15 + xxmrghw 32+v14, 32+v14, 32+v15 + xxmrglw 32+v17, 32+v18, 32+v19 + xxmrghw 32+v18, 32+v18, 32+v19 + vadduwm v4, v4, V_25 + vadduwm v5, v5, V_25 + vadduwm v9, v9, V_25 + vadduwm v10, v10, V_25 + vadduwm v13, v13, V_25 + vadduwm v14, v14, V_25 + vadduwm v17, v17, V_25 + vadduwm v18, v18, V_25 /* Right shift and pack lower halfword, results bond to 2^16 in abs value */ - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 + vsraw v4, v4, V_26 + vsraw v5, v5, V_26 + vsraw v9, v9, V_26 + vsraw v10, v10, V_26 + vsraw v13, v13, V_26 + vsraw v14, v14, V_26 + vsraw v17, v17, V_26 + vsraw v18, v18, V_26 + vpkuwum v4, v5, v4 + vsubuhm v4, v7, v4 + vpkuwum v9, v10, v9 + vsubuhm v9, v7, v9 + vpkuwum v13, v14, v13 + vsubuhm v13, v7, v13 + vpkuwum v17, v18, 
v17 + vsubuhm v17, v7, v17 /* Modulo multify-Low unsigned halfword; results bond to 2^16 * q in abs value. */ - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + vmladduhm \_v0, v4, V_MKQ, v8 + vmladduhm \_v1, v9, V_MKQ, v12 + vmladduhm \_v2, v13, V_MKQ, v16 + vmladduhm \_v3, v17, V_MKQ, v20 .endm /* @@ -331,32 +331,32 @@ */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 /* Modular multification bond by 2^16 * q in abs value */ - vmladduhm 15, 25, \_vz0, 3 - vmladduhm 20, 26, \_vz1, 3 - vmladduhm 27, 30, \_vz2, 3 - vmladduhm 28, 31, \_vz3, 3 + vmladduhm v15, v25, \_vz0, v3 + vmladduhm v20, v26, \_vz1, v3 + vmladduhm v27, v30, \_vz2, v3 + vmladduhm v28, v31, \_vz3, v3 /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs 14, 25, \_vz0, 3 - vmhraddshs 19, 26, \_vz1, 3 - vmhraddshs 24, 30, \_vz2, 3 - vmhraddshs 29, 31, \_vz3, 3 + vmhraddshs v14, v25, \_vz0, v3 + vmhraddshs v19, v26, \_vz1, v3 + vmhraddshs v24, v30, \_vz2, v3 + vmhraddshs v29, v31, \_vz3, v3 - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v27, V_QINV, v3 + vmladduhm v30, v28, V_QINV, v3 - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 /* Shift right 1 bit */ - vsrah \_vo0, 15, 4 - vsrah \_vo1, 20, 4 - vsrah \_vo2, 25, 4 - vsrah \_vo3, 30, 4 + vsrah \_vo0, v15, v4 + vsrah \_vo1, v20, v4 + vsrah \_vo2, v25, v4 + vsrah \_vo3, v30, v4 .endm /* @@ -364,21 +364,21 @@ * V_NMKQ, V_QINV, Zero vector, One vector */ .macro Set_mont_consts - xxlor 32+5, 0, 0 /* V_NMKQ */ - xxlor 32+2, 2, 2 /* V_QINV */ - xxlor 32+3, 3, 3 /* all 
0 */ - xxlor 32+4, 4, 4 /* all 1 */ + xxlor 32+v5, vs0, vs0 /* V_NMKQ */ + xxlor 32+v2, vs2, vs2 /* V_QINV */ + xxlor 32+v3, vs3, vs3 /* all 0 */ + xxlor 32+v4, vs4, vs4 /* all 1 */ .endm .macro Load_next_4zetas - li 8, 16 - li 11, 32 - li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 8, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + li r8, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r8, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 .endm /* @@ -393,38 +393,38 @@ .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 9 - stxvd2x \_vs1, 3, 16 - stxvd2x \_vs2, 3, 18 - stxvd2x \_vs3, 3, 20 + stxvd2x \_vs0, r3, r9 + stxvd2x \_vs1, r3, r16 + stxvd2x \_vs2, r3, r18 + stxvd2x \_vs3, r3, r20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 10 - stxvd2x \_vs1, 3, 17 - stxvd2x \_vs2, 3, 19 - stxvd2x \_vs3, 3, 21 + stxvd2x \_vs0, r3, r10 + stxvd2x \_vs1, r3, r17 + stxvd2x \_vs2, r3, r19 + stxvd2x \_vs3, r3, r21 .endm .macro Reload_4coeffs - lxvd2x 32+25, 0, 3 - lxvd2x 32+26, 10, 3 - lxvd2x 32+30, 11, 3 - lxvd2x 32+31, 12, 3 - addi 3, 3, 64 + lxvd2x 32+v25, 0, r3 + lxvd2x 32+v26, r10, r3 + lxvd2x 32+v30, r11, r3 + lxvd2x 32+v31, r12, r3 + addi r3, r3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi 3, 3, -128 - stxvd2x \_vs0, 0, 3 - stxvd2x \_vs1, 10, 3 - stxvd2x \_vs2, 11, 3 - stxvd2x \_vs3, 12, 3 - stxvd2x \_vs4, 15, 3 - stxvd2x \_vs5, 16, 3 - stxvd2x \_vs6, 17, 3 - stxvd2x \_vs7, 18, 3 - addi 3, 3, 128 + addi r3, r3, -128 + stxvd2x \_vs0, 0, r3 + stxvd2x \_vs1, r10, r3 + stxvd2x \_vs2, r11, r3 + stxvd2x \_vs3, r12, r3 + stxvd2x \_vs4, r15, r3 + stxvd2x \_vs5, r16, r3 + stxvd2x \_vs6, r17, r3 + stxvd2x \_vs7, r18, r3 + addi r3, r3, 128 .endm /* @@ -432,26 +432,26 @@ * coefficient array order. 
*/ .macro PermWriteL44 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 - xxpermdi 32+10, 32+14, 32+13, 3 - xxpermdi 32+11, 32+14, 32+13, 0 - xxpermdi 32+12, 32+19, 32+18, 3 - xxpermdi 32+13, 32+19, 32+18, 0 - xxpermdi 32+14, 32+24, 32+23, 3 - xxpermdi 32+15, 32+24, 32+23, 0 - xxpermdi 32+16, 32+29, 32+28, 3 - xxpermdi 32+17, 32+29, 32+28, 0 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + xxpermdi 32+v10, 32+v14, 32+v13, 3 + xxpermdi 32+v11, 32+v14, 32+v13, 0 + xxpermdi 32+v12, 32+v19, 32+v18, 3 + xxpermdi 32+v13, 32+v19, 32+v18, 0 + xxpermdi 32+v14, 32+v24, 32+v23, 3 + xxpermdi 32+v15, 32+v24, 32+v23, 0 + xxpermdi 32+v16, 32+v29, 32+v28, 3 + xxpermdi 32+v17, 32+v29, 32+v28, 0 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 .endm /* @@ -459,65 +459,65 @@ * coefficient array order. 
*/ .macro PermWriteL24 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 - vmrgew 10, 13, 14 - vmrgow 11, 13, 14 - vmrgew 12, 18, 19 - vmrgow 13, 18, 19 - vmrgew 14, 23, 24 - vmrgow 15, 23, 24 - vmrgew 16, 28, 29 - vmrgow 17, 28, 29 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + vmrgew v10, v13, v14 + vmrgow v11, v13, v14 + vmrgew v12, v18, v19 + vmrgow v13, v18, v19 + vmrgew v14, v23, v24 + vmrgow v15, v23, v24 + vmrgew v16, v28, v29 + vmrgow v17, v28, v29 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 .endm .macro INTT_REDUCE_L24 Load_L24Coeffs Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 PermWriteL24 .endm .macro INTT_REDUCE_L44 Load_L44Coeffs Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 Set_mont_consts Load_next_4zetas Perm_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 PermWriteL44 .endm .macro INTT_REDUCE_4X start next Load_4Coeffs \start, \next - BREDUCE_4X 
4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 .endm /* @@ -539,57 +539,57 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* init vectors and constants Setup for Montgomery reduce */ - lxvx 0, 0, 4 + lxvx vs0, 0, r4 - li 10, QINV_OFFSET - lxvx 32+V_QINV, 10, 4 - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 /* QINV */ - xxlor 3, 32+3, 32+3 /* 0 vector */ - xxlor 4, 32+4, 32+4 /* 1 vector */ + li r10, QINV_OFFSET + lxvx 32+V_QINV, r10, r4 + xxlxor 32+v3, 32+v3, 32+v3 + vspltish v4, 1 + xxlor vs2, 32+v2, 32+v2 /* QINV */ + xxlor vs3, 32+v3, 32+v3 /* 0 vector */ + xxlor vs4, 32+v4, 32+v4 /* 1 vector */ /* Setup for Barrett reduce */ - li 10, Q_OFFSET - li 11, C20159_OFFSET - lxvx 6, 10, 4 /* V_MKQ */ - lxvx 32+V20159, 11, 4 /* V20159 */ - - vspltisw 8, 13 - vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 /* V_26 store at vs8 */ - - vspltisw 9, 1 - vsubuwm 10, 8, 9 /* value 25 */ - vslw 9, 9, 10 - xxlor 7, 32+9, 32+9 /* V_25 syore at vs7 */ - - li 10, 16 - li 11, 32 - li 12, 48 - li 15, 64 - li 16, 80 - li 17, 96 - li 18, 112 + li r10, Q_OFFSET + li r11, C20159_OFFSET + lxvx vs6, r10, r4 /* V_MKQ */ + lxvx 32+V20159, r11, r4 /* V20159 */ + + vspltisw v8, 13 + vadduwm v8, v8, v8 + xxlor vs8, 32+v8, 32+v8 /* V_26 store at vs8 */ + + vspltisw v9, 1 + vsubuwm v10, v8, v9 /* value 25 */ + vslw v9, v9, v10 + xxlor vs7, 32+v9, 32+v9 /* V_25 syore at vs7 */ + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 /* * Montgomery reduce loops with constant 1441 */ - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 - li 8, 4 - mtctr 8 + addi r14, r4, C1441_OFFSET + lvx V1441, 0, r14 + li r8, 4 + mtctr r8 Set_mont_consts intt_ppc__Loopf: Reload_4coeffs - MREDUCE_4X 
V1441, V1441, V1441, V1441, 6, 7, 8, 9 + MREDUCE_4X V1441, V1441, V1441, V1441, v6, v7, v8, v9 Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + MREDUCE_4X V1441, V1441, V1441, V1441, v13, v18, v23, v28 + MWrite_8X 32+v6, 32+v7, 32+v8, 32+v9, 32+v13, 32+v18, 32+v23, 32+v28 bdnz intt_ppc__Loopf - addi 3, 3, -512 + addi r3, r3, -512 .align 4 /* @@ -597,14 +597,14 @@ intt_ppc__Loopf: * Update zetas vectors, each vector has 2 zetas * Load zeta array in 2-2-2-2 layout */ - addi 14, 4, ZETA_INTT_OFFSET - li 7, 4 /* len * 2 */ - li 8, 4 - mtctr 8 - mr 5, 3 + addi r14, r4, ZETA_INTT_OFFSET + li r7, 4 /* len * 2 */ + li r8, 4 + mtctr r8 + mr r5, r3 intt_ppc__Loop2: INTT_REDUCE_L24 - addi 5, 5, 128 + addi r5, r5, 128 bdnz intt_ppc__Loop2 .align 4 @@ -612,20 +612,20 @@ intt_ppc__Loop2: * 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 * Load zeta array in 4-4 layout */ - mr 5, 3 - li 7, 8 - li 8, 4 - mtctr 8 + mr r5, r3 + li r7, 8 + li r8, 4 + mtctr r8 intt_ppc__Loop4: INTT_REDUCE_L44 - addi 5, 5, 128 + addi r5, r5, 128 bdnz intt_ppc__Loop4 .align 4 /* * 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 */ - li 7, 16 + li r7, 16 INTT_REDUCE_4X 0, 32 INTT_REDUCE_4X 128, 32 @@ -636,137 +636,137 @@ intt_ppc__Loop4: /* * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 */ - li 7, 32 + li r7, 32 INTT_REDUCE_4X 0, 64 - addi 14, 14, -64 + addi r14, r14, -64 INTT_REDUCE_4X 16, 64 INTT_REDUCE_4X 256, 64 - addi 14, 14, -64 + addi r14, r14, -64 INTT_REDUCE_4X 272, 64 .align 4 /* * 5. 
len = 32, start = 0, 64, 128, 192 */ - li 7, 64 + li r7, 64 Load_4Coeffs 0, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 128, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 256, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 384, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 .align 4 /* * 6. 
len = 64, start = 0, 128 */ - li 7, 128 + li r7, 128 Load_4Coeffs 0, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 64, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 256, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 320, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 .align 4 /* * 7. 
len = 128, start = 0 */ - li 7, 256 /* len*2 */ + li r7, 256 /* len*2 */ Load_4Coeffs 0, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - xxlor 9, 32+10, 32+10 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + xxlor vs9, 32+V_ZETA, 32+V_ZETA + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 64, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 128, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 192, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 RESTORE_REGS blr diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index d702973832..beee949702 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ 
b/dev/ppc64le/src/ntt_ppc.S @@ -29,78 +29,78 @@ .text .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 
32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 .endm /* @@ -125,14 +125,14 @@ * */ .macro Init_Coeffs_offset start next - li 9, \start /* first offset to j */ - add 10, 7, 9 /* J + len*2 */ - addi 16, 9, \next - addi 17, 10, \next - addi 18, 16, \next - addi 19, 17, \next - addi 20, 18, \next - addi 21, 19, \next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next .endm /* @@ -140,10 +140,10 @@ * r[j+len]: V13, V18, V23, V28 */ .macro Load_4Rjp - lxvd2x 32+13, 3, 10 /* V13: vector r'0 */ - lxvd2x 32+18, 3, 17 /* V18: vector for r'1 */ - lxvd2x 32+23, 3, 19 /* V23: vector for r'2 */ - lxvd2x 32+28, 3, 21 /* V28: vector for r'3 */ + lxvd2x 32+v13, r3, r10 /* V13: vector r'0 */ + lxvd2x 32+v18, r3, r17 /* V18: vector for r'1 */ + lxvd2x 32+v23, r3, r19 /* V23: vector for r'2 */ + lxvd2x 32+v28, r3, r21 /* V28: vector for r'3 */ .endm /* @@ -174,22 +174,22 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 - lxvd2x 32+26, 10, 5 - vmrgew 13, 25, 26 - vmrgow 12, 25, 26 - lxvd2x 32+25, 11, 5 - lxvd2x 32+26, 12, 5 - vmrgew 18, 25, 26 - vmrgow 17, 25, 26 - lxvd2x 32+25, 15, 5 - lxvd2x 32+26, 16, 5 - vmrgew 23, 25, 26 - vmrgow 22, 25, 26 - lxvd2x 32+25, 17, 5 - lxvd2x 32+26, 18, 5 - vmrgew 28, 25, 26 - vmrgow 27, 25, 26 + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v13, v25, v26 + vmrgow v12, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v18, v25, v26 + vmrgow v17, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v23, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v28, v25, v26 + vmrgow v27, v25, v26 .endm /* @@ -208,22 +208,22 @@ * in the proper order to match the multiplication. */ .macro Load_L44Coeffs - lxvd2x 1, 0, 5 - lxvd2x 2, 10, 5 - xxpermdi 32+13, 2, 1, 3 - xxpermdi 32+12, 2, 1, 0 - lxvd2x 3, 11, 5 - lxvd2x 4, 12, 5 - xxpermdi 32+18, 4, 3, 3 - xxpermdi 32+17, 4, 3, 0 - lxvd2x 1, 15, 5 - lxvd2x 2, 16, 5 - xxpermdi 32+23, 2, 1, 3 - xxpermdi 32+22, 2, 1, 0 - lxvd2x 3, 17, 5 - lxvd2x 4, 18, 5 - xxpermdi 32+28, 4, 3, 3 - xxpermdi 32+27, 4, 3, 0 + lxvd2x vs1, 0, r5 + lxvd2x vs2, r10, r5 + xxpermdi 32+v13, vs2, vs1, 3 + xxpermdi 32+v12, vs2, vs1, 0 + lxvd2x vs3, r11, r5 + lxvd2x vs4, r12, r5 + xxpermdi 32+v18, vs4, vs3, 3 + xxpermdi 32+v17, vs4, vs3, 0 + lxvd2x vs1, r15, r5 + lxvd2x vs2, r16, r5 + xxpermdi 32+v23, vs2, vs1, 3 + xxpermdi 32+v22, vs2, vs1, 0 + lxvd2x vs3, r17, r5 + lxvd2x vs4, r18, r5 + xxpermdi 32+v28, vs4, vs3, 3 + xxpermdi 32+v27, vs4, vs3, 0 .endm /* @@ -237,32 +237,32 @@ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 /* fqmul = zeta * coefficient Modular multification bond by 2^16 * q in abs value */ - vmladduhm 15, 13, \_vz0, 3 - vmladduhm 20, 18, \_vz1, 3 - vmladduhm 25, 23, \_vz2, 3 - vmladduhm 30, 28, \_vz3, 3 + vmladduhm v15, v13, \_vz0, v3 + vmladduhm v20, v18, \_vz1, v3 + vmladduhm v25, v23, \_vz2, v3 + vmladduhm 
v30, v28, \_vz3, v3 /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs 14, 13, \_vz0, 3 - vmhraddshs 19, 18, \_vz1, 3 - vmhraddshs 24, 23, \_vz2, 3 - vmhraddshs 29, 28, \_vz3, 3 + vmhraddshs v14, v13, \_vz0, v3 + vmhraddshs v19, v18, \_vz1, v3 + vmhraddshs v24, v23, \_vz2, v3 + vmhraddshs v29, v28, \_vz3, v3 - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v30, v30, V_QINV, v3 - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 /* Shift right 1 bit */ - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vsrah v13, v15, v4 + vsrah v18, v20, v4 + vsrah v23, v25, v4 + vsrah v28, v30, v4 .endm /* @@ -271,10 +271,10 @@ * r[j]: V12, V17, V22, V27 */ .macro Load_4Rj - lxvd2x 32+12, 3, 9 /* V12: vector r0 */ - lxvd2x 32+17, 3, 16 /* V17: vector r1 */ - lxvd2x 32+22, 3, 18 /* V22: vector r2 */ - lxvd2x 32+27, 3, 20 /* V27: vector r3 */ + lxvd2x 32+v12, r3, r9 /* V12: vector r0 */ + lxvd2x 32+v17, r3, r16 /* V17: vector r1 */ + lxvd2x 32+v22, r3, r18 /* V22: vector r2 */ + lxvd2x 32+v27, r3, r20 /* V27: vector r3 */ .endm /* @@ -289,25 +289,25 @@ r[j] = r[j] + t. 
r[j+len] = r[j] - t */ - vsubuhm 16, 12, 13 - vadduhm 15, 13, 12 - vsubuhm 21, 17, 18 - vadduhm 20, 18, 17 - vsubuhm 26, 22, 23 - vadduhm 25, 23, 22 - vsubuhm 31, 27, 28 - vadduhm 30, 28, 27 + vsubuhm v16, v12, v13 + vadduhm v15, v13, v12 + vsubuhm v21, v17, v18 + vadduhm v20, v18, v17 + vsubuhm v26, v22, v23 + vadduhm v25, v23, v22 + vsubuhm v31, v27, v28 + vadduhm v30, v28, v27 .endm .macro Write_One - stxvd2x 32+15, 3, 9 - stxvd2x 32+16, 3, 10 - stxvd2x 32+20, 3, 16 - stxvd2x 32+21, 3, 17 - stxvd2x 32+25, 3, 18 - stxvd2x 32+26, 3, 19 - stxvd2x 32+30, 3, 20 - stxvd2x 32+31, 3, 21 + stxvd2x 32+v15, r3, r9 + stxvd2x 32+v16, r3, r10 + stxvd2x 32+v20, r3, r16 + stxvd2x 32+v21, r3, r17 + stxvd2x 32+v25, r3, r18 + stxvd2x 32+v26, r3, r19 + stxvd2x 32+v30, r3, r20 + stxvd2x 32+v31, r3, r21 .endm /* @@ -316,22 +316,22 @@ */ .macro PermWriteL44 Compute_4Coeffs - xxpermdi 0, 32+15, 32+16, 3 - xxpermdi 1, 32+15, 32+16, 0 - xxpermdi 2, 32+20, 32+21, 3 - xxpermdi 3, 32+20, 32+21, 0 - xxpermdi 4, 32+25, 32+26, 3 - xxpermdi 5, 32+25, 32+26, 0 - xxpermdi 6, 32+30, 32+31, 3 - xxpermdi 7, 32+30, 32+31, 0 - stxvd2x 0, 0, 5 - stxvd2x 1, 10, 5 - stxvd2x 2, 11, 5 - stxvd2x 3, 12, 5 - stxvd2x 4, 15, 5 - stxvd2x 5, 16, 5 - stxvd2x 6, 17, 5 - stxvd2x 7, 18, 5 + xxpermdi vs0, 32+v15, 32+v16, 3 + xxpermdi vs1, 32+v15, 32+v16, 0 + xxpermdi vs2, 32+v20, 32+v21, 3 + xxpermdi vs3, 32+v20, 32+v21, 0 + xxpermdi vs4, 32+v25, 32+v26, 3 + xxpermdi vs5, 32+v25, 32+v26, 0 + xxpermdi vs6, 32+v30, 32+v31, 3 + xxpermdi vs7, 32+v30, 32+v31, 0 + stxvd2x vs0, 0, r5 + stxvd2x vs1, r10, r5 + stxvd2x vs2, r11, r5 + stxvd2x vs3, r12, r5 + stxvd2x vs4, r15, r5 + stxvd2x vs5, r16, r5 + stxvd2x vs6, r17, r5 + stxvd2x vs7, r18, r5 .endm /* @@ -340,33 +340,33 @@ */ .macro PermWriteL24 Compute_4Coeffs - vmrgew 10, 16, 15 - vmrgow 11, 16, 15 - vmrgew 12, 21, 20 - vmrgow 13, 21, 20 - vmrgew 14, 26, 25 - vmrgow 15, 26, 25 - vmrgew 16, 31, 30 - vmrgow 17, 31, 30 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 
32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + vmrgew v10, v16, v15 + vmrgow v11, v16, v15 + vmrgew v12, v21, v20 + vmrgow v13, v21, v20 + vmrgew v14, v26, v25 + vmrgow v15, v26, v25 + vmrgew v16, v31, v30 + vmrgow v17, v31, v30 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 .endm .macro Load_next_4zetas - li 10, 16 - li 11, 32 - li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 10, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + li r10, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r10, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 .endm /* @@ -398,16 +398,16 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) SAVE_REGS /* load MLKEM_Q */ - lvx V_NMKQ,0,4 + lvx V_NMKQ,0,r4 /* Register 14 as pointer to zetas array */ - addi 14, 4, ZETA_NTT_OFFSET + addi r14, r4, ZETA_NTT_OFFSET - vxor 3, 3, 3 - vspltish 4, 1 + vxor v3, v3, v3 + vspltish v4, 1 - li 10, QINV_OFFSET - lvx V_QINV, 10, 4 + li r10, QINV_OFFSET + lvx V_QINV, r10, r4 .align 4 /* @@ -416,9 +416,9 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * 1. len = 128, start = 0 */ - li 7, 256 /* len * 2 */ - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li r7, 256 /* len * 2 */ + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -430,14 +430,14 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * 2. 
len = 64, start = 0, 128 * k += 2 */ - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li r7, 128 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -446,21 +446,21 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * 3. len = 32, start = 0, 64, 128, 192 * k += 4 */ - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li r7, 64 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 @@ -468,7 +468,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 * k += 8 */ - li 7, 32 + li r7, 32 Load_next_4zetas NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 @@ -482,7 +482,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * 5. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 * k += 16 */ - li 7, 16 + li r7, 16 Load_next_4zetas NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 @@ -500,18 +500,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * k += 32 * Load zeta vectors in 4-4 layout */ - li 15, 4 - mtctr 15 - mr 5, 3 /* Let r5 points to coefficient array */ - li 7, 8 - - li 10, 16 - li 11, 32 - li 12, 48 - li 15, 64 - li 16, 80 - li 17, 96 - li 18, 112 + li r15, 4 + mtctr r15 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 8 + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 .align 4 ntt_ppc__Len4: @@ -520,7 +520,7 @@ ntt_ppc__Len4: Load_L44Coeffs MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL44 - addi 5, 5, 128 + addi r5, r5, 128 bdnz ntt_ppc__Len4 @@ -530,10 +530,10 @@ ntt_ppc__Len4: * Load zeta vectors in 2-2-2-2 layout */ - li 8, 4 - mtctr 8 - mr 5, 3 /* Let r5 points to coefficient array */ - li 7, 4 + li r8, 4 + mtctr r8 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 4 .align 4 ntt_ppc__Len2: @@ -541,7 +541,7 @@ ntt_ppc__Len2: Load_L24Coeffs MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL24 - addi 5, 5, 128 + addi r5, r5, 128 bdnz ntt_ppc__Len2 diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index 354474d071..4d16be6f05 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -40,141 +40,141 @@ * MREDUCE_4X(_v0, _v1, _v2, _v3) */ .macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+13, 0, 3 - addi 3, 3, 16 - lxvd2x 32+18, 0, 3 - addi 3, 3, 16 - lxvd2x 32+23, 0, 3 - addi 3, 3, 16 - lxvd2x 32+7, 0, 3 - addi 3, 3, 16 - - vmladduhm 15, 13, V1353, 3 - vmladduhm 20, 18, V1353, 3 - vmladduhm 25, 23, V1353, 3 - vmladduhm 9, 7, V1353, 3 - - vmhraddshs 14, 13, V1353, 3 - vmhraddshs 19, 18, V1353, 3 - vmhraddshs 24, 23, V1353, 3 - vmhraddshs 8, 7, V1353, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 9, 9, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 
14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 9, 9, V_NMKQ, 8 + lxvd2x 32+v13, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v18, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v23, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v7, 0, r3 + addi r3, r3, 16 + + vmladduhm v15, v13, V1353, v3 + vmladduhm v20, v18, V1353, v3 + vmladduhm v25, v23, V1353, v3 + vmladduhm v9, v7, V1353, v3 + + vmhraddshs v14, v13, V1353, v3 + vmhraddshs v19, v18, V1353, v3 + vmhraddshs v24, v23, V1353, v3 + vmhraddshs v8, v7, V1353, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v9, v9, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v9, v9, V_NMKQ, v8 /* Shift right 1 bit */ - vsrah \_v0, 15, 4 - vsrah \_v1, 20, 4 - vsrah \_v2, 25, 4 - vsrah \_v3, 9, 4 + vsrah \_v0, v15, v4 + vsrah \_v1, v20, v4 + vsrah \_v2, v25, v4 + vsrah \_v3, v9, v4 .endm .macro Write_8X - stxvd2x 32+27, 4, 3 - stxvd2x 32+28, 5, 3 - stxvd2x 32+29, 6, 3 - stxvd2x 32+30, 7, 3 - stxvd2x 32+13, 8, 3 - stxvd2x 32+18, 9, 3 - stxvd2x 32+23, 10, 3 - stxvd2x 32+7, 11, 3 + stxvd2x 32+v27, r4, r3 + stxvd2x 32+v28, r5, r3 + stxvd2x 32+v29, r6, r3 + stxvd2x 32+v30, r7, r3 + stxvd2x 32+v13, r8, r3 + stxvd2x 32+v18, r9, r3 + stxvd2x 32+v23, r10, r3 + stxvd2x 32+v7, r11, r3 .endm .align 4 .globl MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu 1, -320(1) - mflr 0 - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - stxvx 32+25, 11, 1 - stxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - stxvx 32+27, 6, 1 - stxvx 32+28, 7, 1 - stxvx 32+29, 8, 1 - stxvx 32+30, 9, 1 - - li 6, NQ_OFFSET - li 7, QINV_OFFSET - li 8, C1353_OFFSET - lxvx 32+V_NMKQ, 6, 4 - lxvx 32+V_QINV, 7, 4 - lxvx 32+V1353, 8, 4 - - vxor 3, 
3, 3 - vspltish 4, 1 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + stdu r1, -320(r1) + mflr r0 + + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + li r11, 208 + li r12, 224 + stxvx 32+v20, r6, r1 + stxvx 32+v21, r7, r1 + stxvx 32+v22, r8, r1 + stxvx 32+v23, r9, r1 + stxvx 32+v24, r10, r1 + stxvx 32+v25, r11, r1 + stxvx 32+v26, r12, r1 + li r6, 240 + li r7, 256 + li r8, 272 + li r9, 288 + stxvx 32+v27, r6, r1 + stxvx 32+v28, r7, r1 + stxvx 32+v29, r8, r1 + stxvx 32+v30, r9, r1 + + li r6, NQ_OFFSET + li r7, QINV_OFFSET + li r8, C1353_OFFSET + lxvx 32+V_NMKQ, r6, r4 + lxvx 32+V_QINV, r7, r4 + lxvx 32+V1353, r8, r4 + + vxor v3, v3, v3 + vspltish v4, 1 + + li r4, -128 + li r5, -112 + li r6, -96 + li r7, -80 + li r8, -64 + li r9, -48 + li r10, -32 + li r11, -16 + + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 Write_8X - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 Write_8X - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 Write_8X - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 Write_8X - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - lxvx 32+25, 11, 1 - lxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - lxvx 32+27, 6, 1 - lxvx 32+28, 7, 1 - lxvx 32+29, 8, 1 - lxvx 32+30, 9, 1 - mtlr 0 - addi 1, 1, 320 + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + li r11, 208 + li r12, 224 + lxvx 32+v20, r6, r1 + lxvx 32+v21, r7, r1 + lxvx 32+v22, r8, r1 + lxvx 32+v23, r9, r1 + lxvx 32+v24, r10, r1 + lxvx 32+v25, r11, r1 + lxvx 32+v26, r12, r1 + li r6, 240 + li r7, 
256 + li r8, 272 + li r9, 288 + lxvx 32+v27, r6, r1 + lxvx 32+v28, r7, r1 + lxvx 32+v29, r8, r1 + lxvx 32+v30, r9, r1 + mtlr r0 + addi r1, r1, 320 blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index 084ae5959d..691ce3970c 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -34,168 +34,168 @@ .text .macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+8, 0, 3 - lxvd2x 32+12, 14, 3 - lxvd2x 32+16, 15, 3 - lxvd2x 32+20, 16, 3 - addi 3, 3, 64 - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + lxvd2x 32+v8, 0, r3 + lxvd2x 32+v12, r14, r3 + lxvd2x 32+v16, r15, r3 + lxvd2x 32+v20, r16, r3 + addi r3, r3, 64 + vmulosh v6, v8, V20159 + vmulesh v5, v8, V20159 + vmulosh v11, v12, V20159 + vmulesh v10, v12, V20159 + vmulosh v15, v16, V20159 + vmulesh v14, v16, V20159 + vmulosh v19, v20, V20159 + vmulesh v18, v20, V20159 + xxmrglw 32+v4, 32+v5, 32+v6 + xxmrghw 32+v5, 32+v5, 32+v6 + xxmrglw 
32+v9, 32+v10, 32+v11 + xxmrghw 32+v10, 32+v10, 32+v11 + xxmrglw 32+v13, 32+v14, 32+v15 + xxmrghw 32+v14, 32+v14, 32+v15 + xxmrglw 32+v17, 32+v18, 32+v19 + xxmrghw 32+v18, 32+v18, 32+v19 + vadduwm v4, v4, V_25 + vadduwm v5, v5, V_25 + vadduwm v9, v9, V_25 + vadduwm v10, v10, V_25 + vadduwm v13, v13, V_25 + vadduwm v14, v14, V_25 + vadduwm v17, v17, V_25 + vadduwm v18, v18, V_25 + vsraw v4, v4, V_26 + vsraw v5, v5, V_26 + vsraw v9, v9, V_26 + vsraw v10, v10, V_26 + vsraw v13, v13, V_26 + vsraw v14, v14, V_26 + vsraw v17, v17, V_26 + vsraw v18, v18, V_26 + vpkuwum v4, v5, v4 + vsubuhm v4, v7, v4 + vpkuwum v9, v10, v9 + vsubuhm v9, v7, v9 + vpkuwum v13, v14, v13 + vsubuhm v13, v7, v13 + vpkuwum v17, v18, v17 + vsubuhm v17, v7, v17 + vmladduhm \_v0, v4, V_MKQ, v8 + vmladduhm \_v1, v9, V_MKQ, v12 + vmladduhm \_v2, v13, V_MKQ, v16 + vmladduhm \_v3, v17, V_MKQ, v20 .endm .macro Write_8X - stxvd2x 32+21, 4, 3 - stxvd2x 32+22, 5, 3 - stxvd2x 32+23, 6, 3 - stxvd2x 32+24, 7, 3 - stxvd2x 32+4, 8, 3 - stxvd2x 32+9, 9, 3 - stxvd2x 32+13, 10, 3 - stxvd2x 32+17, 11, 3 + stxvd2x 32+v21, r4, r3 + stxvd2x 32+v22, r5, r3 + stxvd2x 32+v23, r6, r3 + stxvd2x 32+v24, r7, r3 + stxvd2x 32+v4, r8, r3 + stxvd2x 32+v9, r9, r3 + stxvd2x 32+v13, r10, r3 + stxvd2x 32+v17, r11, r3 .endm /* * Conditional addition to get unsigned canonical representative */ .macro To_unsigned_16 - lxvd2x 32+12, 0, 3 - lxvd2x 32+13, 14, 3 - lxvd2x 32+14, 15, 3 - lxvd2x 32+15, 16, 3 - addi 3, 3, 64 - vsrh 1, 12, 10 - vsrh 0, 13, 10 - vsrh 3, 14, 10 - vsrh 2, 15, 10 - vadduhm 7, 12, 11 - vadduhm 8, 13, 11 - vadduhm 5, 14, 11 - vadduhm 6, 15, 11 - vcmpequh 1, 1, 9 - vcmpequh 0, 0, 9 - vcmpequh 3, 3, 9 - vcmpequh 2, 2, 9 - xxsel 32+1, 32+7,32+12, 32+1 - xxsel 32+0, 32+8,32+13, 32+0 - xxsel 32+3, 32+5,32+14, 32+3 - xxsel 32+2, 32+6,32+15, 32+2 - stxvd2x 32+3, 10, 3 - stxvd2x 32+2, 11, 3 - stxvd2x 32+1, 8, 3 - stxvd2x 32+0, 9, 3 + lxvd2x 32+v12, 0, r3 + lxvd2x 32+v13, r14, r3 + lxvd2x 32+v14, r15, r3 + lxvd2x 32+v15, r16, 
r3 + addi r3, r3, 64 + vsrh v1, v12, v10 + vsrh v0, v13, v10 + vsrh v3, v14, v10 + vsrh v2, v15, v10 + vadduhm v7, v12, v11 + vadduhm v8, v13, v11 + vadduhm v5, v14, v11 + vadduhm v6, v15, v11 + vcmpequh v1, v1, v9 + vcmpequh v0, v0, v9 + vcmpequh v3, v3, v9 + vcmpequh v2, v2, v9 + xxsel 32+v1, 32+v7,32+v12, 32+v1 + xxsel 32+v0, 32+v8,32+v13, 32+v0 + xxsel 32+v3, 32+v5,32+v14, 32+v3 + xxsel 32+v2, 32+v6,32+v15, 32+v2 + stxvd2x 32+v3, r10, r3 + stxvd2x 32+v2, r11, r3 + stxvd2x 32+v1, r8, r3 + stxvd2x 32+v0, r9, r3 .endm .align 4 .globl MLK_ASM_NAMESPACE(reduce_ppc) MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu 1, -224(1) - mflr 0 - std 14, 96(1) - std 15, 104(1) - std 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - - vxor 7, 7, 7 - - li 6, Q_OFFSET - li 7, C20159_OFFSET - lxvx 32+V_MKQ, 6, 4 - lxvx 32+V20159, 7, 4 + stdu r1, -224(r1) + mflr r0 + std r14, 96(r1) + std r15, 104(r1) + std r16, 112(r1) + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + stxvx 32+v20, r6, r1 + stxvx 32+v21, r7, r1 + stxvx 32+v22, r8, r1 + stxvx 32+v23, r9, r1 + stxvx 32+v24, r10, r1 + + vxor v7, v7, v7 + + li r6, Q_OFFSET + li r7, C20159_OFFSET + lxvx 32+V_MKQ, r6, r4 + lxvx 32+V20159, r7, r4 vspltisw V_26, 13 vadduwm V_26, V_26, V_26 - vspltisw 4, 1 - vsubuwm 5, V_26, 4 - vslw V_25, 4, 5 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - li 14, 16 - li 15, 32 - li 16, 48 - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + vspltisw v4, 1 + vsubuwm v5, V_26, v4 + vslw V_25, v4, v5 + + li r4, -128 + li r5, -112 + li r6, -96 + li r7, -80 + li r8, -64 + li r9, -48 + li r10, -32 + li r11, -16 + + li r14, 16 + li r15, 32 + li r16, 48 + + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 Write_8X - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X 
v4, v9, v13, v17 Write_8X - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 Write_8X - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 Write_8X /* * To unsigned canonical */ .align 4 - addi 3, 3, -512 - vxor 9, 9, 9 - vspltish 10, 15 - vmr 11, V_MKQ + addi r3, r3, -512 + vxor v9, v9, v9 + vspltish v10, 15 + vmr v11, V_MKQ To_unsigned_16 To_unsigned_16 @@ -206,21 +206,21 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) To_unsigned_16 To_unsigned_16 - ld 14, 96(1) - ld 15, 104(1) - ld 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - mtlr 0 - addi 1, 1, 224 + ld r14, 96(r1) + ld r15, 104(r1) + ld r16, 112(r1) + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + lxvx 32+v20, r6, r1 + lxvx 32+v21, r7, r1 + lxvx 32+v22, r8, r1 + lxvx 32+v23, r9, r1 + lxvx 32+v24, r10, r1 + mtlr r0 + addi r1, r1, 224 blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h index df5d163f78..6c59a63b0b 100644 --- a/mlkem/src/native/ppc64le/src/consts.h +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -19,6 +19,73 @@ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) extern const int16_t mlk_ppc_qdata[]; +#else +#define r0 0 +#define r1 1 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 #endif #endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 169272c444..946ae12e01 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -36,78 +36,78 @@ #define V1441 10 .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 
32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld 
r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 .endm /* @@ -122,14 +122,14 @@ r[j] = r[j] + t. r[j+len] = r[j] - t */ - vsubuhm 25, 8, 21 - vsubuhm 26, 12, 22 - vsubuhm 30, 16, 23 - vsubuhm 31, 20, 24 - vadduhm 8, 8, 21 - vadduhm 12, 12, 22 - vadduhm 16, 16, 23 - vadduhm 20, 20, 24 + vsubuhm v25, v8, v21 + vsubuhm v26, v12, v22 + vsubuhm v30, v16, v23 + vsubuhm v31, v20, v24 + vadduhm v8, v8, v21 + vadduhm v12, v12, v22 + vadduhm v16, v16, v23 + vadduhm v20, v20, v24 .endm /* @@ -154,14 +154,14 @@ * */ .macro Init_Coeffs_offset start next - li 9, \start /* first offset to j */ - add 10, 7, 9 /* J + len*2 */ - addi 16, 9, \next - addi 17, 10, \next - addi 18, 16, \next - addi 19, 17, \next - addi 20, 18, \next - addi 21, 19, \next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next .endm /* @@ -173,15 +173,15 @@ * r[j]: V21, V22, V23, V24 */ .macro Load_4Rjp - lxvd2x 32+8, 3, 10 /* V8: vector r'0 */ - lxvd2x 32+12, 3, 17 /* V12: vector for r'1 */ - lxvd2x 32+16, 3, 19 /* V16: vector for r'2 */ - lxvd2x 32+20, 3, 21 /* V20: vector for r'3 */ - - lxvd2x 32+21, 3, 9 /* V21: vector r0 */ - lxvd2x 32+22, 3, 16 /* V22: vector r1 */ - lxvd2x 32+23, 3, 18 /* V23: vector r2 */ - lxvd2x 32+24, 3, 20 /* V24: vector r3 */ + lxvd2x 32+v8, r3, r10 /* V8: vector r'0 */ + lxvd2x 32+v12, r3, r17 /* V12: vector for r'1 */ + lxvd2x 32+v16, r3, r19 /* V16: vector for r'2 */ + lxvd2x 32+v20, r3, r21 /* V20: vector for r'3 */ + + lxvd2x 32+v21, r3, r9 /* V21: vector r0 */ + lxvd2x 32+v22, r3, r16 /* V22: vector r1 */ + lxvd2x 32+v23, r3, r18 /* V23: vector r2 */ + lxvd2x 32+v24, r3, r20 /* V24: vector r3 */ .endm /* @@ -213,22 +213,22 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 - lxvd2x 32+26, 10, 5 - vmrgew 8, 25, 26 - vmrgow 21, 25, 26 - lxvd2x 32+25, 11, 5 - lxvd2x 32+26, 12, 5 - vmrgew 12, 25, 26 - vmrgow 22, 25, 26 - lxvd2x 32+25, 15, 5 - lxvd2x 32+26, 16, 5 - vmrgew 16, 25, 26 - vmrgow 23, 25, 26 - lxvd2x 32+25, 17, 5 - lxvd2x 32+26, 18, 5 - vmrgew 20, 25, 26 - vmrgow 24, 25, 26 + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v8, v25, v26 + vmrgow v21, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v12, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v16, v25, v26 + vmrgow v23, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v20, v25, v26 + vmrgow v24, v25, v26 .endm /* @@ -247,81 +247,81 @@ * in the proper order to match the multiplication. */ .macro Load_L44Coeffs - lxvd2x 10, 0, 5 - lxvd2x 11, 10, 5 - xxpermdi 32+8, 11, 10, 3 - xxpermdi 32+21, 11, 10, 0 - lxvd2x 10, 11, 5 - lxvd2x 11, 12, 5 - xxpermdi 32+12, 11, 10, 3 - xxpermdi 32+22, 11, 10, 0 - lxvd2x 10, 15, 5 - lxvd2x 11, 16, 5 - xxpermdi 32+16, 11, 10, 3 - xxpermdi 32+23, 11, 10, 0 - lxvd2x 10, 17, 5 - lxvd2x 11, 18, 5 - xxpermdi 32+20, 11, 10, 3 - xxpermdi 32+24, 11, 10, 0 + lxvd2x vs10, 0, r5 + lxvd2x vs11, r10, r5 + xxpermdi 32+v8, vs11, vs10, 3 + xxpermdi 32+v21, vs11, vs10, 0 + lxvd2x vs10, r11, r5 + lxvd2x vs11, r12, r5 + xxpermdi 32+v12, vs11, vs10, 3 + xxpermdi 32+v22, vs11, vs10, 0 + lxvd2x vs10, r15, r5 + lxvd2x vs11, r16, r5 + xxpermdi 32+v16, vs11, vs10, 3 + xxpermdi 32+v23, vs11, vs10, 0 + lxvd2x vs10, r17, r5 + lxvd2x vs11, r18, r5 + xxpermdi 32+v20, vs11, vs10, 3 + xxpermdi 32+v24, vs11, vs10, 0 .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 /* Restore constant vectors V_MKQ, V_25 and V_26 */ - vxor 7, 7, 7 - xxlor 32+3, 6, 6 - xxlor 32+1, 7, 7 - xxlor 32+2, 8, 8 + vxor v7, v7, v7 + xxlor 32+v3, vs6, vs6 + xxlor 32+v1, vs7, vs7 + xxlor 32+v2, vs8, vs8 /* Multify Odd/Even signed halfword; Results word bound by 2^32 in abs value. 
*/ - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 + vmulosh v6, v8, V20159 + vmulesh v5, v8, V20159 + vmulosh v11, v12, V20159 + vmulesh v10, v12, V20159 + vmulosh v15, v16, V20159 + vmulesh v14, v16, V20159 + vmulosh v19, v20, V20159 + vmulesh v18, v20, V20159 + xxmrglw 32+v4, 32+v5, 32+v6 + xxmrghw 32+v5, 32+v5, 32+v6 + xxmrglw 32+v9, 32+v10, 32+v11 + xxmrghw 32+v10, 32+v10, 32+v11 + xxmrglw 32+v13, 32+v14, 32+v15 + xxmrghw 32+v14, 32+v14, 32+v15 + xxmrglw 32+v17, 32+v18, 32+v19 + xxmrghw 32+v18, 32+v18, 32+v19 + vadduwm v4, v4, V_25 + vadduwm v5, v5, V_25 + vadduwm v9, v9, V_25 + vadduwm v10, v10, V_25 + vadduwm v13, v13, V_25 + vadduwm v14, v14, V_25 + vadduwm v17, v17, V_25 + vadduwm v18, v18, V_25 /* Right shift and pack lower halfword, results bond to 2^16 in abs value */ - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 + vsraw v4, v4, V_26 + vsraw v5, v5, V_26 + vsraw v9, v9, V_26 + vsraw v10, v10, V_26 + vsraw v13, v13, V_26 + vsraw v14, v14, V_26 + vsraw v17, v17, V_26 + vsraw v18, v18, V_26 + vpkuwum v4, v5, v4 + vsubuhm v4, v7, v4 + vpkuwum v9, v10, v9 + vsubuhm v9, v7, v9 + vpkuwum v13, v14, v13 + vsubuhm v13, v7, v13 + vpkuwum v17, v18, 
v17 + vsubuhm v17, v7, v17 /* Modulo multify-Low unsigned halfword; results bond to 2^16 * q in abs value. */ - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + vmladduhm \_v0, v4, V_MKQ, v8 + vmladduhm \_v1, v9, V_MKQ, v12 + vmladduhm \_v2, v13, V_MKQ, v16 + vmladduhm \_v3, v17, V_MKQ, v20 .endm /* @@ -330,32 +330,32 @@ */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 /* Modular multification bond by 2^16 * q in abs value */ - vmladduhm 15, 25, \_vz0, 3 - vmladduhm 20, 26, \_vz1, 3 - vmladduhm 27, 30, \_vz2, 3 - vmladduhm 28, 31, \_vz3, 3 + vmladduhm v15, v25, \_vz0, v3 + vmladduhm v20, v26, \_vz1, v3 + vmladduhm v27, v30, \_vz2, v3 + vmladduhm v28, v31, \_vz3, v3 /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs 14, 25, \_vz0, 3 - vmhraddshs 19, 26, \_vz1, 3 - vmhraddshs 24, 30, \_vz2, 3 - vmhraddshs 29, 31, \_vz3, 3 + vmhraddshs v14, v25, \_vz0, v3 + vmhraddshs v19, v26, \_vz1, v3 + vmhraddshs v24, v30, \_vz2, v3 + vmhraddshs v29, v31, \_vz3, v3 - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v27, V_QINV, v3 + vmladduhm v30, v28, V_QINV, v3 - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 /* Shift right 1 bit */ - vsrah \_vo0, 15, 4 - vsrah \_vo1, 20, 4 - vsrah \_vo2, 25, 4 - vsrah \_vo3, 30, 4 + vsrah \_vo0, v15, v4 + vsrah \_vo1, v20, v4 + vsrah \_vo2, v25, v4 + vsrah \_vo3, v30, v4 .endm /* @@ -363,21 +363,21 @@ * V_NMKQ, V_QINV, Zero vector, One vector */ .macro Set_mont_consts - xxlor 32+5, 0, 0 /* V_NMKQ */ - xxlor 32+2, 2, 2 /* V_QINV */ - xxlor 32+3, 3, 3 /* all 
0 */ - xxlor 32+4, 4, 4 /* all 1 */ + xxlor 32+v5, vs0, vs0 /* V_NMKQ */ + xxlor 32+v2, vs2, vs2 /* V_QINV */ + xxlor 32+v3, vs3, vs3 /* all 0 */ + xxlor 32+v4, vs4, vs4 /* all 1 */ .endm .macro Load_next_4zetas - li 8, 16 - li 11, 32 - li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 8, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + li r8, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r8, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 .endm /* @@ -392,38 +392,38 @@ .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 9 - stxvd2x \_vs1, 3, 16 - stxvd2x \_vs2, 3, 18 - stxvd2x \_vs3, 3, 20 + stxvd2x \_vs0, r3, r9 + stxvd2x \_vs1, r3, r16 + stxvd2x \_vs2, r3, r18 + stxvd2x \_vs3, r3, r20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 10 - stxvd2x \_vs1, 3, 17 - stxvd2x \_vs2, 3, 19 - stxvd2x \_vs3, 3, 21 + stxvd2x \_vs0, r3, r10 + stxvd2x \_vs1, r3, r17 + stxvd2x \_vs2, r3, r19 + stxvd2x \_vs3, r3, r21 .endm .macro Reload_4coeffs - lxvd2x 32+25, 0, 3 - lxvd2x 32+26, 10, 3 - lxvd2x 32+30, 11, 3 - lxvd2x 32+31, 12, 3 - addi 3, 3, 64 + lxvd2x 32+v25, 0, r3 + lxvd2x 32+v26, r10, r3 + lxvd2x 32+v30, r11, r3 + lxvd2x 32+v31, r12, r3 + addi r3, r3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi 3, 3, -128 - stxvd2x \_vs0, 0, 3 - stxvd2x \_vs1, 10, 3 - stxvd2x \_vs2, 11, 3 - stxvd2x \_vs3, 12, 3 - stxvd2x \_vs4, 15, 3 - stxvd2x \_vs5, 16, 3 - stxvd2x \_vs6, 17, 3 - stxvd2x \_vs7, 18, 3 - addi 3, 3, 128 + addi r3, r3, -128 + stxvd2x \_vs0, 0, r3 + stxvd2x \_vs1, r10, r3 + stxvd2x \_vs2, r11, r3 + stxvd2x \_vs3, r12, r3 + stxvd2x \_vs4, r15, r3 + stxvd2x \_vs5, r16, r3 + stxvd2x \_vs6, r17, r3 + stxvd2x \_vs7, r18, r3 + addi r3, r3, 128 .endm /* @@ -431,26 +431,26 @@ * coefficient array order. 
*/ .macro PermWriteL44 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 - xxpermdi 32+10, 32+14, 32+13, 3 - xxpermdi 32+11, 32+14, 32+13, 0 - xxpermdi 32+12, 32+19, 32+18, 3 - xxpermdi 32+13, 32+19, 32+18, 0 - xxpermdi 32+14, 32+24, 32+23, 3 - xxpermdi 32+15, 32+24, 32+23, 0 - xxpermdi 32+16, 32+29, 32+28, 3 - xxpermdi 32+17, 32+29, 32+28, 0 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + xxpermdi 32+v10, 32+v14, 32+v13, 3 + xxpermdi 32+v11, 32+v14, 32+v13, 0 + xxpermdi 32+v12, 32+v19, 32+v18, 3 + xxpermdi 32+v13, 32+v19, 32+v18, 0 + xxpermdi 32+v14, 32+v24, 32+v23, 3 + xxpermdi 32+v15, 32+v24, 32+v23, 0 + xxpermdi 32+v16, 32+v29, 32+v28, 3 + xxpermdi 32+v17, 32+v29, 32+v28, 0 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 .endm /* @@ -458,65 +458,65 @@ * coefficient array order. 
*/ .macro PermWriteL24 - xxlor 32+14, 10, 10 - xxlor 32+19, 11, 11 - xxlor 32+24, 12, 12 - xxlor 32+29, 13, 13 - vmrgew 10, 13, 14 - vmrgow 11, 13, 14 - vmrgew 12, 18, 19 - vmrgow 13, 18, 19 - vmrgew 14, 23, 24 - vmrgow 15, 23, 24 - vmrgew 16, 28, 29 - vmrgow 17, 28, 29 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + xxlor 32+v14, vs10, vs10 + xxlor 32+v19, vs11, vs11 + xxlor 32+v24, vs12, vs12 + xxlor 32+v29, vs13, vs13 + vmrgew v10, v13, v14 + vmrgow v11, v13, v14 + vmrgew v12, v18, v19 + vmrgow v13, v18, v19 + vmrgew v14, v23, v24 + vmrgow v15, v23, v24 + vmrgew v16, v28, v29 + vmrgow v17, v28, v29 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 .endm .macro INTT_REDUCE_L24 Load_L24Coeffs Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 PermWriteL24 .endm .macro INTT_REDUCE_L44 Load_L44Coeffs Compute_4Coeffs - BREDUCE_4X 4, 9, 13, 17 - xxlor 10, 32+4, 32+4 - xxlor 11, 32+9, 32+9 - xxlor 12, 32+13, 32+13 - xxlor 13, 32+17, 32+17 + BREDUCE_4X v4, v9, v13, v17 + xxlor vs10, 32+v4, 32+v4 + xxlor vs11, 32+v9, 32+v9 + xxlor vs12, 32+v13, 32+v13 + xxlor vs13, 32+v17, 32+v17 Set_mont_consts Load_next_4zetas Perm_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 PermWriteL44 .endm .macro INTT_REDUCE_4X start next Load_4Coeffs \start, \next - BREDUCE_4X 
4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 .endm /* @@ -538,57 +538,57 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* init vectors and constants Setup for Montgomery reduce */ - lxvx 0, 0, 4 + lxvx vs0, 0, r4 - li 10, QINV_OFFSET - lxvx 32+V_QINV, 10, 4 - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 /* QINV */ - xxlor 3, 32+3, 32+3 /* 0 vector */ - xxlor 4, 32+4, 32+4 /* 1 vector */ + li r10, QINV_OFFSET + lxvx 32+V_QINV, r10, r4 + xxlxor 32+v3, 32+v3, 32+v3 + vspltish v4, 1 + xxlor vs2, 32+v2, 32+v2 /* QINV */ + xxlor vs3, 32+v3, 32+v3 /* 0 vector */ + xxlor vs4, 32+v4, 32+v4 /* 1 vector */ /* Setup for Barrett reduce */ - li 10, Q_OFFSET - li 11, C20159_OFFSET - lxvx 6, 10, 4 /* V_MKQ */ - lxvx 32+V20159, 11, 4 /* V20159 */ - - vspltisw 8, 13 - vadduwm 8, 8, 8 - xxlor 8, 32+8, 32+8 /* V_26 store at vs8 */ - - vspltisw 9, 1 - vsubuwm 10, 8, 9 /* value 25 */ - vslw 9, 9, 10 - xxlor 7, 32+9, 32+9 /* V_25 syore at vs7 */ - - li 10, 16 - li 11, 32 - li 12, 48 - li 15, 64 - li 16, 80 - li 17, 96 - li 18, 112 + li r10, Q_OFFSET + li r11, C20159_OFFSET + lxvx vs6, r10, r4 /* V_MKQ */ + lxvx 32+V20159, r11, r4 /* V20159 */ + + vspltisw v8, 13 + vadduwm v8, v8, v8 + xxlor vs8, 32+v8, 32+v8 /* V_26 store at vs8 */ + + vspltisw v9, 1 + vsubuwm v10, v8, v9 /* value 25 */ + vslw v9, v9, v10 + xxlor vs7, 32+v9, 32+v9 /* V_25 syore at vs7 */ + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 /* * Montgomery reduce loops with constant 1441 */ - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 - li 8, 4 - mtctr 8 + addi r14, r4, C1441_OFFSET + lvx V1441, 0, r14 + li r8, 4 + mtctr r8 Set_mont_consts intt_ppc__Loopf: Reload_4coeffs - MREDUCE_4X 
V1441, V1441, V1441, V1441, 6, 7, 8, 9 + MREDUCE_4X V1441, V1441, V1441, V1441, v6, v7, v8, v9 Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 - MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + MREDUCE_4X V1441, V1441, V1441, V1441, v13, v18, v23, v28 + MWrite_8X 32+v6, 32+v7, 32+v8, 32+v9, 32+v13, 32+v18, 32+v23, 32+v28 bdnz intt_ppc__Loopf - addi 3, 3, -512 + addi r3, r3, -512 .align 4 /* @@ -596,14 +596,14 @@ intt_ppc__Loopf: * Update zetas vectors, each vector has 2 zetas * Load zeta array in 2-2-2-2 layout */ - addi 14, 4, ZETA_INTT_OFFSET - li 7, 4 /* len * 2 */ - li 8, 4 - mtctr 8 - mr 5, 3 + addi r14, r4, ZETA_INTT_OFFSET + li r7, 4 /* len * 2 */ + li r8, 4 + mtctr r8 + mr r5, r3 intt_ppc__Loop2: INTT_REDUCE_L24 - addi 5, 5, 128 + addi r5, r5, 128 bdnz intt_ppc__Loop2 .align 4 @@ -611,20 +611,20 @@ intt_ppc__Loop2: * 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 * Load zeta array in 4-4 layout */ - mr 5, 3 - li 7, 8 - li 8, 4 - mtctr 8 + mr r5, r3 + li r7, 8 + li r8, 4 + mtctr r8 intt_ppc__Loop4: INTT_REDUCE_L44 - addi 5, 5, 128 + addi r5, r5, 128 bdnz intt_ppc__Loop4 .align 4 /* * 3. len = 8, start = 0, 16, 32, 48,...208, 224, 240 */ - li 7, 16 + li r7, 16 INTT_REDUCE_4X 0, 32 INTT_REDUCE_4X 128, 32 @@ -635,137 +635,137 @@ intt_ppc__Loop4: /* * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 */ - li 7, 32 + li r7, 32 INTT_REDUCE_4X 0, 64 - addi 14, 14, -64 + addi r14, r14, -64 INTT_REDUCE_4X 16, 64 INTT_REDUCE_4X 256, 64 - addi 14, 14, -64 + addi r14, r14, -64 INTT_REDUCE_4X 272, 64 .align 4 /* * 5. 
len = 32, start = 0, 64, 128, 192 */ - li 7, 64 + li r7, 64 Load_4Coeffs 0, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 128, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 256, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 384, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 .align 4 /* * 6. 
len = 64, start = 0, 128 */ - li 7, 128 + li r7, 128 Load_4Coeffs 0, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 64, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 256, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 320, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - addi 14, 14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 .align 4 /* * 7. 
len = 128, start = 0 */ - li 7, 256 /* len*2 */ + li r7, 256 /* len*2 */ Load_4Coeffs 0, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - lvx V_ZETA, 0, 14 - xxlor 9, 32+10, 32+10 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + lvx V_ZETA, 0, r14 + xxlor vs9, 32+V_ZETA, 32+V_ZETA + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 64, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 128, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 Load_4Coeffs 192, 16 - BREDUCE_4X 4, 9, 13, 17 - Write_B4C 32+4, 32+9, 32+13, 32+17 + BREDUCE_4X v4, v9, v13, v17 + Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 Set_mont_consts - xxlor 32+10, 9, 9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, 13, 18, 23, 28 - Write_M4C 32+13, 32+18, 32+23, 32+28 + xxlor 32+V_ZETA, vs9, vs9 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 + Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 RESTORE_REGS blr diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 9c837b0fb7..3c06f0a319 100644 --- 
a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -28,78 +28,78 @@ .text .macro SAVE_REGS - stdu 1, -352(1) - mflr 0 - std 14, 56(1) - std 15, 64(1) - std 16, 72(1) - std 17, 80(1) - std 18, 88(1) - std 19, 96(1) - std 20, 104(1) - std 21, 112(1) - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - stxvx 32+20, 10, 1 - stxvx 32+21, 11, 1 - stxvx 32+22, 12, 1 - stxvx 32+23, 14, 1 - stxvx 32+24, 15, 1 - stxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - stxvx 32+26, 10, 1 - stxvx 32+27, 11, 1 - stxvx 32+28, 12, 1 - stxvx 32+29, 14, 1 - stxvx 32+30, 15, 1 - stxvx 32+31, 16, 1 + stdu r1, -352(r1) + mflr r0 + std r14, 56(r1) + std r15, 64(r1) + std r16, 72(r1) + std r17, 80(r1) + std r18, 88(r1) + std r19, 96(r1) + std r20, 104(r1) + std r21, 112(r1) + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + stxvx 32+v20, r10, r1 + stxvx 32+v21, r11, r1 + stxvx 32+v22, r12, r1 + stxvx 32+v23, r14, r1 + stxvx 32+v24, r15, r1 + stxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + stxvx 32+v26, r10, r1 + stxvx 32+v27, r11, r1 + stxvx 32+v28, r12, r1 + stxvx 32+v29, r14, r1 + stxvx 32+v30, r15, r1 + stxvx 32+v31, r16, r1 .endm .macro RESTORE_REGS - li 10, 128 - li 11, 144 - li 12, 160 - li 14, 176 - li 15, 192 - li 16, 208 - lxvx 32+20, 10, 1 - lxvx 32+21, 11, 1 - lxvx 32+22, 12, 1 - lxvx 32+23, 14, 1 - lxvx 32+24, 15, 1 - lxvx 32+25, 16, 1 - li 10, 224 - li 11, 240 - li 12, 256 - li 14, 272 - li 15, 288 - li 16, 304 - lxvx 32+26, 10, 1 - lxvx 32+27, 11, 1 - lxvx 32+28, 12, 1 - lxvx 32+29, 14, 1 - lxvx 32+30, 15, 1 - lxvx 32+31, 16, 1 - ld 14, 56(1) - ld 15, 64(1) - ld 16, 72(1) - ld 17, 80(1) - ld 18, 88(1) - ld 19, 96(1) - ld 20, 104(1) - ld 21, 112(1) - - mtlr 0 - addi 1, 1, 352 + li r10, 128 + li r11, 144 + li r12, 160 + li r14, 176 + li r15, 192 + li r16, 208 + lxvx 32+v20, 
r10, r1 + lxvx 32+v21, r11, r1 + lxvx 32+v22, r12, r1 + lxvx 32+v23, r14, r1 + lxvx 32+v24, r15, r1 + lxvx 32+v25, r16, r1 + li r10, 224 + li r11, 240 + li r12, 256 + li r14, 272 + li r15, 288 + li r16, 304 + lxvx 32+v26, r10, r1 + lxvx 32+v27, r11, r1 + lxvx 32+v28, r12, r1 + lxvx 32+v29, r14, r1 + lxvx 32+v30, r15, r1 + lxvx 32+v31, r16, r1 + ld r14, 56(r1) + ld r15, 64(r1) + ld r16, 72(r1) + ld r17, 80(r1) + ld r18, 88(r1) + ld r19, 96(r1) + ld r20, 104(r1) + ld r21, 112(r1) + + mtlr r0 + addi r1, r1, 352 .endm /* @@ -124,14 +124,14 @@ * */ .macro Init_Coeffs_offset start next - li 9, \start /* first offset to j */ - add 10, 7, 9 /* J + len*2 */ - addi 16, 9, \next - addi 17, 10, \next - addi 18, 16, \next - addi 19, 17, \next - addi 20, 18, \next - addi 21, 19, \next + li r9, \start /* first offset to j */ + add r10, r7, r9 /* J + len*2 */ + addi r16, r9, \next + addi r17, r10, \next + addi r18, r16, \next + addi r19, r17, \next + addi r20, r18, \next + addi r21, r19, \next .endm /* @@ -139,10 +139,10 @@ * r[j+len]: V13, V18, V23, V28 */ .macro Load_4Rjp - lxvd2x 32+13, 3, 10 /* V13: vector r'0 */ - lxvd2x 32+18, 3, 17 /* V18: vector for r'1 */ - lxvd2x 32+23, 3, 19 /* V23: vector for r'2 */ - lxvd2x 32+28, 3, 21 /* V28: vector for r'3 */ + lxvd2x 32+v13, r3, r10 /* V13: vector r'0 */ + lxvd2x 32+v18, r3, r17 /* V18: vector for r'1 */ + lxvd2x 32+v23, r3, r19 /* V23: vector for r'2 */ + lxvd2x 32+v28, r3, r21 /* V28: vector for r'3 */ .endm /* @@ -173,22 +173,22 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 - lxvd2x 32+26, 10, 5 - vmrgew 13, 25, 26 - vmrgow 12, 25, 26 - lxvd2x 32+25, 11, 5 - lxvd2x 32+26, 12, 5 - vmrgew 18, 25, 26 - vmrgow 17, 25, 26 - lxvd2x 32+25, 15, 5 - lxvd2x 32+26, 16, 5 - vmrgew 23, 25, 26 - vmrgow 22, 25, 26 - lxvd2x 32+25, 17, 5 - lxvd2x 32+26, 18, 5 - vmrgew 28, 25, 26 - vmrgow 27, 25, 26 + lxvd2x 32+v25, 0, r5 + lxvd2x 32+v26, r10, r5 + vmrgew v13, v25, v26 + vmrgow v12, v25, v26 + lxvd2x 32+v25, r11, r5 + lxvd2x 32+v26, r12, r5 + vmrgew v18, v25, v26 + vmrgow v17, v25, v26 + lxvd2x 32+v25, r15, r5 + lxvd2x 32+v26, r16, r5 + vmrgew v23, v25, v26 + vmrgow v22, v25, v26 + lxvd2x 32+v25, r17, r5 + lxvd2x 32+v26, r18, r5 + vmrgew v28, v25, v26 + vmrgow v27, v25, v26 .endm /* @@ -207,22 +207,22 @@ * in the proper order to match the multiplication. */ .macro Load_L44Coeffs - lxvd2x 1, 0, 5 - lxvd2x 2, 10, 5 - xxpermdi 32+13, 2, 1, 3 - xxpermdi 32+12, 2, 1, 0 - lxvd2x 3, 11, 5 - lxvd2x 4, 12, 5 - xxpermdi 32+18, 4, 3, 3 - xxpermdi 32+17, 4, 3, 0 - lxvd2x 1, 15, 5 - lxvd2x 2, 16, 5 - xxpermdi 32+23, 2, 1, 3 - xxpermdi 32+22, 2, 1, 0 - lxvd2x 3, 17, 5 - lxvd2x 4, 18, 5 - xxpermdi 32+28, 4, 3, 3 - xxpermdi 32+27, 4, 3, 0 + lxvd2x vs1, 0, r5 + lxvd2x vs2, r10, r5 + xxpermdi 32+v13, vs2, vs1, 3 + xxpermdi 32+v12, vs2, vs1, 0 + lxvd2x vs3, r11, r5 + lxvd2x vs4, r12, r5 + xxpermdi 32+v18, vs4, vs3, 3 + xxpermdi 32+v17, vs4, vs3, 0 + lxvd2x vs1, r15, r5 + lxvd2x vs2, r16, r5 + xxpermdi 32+v23, vs2, vs1, 3 + xxpermdi 32+v22, vs2, vs1, 0 + lxvd2x vs3, r17, r5 + lxvd2x vs4, r18, r5 + xxpermdi 32+v28, vs4, vs3, 3 + xxpermdi 32+v27, vs4, vs3, 0 .endm /* @@ -236,32 +236,32 @@ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 /* fqmul = zeta * coefficient Modular multification bond by 2^16 * q in abs value */ - vmladduhm 15, 13, \_vz0, 3 - vmladduhm 20, 18, \_vz1, 3 - vmladduhm 25, 23, \_vz2, 3 - vmladduhm 30, 28, \_vz3, 3 + vmladduhm v15, v13, \_vz0, v3 + vmladduhm v20, v18, \_vz1, v3 + vmladduhm v25, v23, \_vz2, v3 + vmladduhm 
v30, v28, \_vz3, v3 /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs 14, 13, \_vz0, 3 - vmhraddshs 19, 18, \_vz1, 3 - vmhraddshs 24, 23, \_vz2, 3 - vmhraddshs 29, 28, \_vz3, 3 + vmhraddshs v14, v13, \_vz0, v3 + vmhraddshs v19, v18, \_vz1, v3 + vmhraddshs v24, v23, \_vz2, v3 + vmhraddshs v29, v28, \_vz3, v3 - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v30, v30, V_QINV, v3 - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v30, v30, V_NMKQ, v29 /* Shift right 1 bit */ - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vsrah v13, v15, v4 + vsrah v18, v20, v4 + vsrah v23, v25, v4 + vsrah v28, v30, v4 .endm /* @@ -270,10 +270,10 @@ * r[j]: V12, V17, V22, V27 */ .macro Load_4Rj - lxvd2x 32+12, 3, 9 /* V12: vector r0 */ - lxvd2x 32+17, 3, 16 /* V17: vector r1 */ - lxvd2x 32+22, 3, 18 /* V22: vector r2 */ - lxvd2x 32+27, 3, 20 /* V27: vector r3 */ + lxvd2x 32+v12, r3, r9 /* V12: vector r0 */ + lxvd2x 32+v17, r3, r16 /* V17: vector r1 */ + lxvd2x 32+v22, r3, r18 /* V22: vector r2 */ + lxvd2x 32+v27, r3, r20 /* V27: vector r3 */ .endm /* @@ -288,25 +288,25 @@ r[j] = r[j] + t. 
r[j+len] = r[j] - t */ - vsubuhm 16, 12, 13 - vadduhm 15, 13, 12 - vsubuhm 21, 17, 18 - vadduhm 20, 18, 17 - vsubuhm 26, 22, 23 - vadduhm 25, 23, 22 - vsubuhm 31, 27, 28 - vadduhm 30, 28, 27 + vsubuhm v16, v12, v13 + vadduhm v15, v13, v12 + vsubuhm v21, v17, v18 + vadduhm v20, v18, v17 + vsubuhm v26, v22, v23 + vadduhm v25, v23, v22 + vsubuhm v31, v27, v28 + vadduhm v30, v28, v27 .endm .macro Write_One - stxvd2x 32+15, 3, 9 - stxvd2x 32+16, 3, 10 - stxvd2x 32+20, 3, 16 - stxvd2x 32+21, 3, 17 - stxvd2x 32+25, 3, 18 - stxvd2x 32+26, 3, 19 - stxvd2x 32+30, 3, 20 - stxvd2x 32+31, 3, 21 + stxvd2x 32+v15, r3, r9 + stxvd2x 32+v16, r3, r10 + stxvd2x 32+v20, r3, r16 + stxvd2x 32+v21, r3, r17 + stxvd2x 32+v25, r3, r18 + stxvd2x 32+v26, r3, r19 + stxvd2x 32+v30, r3, r20 + stxvd2x 32+v31, r3, r21 .endm /* @@ -315,22 +315,22 @@ */ .macro PermWriteL44 Compute_4Coeffs - xxpermdi 0, 32+15, 32+16, 3 - xxpermdi 1, 32+15, 32+16, 0 - xxpermdi 2, 32+20, 32+21, 3 - xxpermdi 3, 32+20, 32+21, 0 - xxpermdi 4, 32+25, 32+26, 3 - xxpermdi 5, 32+25, 32+26, 0 - xxpermdi 6, 32+30, 32+31, 3 - xxpermdi 7, 32+30, 32+31, 0 - stxvd2x 0, 0, 5 - stxvd2x 1, 10, 5 - stxvd2x 2, 11, 5 - stxvd2x 3, 12, 5 - stxvd2x 4, 15, 5 - stxvd2x 5, 16, 5 - stxvd2x 6, 17, 5 - stxvd2x 7, 18, 5 + xxpermdi vs0, 32+v15, 32+v16, 3 + xxpermdi vs1, 32+v15, 32+v16, 0 + xxpermdi vs2, 32+v20, 32+v21, 3 + xxpermdi vs3, 32+v20, 32+v21, 0 + xxpermdi vs4, 32+v25, 32+v26, 3 + xxpermdi vs5, 32+v25, 32+v26, 0 + xxpermdi vs6, 32+v30, 32+v31, 3 + xxpermdi vs7, 32+v30, 32+v31, 0 + stxvd2x vs0, 0, r5 + stxvd2x vs1, r10, r5 + stxvd2x vs2, r11, r5 + stxvd2x vs3, r12, r5 + stxvd2x vs4, r15, r5 + stxvd2x vs5, r16, r5 + stxvd2x vs6, r17, r5 + stxvd2x vs7, r18, r5 .endm /* @@ -339,33 +339,33 @@ */ .macro PermWriteL24 Compute_4Coeffs - vmrgew 10, 16, 15 - vmrgow 11, 16, 15 - vmrgew 12, 21, 20 - vmrgow 13, 21, 20 - vmrgew 14, 26, 25 - vmrgow 15, 26, 25 - vmrgew 16, 31, 30 - vmrgow 17, 31, 30 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 
32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + vmrgew v10, v16, v15 + vmrgow v11, v16, v15 + vmrgew v12, v21, v20 + vmrgow v13, v21, v20 + vmrgew v14, v26, v25 + vmrgow v15, v26, v25 + vmrgew v16, v31, v30 + vmrgow v17, v31, v30 + stxvd2x 32+v10, 0, r5 + stxvd2x 32+v11, r10, r5 + stxvd2x 32+v12, r11, r5 + stxvd2x 32+v13, r12, r5 + stxvd2x 32+v14, r15, r5 + stxvd2x 32+v15, r16, r5 + stxvd2x 32+v16, r17, r5 + stxvd2x 32+v17, r18, r5 .endm .macro Load_next_4zetas - li 10, 16 - li 11, 32 - li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 10, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + li r10, 16 + li r11, 32 + li r12, 48 + lxvd2x 32+V_Z0, 0, r14 + lxvd2x 32+V_Z1, r10, r14 + lxvd2x 32+V_Z2, r11, r14 + lxvd2x 32+V_Z3, r12, r14 + addi r14, r14, 64 .endm /* @@ -397,16 +397,16 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) SAVE_REGS /* load MLKEM_Q */ - lvx V_NMKQ,0,4 + lvx V_NMKQ,0,r4 /* Register 14 as pointer to zetas array */ - addi 14, 4, ZETA_NTT_OFFSET + addi r14, r4, ZETA_NTT_OFFSET - vxor 3, 3, 3 - vspltish 4, 1 + vxor v3, v3, v3 + vspltish v4, 1 - li 10, QINV_OFFSET - lvx V_QINV, 10, 4 + li r10, QINV_OFFSET + lvx V_QINV, r10, r4 .align 4 /* @@ -415,9 +415,9 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * 1. len = 128, start = 0 */ - li 7, 256 /* len * 2 */ - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li r7, 256 /* len * 2 */ + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -429,14 +429,14 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * 2. 
len = 64, start = 0, 128 * k += 2 */ - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li r7, 128 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -445,21 +445,21 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * 3. len = 32, start = 0, 64, 128, 192 * k += 4 */ - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li r7, 64 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, r14 + addi r14, r14, 16 NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 @@ -467,7 +467,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 * k += 8 */ - li 7, 32 + li r7, 32 Load_next_4zetas NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 @@ -481,7 +481,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * 5. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 * k += 16 */ - li 7, 16 + li r7, 16 Load_next_4zetas NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 @@ -499,18 +499,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * k += 32 * Load zeta vectors in 4-4 layout */ - li 15, 4 - mtctr 15 - mr 5, 3 /* Let r5 points to coefficient array */ - li 7, 8 - - li 10, 16 - li 11, 32 - li 12, 48 - li 15, 64 - li 16, 80 - li 17, 96 - li 18, 112 + li r15, 4 + mtctr r15 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 8 + + li r10, 16 + li r11, 32 + li r12, 48 + li r15, 64 + li r16, 80 + li r17, 96 + li r18, 112 .align 4 ntt_ppc__Len4: @@ -519,7 +519,7 @@ ntt_ppc__Len4: Load_L44Coeffs MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL44 - addi 5, 5, 128 + addi r5, r5, 128 bdnz ntt_ppc__Len4 @@ -529,10 +529,10 @@ ntt_ppc__Len4: * Load zeta vectors in 2-2-2-2 layout */ - li 8, 4 - mtctr 8 - mr 5, 3 /* Let r5 points to coefficient array */ - li 7, 4 + li r8, 4 + mtctr r8 + mr r5, r3 /* Let r5 points to coefficient array */ + li r7, 4 .align 4 ntt_ppc__Len2: @@ -540,7 +540,7 @@ ntt_ppc__Len2: Load_L24Coeffs MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL24 - addi 5, 5, 128 + addi r5, r5, 128 bdnz ntt_ppc__Len2 diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index 4ca5771314..5c0703755c 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -39,141 +39,141 @@ * MREDUCE_4X(_v0, _v1, _v2, _v3) */ .macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+13, 0, 3 - addi 3, 3, 16 - lxvd2x 32+18, 0, 3 - addi 3, 3, 16 - lxvd2x 32+23, 0, 3 - addi 3, 3, 16 - lxvd2x 32+7, 0, 3 - addi 3, 3, 16 - - vmladduhm 15, 13, V1353, 3 - vmladduhm 20, 18, V1353, 3 - vmladduhm 25, 23, V1353, 3 - vmladduhm 9, 7, V1353, 3 - - vmhraddshs 14, 13, V1353, 3 - vmhraddshs 19, 18, V1353, 3 - vmhraddshs 24, 23, V1353, 3 - vmhraddshs 8, 7, V1353, 3 - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - 
vmladduhm 9, 9, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 9, 9, V_NMKQ, 8 + lxvd2x 32+v13, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v18, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v23, 0, r3 + addi r3, r3, 16 + lxvd2x 32+v7, 0, r3 + addi r3, r3, 16 + + vmladduhm v15, v13, V1353, v3 + vmladduhm v20, v18, V1353, v3 + vmladduhm v25, v23, V1353, v3 + vmladduhm v9, v7, V1353, v3 + + vmhraddshs v14, v13, V1353, v3 + vmhraddshs v19, v18, V1353, v3 + vmhraddshs v24, v23, V1353, v3 + vmhraddshs v8, v7, V1353, v3 + + vmladduhm v15, v15, V_QINV, v3 + vmladduhm v20, v20, V_QINV, v3 + vmladduhm v25, v25, V_QINV, v3 + vmladduhm v9, v9, V_QINV, v3 + + vmhraddshs v15, v15, V_NMKQ, v14 + vmhraddshs v20, v20, V_NMKQ, v19 + vmhraddshs v25, v25, V_NMKQ, v24 + vmhraddshs v9, v9, V_NMKQ, v8 /* Shift right 1 bit */ - vsrah \_v0, 15, 4 - vsrah \_v1, 20, 4 - vsrah \_v2, 25, 4 - vsrah \_v3, 9, 4 + vsrah \_v0, v15, v4 + vsrah \_v1, v20, v4 + vsrah \_v2, v25, v4 + vsrah \_v3, v9, v4 .endm .macro Write_8X - stxvd2x 32+27, 4, 3 - stxvd2x 32+28, 5, 3 - stxvd2x 32+29, 6, 3 - stxvd2x 32+30, 7, 3 - stxvd2x 32+13, 8, 3 - stxvd2x 32+18, 9, 3 - stxvd2x 32+23, 10, 3 - stxvd2x 32+7, 11, 3 + stxvd2x 32+v27, r4, r3 + stxvd2x 32+v28, r5, r3 + stxvd2x 32+v29, r6, r3 + stxvd2x 32+v30, r7, r3 + stxvd2x 32+v13, r8, r3 + stxvd2x 32+v18, r9, r3 + stxvd2x 32+v23, r10, r3 + stxvd2x 32+v7, r11, r3 .endm .align 4 .globl MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu 1, -320(1) - mflr 0 - - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - stxvx 32+25, 11, 1 - stxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - stxvx 32+27, 6, 1 - stxvx 32+28, 7, 1 - stxvx 32+29, 8, 1 - stxvx 32+30, 9, 1 - - li 6, NQ_OFFSET - li 7, QINV_OFFSET - li 8, C1353_OFFSET - lxvx 32+V_NMKQ, 6, 4 
- lxvx 32+V_QINV, 7, 4 - lxvx 32+V1353, 8, 4 - - vxor 3, 3, 3 - vspltish 4, 1 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + stdu r1, -320(r1) + mflr r0 + + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + li r11, 208 + li r12, 224 + stxvx 32+v20, r6, r1 + stxvx 32+v21, r7, r1 + stxvx 32+v22, r8, r1 + stxvx 32+v23, r9, r1 + stxvx 32+v24, r10, r1 + stxvx 32+v25, r11, r1 + stxvx 32+v26, r12, r1 + li r6, 240 + li r7, 256 + li r8, 272 + li r9, 288 + stxvx 32+v27, r6, r1 + stxvx 32+v28, r7, r1 + stxvx 32+v29, r8, r1 + stxvx 32+v30, r9, r1 + + li r6, NQ_OFFSET + li r7, QINV_OFFSET + li r8, C1353_OFFSET + lxvx 32+V_NMKQ, r6, r4 + lxvx 32+V_QINV, r7, r4 + lxvx 32+V1353, r8, r4 + + vxor v3, v3, v3 + vspltish v4, 1 + + li r4, -128 + li r5, -112 + li r6, -96 + li r7, -80 + li r8, -64 + li r9, -48 + li r10, -32 + li r11, -16 + + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 Write_8X - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 Write_8X - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 Write_8X - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + MREDUCE_4X v27, v28, v29, v30 + MREDUCE_4X v13, v18, v23, v7 Write_8X - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - li 11, 208 - li 12, 224 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - lxvx 32+25, 11, 1 - lxvx 32+26, 12, 1 - li 6, 240 - li 7, 256 - li 8, 272 - li 9, 288 - lxvx 32+27, 6, 1 - lxvx 32+28, 7, 1 - lxvx 32+29, 8, 1 - lxvx 32+30, 9, 1 - mtlr 0 - addi 1, 1, 320 + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + li r11, 208 + li r12, 224 + lxvx 32+v20, r6, r1 + lxvx 32+v21, r7, r1 + lxvx 32+v22, r8, r1 + lxvx 32+v23, r9, r1 + lxvx 32+v24, r10, r1 + lxvx 
32+v25, r11, r1 + lxvx 32+v26, r12, r1 + li r6, 240 + li r7, 256 + li r8, 272 + li r9, 288 + lxvx 32+v27, r6, r1 + lxvx 32+v28, r7, r1 + lxvx 32+v29, r8, r1 + lxvx 32+v30, r9, r1 + mtlr r0 + addi r1, r1, 320 blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index 3b6892d867..a6deedffc3 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -33,168 +33,168 @@ .text .macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+8, 0, 3 - lxvd2x 32+12, 14, 3 - lxvd2x 32+16, 15, 3 - lxvd2x 32+20, 16, 3 - addi 3, 3, 64 - vmulosh 6, 8, V20159 - vmulesh 5, 8, V20159 - vmulosh 11, 12, V20159 - vmulesh 10, 12, V20159 - vmulosh 15, 16, V20159 - vmulesh 14, 16, V20159 - vmulosh 19, 20, V20159 - vmulesh 18, 20, V20159 - xxmrglw 32+4, 32+5, 32+6 - xxmrghw 32+5, 32+5, 32+6 - xxmrglw 32+9, 32+10, 32+11 - xxmrghw 32+10, 32+10, 32+11 - xxmrglw 32+13, 32+14, 32+15 - xxmrghw 32+14, 32+14, 32+15 - xxmrglw 32+17, 32+18, 32+19 - xxmrghw 32+18, 32+18, 32+19 - vadduwm 4, 4, V_25 - vadduwm 5, 5, V_25 - vadduwm 9, 9, V_25 - vadduwm 10, 10, V_25 - vadduwm 13, 13, V_25 - vadduwm 14, 14, V_25 - vadduwm 17, 17, V_25 - vadduwm 18, 18, V_25 - vsraw 4, 4, V_26 - vsraw 5, 5, V_26 - vsraw 9, 9, V_26 - vsraw 10, 10, V_26 - vsraw 13, 13, V_26 - vsraw 14, 14, V_26 - vsraw 17, 17, V_26 - vsraw 18, 18, V_26 - vpkuwum 4, 5, 4 - vsubuhm 4, 7, 4 - vpkuwum 9, 10, 9 - vsubuhm 9, 7, 9 - vpkuwum 13, 14, 13 - vsubuhm 13, 7, 13 - vpkuwum 17, 18, 17 - vsubuhm 17, 7, 17 - vmladduhm \_v0, 4, V_MKQ, 8 - vmladduhm \_v1, 9, V_MKQ, 12 - vmladduhm \_v2, 13, V_MKQ, 16 - vmladduhm \_v3, 17, V_MKQ, 20 + lxvd2x 32+v8, 0, r3 + lxvd2x 32+v12, r14, r3 + lxvd2x 32+v16, r15, r3 + lxvd2x 32+v20, r16, r3 + addi r3, r3, 64 + vmulosh v6, v8, V20159 + vmulesh v5, v8, V20159 + vmulosh v11, v12, V20159 + vmulesh v10, v12, V20159 + vmulosh v15, v16, V20159 + vmulesh v14, v16, V20159 + vmulosh 
v19, v20, V20159 + vmulesh v18, v20, V20159 + xxmrglw 32+v4, 32+v5, 32+v6 + xxmrghw 32+v5, 32+v5, 32+v6 + xxmrglw 32+v9, 32+v10, 32+v11 + xxmrghw 32+v10, 32+v10, 32+v11 + xxmrglw 32+v13, 32+v14, 32+v15 + xxmrghw 32+v14, 32+v14, 32+v15 + xxmrglw 32+v17, 32+v18, 32+v19 + xxmrghw 32+v18, 32+v18, 32+v19 + vadduwm v4, v4, V_25 + vadduwm v5, v5, V_25 + vadduwm v9, v9, V_25 + vadduwm v10, v10, V_25 + vadduwm v13, v13, V_25 + vadduwm v14, v14, V_25 + vadduwm v17, v17, V_25 + vadduwm v18, v18, V_25 + vsraw v4, v4, V_26 + vsraw v5, v5, V_26 + vsraw v9, v9, V_26 + vsraw v10, v10, V_26 + vsraw v13, v13, V_26 + vsraw v14, v14, V_26 + vsraw v17, v17, V_26 + vsraw v18, v18, V_26 + vpkuwum v4, v5, v4 + vsubuhm v4, v7, v4 + vpkuwum v9, v10, v9 + vsubuhm v9, v7, v9 + vpkuwum v13, v14, v13 + vsubuhm v13, v7, v13 + vpkuwum v17, v18, v17 + vsubuhm v17, v7, v17 + vmladduhm \_v0, v4, V_MKQ, v8 + vmladduhm \_v1, v9, V_MKQ, v12 + vmladduhm \_v2, v13, V_MKQ, v16 + vmladduhm \_v3, v17, V_MKQ, v20 .endm .macro Write_8X - stxvd2x 32+21, 4, 3 - stxvd2x 32+22, 5, 3 - stxvd2x 32+23, 6, 3 - stxvd2x 32+24, 7, 3 - stxvd2x 32+4, 8, 3 - stxvd2x 32+9, 9, 3 - stxvd2x 32+13, 10, 3 - stxvd2x 32+17, 11, 3 + stxvd2x 32+v21, r4, r3 + stxvd2x 32+v22, r5, r3 + stxvd2x 32+v23, r6, r3 + stxvd2x 32+v24, r7, r3 + stxvd2x 32+v4, r8, r3 + stxvd2x 32+v9, r9, r3 + stxvd2x 32+v13, r10, r3 + stxvd2x 32+v17, r11, r3 .endm /* * Conditional addition to get unsigned canonical representative */ .macro To_unsigned_16 - lxvd2x 32+12, 0, 3 - lxvd2x 32+13, 14, 3 - lxvd2x 32+14, 15, 3 - lxvd2x 32+15, 16, 3 - addi 3, 3, 64 - vsrh 1, 12, 10 - vsrh 0, 13, 10 - vsrh 3, 14, 10 - vsrh 2, 15, 10 - vadduhm 7, 12, 11 - vadduhm 8, 13, 11 - vadduhm 5, 14, 11 - vadduhm 6, 15, 11 - vcmpequh 1, 1, 9 - vcmpequh 0, 0, 9 - vcmpequh 3, 3, 9 - vcmpequh 2, 2, 9 - xxsel 32+1, 32+7,32+12, 32+1 - xxsel 32+0, 32+8,32+13, 32+0 - xxsel 32+3, 32+5,32+14, 32+3 - xxsel 32+2, 32+6,32+15, 32+2 - stxvd2x 32+3, 10, 3 - stxvd2x 32+2, 11, 3 - stxvd2x 32+1, 8, 3 - 
stxvd2x 32+0, 9, 3 + lxvd2x 32+v12, 0, r3 + lxvd2x 32+v13, r14, r3 + lxvd2x 32+v14, r15, r3 + lxvd2x 32+v15, r16, r3 + addi r3, r3, 64 + vsrh v1, v12, v10 + vsrh v0, v13, v10 + vsrh v3, v14, v10 + vsrh v2, v15, v10 + vadduhm v7, v12, v11 + vadduhm v8, v13, v11 + vadduhm v5, v14, v11 + vadduhm v6, v15, v11 + vcmpequh v1, v1, v9 + vcmpequh v0, v0, v9 + vcmpequh v3, v3, v9 + vcmpequh v2, v2, v9 + xxsel 32+v1, 32+v7,32+v12, 32+v1 + xxsel 32+v0, 32+v8,32+v13, 32+v0 + xxsel 32+v3, 32+v5,32+v14, 32+v3 + xxsel 32+v2, 32+v6,32+v15, 32+v2 + stxvd2x 32+v3, r10, r3 + stxvd2x 32+v2, r11, r3 + stxvd2x 32+v1, r8, r3 + stxvd2x 32+v0, r9, r3 .endm .align 4 .globl MLK_ASM_NAMESPACE(reduce_ppc) MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu 1, -224(1) - mflr 0 - std 14, 96(1) - std 15, 104(1) - std 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - stxvx 32+20, 6, 1 - stxvx 32+21, 7, 1 - stxvx 32+22, 8, 1 - stxvx 32+23, 9, 1 - stxvx 32+24, 10, 1 - - vxor 7, 7, 7 - - li 6, Q_OFFSET - li 7, C20159_OFFSET - lxvx 32+V_MKQ, 6, 4 - lxvx 32+V20159, 7, 4 + stdu r1, -224(r1) + mflr r0 + std r14, 96(r1) + std r15, 104(r1) + std r16, 112(r1) + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + stxvx 32+v20, r6, r1 + stxvx 32+v21, r7, r1 + stxvx 32+v22, r8, r1 + stxvx 32+v23, r9, r1 + stxvx 32+v24, r10, r1 + + vxor v7, v7, v7 + + li r6, Q_OFFSET + li r7, C20159_OFFSET + lxvx 32+V_MKQ, r6, r4 + lxvx 32+V20159, r7, r4 vspltisw V_26, 13 vadduwm V_26, V_26, V_26 - vspltisw 4, 1 - vsubuwm 5, V_26, 4 - vslw V_25, 4, 5 - - li 4, -128 - li 5, -112 - li 6, -96 - li 7, -80 - li 8, -64 - li 9, -48 - li 10, -32 - li 11, -16 - - li 14, 16 - li 15, 32 - li 16, 48 - - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + vspltisw v4, 1 + vsubuwm v5, V_26, v4 + vslw V_25, v4, v5 + + li r4, -128 + li r5, -112 + li r6, -96 + li r7, -80 + li r8, -64 + li r9, -48 + li r10, -32 + li r11, -16 + + li r14, 16 + li r15, 32 + li r16, 48 + + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, 
v13, v17 Write_8X - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 Write_8X - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 Write_8X - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + BREDUCE_4X v21, v22, v23, v24 + BREDUCE_4X v4, v9, v13, v17 Write_8X /* * To unsigned canonical */ .align 4 - addi 3, 3, -512 - vxor 9, 9, 9 - vspltish 10, 15 - vmr 11, V_MKQ + addi r3, r3, -512 + vxor v9, v9, v9 + vspltish v10, 15 + vmr v11, V_MKQ To_unsigned_16 To_unsigned_16 @@ -205,21 +205,21 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) To_unsigned_16 To_unsigned_16 - ld 14, 96(1) - ld 15, 104(1) - ld 16, 112(1) - li 6, 128 - li 7, 144 - li 8, 160 - li 9, 176 - li 10, 192 - lxvx 32+20, 6, 1 - lxvx 32+21, 7, 1 - lxvx 32+22, 8, 1 - lxvx 32+23, 9, 1 - lxvx 32+24, 10, 1 - mtlr 0 - addi 1, 1, 224 + ld r14, 96(r1) + ld r15, 104(r1) + ld r16, 112(r1) + li r6, 128 + li r7, 144 + li r8, 160 + li r9, 176 + li r10, 192 + lxvx 32+v20, r6, r1 + lxvx 32+v21, r7, r1 + lxvx 32+v22, r8, r1 + lxvx 32+v23, r9, r1 + lxvx 32+v24, r10, r1 + mtlr r0 + addi r1, r1, 224 blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. From bceaa7b3ce7c12b710168efb9ac0e7a812a70867 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Tue, 9 Dec 2025 06:56:28 -0500 Subject: [PATCH 20/22] This patch fixes the following, 1. Added detailed comments on NTT and INTT implementations. 2. Used C type symbols to improve readability. 
Signed-off-by: Danny Tsen --- dev/ppc64le/src/consts.h | 67 -- dev/ppc64le/src/intt_ppc.S | 978 +++++++++++---------- dev/ppc64le/src/ntt_ppc.S | 599 +++++++------ dev/ppc64le/src/poly_tomont.S | 234 ++--- dev/ppc64le/src/reduce.S | 296 +++---- mlkem/src/native/ppc64le/src/consts.h | 67 -- mlkem/src/native/ppc64le/src/intt_ppc.S | 978 +++++++++++---------- mlkem/src/native/ppc64le/src/ntt_ppc.S | 599 +++++++------ mlkem/src/native/ppc64le/src/poly_tomont.S | 234 ++--- mlkem/src/native/ppc64le/src/reduce.S | 296 +++---- 10 files changed, 2256 insertions(+), 2092 deletions(-) diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index 96cf7cfc91..b5e66983fe 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -19,73 +19,6 @@ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) extern const int16_t mlk_ppc_qdata[]; -#else -#define r0 0 -#define r1 1 -#define r3 3 -#define r4 4 -#define r5 5 -#define r6 6 -#define r7 7 -#define r8 8 -#define r9 9 -#define r10 10 -#define r11 11 -#define r12 12 -#define r14 14 -#define r15 15 -#define r16 16 -#define r17 17 -#define r18 18 -#define r19 19 -#define r20 20 -#define r21 21 -#define v0 0 -#define v1 1 -#define v2 2 -#define v3 3 -#define v4 4 -#define v5 5 -#define v6 6 -#define v7 7 -#define v8 8 -#define v9 9 -#define v10 10 -#define v11 11 -#define v12 12 -#define v13 13 -#define v14 14 -#define v15 15 -#define v16 16 -#define v17 17 -#define v18 18 -#define v19 19 -#define v20 20 -#define v21 21 -#define v22 22 -#define v23 23 -#define v24 24 -#define v25 25 -#define v26 26 -#define v27 27 -#define v28 28 -#define v29 29 -#define v30 30 -#define v31 31 -#define vs0 0 -#define vs1 1 -#define vs2 2 -#define vs3 3 -#define vs4 4 -#define vs5 5 -#define vs6 6 -#define vs7 7 -#define vs8 8 -#define vs9 9 -#define vs10 10 -#define vs11 11 -#define vs12 12 -#define vs13 13 #endif #endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/intt_ppc.S 
b/dev/ppc64le/src/intt_ppc.S index d311138275..38b1777688 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -22,7 +22,7 @@ /* Barrett reduce constatnts */ #define V20159 0 -#define V_25 1 +#define V2pw25 1 #define V_26 2 #define V_MKQ 3 @@ -36,101 +36,123 @@ #define V_ZETA 10 #define V1441 10 +#define vdata_a1 21 +#define vdata_a2 22 +#define vdata_a3 23 +#define vdata_a4 24 +#define vdata_b1 8 +#define vdata_b2 12 +#define vdata_b3 16 +#define vdata_b4 20 + +#define vdata_brt1 8 +#define vdata_brt2 12 +#define vdata_brt3 16 +#define vdata_brt4 20 + +#define vdata_mont1 25 +#define vdata_mont2 26 +#define vdata_mont3 30 +#define vdata_mont4 31 + +#define vresult_brt1 4 +#define vresult_brt2 9 +#define vresult_brt3 13 +#define vresult_brt4 17 +#define vresult_mont1 13 +#define vresult_mont2 18 +#define vresult_mont3 23 +#define vresult_mont4 28 + .macro SAVE_REGS - stdu r1, -352(r1) - mflr r0 - std r14, 56(r1) - std r15, 64(r1) - std r16, 72(r1) - std r17, 80(r1) - std r18, 88(r1) - std r19, 96(r1) - std r20, 104(r1) - std r21, 112(r1) - li r10, 128 - li r11, 144 - li r12, 160 - li r14, 176 - li r15, 192 - li r16, 208 - stxvx 32+v20, r10, r1 - stxvx 32+v21, r11, r1 - stxvx 32+v22, r12, r1 - stxvx 32+v23, r14, r1 - stxvx 32+v24, r15, r1 - stxvx 32+v25, r16, r1 - li r10, 224 - li r11, 240 - li r12, 256 - li r14, 272 - li r15, 288 - li r16, 304 - stxvx 32+v26, r10, r1 - stxvx 32+v27, r11, r1 - stxvx 32+v28, r12, r1 - stxvx 32+v29, r14, r1 - stxvx 32+v30, r15, r1 - stxvx 32+v31, r16, r1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + 
stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li r10, 128 - li r11, 144 - li r12, 160 - li r14, 176 - li r15, 192 - li r16, 208 - lxvx 32+v20, r10, r1 - lxvx 32+v21, r11, r1 - lxvx 32+v22, r12, r1 - lxvx 32+v23, r14, r1 - lxvx 32+v24, r15, r1 - lxvx 32+v25, r16, r1 - li r10, 224 - li r11, 240 - li r12, 256 - li r14, 272 - li r15, 288 - li r16, 304 - lxvx 32+v26, r10, r1 - lxvx 32+v27, r11, r1 - lxvx 32+v28, r12, r1 - lxvx 32+v29, r14, r1 - lxvx 32+v30, r15, r1 - lxvx 32+v31, r16, r1 - ld r14, 56(r1) - ld r15, 64(r1) - ld r16, 72(r1) - ld r17, 80(r1) - ld r18, 88(r1) - ld r19, 96(r1) - ld r20, 104(r1) - ld r21, 112(r1) - - mtlr r0 - addi r1, r1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 .endm /* - * Compute final final r[j] and r[j+len] - * final r[j+len]: V8, V12, V16, V20 - * final r[j]: V21, V22, V23, V24 + * Compute r[j] and r[j+len] from computed coefficients + * r[j] + r[j+len] : V8, V12, V16, V20 (data for Barett reduce) + * r[j+len] - r[j]: V25, V26, V30, V31 (data for Montgomery reduce) */ .macro Compute_4Coeffs - /* Since the result of the Montgomery multiplication is bounded - by q in absolute value. - Finally to complete the final update of the results with add/sub - r[j] = r[j] + t. 
- r[j+len] = r[j] - t - */ - vsubuhm v25, v8, v21 - vsubuhm v26, v12, v22 - vsubuhm v30, v16, v23 - vsubuhm v31, v20, v24 - vadduhm v8, v8, v21 - vadduhm v12, v12, v22 - vadduhm v16, v16, v23 - vadduhm v20, v20, v24 + vsubuhm vdata_mont1, vdata_b1, vdata_a1 + vsubuhm vdata_mont2, vdata_b2, vdata_a2 + vsubuhm vdata_mont3, vdata_b3, vdata_a3 + vsubuhm vdata_mont4, vdata_b4, vdata_a4 + vadduhm vdata_brt1, vdata_b1, vdata_a1 + vadduhm vdata_brt2, vdata_b2, vdata_a2 + vadduhm vdata_brt3, vdata_b3, vdata_a3 + vadduhm vdata_brt4, vdata_b4, vdata_a4 .endm /* @@ -155,14 +177,14 @@ * */ .macro Init_Coeffs_offset start next - li r9, \start /* first offset to j */ - add r10, r7, r9 /* J + len*2 */ - addi r16, r9, \next - addi r17, r10, \next - addi r18, r16, \next - addi r19, r17, \next - addi r20, r18, \next - addi r21, r19, \next + li 9, \start /* first offset to j */ + add 10, 7, 9 /* J + len*2 */ + addi 16, 9, \next + addi 17, 10, \next + addi 18, 16, \next + addi 19, 17, \next + addi 20, 18, \next + addi 21, 19, \next .endm /* @@ -174,15 +196,15 @@ * r[j]: V21, V22, V23, V24 */ .macro Load_4Rjp - lxvd2x 32+v8, r3, r10 /* V8: vector r'0 */ - lxvd2x 32+v12, r3, r17 /* V12: vector for r'1 */ - lxvd2x 32+v16, r3, r19 /* V16: vector for r'2 */ - lxvd2x 32+v20, r3, r21 /* V20: vector for r'3 */ - - lxvd2x 32+v21, r3, r9 /* V21: vector r0 */ - lxvd2x 32+v22, r3, r16 /* V22: vector r1 */ - lxvd2x 32+v23, r3, r18 /* V23: vector r2 */ - lxvd2x 32+v24, r3, r20 /* V24: vector r3 */ + lxvd2x 32+vdata_b1, 3, 10 /* V8: vector r'0 */ + lxvd2x 32+vdata_b2, 3, 17 /* V12: vector for r'1 */ + lxvd2x 32+vdata_b3, 3, 19 /* V16: vector for r'2 */ + lxvd2x 32+vdata_b4, 3, 21 /* V20: vector for r'3 */ + + lxvd2x 32+vdata_a1, 3, 9 /* V21: vector r0 */ + lxvd2x 32+vdata_a2, 3, 16 /* V22: vector r1 */ + lxvd2x 32+vdata_a3, 3, 18 /* V23: vector r2 */ + lxvd2x 32+vdata_a4, 3, 20 /* V24: vector r3 */ .endm /* @@ -214,22 +236,22 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L24Coeffs - lxvd2x 32+v25, 0, r5 - lxvd2x 32+v26, r10, r5 - vmrgew v8, v25, v26 - vmrgow v21, v25, v26 - lxvd2x 32+v25, r11, r5 - lxvd2x 32+v26, r12, r5 - vmrgew v12, v25, v26 - vmrgow v22, v25, v26 - lxvd2x 32+v25, r15, r5 - lxvd2x 32+v26, r16, r5 - vmrgew v16, v25, v26 - vmrgow v23, v25, v26 - lxvd2x 32+v25, r17, r5 - lxvd2x 32+v26, r18, r5 - vmrgew v20, v25, v26 - vmrgow v24, v25, v26 + lxvd2x 32+25, 0, 5 + lxvd2x 32+26, 10, 5 + vmrgew vdata_b1, 25, 26 + vmrgow vdata_a1, 25, 26 + lxvd2x 32+25, 11, 5 + lxvd2x 32+26, 12, 5 + vmrgew vdata_b2, 25, 26 + vmrgow vdata_a2, 25, 26 + lxvd2x 32+25, 15, 5 + lxvd2x 32+26, 16, 5 + vmrgew vdata_b3, 25, 26 + vmrgow vdata_a3, 25, 26 + lxvd2x 32+25, 17, 5 + lxvd2x 32+26, 18, 5 + vmrgew vdata_b4, 25, 26 + vmrgow vdata_a4, 25, 26 .endm /* @@ -248,81 +270,81 @@ * in the proper order to match the multiplication. */ .macro Load_L44Coeffs - lxvd2x vs10, 0, r5 - lxvd2x vs11, r10, r5 - xxpermdi 32+v8, vs11, vs10, 3 - xxpermdi 32+v21, vs11, vs10, 0 - lxvd2x vs10, r11, r5 - lxvd2x vs11, r12, r5 - xxpermdi 32+v12, vs11, vs10, 3 - xxpermdi 32+v22, vs11, vs10, 0 - lxvd2x vs10, r15, r5 - lxvd2x vs11, r16, r5 - xxpermdi 32+v16, vs11, vs10, 3 - xxpermdi 32+v23, vs11, vs10, 0 - lxvd2x vs10, r17, r5 - lxvd2x vs11, r18, r5 - xxpermdi 32+v20, vs11, vs10, 3 - xxpermdi 32+v24, vs11, vs10, 0 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxpermdi 32+vdata_b1, 11, 10, 3 + xxpermdi 32+vdata_a1, 11, 10, 0 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxpermdi 32+vdata_b2, 11, 10, 3 + xxpermdi 32+vdata_a2, 11, 10, 0 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxpermdi 32+vdata_b3, 11, 10, 3 + xxpermdi 32+vdata_a3, 11, 10, 0 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxpermdi 32+vdata_b4, 11, 10, 3 + xxpermdi 32+vdata_a4, 11, 10, 0 .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 /* Restore constant vectors - V_MKQ, V_25 and V_26 */ - vxor v7, v7, v7 - xxlor 32+v3, vs6, vs6 - xxlor 32+v1, vs7, vs7 - xxlor 32+v2, vs8, vs8 + V_MKQ, V2pw25 and V_26 */ + vxor 7, 7, 7 + xxlor 
32+3, 6, 6 + xxlor 32+1, 7, 7 + xxlor 32+2, 8, 8 /* Multify Odd/Even signed halfword; Results word bound by 2^32 in abs value. */ - vmulosh v6, v8, V20159 - vmulesh v5, v8, V20159 - vmulosh v11, v12, V20159 - vmulesh v10, v12, V20159 - vmulosh v15, v16, V20159 - vmulesh v14, v16, V20159 - vmulosh v19, v20, V20159 - vmulesh v18, v20, V20159 - xxmrglw 32+v4, 32+v5, 32+v6 - xxmrghw 32+v5, 32+v5, 32+v6 - xxmrglw 32+v9, 32+v10, 32+v11 - xxmrghw 32+v10, 32+v10, 32+v11 - xxmrglw 32+v13, 32+v14, 32+v15 - xxmrghw 32+v14, 32+v14, 32+v15 - xxmrglw 32+v17, 32+v18, 32+v19 - xxmrghw 32+v18, 32+v18, 32+v19 - vadduwm v4, v4, V_25 - vadduwm v5, v5, V_25 - vadduwm v9, v9, V_25 - vadduwm v10, v10, V_25 - vadduwm v13, v13, V_25 - vadduwm v14, v14, V_25 - vadduwm v17, v17, V_25 - vadduwm v18, v18, V_25 + vmulosh 6, vdata_brt1, V20159 + vmulesh 5, vdata_brt1, V20159 + vmulosh 11, vdata_brt2, V20159 + vmulesh 10, vdata_brt2, V20159 + vmulosh 15, vdata_brt3, V20159 + vmulesh 14, vdata_brt3, V20159 + vmulosh 19, vdata_brt4, V20159 + vmulesh 18, vdata_brt4, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V2pw25 + vadduwm 5, 5, V2pw25 + vadduwm 9, 9, V2pw25 + vadduwm 10, 10, V2pw25 + vadduwm 13, 13, V2pw25 + vadduwm 14, 14, V2pw25 + vadduwm 17, 17, V2pw25 + vadduwm 18, 18, V2pw25 /* Right shift and pack lower halfword, results bond to 2^16 in abs value */ - vsraw v4, v4, V_26 - vsraw v5, v5, V_26 - vsraw v9, v9, V_26 - vsraw v10, v10, V_26 - vsraw v13, v13, V_26 - vsraw v14, v14, V_26 - vsraw v17, v17, V_26 - vsraw v18, v18, V_26 - vpkuwum v4, v5, v4 - vsubuhm v4, v7, v4 - vpkuwum v9, v10, v9 - vsubuhm v9, v7, v9 - vpkuwum v13, v14, v13 - vsubuhm v13, v7, v13 - vpkuwum v17, v18, v17 - vsubuhm v17, v7, v17 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + 
vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 /* Modulo multify-Low unsigned halfword; results bond to 2^16 * q in abs value. */ - vmladduhm \_v0, v4, V_MKQ, v8 - vmladduhm \_v1, v9, V_MKQ, v12 - vmladduhm \_v2, v13, V_MKQ, v16 - vmladduhm \_v3, v17, V_MKQ, v20 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm /* @@ -331,32 +353,32 @@ */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 /* Modular multification bond by 2^16 * q in abs value */ - vmladduhm v15, v25, \_vz0, v3 - vmladduhm v20, v26, \_vz1, v3 - vmladduhm v27, v30, \_vz2, v3 - vmladduhm v28, v31, \_vz3, v3 + vmladduhm 15, vdata_mont1, \_vz0, 3 + vmladduhm 20, vdata_mont2, \_vz1, 3 + vmladduhm 27, vdata_mont3, \_vz2, 3 + vmladduhm 28, vdata_mont4, \_vz3, 3 /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs v14, v25, \_vz0, v3 - vmhraddshs v19, v26, \_vz1, v3 - vmhraddshs v24, v30, \_vz2, v3 - vmhraddshs v29, v31, \_vz3, v3 + vmhraddshs 14, vdata_mont1, \_vz0, 3 + vmhraddshs 19, vdata_mont2, \_vz1, 3 + vmhraddshs 24, vdata_mont3, \_vz2, 3 + vmhraddshs 29, vdata_mont4, \_vz3, 3 - vmladduhm v15, v15, V_QINV, v3 - vmladduhm v20, v20, V_QINV, v3 - vmladduhm v25, v27, V_QINV, v3 - vmladduhm v30, v28, V_QINV, v3 + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 - vmhraddshs v15, v15, V_NMKQ, v14 - vmhraddshs v20, v20, V_NMKQ, v19 - vmhraddshs v25, v25, V_NMKQ, v24 - vmhraddshs v30, v30, V_NMKQ, v29 + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 /* Shift right 1 bit */ - vsrah \_vo0, v15, v4 - vsrah \_vo1, v20, v4 - vsrah \_vo2, v25, v4 - vsrah 
\_vo3, v30, v4 + vsrah \_vo0, 15, 4 + vsrah \_vo1, 20, 4 + vsrah \_vo2, 25, 4 + vsrah \_vo3, 30, 4 .endm /* @@ -364,21 +386,21 @@ * V_NMKQ, V_QINV, Zero vector, One vector */ .macro Set_mont_consts - xxlor 32+v5, vs0, vs0 /* V_NMKQ */ - xxlor 32+v2, vs2, vs2 /* V_QINV */ - xxlor 32+v3, vs3, vs3 /* all 0 */ - xxlor 32+v4, vs4, vs4 /* all 1 */ + xxlor 32+5, 0, 0 /* V_NMKQ */ + xxlor 32+2, 2, 2 /* V_QINV */ + xxlor 32+3, 3, 3 /* all 0 */ + xxlor 32+4, 4, 4 /* all 1 */ .endm .macro Load_next_4zetas - li r8, 16 - li r11, 32 - li r12, 48 - lxvd2x 32+V_Z0, 0, r14 - lxvd2x 32+V_Z1, r8, r14 - lxvd2x 32+V_Z2, r11, r14 - lxvd2x 32+V_Z3, r12, r14 - addi r14, r14, 64 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 8, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 .endm /* @@ -393,38 +415,38 @@ .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, r3, r9 - stxvd2x \_vs1, r3, r16 - stxvd2x \_vs2, r3, r18 - stxvd2x \_vs3, r3, r20 + stxvd2x \_vs0, 3, 9 + stxvd2x \_vs1, 3, 16 + stxvd2x \_vs2, 3, 18 + stxvd2x \_vs3, 3, 20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, r3, r10 - stxvd2x \_vs1, r3, r17 - stxvd2x \_vs2, r3, r19 - stxvd2x \_vs3, r3, r21 + stxvd2x \_vs0, 3, 10 + stxvd2x \_vs1, 3, 17 + stxvd2x \_vs2, 3, 19 + stxvd2x \_vs3, 3, 21 .endm .macro Reload_4coeffs - lxvd2x 32+v25, 0, r3 - lxvd2x 32+v26, r10, r3 - lxvd2x 32+v30, r11, r3 - lxvd2x 32+v31, r12, r3 - addi r3, r3, 64 + lxvd2x 32+vdata_mont1, 0, 3 + lxvd2x 32+vdata_mont2, 10, 3 + lxvd2x 32+vdata_mont3, 11, 3 + lxvd2x 32+vdata_mont4, 12, 3 + addi 3, 3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi r3, r3, -128 - stxvd2x \_vs0, 0, r3 - stxvd2x \_vs1, r10, r3 - stxvd2x \_vs2, r11, r3 - stxvd2x \_vs3, r12, r3 - stxvd2x \_vs4, r15, r3 - stxvd2x \_vs5, r16, r3 - stxvd2x \_vs6, r17, r3 - stxvd2x \_vs7, r18, r3 - addi r3, r3, 128 + addi 3, 3, -128 + stxvd2x \_vs0, 0, 3 + stxvd2x \_vs1, 10, 3 + stxvd2x \_vs2, 11, 3 + stxvd2x \_vs3, 
12, 3 + stxvd2x \_vs4, 15, 3 + stxvd2x \_vs5, 16, 3 + stxvd2x \_vs6, 17, 3 + stxvd2x \_vs7, 18, 3 + addi 3, 3, 128 .endm /* @@ -432,26 +454,26 @@ * coefficient array order. */ .macro PermWriteL44 - xxlor 32+v14, vs10, vs10 - xxlor 32+v19, vs11, vs11 - xxlor 32+v24, vs12, vs12 - xxlor 32+v29, vs13, vs13 - xxpermdi 32+v10, 32+v14, 32+v13, 3 - xxpermdi 32+v11, 32+v14, 32+v13, 0 - xxpermdi 32+v12, 32+v19, 32+v18, 3 - xxpermdi 32+v13, 32+v19, 32+v18, 0 - xxpermdi 32+v14, 32+v24, 32+v23, 3 - xxpermdi 32+v15, 32+v24, 32+v23, 0 - xxpermdi 32+v16, 32+v29, 32+v28, 3 - xxpermdi 32+v17, 32+v29, 32+v28, 0 - stxvd2x 32+v10, 0, r5 - stxvd2x 32+v11, r10, r5 - stxvd2x 32+v12, r11, r5 - stxvd2x 32+v13, r12, r5 - stxvd2x 32+v14, r15, r5 - stxvd2x 32+v15, r16, r5 - stxvd2x 32+v16, r17, r5 - stxvd2x 32+v17, r18, r5 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+14, 32+vresult_mont1, 3 + xxpermdi 32+11, 32+14, 32+vresult_mont1, 0 + xxpermdi 32+12, 32+19, 32+vresult_mont2, 3 + xxpermdi 32+13, 32+19, 32+vresult_mont2, 0 + xxpermdi 32+14, 32+24, 32+vresult_mont3, 3 + xxpermdi 32+15, 32+24, 32+vresult_mont3, 0 + xxpermdi 32+16, 32+29, 32+vresult_mont4, 3 + xxpermdi 32+17, 32+29, 32+vresult_mont4, 0 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm /* @@ -459,77 +481,123 @@ * coefficient array order. 
*/ .macro PermWriteL24 - xxlor 32+v14, vs10, vs10 - xxlor 32+v19, vs11, vs11 - xxlor 32+v24, vs12, vs12 - xxlor 32+v29, vs13, vs13 - vmrgew v10, v13, v14 - vmrgow v11, v13, v14 - vmrgew v12, v18, v19 - vmrgow v13, v18, v19 - vmrgew v14, v23, v24 - vmrgow v15, v23, v24 - vmrgew v16, v28, v29 - vmrgow v17, v28, v29 - stxvd2x 32+v10, 0, r5 - stxvd2x 32+v11, r10, r5 - stxvd2x 32+v12, r11, r5 - stxvd2x 32+v13, r12, r5 - stxvd2x 32+v14, r15, r5 - stxvd2x 32+v15, r16, r5 - stxvd2x 32+v16, r17, r5 - stxvd2x 32+v17, r18, r5 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, vresult_mont1, 14 + vmrgow 11, vresult_mont1, 14 + vmrgew 12, vresult_mont2, 19 + vmrgow 13, vresult_mont2, 19 + vmrgew 14, vresult_mont3, 24 + vmrgow 15, vresult_mont3, 24 + vmrgew 16, vresult_mont4, 29 + vmrgow 17, vresult_mont4, 29 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm +/* + * INTT layer Len=2. + */ .macro INTT_REDUCE_L24 Load_L24Coeffs Compute_4Coeffs - BREDUCE_4X v4, v9, v13, v17 - xxlor vs10, 32+v4, 32+v4 - xxlor vs11, 32+v9, 32+v9 - xxlor vs12, 32+v13, 32+v13 - xxlor vs13, 32+v17, 32+v17 + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + xxlor 10, 32+vresult_brt1, 32+vresult_brt1 + xxlor 11, 32+vresult_brt2, 32+vresult_brt2 + xxlor 12, 32+vresult_brt3, 32+vresult_brt3 + xxlor 13, 32+vresult_brt4, 32+vresult_brt4 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 PermWriteL24 .endm +/* + * INTT layer Len=4. 
+ */ .macro INTT_REDUCE_L44 Load_L44Coeffs Compute_4Coeffs - BREDUCE_4X v4, v9, v13, v17 - xxlor vs10, 32+v4, 32+v4 - xxlor vs11, 32+v9, 32+v9 - xxlor vs12, 32+v13, 32+v13 - xxlor vs13, 32+v17, 32+v17 + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + xxlor 10, 32+vresult_brt1, 32+vresult_brt1 + xxlor 11, 32+vresult_brt2, 32+vresult_brt2 + xxlor 12, 32+vresult_brt3, 32+vresult_brt3 + xxlor 13, 32+vresult_brt4, 32+vresult_brt4 Set_mont_consts Load_next_4zetas Perm_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 PermWriteL44 .endm +/* + * INTT layer Len=8 and 16. + */ .macro INTT_REDUCE_4X start next Load_4Coeffs \start, \next - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + Write_M4C 32+vresult_mont1, 32+vresult_mont2, 32+vresult_mont3, 32+vresult_mont4 .endm /* - * main operations for intt - * t = r[j]; - * r[j] = barrett_reduce(t + r[j + len]); - * r[j + len] = r[j + len] - t; - * r[j + len] = fqmul(zeta, r[j + len]); + * INTT layer Len=32, 64 and 128. 
*/ +.macro INTT_REDUCE_L567 start next + Load_4Coeffs \start, \next + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 + Set_mont_consts + lvx V_ZETA, 0, 14 + //addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + Write_M4C 32+vresult_mont1, 32+vresult_mont2, 32+vresult_mont3, 32+vresult_mont4 +.endm /* - * mlk_intt_ppc(r) + * mlk_intt_ppc(int16_t *r, int16_t *qdata) + * Compute inverse NTT based on the following 7 layers - + * len = 2, 4, 8, 16, 32, 64, 128 + * + * Each layer compute the coeffients on 2 legs, start and start + len*2 offsets. + * + * leg 1 leg 2 + * ----- ----- + * start start+len*2 + * start+next start+len*2+next + * start+next+next start+len*2+next+next + * start+next+next+next start+len*2+next+next+next + * + * Each computation loads 8 vectors, 4 for each leg. + * The final coefficient (t) from each vector of leg1 and leg2 then do the + * add/sub operations to obtain the final results. + * + * -> leg1 = leg1 + t, leg2 = leg1 - t + * + * The resulting coeffients then store back to each leg's offset. + * + * Each vector has the same corresponding zeta except len=4 and len=2. + * + * len=4 has 4-4 layout which means every 4 16-bit coeffients has the same zeta. + * and len=2 has 2-2-2-2 layout which means every 2 16-bit coeffients has the same zeta. + * e.g. + * coeff vector a1 a2 a3 a4 a5 a6 a7 a8 + * zeta vector z1 z1 z2 z2 z3 z3 z4 z4 + * + * For len=4 and len=2, each vector will get permuted to leg1 and leg2. Zeta is + * pre-arranged for the leg1 and leg2. After the computation, each vector needs + * to transpose back to its original 4-4 or 2-2-2-2 layout. 
*/ .global MLK_ASM_NAMESPACE(intt_ppc) .align 4 @@ -539,93 +607,125 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* init vectors and constants Setup for Montgomery reduce */ - lxvx vs0, 0, r4 + lxvx 0, 0, 4 - li r10, QINV_OFFSET - lxvx 32+V_QINV, r10, r4 - xxlxor 32+v3, 32+v3, 32+v3 - vspltish v4, 1 - xxlor vs2, 32+v2, 32+v2 /* QINV */ - xxlor vs3, 32+v3, 32+v3 /* 0 vector */ - xxlor vs4, 32+v4, 32+v4 /* 1 vector */ + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, 4 + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 /* QINV */ + xxlor 3, 32+3, 32+3 /* 0 vector */ + xxlor 4, 32+4, 32+4 /* 1 vector */ /* Setup for Barrett reduce */ - li r10, Q_OFFSET - li r11, C20159_OFFSET - lxvx vs6, r10, r4 /* V_MKQ */ - lxvx 32+V20159, r11, r4 /* V20159 */ - - vspltisw v8, 13 - vadduwm v8, v8, v8 - xxlor vs8, 32+v8, 32+v8 /* V_26 store at vs8 */ - - vspltisw v9, 1 - vsubuwm v10, v8, v9 /* value 25 */ - vslw v9, v9, v10 - xxlor vs7, 32+v9, 32+v9 /* V_25 syore at vs7 */ - - li r10, 16 - li r11, 32 - li r12, 48 - li r15, 64 - li r16, 80 - li r17, 96 - li r18, 112 + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, 4 /* V_MKQ */ + lxvx 32+V20159, 11, 4 /* V20159 */ + + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 /* V_26 store at vs8 */ + + vspltisw 9, 1 + vsubuwm 10, 8, 9 /* value 25 */ + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 /* V2pw25 store at vs7 */ + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 /* * Montgomery reduce loops with constant 1441 */ - addi r14, r4, C1441_OFFSET - lvx V1441, 0, r14 - li r8, 4 - mtctr r8 + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + li 8, 4 + mtctr 8 Set_mont_consts intt_ppc__Loopf: Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, v6, v7, v8, v9 + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, v13, v18, v23, v28 - MWrite_8X 32+v6, 32+v7, 32+v8, 32+v9, 32+v13, 32+v18, 32+v23, 32+v28 + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + 
MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 bdnz intt_ppc__Loopf - addi r3, r3, -512 + addi 3, 3, -512 .align 4 /* * 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - * Update zetas vectors, each vector has 2 zetas - * Load zeta array in 2-2-2-2 layout + * Update zetas vectors, each vector has 2 zetas + * Load zeta vectors in 2-2-2-2 layout + * + * Compute coefficients of the NTT based on the following sequences, + * 0, 1, 2, 3, 4, 5, 6, 7 + * 8, 9, 10, 11, 12, 13, 14, 15 + * ... + * 240, 241, 242, 243, 244, 245, 246, 247 + * 248, 249, 250, 251, 252, 253, 254, 255 + * + * These are indexes to the 16 bits array. Each loads 4 vectors. */ - addi r14, r4, ZETA_INTT_OFFSET - li r7, 4 /* len * 2 */ - li r8, 4 - mtctr r8 - mr r5, r3 -intt_ppc__Loop2: + addi 14, 4, ZETA_INTT_OFFSET + li 7, 4 /* len * 2 */ + mr 5, 3 + + INTT_REDUCE_L24 + addi 5, 5, 128 INTT_REDUCE_L24 - addi r5, r5, 128 - bdnz intt_ppc__Loop2 + addi 5, 5, 128 + INTT_REDUCE_L24 + addi 5, 5, 128 + INTT_REDUCE_L24 + addi 5, 5, 128 .align 4 /* * 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - * Load zeta array in 4-4 layout + * Load zeta vectors in 4-4 layout + * + * Compute coefficients of the NTT based on the following sequences, + * 0, 1, 2, 3, 4, 5, 6, 7 + * 8, 9, 10, 11, 12, 13, 14, 15 + * ... + * 240, 241, 242, 243, 244, 245, 246, 247 + * 248, 249, 250, 251, 252, 253, 254, 255 + * + * These are indexes to the 16 bits array. Each loads 4 vectors. */ - mr r5, r3 - li r7, 8 - li r8, 4 - mtctr r8 -intt_ppc__Loop4: + mr 5, 3 + li 7, 8 + + INTT_REDUCE_L44 + addi 5, 5, 128 + INTT_REDUCE_L44 + addi 5, 5, 128 + INTT_REDUCE_L44 + addi 5, 5, 128 INTT_REDUCE_L44 - addi r5, r5, 128 - bdnz intt_ppc__Loop4 + addi 5, 5, 128 .align 4 /* * 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 8 + * 64 - 72 + * 128 - 136 + * 192 - 200 + * + * These are indexes to the 16 bits array */ - li r7, 16 + li 7, 16 INTT_REDUCE_4X 0, 32 INTT_REDUCE_4X 128, 32 @@ -635,138 +735,89 @@ intt_ppc__Loop4: .align 4 /* * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 16 + * 8 - 24 + * 128 - 144 + * 136 - 152 + * + * These are indexes to the 16 bits array */ - li r7, 32 + li 7, 32 INTT_REDUCE_4X 0, 64 - addi r14, r14, -64 + addi 14, 14, -64 INTT_REDUCE_4X 16, 64 INTT_REDUCE_4X 256, 64 - addi r14, r14, -64 + addi 14, 14, -64 INTT_REDUCE_4X 272, 64 .align 4 /* * 5. len = 32, start = 0, 64, 128, 192 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 32 + * 64 - 96 + * 128 - 160 + * 192 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 64 + li 7, 64 - Load_4Coeffs 0, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 128, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 256, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 384, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 
+ INTT_REDUCE_L567 0, 16 + addi 14, 14, 16 + INTT_REDUCE_L567 128, 16 + addi 14, 14, 16 + INTT_REDUCE_L567 256, 16 + addi 14, 14, 16 + INTT_REDUCE_L567 384, 16 + addi 14, 14, 16 .align 4 /* * 6. len = 64, start = 0, 128 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 64 + * 32 - 96 + * 128 - 192 + * 160 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 128 - Load_4Coeffs 0, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + li 7, 128 - Load_4Coeffs 64, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 256, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 320, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + INTT_REDUCE_L567 0, 16 + INTT_REDUCE_L567 64, 16 + addi 14, 14, 16 + INTT_REDUCE_L567 256, 16 + INTT_REDUCE_L567 320, 16 + addi 14, 14, 16 .align 4 /* * 7. 
len = 128, start = 0 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 128 + * 32 - 160 + * 64 - 192 + * 96 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 256 /* len*2 */ - - Load_4Coeffs 0, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - xxlor vs9, 32+V_ZETA, 32+V_ZETA - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 64, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - xxlor 32+V_ZETA, vs9, vs9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + li 7, 256 /* len*2 */ - Load_4Coeffs 128, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - xxlor 32+V_ZETA, vs9, vs9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 192, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - xxlor 32+V_ZETA, vs9, vs9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + INTT_REDUCE_L567 0, 16 + INTT_REDUCE_L567 64, 16 + INTT_REDUCE_L567 128, 16 + INTT_REDUCE_L567 192, 16 RESTORE_REGS blr @@ -774,7 +825,6 @@ intt_ppc__Loop4: /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ #undef V20159 -#undef V_25 #undef V_26 #undef V_MKQ #undef V_QINV diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index beee949702..32bfa56fdf 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -25,82 +25,100 @@ #define V_Z3 10 #define V_ZETA 10 +#define vdata_a1 12 +#define vdata_a2 17 +#define vdata_a3 22 +#define vdata_a4 27 +#define vdata_b1 13 +#define vdata_b2 18 +#define vdata_b3 23 +#define vdata_b4 28 + +#define vresult_a1 15 +#define vresult_b1 16 +#define vresult_a2 20 +#define vresult_b2 21 +#define vresult_a3 25 +#define vresult_b3 26 +#define vresult_a4 30 +#define vresult_b4 31 + .machine "any" .text .macro SAVE_REGS - stdu r1, -352(r1) - mflr r0 - std r14, 56(r1) - std r15, 64(r1) - std r16, 72(r1) - std r17, 80(r1) - std r18, 88(r1) - std r19, 96(r1) - std r20, 104(r1) - std r21, 112(r1) - li r10, 128 - li r11, 144 - li r12, 160 - li r14, 176 - li r15, 192 - li r16, 208 - stxvx 32+v20, r10, r1 - stxvx 32+v21, r11, r1 - stxvx 32+v22, r12, r1 - stxvx 32+v23, r14, r1 - stxvx 32+v24, r15, r1 - stxvx 32+v25, r16, r1 - li r10, 224 - li r11, 240 - li r12, 256 - li r14, 272 - li r15, 288 - li r16, 304 - stxvx 32+v26, r10, r1 - stxvx 32+v27, r11, r1 - stxvx 32+v28, r12, r1 - stxvx 32+v29, r14, r1 - stxvx 32+v30, r15, r1 - stxvx 32+v31, r16, r1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li r10, 128 - li r11, 144 - li r12, 160 - li r14, 176 - li r15, 192 - li r16, 208 - lxvx 
32+v20, r10, r1 - lxvx 32+v21, r11, r1 - lxvx 32+v22, r12, r1 - lxvx 32+v23, r14, r1 - lxvx 32+v24, r15, r1 - lxvx 32+v25, r16, r1 - li r10, 224 - li r11, 240 - li r12, 256 - li r14, 272 - li r15, 288 - li r16, 304 - lxvx 32+v26, r10, r1 - lxvx 32+v27, r11, r1 - lxvx 32+v28, r12, r1 - lxvx 32+v29, r14, r1 - lxvx 32+v30, r15, r1 - lxvx 32+v31, r16, r1 - ld r14, 56(r1) - ld r15, 64(r1) - ld r16, 72(r1) - ld r17, 80(r1) - ld r18, 88(r1) - ld r19, 96(r1) - ld r20, 104(r1) - ld r21, 112(r1) - - mtlr r0 - addi r1, r1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 .endm /* @@ -125,14 +143,14 @@ * */ .macro Init_Coeffs_offset start next - li r9, \start /* first offset to j */ - add r10, r7, r9 /* J + len*2 */ - addi r16, r9, \next - addi r17, r10, \next - addi r18, r16, \next - addi r19, r17, \next - addi r20, r18, \next - addi r21, r19, \next + li 9, \start /* first offset to j */ + add 10, 7, 9 /* J + len*2 */ + addi 16, 9, \next + addi 17, 10, \next + addi 18, 16, \next + addi 19, 17, \next + addi 20, 18, \next + addi 21, 19, \next .endm /* @@ -140,10 +158,10 @@ * r[j+len]: V13, V18, V23, V28 */ .macro Load_4Rjp - lxvd2x 32+v13, r3, r10 /* V13: vector r'0 */ - lxvd2x 32+v18, r3, r17 /* V18: vector for r'1 */ - lxvd2x 32+v23, r3, r19 /* V23: vector for r'2 */ - lxvd2x 32+v28, r3, r21 /* V28: vector for r'3 */ + lxvd2x 32+vdata_b1, 3, 10 /* V13: vector r'0 */ + lxvd2x 32+vdata_b2, 3, 17 /* V18: vector for r'1 */ + lxvd2x 32+vdata_b3, 3, 19 /* V23: vector for r'2 */ + lxvd2x 
32+vdata_b4, 3, 21 /* V28: vector for r'3 */ .endm /* @@ -174,22 +192,22 @@ * in the proper order to match the multiplication. */ .macro Load_L24Coeffs - lxvd2x 32+v25, 0, r5 - lxvd2x 32+v26, r10, r5 - vmrgew v13, v25, v26 - vmrgow v12, v25, v26 - lxvd2x 32+v25, r11, r5 - lxvd2x 32+v26, r12, r5 - vmrgew v18, v25, v26 - vmrgow v17, v25, v26 - lxvd2x 32+v25, r15, r5 - lxvd2x 32+v26, r16, r5 - vmrgew v23, v25, v26 - vmrgow v22, v25, v26 - lxvd2x 32+v25, r17, r5 - lxvd2x 32+v26, r18, r5 - vmrgew v28, v25, v26 - vmrgow v27, v25, v26 + lxvd2x 32+25, 0, 5 + lxvd2x 32+26, 10, 5 + vmrgew vdata_b1, 25, 26 + vmrgow vdata_a1, 25, 26 + lxvd2x 32+25, 11, 5 + lxvd2x 32+26, 12, 5 + vmrgew vdata_b2, 25, 26 + vmrgow vdata_a2, 25, 26 + lxvd2x 32+25, 15, 5 + lxvd2x 32+26, 16, 5 + vmrgew vdata_b3, 25, 26 + vmrgow vdata_a3, 25, 26 + lxvd2x 32+25, 17, 5 + lxvd2x 32+26, 18, 5 + vmrgew vdata_b4, 25, 26 + vmrgow vdata_a4, 25, 26 .endm /* @@ -208,22 +226,22 @@ * in the proper order to match the multiplication. */ .macro Load_L44Coeffs - lxvd2x vs1, 0, r5 - lxvd2x vs2, r10, r5 - xxpermdi 32+v13, vs2, vs1, 3 - xxpermdi 32+v12, vs2, vs1, 0 - lxvd2x vs3, r11, r5 - lxvd2x vs4, r12, r5 - xxpermdi 32+v18, vs4, vs3, 3 - xxpermdi 32+v17, vs4, vs3, 0 - lxvd2x vs1, r15, r5 - lxvd2x vs2, r16, r5 - xxpermdi 32+v23, vs2, vs1, 3 - xxpermdi 32+v22, vs2, vs1, 0 - lxvd2x vs3, r17, r5 - lxvd2x vs4, r18, r5 - xxpermdi 32+v28, vs4, vs3, 3 - xxpermdi 32+v27, vs4, vs3, 0 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxpermdi 32+vdata_b1, 2, 1, 3 + xxpermdi 32+vdata_a1, 2, 1, 0 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxpermdi 32+vdata_b2, 4, 3, 3 + xxpermdi 32+vdata_a2, 4, 3, 0 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxpermdi 32+vdata_b3, 2, 1, 3 + xxpermdi 32+vdata_a3, 2, 1, 0 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxpermdi 32+vdata_b4, 4, 3, 3 + xxpermdi 32+vdata_a4, 4, 3, 0 .endm /* @@ -237,32 +255,32 @@ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 /* fqmul = zeta * coefficient Modular multification bond by 2^16 * q in abs value */ 
- vmladduhm v15, v13, \_vz0, v3 - vmladduhm v20, v18, \_vz1, v3 - vmladduhm v25, v23, \_vz2, v3 - vmladduhm v30, v28, \_vz3, v3 + vmladduhm 15, vdata_b1, \_vz0, 3 + vmladduhm 20, vdata_b2, \_vz1, 3 + vmladduhm 25, vdata_b3, \_vz2, 3 + vmladduhm 30, vdata_b4, \_vz3, 3 /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs v14, v13, \_vz0, v3 - vmhraddshs v19, v18, \_vz1, v3 - vmhraddshs v24, v23, \_vz2, v3 - vmhraddshs v29, v28, \_vz3, v3 + vmhraddshs 14, vdata_b1, \_vz0, 3 + vmhraddshs 19, vdata_b2, \_vz1, 3 + vmhraddshs 24, vdata_b3, \_vz2, 3 + vmhraddshs 29, vdata_b4, \_vz3, 3 - vmladduhm v15, v15, V_QINV, v3 - vmladduhm v20, v20, V_QINV, v3 - vmladduhm v25, v25, V_QINV, v3 - vmladduhm v30, v30, V_QINV, v3 + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 - vmhraddshs v15, v15, V_NMKQ, v14 - vmhraddshs v20, v20, V_NMKQ, v19 - vmhraddshs v25, v25, V_NMKQ, v24 - vmhraddshs v30, v30, V_NMKQ, v29 + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 /* Shift right 1 bit */ - vsrah v13, v15, v4 - vsrah v18, v20, v4 - vsrah v23, v25, v4 - vsrah v28, v30, v4 + vsrah vdata_b1, 15, 4 + vsrah vdata_b2, 20, 4 + vsrah vdata_b3, 25, 4 + vsrah vdata_b4, 30, 4 .endm /* @@ -271,10 +289,10 @@ * r[j]: V12, V17, V22, V27 */ .macro Load_4Rj - lxvd2x 32+v12, r3, r9 /* V12: vector r0 */ - lxvd2x 32+v17, r3, r16 /* V17: vector r1 */ - lxvd2x 32+v22, r3, r18 /* V22: vector r2 */ - lxvd2x 32+v27, r3, r20 /* V27: vector r3 */ + lxvd2x 32+vdata_a1, 3, 9 /* V12: vector r0 */ + lxvd2x 32+vdata_a2, 3, 16 /* V17: vector r1 */ + lxvd2x 32+vdata_a3, 3, 18 /* V22: vector r2 */ + lxvd2x 32+vdata_a4, 3, 20 /* V27: vector r3 */ .endm /* @@ -289,25 +307,25 @@ r[j] = r[j] + t. 
r[j+len] = r[j] - t */ - vsubuhm v16, v12, v13 - vadduhm v15, v13, v12 - vsubuhm v21, v17, v18 - vadduhm v20, v18, v17 - vsubuhm v26, v22, v23 - vadduhm v25, v23, v22 - vsubuhm v31, v27, v28 - vadduhm v30, v28, v27 + vsubuhm vresult_b1, vdata_a1, vdata_b1 + vadduhm vresult_a1, vdata_b1, vdata_a1 + vsubuhm vresult_b2, vdata_a2, vdata_b2 + vadduhm vresult_a2, vdata_b2, vdata_a2 + vsubuhm vresult_b3, vdata_a3, vdata_b3 + vadduhm vresult_a3, vdata_b3, vdata_a3 + vsubuhm vresult_b4, vdata_a4, vdata_b4 + vadduhm vresult_a4, vdata_b4, vdata_a4 .endm .macro Write_One - stxvd2x 32+v15, r3, r9 - stxvd2x 32+v16, r3, r10 - stxvd2x 32+v20, r3, r16 - stxvd2x 32+v21, r3, r17 - stxvd2x 32+v25, r3, r18 - stxvd2x 32+v26, r3, r19 - stxvd2x 32+v30, r3, r20 - stxvd2x 32+v31, r3, r21 + stxvd2x 32+vresult_a1, 3, 9 + stxvd2x 32+vresult_b1, 3, 10 + stxvd2x 32+vresult_a2, 3, 16 + stxvd2x 32+vresult_b2, 3, 17 + stxvd2x 32+vresult_a3, 3, 18 + stxvd2x 32+vresult_b3, 3, 19 + stxvd2x 32+vresult_a4, 3, 20 + stxvd2x 32+vresult_b4, 3, 21 .endm /* @@ -316,22 +334,22 @@ */ .macro PermWriteL44 Compute_4Coeffs - xxpermdi vs0, 32+v15, 32+v16, 3 - xxpermdi vs1, 32+v15, 32+v16, 0 - xxpermdi vs2, 32+v20, 32+v21, 3 - xxpermdi vs3, 32+v20, 32+v21, 0 - xxpermdi vs4, 32+v25, 32+v26, 3 - xxpermdi vs5, 32+v25, 32+v26, 0 - xxpermdi vs6, 32+v30, 32+v31, 3 - xxpermdi vs7, 32+v30, 32+v31, 0 - stxvd2x vs0, 0, r5 - stxvd2x vs1, r10, r5 - stxvd2x vs2, r11, r5 - stxvd2x vs3, r12, r5 - stxvd2x vs4, r15, r5 - stxvd2x vs5, r16, r5 - stxvd2x vs6, r17, r5 - stxvd2x vs7, r18, r5 + xxpermdi 0, 32+vresult_a1, 32+vresult_b1, 3 + xxpermdi 1, 32+vresult_a1, 32+vresult_b1, 0 + xxpermdi 2, 32+vresult_a2, 32+vresult_b2, 3 + xxpermdi 3, 32+vresult_a2, 32+vresult_b2, 0 + xxpermdi 4, 32+vresult_a3, 32+vresult_b3, 3 + xxpermdi 5, 32+vresult_a3, 32+vresult_b3, 0 + xxpermdi 6, 32+vresult_a4, 32+vresult_b4, 3 + xxpermdi 7, 32+vresult_a4, 32+vresult_b4, 0 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 
15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 .endm /* @@ -340,33 +358,33 @@ */ .macro PermWriteL24 Compute_4Coeffs - vmrgew v10, v16, v15 - vmrgow v11, v16, v15 - vmrgew v12, v21, v20 - vmrgow v13, v21, v20 - vmrgew v14, v26, v25 - vmrgow v15, v26, v25 - vmrgew v16, v31, v30 - vmrgow v17, v31, v30 - stxvd2x 32+v10, 0, r5 - stxvd2x 32+v11, r10, r5 - stxvd2x 32+v12, r11, r5 - stxvd2x 32+v13, r12, r5 - stxvd2x 32+v14, r15, r5 - stxvd2x 32+v15, r16, r5 - stxvd2x 32+v16, r17, r5 - stxvd2x 32+v17, r18, r5 + vmrgew 10, vresult_b1, vresult_a1 + vmrgow 11, vresult_b1, vresult_a1 + vmrgew 12, vresult_b2, vresult_a2 + vmrgow 13, vresult_b2, vresult_a2 + vmrgew 14, vresult_b3, vresult_a3 + vmrgow 15, vresult_b3, vresult_a3 + vmrgew 16, vresult_b4, vresult_a4 + vmrgow 17, vresult_b4, vresult_a4 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro Load_next_4zetas - li r10, 16 - li r11, 32 - li r12, 48 - lxvd2x 32+V_Z0, 0, r14 - lxvd2x 32+V_Z1, r10, r14 - lxvd2x 32+V_Z2, r11, r14 - lxvd2x 32+V_Z3, r12, r14 - addi r14, r14, 64 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 10, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 .endm /* @@ -389,7 +407,39 @@ .endm /* - * mlk_ntt_ppc(int16_t *r) + * mlk_ntt_ppc(int16_t *r, int16_t *qdata) + * Compute forward NTT based on the following 7 layers - + * len = 128, 64, 32, 16, 8, 4, 2. + * + * Each layer compute the coeffients on 2 legs, start and start + len*2 offsets. + * + * leg 1 leg 2 + * ----- ----- + * start start+len*2 + * start+next start+len*2+next + * start+next+next start+len*2+next+next + * start+next+next+next start+len*2+next+next+next + * + * Each computation loads 8 vectors, 4 for each leg. 
+ * The final coefficient (t) from each vector of leg1 and leg2 then do the + * add/sub operations to obtain the final results. + * + * -> leg1 = leg1 + t, leg2 = leg1 - t + * + * The resulting coeffients then store back to each leg's offset. + * + * Each vector has the same corresponding zeta except len=4 and len=2. + * + * len=4 has 4-4 layout which means every 4 16-bit coeffients has the same zeta. + * and len=2 has 2-2-2-2 layout which means every 2 16-bit coeffients has the same zeta. + * e.g. + * coeff vector a1 a2 a3 a4 a5 a6 a7 a8 + * zeta vector z1 z1 z2 z2 z3 z3 z4 z4 + * + * For len=4 and len=2, each vector will get permuted to leg1 and leg2. Zeta is + * pre-arranged for the leg1 and leg2. After the computation, each vector needs + * to transpose back to its original 4-4 or 2-2-2-2 layout. + * */ .global MLK_ASM_NAMESPACE(ntt_ppc) .align 4 @@ -398,27 +448,32 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) SAVE_REGS /* load MLKEM_Q */ - lvx V_NMKQ,0,r4 + lvx V_NMKQ,0,4 /* Register 14 as pointer to zetas array */ - addi r14, r4, ZETA_NTT_OFFSET + addi 14, 4, ZETA_NTT_OFFSET - vxor v3, v3, v3 - vspltish v4, 1 + vxor 3, 3, 3 + vspltish 4, 1 - li r10, QINV_OFFSET - lvx V_QINV, r10, r4 + li 10, QINV_OFFSET + lvx V_QINV, 10, 4 .align 4 /* - * Compute coefficients of the NTT based on the following loop. - * for (len = 128; len ≥ 2; len = len/2) - * * 1. len = 128, start = 0 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 128 + * 32 - 160 + * 64 - 192 + * 96 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 256 /* len * 2 */ - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + li 7, 256 /* len * 2 */ + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -428,47 +483,68 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) .align 4 /* * 2. 
len = 64, start = 0, 128 - * k += 2 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 64 + * 32 - 96 + * 128 - 192 + * 160 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 128 - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 /* * 3. len = 32, start = 0, 64, 128, 192 - * k += 4 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 32 + * 64 - 96 + * 128 - 160 + * 192 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 64 - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 /* - * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - * k += 8 + * 4. len = 16, start = 0, 8, 128, 136 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 16 + * 8 - 24 + * 128 - 144 + * 136 - 152 + * + * These are indexes to the 16 bits array */ - li r7, 32 + li 7, 32 Load_next_4zetas NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 @@ -479,10 +555,17 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) .align 4 /* - * 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - * k += 16 + * 5. 
len = 8, start = 0, 64, 128, 192 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 8 + * 64 - 72 + * 128 - 136 + * 192 - 200 + * + * These are indexes to the 16 bits array */ - li r7, 16 + li 7, 16 Load_next_4zetas NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 @@ -497,21 +580,29 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) /* * 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - * k += 32 - * Load zeta vectors in 4-4 layout + * Load zeta vectors in 4-4 layout + * + * Compute coefficients of the NTT based on the following sequences, + * 0, 1, 2, 3, 4, 5, 6, 7 + * 8, 9, 10, 11, 12, 13, 14, 15 + * ... + * 240, 241, 242, 243, 244, 245, 246, 247 + * 248, 249, 250, 251, 252, 253, 254, 255 + * + * These are indexes to the 16 bits array. Each loads 4 vectors. */ - li r15, 4 - mtctr r15 - mr r5, r3 /* Let r5 points to coefficient array */ - li r7, 8 - - li r10, 16 - li r11, 32 - li r12, 48 - li r15, 64 - li r16, 80 - li r17, 96 - li r18, 112 + li 15, 4 + mtctr 15 + mr 5, 3 /* Let r5 points to coefficient array */ + li 7, 8 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 .align 4 ntt_ppc__Len4: @@ -520,20 +611,28 @@ ntt_ppc__Len4: Load_L44Coeffs MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL44 - addi r5, r5, 128 + addi 5, 5, 128 bdnz ntt_ppc__Len4 /* * 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - * k += 64 - * Load zeta vectors in 2-2-2-2 layout + * Load zeta vectors in 2-2-2-2 layout + * + * Compute coefficients of the NTT based on the following sequences, + * 0, 1, 2, 3, 4, 5, 6, 7 + * 8, 9, 10, 11, 12, 13, 14, 15 + * ... + * 240, 241, 242, 243, 244, 245, 246, 247 + * 248, 249, 250, 251, 252, 253, 254, 255 + * + * These are indexes to the 16 bits array. Each loads 4 vectors. 
*/ - li r8, 4 - mtctr r8 - mr r5, r3 /* Let r5 points to coefficient array */ - li r7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 /* Let r5 points to coefficient array */ + li 7, 4 .align 4 ntt_ppc__Len2: @@ -541,7 +640,7 @@ ntt_ppc__Len2: Load_L24Coeffs MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL24 - addi r5, r5, 128 + addi 5, 5, 128 bdnz ntt_ppc__Len2 diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index 4d16be6f05..354474d071 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -40,141 +40,141 @@ * MREDUCE_4X(_v0, _v1, _v2, _v3) */ .macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+v13, 0, r3 - addi r3, r3, 16 - lxvd2x 32+v18, 0, r3 - addi r3, r3, 16 - lxvd2x 32+v23, 0, r3 - addi r3, r3, 16 - lxvd2x 32+v7, 0, r3 - addi r3, r3, 16 - - vmladduhm v15, v13, V1353, v3 - vmladduhm v20, v18, V1353, v3 - vmladduhm v25, v23, V1353, v3 - vmladduhm v9, v7, V1353, v3 - - vmhraddshs v14, v13, V1353, v3 - vmhraddshs v19, v18, V1353, v3 - vmhraddshs v24, v23, V1353, v3 - vmhraddshs v8, v7, V1353, v3 - - vmladduhm v15, v15, V_QINV, v3 - vmladduhm v20, v20, V_QINV, v3 - vmladduhm v25, v25, V_QINV, v3 - vmladduhm v9, v9, V_QINV, v3 - - vmhraddshs v15, v15, V_NMKQ, v14 - vmhraddshs v20, v20, V_NMKQ, v19 - vmhraddshs v25, v25, V_NMKQ, v24 - vmhraddshs v9, v9, V_NMKQ, v8 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 /* Shift right 1 bit */ - vsrah \_v0, v15, v4 - vsrah 
\_v1, v20, v4 - vsrah \_v2, v25, v4 - vsrah \_v3, v9, v4 + vsrah \_v0, 15, 4 + vsrah \_v1, 20, 4 + vsrah \_v2, 25, 4 + vsrah \_v3, 9, 4 .endm .macro Write_8X - stxvd2x 32+v27, r4, r3 - stxvd2x 32+v28, r5, r3 - stxvd2x 32+v29, r6, r3 - stxvd2x 32+v30, r7, r3 - stxvd2x 32+v13, r8, r3 - stxvd2x 32+v18, r9, r3 - stxvd2x 32+v23, r10, r3 - stxvd2x 32+v7, r11, r3 + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu r1, -320(r1) - mflr r0 - - li r6, 128 - li r7, 144 - li r8, 160 - li r9, 176 - li r10, 192 - li r11, 208 - li r12, 224 - stxvx 32+v20, r6, r1 - stxvx 32+v21, r7, r1 - stxvx 32+v22, r8, r1 - stxvx 32+v23, r9, r1 - stxvx 32+v24, r10, r1 - stxvx 32+v25, r11, r1 - stxvx 32+v26, r12, r1 - li r6, 240 - li r7, 256 - li r8, 272 - li r9, 288 - stxvx 32+v27, r6, r1 - stxvx 32+v28, r7, r1 - stxvx 32+v29, r8, r1 - stxvx 32+v30, r9, r1 - - li r6, NQ_OFFSET - li r7, QINV_OFFSET - li r8, C1353_OFFSET - lxvx 32+V_NMKQ, r6, r4 - lxvx 32+V_QINV, r7, r4 - lxvx 32+V1353, r8, r4 - - vxor v3, v3, v3 - vspltish v4, 1 - - li r4, -128 - li r5, -112 - li r6, -96 - li r7, -80 - li r8, -64 - li r9, -48 - li r10, -32 - li r11, -16 - - MREDUCE_4X v27, v28, v29, v30 - MREDUCE_4X v13, v18, v23, v7 + stdu 1, -320(1) + mflr 0 + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, 
-96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 Write_8X - MREDUCE_4X v27, v28, v29, v30 - MREDUCE_4X v13, v18, v23, v7 + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 Write_8X - MREDUCE_4X v27, v28, v29, v30 - MREDUCE_4X v13, v18, v23, v7 + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 Write_8X - MREDUCE_4X v27, v28, v29, v30 - MREDUCE_4X v13, v18, v23, v7 + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 Write_8X - li r6, 128 - li r7, 144 - li r8, 160 - li r9, 176 - li r10, 192 - li r11, 208 - li r12, 224 - lxvx 32+v20, r6, r1 - lxvx 32+v21, r7, r1 - lxvx 32+v22, r8, r1 - lxvx 32+v23, r9, r1 - lxvx 32+v24, r10, r1 - lxvx 32+v25, r11, r1 - lxvx 32+v26, r12, r1 - li r6, 240 - li r7, 256 - li r8, 272 - li r9, 288 - lxvx 32+v27, r6, r1 - lxvx 32+v28, r7, r1 - lxvx 32+v29, r8, r1 - lxvx 32+v30, r9, r1 - mtlr r0 - addi r1, r1, 320 + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 + mtlr 0 + addi 1, 1, 320 blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index 691ce3970c..084ae5959d 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -34,168 +34,168 @@ .text .macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+v8, 0, r3 - lxvd2x 32+v12, r14, r3 - lxvd2x 32+v16, r15, r3 - lxvd2x 32+v20, r16, r3 - addi r3, r3, 64 - vmulosh v6, v8, V20159 - vmulesh v5, v8, V20159 - vmulosh v11, v12, V20159 - vmulesh v10, v12, V20159 - vmulosh v15, v16, V20159 - vmulesh v14, v16, V20159 - vmulosh v19, v20, V20159 - vmulesh v18, v20, V20159 - xxmrglw 32+v4, 32+v5, 32+v6 - xxmrghw 32+v5, 32+v5, 32+v6 - xxmrglw 32+v9, 32+v10, 32+v11 - xxmrghw 32+v10, 32+v10, 32+v11 - xxmrglw 32+v13, 32+v14, 32+v15 - xxmrghw 32+v14, 32+v14, 32+v15 - xxmrglw 32+v17, 32+v18, 32+v19 - xxmrghw 32+v18, 32+v18, 32+v19 - vadduwm v4, v4, V_25 - vadduwm v5, v5, V_25 - vadduwm v9, v9, V_25 - vadduwm v10, v10, V_25 - vadduwm v13, v13, V_25 - vadduwm v14, v14, V_25 - vadduwm v17, v17, V_25 - vadduwm v18, v18, V_25 - vsraw v4, v4, V_26 - vsraw v5, v5, V_26 - vsraw v9, v9, V_26 - vsraw v10, v10, V_26 - vsraw v13, v13, V_26 - vsraw v14, v14, V_26 - vsraw v17, v17, V_26 - vsraw v18, v18, V_26 - vpkuwum v4, v5, v4 - vsubuhm v4, v7, v4 - vpkuwum v9, v10, v9 - vsubuhm v9, v7, v9 - vpkuwum v13, v14, v13 - vsubuhm v13, v7, v13 - vpkuwum v17, v18, v17 - vsubuhm v17, v7, v17 - vmladduhm \_v0, v4, V_MKQ, v8 - vmladduhm \_v1, v9, V_MKQ, v12 - vmladduhm \_v2, v13, V_MKQ, v16 - vmladduhm \_v3, v17, V_MKQ, v20 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + 
xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm .macro Write_8X - stxvd2x 32+v21, r4, r3 - stxvd2x 32+v22, r5, r3 - stxvd2x 32+v23, r6, r3 - stxvd2x 32+v24, r7, r3 - stxvd2x 32+v4, r8, r3 - stxvd2x 32+v9, r9, r3 - stxvd2x 32+v13, r10, r3 - stxvd2x 32+v17, r11, r3 + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 .endm /* * Conditional addition to get unsigned canonical representative */ .macro To_unsigned_16 - lxvd2x 32+v12, 0, r3 - lxvd2x 32+v13, r14, r3 - lxvd2x 32+v14, r15, r3 - lxvd2x 32+v15, r16, r3 - addi r3, r3, 64 - vsrh v1, v12, v10 - vsrh v0, v13, v10 - vsrh v3, v14, v10 - vsrh v2, v15, v10 - vadduhm v7, v12, v11 - vadduhm v8, v13, v11 - vadduhm v5, v14, v11 - vadduhm v6, v15, v11 - vcmpequh v1, v1, v9 - vcmpequh v0, v0, v9 - vcmpequh v3, v3, v9 - vcmpequh v2, v2, v9 - xxsel 32+v1, 32+v7,32+v12, 32+v1 - xxsel 32+v0, 32+v8,32+v13, 32+v0 - xxsel 32+v3, 32+v5,32+v14, 32+v3 - xxsel 32+v2, 32+v6,32+v15, 32+v2 - stxvd2x 32+v3, r10, r3 - stxvd2x 32+v2, r11, r3 - stxvd2x 32+v1, r8, r3 - stxvd2x 32+v0, r9, r3 + lxvd2x 32+12, 0, 3 + lxvd2x 32+13, 14, 3 + lxvd2x 32+14, 15, 3 + lxvd2x 32+15, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 
6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(reduce_ppc) MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu r1, -224(r1) - mflr r0 - std r14, 96(r1) - std r15, 104(r1) - std r16, 112(r1) - li r6, 128 - li r7, 144 - li r8, 160 - li r9, 176 - li r10, 192 - stxvx 32+v20, r6, r1 - stxvx 32+v21, r7, r1 - stxvx 32+v22, r8, r1 - stxvx 32+v23, r9, r1 - stxvx 32+v24, r10, r1 - - vxor v7, v7, v7 - - li r6, Q_OFFSET - li r7, C20159_OFFSET - lxvx 32+V_MKQ, r6, r4 - lxvx 32+V20159, r7, r4 + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + + vxor 7, 7, 7 + + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 vspltisw V_26, 13 vadduwm V_26, V_26, V_26 - vspltisw v4, 1 - vsubuwm v5, V_26, v4 - vslw V_25, v4, v5 - - li r4, -128 - li r5, -112 - li r6, -96 - li r7, -80 - li r8, -64 - li r9, -48 - li r10, -32 - li r11, -16 - - li r14, 16 - li r15, 32 - li r16, 48 - - BREDUCE_4X v21, v22, v23, v24 - BREDUCE_4X v4, v9, v13, v17 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 Write_8X - BREDUCE_4X v21, v22, v23, v24 - BREDUCE_4X v4, v9, v13, v17 + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 Write_8X - BREDUCE_4X v21, v22, v23, v24 - BREDUCE_4X v4, v9, v13, v17 + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 Write_8X - BREDUCE_4X v21, v22, v23, v24 - BREDUCE_4X v4, v9, v13, v17 + 
BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 Write_8X /* * To unsigned canonical */ .align 4 - addi r3, r3, -512 - vxor v9, v9, v9 - vspltish v10, 15 - vmr v11, V_MKQ + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ To_unsigned_16 To_unsigned_16 @@ -206,21 +206,21 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) To_unsigned_16 To_unsigned_16 - ld r14, 96(r1) - ld r15, 104(r1) - ld r16, 112(r1) - li r6, 128 - li r7, 144 - li r8, 160 - li r9, 176 - li r10, 192 - lxvx 32+v20, r6, r1 - lxvx 32+v21, r7, r1 - lxvx 32+v22, r8, r1 - lxvx 32+v23, r9, r1 - lxvx 32+v24, r10, r1 - mtlr r0 - addi r1, r1, 224 + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + mtlr 0 + addi 1, 1, 224 blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h index 6c59a63b0b..df5d163f78 100644 --- a/mlkem/src/native/ppc64le/src/consts.h +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -19,73 +19,6 @@ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) extern const int16_t mlk_ppc_qdata[]; -#else -#define r0 0 -#define r1 1 -#define r3 3 -#define r4 4 -#define r5 5 -#define r6 6 -#define r7 7 -#define r8 8 -#define r9 9 -#define r10 10 -#define r11 11 -#define r12 12 -#define r14 14 -#define r15 15 -#define r16 16 -#define r17 17 -#define r18 18 -#define r19 19 -#define r20 20 -#define r21 21 -#define v0 0 -#define v1 1 -#define v2 2 -#define v3 3 -#define v4 4 -#define v5 5 -#define v6 6 -#define v7 7 -#define v8 8 -#define v9 9 -#define v10 10 -#define v11 11 -#define v12 12 -#define v13 13 -#define v14 14 -#define v15 15 -#define v16 16 -#define v17 17 -#define v18 18 -#define v19 19 -#define v20 20 -#define v21 21 -#define v22 22 -#define v23 23 -#define v24 24 -#define v25 25 -#define v26 26 -#define v27 27 
-#define v28 28 -#define v29 29 -#define v30 30 -#define v31 31 -#define vs0 0 -#define vs1 1 -#define vs2 2 -#define vs3 3 -#define vs4 4 -#define vs5 5 -#define vs6 6 -#define vs7 7 -#define vs8 8 -#define vs9 9 -#define vs10 10 -#define vs11 11 -#define vs12 12 -#define vs13 13 #endif #endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 946ae12e01..4aab1e7c3e 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -21,7 +21,7 @@ /* Barrett reduce constatnts */ #define V20159 0 -#define V_25 1 +#define V2pw25 1 #define V_26 2 #define V_MKQ 3 @@ -35,101 +35,123 @@ #define V_ZETA 10 #define V1441 10 +#define vdata_a1 21 +#define vdata_a2 22 +#define vdata_a3 23 +#define vdata_a4 24 +#define vdata_b1 8 +#define vdata_b2 12 +#define vdata_b3 16 +#define vdata_b4 20 + +#define vdata_brt1 8 +#define vdata_brt2 12 +#define vdata_brt3 16 +#define vdata_brt4 20 + +#define vdata_mont1 25 +#define vdata_mont2 26 +#define vdata_mont3 30 +#define vdata_mont4 31 + +#define vresult_brt1 4 +#define vresult_brt2 9 +#define vresult_brt3 13 +#define vresult_brt4 17 +#define vresult_mont1 13 +#define vresult_mont2 18 +#define vresult_mont3 23 +#define vresult_mont4 28 + .macro SAVE_REGS - stdu r1, -352(r1) - mflr r0 - std r14, 56(r1) - std r15, 64(r1) - std r16, 72(r1) - std r17, 80(r1) - std r18, 88(r1) - std r19, 96(r1) - std r20, 104(r1) - std r21, 112(r1) - li r10, 128 - li r11, 144 - li r12, 160 - li r14, 176 - li r15, 192 - li r16, 208 - stxvx 32+v20, r10, r1 - stxvx 32+v21, r11, r1 - stxvx 32+v22, r12, r1 - stxvx 32+v23, r14, r1 - stxvx 32+v24, r15, r1 - stxvx 32+v25, r16, r1 - li r10, 224 - li r11, 240 - li r12, 256 - li r14, 272 - li r15, 288 - li r16, 304 - stxvx 32+v26, r10, r1 - stxvx 32+v27, r11, r1 - stxvx 32+v28, r12, r1 - stxvx 32+v29, r14, r1 - stxvx 32+v30, r15, r1 - stxvx 32+v31, r16, r1 + stdu 1, -352(1) + mflr 0 + std 14, 
56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li r10, 128 - li r11, 144 - li r12, 160 - li r14, 176 - li r15, 192 - li r16, 208 - lxvx 32+v20, r10, r1 - lxvx 32+v21, r11, r1 - lxvx 32+v22, r12, r1 - lxvx 32+v23, r14, r1 - lxvx 32+v24, r15, r1 - lxvx 32+v25, r16, r1 - li r10, 224 - li r11, 240 - li r12, 256 - li r14, 272 - li r15, 288 - li r16, 304 - lxvx 32+v26, r10, r1 - lxvx 32+v27, r11, r1 - lxvx 32+v28, r12, r1 - lxvx 32+v29, r14, r1 - lxvx 32+v30, r15, r1 - lxvx 32+v31, r16, r1 - ld r14, 56(r1) - ld r15, 64(r1) - ld r16, 72(r1) - ld r17, 80(r1) - ld r18, 88(r1) - ld r19, 96(r1) - ld r20, 104(r1) - ld r21, 112(r1) - - mtlr r0 - addi r1, r1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 .endm /* - * Compute final final r[j] and r[j+len] - * final r[j+len]: V8, V12, V16, V20 - * final r[j]: V21, V22, V23, V24 + * Compute r[j] and r[j+len] from computed coefficients + * r[j] + r[j+len] : V8, V12, V16, V20 (data for Barett reduce) + * r[j+len] - r[j]: V25, V26, V30, V31 
(data for Montgomery reduce) */ .macro Compute_4Coeffs - /* Since the result of the Montgomery multiplication is bounded - by q in absolute value. - Finally to complete the final update of the results with add/sub - r[j] = r[j] + t. - r[j+len] = r[j] - t - */ - vsubuhm v25, v8, v21 - vsubuhm v26, v12, v22 - vsubuhm v30, v16, v23 - vsubuhm v31, v20, v24 - vadduhm v8, v8, v21 - vadduhm v12, v12, v22 - vadduhm v16, v16, v23 - vadduhm v20, v20, v24 + vsubuhm vdata_mont1, vdata_b1, vdata_a1 + vsubuhm vdata_mont2, vdata_b2, vdata_a2 + vsubuhm vdata_mont3, vdata_b3, vdata_a3 + vsubuhm vdata_mont4, vdata_b4, vdata_a4 + vadduhm vdata_brt1, vdata_b1, vdata_a1 + vadduhm vdata_brt2, vdata_b2, vdata_a2 + vadduhm vdata_brt3, vdata_b3, vdata_a3 + vadduhm vdata_brt4, vdata_b4, vdata_a4 .endm /* @@ -154,14 +176,14 @@ * */ .macro Init_Coeffs_offset start next - li r9, \start /* first offset to j */ - add r10, r7, r9 /* J + len*2 */ - addi r16, r9, \next - addi r17, r10, \next - addi r18, r16, \next - addi r19, r17, \next - addi r20, r18, \next - addi r21, r19, \next + li 9, \start /* first offset to j */ + add 10, 7, 9 /* J + len*2 */ + addi 16, 9, \next + addi 17, 10, \next + addi 18, 16, \next + addi 19, 17, \next + addi 20, 18, \next + addi 21, 19, \next .endm /* @@ -173,15 +195,15 @@ * r[j]: V21, V22, V23, V24 */ .macro Load_4Rjp - lxvd2x 32+v8, r3, r10 /* V8: vector r'0 */ - lxvd2x 32+v12, r3, r17 /* V12: vector for r'1 */ - lxvd2x 32+v16, r3, r19 /* V16: vector for r'2 */ - lxvd2x 32+v20, r3, r21 /* V20: vector for r'3 */ - - lxvd2x 32+v21, r3, r9 /* V21: vector r0 */ - lxvd2x 32+v22, r3, r16 /* V22: vector r1 */ - lxvd2x 32+v23, r3, r18 /* V23: vector r2 */ - lxvd2x 32+v24, r3, r20 /* V24: vector r3 */ + lxvd2x 32+vdata_b1, 3, 10 /* V8: vector r'0 */ + lxvd2x 32+vdata_b2, 3, 17 /* V12: vector for r'1 */ + lxvd2x 32+vdata_b3, 3, 19 /* V16: vector for r'2 */ + lxvd2x 32+vdata_b4, 3, 21 /* V20: vector for r'3 */ + + lxvd2x 32+vdata_a1, 3, 9 /* V21: vector r0 */ + lxvd2x 
32+vdata_a2, 3, 16 /* V22: vector r1 */ + lxvd2x 32+vdata_a3, 3, 18 /* V23: vector r2 */ + lxvd2x 32+vdata_a4, 3, 20 /* V24: vector r3 */ .endm /* @@ -213,22 +235,22 @@ * in the proper order to match the multiplication. */ .macro Load_L24Coeffs - lxvd2x 32+v25, 0, r5 - lxvd2x 32+v26, r10, r5 - vmrgew v8, v25, v26 - vmrgow v21, v25, v26 - lxvd2x 32+v25, r11, r5 - lxvd2x 32+v26, r12, r5 - vmrgew v12, v25, v26 - vmrgow v22, v25, v26 - lxvd2x 32+v25, r15, r5 - lxvd2x 32+v26, r16, r5 - vmrgew v16, v25, v26 - vmrgow v23, v25, v26 - lxvd2x 32+v25, r17, r5 - lxvd2x 32+v26, r18, r5 - vmrgew v20, v25, v26 - vmrgow v24, v25, v26 + lxvd2x 32+25, 0, 5 + lxvd2x 32+26, 10, 5 + vmrgew vdata_b1, 25, 26 + vmrgow vdata_a1, 25, 26 + lxvd2x 32+25, 11, 5 + lxvd2x 32+26, 12, 5 + vmrgew vdata_b2, 25, 26 + vmrgow vdata_a2, 25, 26 + lxvd2x 32+25, 15, 5 + lxvd2x 32+26, 16, 5 + vmrgew vdata_b3, 25, 26 + vmrgow vdata_a3, 25, 26 + lxvd2x 32+25, 17, 5 + lxvd2x 32+26, 18, 5 + vmrgew vdata_b4, 25, 26 + vmrgow vdata_a4, 25, 26 .endm /* @@ -247,81 +269,81 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L44Coeffs - lxvd2x vs10, 0, r5 - lxvd2x vs11, r10, r5 - xxpermdi 32+v8, vs11, vs10, 3 - xxpermdi 32+v21, vs11, vs10, 0 - lxvd2x vs10, r11, r5 - lxvd2x vs11, r12, r5 - xxpermdi 32+v12, vs11, vs10, 3 - xxpermdi 32+v22, vs11, vs10, 0 - lxvd2x vs10, r15, r5 - lxvd2x vs11, r16, r5 - xxpermdi 32+v16, vs11, vs10, 3 - xxpermdi 32+v23, vs11, vs10, 0 - lxvd2x vs10, r17, r5 - lxvd2x vs11, r18, r5 - xxpermdi 32+v20, vs11, vs10, 3 - xxpermdi 32+v24, vs11, vs10, 0 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxpermdi 32+vdata_b1, 11, 10, 3 + xxpermdi 32+vdata_a1, 11, 10, 0 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxpermdi 32+vdata_b2, 11, 10, 3 + xxpermdi 32+vdata_a2, 11, 10, 0 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxpermdi 32+vdata_b3, 11, 10, 3 + xxpermdi 32+vdata_a3, 11, 10, 0 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxpermdi 32+vdata_b4, 11, 10, 3 + xxpermdi 32+vdata_a4, 11, 10, 0 .endm .macro BREDUCE_4X _v0 _v1 _v2 _v3 /* Restore constant vectors - V_MKQ, V_25 and V_26 */ - vxor v7, v7, v7 - xxlor 32+v3, vs6, vs6 - xxlor 32+v1, vs7, vs7 - xxlor 32+v2, vs8, vs8 + V_MKQ, V2pw25 and V_26 */ + vxor 7, 7, 7 + xxlor 32+3, 6, 6 + xxlor 32+1, 7, 7 + xxlor 32+2, 8, 8 /* Multify Odd/Even signed halfword; Results word bound by 2^32 in abs value. 
*/ - vmulosh v6, v8, V20159 - vmulesh v5, v8, V20159 - vmulosh v11, v12, V20159 - vmulesh v10, v12, V20159 - vmulosh v15, v16, V20159 - vmulesh v14, v16, V20159 - vmulosh v19, v20, V20159 - vmulesh v18, v20, V20159 - xxmrglw 32+v4, 32+v5, 32+v6 - xxmrghw 32+v5, 32+v5, 32+v6 - xxmrglw 32+v9, 32+v10, 32+v11 - xxmrghw 32+v10, 32+v10, 32+v11 - xxmrglw 32+v13, 32+v14, 32+v15 - xxmrghw 32+v14, 32+v14, 32+v15 - xxmrglw 32+v17, 32+v18, 32+v19 - xxmrghw 32+v18, 32+v18, 32+v19 - vadduwm v4, v4, V_25 - vadduwm v5, v5, V_25 - vadduwm v9, v9, V_25 - vadduwm v10, v10, V_25 - vadduwm v13, v13, V_25 - vadduwm v14, v14, V_25 - vadduwm v17, v17, V_25 - vadduwm v18, v18, V_25 + vmulosh 6, vdata_brt1, V20159 + vmulesh 5, vdata_brt1, V20159 + vmulosh 11, vdata_brt2, V20159 + vmulesh 10, vdata_brt2, V20159 + vmulosh 15, vdata_brt3, V20159 + vmulesh 14, vdata_brt3, V20159 + vmulosh 19, vdata_brt4, V20159 + vmulesh 18, vdata_brt4, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V2pw25 + vadduwm 5, 5, V2pw25 + vadduwm 9, 9, V2pw25 + vadduwm 10, 10, V2pw25 + vadduwm 13, 13, V2pw25 + vadduwm 14, 14, V2pw25 + vadduwm 17, 17, V2pw25 + vadduwm 18, 18, V2pw25 /* Right shift and pack lower halfword, results bond to 2^16 in abs value */ - vsraw v4, v4, V_26 - vsraw v5, v5, V_26 - vsraw v9, v9, V_26 - vsraw v10, v10, V_26 - vsraw v13, v13, V_26 - vsraw v14, v14, V_26 - vsraw v17, v17, V_26 - vsraw v18, v18, V_26 - vpkuwum v4, v5, v4 - vsubuhm v4, v7, v4 - vpkuwum v9, v10, v9 - vsubuhm v9, v7, v9 - vpkuwum v13, v14, v13 - vsubuhm v13, v7, v13 - vpkuwum v17, v18, v17 - vsubuhm v17, v7, v17 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 
10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 /* Modulo multify-Low unsigned halfword; results bond to 2^16 * q in abs value. */ - vmladduhm \_v0, v4, V_MKQ, v8 - vmladduhm \_v1, v9, V_MKQ, v12 - vmladduhm \_v2, v13, V_MKQ, v16 - vmladduhm \_v3, v17, V_MKQ, v20 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm /* @@ -330,32 +352,32 @@ */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 /* Modular multification bond by 2^16 * q in abs value */ - vmladduhm v15, v25, \_vz0, v3 - vmladduhm v20, v26, \_vz1, v3 - vmladduhm v27, v30, \_vz2, v3 - vmladduhm v28, v31, \_vz3, v3 + vmladduhm 15, vdata_mont1, \_vz0, 3 + vmladduhm 20, vdata_mont2, \_vz1, 3 + vmladduhm 27, vdata_mont3, \_vz2, 3 + vmladduhm 28, vdata_mont4, \_vz3, 3 /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs v14, v25, \_vz0, v3 - vmhraddshs v19, v26, \_vz1, v3 - vmhraddshs v24, v30, \_vz2, v3 - vmhraddshs v29, v31, \_vz3, v3 + vmhraddshs 14, vdata_mont1, \_vz0, 3 + vmhraddshs 19, vdata_mont2, \_vz1, 3 + vmhraddshs 24, vdata_mont3, \_vz2, 3 + vmhraddshs 29, vdata_mont4, \_vz3, 3 - vmladduhm v15, v15, V_QINV, v3 - vmladduhm v20, v20, V_QINV, v3 - vmladduhm v25, v27, V_QINV, v3 - vmladduhm v30, v28, V_QINV, v3 + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 - vmhraddshs v15, v15, V_NMKQ, v14 - vmhraddshs v20, v20, V_NMKQ, v19 - vmhraddshs v25, v25, V_NMKQ, v24 - vmhraddshs v30, v30, V_NMKQ, v29 + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 /* Shift right 1 bit */ - vsrah \_vo0, v15, v4 - vsrah \_vo1, v20, v4 - vsrah \_vo2, v25, v4 - vsrah \_vo3, v30, v4 + vsrah \_vo0, 15, 4 + vsrah \_vo1, 20, 4 + vsrah \_vo2, 25, 4 + vsrah \_vo3, 30, 4 .endm /* @@ -363,21 +385,21 @@ * 
V_NMKQ, V_QINV, Zero vector, One vector */ .macro Set_mont_consts - xxlor 32+v5, vs0, vs0 /* V_NMKQ */ - xxlor 32+v2, vs2, vs2 /* V_QINV */ - xxlor 32+v3, vs3, vs3 /* all 0 */ - xxlor 32+v4, vs4, vs4 /* all 1 */ + xxlor 32+5, 0, 0 /* V_NMKQ */ + xxlor 32+2, 2, 2 /* V_QINV */ + xxlor 32+3, 3, 3 /* all 0 */ + xxlor 32+4, 4, 4 /* all 1 */ .endm .macro Load_next_4zetas - li r8, 16 - li r11, 32 - li r12, 48 - lxvd2x 32+V_Z0, 0, r14 - lxvd2x 32+V_Z1, r8, r14 - lxvd2x 32+V_Z2, r11, r14 - lxvd2x 32+V_Z3, r12, r14 - addi r14, r14, 64 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 8, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 .endm /* @@ -392,38 +414,38 @@ .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, r3, r9 - stxvd2x \_vs1, r3, r16 - stxvd2x \_vs2, r3, r18 - stxvd2x \_vs3, r3, r20 + stxvd2x \_vs0, 3, 9 + stxvd2x \_vs1, 3, 16 + stxvd2x \_vs2, 3, 18 + stxvd2x \_vs3, 3, 20 .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, r3, r10 - stxvd2x \_vs1, r3, r17 - stxvd2x \_vs2, r3, r19 - stxvd2x \_vs3, r3, r21 + stxvd2x \_vs0, 3, 10 + stxvd2x \_vs1, 3, 17 + stxvd2x \_vs2, 3, 19 + stxvd2x \_vs3, 3, 21 .endm .macro Reload_4coeffs - lxvd2x 32+v25, 0, r3 - lxvd2x 32+v26, r10, r3 - lxvd2x 32+v30, r11, r3 - lxvd2x 32+v31, r12, r3 - addi r3, r3, 64 + lxvd2x 32+vdata_mont1, 0, 3 + lxvd2x 32+vdata_mont2, 10, 3 + lxvd2x 32+vdata_mont3, 11, 3 + lxvd2x 32+vdata_mont4, 12, 3 + addi 3, 3, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi r3, r3, -128 - stxvd2x \_vs0, 0, r3 - stxvd2x \_vs1, r10, r3 - stxvd2x \_vs2, r11, r3 - stxvd2x \_vs3, r12, r3 - stxvd2x \_vs4, r15, r3 - stxvd2x \_vs5, r16, r3 - stxvd2x \_vs6, r17, r3 - stxvd2x \_vs7, r18, r3 - addi r3, r3, 128 + addi 3, 3, -128 + stxvd2x \_vs0, 0, 3 + stxvd2x \_vs1, 10, 3 + stxvd2x \_vs2, 11, 3 + stxvd2x \_vs3, 12, 3 + stxvd2x \_vs4, 15, 3 + stxvd2x \_vs5, 16, 3 + stxvd2x \_vs6, 17, 3 + stxvd2x \_vs7, 18, 3 + addi 3, 3, 128 .endm /* @@ 
-431,26 +453,26 @@ * coefficient array order. */ .macro PermWriteL44 - xxlor 32+v14, vs10, vs10 - xxlor 32+v19, vs11, vs11 - xxlor 32+v24, vs12, vs12 - xxlor 32+v29, vs13, vs13 - xxpermdi 32+v10, 32+v14, 32+v13, 3 - xxpermdi 32+v11, 32+v14, 32+v13, 0 - xxpermdi 32+v12, 32+v19, 32+v18, 3 - xxpermdi 32+v13, 32+v19, 32+v18, 0 - xxpermdi 32+v14, 32+v24, 32+v23, 3 - xxpermdi 32+v15, 32+v24, 32+v23, 0 - xxpermdi 32+v16, 32+v29, 32+v28, 3 - xxpermdi 32+v17, 32+v29, 32+v28, 0 - stxvd2x 32+v10, 0, r5 - stxvd2x 32+v11, r10, r5 - stxvd2x 32+v12, r11, r5 - stxvd2x 32+v13, r12, r5 - stxvd2x 32+v14, r15, r5 - stxvd2x 32+v15, r16, r5 - stxvd2x 32+v16, r17, r5 - stxvd2x 32+v17, r18, r5 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+14, 32+vresult_mont1, 3 + xxpermdi 32+11, 32+14, 32+vresult_mont1, 0 + xxpermdi 32+12, 32+19, 32+vresult_mont2, 3 + xxpermdi 32+13, 32+19, 32+vresult_mont2, 0 + xxpermdi 32+14, 32+24, 32+vresult_mont3, 3 + xxpermdi 32+15, 32+24, 32+vresult_mont3, 0 + xxpermdi 32+16, 32+29, 32+vresult_mont4, 3 + xxpermdi 32+17, 32+29, 32+vresult_mont4, 0 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm /* @@ -458,77 +480,123 @@ * coefficient array order. 
*/ .macro PermWriteL24 - xxlor 32+v14, vs10, vs10 - xxlor 32+v19, vs11, vs11 - xxlor 32+v24, vs12, vs12 - xxlor 32+v29, vs13, vs13 - vmrgew v10, v13, v14 - vmrgow v11, v13, v14 - vmrgew v12, v18, v19 - vmrgow v13, v18, v19 - vmrgew v14, v23, v24 - vmrgow v15, v23, v24 - vmrgew v16, v28, v29 - vmrgow v17, v28, v29 - stxvd2x 32+v10, 0, r5 - stxvd2x 32+v11, r10, r5 - stxvd2x 32+v12, r11, r5 - stxvd2x 32+v13, r12, r5 - stxvd2x 32+v14, r15, r5 - stxvd2x 32+v15, r16, r5 - stxvd2x 32+v16, r17, r5 - stxvd2x 32+v17, r18, r5 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, vresult_mont1, 14 + vmrgow 11, vresult_mont1, 14 + vmrgew 12, vresult_mont2, 19 + vmrgow 13, vresult_mont2, 19 + vmrgew 14, vresult_mont3, 24 + vmrgow 15, vresult_mont3, 24 + vmrgew 16, vresult_mont4, 29 + vmrgow 17, vresult_mont4, 29 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm +/* + * INTT layer Len=2. + */ .macro INTT_REDUCE_L24 Load_L24Coeffs Compute_4Coeffs - BREDUCE_4X v4, v9, v13, v17 - xxlor vs10, 32+v4, 32+v4 - xxlor vs11, 32+v9, 32+v9 - xxlor vs12, 32+v13, 32+v13 - xxlor vs13, 32+v17, 32+v17 + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + xxlor 10, 32+vresult_brt1, 32+vresult_brt1 + xxlor 11, 32+vresult_brt2, 32+vresult_brt2 + xxlor 12, 32+vresult_brt3, 32+vresult_brt3 + xxlor 13, 32+vresult_brt4, 32+vresult_brt4 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 PermWriteL24 .endm +/* + * INTT layer Len=4. 
+ */ .macro INTT_REDUCE_L44 Load_L44Coeffs Compute_4Coeffs - BREDUCE_4X v4, v9, v13, v17 - xxlor vs10, 32+v4, 32+v4 - xxlor vs11, 32+v9, 32+v9 - xxlor vs12, 32+v13, 32+v13 - xxlor vs13, 32+v17, 32+v17 + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + xxlor 10, 32+vresult_brt1, 32+vresult_brt1 + xxlor 11, 32+vresult_brt2, 32+vresult_brt2 + xxlor 12, 32+vresult_brt3, 32+vresult_brt3 + xxlor 13, 32+vresult_brt4, 32+vresult_brt4 Set_mont_consts Load_next_4zetas Perm_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 PermWriteL44 .endm +/* + * INTT layer Len=8 and 16. + */ .macro INTT_REDUCE_4X start next Load_4Coeffs \start, \next - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + Write_M4C 32+vresult_mont1, 32+vresult_mont2, 32+vresult_mont3, 32+vresult_mont4 .endm /* - * main operations for intt - * t = r[j]; - * r[j] = barrett_reduce(t + r[j + len]); - * r[j + len] = r[j + len] - t; - * r[j + len] = fqmul(zeta, r[j + len]); + * INTT layer Len=32, 64 and 128. 
*/ +.macro INTT_REDUCE_L567 start next + Load_4Coeffs \start, \next + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 + Set_mont_consts + lvx V_ZETA, 0, 14 + //addi 14, 14, 16 + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + Write_M4C 32+vresult_mont1, 32+vresult_mont2, 32+vresult_mont3, 32+vresult_mont4 +.endm /* - * mlk_intt_ppc(r) + * mlk_intt_ppc(int16_t *r, int16_t *qdata) + * Compute inverse NTT based on the following 7 layers - + * len = 2, 4, 8, 16, 32, 64, 128 + * + * Each layer compute the coeffients on 2 legs, start and start + len*2 offsets. + * + * leg 1 leg 2 + * ----- ----- + * start start+len*2 + * start+next start+len*2+next + * start+next+next start+len*2+next+next + * start+next+next+next start+len*2+next+next+next + * + * Each computation loads 8 vectors, 4 for each leg. + * The final coefficient (t) from each vector of leg1 and leg2 then do the + * add/sub operations to obtain the final results. + * + * -> leg1 = leg1 + t, leg2 = leg1 - t + * + * The resulting coeffients then store back to each leg's offset. + * + * Each vector has the same corresponding zeta except len=4 and len=2. + * + * len=4 has 4-4 layout which means every 4 16-bit coeffients has the same zeta. + * and len=2 has 2-2-2-2 layout which means every 2 16-bit coeffients has the same zeta. + * e.g. + * coeff vector a1 a2 a3 a4 a5 a6 a7 a8 + * zeta vector z1 z1 z2 z2 z3 z3 z4 z4 + * + * For len=4 and len=2, each vector will get permuted to leg1 and leg2. Zeta is + * pre-arranged for the leg1 and leg2. After the computation, each vector needs + * to transpose back to its original 4-4 or 2-2-2-2 layout. 
*/ .global MLK_ASM_NAMESPACE(intt_ppc) .align 4 @@ -538,93 +606,125 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* init vectors and constants Setup for Montgomery reduce */ - lxvx vs0, 0, r4 + lxvx 0, 0, 4 - li r10, QINV_OFFSET - lxvx 32+V_QINV, r10, r4 - xxlxor 32+v3, 32+v3, 32+v3 - vspltish v4, 1 - xxlor vs2, 32+v2, 32+v2 /* QINV */ - xxlor vs3, 32+v3, 32+v3 /* 0 vector */ - xxlor vs4, 32+v4, 32+v4 /* 1 vector */ + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, 4 + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 /* QINV */ + xxlor 3, 32+3, 32+3 /* 0 vector */ + xxlor 4, 32+4, 32+4 /* 1 vector */ /* Setup for Barrett reduce */ - li r10, Q_OFFSET - li r11, C20159_OFFSET - lxvx vs6, r10, r4 /* V_MKQ */ - lxvx 32+V20159, r11, r4 /* V20159 */ - - vspltisw v8, 13 - vadduwm v8, v8, v8 - xxlor vs8, 32+v8, 32+v8 /* V_26 store at vs8 */ - - vspltisw v9, 1 - vsubuwm v10, v8, v9 /* value 25 */ - vslw v9, v9, v10 - xxlor vs7, 32+v9, 32+v9 /* V_25 syore at vs7 */ - - li r10, 16 - li r11, 32 - li r12, 48 - li r15, 64 - li r16, 80 - li r17, 96 - li r18, 112 + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, 4 /* V_MKQ */ + lxvx 32+V20159, 11, 4 /* V20159 */ + + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 /* V_26 store at vs8 */ + + vspltisw 9, 1 + vsubuwm 10, 8, 9 /* value 25 */ + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 /* V2pw25 store at vs7 */ + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 /* * Montgomery reduce loops with constant 1441 */ - addi r14, r4, C1441_OFFSET - lvx V1441, 0, r14 - li r8, 4 - mtctr r8 + addi 14, 4, C1441_OFFSET + lvx V1441, 0, 14 + li 8, 4 + mtctr 8 Set_mont_consts intt_ppc__Loopf: Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, v6, v7, v8, v9 + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, v13, v18, v23, v28 - MWrite_8X 32+v6, 32+v7, 32+v8, 32+v9, 32+v13, 32+v18, 32+v23, 32+v28 + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + 
MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 bdnz intt_ppc__Loopf - addi r3, r3, -512 + addi 3, 3, -512 .align 4 /* * 1. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - * Update zetas vectors, each vector has 2 zetas - * Load zeta array in 2-2-2-2 layout + * Update zetas vectors, each vector has 2 zetas + * Load zeta vectors in 2-2-2-2 layout + * + * Compute coefficients of the NTT based on the following sequences, + * 0, 1, 2, 3, 4, 5, 6, 7 + * 8, 9, 10, 11, 12, 13, 14, 15 + * ... + * 240, 241, 242, 243, 244, 245, 246, 247 + * 248, 249, 250, 251, 252, 253, 254, 255 + * + * These are indexes to the 16 bits array. Each loads 4 vectors. */ - addi r14, r4, ZETA_INTT_OFFSET - li r7, 4 /* len * 2 */ - li r8, 4 - mtctr r8 - mr r5, r3 -intt_ppc__Loop2: + addi 14, 4, ZETA_INTT_OFFSET + li 7, 4 /* len * 2 */ + mr 5, 3 + + INTT_REDUCE_L24 + addi 5, 5, 128 INTT_REDUCE_L24 - addi r5, r5, 128 - bdnz intt_ppc__Loop2 + addi 5, 5, 128 + INTT_REDUCE_L24 + addi 5, 5, 128 + INTT_REDUCE_L24 + addi 5, 5, 128 .align 4 /* * 2. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - * Load zeta array in 4-4 layout + * Load zeta vectors in 4-4 layout + * + * Compute coefficients of the NTT based on the following sequences, + * 0, 1, 2, 3, 4, 5, 6, 7 + * 8, 9, 10, 11, 12, 13, 14, 15 + * ... + * 240, 241, 242, 243, 244, 245, 246, 247 + * 248, 249, 250, 251, 252, 253, 254, 255 + * + * These are indexes to the 16 bits array. Each loads 4 vectors. */ - mr r5, r3 - li r7, 8 - li r8, 4 - mtctr r8 -intt_ppc__Loop4: + mr 5, 3 + li 7, 8 + + INTT_REDUCE_L44 + addi 5, 5, 128 + INTT_REDUCE_L44 + addi 5, 5, 128 + INTT_REDUCE_L44 + addi 5, 5, 128 INTT_REDUCE_L44 - addi r5, r5, 128 - bdnz intt_ppc__Loop4 + addi 5, 5, 128 .align 4 /* * 3. 
len = 8, start = 0, 16, 32, 48,...208, 224, 240 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 8 + * 64 - 72 + * 128 - 136 + * 192 - 200 + * + * These are indexes to the 16 bits array */ - li r7, 16 + li 7, 16 INTT_REDUCE_4X 0, 32 INTT_REDUCE_4X 128, 32 @@ -634,138 +734,89 @@ intt_ppc__Loop4: .align 4 /* * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 16 + * 8 - 24 + * 128 - 144 + * 136 - 152 + * + * These are indexes to the 16 bits array */ - li r7, 32 + li 7, 32 INTT_REDUCE_4X 0, 64 - addi r14, r14, -64 + addi 14, 14, -64 INTT_REDUCE_4X 16, 64 INTT_REDUCE_4X 256, 64 - addi r14, r14, -64 + addi 14, 14, -64 INTT_REDUCE_4X 272, 64 .align 4 /* * 5. len = 32, start = 0, 64, 128, 192 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 32 + * 64 - 96 + * 128 - 160 + * 192 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 64 + li 7, 64 - Load_4Coeffs 0, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 128, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 256, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 384, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 
+ INTT_REDUCE_L567 0, 16 + addi 14, 14, 16 + INTT_REDUCE_L567 128, 16 + addi 14, 14, 16 + INTT_REDUCE_L567 256, 16 + addi 14, 14, 16 + INTT_REDUCE_L567 384, 16 + addi 14, 14, 16 .align 4 /* * 6. len = 64, start = 0, 128 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 64 + * 32 - 96 + * 128 - 192 + * 160 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 128 - Load_4Coeffs 0, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + li 7, 128 - Load_4Coeffs 64, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 256, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 320, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - addi r14, r14, 16 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + INTT_REDUCE_L567 0, 16 + INTT_REDUCE_L567 64, 16 + addi 14, 14, 16 + INTT_REDUCE_L567 256, 16 + INTT_REDUCE_L567 320, 16 + addi 14, 14, 16 .align 4 /* * 7. 
len = 128, start = 0 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 128 + * 32 - 160 + * 64 - 192 + * 96 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 256 /* len*2 */ - - Load_4Coeffs 0, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - lvx V_ZETA, 0, r14 - xxlor vs9, 32+V_ZETA, 32+V_ZETA - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 64, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - xxlor 32+V_ZETA, vs9, vs9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + li 7, 256 /* len*2 */ - Load_4Coeffs 128, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - xxlor 32+V_ZETA, vs9, vs9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 - - Load_4Coeffs 192, 16 - BREDUCE_4X v4, v9, v13, v17 - Write_B4C 32+v4, 32+v9, 32+v13, 32+v17 - Set_mont_consts - xxlor 32+V_ZETA, vs9, vs9 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, v13, v18, v23, v28 - Write_M4C 32+v13, 32+v18, 32+v23, 32+v28 + INTT_REDUCE_L567 0, 16 + INTT_REDUCE_L567 64, 16 + INTT_REDUCE_L567 128, 16 + INTT_REDUCE_L567 192, 16 RESTORE_REGS blr @@ -773,7 +824,6 @@ intt_ppc__Loop4: /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ #undef V20159 -#undef V_25 #undef V_26 #undef V_MKQ #undef V_QINV diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 3c06f0a319..88f1b1f60d 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -24,82 +24,100 @@ #define V_Z3 10 #define V_ZETA 10 +#define vdata_a1 12 +#define vdata_a2 17 +#define vdata_a3 22 +#define vdata_a4 27 +#define vdata_b1 13 +#define vdata_b2 18 +#define vdata_b3 23 +#define vdata_b4 28 + +#define vresult_a1 15 +#define vresult_b1 16 +#define vresult_a2 20 +#define vresult_b2 21 +#define vresult_a3 25 +#define vresult_b3 26 +#define vresult_a4 30 +#define vresult_b4 31 + .machine "any" .text .macro SAVE_REGS - stdu r1, -352(r1) - mflr r0 - std r14, 56(r1) - std r15, 64(r1) - std r16, 72(r1) - std r17, 80(r1) - std r18, 88(r1) - std r19, 96(r1) - std r20, 104(r1) - std r21, 112(r1) - li r10, 128 - li r11, 144 - li r12, 160 - li r14, 176 - li r15, 192 - li r16, 208 - stxvx 32+v20, r10, r1 - stxvx 32+v21, r11, r1 - stxvx 32+v22, r12, r1 - stxvx 32+v23, r14, r1 - stxvx 32+v24, r15, r1 - stxvx 32+v25, r16, r1 - li r10, 224 - li r11, 240 - li r12, 256 - li r14, 272 - li r15, 288 - li r16, 304 - stxvx 32+v26, r10, r1 - stxvx 32+v27, r11, r1 - stxvx 32+v28, r12, r1 - stxvx 32+v29, r14, r1 - stxvx 32+v30, r15, r1 - stxvx 32+v31, r16, r1 + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 .endm .macro RESTORE_REGS - li r10, 128 - li r11, 144 - li r12, 
160 - li r14, 176 - li r15, 192 - li r16, 208 - lxvx 32+v20, r10, r1 - lxvx 32+v21, r11, r1 - lxvx 32+v22, r12, r1 - lxvx 32+v23, r14, r1 - lxvx 32+v24, r15, r1 - lxvx 32+v25, r16, r1 - li r10, 224 - li r11, 240 - li r12, 256 - li r14, 272 - li r15, 288 - li r16, 304 - lxvx 32+v26, r10, r1 - lxvx 32+v27, r11, r1 - lxvx 32+v28, r12, r1 - lxvx 32+v29, r14, r1 - lxvx 32+v30, r15, r1 - lxvx 32+v31, r16, r1 - ld r14, 56(r1) - ld r15, 64(r1) - ld r16, 72(r1) - ld r17, 80(r1) - ld r18, 88(r1) - ld r19, 96(r1) - ld r20, 104(r1) - ld r21, 112(r1) - - mtlr r0 - addi r1, r1, 352 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 .endm /* @@ -124,14 +142,14 @@ * */ .macro Init_Coeffs_offset start next - li r9, \start /* first offset to j */ - add r10, r7, r9 /* J + len*2 */ - addi r16, r9, \next - addi r17, r10, \next - addi r18, r16, \next - addi r19, r17, \next - addi r20, r18, \next - addi r21, r19, \next + li 9, \start /* first offset to j */ + add 10, 7, 9 /* J + len*2 */ + addi 16, 9, \next + addi 17, 10, \next + addi 18, 16, \next + addi 19, 17, \next + addi 20, 18, \next + addi 21, 19, \next .endm /* @@ -139,10 +157,10 @@ * r[j+len]: V13, V18, V23, V28 */ .macro Load_4Rjp - lxvd2x 32+v13, r3, r10 /* V13: vector r'0 */ - lxvd2x 32+v18, r3, r17 /* V18: vector for r'1 */ - lxvd2x 32+v23, r3, r19 /* V23: vector for r'2 */ - lxvd2x 32+v28, r3, r21 /* V28: vector for r'3 */ + lxvd2x 32+vdata_b1, 3, 10 /* V13: vector r'0 */ + lxvd2x 32+vdata_b2, 3, 17 /* V18: vector for r'1 */ + lxvd2x 
32+vdata_b3, 3, 19 /* V23: vector for r'2 */ + lxvd2x 32+vdata_b4, 3, 21 /* V28: vector for r'3 */ .endm /* @@ -173,22 +191,22 @@ * in the proper order to match the multiplication. */ .macro Load_L24Coeffs - lxvd2x 32+v25, 0, r5 - lxvd2x 32+v26, r10, r5 - vmrgew v13, v25, v26 - vmrgow v12, v25, v26 - lxvd2x 32+v25, r11, r5 - lxvd2x 32+v26, r12, r5 - vmrgew v18, v25, v26 - vmrgow v17, v25, v26 - lxvd2x 32+v25, r15, r5 - lxvd2x 32+v26, r16, r5 - vmrgew v23, v25, v26 - vmrgow v22, v25, v26 - lxvd2x 32+v25, r17, r5 - lxvd2x 32+v26, r18, r5 - vmrgew v28, v25, v26 - vmrgow v27, v25, v26 + lxvd2x 32+25, 0, 5 + lxvd2x 32+26, 10, 5 + vmrgew vdata_b1, 25, 26 + vmrgow vdata_a1, 25, 26 + lxvd2x 32+25, 11, 5 + lxvd2x 32+26, 12, 5 + vmrgew vdata_b2, 25, 26 + vmrgow vdata_a2, 25, 26 + lxvd2x 32+25, 15, 5 + lxvd2x 32+26, 16, 5 + vmrgew vdata_b3, 25, 26 + vmrgow vdata_a3, 25, 26 + lxvd2x 32+25, 17, 5 + lxvd2x 32+26, 18, 5 + vmrgew vdata_b4, 25, 26 + vmrgow vdata_a4, 25, 26 .endm /* @@ -207,22 +225,22 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L44Coeffs - lxvd2x vs1, 0, r5 - lxvd2x vs2, r10, r5 - xxpermdi 32+v13, vs2, vs1, 3 - xxpermdi 32+v12, vs2, vs1, 0 - lxvd2x vs3, r11, r5 - lxvd2x vs4, r12, r5 - xxpermdi 32+v18, vs4, vs3, 3 - xxpermdi 32+v17, vs4, vs3, 0 - lxvd2x vs1, r15, r5 - lxvd2x vs2, r16, r5 - xxpermdi 32+v23, vs2, vs1, 3 - xxpermdi 32+v22, vs2, vs1, 0 - lxvd2x vs3, r17, r5 - lxvd2x vs4, r18, r5 - xxpermdi 32+v28, vs4, vs3, 3 - xxpermdi 32+v27, vs4, vs3, 0 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxpermdi 32+vdata_b1, 2, 1, 3 + xxpermdi 32+vdata_a1, 2, 1, 0 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxpermdi 32+vdata_b2, 4, 3, 3 + xxpermdi 32+vdata_a2, 4, 3, 0 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxpermdi 32+vdata_b3, 2, 1, 3 + xxpermdi 32+vdata_a3, 2, 1, 0 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxpermdi 32+vdata_b4, 4, 3, 3 + xxpermdi 32+vdata_a4, 4, 3, 0 .endm /* @@ -236,32 +254,32 @@ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 /* fqmul = zeta * coefficient Modular multification bond by 2^16 * q in abs value */ - vmladduhm v15, v13, \_vz0, v3 - vmladduhm v20, v18, \_vz1, v3 - vmladduhm v25, v23, \_vz2, v3 - vmladduhm v30, v28, \_vz3, v3 + vmladduhm 15, vdata_b1, \_vz0, 3 + vmladduhm 20, vdata_b2, \_vz1, 3 + vmladduhm 25, vdata_b3, \_vz2, 3 + vmladduhm 30, vdata_b4, \_vz3, 3 /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs v14, v13, \_vz0, v3 - vmhraddshs v19, v18, \_vz1, v3 - vmhraddshs v24, v23, \_vz2, v3 - vmhraddshs v29, v28, \_vz3, v3 + vmhraddshs 14, vdata_b1, \_vz0, 3 + vmhraddshs 19, vdata_b2, \_vz1, 3 + vmhraddshs 24, vdata_b3, \_vz2, 3 + vmhraddshs 29, vdata_b4, \_vz3, 3 - vmladduhm v15, v15, V_QINV, v3 - vmladduhm v20, v20, V_QINV, v3 - vmladduhm v25, v25, V_QINV, v3 - vmladduhm v30, v30, V_QINV, v3 + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 30, 30, V_QINV, 3 - vmhraddshs v15, v15, V_NMKQ, v14 - vmhraddshs v20, v20, V_NMKQ, v19 - vmhraddshs v25, v25, V_NMKQ, v24 - vmhraddshs v30, 
v30, V_NMKQ, v29 + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 /* Shift right 1 bit */ - vsrah v13, v15, v4 - vsrah v18, v20, v4 - vsrah v23, v25, v4 - vsrah v28, v30, v4 + vsrah vdata_b1, 15, 4 + vsrah vdata_b2, 20, 4 + vsrah vdata_b3, 25, 4 + vsrah vdata_b4, 30, 4 .endm /* @@ -270,10 +288,10 @@ * r[j]: V12, V17, V22, V27 */ .macro Load_4Rj - lxvd2x 32+v12, r3, r9 /* V12: vector r0 */ - lxvd2x 32+v17, r3, r16 /* V17: vector r1 */ - lxvd2x 32+v22, r3, r18 /* V22: vector r2 */ - lxvd2x 32+v27, r3, r20 /* V27: vector r3 */ + lxvd2x 32+vdata_a1, 3, 9 /* V12: vector r0 */ + lxvd2x 32+vdata_a2, 3, 16 /* V17: vector r1 */ + lxvd2x 32+vdata_a3, 3, 18 /* V22: vector r2 */ + lxvd2x 32+vdata_a4, 3, 20 /* V27: vector r3 */ .endm /* @@ -288,25 +306,25 @@ r[j] = r[j] + t. r[j+len] = r[j] - t */ - vsubuhm v16, v12, v13 - vadduhm v15, v13, v12 - vsubuhm v21, v17, v18 - vadduhm v20, v18, v17 - vsubuhm v26, v22, v23 - vadduhm v25, v23, v22 - vsubuhm v31, v27, v28 - vadduhm v30, v28, v27 + vsubuhm vresult_b1, vdata_a1, vdata_b1 + vadduhm vresult_a1, vdata_b1, vdata_a1 + vsubuhm vresult_b2, vdata_a2, vdata_b2 + vadduhm vresult_a2, vdata_b2, vdata_a2 + vsubuhm vresult_b3, vdata_a3, vdata_b3 + vadduhm vresult_a3, vdata_b3, vdata_a3 + vsubuhm vresult_b4, vdata_a4, vdata_b4 + vadduhm vresult_a4, vdata_b4, vdata_a4 .endm .macro Write_One - stxvd2x 32+v15, r3, r9 - stxvd2x 32+v16, r3, r10 - stxvd2x 32+v20, r3, r16 - stxvd2x 32+v21, r3, r17 - stxvd2x 32+v25, r3, r18 - stxvd2x 32+v26, r3, r19 - stxvd2x 32+v30, r3, r20 - stxvd2x 32+v31, r3, r21 + stxvd2x 32+vresult_a1, 3, 9 + stxvd2x 32+vresult_b1, 3, 10 + stxvd2x 32+vresult_a2, 3, 16 + stxvd2x 32+vresult_b2, 3, 17 + stxvd2x 32+vresult_a3, 3, 18 + stxvd2x 32+vresult_b3, 3, 19 + stxvd2x 32+vresult_a4, 3, 20 + stxvd2x 32+vresult_b4, 3, 21 .endm /* @@ -315,22 +333,22 @@ */ .macro PermWriteL44 Compute_4Coeffs - xxpermdi vs0, 32+v15, 32+v16, 3 - xxpermdi vs1, 32+v15, 
32+v16, 0 - xxpermdi vs2, 32+v20, 32+v21, 3 - xxpermdi vs3, 32+v20, 32+v21, 0 - xxpermdi vs4, 32+v25, 32+v26, 3 - xxpermdi vs5, 32+v25, 32+v26, 0 - xxpermdi vs6, 32+v30, 32+v31, 3 - xxpermdi vs7, 32+v30, 32+v31, 0 - stxvd2x vs0, 0, r5 - stxvd2x vs1, r10, r5 - stxvd2x vs2, r11, r5 - stxvd2x vs3, r12, r5 - stxvd2x vs4, r15, r5 - stxvd2x vs5, r16, r5 - stxvd2x vs6, r17, r5 - stxvd2x vs7, r18, r5 + xxpermdi 0, 32+vresult_a1, 32+vresult_b1, 3 + xxpermdi 1, 32+vresult_a1, 32+vresult_b1, 0 + xxpermdi 2, 32+vresult_a2, 32+vresult_b2, 3 + xxpermdi 3, 32+vresult_a2, 32+vresult_b2, 0 + xxpermdi 4, 32+vresult_a3, 32+vresult_b3, 3 + xxpermdi 5, 32+vresult_a3, 32+vresult_b3, 0 + xxpermdi 6, 32+vresult_a4, 32+vresult_b4, 3 + xxpermdi 7, 32+vresult_a4, 32+vresult_b4, 0 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 .endm /* @@ -339,33 +357,33 @@ */ .macro PermWriteL24 Compute_4Coeffs - vmrgew v10, v16, v15 - vmrgow v11, v16, v15 - vmrgew v12, v21, v20 - vmrgow v13, v21, v20 - vmrgew v14, v26, v25 - vmrgow v15, v26, v25 - vmrgew v16, v31, v30 - vmrgow v17, v31, v30 - stxvd2x 32+v10, 0, r5 - stxvd2x 32+v11, r10, r5 - stxvd2x 32+v12, r11, r5 - stxvd2x 32+v13, r12, r5 - stxvd2x 32+v14, r15, r5 - stxvd2x 32+v15, r16, r5 - stxvd2x 32+v16, r17, r5 - stxvd2x 32+v17, r18, r5 + vmrgew 10, vresult_b1, vresult_a1 + vmrgow 11, vresult_b1, vresult_a1 + vmrgew 12, vresult_b2, vresult_a2 + vmrgow 13, vresult_b2, vresult_a2 + vmrgew 14, vresult_b3, vresult_a3 + vmrgow 15, vresult_b3, vresult_a3 + vmrgew 16, vresult_b4, vresult_a4 + vmrgow 17, vresult_b4, vresult_a4 + stxvd2x 32+10, 0, 5 + stxvd2x 32+11, 10, 5 + stxvd2x 32+12, 11, 5 + stxvd2x 32+13, 12, 5 + stxvd2x 32+14, 15, 5 + stxvd2x 32+15, 16, 5 + stxvd2x 32+16, 17, 5 + stxvd2x 32+17, 18, 5 .endm .macro Load_next_4zetas - li r10, 16 - li r11, 32 - li r12, 48 - lxvd2x 32+V_Z0, 0, r14 - lxvd2x 32+V_Z1, r10, r14 - lxvd2x 32+V_Z2, r11, r14 - 
lxvd2x 32+V_Z3, r12, r14 - addi r14, r14, 64 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, 14 + lxvd2x 32+V_Z1, 10, 14 + lxvd2x 32+V_Z2, 11, 14 + lxvd2x 32+V_Z3, 12, 14 + addi 14, 14, 64 .endm /* @@ -388,7 +406,39 @@ .endm /* - * mlk_ntt_ppc(int16_t *r) + * mlk_ntt_ppc(int16_t *r, int16_t *qdata) + * Compute forward NTT based on the following 7 layers - + * len = 128, 64, 32, 16, 8, 4, 2. + * + * Each layer compute the coeffients on 2 legs, start and start + len*2 offsets. + * + * leg 1 leg 2 + * ----- ----- + * start start+len*2 + * start+next start+len*2+next + * start+next+next start+len*2+next+next + * start+next+next+next start+len*2+next+next+next + * + * Each computation loads 8 vectors, 4 for each leg. + * The final coefficient (t) from each vector of leg1 and leg2 then do the + * add/sub operations to obtain the final results. + * + * -> leg1 = leg1 + t, leg2 = leg1 - t + * + * The resulting coeffients then store back to each leg's offset. + * + * Each vector has the same corresponding zeta except len=4 and len=2. + * + * len=4 has 4-4 layout which means every 4 16-bit coeffients has the same zeta. + * and len=2 has 2-2-2-2 layout which means every 2 16-bit coeffients has the same zeta. + * e.g. + * coeff vector a1 a2 a3 a4 a5 a6 a7 a8 + * zeta vector z1 z1 z2 z2 z3 z3 z4 z4 + * + * For len=4 and len=2, each vector will get permuted to leg1 and leg2. Zeta is + * pre-arranged for the leg1 and leg2. After the computation, each vector needs + * to transpose back to its original 4-4 or 2-2-2-2 layout. 
+ * */ .global MLK_ASM_NAMESPACE(ntt_ppc) .align 4 @@ -397,27 +447,32 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) SAVE_REGS /* load MLKEM_Q */ - lvx V_NMKQ,0,r4 + lvx V_NMKQ,0,4 /* Register 14 as pointer to zetas array */ - addi r14, r4, ZETA_NTT_OFFSET + addi 14, 4, ZETA_NTT_OFFSET - vxor v3, v3, v3 - vspltish v4, 1 + vxor 3, 3, 3 + vspltish 4, 1 - li r10, QINV_OFFSET - lvx V_QINV, r10, r4 + li 10, QINV_OFFSET + lvx V_QINV, 10, 4 .align 4 /* - * Compute coefficients of the NTT based on the following loop. - * for (len = 128; len ≥ 2; len = len/2) - * * 1. len = 128, start = 0 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 128 + * 32 - 160 + * 64 - 192 + * 96 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 256 /* len * 2 */ - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + li 7, 256 /* len * 2 */ + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -427,47 +482,68 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) .align 4 /* * 2. len = 64, start = 0, 128 - * k += 2 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 64 + * 32 - 96 + * 128 - 192 + * 160 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 128 - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + li 7, 128 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 /* * 3. 
len = 32, start = 0, 64, 128, 192 - * k += 4 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 32 + * 64 - 96 + * 128 - 160 + * 192 - 224 + * + * These are indexes to the 16 bits array */ - li r7, 64 - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + li 7, 64 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, r14 - addi r14, r14, 16 + lvx V_ZETA, 0, 14 + addi 14, 14, 16 NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 /* - * 4. len = 16, start = 0, 32, 64,,...160, 192, 224 - * k += 8 + * 4. len = 16, start = 0, 8, 128, 136 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 16 + * 8 - 24 + * 128 - 144 + * 136 - 152 + * + * These are indexes to the 16 bits array */ - li r7, 32 + li 7, 32 Load_next_4zetas NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 @@ -478,10 +554,17 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) .align 4 /* - * 5. len = 8, start = 0, 16, 32, 48,...208, 224, 240 - * k += 16 + * 5. len = 8, start = 0, 64, 128, 192 + * + * Compute coefficients of the NTT based on 2 legs, + * 0 - 8 + * 64 - 72 + * 128 - 136 + * 192 - 200 + * + * These are indexes to the 16 bits array */ - li r7, 16 + li 7, 16 Load_next_4zetas NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 @@ -496,21 +579,29 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) /* * 6. len = 4, start = 0, 8, 16, 24,...232, 240, 248 - * k += 32 - * Load zeta vectors in 4-4 layout + * Load zeta vectors in 4-4 layout + * + * Compute coefficients of the NTT based on the following sequences, + * 0, 1, 2, 3, 4, 5, 6, 7 + * 8, 9, 10, 11, 12, 13, 14, 15 + * ... 
+ * 240, 241, 242, 243, 244, 245, 246, 247 + * 248, 249, 250, 251, 252, 253, 254, 255 + * + * These are indexes to the 16 bits array. Each loads 4 vectors. */ - li r15, 4 - mtctr r15 - mr r5, r3 /* Let r5 points to coefficient array */ - li r7, 8 - - li r10, 16 - li r11, 32 - li r12, 48 - li r15, 64 - li r16, 80 - li r17, 96 - li r18, 112 + li 15, 4 + mtctr 15 + mr 5, 3 /* Let r5 points to coefficient array */ + li 7, 8 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 .align 4 ntt_ppc__Len4: @@ -519,20 +610,28 @@ ntt_ppc__Len4: Load_L44Coeffs MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL44 - addi r5, r5, 128 + addi 5, 5, 128 bdnz ntt_ppc__Len4 /* * 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 - * k += 64 - * Load zeta vectors in 2-2-2-2 layout + * Load zeta vectors in 2-2-2-2 layout + * + * Compute coefficients of the NTT based on the following sequences, + * 0, 1, 2, 3, 4, 5, 6, 7 + * 8, 9, 10, 11, 12, 13, 14, 15 + * ... + * 240, 241, 242, 243, 244, 245, 246, 247 + * 248, 249, 250, 251, 252, 253, 254, 255 + * + * These are indexes to the 16 bits array. Each loads 4 vectors. 
*/ - li r8, 4 - mtctr r8 - mr r5, r3 /* Let r5 points to coefficient array */ - li r7, 4 + li 8, 4 + mtctr 8 + mr 5, 3 /* Let r5 points to coefficient array */ + li 7, 4 .align 4 ntt_ppc__Len2: @@ -540,7 +639,7 @@ ntt_ppc__Len2: Load_L24Coeffs MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL24 - addi r5, r5, 128 + addi 5, 5, 128 bdnz ntt_ppc__Len2 diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index 5c0703755c..4ca5771314 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -39,141 +39,141 @@ * MREDUCE_4X(_v0, _v1, _v2, _v3) */ .macro MREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+v13, 0, r3 - addi r3, r3, 16 - lxvd2x 32+v18, 0, r3 - addi r3, r3, 16 - lxvd2x 32+v23, 0, r3 - addi r3, r3, 16 - lxvd2x 32+v7, 0, r3 - addi r3, r3, 16 - - vmladduhm v15, v13, V1353, v3 - vmladduhm v20, v18, V1353, v3 - vmladduhm v25, v23, V1353, v3 - vmladduhm v9, v7, V1353, v3 - - vmhraddshs v14, v13, V1353, v3 - vmhraddshs v19, v18, V1353, v3 - vmhraddshs v24, v23, V1353, v3 - vmhraddshs v8, v7, V1353, v3 - - vmladduhm v15, v15, V_QINV, v3 - vmladduhm v20, v20, V_QINV, v3 - vmladduhm v25, v25, V_QINV, v3 - vmladduhm v9, v9, V_QINV, v3 - - vmhraddshs v15, v15, V_NMKQ, v14 - vmhraddshs v20, v20, V_NMKQ, v19 - vmhraddshs v25, v25, V_NMKQ, v24 - vmhraddshs v9, v9, V_NMKQ, v8 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 /* 
Shift right 1 bit */ - vsrah \_v0, v15, v4 - vsrah \_v1, v20, v4 - vsrah \_v2, v25, v4 - vsrah \_v3, v9, v4 + vsrah \_v0, 15, 4 + vsrah \_v1, 20, 4 + vsrah \_v2, 25, 4 + vsrah \_v3, 9, 4 .endm .macro Write_8X - stxvd2x 32+v27, r4, r3 - stxvd2x 32+v28, r5, r3 - stxvd2x 32+v29, r6, r3 - stxvd2x 32+v30, r7, r3 - stxvd2x 32+v13, r8, r3 - stxvd2x 32+v18, r9, r3 - stxvd2x 32+v23, r10, r3 - stxvd2x 32+v7, r11, r3 + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(poly_tomont_ppc) MLK_ASM_FN_SYMBOL(poly_tomont_ppc) - stdu r1, -320(r1) - mflr r0 - - li r6, 128 - li r7, 144 - li r8, 160 - li r9, 176 - li r10, 192 - li r11, 208 - li r12, 224 - stxvx 32+v20, r6, r1 - stxvx 32+v21, r7, r1 - stxvx 32+v22, r8, r1 - stxvx 32+v23, r9, r1 - stxvx 32+v24, r10, r1 - stxvx 32+v25, r11, r1 - stxvx 32+v26, r12, r1 - li r6, 240 - li r7, 256 - li r8, 272 - li r9, 288 - stxvx 32+v27, r6, r1 - stxvx 32+v28, r7, r1 - stxvx 32+v29, r8, r1 - stxvx 32+v30, r9, r1 - - li r6, NQ_OFFSET - li r7, QINV_OFFSET - li r8, C1353_OFFSET - lxvx 32+V_NMKQ, r6, r4 - lxvx 32+V_QINV, r7, r4 - lxvx 32+V1353, r8, r4 - - vxor v3, v3, v3 - vspltish v4, 1 - - li r4, -128 - li r5, -112 - li r6, -96 - li r7, -80 - li r8, -64 - li r9, -48 - li r10, -32 - li r11, -16 - - MREDUCE_4X v27, v28, v29, v30 - MREDUCE_4X v13, v18, v23, v7 + stdu 1, -320(1) + mflr 0 + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 + + vxor 3, 3, 3 + 
vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 Write_8X - MREDUCE_4X v27, v28, v29, v30 - MREDUCE_4X v13, v18, v23, v7 + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 Write_8X - MREDUCE_4X v27, v28, v29, v30 - MREDUCE_4X v13, v18, v23, v7 + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 Write_8X - MREDUCE_4X v27, v28, v29, v30 - MREDUCE_4X v13, v18, v23, v7 + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 Write_8X - li r6, 128 - li r7, 144 - li r8, 160 - li r9, 176 - li r10, 192 - li r11, 208 - li r12, 224 - lxvx 32+v20, r6, r1 - lxvx 32+v21, r7, r1 - lxvx 32+v22, r8, r1 - lxvx 32+v23, r9, r1 - lxvx 32+v24, r10, r1 - lxvx 32+v25, r11, r1 - lxvx 32+v26, r12, r1 - li r6, 240 - li r7, 256 - li r8, 272 - li r9, 288 - lxvx 32+v27, r6, r1 - lxvx 32+v28, r7, r1 - lxvx 32+v29, r8, r1 - lxvx 32+v30, r9, r1 - mtlr r0 - addi r1, r1, 320 + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 + mtlr 0 + addi 1, 1, 320 blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index a6deedffc3..3b6892d867 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -33,168 +33,168 @@ .text .macro BREDUCE_4X _v0 _v1 _v2 _v3 - lxvd2x 32+v8, 0, r3 - lxvd2x 32+v12, r14, r3 - lxvd2x 32+v16, r15, r3 - lxvd2x 32+v20, r16, r3 - addi r3, r3, 64 - vmulosh v6, v8, V20159 - vmulesh v5, v8, V20159 - vmulosh v11, v12, V20159 - vmulesh v10, v12, V20159 - vmulosh v15, v16, V20159 - vmulesh v14, v16, V20159 - vmulosh v19, v20, V20159 - vmulesh v18, v20, V20159 - xxmrglw 32+v4, 32+v5, 32+v6 - xxmrghw 32+v5, 32+v5, 32+v6 - xxmrglw 32+v9, 32+v10, 32+v11 - xxmrghw 32+v10, 32+v10, 32+v11 - xxmrglw 32+v13, 32+v14, 32+v15 - xxmrghw 32+v14, 32+v14, 32+v15 - xxmrglw 32+v17, 32+v18, 32+v19 - xxmrghw 32+v18, 32+v18, 32+v19 - vadduwm v4, v4, V_25 - vadduwm v5, v5, V_25 - vadduwm v9, v9, V_25 - vadduwm v10, v10, V_25 - vadduwm v13, v13, V_25 - vadduwm v14, v14, V_25 - vadduwm v17, v17, V_25 - vadduwm v18, v18, V_25 - vsraw v4, v4, V_26 - vsraw v5, v5, V_26 - vsraw v9, v9, V_26 - vsraw v10, v10, V_26 - vsraw v13, v13, V_26 - vsraw v14, v14, V_26 - vsraw v17, v17, V_26 - vsraw v18, v18, V_26 - vpkuwum v4, v5, v4 - vsubuhm v4, v7, v4 - vpkuwum v9, v10, v9 - vsubuhm v9, v7, v9 - vpkuwum v13, v14, v13 - vsubuhm v13, v7, v13 - vpkuwum v17, v18, v17 - vsubuhm v17, v7, v17 - vmladduhm \_v0, v4, V_MKQ, v8 - vmladduhm \_v1, v9, V_MKQ, v12 - vmladduhm \_v2, v13, V_MKQ, v16 - vmladduhm \_v3, v17, V_MKQ, v20 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 
32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 .endm .macro Write_8X - stxvd2x 32+v21, r4, r3 - stxvd2x 32+v22, r5, r3 - stxvd2x 32+v23, r6, r3 - stxvd2x 32+v24, r7, r3 - stxvd2x 32+v4, r8, r3 - stxvd2x 32+v9, r9, r3 - stxvd2x 32+v13, r10, r3 - stxvd2x 32+v17, r11, r3 + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17, 11, 3 .endm /* * Conditional addition to get unsigned canonical representative */ .macro To_unsigned_16 - lxvd2x 32+v12, 0, r3 - lxvd2x 32+v13, r14, r3 - lxvd2x 32+v14, r15, r3 - lxvd2x 32+v15, r16, r3 - addi r3, r3, 64 - vsrh v1, v12, v10 - vsrh v0, v13, v10 - vsrh v3, v14, v10 - vsrh v2, v15, v10 - vadduhm v7, v12, v11 - vadduhm v8, v13, v11 - vadduhm v5, v14, v11 - vadduhm v6, v15, v11 - vcmpequh v1, v1, v9 - vcmpequh v0, v0, v9 - vcmpequh v3, v3, v9 - vcmpequh v2, v2, v9 - xxsel 32+v1, 32+v7,32+v12, 32+v1 - xxsel 32+v0, 32+v8,32+v13, 32+v0 - xxsel 32+v3, 32+v5,32+v14, 32+v3 - xxsel 32+v2, 32+v6,32+v15, 32+v2 - stxvd2x 32+v3, r10, r3 - stxvd2x 32+v2, r11, r3 - stxvd2x 32+v1, r8, r3 - stxvd2x 32+v0, r9, r3 + lxvd2x 32+12, 0, 3 + lxvd2x 32+13, 14, 3 + lxvd2x 32+14, 15, 3 + lxvd2x 32+15, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 
11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 .endm .align 4 .globl MLK_ASM_NAMESPACE(reduce_ppc) MLK_ASM_FN_SYMBOL(reduce_ppc) - stdu r1, -224(r1) - mflr r0 - std r14, 96(r1) - std r15, 104(r1) - std r16, 112(r1) - li r6, 128 - li r7, 144 - li r8, 160 - li r9, 176 - li r10, 192 - stxvx 32+v20, r6, r1 - stxvx 32+v21, r7, r1 - stxvx 32+v22, r8, r1 - stxvx 32+v23, r9, r1 - stxvx 32+v24, r10, r1 - - vxor v7, v7, v7 - - li r6, Q_OFFSET - li r7, C20159_OFFSET - lxvx 32+V_MKQ, r6, r4 - lxvx 32+V20159, r7, r4 + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + + vxor 7, 7, 7 + + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 vspltisw V_26, 13 vadduwm V_26, V_26, V_26 - vspltisw v4, 1 - vsubuwm v5, V_26, v4 - vslw V_25, v4, v5 - - li r4, -128 - li r5, -112 - li r6, -96 - li r7, -80 - li r8, -64 - li r9, -48 - li r10, -32 - li r11, -16 - - li r14, 16 - li r15, 32 - li r16, 48 - - BREDUCE_4X v21, v22, v23, v24 - BREDUCE_4X v4, v9, v13, v17 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 Write_8X - BREDUCE_4X v21, v22, v23, v24 - BREDUCE_4X v4, v9, v13, v17 + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 Write_8X - BREDUCE_4X v21, v22, v23, v24 - BREDUCE_4X v4, v9, v13, v17 + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 Write_8X - BREDUCE_4X v21, 
v22, v23, v24 - BREDUCE_4X v4, v9, v13, v17 + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 Write_8X /* * To unsigned canonical */ .align 4 - addi r3, r3, -512 - vxor v9, v9, v9 - vspltish v10, 15 - vmr v11, V_MKQ + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ To_unsigned_16 To_unsigned_16 @@ -205,21 +205,21 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) To_unsigned_16 To_unsigned_16 - ld r14, 96(r1) - ld r15, 104(r1) - ld r16, 112(r1) - li r6, 128 - li r7, 144 - li r8, 160 - li r9, 176 - li r10, 192 - lxvx 32+v20, r6, r1 - lxvx 32+v21, r7, r1 - lxvx 32+v22, r8, r1 - lxvx 32+v23, r9, r1 - lxvx 32+v24, r10, r1 - mtlr r0 - addi r1, r1, 224 + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + mtlr 0 + addi 1, 1, 224 blr /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. From a0f8486243ac4747feacb5c9e55ed897b2308041 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 15 Dec 2025 07:23:46 -0500 Subject: [PATCH 21/22] Used macro and removed ntt loops for len=2 and 4. Replaced more register numbers with C type variable names.
Signed-off-by: Danny Tsen --- dev/ppc64le/src/intt_ppc.S | 250 +++++++++++----------- dev/ppc64le/src/ntt_ppc.S | 268 +++++++++++++----------- mlkem/src/native/ppc64le/src/intt_ppc.S | 250 +++++++++++----------- mlkem/src/native/ppc64le/src/ntt_ppc.S | 268 +++++++++++++----------- 4 files changed, 558 insertions(+), 478 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 38b1777688..a8acfff070 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -64,6 +64,20 @@ #define vresult_mont3 23 #define vresult_mont4 28 +#define rinp 3 +#define dup_rinp 5 +#define qinp 4 +#define len_2 7 +#define zeta_inp 14 +#define a1_offset 9 +#define a2_offset 16 +#define a3_offset 18 +#define a4_offset 20 +#define b1_offset 10 +#define b2_offset 17 +#define b3_offset 19 +#define b4_offset 21 + .macro SAVE_REGS stdu 1, -352(1) mflr 0 @@ -177,14 +191,14 @@ * */ .macro Init_Coeffs_offset start next - li 9, \start /* first offset to j */ - add 10, 7, 9 /* J + len*2 */ - addi 16, 9, \next - addi 17, 10, \next - addi 18, 16, \next - addi 19, 17, \next - addi 20, 18, \next - addi 21, 19, \next + li a1_offset, \start /* first offset to j */ + add b1_offset, len_2, a1_offset /* J + len*2 */ + addi a2_offset, a1_offset, \next + addi b2_offset, b1_offset, \next + addi a3_offset, a2_offset, \next + addi b3_offset, b2_offset, \next + addi a4_offset, a3_offset, \next + addi b4_offset, b3_offset, \next .endm /* @@ -196,15 +210,15 @@ * r[j]: V21, V22, V23, V24 */ .macro Load_4Rjp - lxvd2x 32+vdata_b1, 3, 10 /* V8: vector r'0 */ - lxvd2x 32+vdata_b2, 3, 17 /* V12: vector for r'1 */ - lxvd2x 32+vdata_b3, 3, 19 /* V16: vector for r'2 */ - lxvd2x 32+vdata_b4, 3, 21 /* V20: vector for r'3 */ - - lxvd2x 32+vdata_a1, 3, 9 /* V21: vector r0 */ - lxvd2x 32+vdata_a2, 3, 16 /* V22: vector r1 */ - lxvd2x 32+vdata_a3, 3, 18 /* V23: vector r2 */ - lxvd2x 32+vdata_a4, 3, 20 /* V24: vector r3 */ + lxvd2x 32+vdata_b1, rinp, b1_offset /* V8: vector r'0 */ + 
lxvd2x 32+vdata_b2, rinp, b2_offset /* V12: vector for r'1 */ + lxvd2x 32+vdata_b3, rinp, b3_offset /* V16: vector for r'2 */ + lxvd2x 32+vdata_b4, rinp, b4_offset /* V20: vector for r'3 */ + + lxvd2x 32+vdata_a1, rinp, a1_offset /* V21: vector r0 */ + lxvd2x 32+vdata_a2, rinp, a2_offset /* V22: vector r1 */ + lxvd2x 32+vdata_a3, rinp, a3_offset /* V23: vector r2 */ + lxvd2x 32+vdata_a4, rinp, a4_offset /* V24: vector r3 */ .endm /* @@ -236,20 +250,20 @@ * in the proper order to match the multiplication. */ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 - lxvd2x 32+26, 10, 5 + lxvd2x 32+25, 0, dup_rinp + lxvd2x 32+26, 10, dup_rinp vmrgew vdata_b1, 25, 26 vmrgow vdata_a1, 25, 26 - lxvd2x 32+25, 11, 5 - lxvd2x 32+26, 12, 5 + lxvd2x 32+25, 11, dup_rinp + lxvd2x 32+26, 12, dup_rinp vmrgew vdata_b2, 25, 26 vmrgow vdata_a2, 25, 26 - lxvd2x 32+25, 15, 5 - lxvd2x 32+26, 16, 5 + lxvd2x 32+25, 15, dup_rinp + lxvd2x 32+26, 16, dup_rinp vmrgew vdata_b3, 25, 26 vmrgow vdata_a3, 25, 26 - lxvd2x 32+25, 17, 5 - lxvd2x 32+26, 18, 5 + lxvd2x 32+25, 17, dup_rinp + lxvd2x 32+26, 18, dup_rinp vmrgew vdata_b4, 25, 26 vmrgow vdata_a4, 25, 26 .endm @@ -270,20 +284,20 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L44Coeffs - lxvd2x 10, 0, 5 - lxvd2x 11, 10, 5 + lxvd2x 10, 0, dup_rinp + lxvd2x 11, 10, dup_rinp xxpermdi 32+vdata_b1, 11, 10, 3 xxpermdi 32+vdata_a1, 11, 10, 0 - lxvd2x 10, 11, 5 - lxvd2x 11, 12, 5 + lxvd2x 10, 11, dup_rinp + lxvd2x 11, 12, dup_rinp xxpermdi 32+vdata_b2, 11, 10, 3 xxpermdi 32+vdata_a2, 11, 10, 0 - lxvd2x 10, 15, 5 - lxvd2x 11, 16, 5 + lxvd2x 10, 15, dup_rinp + lxvd2x 11, 16, dup_rinp xxpermdi 32+vdata_b3, 11, 10, 3 xxpermdi 32+vdata_a3, 11, 10, 0 - lxvd2x 10, 17, 5 - lxvd2x 11, 18, 5 + lxvd2x 10, 17, dup_rinp + lxvd2x 11, 18, dup_rinp xxpermdi 32+vdata_b4, 11, 10, 3 xxpermdi 32+vdata_a4, 11, 10, 0 .endm @@ -353,16 +367,16 @@ */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 /* Modular multification bond by 2^16 * q in abs value */ - vmladduhm 15, vdata_mont1, \_vz0, 3 - vmladduhm 20, vdata_mont2, \_vz1, 3 - vmladduhm 27, vdata_mont3, \_vz2, 3 - vmladduhm 28, vdata_mont4, \_vz3, 3 + vmladduhm 15, vdata_mont1, \_vz0, rinp + vmladduhm 20, vdata_mont2, \_vz1, rinp + vmladduhm 27, vdata_mont3, \_vz2, rinp + vmladduhm 28, vdata_mont4, \_vz3, rinp /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs 14, vdata_mont1, \_vz0, 3 - vmhraddshs 19, vdata_mont2, \_vz1, 3 - vmhraddshs 24, vdata_mont3, \_vz2, 3 - vmhraddshs 29, vdata_mont4, \_vz3, 3 + vmhraddshs 14, vdata_mont1, \_vz0, rinp + vmhraddshs 19, vdata_mont2, \_vz1, rinp + vmhraddshs 24, vdata_mont3, \_vz2, rinp + vmhraddshs 29, vdata_mont4, \_vz3, rinp vmladduhm 15, 15, V_QINV, 3 vmladduhm 20, 20, V_QINV, 3 @@ -396,11 +410,11 @@ li 8, 16 li 11, 32 li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 8, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + lxvd2x 32+V_Z0, 0, zeta_inp + lxvd2x 32+V_Z1, 8, zeta_inp + lxvd2x 32+V_Z2, 11, zeta_inp + lxvd2x 32+V_Z3, 12, zeta_inp + addi zeta_inp, zeta_inp, 64 .endm /* @@ -415,38 +429,38 @@ .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 9 - stxvd2x \_vs1, 3, 16 - 
stxvd2x \_vs2, 3, 18 - stxvd2x \_vs3, 3, 20 + stxvd2x \_vs0, rinp, a1_offset + stxvd2x \_vs1, rinp, a2_offset + stxvd2x \_vs2, rinp, a3_offset + stxvd2x \_vs3, rinp, a4_offset .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 10 - stxvd2x \_vs1, 3, 17 - stxvd2x \_vs2, 3, 19 - stxvd2x \_vs3, 3, 21 + stxvd2x \_vs0, rinp, b1_offset + stxvd2x \_vs1, rinp, b2_offset + stxvd2x \_vs2, rinp, b3_offset + stxvd2x \_vs3, rinp, b4_offset .endm .macro Reload_4coeffs - lxvd2x 32+vdata_mont1, 0, 3 - lxvd2x 32+vdata_mont2, 10, 3 - lxvd2x 32+vdata_mont3, 11, 3 - lxvd2x 32+vdata_mont4, 12, 3 - addi 3, 3, 64 + lxvd2x 32+vdata_mont1, 0, rinp + lxvd2x 32+vdata_mont2, 10, rinp + lxvd2x 32+vdata_mont3, 11, rinp + lxvd2x 32+vdata_mont4, 12, rinp + addi rinp, rinp, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi 3, 3, -128 - stxvd2x \_vs0, 0, 3 - stxvd2x \_vs1, 10, 3 - stxvd2x \_vs2, 11, 3 - stxvd2x \_vs3, 12, 3 - stxvd2x \_vs4, 15, 3 - stxvd2x \_vs5, 16, 3 - stxvd2x \_vs6, 17, 3 - stxvd2x \_vs7, 18, 3 - addi 3, 3, 128 + addi rinp, rinp, -128 + stxvd2x \_vs0, 0, rinp + stxvd2x \_vs1, 10, rinp + stxvd2x \_vs2, 11, rinp + stxvd2x \_vs3, 12, rinp + stxvd2x \_vs4, 15, rinp + stxvd2x \_vs5, 16, rinp + stxvd2x \_vs6, 17, rinp + stxvd2x \_vs7, 18, rinp + addi rinp, rinp, 128 .endm /* @@ -466,14 +480,14 @@ xxpermdi 32+15, 32+24, 32+vresult_mont3, 0 xxpermdi 32+16, 32+29, 32+vresult_mont4, 3 xxpermdi 32+17, 32+29, 32+vresult_mont4, 0 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp .endm /* @@ -493,14 +507,14 @@ vmrgow 15, vresult_mont3, 24 vmrgew 16, vresult_mont4, 29 vmrgow 17, vresult_mont4, 
29 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp .endm /* @@ -607,10 +621,10 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* init vectors and constants Setup for Montgomery reduce */ - lxvx 0, 0, 4 + lxvx 0, 0, qinp li 10, QINV_OFFSET - lxvx 32+V_QINV, 10, 4 + lxvx 32+V_QINV, 10, qinp xxlxor 32+3, 32+3, 32+3 vspltish 4, 1 xxlor 2, 32+2, 32+2 /* QINV */ @@ -620,8 +634,8 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* Setup for Barrett reduce */ li 10, Q_OFFSET li 11, C20159_OFFSET - lxvx 6, 10, 4 /* V_MKQ */ - lxvx 32+V20159, 11, 4 /* V20159 */ + lxvx 6, 10, qinp /* V_MKQ */ + lxvx 32+V20159, 11, qinp /* V20159 */ vspltisw 8, 13 vadduwm 8, 8, 8 @@ -643,8 +657,8 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* * Montgomery reduce loops with constant 1441 */ - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 + addi zeta_inp, qinp, C1441_OFFSET + lvx V1441, 0, zeta_inp li 8, 4 mtctr 8 @@ -657,7 +671,7 @@ intt_ppc__Loopf: MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 bdnz intt_ppc__Loopf - addi 3, 3, -512 + addi rinp, rinp, -512 .align 4 /* @@ -674,18 +688,18 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array. Each loads 4 vectors. 
*/ - addi 14, 4, ZETA_INTT_OFFSET - li 7, 4 /* len * 2 */ - mr 5, 3 + addi zeta_inp, qinp, ZETA_INTT_OFFSET + li len_2, 4 /* len * 2 */ + mr dup_rinp, rinp INTT_REDUCE_L24 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L24 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L24 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L24 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 .align 4 /* @@ -701,17 +715,17 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array. Each loads 4 vectors. */ - mr 5, 3 - li 7, 8 + mr dup_rinp, rinp + li len_2, 8 INTT_REDUCE_L44 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L44 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L44 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L44 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 .align 4 /* @@ -725,7 +739,7 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array */ - li 7, 16 + li len_2, 16 INTT_REDUCE_4X 0, 32 INTT_REDUCE_4X 128, 32 @@ -744,16 +758,16 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array */ - li 7, 32 + li len_2, 32 INTT_REDUCE_4X 0, 64 - addi 14, 14, -64 + addi zeta_inp, zeta_inp, -64 INTT_REDUCE_4X 16, 64 INTT_REDUCE_4X 256, 64 - addi 14, 14, -64 + addi zeta_inp, zeta_inp, -64 INTT_REDUCE_4X 272, 64 .align 4 @@ -768,16 +782,16 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array */ - li 7, 64 + li len_2, 64 INTT_REDUCE_L567 0, 16 - addi 14, 14, 16 + addi zeta_inp, zeta_inp, 16 INTT_REDUCE_L567 128, 16 - addi 14, 14, 16 + addi zeta_inp, zeta_inp, 16 INTT_REDUCE_L567 256, 16 - addi 14, 14, 16 + addi zeta_inp, zeta_inp, 16 INTT_REDUCE_L567 384, 16 - addi 14, 14, 16 + addi zeta_inp, zeta_inp, 16 .align 4 /* @@ -791,14 +805,14 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array */ - li 7, 128 + li len_2, 128 INTT_REDUCE_L567 0, 16 INTT_REDUCE_L567 64, 16 - addi 14, 14, 16 + addi zeta_inp, zeta_inp, 16 INTT_REDUCE_L567 256, 16 INTT_REDUCE_L567 320, 16 - addi 14, 
14, 16 + addi zeta_inp, zeta_inp, 16 .align 4 /* @@ -812,7 +826,7 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array */ - li 7, 256 /* len*2 */ + li len_2, 256 /* len*2 */ INTT_REDUCE_L567 0, 16 INTT_REDUCE_L567 64, 16 diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 32bfa56fdf..a2a8120696 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -43,6 +43,20 @@ #define vresult_a4 30 #define vresult_b4 31 +#define rinp 3 +#define dup_rinp 5 +#define qinp 4 +#define len_2 7 +#define zeta_inp 14 +#define a1_offset 9 +#define a2_offset 16 +#define a3_offset 18 +#define a4_offset 20 +#define b1_offset 10 +#define b2_offset 17 +#define b3_offset 19 +#define b4_offset 21 + .machine "any" .text @@ -143,14 +157,14 @@ * */ .macro Init_Coeffs_offset start next - li 9, \start /* first offset to j */ - add 10, 7, 9 /* J + len*2 */ - addi 16, 9, \next - addi 17, 10, \next - addi 18, 16, \next - addi 19, 17, \next - addi 20, 18, \next - addi 21, 19, \next + li a1_offset, \start /* first offset to j */ + add b1_offset, len_2, a1_offset /* J + len*2 */ + addi a2_offset, a1_offset, \next + addi b2_offset, b1_offset, \next + addi a3_offset, a2_offset, \next + addi b3_offset, b2_offset, \next + addi a4_offset, a3_offset, \next + addi b4_offset, b3_offset, \next .endm /* @@ -158,10 +172,10 @@ * r[j+len]: V13, V18, V23, V28 */ .macro Load_4Rjp - lxvd2x 32+vdata_b1, 3, 10 /* V13: vector r'0 */ - lxvd2x 32+vdata_b2, 3, 17 /* V18: vector for r'1 */ - lxvd2x 32+vdata_b3, 3, 19 /* V23: vector for r'2 */ - lxvd2x 32+vdata_b4, 3, 21 /* V28: vector for r'3 */ + lxvd2x 32+vdata_b1, rinp, b1_offset /* V13: vector r'0 */ + lxvd2x 32+vdata_b2, rinp, b2_offset /* V18: vector for r'1 */ + lxvd2x 32+vdata_b3, rinp, b3_offset /* V23: vector for r'2 */ + lxvd2x 32+vdata_b4, rinp, b4_offset /* V28: vector for r'3 */ .endm /* @@ -192,20 +206,20 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 - lxvd2x 32+26, 10, 5 + lxvd2x 32+25, 0, dup_rinp + lxvd2x 32+26, 10, dup_rinp vmrgew vdata_b1, 25, 26 vmrgow vdata_a1, 25, 26 - lxvd2x 32+25, 11, 5 - lxvd2x 32+26, 12, 5 + lxvd2x 32+25, 11, dup_rinp + lxvd2x 32+26, 12, dup_rinp vmrgew vdata_b2, 25, 26 vmrgow vdata_a2, 25, 26 - lxvd2x 32+25, 15, 5 - lxvd2x 32+26, 16, 5 + lxvd2x 32+25, 15, dup_rinp + lxvd2x 32+26, 16, dup_rinp vmrgew vdata_b3, 25, 26 vmrgow vdata_a3, 25, 26 - lxvd2x 32+25, 17, 5 - lxvd2x 32+26, 18, 5 + lxvd2x 32+25, 17, dup_rinp + lxvd2x 32+26, 18, dup_rinp vmrgew vdata_b4, 25, 26 vmrgow vdata_a4, 25, 26 .endm @@ -226,20 +240,20 @@ * in the proper order to match the multiplication. */ .macro Load_L44Coeffs - lxvd2x 1, 0, 5 - lxvd2x 2, 10, 5 + lxvd2x 1, 0, dup_rinp + lxvd2x 2, 10, dup_rinp xxpermdi 32+vdata_b1, 2, 1, 3 xxpermdi 32+vdata_a1, 2, 1, 0 - lxvd2x 3, 11, 5 - lxvd2x 4, 12, 5 + lxvd2x 3, 11, dup_rinp + lxvd2x 4, 12, dup_rinp xxpermdi 32+vdata_b2, 4, 3, 3 xxpermdi 32+vdata_a2, 4, 3, 0 - lxvd2x 1, 15, 5 - lxvd2x 2, 16, 5 + lxvd2x 1, 15, dup_rinp + lxvd2x 2, 16, dup_rinp xxpermdi 32+vdata_b3, 2, 1, 3 xxpermdi 32+vdata_a3, 2, 1, 0 - lxvd2x 3, 17, 5 - lxvd2x 4, 18, 5 + lxvd2x 3, 17, dup_rinp + lxvd2x 4, 18, dup_rinp xxpermdi 32+vdata_b4, 4, 3, 3 xxpermdi 32+vdata_a4, 4, 3, 0 .endm @@ -255,21 +269,21 @@ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 /* fqmul = zeta * coefficient Modular multification bond by 2^16 * q in abs value */ - vmladduhm 15, vdata_b1, \_vz0, 3 - vmladduhm 20, vdata_b2, \_vz1, 3 - vmladduhm 25, vdata_b3, \_vz2, 3 - vmladduhm 30, vdata_b4, \_vz3, 3 + vmladduhm 15, vdata_b1, \_vz0, rinp + vmladduhm 20, vdata_b2, \_vz1, rinp + vmladduhm 25, vdata_b3, \_vz2, rinp + vmladduhm 30, vdata_b4, \_vz3, rinp /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs 14, vdata_b1, \_vz0, 3 - vmhraddshs 19, vdata_b2, \_vz1, 3 - vmhraddshs 24, vdata_b3, \_vz2, 3 - vmhraddshs 29, vdata_b4, \_vz3, 3 + vmhraddshs 14, vdata_b1, 
\_vz0, rinp + vmhraddshs 19, vdata_b2, \_vz1, rinp + vmhraddshs 24, vdata_b3, \_vz2, rinp + vmhraddshs 29, vdata_b4, \_vz3, rinp - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 + vmladduhm 15, 15, V_QINV, rinp + vmladduhm 20, 20, V_QINV, rinp + vmladduhm 25, 25, V_QINV, rinp + vmladduhm 30, 30, V_QINV, rinp vmhraddshs 15, 15, V_NMKQ, 14 vmhraddshs 20, 20, V_NMKQ, 19 @@ -289,10 +303,10 @@ * r[j]: V12, V17, V22, V27 */ .macro Load_4Rj - lxvd2x 32+vdata_a1, 3, 9 /* V12: vector r0 */ - lxvd2x 32+vdata_a2, 3, 16 /* V17: vector r1 */ - lxvd2x 32+vdata_a3, 3, 18 /* V22: vector r2 */ - lxvd2x 32+vdata_a4, 3, 20 /* V27: vector r3 */ + lxvd2x 32+vdata_a1, rinp, a1_offset /* V12: vector r0 */ + lxvd2x 32+vdata_a2, rinp, a2_offset /* V17: vector r1 */ + lxvd2x 32+vdata_a3, rinp, a3_offset /* V22: vector r2 */ + lxvd2x 32+vdata_a4, rinp, a4_offset /* V27: vector r3 */ .endm /* @@ -318,14 +332,14 @@ .endm .macro Write_One - stxvd2x 32+vresult_a1, 3, 9 - stxvd2x 32+vresult_b1, 3, 10 - stxvd2x 32+vresult_a2, 3, 16 - stxvd2x 32+vresult_b2, 3, 17 - stxvd2x 32+vresult_a3, 3, 18 - stxvd2x 32+vresult_b3, 3, 19 - stxvd2x 32+vresult_a4, 3, 20 - stxvd2x 32+vresult_b4, 3, 21 + stxvd2x 32+vresult_a1, rinp, a1_offset + stxvd2x 32+vresult_b1, rinp, b1_offset + stxvd2x 32+vresult_a2, rinp, a2_offset + stxvd2x 32+vresult_b2, rinp, b2_offset + stxvd2x 32+vresult_a3, rinp, a3_offset + stxvd2x 32+vresult_b3, rinp, b3_offset + stxvd2x 32+vresult_a4, rinp, a4_offset + stxvd2x 32+vresult_b4, rinp, b4_offset .endm /* @@ -342,14 +356,14 @@ xxpermdi 5, 32+vresult_a3, 32+vresult_b3, 0 xxpermdi 6, 32+vresult_a4, 32+vresult_b4, 3 xxpermdi 7, 32+vresult_a4, 32+vresult_b4, 0 - stxvd2x 0, 0, 5 - stxvd2x 1, 10, 5 - stxvd2x 2, 11, 5 - stxvd2x 3, 12, 5 - stxvd2x 4, 15, 5 - stxvd2x 5, 16, 5 - stxvd2x 6, 17, 5 - stxvd2x 7, 18, 5 + stxvd2x 0, 0, dup_rinp + stxvd2x 1, 10, dup_rinp + stxvd2x 2, 11, dup_rinp + stxvd2x 3, 12, dup_rinp + stxvd2x 4, 
15, dup_rinp + stxvd2x 5, 16, dup_rinp + stxvd2x 6, 17, dup_rinp + stxvd2x 7, 18, dup_rinp .endm /* @@ -366,25 +380,25 @@ vmrgow 15, vresult_b3, vresult_a3 vmrgew 16, vresult_b4, vresult_a4 vmrgow 17, vresult_b4, vresult_a4 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp .endm .macro Load_next_4zetas li 10, 16 li 11, 32 li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 10, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + lxvd2x 32+V_Z0, 0, zeta_inp + lxvd2x 32+V_Z1, 10, zeta_inp + lxvd2x 32+V_Z2, 11, zeta_inp + lxvd2x 32+V_Z3, 12, zeta_inp + addi zeta_inp, zeta_inp, 64 .endm /* @@ -398,6 +412,32 @@ xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 .endm +/* + * NTT layer Len=2. + */ +.macro NTT_REDUCE_L24 + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi dup_rinp, dup_rinp, 128 +.endm + +/* + * NTT layer Len=4. + */ +.macro NTT_REDUCE_L44 + Load_next_4zetas + Perm_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi dup_rinp, dup_rinp, 128 +.endm + +/* + * NTT other layers. 
+ */ .macro NTT_MREDUCE_4X start next _vz0 _vz1 _vz2 _vz3 Load_4Coeffs \start, \next MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 @@ -448,16 +488,16 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) SAVE_REGS /* load MLKEM_Q */ - lvx V_NMKQ,0,4 + lvx V_NMKQ,0,qinp /* Register 14 as pointer to zetas array */ - addi 14, 4, ZETA_NTT_OFFSET + addi zeta_inp, qinp, ZETA_NTT_OFFSET vxor 3, 3, 3 vspltish 4, 1 li 10, QINV_OFFSET - lvx V_QINV, 10, 4 + lvx V_QINV, 10, qinp .align 4 /* @@ -471,9 +511,9 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array */ - li 7, 256 /* len * 2 */ - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li len_2, 256 /* len * 2 */ + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -492,14 +532,14 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array */ - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li len_2, 128 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -515,21 +555,21 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array */ - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li len_2, 64 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 
NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 @@ -544,7 +584,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array */ - li 7, 32 + li len_2, 32 Load_next_4zetas NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 @@ -565,7 +605,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array */ - li 7, 16 + li len_2, 16 Load_next_4zetas NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 @@ -591,10 +631,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array. Each loads 4 vectors. */ - li 15, 4 - mtctr 15 - mr 5, 3 /* Let r5 points to coefficient array */ - li 7, 8 + mr dup_rinp, rinp /* Let r5 points to coefficient array */ + li len_2, 8 li 10, 16 li 11, 32 @@ -605,15 +643,10 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 18, 112 .align 4 -ntt_ppc__Len4: - Load_next_4zetas - Perm_4zetas - Load_L44Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL44 - addi 5, 5, 128 - - bdnz ntt_ppc__Len4 + NTT_REDUCE_L44 + NTT_REDUCE_L44 + NTT_REDUCE_L44 + NTT_REDUCE_L44 /* * 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 @@ -628,21 +661,14 @@ ntt_ppc__Len4: * * These are indexes to the 16 bits array. Each loads 4 vectors. 
*/ - - li 8, 4 - mtctr 8 - mr 5, 3 /* Let r5 points to coefficient array */ - li 7, 4 + mr dup_rinp, rinp /* Let r5 points to coefficient array */ + li len_2, 4 .align 4 -ntt_ppc__Len2: - Load_next_4zetas - Load_L24Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL24 - addi 5, 5, 128 - - bdnz ntt_ppc__Len2 + NTT_REDUCE_L24 + NTT_REDUCE_L24 + NTT_REDUCE_L24 + NTT_REDUCE_L24 RESTORE_REGS blr diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 4aab1e7c3e..b9169366dd 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -63,6 +63,20 @@ #define vresult_mont3 23 #define vresult_mont4 28 +#define rinp 3 +#define dup_rinp 5 +#define qinp 4 +#define len_2 7 +#define zeta_inp 14 +#define a1_offset 9 +#define a2_offset 16 +#define a3_offset 18 +#define a4_offset 20 +#define b1_offset 10 +#define b2_offset 17 +#define b3_offset 19 +#define b4_offset 21 + .macro SAVE_REGS stdu 1, -352(1) mflr 0 @@ -176,14 +190,14 @@ * */ .macro Init_Coeffs_offset start next - li 9, \start /* first offset to j */ - add 10, 7, 9 /* J + len*2 */ - addi 16, 9, \next - addi 17, 10, \next - addi 18, 16, \next - addi 19, 17, \next - addi 20, 18, \next - addi 21, 19, \next + li a1_offset, \start /* first offset to j */ + add b1_offset, len_2, a1_offset /* J + len*2 */ + addi a2_offset, a1_offset, \next + addi b2_offset, b1_offset, \next + addi a3_offset, a2_offset, \next + addi b3_offset, b2_offset, \next + addi a4_offset, a3_offset, \next + addi b4_offset, b3_offset, \next .endm /* @@ -195,15 +209,15 @@ * r[j]: V21, V22, V23, V24 */ .macro Load_4Rjp - lxvd2x 32+vdata_b1, 3, 10 /* V8: vector r'0 */ - lxvd2x 32+vdata_b2, 3, 17 /* V12: vector for r'1 */ - lxvd2x 32+vdata_b3, 3, 19 /* V16: vector for r'2 */ - lxvd2x 32+vdata_b4, 3, 21 /* V20: vector for r'3 */ - - lxvd2x 32+vdata_a1, 3, 9 /* V21: vector r0 */ - lxvd2x 32+vdata_a2, 3, 16 /* V22: vector r1 */ - lxvd2x 32+vdata_a3, 3, 18 /* V23: vector r2 */ 
- lxvd2x 32+vdata_a4, 3, 20 /* V24: vector r3 */ + lxvd2x 32+vdata_b1, rinp, b1_offset /* V8: vector r'0 */ + lxvd2x 32+vdata_b2, rinp, b2_offset /* V12: vector for r'1 */ + lxvd2x 32+vdata_b3, rinp, b3_offset /* V16: vector for r'2 */ + lxvd2x 32+vdata_b4, rinp, b4_offset /* V20: vector for r'3 */ + + lxvd2x 32+vdata_a1, rinp, a1_offset /* V21: vector r0 */ + lxvd2x 32+vdata_a2, rinp, a2_offset /* V22: vector r1 */ + lxvd2x 32+vdata_a3, rinp, a3_offset /* V23: vector r2 */ + lxvd2x 32+vdata_a4, rinp, a4_offset /* V24: vector r3 */ .endm /* @@ -235,20 +249,20 @@ * in the proper order to match the multiplication. */ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 - lxvd2x 32+26, 10, 5 + lxvd2x 32+25, 0, dup_rinp + lxvd2x 32+26, 10, dup_rinp vmrgew vdata_b1, 25, 26 vmrgow vdata_a1, 25, 26 - lxvd2x 32+25, 11, 5 - lxvd2x 32+26, 12, 5 + lxvd2x 32+25, 11, dup_rinp + lxvd2x 32+26, 12, dup_rinp vmrgew vdata_b2, 25, 26 vmrgow vdata_a2, 25, 26 - lxvd2x 32+25, 15, 5 - lxvd2x 32+26, 16, 5 + lxvd2x 32+25, 15, dup_rinp + lxvd2x 32+26, 16, dup_rinp vmrgew vdata_b3, 25, 26 vmrgow vdata_a3, 25, 26 - lxvd2x 32+25, 17, 5 - lxvd2x 32+26, 18, 5 + lxvd2x 32+25, 17, dup_rinp + lxvd2x 32+26, 18, dup_rinp vmrgew vdata_b4, 25, 26 vmrgow vdata_a4, 25, 26 .endm @@ -269,20 +283,20 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L44Coeffs - lxvd2x 10, 0, 5 - lxvd2x 11, 10, 5 + lxvd2x 10, 0, dup_rinp + lxvd2x 11, 10, dup_rinp xxpermdi 32+vdata_b1, 11, 10, 3 xxpermdi 32+vdata_a1, 11, 10, 0 - lxvd2x 10, 11, 5 - lxvd2x 11, 12, 5 + lxvd2x 10, 11, dup_rinp + lxvd2x 11, 12, dup_rinp xxpermdi 32+vdata_b2, 11, 10, 3 xxpermdi 32+vdata_a2, 11, 10, 0 - lxvd2x 10, 15, 5 - lxvd2x 11, 16, 5 + lxvd2x 10, 15, dup_rinp + lxvd2x 11, 16, dup_rinp xxpermdi 32+vdata_b3, 11, 10, 3 xxpermdi 32+vdata_a3, 11, 10, 0 - lxvd2x 10, 17, 5 - lxvd2x 11, 18, 5 + lxvd2x 10, 17, dup_rinp + lxvd2x 11, 18, dup_rinp xxpermdi 32+vdata_b4, 11, 10, 3 xxpermdi 32+vdata_a4, 11, 10, 0 .endm @@ -352,16 +366,16 @@ */ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 /* Modular multification bond by 2^16 * q in abs value */ - vmladduhm 15, vdata_mont1, \_vz0, 3 - vmladduhm 20, vdata_mont2, \_vz1, 3 - vmladduhm 27, vdata_mont3, \_vz2, 3 - vmladduhm 28, vdata_mont4, \_vz3, 3 + vmladduhm 15, vdata_mont1, \_vz0, rinp + vmladduhm 20, vdata_mont2, \_vz1, rinp + vmladduhm 27, vdata_mont3, \_vz2, rinp + vmladduhm 28, vdata_mont4, \_vz3, rinp /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs 14, vdata_mont1, \_vz0, 3 - vmhraddshs 19, vdata_mont2, \_vz1, 3 - vmhraddshs 24, vdata_mont3, \_vz2, 3 - vmhraddshs 29, vdata_mont4, \_vz3, 3 + vmhraddshs 14, vdata_mont1, \_vz0, rinp + vmhraddshs 19, vdata_mont2, \_vz1, rinp + vmhraddshs 24, vdata_mont3, \_vz2, rinp + vmhraddshs 29, vdata_mont4, \_vz3, rinp vmladduhm 15, 15, V_QINV, 3 vmladduhm 20, 20, V_QINV, 3 @@ -395,11 +409,11 @@ li 8, 16 li 11, 32 li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 8, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + lxvd2x 32+V_Z0, 0, zeta_inp + lxvd2x 32+V_Z1, 8, zeta_inp + lxvd2x 32+V_Z2, 11, zeta_inp + lxvd2x 32+V_Z3, 12, zeta_inp + addi zeta_inp, zeta_inp, 64 .endm /* @@ -414,38 +428,38 @@ .endm .macro Write_B4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 9 - stxvd2x \_vs1, 3, 16 - 
stxvd2x \_vs2, 3, 18 - stxvd2x \_vs3, 3, 20 + stxvd2x \_vs0, rinp, a1_offset + stxvd2x \_vs1, rinp, a2_offset + stxvd2x \_vs2, rinp, a3_offset + stxvd2x \_vs3, rinp, a4_offset .endm .macro Write_M4C _vs0 _vs1 _vs2 _vs3 - stxvd2x \_vs0, 3, 10 - stxvd2x \_vs1, 3, 17 - stxvd2x \_vs2, 3, 19 - stxvd2x \_vs3, 3, 21 + stxvd2x \_vs0, rinp, b1_offset + stxvd2x \_vs1, rinp, b2_offset + stxvd2x \_vs2, rinp, b3_offset + stxvd2x \_vs3, rinp, b4_offset .endm .macro Reload_4coeffs - lxvd2x 32+vdata_mont1, 0, 3 - lxvd2x 32+vdata_mont2, 10, 3 - lxvd2x 32+vdata_mont3, 11, 3 - lxvd2x 32+vdata_mont4, 12, 3 - addi 3, 3, 64 + lxvd2x 32+vdata_mont1, 0, rinp + lxvd2x 32+vdata_mont2, 10, rinp + lxvd2x 32+vdata_mont3, 11, rinp + lxvd2x 32+vdata_mont4, 12, rinp + addi rinp, rinp, 64 .endm .macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 - addi 3, 3, -128 - stxvd2x \_vs0, 0, 3 - stxvd2x \_vs1, 10, 3 - stxvd2x \_vs2, 11, 3 - stxvd2x \_vs3, 12, 3 - stxvd2x \_vs4, 15, 3 - stxvd2x \_vs5, 16, 3 - stxvd2x \_vs6, 17, 3 - stxvd2x \_vs7, 18, 3 - addi 3, 3, 128 + addi rinp, rinp, -128 + stxvd2x \_vs0, 0, rinp + stxvd2x \_vs1, 10, rinp + stxvd2x \_vs2, 11, rinp + stxvd2x \_vs3, 12, rinp + stxvd2x \_vs4, 15, rinp + stxvd2x \_vs5, 16, rinp + stxvd2x \_vs6, 17, rinp + stxvd2x \_vs7, 18, rinp + addi rinp, rinp, 128 .endm /* @@ -465,14 +479,14 @@ xxpermdi 32+15, 32+24, 32+vresult_mont3, 0 xxpermdi 32+16, 32+29, 32+vresult_mont4, 3 xxpermdi 32+17, 32+29, 32+vresult_mont4, 0 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp .endm /* @@ -492,14 +506,14 @@ vmrgow 15, vresult_mont3, 24 vmrgew 16, vresult_mont4, 29 vmrgow 17, vresult_mont4, 
29 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp .endm /* @@ -606,10 +620,10 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* init vectors and constants Setup for Montgomery reduce */ - lxvx 0, 0, 4 + lxvx 0, 0, qinp li 10, QINV_OFFSET - lxvx 32+V_QINV, 10, 4 + lxvx 32+V_QINV, 10, qinp xxlxor 32+3, 32+3, 32+3 vspltish 4, 1 xxlor 2, 32+2, 32+2 /* QINV */ @@ -619,8 +633,8 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* Setup for Barrett reduce */ li 10, Q_OFFSET li 11, C20159_OFFSET - lxvx 6, 10, 4 /* V_MKQ */ - lxvx 32+V20159, 11, 4 /* V20159 */ + lxvx 6, 10, qinp /* V_MKQ */ + lxvx 32+V20159, 11, qinp /* V20159 */ vspltisw 8, 13 vadduwm 8, 8, 8 @@ -642,8 +656,8 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* * Montgomery reduce loops with constant 1441 */ - addi 14, 4, C1441_OFFSET - lvx V1441, 0, 14 + addi zeta_inp, qinp, C1441_OFFSET + lvx V1441, 0, zeta_inp li 8, 4 mtctr 8 @@ -656,7 +670,7 @@ intt_ppc__Loopf: MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 bdnz intt_ppc__Loopf - addi 3, 3, -512 + addi rinp, rinp, -512 .align 4 /* @@ -673,18 +687,18 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array. Each loads 4 vectors. 
*/ - addi 14, 4, ZETA_INTT_OFFSET - li 7, 4 /* len * 2 */ - mr 5, 3 + addi zeta_inp, qinp, ZETA_INTT_OFFSET + li len_2, 4 /* len * 2 */ + mr dup_rinp, rinp INTT_REDUCE_L24 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L24 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L24 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L24 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 .align 4 /* @@ -700,17 +714,17 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array. Each loads 4 vectors. */ - mr 5, 3 - li 7, 8 + mr dup_rinp, rinp + li len_2, 8 INTT_REDUCE_L44 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L44 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L44 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 INTT_REDUCE_L44 - addi 5, 5, 128 + addi dup_rinp, dup_rinp, 128 .align 4 /* @@ -724,7 +738,7 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array */ - li 7, 16 + li len_2, 16 INTT_REDUCE_4X 0, 32 INTT_REDUCE_4X 128, 32 @@ -743,16 +757,16 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array */ - li 7, 32 + li len_2, 32 INTT_REDUCE_4X 0, 64 - addi 14, 14, -64 + addi zeta_inp, zeta_inp, -64 INTT_REDUCE_4X 16, 64 INTT_REDUCE_4X 256, 64 - addi 14, 14, -64 + addi zeta_inp, zeta_inp, -64 INTT_REDUCE_4X 272, 64 .align 4 @@ -767,16 +781,16 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array */ - li 7, 64 + li len_2, 64 INTT_REDUCE_L567 0, 16 - addi 14, 14, 16 + addi zeta_inp, zeta_inp, 16 INTT_REDUCE_L567 128, 16 - addi 14, 14, 16 + addi zeta_inp, zeta_inp, 16 INTT_REDUCE_L567 256, 16 - addi 14, 14, 16 + addi zeta_inp, zeta_inp, 16 INTT_REDUCE_L567 384, 16 - addi 14, 14, 16 + addi zeta_inp, zeta_inp, 16 .align 4 /* @@ -790,14 +804,14 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array */ - li 7, 128 + li len_2, 128 INTT_REDUCE_L567 0, 16 INTT_REDUCE_L567 64, 16 - addi 14, 14, 16 + addi zeta_inp, zeta_inp, 16 INTT_REDUCE_L567 256, 16 INTT_REDUCE_L567 320, 16 - addi 14, 
14, 16 + addi zeta_inp, zeta_inp, 16 .align 4 /* @@ -811,7 +825,7 @@ intt_ppc__Loopf: * * These are indexes to the 16 bits array */ - li 7, 256 /* len*2 */ + li len_2, 256 /* len*2 */ INTT_REDUCE_L567 0, 16 INTT_REDUCE_L567 64, 16 diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 88f1b1f60d..ca02979be7 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -42,6 +42,20 @@ #define vresult_a4 30 #define vresult_b4 31 +#define rinp 3 +#define dup_rinp 5 +#define qinp 4 +#define len_2 7 +#define zeta_inp 14 +#define a1_offset 9 +#define a2_offset 16 +#define a3_offset 18 +#define a4_offset 20 +#define b1_offset 10 +#define b2_offset 17 +#define b3_offset 19 +#define b4_offset 21 + .machine "any" .text @@ -142,14 +156,14 @@ * */ .macro Init_Coeffs_offset start next - li 9, \start /* first offset to j */ - add 10, 7, 9 /* J + len*2 */ - addi 16, 9, \next - addi 17, 10, \next - addi 18, 16, \next - addi 19, 17, \next - addi 20, 18, \next - addi 21, 19, \next + li a1_offset, \start /* first offset to j */ + add b1_offset, len_2, a1_offset /* J + len*2 */ + addi a2_offset, a1_offset, \next + addi b2_offset, b1_offset, \next + addi a3_offset, a2_offset, \next + addi b3_offset, b2_offset, \next + addi a4_offset, a3_offset, \next + addi b4_offset, b3_offset, \next .endm /* @@ -157,10 +171,10 @@ * r[j+len]: V13, V18, V23, V28 */ .macro Load_4Rjp - lxvd2x 32+vdata_b1, 3, 10 /* V13: vector r'0 */ - lxvd2x 32+vdata_b2, 3, 17 /* V18: vector for r'1 */ - lxvd2x 32+vdata_b3, 3, 19 /* V23: vector for r'2 */ - lxvd2x 32+vdata_b4, 3, 21 /* V28: vector for r'3 */ + lxvd2x 32+vdata_b1, rinp, b1_offset /* V13: vector r'0 */ + lxvd2x 32+vdata_b2, rinp, b2_offset /* V18: vector for r'1 */ + lxvd2x 32+vdata_b3, rinp, b3_offset /* V23: vector for r'2 */ + lxvd2x 32+vdata_b4, rinp, b4_offset /* V28: vector for r'3 */ .endm /* @@ -191,20 +205,20 @@ * in the proper order to match the multiplication. 
*/ .macro Load_L24Coeffs - lxvd2x 32+25, 0, 5 - lxvd2x 32+26, 10, 5 + lxvd2x 32+25, 0, dup_rinp + lxvd2x 32+26, 10, dup_rinp vmrgew vdata_b1, 25, 26 vmrgow vdata_a1, 25, 26 - lxvd2x 32+25, 11, 5 - lxvd2x 32+26, 12, 5 + lxvd2x 32+25, 11, dup_rinp + lxvd2x 32+26, 12, dup_rinp vmrgew vdata_b2, 25, 26 vmrgow vdata_a2, 25, 26 - lxvd2x 32+25, 15, 5 - lxvd2x 32+26, 16, 5 + lxvd2x 32+25, 15, dup_rinp + lxvd2x 32+26, 16, dup_rinp vmrgew vdata_b3, 25, 26 vmrgow vdata_a3, 25, 26 - lxvd2x 32+25, 17, 5 - lxvd2x 32+26, 18, 5 + lxvd2x 32+25, 17, dup_rinp + lxvd2x 32+26, 18, dup_rinp vmrgew vdata_b4, 25, 26 vmrgow vdata_a4, 25, 26 .endm @@ -225,20 +239,20 @@ * in the proper order to match the multiplication. */ .macro Load_L44Coeffs - lxvd2x 1, 0, 5 - lxvd2x 2, 10, 5 + lxvd2x 1, 0, dup_rinp + lxvd2x 2, 10, dup_rinp xxpermdi 32+vdata_b1, 2, 1, 3 xxpermdi 32+vdata_a1, 2, 1, 0 - lxvd2x 3, 11, 5 - lxvd2x 4, 12, 5 + lxvd2x 3, 11, dup_rinp + lxvd2x 4, 12, dup_rinp xxpermdi 32+vdata_b2, 4, 3, 3 xxpermdi 32+vdata_a2, 4, 3, 0 - lxvd2x 1, 15, 5 - lxvd2x 2, 16, 5 + lxvd2x 1, 15, dup_rinp + lxvd2x 2, 16, dup_rinp xxpermdi 32+vdata_b3, 2, 1, 3 xxpermdi 32+vdata_a3, 2, 1, 0 - lxvd2x 3, 17, 5 - lxvd2x 4, 18, 5 + lxvd2x 3, 17, dup_rinp + lxvd2x 4, 18, dup_rinp xxpermdi 32+vdata_b4, 4, 3, 3 xxpermdi 32+vdata_a4, 4, 3, 0 .endm @@ -254,21 +268,21 @@ .macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 /* fqmul = zeta * coefficient Modular multification bond by 2^16 * q in abs value */ - vmladduhm 15, vdata_b1, \_vz0, 3 - vmladduhm 20, vdata_b2, \_vz1, 3 - vmladduhm 25, vdata_b3, \_vz2, 3 - vmladduhm 30, vdata_b4, \_vz3, 3 + vmladduhm 15, vdata_b1, \_vz0, rinp + vmladduhm 20, vdata_b2, \_vz1, rinp + vmladduhm 25, vdata_b3, \_vz2, rinp + vmladduhm 30, vdata_b4, \_vz3, rinp /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs 14, vdata_b1, \_vz0, 3 - vmhraddshs 19, vdata_b2, \_vz1, 3 - vmhraddshs 24, vdata_b3, \_vz2, 3 - vmhraddshs 29, vdata_b4, \_vz3, 3 + vmhraddshs 14, vdata_b1, 
\_vz0, rinp + vmhraddshs 19, vdata_b2, \_vz1, rinp + vmhraddshs 24, vdata_b3, \_vz2, rinp + vmhraddshs 29, vdata_b4, \_vz3, rinp - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 25, V_QINV, 3 - vmladduhm 30, 30, V_QINV, 3 + vmladduhm 15, 15, V_QINV, rinp + vmladduhm 20, 20, V_QINV, rinp + vmladduhm 25, 25, V_QINV, rinp + vmladduhm 30, 30, V_QINV, rinp vmhraddshs 15, 15, V_NMKQ, 14 vmhraddshs 20, 20, V_NMKQ, 19 @@ -288,10 +302,10 @@ * r[j]: V12, V17, V22, V27 */ .macro Load_4Rj - lxvd2x 32+vdata_a1, 3, 9 /* V12: vector r0 */ - lxvd2x 32+vdata_a2, 3, 16 /* V17: vector r1 */ - lxvd2x 32+vdata_a3, 3, 18 /* V22: vector r2 */ - lxvd2x 32+vdata_a4, 3, 20 /* V27: vector r3 */ + lxvd2x 32+vdata_a1, rinp, a1_offset /* V12: vector r0 */ + lxvd2x 32+vdata_a2, rinp, a2_offset /* V17: vector r1 */ + lxvd2x 32+vdata_a3, rinp, a3_offset /* V22: vector r2 */ + lxvd2x 32+vdata_a4, rinp, a4_offset /* V27: vector r3 */ .endm /* @@ -317,14 +331,14 @@ .endm .macro Write_One - stxvd2x 32+vresult_a1, 3, 9 - stxvd2x 32+vresult_b1, 3, 10 - stxvd2x 32+vresult_a2, 3, 16 - stxvd2x 32+vresult_b2, 3, 17 - stxvd2x 32+vresult_a3, 3, 18 - stxvd2x 32+vresult_b3, 3, 19 - stxvd2x 32+vresult_a4, 3, 20 - stxvd2x 32+vresult_b4, 3, 21 + stxvd2x 32+vresult_a1, rinp, a1_offset + stxvd2x 32+vresult_b1, rinp, b1_offset + stxvd2x 32+vresult_a2, rinp, a2_offset + stxvd2x 32+vresult_b2, rinp, b2_offset + stxvd2x 32+vresult_a3, rinp, a3_offset + stxvd2x 32+vresult_b3, rinp, b3_offset + stxvd2x 32+vresult_a4, rinp, a4_offset + stxvd2x 32+vresult_b4, rinp, b4_offset .endm /* @@ -341,14 +355,14 @@ xxpermdi 5, 32+vresult_a3, 32+vresult_b3, 0 xxpermdi 6, 32+vresult_a4, 32+vresult_b4, 3 xxpermdi 7, 32+vresult_a4, 32+vresult_b4, 0 - stxvd2x 0, 0, 5 - stxvd2x 1, 10, 5 - stxvd2x 2, 11, 5 - stxvd2x 3, 12, 5 - stxvd2x 4, 15, 5 - stxvd2x 5, 16, 5 - stxvd2x 6, 17, 5 - stxvd2x 7, 18, 5 + stxvd2x 0, 0, dup_rinp + stxvd2x 1, 10, dup_rinp + stxvd2x 2, 11, dup_rinp + stxvd2x 3, 12, dup_rinp + stxvd2x 4, 
15, dup_rinp + stxvd2x 5, 16, dup_rinp + stxvd2x 6, 17, dup_rinp + stxvd2x 7, 18, dup_rinp .endm /* @@ -365,25 +379,25 @@ vmrgow 15, vresult_b3, vresult_a3 vmrgew 16, vresult_b4, vresult_a4 vmrgow 17, vresult_b4, vresult_a4 - stxvd2x 32+10, 0, 5 - stxvd2x 32+11, 10, 5 - stxvd2x 32+12, 11, 5 - stxvd2x 32+13, 12, 5 - stxvd2x 32+14, 15, 5 - stxvd2x 32+15, 16, 5 - stxvd2x 32+16, 17, 5 - stxvd2x 32+17, 18, 5 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp .endm .macro Load_next_4zetas li 10, 16 li 11, 32 li 12, 48 - lxvd2x 32+V_Z0, 0, 14 - lxvd2x 32+V_Z1, 10, 14 - lxvd2x 32+V_Z2, 11, 14 - lxvd2x 32+V_Z3, 12, 14 - addi 14, 14, 64 + lxvd2x 32+V_Z0, 0, zeta_inp + lxvd2x 32+V_Z1, 10, zeta_inp + lxvd2x 32+V_Z2, 11, zeta_inp + lxvd2x 32+V_Z3, 12, zeta_inp + addi zeta_inp, zeta_inp, 64 .endm /* @@ -397,6 +411,32 @@ xxpermdi 32+V_Z3, 32+V_Z3, 32+V_Z3, 2 .endm +/* + * NTT layer Len=2. + */ +.macro NTT_REDUCE_L24 + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi dup_rinp, dup_rinp, 128 +.endm + +/* + * NTT layer Len=4. + */ +.macro NTT_REDUCE_L44 + Load_next_4zetas + Perm_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi dup_rinp, dup_rinp, 128 +.endm + +/* + * NTT other layers. 
+ */ .macro NTT_MREDUCE_4X start next _vz0 _vz1 _vz2 _vz3 Load_4Coeffs \start, \next MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 @@ -447,16 +487,16 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) SAVE_REGS /* load MLKEM_Q */ - lvx V_NMKQ,0,4 + lvx V_NMKQ,0,qinp /* Register 14 as pointer to zetas array */ - addi 14, 4, ZETA_NTT_OFFSET + addi zeta_inp, qinp, ZETA_NTT_OFFSET vxor 3, 3, 3 vspltish 4, 1 li 10, QINV_OFFSET - lvx V_QINV, 10, 4 + lvx V_QINV, 10, qinp .align 4 /* @@ -470,9 +510,9 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array */ - li 7, 256 /* len * 2 */ - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li len_2, 256 /* len * 2 */ + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -491,14 +531,14 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array */ - li 7, 128 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li len_2, 128 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA @@ -514,21 +554,21 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array */ - li 7, 64 - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + li len_2, 64 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - lvx V_ZETA, 0, 14 - addi 14, 14, 16 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 
NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .align 4 @@ -543,7 +583,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array */ - li 7, 32 + li len_2, 32 Load_next_4zetas NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 @@ -564,7 +604,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array */ - li 7, 16 + li len_2, 16 Load_next_4zetas NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 @@ -590,10 +630,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) * * These are indexes to the 16 bits array. Each loads 4 vectors. */ - li 15, 4 - mtctr 15 - mr 5, 3 /* Let r5 points to coefficient array */ - li 7, 8 + mr dup_rinp, rinp /* Let r5 points to coefficient array */ + li len_2, 8 li 10, 16 li 11, 32 @@ -604,15 +642,10 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 18, 112 .align 4 -ntt_ppc__Len4: - Load_next_4zetas - Perm_4zetas - Load_L44Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL44 - addi 5, 5, 128 - - bdnz ntt_ppc__Len4 + NTT_REDUCE_L44 + NTT_REDUCE_L44 + NTT_REDUCE_L44 + NTT_REDUCE_L44 /* * 7. len = 2, start = 0, 4, 8, 12,...244, 248, 252 @@ -627,21 +660,14 @@ ntt_ppc__Len4: * * These are indexes to the 16 bits array. Each loads 4 vectors. */ - - li 8, 4 - mtctr 8 - mr 5, 3 /* Let r5 points to coefficient array */ - li 7, 4 + mr dup_rinp, rinp /* Let r5 points to coefficient array */ + li len_2, 4 .align 4 -ntt_ppc__Len2: - Load_next_4zetas - Load_L24Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 - PermWriteL24 - addi 5, 5, 128 - - bdnz ntt_ppc__Len2 + NTT_REDUCE_L24 + NTT_REDUCE_L24 + NTT_REDUCE_L24 + NTT_REDUCE_L24 RESTORE_REGS blr From 82de3d42ac6b1d1f13c20392f300ab5cd43dc789 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 12 Jan 2026 03:35:09 -0500 Subject: [PATCH 22/22] Fixed copyright header. 
Signed-off-by: Danny Tsen --- dev/ppc64le/src/arith_native_ppc64le.h | 2 +- mlkem/src/native/ppc64le/src/arith_native_ppc64le.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h index 1c75346689..aebb4711ab 100644 --- a/dev/ppc64le/src/arith_native_ppc64le.h +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2024-2025 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ #ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H #define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h index dbcee3e3ee..116f6d7a6b 100644 --- a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2024-2025 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ #ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H #define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H