diff --git a/README.md b/README.md
index 94d8191e3f..2a21d05a3e 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ All names other than `ML-KEM` and `ML-DSA` are subject to change. `liboqs` makes
| FrodoKEM | Under [ISO](https://frodokem.org/) consideration | [`microsoft/PQCrypto-LWEKE@a2f9dec`](https://github.com/microsoft/PQCrypto-LWEKE/commit/a2f9dec8917ccc3464b3378d46b140fa7353320d) |
| HQC | Selected by [NIST](https://pqc-hqc.org/doc/hqc_specifications_2025_08_22.pdf) for upcoming standardization | [`PQClean/PQClean@1eacfda`](https://github.com/PQClean/PQClean/commit/1eacfdafc15ddc5d5759d0b85b4cef26627df181) |
| Kyber | Selected by [NIST](https://csrc.nist.gov/CSRC/media/Projects/post-quantum-cryptography/documents/round-3/submissions/Kyber-Round3.zip) as basis for ML-KEM (FIPS 203) | [`pq-crystals/kyber@441c051`](https://github.com/pq-crystals/kyber/commit/441c0519a07e8b86c8d079954a6b10bd31d29efc) |
-| ML-KEM | Standardized by [NIST](https://csrc.nist.gov/pubs/fips/203/final) | [`pq-code-package/mlkem-native@048fc2a`](https://github.com/pq-code-package/mlkem-native/commit/048fc2a7a7b4ba0ad4c989c1ac82491aa94d5bfa) |
+| ML-KEM | Standardized by [NIST](https://csrc.nist.gov/pubs/fips/203/final) | [`pq-code-package/mlkem-native@d2cae2b`](https://github.com/pq-code-package/mlkem-native/commit/d2cae2be522a67bfae26100fdb520576f1b2ef90) |
| NTRU | Not selected by [NIST](https://csrc.nist.gov/CSRC/media/Projects/post-quantum-cryptography/documents/round-3/submissions/NTRU-Round3.zip), under standardization consideration by [NTT](https://info.isl.ntt.co.jp/crypt/ntru/index.html) | [`PQClean/PQClean@4c9e5a3`](https://github.com/PQClean/PQClean/commit/4c9e5a3aa715cc8d1d0e377e4e6e682ebd7602d6) |
| NTRU-Prime | Not selected by [NIST](https://csrc.nist.gov/CSRC/media/Projects/post-quantum-cryptography/documents/round-3/submissions/NTRU-Prime-Round3.zip) | [`openssh/openssh-portable`](https://github.com/openssh/openssh-portable/blob/1cc936b2fabffeac7fff14ca1070d7d7a317ab7b/sntrup761.c) |
diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md
index 2a3c6cbdc6..9edaf0d87d 100644
--- a/docs/algorithms/kem/ml_kem.md
+++ b/docs/algorithms/kem/ml_kem.md
@@ -7,14 +7,14 @@
- **Authors' website**: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203
- **Specification version**: ML-KEM.
- **Primary Source**:
- - **Source**: https://github.com/pq-code-package/mlkem-native/commit/048fc2a7a7b4ba0ad4c989c1ac82491aa94d5bfa
+ - **Source**: https://github.com/pq-code-package/mlkem-native/commit/d2cae2be522a67bfae26100fdb520576f1b2ef90
- **Implementation license (SPDX-Identifier)**: MIT or Apache-2.0 or ISC
- **Optimized Implementation sources**:
- **x86_64**:
- - **Source**: https://github.com/pq-code-package/mlkem-native/commit/048fc2a7a7b4ba0ad4c989c1ac82491aa94d5bfa
+ - **Source**: https://github.com/pq-code-package/mlkem-native/commit/d2cae2be522a67bfae26100fdb520576f1b2ef90
- **Implementation license (SPDX-Identifier)**: MIT or Apache-2.0 or ISC
- **aarch64**:
- - **Source**: https://github.com/pq-code-package/mlkem-native/commit/048fc2a7a7b4ba0ad4c989c1ac82491aa94d5bfa
+ - **Source**: https://github.com/pq-code-package/mlkem-native/commit/d2cae2be522a67bfae26100fdb520576f1b2ef90
- **Implementation license (SPDX-Identifier)**: MIT or Apache-2.0 or ISC
- **cupqc-cuda**:
- **Source**: https://github.com/open-quantum-safe/liboqs-cupqc-meta/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e
diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml
index e922214d8d..597eff4ac0 100644
--- a/docs/algorithms/kem/ml_kem.yml
+++ b/docs/algorithms/kem/ml_kem.yml
@@ -18,7 +18,7 @@ nist-round: FIPS203
standardization-status: Standardized by [NIST](https://csrc.nist.gov/pubs/fips/203/final)
spec-version: ML-KEM
primary-upstream:
- source: https://github.com/pq-code-package/mlkem-native/commit/048fc2a7a7b4ba0ad4c989c1ac82491aa94d5bfa
+ source: https://github.com/pq-code-package/mlkem-native/commit/d2cae2be522a67bfae26100fdb520576f1b2ef90
spdx-license-identifier: MIT or Apache-2.0 or ISC
optimized-upstreams:
cupqc-cuda:
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index 6b5bf19385..7e70bd7f8f 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -33,11 +33,11 @@ upstreams:
-
name: mlkem-native
git_url: https://github.com/pq-code-package/mlkem-native.git
- git_branch: v1.0.0
- git_commit: 048fc2a7a7b4ba0ad4c989c1ac82491aa94d5bfa
+ git_branch: v1.1.0
+ git_commit: d2cae2be522a67bfae26100fdb520576f1b2ef90
kem_meta_path: 'integration/liboqs/{pretty_name_full}_META.yml'
kem_scheme_path: '.'
- patches: [mlkem-native-encaps-derand.patch]
+ patches: []
preserve_folder_structure: True
-
name: cupqc
diff --git a/scripts/copy_from_upstream/patches/mlkem-native-encaps-derand.patch b/scripts/copy_from_upstream/patches/mlkem-native-encaps-derand.patch
deleted file mode 100644
index 44f0f3e78f..0000000000
--- a/scripts/copy_from_upstream/patches/mlkem-native-encaps-derand.patch
+++ /dev/null
@@ -1,109 +0,0 @@
-c3b0ad741b73539055fba2d8e637a84cfcec60de
-diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml
-index 4bf38aa..91d9b29 100644
---- a/integration/liboqs/ML-KEM-1024_META.yml
-+++ b/integration/liboqs/ML-KEM-1024_META.yml
-@@ -9,6 +9,7 @@ length-ciphertext: 1568
- length-secret-key: 3168
- length-shared-secret: 32
- length-keypair-seed: 64
-+length-encaps-seed: 32
- nistkat-sha256: f580d851e5fb27e6876e5e203fa18be4cdbfd49e05d48fec3d3992c8f43a13e6
- testvectors-sha256: ff1a854b9b6761a70c65ccae85246fe0596a949e72eae0866a8a2a2d4ea54b10
- principal-submitters:
-@@ -31,6 +32,7 @@ implementations:
- signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair
- signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_keypair_derand
- signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc
-+ signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_C_enc_derand
- signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_C_dec
- sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
- - name: x86_64
-@@ -40,6 +42,7 @@ implementations:
- signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair
- signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_keypair_derand
- signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc
-+ signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_enc_derand
- signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_X86_64_dec
- sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
- supported_platforms:
-@@ -58,6 +61,7 @@ implementations:
- signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair
- signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_keypair_derand
- signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc
-+ signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_enc_derand
- signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_dec
- sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
- supported_platforms:
-diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml
-index 7cf5f9f..1193a4f 100644
---- a/integration/liboqs/ML-KEM-512_META.yml
-+++ b/integration/liboqs/ML-KEM-512_META.yml
-@@ -9,6 +9,7 @@ length-ciphertext: 768
- length-secret-key: 1632
- length-shared-secret: 32
- length-keypair-seed: 64
-+length-encaps-seed: 32
- nistkat-sha256: c70041a761e01cd6426fa60e9fd6a4412c2be817386c8d0f3334898082512782
- testvectors-sha256: 6730bb552c22d9d2176ffb5568e48eb30952cf1f065073ec5f9724f6a3c6ea85
- principal-submitters:
-@@ -31,6 +32,7 @@ implementations:
- signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair
- signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_keypair_derand
- signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_C_enc
-+ signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_C_enc_derand
- signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_C_dec
- sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
- - name: x86_64
-@@ -40,6 +42,7 @@ implementations:
- signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair
- signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_keypair_derand
- signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc
-+ signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_enc_derand
- signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_X86_64_dec
- sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
- supported_platforms:
-@@ -58,6 +61,7 @@ implementations:
- signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair
- signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_keypair_derand
- signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc
-+ signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_enc_derand
- signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_dec
- sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
- supported_platforms:
-diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml
-index 8582f46..48439f2 100644
---- a/integration/liboqs/ML-KEM-768_META.yml
-+++ b/integration/liboqs/ML-KEM-768_META.yml
-@@ -9,6 +9,7 @@ length-ciphertext: 1088
- length-secret-key: 2400
- length-shared-secret: 32
- length-keypair-seed: 64
-+length-encaps-seed: 32
- nistkat-sha256: 5352539586b6c3df58be6158a6250aeff402bd73060b0a3de68850ac074c17c3
- testvectors-sha256: 667c8ca2ca93729c0df6ff24588460bad1bbdbfb64ece0fe8563852a7ff348c6
- principal-submitters:
-@@ -31,6 +32,7 @@ implementations:
- signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair
- signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_keypair_derand
- signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_C_enc
-+ signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_C_enc_derand
- signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_C_dec
- sources: integration/liboqs/config_c.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
- - name: x86_64
-@@ -40,6 +42,7 @@ implementations:
- signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair
- signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_keypair_derand
- signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc
-+ signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_enc_derand
- signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_X86_64_dec
- sources: integration/liboqs/config_x86_64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/x86_64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
- supported_platforms:
-@@ -58,6 +61,7 @@ implementations:
- signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair
- signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_keypair_derand
- signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc
-+ signature_enc_derand: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_enc_derand
- signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_dec
- sources: integration/liboqs/config_aarch64.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/native/aarch64 mlkem/src/params.h mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc
- supported_platforms:
diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt
index c1bd870557..89c5ef0373 100644
--- a/src/kem/ml_kem/CMakeLists.txt
+++ b/src/kem/ml_kem/CMakeLists.txt
@@ -15,7 +15,7 @@ if(OQS_ENABLE_KEM_ml_kem_512)
endif()
if(OQS_ENABLE_KEM_ml_kem_512_x86_64)
- add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/mlkem/src/compress.c mlkem-native_ml-kem-512_x86_64/mlkem/src/debug.c mlkem-native_ml-kem-512_x86_64/mlkem/src/indcpa.c mlkem-native_ml-kem-512_x86_64/mlkem/src/kem.c mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/reduce.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/tomont.S mlkem-native_ml-kem-512_x86_64/mlkem/src/poly.c mlkem-native_ml-kem-512_x86_64/mlkem/src/poly_k.c mlkem-native_ml-kem-512_x86_64/mlkem/src/sampling.c mlkem-native_ml-kem-512_x86_64/mlkem/src/verify.c)
+    add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/mlkem/src/compress.c mlkem-native_ml-kem-512_x86_64/mlkem/src/debug.c mlkem-native_ml-kem-512_x86_64/mlkem/src/indcpa.c mlkem-native_ml-kem-512_x86_64/mlkem/src/kem.c mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_consts.c mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d10.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d11.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d4.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d5.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d10.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d11.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d4.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d5.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/reduce.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_asm.S mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/tomont.S mlkem-native_ml-kem-512_x86_64/mlkem/src/poly.c mlkem-native_ml-kem-512_x86_64/mlkem/src/poly_k.c mlkem-native_ml-kem-512_x86_64/mlkem/src/sampling.c mlkem-native_ml-kem-512_x86_64/mlkem/src/verify.c)
target_include_directories(ml_kem_512_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_x86_64)
target_include_directories(ml_kem_512_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
target_compile_options(ml_kem_512_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt )
@@ -55,7 +55,7 @@ if(OQS_ENABLE_KEM_ml_kem_768)
endif()
if(OQS_ENABLE_KEM_ml_kem_768_x86_64)
- add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/mlkem/src/compress.c mlkem-native_ml-kem-768_x86_64/mlkem/src/debug.c mlkem-native_ml-kem-768_x86_64/mlkem/src/indcpa.c mlkem-native_ml-kem-768_x86_64/mlkem/src/kem.c mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/reduce.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/tomont.S mlkem-native_ml-kem-768_x86_64/mlkem/src/poly.c mlkem-native_ml-kem-768_x86_64/mlkem/src/poly_k.c mlkem-native_ml-kem-768_x86_64/mlkem/src/sampling.c mlkem-native_ml-kem-768_x86_64/mlkem/src/verify.c)
+    add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/mlkem/src/compress.c mlkem-native_ml-kem-768_x86_64/mlkem/src/debug.c mlkem-native_ml-kem-768_x86_64/mlkem/src/indcpa.c mlkem-native_ml-kem-768_x86_64/mlkem/src/kem.c mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/compress_consts.c mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/poly_compress_d10.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/poly_compress_d11.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/poly_compress_d4.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/poly_compress_d5.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d10.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d11.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d4.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d5.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/reduce.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/rej_uniform_asm.S mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/mlkem/src/native/x86_64/src/tomont.S mlkem-native_ml-kem-768_x86_64/mlkem/src/poly.c mlkem-native_ml-kem-768_x86_64/mlkem/src/poly_k.c mlkem-native_ml-kem-768_x86_64/mlkem/src/sampling.c mlkem-native_ml-kem-768_x86_64/mlkem/src/verify.c)
target_include_directories(ml_kem_768_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_x86_64)
target_include_directories(ml_kem_768_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
target_compile_options(ml_kem_768_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt )
@@ -95,7 +95,7 @@ if(OQS_ENABLE_KEM_ml_kem_1024)
endif()
if(OQS_ENABLE_KEM_ml_kem_1024_x86_64)
- add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/mlkem/src/compress.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/debug.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/indcpa.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/kem.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/basemul.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/reduce.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/tomont.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly_k.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/sampling.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/verify.c)
+    add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/mlkem/src/compress.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/debug.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/indcpa.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/kem.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_consts.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d10.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d11.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d4.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d5.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d10.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d11.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d4.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d5.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/reduce.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_asm.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/tomont.S mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly_k.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/sampling.c mlkem-native_ml-kem-1024_x86_64/mlkem/src/verify.c)
target_include_directories(ml_kem_1024_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_x86_64)
target_include_directories(ml_kem_1024_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
target_compile_options(ml_kem_1024_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt )
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/integration/liboqs/config_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/integration/liboqs/config_aarch64.h
index 65fe4bb4b7..29f27388a9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/integration/liboqs/config_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/integration/liboqs/config_aarch64.h
@@ -8,13 +8,23 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*/
#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_AARCH64_H
#define MLK_INTEGRATION_LIBOQS_CONFIG_AARCH64_H
+/* Enable valgrind-based assertions in mlkem-native through macro
+ * from libOQS. */
+#if !defined(__ASSEMBLER__)
+#include <oqs/oqsconfig.h>
+#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
+#define MLK_CONFIG_CT_TESTING_ENABLED
+#endif
+#endif /* !__ASSEMBLER__ */
+
/******************************************************************************
* Name: MLK_CONFIG_PARAMETER_SET
*
@@ -172,7 +182,7 @@
* consumer.
*
* If this option is not set, mlkem-native expects a function
- * void randombytes(uint8_t *out, size_t outlen).
+ * int randombytes(uint8_t *out, size_t outlen).
*
* Set this option and define `mlk_randombytes` if you want to
* use a custom method to sample randombytes with a different name
@@ -184,9 +194,10 @@
#include <oqs/rand.h>
#include <stdint.h>
#include "../../mlkem/src/sys.h"
-static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
+static MLK_INLINE int mlk_randombytes(uint8_t *ptr, size_t len)
{
OQS_randombytes(ptr, len);
+ return 0;
}
#endif /* !__ASSEMBLER__ */
@@ -250,13 +261,4 @@ static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
#endif
*/
-/* Enable valgrind-based assertions in mlkem-native through macro
- * from libOQS. */
-#if !defined(__ASSEMBLER__)
-#include <oqs/oqsconfig.h>
-#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
-#define MLK_CONFIG_CT_TESTING_ENABLED
-#endif
-#endif /* !__ASSEMBLER__ */
-
#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_AARCH64_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/cbmc.h
index 650d32b95b..80e1a36fc7 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/cbmc.h
@@ -8,7 +8,6 @@
/***************************************************
* Basic replacements for __CPROVER_XXX contracts
***************************************************/
-
#ifndef CBMC
#define __contract__(x)
@@ -16,6 +15,7 @@
#else /* !CBMC */
+
#define __contract__(x) x
#define __loop__(x) x
@@ -49,7 +49,6 @@
*/
#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
-#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
/*
* Pointer-related predicates
@@ -59,6 +58,17 @@
#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+/* Maximum supported buffer size
+ *
+ * Larger buffers may be supported, but due to internal modeling constraints
+ * in CBMC, the proofs of memory- and type-safety won't be able to run.
+ *
+ * If you find yourself in need for a buffer size larger than this,
+ * please contact the maintainers, so we can prioritize work to relax
+ * this somewhat artificial bound.
+ */
+#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12)
+
/*
* History variables
* https://diffblue.github.io/cbmc/contracts-history-variables.html
@@ -83,7 +93,7 @@
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate) \
}
-#define EXISTS(qvar, qvar_lb, qvar_ub, predicate) \
+#define exists(qvar, qvar_lb, qvar_ub, predicate) \
__CPROVER_exists \
{ \
unsigned qvar; \
@@ -118,13 +128,35 @@
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
- (((int)(value_lb) <= ((array_var)[(qvar)])) && \
- (((array_var)[(qvar)]) < (int)(value_ub))) \
+ (((int)(value_lb) <= ((array_var)[(qvar)])) && \
+ (((array_var)[(qvar)]) < (int)(value_ub))) \
}
-#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
- array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb), \
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
+ array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb), \
(qvar_ub), (array_var), (value_lb), (value_ub))
+
+#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (int16_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged(array_var, N) \
+ array_unchanged_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
+
+#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (uint64_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged_u64(array_var, N) \
+ array_unchanged_u64_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
/* clang-format on */
/* Wrapper around array_bound operating on absolute values.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/common.h
index 9de9875556..bc4e9ed72c 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/common.h
@@ -5,10 +5,16 @@
#ifndef MLK_COMMON_H
#define MLK_COMMON_H
+#ifndef __ASSEMBLER__
+#include <stdint.h>
+#endif
+
+#define MLK_BUILD_INTERNAL
+
#if defined(MLK_CONFIG_FILE)
#include MLK_CONFIG_FILE
#else
-#include "config.h"
+#include "mlkem_native_config.h"
#endif
#include "params.h"
@@ -28,15 +34,11 @@
#define MLK_EXTERNAL_API MLK_CONFIG_EXTERNAL_API_QUALIFIER
#endif
-#if defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) || \
- defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED)
-#define MLK_MULTILEVEL_BUILD
-#endif
-
#define MLK_CONCAT_(x1, x2) x1##x2
#define MLK_CONCAT(x1, x2) MLK_CONCAT_(x1, x2)
-#if defined(MLK_MULTILEVEL_BUILD)
+#if (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || \
+ defined(MLK_CONFIG_MULTILEVEL_NO_SHARED))
#define MLK_ADD_PARAM_SET(s) MLK_CONCAT(s, MLK_CONFIG_PARAMETER_SET)
#else
#define MLK_ADD_PARAM_SET(s) s
@@ -49,7 +51,7 @@
/* Functions are prefixed by MLK_CONFIG_NAMESPACE_PREFIX.
*
* If multiple parameter sets are used, functions depending on the parameter
- * set are additionally prefixed with 512/768/1024. See config.h.
+ * set are additionally prefixed with 512/768/1024. See mlkem_native_config.h.
*
* Example: If MLK_CONFIG_NAMESPACE_PREFIX is mlkem, then
* MLK_NAMESPACE_K(enc) becomes mlkem512_enc/mlkem768_enc/mlkem1024_enc.
@@ -73,8 +75,24 @@
*/
#if defined(MLK_SYS_X86_64)
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) : MLK_CET_ENDBR
-#else
+#elif defined(MLK_SYS_ARMV81M_MVE)
+/* clang-format off */
+#define MLK_ASM_FN_SYMBOL(sym) \
+ .type MLK_ASM_NAMESPACE(sym), %function; \
+ MLK_ASM_NAMESPACE(sym) :
+/* clang-format on */
+#else /* !MLK_SYS_X86_64 && MLK_SYS_ARMV81M_MVE */
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) :
+#endif /* !MLK_SYS_X86_64 && !MLK_SYS_ARMV81M_MVE */
+
+/*
+ * Output the size of an assembly function.
+ */
+#if defined(__ELF__)
+#define MLK_ASM_FN_SIZE(sym) \
+ .size MLK_ASM_NAMESPACE(sym), .- MLK_ASM_NAMESPACE(sym)
+#else
+#define MLK_ASM_FN_SIZE(sym)
#endif
/* We aim to simplify the user's life by supporting builds where
@@ -99,6 +117,10 @@
#error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLK_CONFIG_FIPS202_BACKEND_FILE is not.
#endif
+#if defined(MLK_CONFIG_NO_RANDOMIZED_API) && defined(MLK_CONFIG_KEYGEN_PCT)
+#error Bad configuration: MLK_CONFIG_NO_RANDOMIZED_API is incompatible with MLK_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_kem_enc()
+#endif
+
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
#include MLK_CONFIG_ARITH_BACKEND_FILE
/* Include to enforce consistency of API and implementation,
@@ -135,20 +157,118 @@
#define MLK_FIPS202X4_HEADER_FILE MLK_CONFIG_FIPS202X4_CUSTOM_HEADER
#endif
-/* Just in case we want to include mlkem_native.h, set the configuration
- * for that header in accordance with the configuration used here. */
+/* Standard library function replacements */
+#if !defined(__ASSEMBLER__)
+#if !defined(MLK_CONFIG_CUSTOM_MEMCPY)
+#include <string.h>
+#define mlk_memcpy memcpy
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_MEMSET)
+#include <string.h>
+#define mlk_memset memset
+#endif
+
+
+/* Allocation macros for large local structures
+ *
+ * MLK_ALLOC(v, T, N) declares T *v and attempts to point it to an T[N]
+ * MLK_FREE(v, T, N) zeroizes and frees the allocation
+ *
+ * Default implementation uses stack allocation.
+ * Can be overridden by setting the config option MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * and defining MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE.
+ */
+#if defined(MLK_CONFIG_CUSTOM_ALLOC_FREE) != \
+ (defined(MLK_CUSTOM_ALLOC) && defined(MLK_CUSTOM_FREE))
+#error Bad configuration: MLK_CONFIG_CUSTOM_ALLOC_FREE must be set together with MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE
+#endif
+
+/*
+ * If the integration wants to provide a context parameter for use in
+ * platform-specific hooks, then it should define this parameter.
+ *
+ * The MLK_CONTEXT_PARAMETERS_n macros are intended to be used with macros
+ * defining the function names and expand to either pass or discard the context
+ * argument as required by the current build. If there is no context parameter
+ * requested then these are removed from the prototypes and from all calls.
+ */
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+#define MLK_CONTEXT_PARAMETERS_0(context) (context)
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0, context)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1, context)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) \
+ (arg0, arg1, arg2, context)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3, context)
+#else /* MLK_CONFIG_CONTEXT_PARAMETER */
+#define MLK_CONTEXT_PARAMETERS_0(context) ()
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) (arg0, arg1, arg2)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3)
+#endif /* !MLK_CONFIG_CONTEXT_PARAMETER */
+
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER_TYPE) != \
+ defined(MLK_CONFIG_CONTEXT_PARAMETER)
+#error MLK_CONFIG_CONTEXT_PARAMETER_TYPE must be defined if and only if MLK_CONFIG_CONTEXT_PARAMETER is defined
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_ALLOC_FREE)
+/* Default: stack allocation */
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_ALIGN T mlk_alloc_##v[N]; \
+ T *v = mlk_alloc_##v
+
+/* TODO: This leads to a circular dependency between common and verify.h
+ * It just works out before we're at the end of the file, but it's still
+ * prone to issues in the future. */
+#include "verify.h"
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ mlk_zeroize(mlk_alloc_##v, sizeof(mlk_alloc_##v)); \
+ (v) = NULL; \
+ } while (0)
+
+#else /* !MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/* Custom allocation */
+
+/*
+ * The indirection here is necessary to use MLK_CONTEXT_PARAMETERS_3 here.
+ */
+#define MLK_APPLY(f, args) f args
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_APPLY(MLK_CUSTOM_ALLOC, MLK_CONTEXT_PARAMETERS_3(v, T, N, context))
+
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ if (v != NULL) \
+ { \
+ mlk_zeroize(v, sizeof(T) * (N)); \
+ MLK_APPLY(MLK_CUSTOM_FREE, MLK_CONTEXT_PARAMETERS_3(v, T, N, context)); \
+ v = NULL; \
+ } \
+ } while (0)
+
+#endif /* MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/****************************** Error codes ***********************************/
-/* Double-check that this is not conflicting with pre-existing definitions. */
-#if defined(MLK_CONFIG_API_PARAMETER_SET) || \
- defined(MLK_CONFIG_API_NAMESPACE_PREFIX) || \
- defined(MLK_CONFIG_API_NO_SUPERCOP) || \
- defined(MLK_CONFIG_API_CONSTANTS_ONLY)
-#error Pre-existing MLK_CONFIG_API_XXX configuration is neither useful nor allowed during an mlkem-native build
-#endif /* MLK_CONFIG_API_PARAMETER_SET || MLK_CONFIG_API_NAMESPACE_PREFIX || \
- MLK_CONFIG_API_NO_SUPERCOP || MLK_CONFIG_API_CONSTANTS_ONLY */
+/* Generic failure condition */
+#define MLK_ERR_FAIL -1
+/* An allocation failed. This can only happen if MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * is defined and the provided MLK_CUSTOM_ALLOC can fail. */
+#define MLK_ERR_OUT_OF_MEMORY -2
+/* An rng failure occured. Might be due to insufficient entropy or
+ * system misconfiguration. */
+#define MLK_ERR_RNG_FAIL -3
-#define MLK_CONFIG_API_PARAMETER_SET MLK_CONFIG_PARAMETER_SET
-#define MLK_CONFIG_API_NAMESPACE_PREFIX \
- MLK_ADD_PARAM_SET(MLK_CONFIG_NAMESPACE_PREFIX)
+#endif /* !__ASSEMBLER__ */
#endif /* !MLK_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/compress.c
index d7ff2bbe7a..50da36d0e4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/compress.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/compress.c
@@ -20,24 +20,27 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include <stdint.h>
-#include <string.h>
+
#include "cbmc.h"
#include "compress.h"
#include "debug.h"
#include "verify.h"
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d4_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -55,32 +58,51 @@ void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
t[j] = mlk_scalar_compress_d4(a->coeffs[8 * i + j]);
}
- r[i * 4] = t[0] | (t[1] << 4);
- r[i * 4 + 1] = t[2] | (t[3] << 4);
- r[i * 4 + 2] = t[4] | (t[5] << 4);
- r[i * 4 + 3] = t[6] | (t[7] << 4);
+ /* All t[i] are 4-bit wide, so the truncations don't alter the value. */
+ r[i * 4] = (uint8_t)(t[0] | (t[1] << 4));
+ r[i * 4 + 1] = (uint8_t)(t[2] | (t[3] << 4));
+ r[i * 4 + 2] = (uint8_t)(t[4] | (t[5] << 4));
+ r[i * 4 + 3] = (uint8_t)(t[6] | (t[7] << 4));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d4_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d4_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ mlk_poly_compress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d10_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -101,29 +123,47 @@ void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 10-bit in size.
*/
- r[5 * j + 0] = (t[0] >> 0) & 0xFF;
- r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
- r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
- r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
- r[5 * j + 4] = (t[3] >> 2);
+ r[5 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 2) & 0xFF));
+ r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] << 4) & 0xFF));
+ r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] << 6) & 0xFF));
+ r[5 * j + 4] = (uint8_t)(t[3] >> 2);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d10_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d10_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ mlk_poly_compress_d10_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d4(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d4_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -137,22 +177,40 @@ void mlk_poly_decompress_d4(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d4(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d4_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ int ret;
+ ret = mlk_poly_decompress_d4_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ mlk_poly_decompress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d10(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d10_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 4; j++)
@@ -180,28 +238,46 @@ void mlk_poly_decompress_d10(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d10(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d10_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ int ret;
+ ret = mlk_poly_decompress_d10_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
+ mlk_poly_decompress_d10_c(r, a);
+}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d5_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -219,38 +295,51 @@ void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
t[j] = mlk_scalar_compress_d5(a->coeffs[8 * i + j]);
}
- /*
- * Explicitly truncate to avoid warning about
- * implicit truncation in CBMC, and use array indexing into
- * r rather than pointer-arithmetic to simplify verification
- */
- r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
- r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
- r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
- r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+ r[i * 5] = (uint8_t)(0xFF & ((t[0] >> 0) | (t[1] << 5)));
+ r[i * 5 + 1] = (uint8_t)(0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)));
+ r[i * 5 + 2] = (uint8_t)(0xFF & ((t[3] >> 1) | (t[4] << 4)));
+ r[i * 5 + 3] = (uint8_t)(0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)));
+ r[i * 5 + 4] = (uint8_t)(0xFF & ((t[6] >> 2) | (t[7] << 3)));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d5_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d5_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ mlk_poly_compress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d11_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -272,35 +361,53 @@ void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 11-bit in size.
*/
- r[11 * j + 0] = (t[0] >> 0) & 0xFF;
- r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
- r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
- r[11 * j + 3] = (t[2] >> 2) & 0xFF;
- r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
- r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
- r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
- r[11 * j + 7] = (t[5] >> 1) & 0xFF;
- r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
- r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
- r[11 * j + 10] = (t[7] >> 3);
+ r[11 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 3) & 0xFF));
+ r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] << 6) & 0xFF));
+ r[11 * j + 3] = (uint8_t)((t[2] >> 2) & 0xFF);
+ r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] << 1) & 0xFF));
+ r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] << 4) & 0xFF));
+ r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] << 7) & 0xFF));
+ r[11 * j + 7] = (uint8_t)((t[5] >> 1) & 0xFF);
+ r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] << 2) & 0xFF));
+ r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] << 5) & 0xFF));
+ r[11 * j + 10] = (uint8_t)(t[7] >> 3);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d11_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d11_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ mlk_poly_compress_d11_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d5(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d5_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 8; i++)
@@ -342,22 +449,40 @@ void mlk_poly_decompress_d5(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d5(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d5_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ int ret;
+ ret = mlk_poly_decompress_d5_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ mlk_poly_decompress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d11(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d11_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 8; j++)
@@ -390,26 +515,45 @@ void mlk_poly_decompress_d11(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d11(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d11_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ int ret;
+ ret = mlk_poly_decompress_d11_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+ mlk_poly_decompress_d11_c(r, a);
+}
+
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-#if !defined(MLK_USE_NATIVE_POLY_TOBYTES)
/* Reference: `poly_tobytes()` in the reference implementation @[REF].
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_tobytes_c(uint8_t r[MLKEM_POLYBYTES],
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYBYTES))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -417,8 +561,10 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
- const uint16_t t0 = a->coeffs[2 * i];
- const uint16_t t1 = a->coeffs[2 * i + 1];
+ /* The conversion to uint16_t is safe since we assume that
+ * the coefficients of `a` are non-negative. */
+ const uint16_t t0 = (uint16_t)a->coeffs[2 * i];
+ const uint16_t t1 = (uint16_t)a->coeffs[2 * i + 1];
/*
* t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
* significant data, so these can be packed into 24 bits or exactly
@@ -426,32 +572,48 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
*/
/* Least significant bits 0 - 7 of t0. */
- r[3 * i + 0] = t0 & 0xFF;
+ r[3 * i + 0] = (uint8_t)(t0 & 0xFF);
/*
* Most significant bits 8 - 11 of t0 become the least significant
* nibble of the second byte. The least significant 4 bits
* of t1 become the upper nibble of the second byte.
+ *
+ * The conversion to uint8_t does not alter the value.
*/
- r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+ r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 << 4) & 0xF0));
- /* Bits 4 - 11 of t1 become the third byte. */
- r[3 * i + 2] = t1 >> 4;
+ /* Bits 4 - 11 of t1 become the third byte. The conversion to uint8_t
+ * does not alter the value because t1 is 12-bit wide. */
+ r[3 * i + 2] = (uint8_t)(t1 >> 4);
}
}
-#else /* !MLK_USE_NATIVE_POLY_TOBYTES */
+
MLK_INTERNAL_API
void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
{
+#if defined(MLK_USE_NATIVE_POLY_TOBYTES)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_tobytes_native(r, a->coeffs);
-}
+ ret = mlk_poly_tobytes_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
-#if !defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ mlk_poly_tobytes_c(r, a);
+}
+
/* Reference: `poly_frombytes()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
+MLK_STATIC_TESTABLE void mlk_poly_frombytes_c(mlk_poly *r,
+ const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+ requires(memory_no_alias(a, MLKEM_POLYBYTES))
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -462,21 +624,29 @@ void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
const uint8_t t0 = a[3 * i + 0];
const uint8_t t1 = a[3 * i + 1];
const uint8_t t2 = a[3 * i + 2];
- r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
- r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+ r->coeffs[2 * i + 0] = (int16_t)(t0 | ((t1 << 8) & 0xFFF));
+ r->coeffs[2 * i + 1] = (int16_t)((t1 >> 4) | (t2 << 4));
}
/* Note that the coefficients are not canonical */
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
-#else /* !MLK_USE_NATIVE_POLY_FROMBYTES */
+
MLK_INTERNAL_API
void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
{
- mlk_poly_frombytes_native(r->coeffs, a);
-}
+#if defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ int ret;
+ ret = mlk_poly_frombytes_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
+ mlk_poly_frombytes_c(r, a);
+}
+
/* Reference: `poly_frommsg()` in the reference implementation @[REF].
* - We use a value barrier around the bit-selection mask to
* reduce the risk of compiler-introduced branches.
@@ -506,7 +676,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
* as per @[FIPS203, Eq (4.8)]. */
/* Prevent the compiler from recognizing this as a bit selection */
- uint8_t mask = mlk_value_barrier_u8(1u << j);
+ uint8_t mask = mlk_value_barrier_u8((uint8_t)(1u << j));
r->coeffs[8 * i + j] = mlk_ct_sel_int16(MLKEM_Q_HALF, 0, msg[i] & mask);
}
}
@@ -535,7 +705,7 @@ void mlk_poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const mlk_poly *a)
invariant(i <= MLKEM_N / 8 && j <= 8))
{
uint32_t t = mlk_scalar_compress_d1(a->coeffs[8 * i + j]);
- msg[i] |= t << j;
+ msg[i] |= (uint8_t)(t << j);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/compress.h
index f0789d42d6..b16b0889b5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/compress.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/compress.h
@@ -20,8 +20,7 @@
#ifndef MLK_COMPRESS_H
#define MLK_COMPRESS_H
-#include <stddef.h>
-#include <stdint.h>
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -50,9 +49,9 @@
#endif
/* Reference: Part of poly_tomsg() in the reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d1(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d1(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 2)
ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) )
{
@@ -65,7 +64,8 @@ __contract__(
*/
/* check-magic: 1290168 == 2*round(2^31 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290168;
- return (d0 + (1u << 30)) >> 31;
+ /* Unsigned shifting by 31 positions leaves only the top bit. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 30)) >> 31);
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -93,9 +93,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d4(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d4(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 16)
ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
{
@@ -108,7 +108,8 @@ __contract__(
*/
/* check-magic: 1290160 == 16 * round(2^28 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290160;
- return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */
+ /* The return value is < 16, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 27)) >> 28); /* round(d0/2^28) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -128,11 +129,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d4(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d4(uint8_t u)
__contract__(
requires(0 <= u && u < 16)
ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) >> 4; }
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 8) >> 4);
+}
/************************************************************
* Name: mlk_scalar_compress_d5
@@ -156,9 +162,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d5(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d5(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 32)
ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) )
{
@@ -171,7 +177,8 @@ __contract__(
*/
/* check-magic: 1290176 == 2^5 * round(2^27 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290176;
- return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */
+ /* The return value is < 32, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 26)) >> 27); /* round(d0/2^27) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -191,11 +198,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d5(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d5(uint8_t u)
__contract__(
requires(0 <= u && u < 32)
- ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) >> 5; }
+ ensures(0 <= return_value && return_value <= MLKEM_Q - 1)
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 16) >> 5);
+}
/************************************************************
* Name: mlk_scalar_compress_d10
@@ -219,9 +231,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d10(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d10(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 10))
ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
{
@@ -255,11 +267,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d10(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d10(uint16_t u)
__contract__(
requires(0 <= u && u < 1024)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) >> 10; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 512) >> 10);
+}
/************************************************************
* Name: mlk_scalar_compress_d11
@@ -283,9 +300,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d11(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d11(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 11))
ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
{
@@ -319,11 +336,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d11(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d11(uint16_t u)
__contract__(
requires(0 <= u && u < 2048)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) >> 11; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 1024) >> 11);
+}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
#define mlk_poly_compress_d4 MLK_NAMESPACE(poly_compress_d4)
@@ -575,7 +597,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
);
@@ -631,7 +653,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
__contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
);
@@ -660,7 +682,7 @@ __contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(msg))
+ assigns(memory_slice(msg, MLKEM_INDCPA_MSGBYTES))
);
#endif /* !MLK_COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/debug.h
index 01f7c88ccf..47c864bd36 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/debug.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/debug.h
@@ -7,7 +7,6 @@
#include "common.h"
#if defined(MLKEM_DEBUG)
-#include
/*************************************************
* Name: mlk_assert
@@ -89,14 +88,14 @@ void mlk_debug_check_bounds(const char *file, int line, const int16_t *ptr,
/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
* just use a single flattened array_bound(...) here. */
-#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
- cassert(forall(kN, 0, (M), \
- array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
+ cassert(forall(kN, 0, (M), \
+ array_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_lb), (value_ub))))
-#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
- cassert(forall(kN, 0, (M), \
- array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
+ cassert(forall(kN, 0, (M), \
+ array_abs_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_abs_bd))))
#else /* !MLKEM_DEBUG && CBMC */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/indcpa.c
index 85d4f595a9..e03b16c38b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/indcpa.c
@@ -17,15 +17,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
-#include
+#include "indcpa.h"
-#include "cbmc.h"
#include "debug.h"
-#include "indcpa.h"
-#include "poly.h"
-#include "poly_k.h"
#include "randombytes.h"
#include "sampling.h"
#include "symmetric.h"
@@ -41,6 +35,10 @@
#define mlk_pack_ciphertext MLK_ADD_PARAM_SET(mlk_pack_ciphertext)
#define mlk_unpack_ciphertext MLK_ADD_PARAM_SET(mlk_unpack_ciphertext)
#define mlk_matvec_mul MLK_ADD_PARAM_SET(mlk_matvec_mul)
+#define mlk_polyvec_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polyvec_permute_bitrev_to_custom)
+#define mlk_polymat_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polymat_permute_bitrev_to_custom)
/* End of parameter set namespacing */
/*************************************************
@@ -59,12 +57,13 @@
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L19]
*
**************************************************/
-static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
+static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const mlk_polyvec *pk,
const uint8_t seed[MLKEM_SYMBYTES])
{
- mlk_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(pk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, pk);
- memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
}
/*************************************************
@@ -83,11 +82,11 @@ static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L2-3]
*
**************************************************/
-static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
+static void mlk_unpack_pk(mlk_polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
{
mlk_polyvec_frombytes(pk, packedpk);
- memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
/* NOTE: If a modulus check was conducted on the PK, we know at this
* point that the coefficients of `pk` are unsigned canonical. The
@@ -108,9 +107,10 @@ static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L20]
*
**************************************************/
-static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
+static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES],
+ const mlk_polyvec *sk)
{
- mlk_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(sk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, sk);
}
@@ -128,7 +128,7 @@ static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L5]
*
**************************************************/
-static void mlk_unpack_sk(mlk_polyvec sk,
+static void mlk_unpack_sk(mlk_polyvec *sk,
const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
{
mlk_polyvec_frombytes(sk, packedsk);
@@ -149,8 +149,8 @@ static void mlk_unpack_sk(mlk_polyvec sk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22-23]
*
**************************************************/
-static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
- mlk_poly *v)
+static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES],
+ const mlk_polyvec *b, mlk_poly *v)
{
mlk_polyvec_compress_du(r, b);
mlk_poly_compress_dv(r + MLKEM_POLYVECCOMPRESSEDBYTES_DU, v);
@@ -170,28 +170,69 @@ static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L1-4]
*
**************************************************/
-static void mlk_unpack_ciphertext(mlk_polyvec b, mlk_poly *v,
+static void mlk_unpack_ciphertext(mlk_polyvec *b, mlk_poly *v,
const uint8_t c[MLKEM_INDCPA_BYTES])
{
mlk_polyvec_decompress_du(b, c);
mlk_poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
}
-#if !defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
-/* This namespacing is not done at the top to avoid a naming conflict
- * with native backends, which are currently not yet namespaced. */
-#define mlk_poly_permute_bitrev_to_custom \
- MLK_ADD_PARAM_SET(mlk_poly_permute_bitrev_to_custom)
-
-static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
+/* Helper function to ensure that the polynomial entries in the output
+ * of gen_matrix use the standard (bitreversed) ordering of coefficients.
+ * No-op unless a native backend with a custom ordering is used.
+ *
+ * We don't inline this into gen_matrix to avoid having to split the CBMC
+ * proof for gen_matrix based on MLK_USE_NATIVE_NTT_CUSTOM_ORDER. */
+static void mlk_polyvec_permute_bitrev_to_custom(mlk_polyvec *v)
__contract__(
/* We don't specify that this should be a permutation, but only
* that it does not change the bound established at the end of mlk_gen_matrix. */
- requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
- requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(memory_slice(data, sizeof(mlk_poly)))
- ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+ requires(memory_no_alias(v, sizeof(mlk_polyvec)))
+ requires(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(v, sizeof(mlk_polyvec)))
+ ensures(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+{
+#if defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(v, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ {
+ mlk_poly_permute_bitrev_to_custom(v->vec[i].coeffs);
+ }
+#else /* MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+ /* Nothing to do */
+ (void)v;
#endif /* !MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+static void mlk_polymat_permute_bitrev_to_custom(mlk_polymat *a)
+__contract__(
+ /* We don't specify that this should be a permutation, but only
+ * that it does not change the bound established at the end of mlk_gen_matrix. */
+ requires(memory_no_alias(a, sizeof(mlk_polymat)))
+ requires(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+{
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(a, sizeof(mlk_polymat)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+ {
+ mlk_polyvec_permute_bitrev_to_custom(&a->vec[i]);
+ }
+}
/* Reference: `gen_matrix()` in the reference implementation @[REF].
* - We use a special subroutine to generate 4 polynomials
@@ -201,32 +242,27 @@ __contract__(
*
* Not static for benchmarking */
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
{
unsigned i, j;
- /*
- * We generate four separate seed arrays rather than a single one to work
- * around limitations in CBMC function contracts dealing with disjoint slices
- * of the same parent object.
- */
-
MLK_ALIGN uint8_t seed_ext[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)];
for (j = 0; j < 4; j++)
{
- memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
}
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Sample 4 matrix entries a time. */
for (i = 0; i < (MLKEM_K * MLKEM_K / 4) * 4; i += 4)
{
- uint8_t x, y;
-
for (j = 0; j < 4; j++)
{
- x = (i + j) / MLKEM_K;
- y = (i + j) % MLKEM_K;
+ uint8_t x, y;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)((i + j) / MLKEM_K);
+ y = (uint8_t)((i + j) % MLKEM_K);
if (transposed)
{
seed_ext[j][MLKEM_SYMBYTES + 0] = x;
@@ -239,19 +275,26 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
}
}
- /*
- * This call writes across mlk_polyvec boundaries for K=2 and K=3.
- * This is intentional and safe.
- */
- mlk_poly_rej_uniform_x4(&a[i], seed_ext);
+ mlk_poly_rej_uniform_x4(&a->vec[i / MLKEM_K].vec[i % MLKEM_K],
+ &a->vec[(i + 1) / MLKEM_K].vec[(i + 1) % MLKEM_K],
+ &a->vec[(i + 2) / MLKEM_K].vec[(i + 2) % MLKEM_K],
+ &a->vec[(i + 3) / MLKEM_K].vec[(i + 3) % MLKEM_K],
+ seed_ext);
}
-
- /* For MLKEM_K == 3, sample the last entry individually. */
- if (i < MLKEM_K * MLKEM_K)
+#else /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
+ /* When using serial FIPS202, sample all entries individually. */
+ i = 0;
+#endif /* MLK_CONFIG_SERIAL_FIPS202_ONLY */
+
+ /* For MLKEM_K == 3, sample the last entry individually.
+ * When MLK_CONFIG_SERIAL_FIPS202_ONLY is set, sample all entries
+ * individually. */
+ for (; i < MLKEM_K * MLKEM_K; i++)
{
uint8_t x, y;
- x = i / MLKEM_K;
- y = i % MLKEM_K;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)(i / MLKEM_K);
+ y = (uint8_t)(i % MLKEM_K);
if (transposed)
{
@@ -264,8 +307,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
seed_ext[0][MLKEM_SYMBYTES + 1] = x;
}
- mlk_poly_rej_uniform(&a[i], seed_ext[0]);
- i++;
+ mlk_poly_rej_uniform(&a->vec[i / MLKEM_K].vec[i % MLKEM_K], seed_ext[0]);
}
mlk_assert(i == MLKEM_K * MLKEM_K);
@@ -274,10 +316,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* The public matrix is generated in NTT domain. If the native backend
* uses a custom order in NTT domain, permute A accordingly.
*/
- for (i = 0; i < MLKEM_K * MLKEM_K; i++)
- {
- mlk_poly_permute_bitrev_to_custom(a[i].coeffs);
- }
+ mlk_polymat_permute_bitrev_to_custom(a);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -301,24 +340,25 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* Specification: Implements @[FIPS203, Section 2.4.7, Eq (2.12), (2.13)]
*
**************************************************/
-static void mlk_matvec_mul(mlk_polyvec out, const mlk_polymat a,
- const mlk_polyvec v, const mlk_polyvec_mulcache vc)
+static void mlk_matvec_mul(mlk_polyvec *out, const mlk_polymat *a,
+ const mlk_polyvec *v, const mlk_polyvec_mulcache *vc)
__contract__(
requires(memory_no_alias(out, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(v, sizeof(mlk_polyvec)))
requires(memory_no_alias(vc, sizeof(mlk_polyvec_mulcache)))
- requires(forall(k0, 0, MLKEM_K * MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(out)))
+ requires(forall(k0, 0, MLKEM_K,
+ forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k0].vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))))
+ assigns(memory_slice(out, sizeof(mlk_polyvec))))
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
__loop__(
- assigns(i, object_whole(out))
+ assigns(i, memory_slice(out, sizeof(mlk_polyvec)))
invariant(i <= MLKEM_K))
{
- mlk_polyvec_basemul_acc_montgomery_cached(&out[i], &a[MLKEM_K * i], v, vc);
+ mlk_polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a->vec[i], v, vc);
}
}
@@ -331,20 +371,34 @@ __contract__(
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- const uint8_t *publicseed = buf;
- const uint8_t *noiseseed = buf + MLKEM_SYMBYTES;
- mlk_polymat a;
- mlk_polyvec e, pkpv, skpv;
- mlk_polyvec_mulcache skpv_cache;
-
- MLK_ALIGN uint8_t coins_with_domain_separator[MLKEM_SYMBYTES + 1];
+ int ret = 0;
+ const uint8_t *publicseed;
+ const uint8_t *noiseseed;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_ALLOC(a, mlk_polymat, 1, context);
+ MLK_ALLOC(e, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (buf == NULL || coins_with_domain_separator == NULL || a == NULL ||
+ e == NULL || pkpv == NULL || skpv == NULL || skpv_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ publicseed = buf;
+ noiseseed = buf + MLKEM_SYMBYTES;
+
/* Concatenate coins with MLKEM_K for domain separation of security levels */
- memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
mlk_hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
@@ -360,24 +414,24 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_gen_matrix(a, publicseed, 0 /* no transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &e[0], &e[1], noiseseed, 0, 1,
- 2, 3);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &e->vec[0],
+ &e->vec[1], noiseseed, 0, 1, 2, 3);
#elif MLKEM_K == 3
/*
* Only the first three output buffers are needed.
* The laster parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2],
- &pkpv[0] /* irrelevant */, noiseseed, 0, 1, 2,
- 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2], NULL,
+ noiseseed, 0, 1, 2, 0xFF /* irrelevant */);
/* Same here */
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &pkpv[0] /* irrelevant */,
- noiseseed, 3, 4, 5, 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], NULL, noiseseed,
+ 3, 4, 5, 0xFF /* irrelevant */);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2], &skpv[3], noiseseed,
- 0, 1, 2, 3);
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &e[3], noiseseed, 4, 5, 6, 7);
-#endif
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2],
+ &skpv->vec[3], noiseseed, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], &e->vec[3],
+ noiseseed, 4, 5, 6, 7);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(skpv);
mlk_polyvec_ntt(e);
@@ -393,14 +447,17 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_pack_sk(sk, skpv);
mlk_pack_pk(pk, pkpv, publicseed);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(coins_with_domain_separator, sizeof(coins_with_domain_separator));
- mlk_zeroize(a, sizeof(a));
- mlk_zeroize(&e, sizeof(e));
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&skpv_cache, sizeof(skpv_cache));
+ MLK_FREE(skpv_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(e, mlk_polyvec, 1, context);
+ MLK_FREE(a, mlk_polymat, 1, context);
+ MLK_FREE(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_enc()` in the reference implementation @[REF].
@@ -412,19 +469,33 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t seed[MLKEM_SYMBYTES];
- mlk_polymat at;
- mlk_polyvec sp, pkpv, ep, b;
- mlk_poly v, k, epp;
- mlk_polyvec_mulcache sp_cache;
+ int ret = 0;
+ MLK_ALLOC(seed, uint8_t, MLKEM_SYMBYTES, context);
+ MLK_ALLOC(at, mlk_polymat, 1, context);
+ MLK_ALLOC(sp, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(ep, mlk_polyvec, 1, context);
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(k, mlk_poly, 1, context);
+ MLK_ALLOC(epp, mlk_poly, 1, context);
+ MLK_ALLOC(sp_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (seed == NULL || at == NULL || sp == NULL || pkpv == NULL || ep == NULL ||
+ b == NULL || v == NULL || k == NULL || epp == NULL || sp_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_unpack_pk(pkpv, seed, pk);
- mlk_poly_frommsg(&k, m);
+ mlk_poly_frommsg(k, m);
/*
* Declassify the public seed.
@@ -437,87 +508,105 @@ void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
mlk_gen_matrix(at, seed, 1 /* transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1122_4x(&sp[0], &sp[1], &ep[0], &ep[1], coins, 0, 1, 2,
- 3);
- mlk_poly_getnoise_eta2(&epp, coins, 4);
+ mlk_poly_getnoise_eta1122_4x(&sp->vec[0], &sp->vec[1], &ep->vec[0],
+ &ep->vec[1], coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2(epp, coins, 4);
#elif MLKEM_K == 3
/*
* In this call, only the first three output buffers are needed.
* The last parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &b[0], coins, 0, 1, 2,
- 0xFF);
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], NULL, coins,
+ 0, 1, 2, 0xFF /* irrelevant */);
/* The fourth output buffer in this call _is_ used. */
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &epp, coins, 3, 4, 5, 6);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], epp, coins,
+ 3, 4, 5, 6);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &sp[3], coins, 0, 1, 2, 3);
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &ep[3], coins, 4, 5, 6, 7);
- mlk_poly_getnoise_eta2(&epp, coins, 8);
-#endif
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], &sp->vec[3],
+ coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], &ep->vec[3],
+ coins, 4, 5, 6, 7);
+ mlk_poly_getnoise_eta2(epp, coins, 8);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(sp);
mlk_polyvec_mulcache_compute(sp_cache, sp);
mlk_matvec_mul(b, at, sp, sp_cache);
- mlk_polyvec_basemul_acc_montgomery_cached(&v, pkpv, sp, sp_cache);
+ mlk_polyvec_basemul_acc_montgomery_cached(v, pkpv, sp, sp_cache);
mlk_polyvec_invntt_tomont(b);
- mlk_poly_invntt_tomont(&v);
+ mlk_poly_invntt_tomont(v);
mlk_polyvec_add(b, ep);
- mlk_poly_add(&v, &epp);
- mlk_poly_add(&v, &k);
+ mlk_poly_add(v, epp);
+ mlk_poly_add(v, k);
mlk_polyvec_reduce(b);
- mlk_poly_reduce(&v);
+ mlk_poly_reduce(v);
- mlk_pack_ciphertext(c, b, &v);
+ mlk_pack_ciphertext(c, b, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(seed, sizeof(seed));
- mlk_zeroize(&sp, sizeof(sp));
- mlk_zeroize(&sp_cache, sizeof(sp_cache));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(at, sizeof(at));
- mlk_zeroize(&k, sizeof(k));
- mlk_zeroize(&ep, sizeof(ep));
- mlk_zeroize(&epp, sizeof(epp));
+ MLK_FREE(sp_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(epp, mlk_poly, 1, context);
+ MLK_FREE(k, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ MLK_FREE(ep, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(sp, mlk_polyvec, 1, context);
+ MLK_FREE(at, mlk_polymat, 1, context);
+ MLK_FREE(seed, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_dec()` in the reference implementation @[REF].
* - We use a mulcache for the scalar product.
* - We include buffer zeroization. */
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_polyvec b, skpv;
- mlk_poly v, sb;
- mlk_polyvec_mulcache b_cache;
+ int ret = 0;
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(sb, mlk_poly, 1, context);
+ MLK_ALLOC(b_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (b == NULL || skpv == NULL || v == NULL || sb == NULL || b_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- mlk_unpack_ciphertext(b, &v, c);
+ mlk_unpack_ciphertext(b, v, c);
mlk_unpack_sk(skpv, sk);
mlk_polyvec_ntt(b);
mlk_polyvec_mulcache_compute(b_cache, b);
- mlk_polyvec_basemul_acc_montgomery_cached(&sb, skpv, b, b_cache);
- mlk_poly_invntt_tomont(&sb);
+ mlk_polyvec_basemul_acc_montgomery_cached(sb, skpv, b, b_cache);
+ mlk_poly_invntt_tomont(sb);
- mlk_poly_sub(&v, &sb);
- mlk_poly_reduce(&v);
+ mlk_poly_sub(v, sb);
+ mlk_poly_reduce(v);
- mlk_poly_tomsg(m, &v);
+ mlk_poly_tomsg(m, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&b_cache, sizeof(b_cache));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(&sb, sizeof(sb));
+ MLK_FREE(b_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(sb, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
@@ -529,4 +618,5 @@ void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
#undef mlk_pack_ciphertext
#undef mlk_unpack_ciphertext
#undef mlk_matvec_mul
-#undef mlk_poly_permute_bitrev_to_custom
+#undef mlk_polyvec_permute_bitrev_to_custom
+#undef mlk_polymat_permute_bitrev_to_custom
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/indcpa.h
index 4c44d0d411..b31756dcb6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/indcpa.h
@@ -15,7 +15,6 @@
#ifndef MLK_INDCPA_H
#define MLK_INDCPA_H
-#include
#include "cbmc.h"
#include "common.h"
#include "poly_k.h"
@@ -39,18 +38,19 @@
*
**************************************************/
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
requires(transposed == 0 || transposed == 1)
- assigns(object_whole(a))
- ensures(forall(x, 0, MLKEM_K * MLKEM_K,
- array_bound(a[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
);
-#define mlk_indcpa_keypair_derand MLK_NAMESPACE_K(indcpa_keypair_derand)
+#define mlk_indcpa_keypair_derand \
+ MLK_NAMESPACE_K(indcpa_keypair_derand) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_keypair_derand
*
@@ -68,18 +68,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
-#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc)
+#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc) MLK_CONTEXT_PARAMETERS_4
/*************************************************
* Name: mlk_indcpa_enc
*
@@ -100,19 +105,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(c))
+ assigns(memory_slice(c, MLKEM_INDCPA_BYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
-#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec)
+#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_dec
*
@@ -130,14 +139,18 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
- assigns(object_whole(m))
+ assigns(memory_slice(m, MLKEM_INDCPA_MSGBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_INDCPA_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/kem.c
index d6f4e83628..3c82d6df70 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/kem.c
@@ -8,7 +8,8 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*
* - [FIPS203]
@@ -22,12 +23,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
-#include
+#include "kem.h"
#include "indcpa.h"
-#include "kem.h"
#include "randombytes.h"
#include "symmetric.h"
#include "verify.h"
@@ -36,44 +34,24 @@
* This is to facilitate building multiple instances
* of mlkem-native (e.g. with varying security levels)
* within a single compilation unit. */
-#define mlk_check_pk MLK_ADD_PARAM_SET(mlk_check_pk)
-#define mlk_check_sk MLK_ADD_PARAM_SET(mlk_check_sk)
-#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct)
+#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct) MLK_CONTEXT_PARAMETERS_2
/* End of parameter set namespacing */
-#if defined(CBMC)
-/* Redeclaration with contract needed for CBMC only */
-int memcmp(const void *str1, const void *str2, size_t n)
-__contract__(
- requires(memory_no_alias(str1, n))
- requires(memory_no_alias(str2, n))
-);
-#endif /* CBMC */
-
-/*************************************************
- * Name: mlk_check_pk
- *
- * Description: Implements modulus check mandated by FIPS 203,
- * i.e., ensures that coefficients are in [0,q-1].
- *
- * Arguments: - const uint8_t *pk: pointer to input public key
- * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
- *
- **************************************************/
-
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- mlk_polyvec p;
- uint8_t p_reencoded[MLKEM_POLYVECBYTES];
+ int ret = 0;
+ MLK_ALLOC(p, mlk_polyvec, 1, context);
+ MLK_ALLOC(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+
+ if (p == NULL || p_reencoded == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_polyvec_frombytes(p, pk);
mlk_polyvec_reduce(p);
@@ -81,39 +59,32 @@ static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
/* We use a constant-time memcmp here to avoid having to
* declassify the PK before the PCT has succeeded. */
- res = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? -1 : 0;
+ ret = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? MLK_ERR_FAIL : 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(p_reencoded, sizeof(p_reencoded));
- mlk_zeroize(&p, sizeof(p));
- return res;
+ MLK_FREE(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+ MLK_FREE(p, mlk_polyvec, 1, context);
+ return ret;
}
-/*************************************************
- * Name: mlk_check_sk
- *
- * Description: Implements public key hash check mandated by FIPS 203,
- * i.e., ensures that
- * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
- *
- * Arguments: - const uint8_t *sk: pointer to input private key
- * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
- *
- **************************************************/
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t test[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(test, uint8_t, MLKEM_SYMBYTES, context);
+
+ if (test == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
/*
* The parts of `sk` being hashed and compared here are public, so
* no public information is leaked through the runtime or the return value
@@ -128,23 +99,32 @@ static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
mlk_hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES,
MLKEM_INDCCA_PUBLICKEYBYTES);
- res = memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, test,
- MLKEM_SYMBYTES)
- ? -1
+ /* This doesn't have to be a constant-time memcmp, but it's the only place
+ * in the library where a normal memcmp would be used otherwise, so for sake
+ * of minimizing stdlib dependency, we use our constant-time one anyway. */
+ ret = mlk_ct_memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ test, MLKEM_SYMBYTES)
+ ? MLK_ERR_FAIL
: 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(test, sizeof(test));
- return res;
+ MLK_FREE(test, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES)));
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
+);
#if defined(MLK_CONFIG_KEYGEN_PCT)
/* Specification:
@@ -152,21 +132,30 @@ __contract__(
* @[FIPS203, Section 7.1, Pairwise Consistency]. */
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES];
- uint8_t ss_enc[MLKEM_SSBYTES], ss_dec[MLKEM_SSBYTES];
+ int ret = 0;
+ MLK_ALLOC(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ MLK_ALLOC(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_ALLOC(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+
+ if (ct == NULL || ss_enc == NULL || ss_dec == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- res = crypto_kem_enc(ct, ss_enc, pk);
- if (res != 0)
+ ret = mlk_kem_enc(ct, ss_enc, pk, context);
+ if (ret != 0)
{
goto cleanup;
}
- res = crypto_kem_dec(ss_dec, ct, sk);
- if (res != 0)
+ ret = mlk_kem_dec(ss_dec, ct, sk, context);
+ if (ret != 0)
{
goto cleanup;
}
@@ -179,26 +168,36 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
}
#endif /* MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST */
- res = mlk_ct_memcmp(ss_enc, ss_dec, sizeof(ss_dec));
+ ret = mlk_ct_memcmp(ss_enc, ss_dec, MLKEM_SSBYTES);
+ /* The result of the PCT is public. */
+ MLK_CT_TESTING_DECLASSIFY(&ret, sizeof(ret));
+
+ if (ret != 0)
+ {
+ ret = MLK_ERR_FAIL;
+ }
cleanup:
- /* The result of the PCT is public. */
- MLK_CT_TESTING_DECLASSIFY(&res, sizeof(res));
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(ct, sizeof(ct));
- mlk_zeroize(ss_enc, sizeof(ss_enc));
- mlk_zeroize(ss_dec, sizeof(ss_dec));
- return res;
+ MLK_FREE(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ return ret;
}
-#else /* MLK_CONFIG_KEYGEN_PCT */
+#else /* MLK_CONFIG_KEYGEN_PCT */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
/* Skip PCT */
((void)pk);
((void)sk);
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER)
+ ((void)context);
+#endif
return 0;
}
#endif /* !MLK_CONFIG_KEYGEN_PCT */
@@ -208,164 +207,240 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
* - We optionally include PCT which is not present in
* the reference code. */
MLK_EXTERNAL_API
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_indcpa_keypair_derand(pk, sk, coins);
- memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ int ret;
+
+ ret = mlk_indcpa_keypair_derand(pk, sk, coins, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
+ mlk_memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_h(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, pk,
MLKEM_INDCCA_PUBLICKEYBYTES);
/* Value z for pseudo-random output on reject */
- memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
/* Declassify public key */
MLK_CT_TESTING_DECLASSIFY(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
/* Pairwise Consistency Test (PCT) @[FIPS140_3_IG, p.87] */
- if (mlk_check_pct(pk, sk))
+ ret = mlk_check_pct(pk, sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- return 0;
+cleanup:
+ if (ret != 0)
+ {
+ mlk_zeroize(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ mlk_zeroize(sk, MLKEM_INDCCA_SECRETKEYBYTES);
+ }
+
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_keypair()` in the reference implementation @[REF]
* - We zeroize the stack buffer */
MLK_EXTERNAL_API
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Acquire necessary randomness, and mark it as secret. */
- mlk_randombytes(coins, 2 * MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (mlk_randombytes(coins, 2 * MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, 2 * MLKEM_SYMBYTES);
- res = crypto_kem_keypair_derand(pk, sk, coins);
+ ret = mlk_kem_keypair_derand(pk, sk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_enc_derand()` in the reference implementation @[REF]
* - We include public key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (buf == NULL || kr == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.2, Modulus check] */
- if (mlk_check_pk(pk))
+ ret = mlk_kem_check_pk(pk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- memcpy(buf, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(buf, coins, MLKEM_SYMBYTES);
/* Multitarget countermeasure for coins + contributory KEM */
mlk_hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
- memcpy(ss, kr, MLKEM_SYMBYTES);
+ mlk_memcpy(ss, kr, MLKEM_SYMBYTES);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
-
- return 0;
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_enc()` in the reference implementation @[REF]
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, MLKEM_SYMBYTES, context);
- mlk_randombytes(coins, MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ if (mlk_randombytes(coins, MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, MLKEM_SYMBYTES);
- res = crypto_kem_enc_derand(ct, ss, pk, coins);
+ ret = mlk_kem_enc_derand(ct, ss, pk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_dec()` in the reference implementation @[REF]
* - We include secret key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
+ int ret = 0;
uint8_t fail;
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
- MLK_ALIGN uint8_t tmp[MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES];
-
const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+
+ if (buf == NULL || kr == NULL || tmp == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.3, Hash check] */
- if (mlk_check_sk(sk))
+ ret = mlk_kem_check_sk(sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- mlk_indcpa_dec(buf, ct, sk);
+ ret = mlk_indcpa_dec(buf, ct, sk, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
/* Multitarget countermeasure for coins + contributory KEM */
- memcpy(buf + MLKEM_SYMBYTES,
- sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(buf + MLKEM_SYMBYTES,
+ sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* Recompute and compare ciphertext */
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
fail = mlk_ct_memcmp(ct, tmp, MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Compute rejection key */
- memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- MLKEM_SYMBYTES);
- memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
- mlk_hash_j(ss, tmp, sizeof(tmp));
+ mlk_memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
+ mlk_memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
+ mlk_hash_j(ss, tmp, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Copy true key to return buffer if fail is 0 */
mlk_ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
- mlk_zeroize(tmp, sizeof(tmp));
+ MLK_FREE(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
- return 0;
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef mlk_check_pk
-#undef mlk_check_sk
#undef mlk_check_pct
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/kem.h
index d3e5f50ce6..0502715c39 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/kem.h
@@ -10,12 +10,16 @@
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ * CRYSTALS-Kyber C reference implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/ref
*/
#ifndef MLK_KEM_H
#define MLK_KEM_H
-#include
#include "cbmc.h"
#include "common.h"
#include "sys.h"
@@ -23,9 +27,7 @@
#if defined(MLK_CHECK_APIS)
/* Include to ensure consistency between internal kem.h
* and external mlkem_native.h. */
-#define MLK_CONFIG_API_NO_SUPERCOP
#include "mlkem_native.h"
-#undef MLK_CONFIG_API_NO_SUPERCOP
#if MLKEM_INDCCA_SECRETKEYBYTES != \
MLKEM_SECRETKEYBYTES(MLK_CONFIG_PARAMETER_SET)
@@ -44,14 +46,79 @@
#endif /* MLK_CHECK_APIS */
-#define crypto_kem_keypair_derand MLK_NAMESPACE_K(keypair_derand)
-#define crypto_kem_keypair MLK_NAMESPACE_K(keypair)
-#define crypto_kem_enc_derand MLK_NAMESPACE_K(enc_derand)
-#define crypto_kem_enc MLK_NAMESPACE_K(enc)
-#define crypto_kem_dec MLK_NAMESPACE_K(dec)
+#define mlk_kem_keypair_derand \
+ MLK_NAMESPACE_K(keypair_derand) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_keypair MLK_NAMESPACE_K(keypair) MLK_CONTEXT_PARAMETERS_2
+#define mlk_kem_enc_derand MLK_NAMESPACE_K(enc_derand) MLK_CONTEXT_PARAMETERS_4
+#define mlk_kem_enc MLK_NAMESPACE_K(enc) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_dec MLK_NAMESPACE_K(dec) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_check_pk MLK_NAMESPACE_K(check_pk) MLK_CONTEXT_PARAMETERS_1
+#define mlk_kem_check_sk MLK_NAMESPACE_K(check_sk) MLK_CONTEXT_PARAMETERS_1
+
+/*************************************************
+ * Name: mlk_kem_check_pk
+ *
+ * Description: Implements modulus check mandated by FIPS 203,
+ * i.e., ensures that coefficients are in [0,q-1].
+ *
+ * Arguments: - const uint8_t *pk: pointer to input public key
+ * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the modulus check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+
+/*************************************************
+ * Name: mlk_kem_check_sk
+ *
+ * Description: Implements public key hash check mandated by FIPS 203,
+ * i.e., ensures that
+ * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
+ *
+ * Arguments: - const uint8_t *sk: pointer to input private key
+ * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the public key hash check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
/*************************************************
- * Name: crypto_kem_keypair_derand
+ * Name: mlk_kem_keypair_derand
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -67,26 +134,33 @@
* random bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 16, ML-KEM.KeyGen_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_keypair
+ * Name: mlk_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -99,24 +173,32 @@ __contract__(
* bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
*
* Specification: Implements @[FIPS203, Algorithm 19, ML-KEM.KeyGen]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_enc_derand
+ * Name: mlk_kem_enc_derand
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -134,29 +216,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 17, ML-KEM.Encaps_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
/*************************************************
- * Name: crypto_kem_enc
+ * Name: mlk_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -171,27 +258,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
*
* Specification: Implements @[FIPS203, Algorithm 20, ML-KEM.Encaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_dec
+ * Name: mlk_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
@@ -206,22 +300,27 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'hash check' @[FIPS203, Section 7.3]
- * for the secret key fails.
+ * - MLK_ERR_FAIL: If the 'hash check' @[FIPS203, Section 7.3]
+ * for the secret key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 21, ML-KEM.Decaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(ss))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_KEM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/meta.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/meta.h
index edcc8b02a1..e487e68b83 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/meta.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/meta.h
@@ -22,77 +22,98 @@
#if !defined(__ASSEMBLER__)
+#include "../api.h"
#include "src/arith_native_aarch64.h"
-static MLK_INLINE void mlk_ntt_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N])
{
mlk_ntt_asm(data, mlk_aarch64_ntt_zetas_layer12345,
mlk_aarch64_ntt_zetas_layer67);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_intt_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N])
{
mlk_intt_asm(data, mlk_aarch64_invntt_zetas_layer12345,
mlk_aarch64_invntt_zetas_layer67);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_reduce_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N])
{
mlk_poly_reduce_asm(data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_tomont_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N])
{
mlk_poly_tomont_asm(data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_mulcache_compute_native(
- int16_t x[MLKEM_N / 2], const int16_t y[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+ const int16_t y[MLKEM_N])
{
mlk_poly_mulcache_compute_asm(x, y, mlk_aarch64_zetas_mulcache_native,
mlk_aarch64_zetas_mulcache_twisted_native);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+ const int16_t a[MLKEM_N])
{
mlk_poly_tobytes_asm(r, a);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
+MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
const uint8_t *buf,
unsigned buflen)
{
- if (len != MLKEM_N || buflen % 24 != 0)
+ if (len != MLKEM_N ||
+ buflen % 24 != 0) /* NEON support is mandatory for AArch64 */
{
- return -1;
+ return MLK_NATIVE_FUNC_FALLBACK;
}
return (int)mlk_rej_uniform_asm(r, buf, buflen, mlk_rej_uniform_table);
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/aarch64_zetas.c
index 487f697481..4b3f0d86c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/aarch64_zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/aarch64_zetas.c
@@ -5,6 +5,7 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
@@ -13,7 +14,6 @@
#if defined(MLK_ARITH_BACKEND_AARCH64) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
#include "arith_native_aarch64.h"
/*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/arith_native_aarch64.h
index 939fed7109..2941ecbd4b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/arith_native_aarch64.h
@@ -5,7 +5,6 @@
#ifndef MLK_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H
#define MLK_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H
-#include
#include "../../../cbmc.h"
#include "../../../common.h"
@@ -31,10 +30,10 @@ extern const int16_t mlk_aarch64_zetas_mulcache_twisted_native[];
extern const uint8_t mlk_rej_uniform_table[];
#define mlk_ntt_asm MLK_NAMESPACE(ntt_asm)
-void mlk_ntt_asm(int16_t *p, const int16_t *twiddles12345,
- const int16_t *twiddles56)
+void mlk_ntt_asm(int16_t p[256], const int16_t twiddles12345[80],
+ const int16_t twiddles56[384])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_ntt.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_ntt.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(p, 0, MLKEM_N, 8192))
@@ -47,10 +46,10 @@ __contract__(
);
#define mlk_intt_asm MLK_NAMESPACE(intt_asm)
-void mlk_intt_asm(int16_t *p, const int16_t *twiddles12345,
- const int16_t *twiddles56)
+void mlk_intt_asm(int16_t p[256], const int16_t twiddles12345[80],
+ const int16_t twiddles56[384])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_intt.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_intt.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(twiddles12345 == mlk_aarch64_invntt_zetas_layer12345)
@@ -62,9 +61,9 @@ __contract__(
);
#define mlk_poly_reduce_asm MLK_NAMESPACE(poly_reduce_asm)
-void mlk_poly_reduce_asm(int16_t *p)
+void mlk_poly_reduce_asm(int16_t p[256])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_poly_reduce.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_poly_reduce.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
@@ -72,9 +71,9 @@ __contract__(
);
#define mlk_poly_tomont_asm MLK_NAMESPACE(poly_tomont_asm)
-void mlk_poly_tomont_asm(int16_t *p)
+void mlk_poly_tomont_asm(int16_t p[256])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_poly_tomont.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_poly_tomont.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
@@ -82,39 +81,39 @@ __contract__(
);
#define mlk_poly_mulcache_compute_asm MLK_NAMESPACE(poly_mulcache_compute_asm)
-void mlk_poly_mulcache_compute_asm(int16_t *cache, const int16_t *mlk_poly,
- const int16_t *zetas,
- const int16_t *zetas_twisted)
+void mlk_poly_mulcache_compute_asm(int16_t cache[128],
+ const int16_t mlk_poly[256],
+ const int16_t zetas[128],
+ const int16_t zetas_twisted[128])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_poly_mulcache_compute.ml */
__contract__(
requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
requires(zetas == mlk_aarch64_zetas_mulcache_native)
requires(zetas_twisted == mlk_aarch64_zetas_mulcache_twisted_native)
- assigns(object_whole(cache))
+ assigns(memory_slice(cache, sizeof(int16_t) * (MLKEM_N / 2)))
ensures(array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
);
#define mlk_poly_tobytes_asm MLK_NAMESPACE(poly_tobytes_asm)
-void mlk_poly_tobytes_asm(uint8_t *r, const int16_t *a)
+void mlk_poly_tobytes_asm(uint8_t r[384], const int16_t a[256])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_poly_tobytes.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_poly_tobytes.ml */
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
);
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k2 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
-void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(int16_t *r,
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(
+ int16_t r[256], const int16_t a[512], const int16_t b[512],
+ const int16_t b_cache[256])
/* This must be kept in sync with the HOL-Light specification in
- * proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml.
+ * proofs/hol_light/aarch64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml.
*/
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
@@ -127,12 +126,11 @@ __contract__(
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k3 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
-void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(int16_t *r,
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(
+ int16_t r[256], const int16_t a[768], const int16_t b[768],
+ const int16_t b_cache[384])
/* This must be kept in sync with the HOL-Light specification in
- * proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml.
+ * proofs/hol_light/aarch64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml.
*/
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
@@ -145,12 +143,11 @@ __contract__(
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k4 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
-void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(int16_t *r,
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(
+ int16_t r[256], const int16_t a[1024], const int16_t b[1024],
+ const int16_t b_cache[512])
/* This must be kept in sync with the HOL-Light specification in
- * proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml.
+ * proofs/hol_light/aarch64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml.
*/
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
@@ -162,10 +159,11 @@ __contract__(
);
#define mlk_rej_uniform_asm MLK_NAMESPACE(rej_uniform_asm)
-uint64_t mlk_rej_uniform_asm(int16_t *r, const uint8_t *buf, unsigned buflen,
- const uint8_t *table)
+MLK_MUST_CHECK_RETURN_VALUE
+uint64_t mlk_rej_uniform_asm(int16_t r[256], const uint8_t *buf,
+ unsigned buflen, const uint8_t table[2048])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_rej_uniform.ml. */
+ * in proofs/hol_light/aarch64/proofs/mlkem_rej_uniform.ml. */
__contract__(
requires(buflen % 24 == 0)
requires(memory_no_alias(buf, buflen))
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/consts.h
deleted file mode 100644
index e9f877e831..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/consts.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-#ifndef MLK_NATIVE_AARCH64_SRC_CONSTS_H
-#define MLK_NATIVE_AARCH64_SRC_CONSTS_H
-
-#include
-#include "../../../common.h"
-
-#define mlk_zetas_mulcache_native MLK_NAMESPACE(zetas_mulcache_native)
-extern const int16_t mlk_zetas_mulcache_native[256];
-
-#define mlk_zetas_mulcache_twisted_native \
- MLK_NAMESPACE(zetas_mulcache_twisted_native)
-extern const int16_t mlk_zetas_mulcache_twisted_native[256];
-
-#endif /* !MLK_NATIVE_AARCH64_SRC_CONSTS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/intt.S
index fe5f1e2d14..8410ac9b30 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/intt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/intt.S
@@ -19,7 +19,33 @@
* https://eprint.iacr.org/2022/1303
*/
-/* AArch64 ML-KEM inverse NTT following @[NeonNTT] and @[SLOTHY_Paper]. */
+/*yaml
+ Name: intt_asm
+ Description: AArch64 ML-KEM inverse NTT following @[NeonNTT] and @[SLOTHY_Paper]
+ Signature: void mlk_intt_asm(int16_t p[256], const int16_t twiddles12345[80], const int16_t twiddles56[384])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: read/write
+ c_parameter: int16_t p[256]
+ description: Input/output polynomial
+ x1:
+ type: buffer
+ size_bytes: 160
+ permissions: read-only
+ c_parameter: const int16_t twiddles12345[80]
+ description: Twiddle factors for layers 1-5
+ x2:
+ type: buffer
+ size_bytes: 768
+ permissions: read-only
+ c_parameter: const int16_t twiddles56[384]
+ description: Twiddle factors for layers 6-7
+ Stack:
+ bytes: 64
+ description: saving callee-saved Neon registers
+*/
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
@@ -29,539 +55,574 @@
* dev/aarch64_opt/src/intt.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(intt_asm)
MLK_ASM_FN_SYMBOL(intt_asm)
- sub sp, sp, #0x40
- stp d8, d9, [sp]
- stp d10, d11, [sp, #0x10]
- stp d12, d13, [sp, #0x20]
- stp d14, d15, [sp, #0x30]
- mov w5, #0xd01 // =3329
- mov v7.h[0], w5
- mov w5, #0x4ebf // =20159
- mov v7.h[1], w5
- mov w5, #0x200 // =512
- dup v29.8h, w5
- mov w5, #0x13b0 // =5040
- dup v30.8h, w5
- mov x3, x0
- mov x4, #0x8 // =8
+ .cfi_startproc
+ sub sp, sp, #0x40
+ .cfi_adjust_cfa_offset 0x40
+ stp d8, d9, [sp]
+ .cfi_rel_offset d8, 0x0
+ .cfi_rel_offset d9, 0x8
+ stp d10, d11, [sp, #0x10]
+ .cfi_rel_offset d10, 0x10
+ .cfi_rel_offset d11, 0x18
+ stp d12, d13, [sp, #0x20]
+ .cfi_rel_offset d12, 0x20
+ .cfi_rel_offset d13, 0x28
+ stp d14, d15, [sp, #0x30]
+ .cfi_rel_offset d14, 0x30
+ .cfi_rel_offset d15, 0x38
+ mov w5, #0xd01 // =3329
+ mov v7.h[0], w5
+ mov w5, #0x4ebf // =20159
+ mov v7.h[1], w5
+ mov w5, #0x200 // =512
+ dup v29.8h, w5
+ mov w5, #0x13b0 // =5040
+ dup v30.8h, w5
+ mov x3, x0
+ mov x4, #0x8 // =8
+ ldr q13, [x3, #0x20]
+ ldr q8, [x3, #0x30]
+ ldr q6, [x3]
+ ldr q16, [x3, #0x10]
+ ldr q4, [x3, #0x50]
+ ldr q11, [x3, #0x40]
+ ldr q3, [x3, #0x70]
+ trn1 v23.4s, v13.4s, v8.4s
+ ldr q0, [x3, #0x60]
+ trn2 v19.4s, v6.4s, v16.4s
+ trn2 v21.4s, v13.4s, v8.4s
+ trn1 v6.4s, v6.4s, v16.4s
+ ldr q24, [x2, #0x20]
+ trn1 v10.2d, v19.2d, v21.2d
+ ldr q16, [x2], #0x60
+ trn1 v5.2d, v6.2d, v23.2d
+ trn1 v28.4s, v0.4s, v3.4s
+ trn2 v18.2d, v6.2d, v23.2d
+ mul v31.8h, v10.8h, v29.8h
+ trn2 v13.4s, v0.4s, v3.4s
+ ldur q14, [x2, #-0x50]
+ sqrdmulh v26.8h, v18.8h, v30.8h
+ ldur q20, [x2, #-0x20]
+ mul v17.8h, v18.8h, v29.8h
+ trn2 v18.2d, v19.2d, v21.2d
+ mul v9.8h, v18.8h, v29.8h
+ trn1 v12.4s, v11.4s, v4.4s
+ sqrdmulh v22.8h, v18.8h, v30.8h
+ sqrdmulh v3.8h, v10.8h, v30.8h
+ sqrdmulh v25.8h, v5.8h, v30.8h
+ mls v9.8h, v22.8h, v7.h[0]
+ mls v17.8h, v26.8h, v7.h[0]
+ trn2 v26.4s, v11.4s, v4.4s
+ mul v8.8h, v5.8h, v29.8h
+ trn1 v10.2d, v26.2d, v13.2d
+ ldur q11, [x2, #-0x10]
+ mls v31.8h, v3.8h, v7.h[0]
+ trn1 v6.2d, v12.2d, v28.2d
+ trn2 v3.2d, v26.2d, v13.2d
+ ldur q4, [x2, #-0x30]
+ mls v8.8h, v25.8h, v7.h[0]
+ sub v19.8h, v17.8h, v9.8h
+ trn2 v13.2d, v12.2d, v28.2d
+ sqrdmulh v1.8h, v3.8h, v30.8h
+ add v9.8h, v17.8h, v9.8h
+ mul v18.8h, v19.8h, v20.8h
+ add v28.8h, v8.8h, v31.8h
+ sqrdmulh v20.8h, v19.8h, v11.8h
+ sub v12.8h, v28.8h, v9.8h
+ sub v23.8h, v8.8h, v31.8h
+ sqrdmulh v11.8h, v13.8h, v30.8h
+ sqrdmulh v5.8h, v23.8h, v4.8h
+ mul v0.8h, v23.8h, v24.8h
+ mul v2.8h, v13.8h, v29.8h
+ mls v0.8h, v5.8h, v7.h[0]
+ add v24.8h, v28.8h, v9.8h
+ mls v18.8h, v20.8h, v7.h[0]
+ sqrdmulh v15.8h, v6.8h, v30.8h
+ sqrdmulh v25.8h, v12.8h, v14.8h
+ mul v21.8h, v12.8h, v16.8h
+ sub v23.8h, v0.8h, v18.8h
+ sqrdmulh v8.8h, v23.8h, v14.8h
+ mul v23.8h, v23.8h, v16.8h
+ mls v21.8h, v25.8h, v7.h[0]
+ mls v23.8h, v8.8h, v7.h[0]
+ mul v14.8h, v3.8h, v29.8h
+ add v3.8h, v0.8h, v18.8h
+ trn2 v4.4s, v24.4s, v3.4s
+ mls v14.8h, v1.8h, v7.h[0]
+ trn1 v9.4s, v24.4s, v3.4s
+ trn2 v12.4s, v21.4s, v23.4s
+ mls v2.8h, v11.8h, v7.h[0]
+ trn1 v28.4s, v21.4s, v23.4s
+ ldr q11, [x1], #0x10
+ mul v31.8h, v10.8h, v29.8h
+ trn1 v25.2d, v4.2d, v12.2d
+ trn1 v20.2d, v9.2d, v28.2d
+ ldr q23, [x2, #0x50]
+ trn2 v13.2d, v4.2d, v12.2d
+ sqrdmulh v21.8h, v10.8h, v30.8h
+ trn2 v4.2d, v9.2d, v28.2d
+ ldr q9, [x2, #0x40]
+ mul v27.8h, v6.8h, v29.8h
+ add v26.8h, v20.8h, v25.8h
+ sub v3.8h, v2.8h, v14.8h
+ sqdmulh v12.8h, v26.8h, v7.h[1]
+ add v5.8h, v4.8h, v13.8h
+ sub v8.8h, v4.8h, v13.8h
+ add v10.8h, v2.8h, v14.8h
+ sqdmulh v6.8h, v5.8h, v7.h[1]
+ ldr q2, [x2, #0x10]
+ mls v27.8h, v15.8h, v7.h[0]
+ ldr q15, [x2, #0x20]
+ srshr v12.8h, v12.8h, #0xb
+ mls v31.8h, v21.8h, v7.h[0]
+ srshr v6.8h, v6.8h, #0xb
+ sqrdmulh v23.8h, v3.8h, v23.8h
+ mls v26.8h, v12.8h, v7.h[0]
+ add v21.8h, v27.8h, v31.8h
+ mls v5.8h, v6.8h, v7.h[0]
+ sub v6.8h, v27.8h, v31.8h
+ sub v14.8h, v21.8h, v10.8h
+ ldr q27, [x2], #0x60
+ mul v3.8h, v3.8h, v9.8h
+ mls v3.8h, v23.8h, v7.h[0]
+ ldur q13, [x2, #-0x30]
+ sub v12.8h, v26.8h, v5.8h
+ add v5.8h, v26.8h, v5.8h
+ sqrdmulh v31.8h, v8.8h, v11.h[5]
+ sqrdmulh v19.8h, v12.8h, v11.h[1]
+ mul v24.8h, v12.8h, v11.h[0]
+ sqrdmulh v13.8h, v6.8h, v13.8h
+ mls v24.8h, v19.8h, v7.h[0]
+ sub x4, x4, #0x2
-intt_scale_start:
- ldr q8, [x3]
- ldr q9, [x3, #0x10]
- ldr q10, [x3, #0x20]
- ldr q11, [x3, #0x30]
- sqrdmulh v27.8h, v8.8h, v30.8h
- mul v8.8h, v8.8h, v29.8h
- mls v8.8h, v27.8h, v7.h[0]
- sqrdmulh v27.8h, v9.8h, v30.8h
- mul v9.8h, v9.8h, v29.8h
- mls v9.8h, v27.8h, v7.h[0]
- sqrdmulh v27.8h, v10.8h, v30.8h
- mul v10.8h, v10.8h, v29.8h
- mls v10.8h, v27.8h, v7.h[0]
- sqrdmulh v27.8h, v11.8h, v30.8h
- mul v11.8h, v11.8h, v29.8h
- mls v11.8h, v27.8h, v7.h[0]
- str q8, [x3], #0x40
- stur q9, [x3, #-0x30]
- stur q10, [x3, #-0x20]
- stur q11, [x3, #-0x10]
- subs x4, x4, #0x1
- cbnz x4, intt_scale_start
- mov x3, x0
- mov x4, #0x8 // =8
- ldr q3, [x3, #0x10]
- ldr q20, [x3]
- ldr q25, [x3, #0x20]
- ldr q24, [x3, #0x30]
- ldr q21, [x2, #0x50]
- trn1 v18.4s, v25.4s, v24.4s
- trn1 v6.4s, v20.4s, v3.4s
- trn2 v12.4s, v25.4s, v24.4s
- trn2 v31.4s, v20.4s, v3.4s
- trn2 v28.2d, v6.2d, v18.2d
- trn1 v25.2d, v6.2d, v18.2d
- trn2 v15.2d, v31.2d, v12.2d
- trn1 v20.2d, v31.2d, v12.2d
- add v4.8h, v28.8h, v15.8h
- add v1.8h, v25.8h, v20.8h
- sub v30.8h, v28.8h, v15.8h
- sub v3.8h, v25.8h, v20.8h
- add v6.8h, v1.8h, v4.8h
- sqrdmulh v9.8h, v30.8h, v21.8h
- ldr q21, [x2, #0x40]
- ldr q25, [x2, #0x30]
- mul v21.8h, v30.8h, v21.8h
- ldr q30, [x2, #0x20]
- sub v28.8h, v1.8h, v4.8h
- ldr q1, [x2, #0x10]
- mls v21.8h, v9.8h, v7.h[0]
- sqrdmulh v9.8h, v3.8h, v25.8h
- mul v20.8h, v3.8h, v30.8h
- ldr q29, [x2], #0x60
- ldr q17, [x3, #0x60]
- mls v20.8h, v9.8h, v7.h[0]
- ldr q3, [x3, #0x70]
- mul v4.8h, v28.8h, v29.8h
- sub v25.8h, v20.8h, v21.8h
- trn1 v15.4s, v17.4s, v3.4s
- sqrdmulh v28.8h, v28.8h, v1.8h
- trn2 v31.4s, v17.4s, v3.4s
- mul v30.8h, v25.8h, v29.8h
- add v20.8h, v20.8h, v21.8h
- mls v4.8h, v28.8h, v7.h[0]
- sqrdmulh v3.8h, v25.8h, v1.8h
- ldr q28, [x3, #0x40]
- trn1 v25.4s, v6.4s, v20.4s
- mls v30.8h, v3.8h, v7.h[0]
- ldr q27, [x3, #0x50]
- trn2 v6.4s, v6.4s, v20.4s
- trn1 v3.4s, v4.4s, v30.4s
- trn2 v10.4s, v28.4s, v27.4s
- trn2 v20.4s, v4.4s, v30.4s
- trn2 v8.2d, v25.2d, v3.2d
- trn1 v9.2d, v25.2d, v3.2d
- trn1 v1.2d, v6.2d, v20.2d
- trn2 v30.2d, v6.2d, v20.2d
- add v4.8h, v9.8h, v1.8h
- add v11.8h, v8.8h, v30.8h
- trn2 v25.2d, v10.2d, v31.2d
- sqdmulh v6.8h, v4.8h, v7.h[1]
- sqdmulh v20.8h, v11.8h, v7.h[1]
- ldr q21, [x2, #0x50]
- srshr v0.8h, v6.8h, #0xb
- srshr v3.8h, v20.8h, #0xb
- trn1 v2.4s, v28.4s, v27.4s
- mls v4.8h, v0.8h, v7.h[0]
- mls v11.8h, v3.8h, v7.h[0]
- ldr q0, [x1], #0x10
- trn2 v20.2d, v2.2d, v15.2d
- sub v6.8h, v4.8h, v11.8h
- sub v5.8h, v20.8h, v25.8h
- sub v22.8h, v9.8h, v1.8h
- sqrdmulh v3.8h, v6.8h, v0.h[1]
- mul v6.8h, v6.8h, v0.h[0]
- sqrdmulh v12.8h, v5.8h, v21.8h
- ldr q19, [x2, #0x40]
- mls v6.8h, v3.8h, v7.h[0]
- ldr q14, [x2], #0x60
- sub x4, x4, #0x2
+Lintt_layer4567_start:
+ add v16.8h, v21.8h, v10.8h
+ mul v18.8h, v6.8h, v15.8h
+ sub v19.8h, v20.8h, v25.8h
+ ldr q21, [x3, #0xa0]
+ str q5, [x3], #0x40
+ mls v18.8h, v13.8h, v7.h[0]
+ sqrdmulh v15.8h, v14.8h, v2.8h
+ ldr q10, [x3, #0x50]
+ ldr q12, [x3, #0x40]
+ stur q24, [x3, #-0x20]
+ mul v5.8h, v8.8h, v11.h[4]
+ sub v0.8h, v18.8h, v3.8h
+ ldr q24, [x3, #0x70]
+ mls v5.8h, v31.8h, v7.h[0]
+ ldr q26, [x2, #0x50]
+ trn2 v1.4s, v12.4s, v10.4s
+ add v6.8h, v18.8h, v3.8h
+ sqrdmulh v20.8h, v0.8h, v2.8h
+ trn1 v13.4s, v12.4s, v10.4s
+ trn1 v18.4s, v16.4s, v6.4s
+ mul v22.8h, v0.8h, v27.8h
+ trn1 v17.4s, v21.4s, v24.4s
+ sqrdmulh v0.8h, v19.8h, v11.h[3]
+ trn1 v25.2d, v13.2d, v17.2d
+ mls v22.8h, v20.8h, v7.h[0]
+ trn2 v21.4s, v21.4s, v24.4s
+ mul v24.8h, v25.8h, v29.8h
+ trn2 v28.2d, v13.2d, v17.2d
+ sqrdmulh v4.8h, v25.8h, v30.8h
+ trn2 v3.2d, v1.2d, v21.2d
+ mul v17.8h, v28.8h, v29.8h
+ sqrdmulh v31.8h, v28.8h, v30.8h
+ ldr q2, [x2, #0x10]
+ mls v24.8h, v4.8h, v7.h[0]
+ mul v4.8h, v19.8h, v11.h[2]
+ ldr q19, [x2, #0x40]
+ mls v4.8h, v0.8h, v7.h[0]
+ mul v0.8h, v14.8h, v27.8h
+ mls v0.8h, v15.8h, v7.h[0]
+ sub v8.8h, v4.8h, v5.8h
+ mul v12.8h, v3.8h, v29.8h
+ mul v23.8h, v8.8h, v11.h[0]
+ trn2 v28.4s, v16.4s, v6.4s
+ sqrdmulh v10.8h, v8.8h, v11.h[1]
+ trn1 v9.4s, v0.4s, v22.4s
+ trn2 v22.4s, v0.4s, v22.4s
+ ldr q11, [x1], #0x10
+ mls v17.8h, v31.8h, v7.h[0]
+ trn1 v20.2d, v18.2d, v9.2d
+ trn2 v14.2d, v18.2d, v9.2d
+ ldr q15, [x2, #0x20]
+ trn1 v6.2d, v1.2d, v21.2d
+ sqrdmulh v9.8h, v3.8h, v30.8h
+ trn1 v25.2d, v28.2d, v22.2d
+ trn2 v16.2d, v28.2d, v22.2d
+ mls v23.8h, v10.8h, v7.h[0]
+ add v1.8h, v20.8h, v25.8h
+ sqrdmulh v21.8h, v6.8h, v30.8h
+ add v8.8h, v14.8h, v16.8h
+ ldr q27, [x2], #0x60
+ sqdmulh v28.8h, v8.8h, v7.h[1]
+ mls v12.8h, v9.8h, v7.h[0]
+ sqdmulh v31.8h, v1.8h, v7.h[1]
+ mul v0.8h, v6.8h, v29.8h
+ sub v10.8h, v17.8h, v12.8h
+ mls v0.8h, v21.8h, v7.h[0]
+ srshr v21.8h, v28.8h, #0xb
+ srshr v13.8h, v31.8h, #0xb
+ sqrdmulh v22.8h, v10.8h, v26.8h
+ mls v8.8h, v21.8h, v7.h[0]
+ mls v1.8h, v13.8h, v7.h[0]
+ add v21.8h, v24.8h, v0.8h
+ stur q23, [x3, #-0x10]
+ sub v6.8h, v24.8h, v0.8h
+ mul v3.8h, v10.8h, v19.8h
+ add v0.8h, v4.8h, v5.8h
+ sqdmulh v13.8h, v0.8h, v7.h[1]
+ ldur q10, [x2, #-0x30]
+ add v5.8h, v1.8h, v8.8h
+ mls v3.8h, v22.8h, v7.h[0]
+ sub v8.8h, v1.8h, v8.8h
+ mul v24.8h, v8.8h, v11.h[0]
+ sqrdmulh v8.8h, v8.8h, v11.h[1]
+ srshr v1.8h, v13.8h, #0xb
+ sqrdmulh v13.8h, v6.8h, v10.8h
+ mls v0.8h, v1.8h, v7.h[0]
+ add v10.8h, v17.8h, v12.8h
+ mls v24.8h, v8.8h, v7.h[0]
+ sub v8.8h, v14.8h, v16.8h
+ sqrdmulh v31.8h, v8.8h, v11.h[5]
+ sub v14.8h, v21.8h, v10.8h
+ stur q0, [x3, #-0x30]
+ subs x4, x4, #0x1
+ cbnz x4, Lintt_layer4567_start
+ mul v15.8h, v6.8h, v15.8h
+ sub v22.8h, v20.8h, v25.8h
+ add v4.8h, v21.8h, v10.8h
+ str q24, [x3, #0x20]
+ mls v15.8h, v13.8h, v7.h[0]
+ str q5, [x3], #0x40
+ ldr q9, [x1], #0x10
+ sqrdmulh v28.8h, v14.8h, v2.8h
+ mul v16.8h, v14.8h, v27.8h
+ sub v18.8h, v15.8h, v3.8h
+ add v15.8h, v15.8h, v3.8h
+ sqrdmulh v0.8h, v18.8h, v2.8h
+ trn2 v24.4s, v4.4s, v15.4s
+ trn1 v2.4s, v4.4s, v15.4s
+ mul v18.8h, v18.8h, v27.8h
+ mls v16.8h, v28.8h, v7.h[0]
+ mls v18.8h, v0.8h, v7.h[0]
+ mul v23.8h, v8.8h, v11.h[4]
+ sqrdmulh v12.8h, v22.8h, v11.h[3]
+ trn1 v17.4s, v16.4s, v18.4s
+ trn2 v4.4s, v16.4s, v18.4s
+ mls v23.8h, v31.8h, v7.h[0]
+ trn2 v3.2d, v2.2d, v17.2d
+ trn2 v6.2d, v24.2d, v4.2d
+ mul v26.8h, v22.8h, v11.h[2]
+ trn1 v28.2d, v2.2d, v17.2d
+ mls v26.8h, v12.8h, v7.h[0]
+ add v25.8h, v3.8h, v6.8h
+ sub v18.8h, v3.8h, v6.8h
+ trn1 v24.2d, v24.2d, v4.2d
+ sqdmulh v1.8h, v25.8h, v7.h[1]
+ sub v27.8h, v28.8h, v24.8h
+ sqrdmulh v2.8h, v18.8h, v9.h[5]
+ add v28.8h, v28.8h, v24.8h
+ mul v24.8h, v27.8h, v9.h[2]
+ sqdmulh v12.8h, v28.8h, v7.h[1]
+ mul v20.8h, v18.8h, v9.h[4]
+ mls v20.8h, v2.8h, v7.h[0]
+ srshr v1.8h, v1.8h, #0xb
+ sqrdmulh v19.8h, v27.8h, v9.h[3]
+ srshr v15.8h, v12.8h, #0xb
+ mls v25.8h, v1.8h, v7.h[0]
+ add v8.8h, v26.8h, v23.8h
+ sub v4.8h, v26.8h, v23.8h
+ mls v28.8h, v15.8h, v7.h[0]
+ mls v24.8h, v19.8h, v7.h[0]
+ mul v2.8h, v4.8h, v11.h[0]
+ sub v19.8h, v28.8h, v25.8h
+ sqrdmulh v15.8h, v4.8h, v11.h[1]
+ add v25.8h, v28.8h, v25.8h
+ sub v10.8h, v24.8h, v20.8h
+ str q25, [x3], #0x40
+ sqrdmulh v22.8h, v19.8h, v9.h[1]
+ add v28.8h, v24.8h, v20.8h
+ sqrdmulh v25.8h, v10.8h, v9.h[1]
+ mul v27.8h, v19.8h, v9.h[0]
+ mul v26.8h, v10.8h, v9.h[0]
+ sqdmulh v20.8h, v28.8h, v7.h[1]
+ sqdmulh v16.8h, v8.8h, v7.h[1]
+ mls v26.8h, v25.8h, v7.h[0]
+ mls v2.8h, v15.8h, v7.h[0]
+ srshr v15.8h, v20.8h, #0xb
+ srshr v1.8h, v16.8h, #0xb
+ mls v27.8h, v22.8h, v7.h[0]
+ mls v28.8h, v15.8h, v7.h[0]
+ mls v8.8h, v1.8h, v7.h[0]
+ stur q27, [x3, #-0x20]
+ stur q2, [x3, #-0x50]
+ stur q28, [x3, #-0x30]
+ stur q26, [x3, #-0x10]
+ stur q8, [x3, #-0x70]
+ mov x4, #0x4 // =4
+ ldr q0, [x1], #0x20
+ ldur q1, [x1, #-0x10]
+ ldr q26, [x0]
+ ldr q13, [x0, #0x40]
+ ldr q28, [x0, #0xc0]
+ ldr q2, [x0, #0x140]
+ ldr q6, [x0, #0x80]
+ ldr q9, [x0, #0x100]
+ ldr q29, [x0, #0x1c0]
+ ldr q23, [x0, #0x180]
+ sub v17.8h, v26.8h, v13.8h
+ add v4.8h, v26.8h, v13.8h
+ ldr q25, [x0, #0xd0]
+ ldr q24, [x0, #0x50]
+ add v5.8h, v6.8h, v28.8h
+ mul v19.8h, v17.8h, v0.h[6]
+ sub v10.8h, v6.8h, v28.8h
+ ldr q30, [x0, #0x150]
+ sqrdmulh v12.8h, v17.8h, v0.h[7]
+ add v17.8h, v9.8h, v2.8h
+ sub v28.8h, v9.8h, v2.8h
+ ldr q2, [x0, #0x90]
+ sub v26.8h, v23.8h, v29.8h
+ sqrdmulh v31.8h, v10.8h, v1.h[1]
+ add v22.8h, v23.8h, v29.8h
+ ldr q3, [x0, #0x110]
+ sqrdmulh v9.8h, v28.8h, v1.h[3]
+ sub v20.8h, v4.8h, v5.8h
+ sub v27.8h, v17.8h, v22.8h
+ ldr q29, [x0, #0x10]
+ add v16.8h, v4.8h, v5.8h
+ sqrdmulh v4.8h, v26.8h, v1.h[5]
+ add v6.8h, v17.8h, v22.8h
+ ldr q22, [x0, #0x1d0]
+ mul v8.8h, v28.8h, v1.h[2]
+ sub v21.8h, v2.8h, v25.8h
+ sub v5.8h, v16.8h, v6.8h
+ mul v17.8h, v26.8h, v1.h[4]
+ mul v26.8h, v10.8h, v1.h[0]
+ mls v26.8h, v31.8h, v7.h[0]
+ mls v17.8h, v4.8h, v7.h[0]
+ mls v19.8h, v12.8h, v7.h[0]
+ mls v8.8h, v9.8h, v7.h[0]
+ sqrdmulh v10.8h, v27.8h, v0.h[5]
+ sub v12.8h, v19.8h, v26.8h
+ add v9.8h, v19.8h, v26.8h
+ sqrdmulh v26.8h, v20.8h, v0.h[3]
+ sub v11.8h, v8.8h, v17.8h
+ add v14.8h, v8.8h, v17.8h
+ sqrdmulh v13.8h, v12.8h, v0.h[3]
+ add v23.8h, v9.8h, v14.8h
+ sqrdmulh v28.8h, v11.8h, v0.h[5]
+ sub v19.8h, v9.8h, v14.8h
+ mul v17.8h, v27.8h, v0.h[4]
+ str q23, [x0, #0x40]
+ mul v14.8h, v20.8h, v0.h[2]
+ mul v8.8h, v11.8h, v0.h[4]
+ mul v4.8h, v12.8h, v0.h[2]
+ mls v14.8h, v26.8h, v7.h[0]
+ mls v17.8h, v10.8h, v7.h[0]
+ mls v8.8h, v28.8h, v7.h[0]
+ mls v4.8h, v13.8h, v7.h[0]
+ sub v10.8h, v14.8h, v17.8h
+ add v20.8h, v14.8h, v17.8h
+ sqrdmulh v28.8h, v5.8h, v0.h[1]
+ mul v18.8h, v5.8h, v0.h[0]
+ str q20, [x0, #0x80]
+ sub v13.8h, v4.8h, v8.8h
+ mul v23.8h, v10.8h, v0.h[0]
+ mul v17.8h, v19.8h, v0.h[0]
+ sqrdmulh v9.8h, v13.8h, v0.h[1]
+ mls v18.8h, v28.8h, v7.h[0]
+ sqrdmulh v10.8h, v10.8h, v0.h[1]
+ sub x4, x4, #0x2
-intt_layer4567_start:
- str q6, [x3, #0x20]
- ldur q18, [x2, #-0x50]
- mul v26.8h, v5.8h, v19.8h
- trn1 v16.2d, v10.2d, v31.2d
- mul v27.8h, v22.8h, v0.h[2]
- trn1 v10.2d, v2.2d, v15.2d
- add v5.8h, v4.8h, v11.8h
- mls v26.8h, v12.8h, v7.h[0]
- add v11.8h, v10.8h, v16.8h
- add v6.8h, v20.8h, v25.8h
- ldur q25, [x2, #-0x40]
- ldur q28, [x2, #-0x30]
- ldr q2, [x3, #0xa0]
- ldr q19, [x2, #0x40]
- sub v17.8h, v8.8h, v30.8h
- ldr q1, [x3, #0x90]
- sqrdmulh v9.8h, v17.8h, v0.h[5]
- str q5, [x3], #0x40
- ldr q30, [x3, #0x70]
- sub v10.8h, v10.8h, v16.8h
- ldr q16, [x3, #0x40]
- sqrdmulh v24.8h, v10.8h, v28.8h
- mul v13.8h, v10.8h, v25.8h
- sub v21.8h, v11.8h, v6.8h
- trn1 v15.4s, v2.4s, v30.4s
- trn2 v31.4s, v2.4s, v30.4s
- mls v13.8h, v24.8h, v7.h[0]
- mul v29.8h, v21.8h, v14.8h
- ldr q12, [x2, #0x50]
- sub v28.8h, v13.8h, v26.8h
- trn2 v10.4s, v16.4s, v1.4s
- add v30.8h, v11.8h, v6.8h
- sqrdmulh v2.8h, v28.8h, v18.8h
- mul v8.8h, v28.8h, v14.8h
- sqrdmulh v18.8h, v21.8h, v18.8h
- ldr q14, [x2], #0x60
- mls v8.8h, v2.8h, v7.h[0]
- add v11.8h, v13.8h, v26.8h
- mls v29.8h, v18.8h, v7.h[0]
- sqrdmulh v20.8h, v22.8h, v0.h[3]
- trn1 v23.4s, v30.4s, v11.4s
- trn2 v28.4s, v30.4s, v11.4s
- trn2 v13.4s, v29.4s, v8.4s
- trn1 v11.4s, v29.4s, v8.4s
- mls v27.8h, v20.8h, v7.h[0]
- trn1 v21.2d, v28.2d, v13.2d
- trn2 v8.2d, v23.2d, v11.2d
- trn1 v24.2d, v23.2d, v11.2d
- mul v26.8h, v17.8h, v0.h[4]
- trn2 v30.2d, v28.2d, v13.2d
- add v4.8h, v24.8h, v21.8h
- add v11.8h, v8.8h, v30.8h
- mls v26.8h, v9.8h, v7.h[0]
- sqdmulh v17.8h, v4.8h, v7.h[1]
- sqdmulh v29.8h, v11.8h, v7.h[1]
- trn2 v25.2d, v10.2d, v31.2d
- add v2.8h, v27.8h, v26.8h
- srshr v28.8h, v17.8h, #0xb
- srshr v13.8h, v29.8h, #0xb
- sqdmulh v20.8h, v2.8h, v7.h[1]
- sub v5.8h, v27.8h, v26.8h
- mls v4.8h, v28.8h, v7.h[0]
- mls v11.8h, v13.8h, v7.h[0]
- srshr v23.8h, v20.8h, #0xb
- sqrdmulh v17.8h, v5.8h, v0.h[1]
- mul v9.8h, v5.8h, v0.h[0]
- mls v2.8h, v23.8h, v7.h[0]
- sub v29.8h, v4.8h, v11.8h
- ldr q0, [x1], #0x10
- stur q2, [x3, #-0x30]
- trn1 v2.4s, v16.4s, v1.4s
- sqrdmulh v3.8h, v29.8h, v0.h[1]
- mul v6.8h, v29.8h, v0.h[0]
- trn2 v20.2d, v2.2d, v15.2d
- mls v9.8h, v17.8h, v7.h[0]
- sub v5.8h, v20.8h, v25.8h
- mls v6.8h, v3.8h, v7.h[0]
- sub v22.8h, v24.8h, v21.8h
- stur q9, [x3, #-0x10]
- sqrdmulh v12.8h, v5.8h, v12.8h
- subs x4, x4, #0x1
- cbnz x4, intt_layer4567_start
- mul v21.8h, v22.8h, v0.h[2]
- mul v28.8h, v5.8h, v19.8h
- trn1 v10.2d, v10.2d, v31.2d
- trn1 v2.2d, v2.2d, v15.2d
- add v11.8h, v4.8h, v11.8h
- sub v30.8h, v8.8h, v30.8h
- add v23.8h, v20.8h, v25.8h
- add v24.8h, v2.8h, v10.8h
- mul v8.8h, v30.8h, v0.h[4]
- sqrdmulh v5.8h, v30.8h, v0.h[5]
- sqrdmulh v22.8h, v22.8h, v0.h[3]
- add v30.8h, v24.8h, v23.8h
- ldur q26, [x2, #-0x30]
- mls v8.8h, v5.8h, v7.h[0]
- sub v5.8h, v2.8h, v10.8h
- ldur q13, [x2, #-0x40]
- mls v21.8h, v22.8h, v7.h[0]
- str q6, [x3, #0x20]
- mul v3.8h, v5.8h, v13.8h
- sqrdmulh v22.8h, v5.8h, v26.8h
- sub v18.8h, v21.8h, v8.8h
- mls v28.8h, v12.8h, v7.h[0]
- str q11, [x3], #0x40
- mls v3.8h, v22.8h, v7.h[0]
- sqrdmulh v16.8h, v18.8h, v0.h[1]
- sub v10.8h, v24.8h, v23.8h
- mul v17.8h, v18.8h, v0.h[0]
- sub v11.8h, v3.8h, v28.8h
- mul v13.8h, v10.8h, v14.8h
- add v22.8h, v3.8h, v28.8h
- mul v14.8h, v11.8h, v14.8h
- ldur q26, [x2, #-0x50]
- trn2 v2.4s, v30.4s, v22.4s
- mls v17.8h, v16.8h, v7.h[0]
- sqrdmulh v10.8h, v10.8h, v26.8h
- sqrdmulh v11.8h, v11.8h, v26.8h
- ldr q9, [x1], #0x10
- mls v13.8h, v10.8h, v7.h[0]
- mls v14.8h, v11.8h, v7.h[0]
- trn1 v6.4s, v30.4s, v22.4s
- add v8.8h, v21.8h, v8.8h
- stur q17, [x3, #-0x10]
- trn2 v0.4s, v13.4s, v14.4s
- trn1 v1.4s, v13.4s, v14.4s
- sqdmulh v13.8h, v8.8h, v7.h[1]
- trn1 v24.2d, v2.2d, v0.2d
- trn2 v2.2d, v2.2d, v0.2d
- trn2 v26.2d, v6.2d, v1.2d
- trn1 v11.2d, v6.2d, v1.2d
- add v22.8h, v26.8h, v2.8h
- sub v28.8h, v11.8h, v24.8h
- sub v27.8h, v26.8h, v2.8h
- add v10.8h, v11.8h, v24.8h
- sqrdmulh v11.8h, v28.8h, v9.h[3]
- mul v24.8h, v28.8h, v9.h[2]
- sqdmulh v1.8h, v22.8h, v7.h[1]
- sqrdmulh v0.8h, v27.8h, v9.h[5]
- srshr v12.8h, v13.8h, #0xb
- mls v24.8h, v11.8h, v7.h[0]
- sqdmulh v14.8h, v10.8h, v7.h[1]
- mul v27.8h, v27.8h, v9.h[4]
- mls v8.8h, v12.8h, v7.h[0]
- srshr v5.8h, v1.8h, #0xb
- srshr v14.8h, v14.8h, #0xb
- mls v27.8h, v0.8h, v7.h[0]
- mls v22.8h, v5.8h, v7.h[0]
- mls v10.8h, v14.8h, v7.h[0]
- stur q8, [x3, #-0x30]
- sub v2.8h, v24.8h, v27.8h
- add v14.8h, v24.8h, v27.8h
- sub v11.8h, v10.8h, v22.8h
- add v20.8h, v10.8h, v22.8h
- sqdmulh v22.8h, v14.8h, v7.h[1]
- sqrdmulh v8.8h, v11.8h, v9.h[1]
- mul v27.8h, v11.8h, v9.h[0]
- sqrdmulh v0.8h, v2.8h, v9.h[1]
- mul v11.8h, v2.8h, v9.h[0]
- srshr v10.8h, v22.8h, #0xb
- mls v27.8h, v8.8h, v7.h[0]
- str q20, [x3], #0x40
- mls v11.8h, v0.8h, v7.h[0]
- mls v14.8h, v10.8h, v7.h[0]
- stur q27, [x3, #-0x20]
- stur q11, [x3, #-0x10]
- stur q14, [x3, #-0x30]
- mov x4, #0x4 // =4
- ldr q0, [x1], #0x20
- ldur q1, [x1, #-0x10]
- ldr q2, [x0]
- ldr q10, [x0, #0x40]
- ldr q11, [x0, #0x80]
- sub v14.8h, v2.8h, v10.8h
- add v2.8h, v2.8h, v10.8h
- ldr q10, [x0, #0xc0]
- sqrdmulh v8.8h, v14.8h, v0.h[7]
- mul v14.8h, v14.8h, v0.h[6]
- sub v22.8h, v11.8h, v10.8h
- add v10.8h, v11.8h, v10.8h
- ldr q11, [x0, #0x1c0]
- add v13.8h, v2.8h, v10.8h
- sub v2.8h, v2.8h, v10.8h
- sqrdmulh v10.8h, v22.8h, v1.h[1]
- mul v22.8h, v22.8h, v1.h[0]
- mls v14.8h, v8.8h, v7.h[0]
- sqrdmulh v8.8h, v2.8h, v0.h[3]
- mul v2.8h, v2.8h, v0.h[2]
- mls v22.8h, v10.8h, v7.h[0]
- ldr q10, [x0, #0x100]
- mls v2.8h, v8.8h, v7.h[0]
- sub v8.8h, v14.8h, v22.8h
- add v14.8h, v14.8h, v22.8h
- ldr q22, [x0, #0x180]
- sqrdmulh v24.8h, v8.8h, v0.h[3]
- mul v8.8h, v8.8h, v0.h[2]
- sub v26.8h, v22.8h, v11.8h
- add v11.8h, v22.8h, v11.8h
- ldr q22, [x0, #0x140]
- sqrdmulh v16.8h, v26.8h, v1.h[5]
- mul v26.8h, v26.8h, v1.h[4]
- add v23.8h, v10.8h, v22.8h
- sub v10.8h, v10.8h, v22.8h
- mls v8.8h, v24.8h, v7.h[0]
- add v22.8h, v23.8h, v11.8h
- mul v24.8h, v10.8h, v1.h[2]
- sqrdmulh v10.8h, v10.8h, v1.h[3]
- sub v19.8h, v13.8h, v22.8h
- add v18.8h, v13.8h, v22.8h
- sub v11.8h, v23.8h, v11.8h
- mls v24.8h, v10.8h, v7.h[0]
- mls v26.8h, v16.8h, v7.h[0]
- sqrdmulh v10.8h, v11.8h, v0.h[5]
- mul v11.8h, v11.8h, v0.h[4]
- sqrdmulh v22.8h, v19.8h, v0.h[1]
- sub v13.8h, v24.8h, v26.8h
- mul v16.8h, v19.8h, v0.h[0]
- mls v11.8h, v10.8h, v7.h[0]
- sqrdmulh v10.8h, v13.8h, v0.h[5]
- mul v13.8h, v13.8h, v0.h[4]
- add v24.8h, v24.8h, v26.8h
- sub v26.8h, v2.8h, v11.8h
- add v9.8h, v2.8h, v11.8h
- add v11.8h, v14.8h, v24.8h
- sub v14.8h, v14.8h, v24.8h
- sqrdmulh v2.8h, v26.8h, v0.h[1]
- mul v24.8h, v26.8h, v0.h[0]
- mls v13.8h, v10.8h, v7.h[0]
- mls v16.8h, v22.8h, v7.h[0]
- sqrdmulh v10.8h, v14.8h, v0.h[1]
- mls v24.8h, v2.8h, v7.h[0]
- add v22.8h, v8.8h, v13.8h
- str q16, [x0, #0x100]
- sub v2.8h, v8.8h, v13.8h
- str q24, [x0, #0x180]
- mul v13.8h, v14.8h, v0.h[0]
- str q22, [x0, #0xc0]
- sqrdmulh v21.8h, v2.8h, v0.h[1]
- ldr q6, [x0, #0x90]
- ldr q14, [x0, #0xd0]
- mls v13.8h, v10.8h, v7.h[0]
- str q11, [x0, #0x40]
- sub v10.8h, v6.8h, v14.8h
- ldr q11, [x0, #0x10]
- sqrdmulh v19.8h, v10.8h, v1.h[1]
- mul v20.8h, v10.8h, v1.h[0]
- ldr q28, [x0, #0x50]
- sub x4, x4, #0x2
-
-intt_layer123_start:
- mls v20.8h, v19.8h, v7.h[0]
- ldr q31, [x0, #0x1d0]
- sub v22.8h, v11.8h, v28.8h
- ldr q30, [x0, #0x110]
- sqrdmulh v8.8h, v22.8h, v0.h[7]
- mul v3.8h, v22.8h, v0.h[6]
- mul v5.8h, v2.8h, v0.h[0]
- str q13, [x0, #0x140]
- add v10.8h, v11.8h, v28.8h
- ldr q22, [x0, #0x150]
- ldr q4, [x0, #0x190]
- sub v23.8h, v30.8h, v22.8h
- add v27.8h, v30.8h, v22.8h
- mls v3.8h, v8.8h, v7.h[0]
- mls v5.8h, v21.8h, v7.h[0]
- ldr q11, [x0, #0x20]
- sub v17.8h, v4.8h, v31.8h
- add v2.8h, v6.8h, v14.8h
- mul v19.8h, v23.8h, v1.h[2]
- sub v22.8h, v3.8h, v20.8h
- add v14.8h, v10.8h, v2.8h
- sub v24.8h, v10.8h, v2.8h
- sqrdmulh v2.8h, v23.8h, v1.h[3]
- sqrdmulh v30.8h, v22.8h, v0.h[3]
- mul v23.8h, v22.8h, v0.h[2]
- sqrdmulh v15.8h, v17.8h, v1.h[5]
- mls v19.8h, v2.8h, v7.h[0]
- add v2.8h, v4.8h, v31.8h
- mul v21.8h, v17.8h, v1.h[4]
- sqrdmulh v22.8h, v24.8h, v0.h[3]
- sub v26.8h, v27.8h, v2.8h
- add v8.8h, v27.8h, v2.8h
- mul v28.8h, v24.8h, v0.h[2]
- sqrdmulh v10.8h, v26.8h, v0.h[5]
- mul v31.8h, v26.8h, v0.h[4]
- mls v21.8h, v15.8h, v7.h[0]
- mls v28.8h, v22.8h, v7.h[0]
- sub v17.8h, v14.8h, v8.8h
- mls v31.8h, v10.8h, v7.h[0]
- sub v27.8h, v19.8h, v21.8h
- sqrdmulh v29.8h, v17.8h, v0.h[1]
- mul v10.8h, v17.8h, v0.h[0]
- sub v15.8h, v28.8h, v31.8h
- sqrdmulh v17.8h, v27.8h, v0.h[5]
- mul v25.8h, v27.8h, v0.h[4]
- sqrdmulh v6.8h, v15.8h, v0.h[1]
- mul v27.8h, v15.8h, v0.h[0]
- add v16.8h, v19.8h, v21.8h
- mls v25.8h, v17.8h, v7.h[0]
- mls v23.8h, v30.8h, v7.h[0]
- mls v27.8h, v6.8h, v7.h[0]
- ldr q6, [x0, #0xa0]
- add v22.8h, v23.8h, v25.8h
- str q27, [x0, #0x190]
- add v4.8h, v3.8h, v20.8h
- str q22, [x0, #0xd0]
- mls v10.8h, v29.8h, v7.h[0]
- str q5, [x0, #0x1c0]
- add v20.8h, v4.8h, v16.8h
- str q18, [x0], #0x10
- sub v18.8h, v4.8h, v16.8h
- str q10, [x0, #0x100]
- sub v2.8h, v23.8h, v25.8h
- sqrdmulh v12.8h, v18.8h, v0.h[1]
- mul v13.8h, v18.8h, v0.h[0]
- add v18.8h, v14.8h, v8.8h
- ldr q14, [x0, #0xd0]
- mls v13.8h, v12.8h, v7.h[0]
- str q9, [x0, #0x70]
- sub v3.8h, v6.8h, v14.8h
- add v9.8h, v28.8h, v31.8h
- str q20, [x0, #0x40]
- sqrdmulh v19.8h, v3.8h, v1.h[1]
- mul v20.8h, v3.8h, v1.h[0]
- sqrdmulh v21.8h, v2.8h, v0.h[1]
- ldr q28, [x0, #0x50]
- subs x4, x4, #0x1
- cbnz x4, intt_layer123_start
- mls v20.8h, v19.8h, v7.h[0]
- sub v10.8h, v11.8h, v28.8h
- add v11.8h, v11.8h, v28.8h
- mul v2.8h, v2.8h, v0.h[0]
- str q13, [x0, #0x140]
- add v25.8h, v6.8h, v14.8h
- str q18, [x0], #0x10
- sqrdmulh v17.8h, v10.8h, v0.h[7]
- str q9, [x0, #0x70]
- ldr q8, [x0, #0x1c0]
- ldr q13, [x0, #0x100]
- ldr q26, [x0, #0x180]
- ldr q24, [x0, #0x140]
- add v15.8h, v26.8h, v8.8h
- sub v8.8h, v26.8h, v8.8h
- sub v12.8h, v13.8h, v24.8h
- add v24.8h, v13.8h, v24.8h
- sqrdmulh v18.8h, v8.8h, v1.h[5]
- mul v26.8h, v12.8h, v1.h[2]
- mul v8.8h, v8.8h, v1.h[4]
- sqrdmulh v16.8h, v12.8h, v1.h[3]
- mul v10.8h, v10.8h, v0.h[6]
- add v22.8h, v11.8h, v25.8h
- mls v8.8h, v18.8h, v7.h[0]
- mls v26.8h, v16.8h, v7.h[0]
- mls v10.8h, v17.8h, v7.h[0]
- add v23.8h, v24.8h, v15.8h
- sub v11.8h, v11.8h, v25.8h
- sub v3.8h, v26.8h, v8.8h
- sub v14.8h, v10.8h, v20.8h
- sub v19.8h, v22.8h, v23.8h
- mul v18.8h, v3.8h, v0.h[4]
- sqrdmulh v17.8h, v14.8h, v0.h[3]
- mul v14.8h, v14.8h, v0.h[2]
- sqrdmulh v3.8h, v3.8h, v0.h[5]
- sub v16.8h, v24.8h, v15.8h
- mls v2.8h, v21.8h, v7.h[0]
- mls v14.8h, v17.8h, v7.h[0]
- mls v18.8h, v3.8h, v7.h[0]
- sqrdmulh v31.8h, v16.8h, v0.h[5]
- str q2, [x0, #0x1b0]
- mul v13.8h, v16.8h, v0.h[4]
- add v24.8h, v14.8h, v18.8h
- sqrdmulh v2.8h, v11.8h, v0.h[3]
- mul v21.8h, v11.8h, v0.h[2]
- mls v13.8h, v31.8h, v7.h[0]
- add v16.8h, v26.8h, v8.8h
- add v28.8h, v10.8h, v20.8h
- mls v21.8h, v2.8h, v7.h[0]
- sub v14.8h, v14.8h, v18.8h
- add v2.8h, v28.8h, v16.8h
- sub v10.8h, v28.8h, v16.8h
- sub v16.8h, v21.8h, v13.8h
- sqrdmulh v27.8h, v19.8h, v0.h[1]
- mul v26.8h, v19.8h, v0.h[0]
- sqrdmulh v19.8h, v16.8h, v0.h[1]
- mul v28.8h, v16.8h, v0.h[0]
- sqrdmulh v8.8h, v14.8h, v0.h[1]
- mls v26.8h, v27.8h, v7.h[0]
- mul v14.8h, v14.8h, v0.h[0]
- mls v28.8h, v19.8h, v7.h[0]
- sqrdmulh v20.8h, v10.8h, v0.h[1]
- str q26, [x0, #0x100]
- mul v10.8h, v10.8h, v0.h[0]
- str q28, [x0, #0x180]
- add v22.8h, v22.8h, v23.8h
- str q24, [x0, #0xc0]
- mls v10.8h, v20.8h, v7.h[0]
- str q2, [x0, #0x40]
- mls v14.8h, v8.8h, v7.h[0]
- str q22, [x0], #0x10
- add v11.8h, v21.8h, v13.8h
- str q10, [x0, #0x130]
- str q11, [x0, #0x70]
- str q14, [x0, #0x1b0]
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
- ldp d12, d13, [sp, #0x20]
- ldp d14, d15, [sp, #0x30]
- add sp, sp, #0x40
+Lintt_layer123_start:
+ sub v12.8h, v3.8h, v30.8h
+ mul v11.8h, v21.8h, v1.h[0]
+ add v28.8h, v4.8h, v8.8h
+ ldr q20, [x0, #0x190]
+ add v27.8h, v16.8h, v6.8h
+ sqrdmulh v8.8h, v12.8h, v1.h[3]
+ add v16.8h, v29.8h, v24.8h
+ str q28, [x0, #0xc0]
+ mls v23.8h, v10.8h, v7.h[0]
+ str q27, [x0], #0x10
+ add v15.8h, v20.8h, v22.8h
+ str q18, [x0, #0xf0]
+ mul v14.8h, v13.8h, v0.h[0]
+ add v2.8h, v2.8h, v25.8h
+ sub v26.8h, v20.8h, v22.8h
+ mul v4.8h, v12.8h, v1.h[2]
+ sub v5.8h, v16.8h, v2.8h
+ str q23, [x0, #0x170]
+ add v20.8h, v3.8h, v30.8h
+ sqrdmulh v27.8h, v26.8h, v1.h[5]
+ add v16.8h, v16.8h, v2.8h
+ mul v18.8h, v26.8h, v1.h[4]
+ sub v31.8h, v20.8h, v15.8h
+ mls v4.8h, v8.8h, v7.h[0]
+ sub v28.8h, v29.8h, v24.8h
+ mls v18.8h, v27.8h, v7.h[0]
+ ldr q22, [x0, #0x1d0]
+ mul v26.8h, v28.8h, v0.h[6]
+ mul v2.8h, v5.8h, v0.h[2]
+ sub v12.8h, v4.8h, v18.8h
+ sqrdmulh v24.8h, v28.8h, v0.h[7]
+ mls v14.8h, v9.8h, v7.h[0]
+ sqrdmulh v10.8h, v12.8h, v0.h[5]
+ mls v26.8h, v24.8h, v7.h[0]
+ ldr q24, [x0, #0x50]
+ mul v8.8h, v12.8h, v0.h[4]
+ str q14, [x0, #0x1b0]
+ add v28.8h, v4.8h, v18.8h
+ sqrdmulh v5.8h, v5.8h, v0.h[3]
+ add v6.8h, v20.8h, v15.8h
+ sqrdmulh v3.8h, v19.8h, v0.h[1]
+ sub v13.8h, v16.8h, v6.8h
+ sqrdmulh v12.8h, v21.8h, v1.h[1]
+ sqrdmulh v21.8h, v13.8h, v0.h[1]
+ sqrdmulh v27.8h, v31.8h, v0.h[5]
+ ldr q25, [x0, #0xd0]
+ mls v11.8h, v12.8h, v7.h[0]
+ mul v23.8h, v31.8h, v0.h[4]
+ mul v18.8h, v13.8h, v0.h[0]
+ add v30.8h, v26.8h, v11.8h
+ sub v13.8h, v26.8h, v11.8h
+ mls v23.8h, v27.8h, v7.h[0]
+ add v12.8h, v30.8h, v28.8h
+ sub v19.8h, v30.8h, v28.8h
+ mls v2.8h, v5.8h, v7.h[0]
+ str q12, [x0, #0x40]
+ sqrdmulh v26.8h, v13.8h, v0.h[3]
+ mls v8.8h, v10.8h, v7.h[0]
+ ldr q30, [x0, #0x150]
+ sub v20.8h, v2.8h, v23.8h
+ mul v4.8h, v13.8h, v0.h[2]
+ add v13.8h, v2.8h, v23.8h
+ mls v4.8h, v26.8h, v7.h[0]
+ ldr q2, [x0, #0x90]
+ mul v23.8h, v20.8h, v0.h[0]
+ ldr q29, [x0, #0x10]
+ sqrdmulh v10.8h, v20.8h, v0.h[1]
+ str q13, [x0, #0x80]
+ sub v13.8h, v4.8h, v8.8h
+ mls v17.8h, v3.8h, v7.h[0]
+ ldr q3, [x0, #0x110]
+ mls v18.8h, v21.8h, v7.h[0]
+ sub v21.8h, v2.8h, v25.8h
+ sqrdmulh v9.8h, v13.8h, v0.h[1]
+ str q17, [x0, #0x130]
+ mul v17.8h, v19.8h, v0.h[0]
+ subs x4, x4, #0x1
+ cbnz x4, Lintt_layer123_start
+ mls v23.8h, v10.8h, v7.h[0]
+ ldr q11, [x0, #0x190]
+ str q18, [x0, #0x100]
+ add v27.8h, v3.8h, v30.8h
+ mul v13.8h, v13.8h, v0.h[0]
+ sub v5.8h, v29.8h, v24.8h
+ add v14.8h, v16.8h, v6.8h
+ mls v13.8h, v9.8h, v7.h[0]
+ add v10.8h, v11.8h, v22.8h
+ str q23, [x0, #0x180]
+ sub v20.8h, v11.8h, v22.8h
+ sub v23.8h, v27.8h, v10.8h
+ sqrdmulh v16.8h, v21.8h, v1.h[1]
+ sqrdmulh v31.8h, v23.8h, v0.h[5]
+ str q13, [x0, #0x1c0]
+ add v13.8h, v4.8h, v8.8h
+ mul v18.8h, v21.8h, v1.h[0]
+ str q13, [x0, #0xc0]
+ sqrdmulh v13.8h, v19.8h, v0.h[1]
+ sqrdmulh v28.8h, v20.8h, v1.h[5]
+ str q14, [x0], #0x10
+ mul v4.8h, v20.8h, v1.h[4]
+ mls v17.8h, v13.8h, v7.h[0]
+ sub v13.8h, v3.8h, v30.8h
+ sqrdmulh v8.8h, v13.8h, v1.h[3]
+ mul v12.8h, v13.8h, v1.h[2]
+ mls v4.8h, v28.8h, v7.h[0]
+ mls v12.8h, v8.8h, v7.h[0]
+ mls v18.8h, v16.8h, v7.h[0]
+ str q17, [x0, #0x130]
+ sqrdmulh v15.8h, v5.8h, v0.h[7]
+ add v11.8h, v27.8h, v10.8h
+ mul v16.8h, v5.8h, v0.h[6]
+ sub v8.8h, v12.8h, v4.8h
+ sqrdmulh v28.8h, v8.8h, v0.h[5]
+ add v13.8h, v2.8h, v25.8h
+ mls v16.8h, v15.8h, v7.h[0]
+ add v26.8h, v12.8h, v4.8h
+ mul v8.8h, v8.8h, v0.h[4]
+ add v4.8h, v29.8h, v24.8h
+ mls v8.8h, v28.8h, v7.h[0]
+ sub v20.8h, v4.8h, v13.8h
+ add v14.8h, v4.8h, v13.8h
+ add v12.8h, v16.8h, v18.8h
+ sqrdmulh v22.8h, v20.8h, v0.h[3]
+ add v27.8h, v14.8h, v11.8h
+ sub v13.8h, v16.8h, v18.8h
+ mul v4.8h, v20.8h, v0.h[2]
+ str q27, [x0], #0x10
+ sub v24.8h, v12.8h, v26.8h
+ sqrdmulh v3.8h, v13.8h, v0.h[3]
+ mul v13.8h, v13.8h, v0.h[2]
+ sqrdmulh v27.8h, v24.8h, v0.h[1]
+ mls v13.8h, v3.8h, v7.h[0]
+ mul v9.8h, v24.8h, v0.h[0]
+ mls v9.8h, v27.8h, v7.h[0]
+ add v30.8h, v13.8h, v8.8h
+ sub v13.8h, v13.8h, v8.8h
+ mls v4.8h, v22.8h, v7.h[0]
+ str q30, [x0, #0xb0]
+ sqrdmulh v16.8h, v13.8h, v0.h[1]
+ str q9, [x0, #0x130]
+ mul v9.8h, v13.8h, v0.h[0]
+ add v13.8h, v12.8h, v26.8h
+ str q13, [x0, #0x30]
+ mul v13.8h, v23.8h, v0.h[4]
+ sub v23.8h, v14.8h, v11.8h
+ mls v13.8h, v31.8h, v7.h[0]
+ mls v9.8h, v16.8h, v7.h[0]
+ mul v30.8h, v23.8h, v0.h[0]
+ sub v24.8h, v4.8h, v13.8h
+ add v13.8h, v4.8h, v13.8h
+ sqrdmulh v23.8h, v23.8h, v0.h[1]
+ str q9, [x0, #0x1b0]
+ str q13, [x0, #0x70]
+ sqrdmulh v13.8h, v24.8h, v0.h[1]
+ mul v21.8h, v24.8h, v0.h[0]
+ mls v30.8h, v23.8h, v7.h[0]
+ mls v21.8h, v13.8h, v7.h[0]
+ str q30, [x0, #0xf0]
+ str q21, [x0, #0x170]
+ ldp d8, d9, [sp]
+ .cfi_restore d8
+ .cfi_restore d9
+ ldp d10, d11, [sp, #0x10]
+ .cfi_restore d10
+ .cfi_restore d11
+ ldp d12, d13, [sp, #0x20]
+ .cfi_restore d12
+ .cfi_restore d13
+ ldp d14, d15, [sp, #0x30]
+ .cfi_restore d14
+ .cfi_restore d15
+ add sp, sp, #0x40
+ .cfi_adjust_cfa_offset -0x40
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(intt_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/ntt.S
index bf5922c144..2ce53dc579 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/ntt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/ntt.S
@@ -19,7 +19,33 @@
* https://eprint.iacr.org/2022/1303
*/
-/* AArch64 ML-KEM forward NTT following @[NeonNTT] and @[SLOTHY_Paper]. */
+/*yaml
+ Name: ntt_asm
+ Description: AArch64 ML-KEM forward NTT following @[NeonNTT] and @[SLOTHY_Paper]
+ Signature: void mlk_ntt_asm(int16_t p[256], const int16_t twiddles12345[80], const int16_t twiddles56[384])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: read/write
+ c_parameter: int16_t p[256]
+ description: Input/output polynomial
+ x1:
+ type: buffer
+ size_bytes: 160
+ permissions: read-only
+ c_parameter: const int16_t twiddles12345[80]
+ description: Twiddle factors for layers 1-5
+ x2:
+ type: buffer
+ size_bytes: 768
+ permissions: read-only
+ c_parameter: const int16_t twiddles56[384]
+ description: Twiddle factors for layers 6-7
+ Stack:
+ bytes: 64
+ description: saving callee-saved Neon registers
+*/
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
@@ -29,336 +55,508 @@
* dev/aarch64_opt/src/ntt.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(ntt_asm)
MLK_ASM_FN_SYMBOL(ntt_asm)
- sub sp, sp, #0x40
- stp d8, d9, [sp]
- stp d10, d11, [sp, #0x10]
- stp d12, d13, [sp, #0x20]
- stp d14, d15, [sp, #0x30]
- mov w5, #0xd01 // =3329
- mov v7.h[0], w5
- mov w5, #0x4ebf // =20159
- mov v7.h[1], w5
- mov x3, x0
- mov x4, #0x4 // =4
- ldr q0, [x1], #0x20
- ldur q1, [x1, #-0x10]
- ldr q5, [x0]
- ldr q13, [x0, #0x40]
- ldr q3, [x0, #0x80]
- ldr q22, [x0, #0xc0]
- ldr q24, [x0, #0x100]
- ldr q11, [x0, #0x1c0]
- mul v23.8h, v24.8h, v0.h[0]
- ldr q2, [x0, #0x140]
- mul v17.8h, v11.8h, v0.h[0]
- ldr q19, [x0, #0x180]
- sub x4, x4, #0x1
+ .cfi_startproc
+ sub sp, sp, #0x40
+ .cfi_adjust_cfa_offset 0x40
+ stp d8, d9, [sp]
+ .cfi_rel_offset d8, 0x0
+ .cfi_rel_offset d9, 0x8
+ stp d10, d11, [sp, #0x10]
+ .cfi_rel_offset d10, 0x10
+ .cfi_rel_offset d11, 0x18
+ stp d12, d13, [sp, #0x20]
+ .cfi_rel_offset d12, 0x20
+ .cfi_rel_offset d13, 0x28
+ stp d14, d15, [sp, #0x30]
+ .cfi_rel_offset d14, 0x30
+ .cfi_rel_offset d15, 0x38
+ mov w5, #0xd01 // =3329
+ mov v7.h[0], w5
+ mov w5, #0x4ebf // =20159
+ mov v7.h[1], w5
+ mov x3, x0
+ mov x4, #0x4 // =4
+ ldr q0, [x1], #0x20
+ ldur q1, [x1, #-0x10]
+ ldr q21, [x0, #0x40]
+ ldr q5, [x0, #0x1c0]
+ ldr q30, [x0, #0x110]
+ ldr q24, [x0, #0x140]
+ ldr q12, [x0, #0x80]
+ sqrdmulh v9.8h, v5.8h, v0.h[1]
+ mul v23.8h, v5.8h, v0.h[0]
+ sqrdmulh v17.8h, v24.8h, v0.h[1]
+ ldr q13, [x0, #0xc0]
+ mls v23.8h, v9.8h, v7.h[0]
+ mul v8.8h, v24.8h, v0.h[0]
+ mls v8.8h, v17.8h, v7.h[0]
+ add v9.8h, v13.8h, v23.8h
+ sub v10.8h, v13.8h, v23.8h
+ mul v11.8h, v30.8h, v0.h[0]
+ ldr q13, [x0, #0x180]
+ sqrdmulh v28.8h, v9.8h, v0.h[3]
+ sub v29.8h, v21.8h, v8.8h
+ mul v26.8h, v9.8h, v0.h[2]
+ add v8.8h, v21.8h, v8.8h
+ mul v2.8h, v13.8h, v0.h[0]
+ mls v26.8h, v28.8h, v7.h[0]
+ mul v28.8h, v10.8h, v0.h[4]
+ sqrdmulh v23.8h, v10.8h, v0.h[5]
+ add v22.8h, v8.8h, v26.8h
+ sqrdmulh v10.8h, v13.8h, v0.h[1]
+ sqrdmulh v21.8h, v22.8h, v0.h[7]
+ ldr q13, [x0, #0x100]
+ mul v16.8h, v22.8h, v0.h[6]
+ mls v28.8h, v23.8h, v7.h[0]
+ mls v2.8h, v10.8h, v7.h[0]
+ sqrdmulh v23.8h, v13.8h, v0.h[1]
+ sub v10.8h, v29.8h, v28.8h
+ add v17.8h, v29.8h, v28.8h
+ mls v16.8h, v21.8h, v7.h[0]
+ sub v18.8h, v12.8h, v2.8h
+ ldr q29, [x0]
+ sqrdmulh v14.8h, v17.8h, v1.h[3]
+ add v22.8h, v12.8h, v2.8h
+ sqrdmulh v9.8h, v18.8h, v0.h[5]
+ mul v21.8h, v13.8h, v0.h[0]
+ ldr q13, [x0, #0x150]
+ mul v5.8h, v18.8h, v0.h[4]
+ mls v5.8h, v9.8h, v7.h[0]
+ mul v18.8h, v13.8h, v0.h[0]
+ mls v21.8h, v23.8h, v7.h[0]
+ sqrdmulh v2.8h, v13.8h, v0.h[1]
+ mul v13.8h, v17.8h, v1.h[2]
+ sub v4.8h, v29.8h, v21.8h
+ mls v13.8h, v14.8h, v7.h[0]
+ add v25.8h, v29.8h, v21.8h
+ add v6.8h, v4.8h, v5.8h
+ sqrdmulh v15.8h, v22.8h, v0.h[3]
+ sub v21.8h, v4.8h, v5.8h
+ sub v5.8h, v8.8h, v26.8h
+ mul v23.8h, v22.8h, v0.h[2]
+ add v28.8h, v6.8h, v13.8h
+ sub v13.8h, v6.8h, v13.8h
+ mul v4.8h, v5.8h, v1.h[0]
+ sub x4, x4, #0x2
-ntt_layer123_start:
- sqrdmulh v8.8h, v24.8h, v0.h[1]
- sqrdmulh v24.8h, v2.8h, v0.h[1]
- mul v2.8h, v2.8h, v0.h[0]
- sqrdmulh v14.8h, v19.8h, v0.h[1]
- mls v23.8h, v8.8h, v7.h[0]
- mul v8.8h, v19.8h, v0.h[0]
- mls v2.8h, v24.8h, v7.h[0]
- sqrdmulh v24.8h, v11.8h, v0.h[1]
- sub v11.8h, v5.8h, v23.8h
- mls v8.8h, v14.8h, v7.h[0]
- sub v14.8h, v13.8h, v2.8h
- add v2.8h, v13.8h, v2.8h
- add v23.8h, v5.8h, v23.8h
- sub v19.8h, v3.8h, v8.8h
- add v8.8h, v3.8h, v8.8h
- mls v17.8h, v24.8h, v7.h[0]
- sqrdmulh v24.8h, v19.8h, v0.h[5]
- mul v19.8h, v19.8h, v0.h[4]
- sqrdmulh v5.8h, v8.8h, v0.h[3]
- sub v13.8h, v22.8h, v17.8h
- add v17.8h, v22.8h, v17.8h
- mls v19.8h, v24.8h, v7.h[0]
- sqrdmulh v24.8h, v13.8h, v0.h[5]
- mul v13.8h, v13.8h, v0.h[4]
- mul v8.8h, v8.8h, v0.h[2]
- sub v3.8h, v11.8h, v19.8h
- add v11.8h, v11.8h, v19.8h
- mls v13.8h, v24.8h, v7.h[0]
- sqrdmulh v24.8h, v17.8h, v0.h[3]
- mul v19.8h, v17.8h, v0.h[2]
- mls v8.8h, v5.8h, v7.h[0]
- sub v17.8h, v14.8h, v13.8h
- add v14.8h, v14.8h, v13.8h
- mls v19.8h, v24.8h, v7.h[0]
- sub v24.8h, v23.8h, v8.8h
- add v8.8h, v23.8h, v8.8h
- sqrdmulh v23.8h, v14.8h, v1.h[3]
- sub v5.8h, v2.8h, v19.8h
- add v2.8h, v2.8h, v19.8h
- mul v14.8h, v14.8h, v1.h[2]
- sqrdmulh v19.8h, v5.8h, v1.h[1]
- sqrdmulh v13.8h, v2.8h, v0.h[7]
- mul v2.8h, v2.8h, v0.h[6]
- mul v5.8h, v5.8h, v1.h[0]
- mls v14.8h, v23.8h, v7.h[0]
- sqrdmulh v23.8h, v17.8h, v1.h[5]
- mls v2.8h, v13.8h, v7.h[0]
- mls v5.8h, v19.8h, v7.h[0]
- sub v19.8h, v11.8h, v14.8h
- add v14.8h, v11.8h, v14.8h
- sub v11.8h, v8.8h, v2.8h
- mul v17.8h, v17.8h, v1.h[4]
- add v8.8h, v8.8h, v2.8h
- sub v2.8h, v24.8h, v5.8h
- add v24.8h, v24.8h, v5.8h
- mls v17.8h, v23.8h, v7.h[0]
- str q8, [x0], #0x10
- ldr q5, [x0]
- sub v8.8h, v3.8h, v17.8h
- add v23.8h, v3.8h, v17.8h
- str q11, [x0, #0x30]
- ldr q13, [x0, #0x40]
- str q24, [x0, #0x70]
- ldr q3, [x0, #0x80]
- str q2, [x0, #0xb0]
- ldr q22, [x0, #0xc0]
- str q14, [x0, #0xf0]
- ldr q24, [x0, #0x100]
- str q19, [x0, #0x130]
- ldr q2, [x0, #0x140]
- str q23, [x0, #0x170]
- mul v23.8h, v24.8h, v0.h[0]
- str q8, [x0, #0x1b0]
- ldr q11, [x0, #0x1c0]
- ldr q19, [x0, #0x180]
- mul v17.8h, v11.8h, v0.h[0]
- subs x4, x4, #0x1
- cbnz x4, ntt_layer123_start
- sqrdmulh v6.8h, v11.8h, v0.h[1]
- mul v25.8h, v19.8h, v0.h[0]
- sqrdmulh v12.8h, v19.8h, v0.h[1]
- mul v11.8h, v2.8h, v0.h[0]
- mls v17.8h, v6.8h, v7.h[0]
- sqrdmulh v14.8h, v2.8h, v0.h[1]
- mls v25.8h, v12.8h, v7.h[0]
- sqrdmulh v27.8h, v24.8h, v0.h[1]
- add v9.8h, v22.8h, v17.8h
- mls v11.8h, v14.8h, v7.h[0]
- sub v26.8h, v3.8h, v25.8h
- sqrdmulh v2.8h, v9.8h, v0.h[3]
- mul v24.8h, v9.8h, v0.h[2]
- mul v19.8h, v26.8h, v0.h[4]
- sqrdmulh v14.8h, v26.8h, v0.h[5]
- mls v23.8h, v27.8h, v7.h[0]
- mls v24.8h, v2.8h, v7.h[0]
- add v6.8h, v13.8h, v11.8h
- mls v19.8h, v14.8h, v7.h[0]
- sub v4.8h, v5.8h, v23.8h
- add v10.8h, v3.8h, v25.8h
- sub v8.8h, v6.8h, v24.8h
- add v3.8h, v4.8h, v19.8h
- sub v31.8h, v4.8h, v19.8h
- mul v14.8h, v8.8h, v1.h[0]
- sqrdmulh v4.8h, v10.8h, v0.h[3]
- mul v12.8h, v10.8h, v0.h[2]
- sqrdmulh v2.8h, v8.8h, v1.h[1]
- sub v8.8h, v22.8h, v17.8h
- add v30.8h, v5.8h, v23.8h
- mls v12.8h, v4.8h, v7.h[0]
- sqrdmulh v4.8h, v8.8h, v0.h[5]
- mul v19.8h, v8.8h, v0.h[4]
- mls v14.8h, v2.8h, v7.h[0]
- sub v27.8h, v30.8h, v12.8h
- sub v23.8h, v13.8h, v11.8h
- mls v19.8h, v4.8h, v7.h[0]
- sub v2.8h, v27.8h, v14.8h
- add v8.8h, v27.8h, v14.8h
- add v14.8h, v6.8h, v24.8h
- str q2, [x0, #0xc0]
- add v2.8h, v23.8h, v19.8h
- str q8, [x0, #0x80]
- sub v19.8h, v23.8h, v19.8h
- sqrdmulh v13.8h, v2.8h, v1.h[3]
- mul v17.8h, v2.8h, v1.h[2]
- add v27.8h, v30.8h, v12.8h
- sqrdmulh v24.8h, v19.8h, v1.h[5]
- mul v19.8h, v19.8h, v1.h[4]
- mls v17.8h, v13.8h, v7.h[0]
- sqrdmulh v8.8h, v14.8h, v0.h[7]
- mul v2.8h, v14.8h, v0.h[6]
- mls v19.8h, v24.8h, v7.h[0]
- add v26.8h, v3.8h, v17.8h
- sub v14.8h, v3.8h, v17.8h
- mls v2.8h, v8.8h, v7.h[0]
- str q26, [x0, #0x100]
- add v8.8h, v31.8h, v19.8h
- str q14, [x0, #0x140]
- sub v24.8h, v31.8h, v19.8h
- str q8, [x0, #0x180]
- add v18.8h, v27.8h, v2.8h
- str q24, [x0, #0x1c0]
- sub v14.8h, v27.8h, v2.8h
- str q18, [x0], #0x10
- str q14, [x0, #0x30]
- mov x0, x3
- mov x4, #0x8 // =8
- ldr q11, [x1], #0x10
- ldr q24, [x0, #0x30]
- ldr q8, [x0, #0x20]
- sqrdmulh v14.8h, v24.8h, v11.h[1]
- mul v2.8h, v24.8h, v11.h[0]
- sqrdmulh v9.8h, v8.8h, v11.h[1]
- ldr q24, [x0, #0x10]
- mls v2.8h, v14.8h, v7.h[0]
- mul v14.8h, v8.8h, v11.h[0]
- ldr q6, [x2, #0x40]
- sub v8.8h, v24.8h, v2.8h
- mls v14.8h, v9.8h, v7.h[0]
- add v2.8h, v24.8h, v2.8h
- mul v27.8h, v8.8h, v11.h[4]
- sqrdmulh v8.8h, v8.8h, v11.h[5]
- mul v24.8h, v2.8h, v11.h[2]
- sqrdmulh v11.8h, v2.8h, v11.h[3]
- mls v27.8h, v8.8h, v7.h[0]
- ldr q5, [x2, #0x50]
- sub x4, x4, #0x1
+Lntt_layer123_start:
+ mls v23.8h, v15.8h, v7.h[0]
+ ldr q6, [x0, #0x190]
+ ldr q15, [x0, #0x90]
+ ldr q19, [x0, #0x10]
+ mul v22.8h, v10.8h, v1.h[4]
+ ldr q24, [x0, #0x50]
+ str q13, [x0, #0x140]
+ sqrdmulh v13.8h, v6.8h, v0.h[1]
+ sub v20.8h, v25.8h, v23.8h
+ sqrdmulh v3.8h, v30.8h, v0.h[1]
+ str q28, [x0, #0x100]
+ ldr q30, [x0, #0x120]
+ mul v8.8h, v6.8h, v0.h[0]
+ sqrdmulh v27.8h, v10.8h, v1.h[5]
+ mls v11.8h, v3.8h, v7.h[0]
+ mls v18.8h, v2.8h, v7.h[0]
+ ldr q31, [x0, #0x160]
+ sqrdmulh v10.8h, v5.8h, v1.h[1]
+ mls v8.8h, v13.8h, v7.h[0]
+ ldr q13, [x0, #0x1d0]
+ sub v14.8h, v24.8h, v18.8h
+ add v9.8h, v24.8h, v18.8h
+ sqrdmulh v2.8h, v31.8h, v0.h[1]
+ mls v4.8h, v10.8h, v7.h[0]
+ add v10.8h, v25.8h, v23.8h
+ sub v24.8h, v19.8h, v11.8h
+ add v25.8h, v19.8h, v11.8h
+ sqrdmulh v28.8h, v13.8h, v0.h[1]
+ mul v11.8h, v30.8h, v0.h[0]
+ mul v17.8h, v13.8h, v0.h[0]
+ sub v13.8h, v10.8h, v16.8h
+ sub v6.8h, v15.8h, v8.8h
+ mls v17.8h, v28.8h, v7.h[0]
+ str q13, [x0, #0x40]
+ mls v22.8h, v27.8h, v7.h[0]
+ ldr q13, [x0, #0xd0]
+ add v26.8h, v20.8h, v4.8h
+ mul v18.8h, v31.8h, v0.h[0]
+ add v27.8h, v10.8h, v16.8h
+ str q26, [x0, #0x80]
+ sqrdmulh v31.8h, v6.8h, v0.h[5]
+ add v3.8h, v21.8h, v22.8h
+ str q27, [x0], #0x10
+ mul v26.8h, v6.8h, v0.h[4]
+ add v6.8h, v13.8h, v17.8h
+ sub v5.8h, v13.8h, v17.8h
+ str q3, [x0, #0x170]
+ sub v17.8h, v21.8h, v22.8h
+ sqrdmulh v10.8h, v6.8h, v0.h[3]
+ sub v13.8h, v20.8h, v4.8h
+ add v20.8h, v15.8h, v8.8h
+ sqrdmulh v12.8h, v5.8h, v0.h[5]
+ str q13, [x0, #0xb0]
+ mul v8.8h, v6.8h, v0.h[2]
+ str q17, [x0, #0x1b0]
+ mls v8.8h, v10.8h, v7.h[0]
+ mul v29.8h, v5.8h, v0.h[4]
+ mls v29.8h, v12.8h, v7.h[0]
+ sub v5.8h, v9.8h, v8.8h
+ add v3.8h, v9.8h, v8.8h
+ sqrdmulh v15.8h, v20.8h, v0.h[3]
+ mul v4.8h, v5.8h, v1.h[0]
+ add v6.8h, v14.8h, v29.8h
+ sqrdmulh v9.8h, v3.8h, v0.h[7]
+ sqrdmulh v12.8h, v6.8h, v1.h[3]
+ sub v10.8h, v14.8h, v29.8h
+ mul v23.8h, v6.8h, v1.h[2]
+ mls v26.8h, v31.8h, v7.h[0]
+ mls v23.8h, v12.8h, v7.h[0]
+ mul v16.8h, v3.8h, v0.h[6]
+ add v13.8h, v24.8h, v26.8h
+ sub v21.8h, v24.8h, v26.8h
+ mls v16.8h, v9.8h, v7.h[0]
+ add v28.8h, v13.8h, v23.8h
+ sub v13.8h, v13.8h, v23.8h
+ mul v23.8h, v20.8h, v0.h[2]
+ subs x4, x4, #0x1
+ cbnz x4, Lntt_layer123_start
+ sqrdmulh v3.8h, v5.8h, v1.h[1]
+ mls v23.8h, v15.8h, v7.h[0]
+ ldr q5, [x0, #0x190]
+ mul v29.8h, v10.8h, v1.h[4]
+ mls v4.8h, v3.8h, v7.h[0]
+ sub v19.8h, v25.8h, v23.8h
+ sqrdmulh v31.8h, v5.8h, v0.h[1]
+ sqrdmulh v6.8h, v30.8h, v0.h[1]
+ sub v3.8h, v19.8h, v4.8h
+ mul v5.8h, v5.8h, v0.h[0]
+ str q3, [x0, #0xc0]
+ sqrdmulh v12.8h, v10.8h, v1.h[5]
+ mls v18.8h, v2.8h, v7.h[0]
+ ldr q3, [x0, #0x1d0]
+ mls v5.8h, v31.8h, v7.h[0]
+ sqrdmulh v10.8h, v3.8h, v0.h[1]
+ mls v11.8h, v6.8h, v7.h[0]
+ ldr q31, [x0, #0x90]
+ mul v30.8h, v3.8h, v0.h[0]
+ mls v30.8h, v10.8h, v7.h[0]
+ sub v10.8h, v31.8h, v5.8h
+ mls v29.8h, v12.8h, v7.h[0]
+ ldr q6, [x0, #0xd0]
+ sqrdmulh v15.8h, v10.8h, v0.h[5]
+ mul v17.8h, v10.8h, v0.h[4]
+ add v10.8h, v6.8h, v30.8h
+ sub v6.8h, v6.8h, v30.8h
+ sqrdmulh v12.8h, v10.8h, v0.h[3]
+ sub v27.8h, v21.8h, v29.8h
+ sqrdmulh v3.8h, v6.8h, v0.h[5]
+ mul v10.8h, v10.8h, v0.h[2]
+ ldr q20, [x0, #0x50]
+ mls v10.8h, v12.8h, v7.h[0]
+ mul v2.8h, v6.8h, v0.h[4]
+ add v6.8h, v20.8h, v18.8h
+ add v5.8h, v31.8h, v5.8h
+ mls v2.8h, v3.8h, v7.h[0]
+ sub v31.8h, v6.8h, v10.8h
+ sqrdmulh v12.8h, v5.8h, v0.h[3]
+ sub v22.8h, v20.8h, v18.8h
+ add v6.8h, v6.8h, v10.8h
+ mul v20.8h, v31.8h, v1.h[0]
+ add v30.8h, v22.8h, v2.8h
+ sqrdmulh v3.8h, v6.8h, v0.h[7]
+ sqrdmulh v10.8h, v30.8h, v1.h[3]
+ mul v9.8h, v30.8h, v1.h[2]
+ ldr q30, [x0, #0x10]
+ mls v17.8h, v15.8h, v7.h[0]
+ mls v9.8h, v10.8h, v7.h[0]
+ mul v15.8h, v6.8h, v0.h[6]
+ add v24.8h, v30.8h, v11.8h
+ sub v10.8h, v22.8h, v2.8h
+ mls v15.8h, v3.8h, v7.h[0]
+ add v6.8h, v19.8h, v4.8h
+ add v22.8h, v25.8h, v23.8h
+ sqrdmulh v3.8h, v10.8h, v1.h[5]
+ str q13, [x0, #0x140]
+ sub v19.8h, v30.8h, v11.8h
+ add v25.8h, v22.8h, v16.8h
+ mul v5.8h, v5.8h, v0.h[2]
+ sub v13.8h, v22.8h, v16.8h
+ str q28, [x0, #0x100]
+ mls v5.8h, v12.8h, v7.h[0]
+ str q13, [x0, #0x40]
+ str q6, [x0, #0x80]
+ add v21.8h, v21.8h, v29.8h
+ sqrdmulh v13.8h, v31.8h, v1.h[1]
+ str q25, [x0], #0x10
+ add v12.8h, v19.8h, v17.8h
+ sub v31.8h, v19.8h, v17.8h
+ mul v30.8h, v10.8h, v1.h[4]
+ str q21, [x0, #0x170]
+ add v21.8h, v24.8h, v5.8h
+ add v6.8h, v12.8h, v9.8h
+ mls v30.8h, v3.8h, v7.h[0]
+ str q27, [x0, #0x1b0]
+ sub v10.8h, v21.8h, v15.8h
+ sub v12.8h, v12.8h, v9.8h
+ mls v20.8h, v13.8h, v7.h[0]
+ str q6, [x0, #0x100]
+ str q10, [x0, #0x40]
+ sub v13.8h, v24.8h, v5.8h
+ add v3.8h, v21.8h, v15.8h
+ str q12, [x0, #0x140]
+ sub v10.8h, v31.8h, v30.8h
+ add v21.8h, v31.8h, v30.8h
+ str q3, [x0], #0x10
+ add v12.8h, v13.8h, v20.8h
+ sub v13.8h, v13.8h, v20.8h
+ str q21, [x0, #0x170]
+ str q10, [x0, #0x1b0]
+ str q12, [x0, #0x70]
+ str q13, [x0, #0xb0]
+ mov x0, x3
+ mov x4, #0x8 // =8
+ ldr q2, [x0, #0x20]
+ ldr q13, [x1], #0x10
+ ldr q30, [x0, #0x30]
+ ldr q25, [x2, #0x40]
+ ldr q5, [x0]
+ ldr q18, [x0, #0x60]
+ ldr q12, [x0, #0x70]
+ sqrdmulh v17.8h, v2.8h, v13.h[1]
+ ldr q4, [x1], #0x10
+ ldr q23, [x0, #0x10]
+ sqrdmulh v21.8h, v30.8h, v13.h[1]
+ ldr q24, [x2, #0x20]
+ ldr q9, [x2], #0x60
+ mul v10.8h, v30.8h, v13.h[0]
+ mul v11.8h, v2.8h, v13.h[0]
+ mls v10.8h, v21.8h, v7.h[0]
+ sqrdmulh v29.8h, v12.8h, v4.h[1]
+ mul v1.8h, v12.8h, v4.h[0]
+ add v21.8h, v23.8h, v10.8h
+ sub v10.8h, v23.8h, v10.8h
+ mul v8.8h, v18.8h, v4.h[0]
+ sqrdmulh v23.8h, v21.8h, v13.h[3]
+ mul v2.8h, v21.8h, v13.h[2]
+ mls v1.8h, v29.8h, v7.h[0]
+ mls v2.8h, v23.8h, v7.h[0]
+ ldur q15, [x2, #-0x50]
+ sqrdmulh v0.8h, v10.8h, v13.h[5]
+ mls v11.8h, v17.8h, v7.h[0]
+ ldr q29, [x0, #0x50]
+ mul v23.8h, v10.8h, v13.h[4]
+ mls v23.8h, v0.8h, v7.h[0]
+ sub v16.8h, v29.8h, v1.8h
+ add v3.8h, v5.8h, v11.8h
+ sub v31.8h, v5.8h, v11.8h
+ sqrdmulh v22.8h, v16.8h, v4.h[5]
+ add v30.8h, v3.8h, v2.8h
+ sub v0.8h, v3.8h, v2.8h
+ sqrdmulh v28.8h, v18.8h, v4.h[1]
+ add v21.8h, v31.8h, v23.8h
+ sub v19.8h, v31.8h, v23.8h
+ mul v26.8h, v16.8h, v4.h[4]
+ trn2 v3.4s, v30.4s, v0.4s
+ ldur q23, [x2, #-0x10]
+ trn2 v18.4s, v21.4s, v19.4s
+ mls v26.8h, v22.8h, v7.h[0]
+ trn1 v13.4s, v30.4s, v0.4s
+ mls v8.8h, v28.8h, v7.h[0]
+ trn2 v31.2d, v3.2d, v18.2d
+ trn1 v11.4s, v21.4s, v19.4s
+ add v27.8h, v29.8h, v1.8h
+ sqrdmulh v6.8h, v31.8h, v15.8h
+ trn1 v2.2d, v13.2d, v11.2d
+ trn2 v13.2d, v13.2d, v11.2d
+ mul v1.8h, v31.8h, v9.8h
+ ldr q11, [x0, #0x40]
+ sqrdmulh v29.8h, v13.8h, v15.8h
+ mls v1.8h, v6.8h, v7.h[0]
+ trn1 v6.2d, v3.2d, v18.2d
+ mul v17.8h, v13.8h, v9.8h
+ sub v13.8h, v11.8h, v8.8h
+ sqrdmulh v10.8h, v27.8h, v4.h[3]
+ sub v12.8h, v13.8h, v26.8h
+ sub v18.8h, v6.8h, v1.8h
+ mls v17.8h, v29.8h, v7.h[0]
+ add v30.8h, v6.8h, v1.8h
+ add v6.8h, v13.8h, v26.8h
+ ldur q13, [x2, #-0x30]
+ sqrdmulh v16.8h, v18.8h, v23.8h
+ trn1 v28.4s, v6.4s, v12.4s
+ mul v23.8h, v18.8h, v25.8h
+ ldr q25, [x2, #0x10]
+ add v20.8h, v2.8h, v17.8h
+ mul v0.8h, v30.8h, v24.8h
+ sqrdmulh v29.8h, v30.8h, v13.8h
+ sub v30.8h, v2.8h, v17.8h
+ mls v23.8h, v16.8h, v7.h[0]
+ sub x4, x4, #0x2
-ntt_layer4567_start:
- ldr q8, [x0]
- ldr q17, [x2, #0x10]
- sub v1.8h, v8.8h, v14.8h
- mls v24.8h, v11.8h, v7.h[0]
- add v8.8h, v8.8h, v14.8h
- sub v0.8h, v1.8h, v27.8h
- add v12.8h, v1.8h, v27.8h
- sub v19.8h, v8.8h, v24.8h
- add v8.8h, v8.8h, v24.8h
- trn1 v24.4s, v12.4s, v0.4s
- trn2 v13.4s, v12.4s, v0.4s
- trn1 v23.4s, v8.4s, v19.4s
- ldr q2, [x2], #0x60
- trn2 v9.2d, v23.2d, v24.2d
- trn2 v8.4s, v8.4s, v19.4s
- sqrdmulh v26.8h, v9.8h, v17.8h
- trn1 v24.2d, v23.2d, v24.2d
- trn2 v11.2d, v8.2d, v13.2d
- trn1 v29.2d, v8.2d, v13.2d
- sqrdmulh v23.8h, v11.8h, v17.8h
- mul v10.8h, v11.8h, v2.8h
- mul v0.8h, v9.8h, v2.8h
- ldur q11, [x2, #-0x40]
- mls v10.8h, v23.8h, v7.h[0]
- mls v0.8h, v26.8h, v7.h[0]
- ldur q19, [x2, #-0x30]
- add v17.8h, v29.8h, v10.8h
- sub v23.8h, v24.8h, v0.8h
- sub v30.8h, v29.8h, v10.8h
- mul v2.8h, v17.8h, v11.8h
- sqrdmulh v11.8h, v17.8h, v19.8h
- mul v8.8h, v30.8h, v6.8h
- ldr q22, [x0, #0x70]
- mls v2.8h, v11.8h, v7.h[0]
- add v24.8h, v24.8h, v0.8h
- ldr q15, [x1], #0x10
- sub v14.8h, v24.8h, v2.8h
- add v24.8h, v24.8h, v2.8h
- sqrdmulh v1.8h, v22.8h, v15.h[1]
- mul v2.8h, v22.8h, v15.h[0]
- trn1 v0.4s, v24.4s, v14.4s
- trn2 v24.4s, v24.4s, v14.4s
- sqrdmulh v19.8h, v30.8h, v5.8h
- mls v2.8h, v1.8h, v7.h[0]
- ldr q16, [x0, #0x60]
- mls v8.8h, v19.8h, v7.h[0]
- ldr q6, [x2, #0x40]
- mul v14.8h, v16.8h, v15.h[0]
- sub v3.8h, v23.8h, v8.8h
- add v8.8h, v23.8h, v8.8h
- ldr q5, [x2, #0x50]
- trn2 v23.4s, v8.4s, v3.4s
- trn1 v31.4s, v8.4s, v3.4s
- sqrdmulh v8.8h, v16.8h, v15.h[1]
- trn2 v25.2d, v24.2d, v23.2d
- trn1 v29.2d, v24.2d, v23.2d
- ldr q24, [x0, #0x50]
- trn1 v16.2d, v0.2d, v31.2d
- mls v14.8h, v8.8h, v7.h[0]
- sub v13.8h, v24.8h, v2.8h
- add v24.8h, v24.8h, v2.8h
- trn2 v2.2d, v0.2d, v31.2d
- sqrdmulh v19.8h, v13.8h, v15.h[5]
- str q2, [x0, #0x20]
- sqrdmulh v11.8h, v24.8h, v15.h[3]
- str q16, [x0], #0x40
- mul v27.8h, v13.8h, v15.h[4]
- stur q29, [x0, #-0x30]
- mul v24.8h, v24.8h, v15.h[2]
- stur q25, [x0, #-0x10]
- mls v27.8h, v19.8h, v7.h[0]
- subs x4, x4, #0x1
- cbnz x4, ntt_layer4567_start
- ldr q23, [x0]
- ldr q17, [x2], #0x60
- sub v19.8h, v23.8h, v14.8h
- mls v24.8h, v11.8h, v7.h[0]
- add v14.8h, v23.8h, v14.8h
- add v8.8h, v19.8h, v27.8h
- sub v13.8h, v19.8h, v27.8h
- add v12.8h, v14.8h, v24.8h
- sub v24.8h, v14.8h, v24.8h
- trn1 v0.4s, v8.4s, v13.4s
- trn2 v23.4s, v8.4s, v13.4s
- trn2 v19.4s, v12.4s, v24.4s
- ldur q27, [x2, #-0x50]
- trn2 v8.2d, v19.2d, v23.2d
- trn1 v22.4s, v12.4s, v24.4s
- mul v14.8h, v8.8h, v17.8h
- sqrdmulh v24.8h, v8.8h, v27.8h
- trn2 v2.2d, v22.2d, v0.2d
- trn1 v8.2d, v19.2d, v23.2d
- mul v11.8h, v2.8h, v17.8h
- mls v14.8h, v24.8h, v7.h[0]
- ldur q26, [x2, #-0x30]
- sqrdmulh v23.8h, v2.8h, v27.8h
- sub v24.8h, v8.8h, v14.8h
- ldur q2, [x2, #-0x40]
- sqrdmulh v19.8h, v24.8h, v5.8h
- add v14.8h, v8.8h, v14.8h
- mul v24.8h, v24.8h, v6.8h
- mls v11.8h, v23.8h, v7.h[0]
- sqrdmulh v8.8h, v14.8h, v26.8h
- mul v2.8h, v14.8h, v2.8h
- trn1 v14.2d, v22.2d, v0.2d
- mls v24.8h, v19.8h, v7.h[0]
- sub v23.8h, v14.8h, v11.8h
- mls v2.8h, v8.8h, v7.h[0]
- add v14.8h, v14.8h, v11.8h
- add v8.8h, v23.8h, v24.8h
- sub v24.8h, v23.8h, v24.8h
- sub v19.8h, v14.8h, v2.8h
- add v11.8h, v14.8h, v2.8h
- trn1 v2.4s, v8.4s, v24.4s
- trn2 v14.4s, v8.4s, v24.4s
- trn2 v23.4s, v11.4s, v19.4s
- trn1 v11.4s, v11.4s, v19.4s
- trn2 v8.2d, v23.2d, v14.2d
- trn1 v24.2d, v11.2d, v2.2d
- str q8, [x0, #0x30]
- trn2 v8.2d, v11.2d, v2.2d
- str q24, [x0], #0x40
- trn1 v24.2d, v23.2d, v14.2d
- stur q8, [x0, #-0x20]
- stur q24, [x0, #-0x30]
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
- ldp d12, d13, [sp, #0x20]
- ldp d14, d15, [sp, #0x30]
- add sp, sp, #0x40
+Lntt_layer4567_start:
+ ldr q19, [x2, #0x50]
+ sub v31.8h, v30.8h, v23.8h
+ mls v0.8h, v29.8h, v7.h[0]
+ add v16.8h, v11.8h, v8.8h
+ ldr q18, [x0, #0xa0]
+ trn2 v14.4s, v6.4s, v12.4s
+ mul v26.8h, v27.8h, v4.h[2]
+ ldr q4, [x1], #0x10
+ ldr q24, [x2, #0x40]
+ ldr q21, [x0, #0xb0]
+ mls v26.8h, v10.8h, v7.h[0]
+ add v23.8h, v30.8h, v23.8h
+ sub v15.8h, v20.8h, v0.8h
+ ldr q9, [x0, #0x90]
+ add v10.8h, v20.8h, v0.8h
+ mul v8.8h, v18.8h, v4.h[0]
+ ldr q1, [x2], #0x60
+ trn1 v27.4s, v23.4s, v31.4s
+ sqrdmulh v12.8h, v18.8h, v4.h[1]
+ trn1 v5.4s, v10.4s, v15.4s
+ sub v30.8h, v16.8h, v26.8h
+ trn2 v13.2d, v5.2d, v27.2d
+ sqrdmulh v2.8h, v21.8h, v4.h[1]
+ add v29.8h, v16.8h, v26.8h
+ mul v0.8h, v21.8h, v4.h[0]
+ str q13, [x0, #0x20]
+ trn1 v11.4s, v29.4s, v30.4s
+ mls v8.8h, v12.8h, v7.h[0]
+ trn2 v26.4s, v29.4s, v30.4s
+ trn2 v6.2d, v11.2d, v28.2d
+ mls v0.8h, v2.8h, v7.h[0]
+ trn2 v16.2d, v26.2d, v14.2d
+ trn1 v26.2d, v26.2d, v14.2d
+ trn1 v20.2d, v5.2d, v27.2d
+ sqrdmulh v29.8h, v6.8h, v25.8h
+ trn2 v15.4s, v10.4s, v15.4s
+ sqrdmulh v13.8h, v16.8h, v25.8h
+ str q20, [x0], #0x40
+ sub v30.8h, v9.8h, v0.8h
+ add v27.8h, v9.8h, v0.8h
+ mul v17.8h, v6.8h, v1.8h
+ sqrdmulh v22.8h, v30.8h, v4.h[5]
+ mul v18.8h, v16.8h, v1.8h
+ mls v18.8h, v13.8h, v7.h[0]
+ mul v2.8h, v30.8h, v4.h[4]
+ mls v2.8h, v22.8h, v7.h[0]
+ trn2 v22.4s, v23.4s, v31.4s
+ sub v3.8h, v26.8h, v18.8h
+ ldur q25, [x2, #-0x30]
+ mls v17.8h, v29.8h, v7.h[0]
+ trn2 v31.2d, v15.2d, v22.2d
+ trn1 v20.2d, v15.2d, v22.2d
+ add v16.8h, v26.8h, v18.8h
+ sqrdmulh v26.8h, v3.8h, v19.8h
+ trn1 v21.2d, v11.2d, v28.2d
+ ldr q11, [x0, #0x40]
+ sqrdmulh v29.8h, v16.8h, v25.8h
+ stur q20, [x0, #-0x30]
+ add v20.8h, v21.8h, v17.8h
+ stur q31, [x0, #-0x10]
+ mul v23.8h, v3.8h, v24.8h
+ ldr q25, [x2, #0x10]
+ sub v13.8h, v11.8h, v8.8h
+ mls v23.8h, v26.8h, v7.h[0]
+ ldur q1, [x2, #-0x40]
+ sub v12.8h, v13.8h, v2.8h
+ add v6.8h, v13.8h, v2.8h
+ sqrdmulh v10.8h, v27.8h, v4.h[3]
+ sub v30.8h, v21.8h, v17.8h
+ mul v0.8h, v16.8h, v1.8h
+ trn1 v28.4s, v6.4s, v12.4s
+ subs x4, x4, #0x1
+ cbnz x4, Lntt_layer4567_start
+ add v22.8h, v11.8h, v8.8h
+ mul v27.8h, v27.8h, v4.h[2]
+ trn2 v17.4s, v6.4s, v12.4s
+ ldr q15, [x2], #0x60
+ mls v27.8h, v10.8h, v7.h[0]
+ add v4.8h, v30.8h, v23.8h
+ sub v18.8h, v30.8h, v23.8h
+ ldur q6, [x2, #-0x30]
+ mls v0.8h, v29.8h, v7.h[0]
+ ldur q12, [x2, #-0x40]
+ ldur q24, [x2, #-0x20]
+ ldur q2, [x2, #-0x10]
+ trn1 v9.4s, v4.4s, v18.4s
+ add v10.8h, v22.8h, v27.8h
+ sub v13.8h, v22.8h, v27.8h
+ sub v1.8h, v20.8h, v0.8h
+ trn2 v21.4s, v10.4s, v13.4s
+ add v27.8h, v20.8h, v0.8h
+ trn2 v3.2d, v21.2d, v17.2d
+ trn1 v13.4s, v10.4s, v13.4s
+ trn1 v31.4s, v27.4s, v1.4s
+ sqrdmulh v10.8h, v3.8h, v25.8h
+ trn2 v5.2d, v13.2d, v28.2d
+ trn1 v13.2d, v13.2d, v28.2d
+ trn1 v21.2d, v21.2d, v17.2d
+ sqrdmulh v17.8h, v5.8h, v25.8h
+ trn2 v30.2d, v31.2d, v9.2d
+ mul v25.8h, v3.8h, v15.8h
+ str q30, [x0, #0x20]
+ trn2 v30.4s, v4.4s, v18.4s
+ mls v25.8h, v10.8h, v7.h[0]
+ trn2 v3.4s, v27.4s, v1.4s
+ mul v20.8h, v5.8h, v15.8h
+ trn2 v10.2d, v3.2d, v30.2d
+ mls v20.8h, v17.8h, v7.h[0]
+ str q10, [x0, #0x30]
+ sub v18.8h, v21.8h, v25.8h
+ add v10.8h, v21.8h, v25.8h
+ trn1 v3.2d, v3.2d, v30.2d
+ sqrdmulh v30.8h, v18.8h, v2.8h
+ mul v12.8h, v10.8h, v12.8h
+ sqrdmulh v6.8h, v10.8h, v6.8h
+ str q3, [x0, #0x10]
+ add v21.8h, v13.8h, v20.8h
+ mul v10.8h, v18.8h, v24.8h
+ sub v13.8h, v13.8h, v20.8h
+ mls v10.8h, v30.8h, v7.h[0]
+ mls v12.8h, v6.8h, v7.h[0]
+ trn1 v30.2d, v31.2d, v9.2d
+ sub v3.8h, v13.8h, v10.8h
+ add v6.8h, v13.8h, v10.8h
+ add v10.8h, v21.8h, v12.8h
+ sub v21.8h, v21.8h, v12.8h
+ trn2 v13.4s, v6.4s, v3.4s
+ trn1 v12.4s, v10.4s, v21.4s
+ trn2 v21.4s, v10.4s, v21.4s
+ trn1 v3.4s, v6.4s, v3.4s
+ str q30, [x0], #0x40
+ trn2 v10.2d, v21.2d, v13.2d
+ trn1 v13.2d, v21.2d, v13.2d
+ trn2 v21.2d, v12.2d, v3.2d
+ trn1 v3.2d, v12.2d, v3.2d
+ str q10, [x0, #0x30]
+ str q13, [x0, #0x10]
+ str q3, [x0], #0x40
+ stur q21, [x0, #-0x20]
+ ldp d8, d9, [sp]
+ .cfi_restore d8
+ .cfi_restore d9
+ ldp d10, d11, [sp, #0x10]
+ .cfi_restore d10
+ .cfi_restore d11
+ ldp d12, d13, [sp, #0x20]
+ .cfi_restore d12
+ .cfi_restore d13
+ ldp d14, d15, [sp, #0x30]
+ .cfi_restore d14
+ .cfi_restore d15
+ add sp, sp, #0x40
+ .cfi_adjust_cfa_offset -0x40
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(ntt_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_mulcache_compute_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_mulcache_compute_asm.S
index ec7ca0c6fa..71ebbeca53 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_mulcache_compute_asm.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_mulcache_compute_asm.S
@@ -3,49 +3,125 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
+/*yaml
+ Name: poly_mulcache_compute_asm
+ Description: Compute multiplication cache for polynomial
+ Signature: void mlk_poly_mulcache_compute_asm(int16_t cache[128], const int16_t mlk_poly[256], const int16_t zetas[128], const int16_t zetas_twisted[128])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 256
+ permissions: write-only
+ c_parameter: int16_t cache[128]
+ description: Output cache
+ x1:
+ type: buffer
+ size_bytes: 512
+ permissions: read-only
+ c_parameter: const int16_t mlk_poly[256]
+ description: Input polynomial
+ x2:
+ type: buffer
+ size_bytes: 256
+ permissions: read-only
+ c_parameter: const int16_t zetas[128]
+ description: Zeta values
+ x3:
+ type: buffer
+ size_bytes: 256
+ permissions: read-only
+ c_parameter: const int16_t zetas_twisted[128]
+ description: Twisted zeta values
+ Stack:
+ bytes: 0
+*/
+
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/poly_mulcache_compute_asm.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(poly_mulcache_compute_asm)
MLK_ASM_FN_SYMBOL(poly_mulcache_compute_asm)
- mov w5, #0xd01 // =3329
- dup v6.8h, w5
- mov w5, #0x4ebf // =20159
- dup v7.8h, w5
- mov x4, #0x10 // =16
- ldr q1, [x1, #0x10]
- ldr q27, [x1], #0x20
- ldr q23, [x2], #0x10
- uzp2 v27.8h, v27.8h, v1.8h
- ldr q1, [x3], #0x10
- mul v2.8h, v27.8h, v23.8h
- sqrdmulh v27.8h, v27.8h, v1.8h
- sub x4, x4, #0x1
-
-poly_mulcache_compute_loop:
- ldr q29, [x1, #0x10]
- ldr q21, [x2], #0x10
- mls v2.8h, v27.8h, v6.h[0]
- ldr q27, [x1], #0x20
- ldr q7, [x3], #0x10
- uzp2 v28.8h, v27.8h, v29.8h
- str q2, [x0], #0x10
- mul v2.8h, v28.8h, v21.8h
- sqrdmulh v27.8h, v28.8h, v7.8h
- sub x4, x4, #0x1
- cbnz x4, poly_mulcache_compute_loop
- mls v2.8h, v27.8h, v6.h[0]
- str q2, [x0], #0x10
+ .cfi_startproc
+ mov w5, #0xd01 // =3329
+ dup v6.8h, w5
+ mov w5, #0x4ebf // =20159
+ dup v7.8h, w5
+ mov x4, #0x10 // =16
+ ldr q0, [x1], #0x20
+ ldur q2, [x1, #-0x10]
+ ldr q19, [x1], #0x20
+ ldr q29, [x3], #0x10
+ ldur q16, [x1, #-0x10]
+ ldr q18, [x2], #0x10
+ ldr q26, [x1], #0x20
+ ldr q25, [x2], #0x10
+ uzp2 v5.8h, v0.8h, v2.8h
+ ldr q28, [x3], #0x10
+ ldur q7, [x1, #-0x10]
+ ldr q2, [x1], #0x20
+ uzp2 v27.8h, v19.8h, v16.8h
+ sqrdmulh v16.8h, v5.8h, v29.8h
+ ldr q17, [x3], #0x10
+ ldr q19, [x3], #0x10
+ mul v5.8h, v5.8h, v18.8h
+ uzp2 v29.8h, v26.8h, v7.8h
+ mul v26.8h, v27.8h, v25.8h
+ sqrdmulh v4.8h, v27.8h, v28.8h
+ mls v5.8h, v16.8h, v6.h[0]
+ lsr x4, x4, #1
+ sub x4, x4, #0x2
+
+Lpoly_mulcache_compute_loop_start:
+ str q5, [x0], #0x10
+ sqrdmulh v22.8h, v29.8h, v17.8h
+ ldr q28, [x2], #0x10
+ ldur q24, [x1, #-0x10]
+ ldr q0, [x1], #0x20
+ mls v26.8h, v4.8h, v6.h[0]
+ ldur q16, [x1, #-0x10]
+ ldr q17, [x3], #0x10
+ mul v5.8h, v29.8h, v28.8h
+ uzp2 v23.8h, v2.8h, v24.8h
+ ldr q18, [x2], #0x10
+ mls v5.8h, v22.8h, v6.h[0]
+ uzp2 v29.8h, v0.8h, v16.8h
+ sqrdmulh v4.8h, v23.8h, v19.8h
+ ldr q2, [x1], #0x20
+ ldr q19, [x3], #0x10
+ str q26, [x0], #0x10
+ mul v26.8h, v23.8h, v18.8h
+ subs x4, x4, #0x1
+ cbnz x4, Lpoly_mulcache_compute_loop_start
+ mls v26.8h, v4.8h, v6.h[0]
+ str q5, [x0], #0x10
+ ldr q5, [x2], #0x10
+ ldur q4, [x1, #-0x10]
+ sqrdmulh v16.8h, v29.8h, v17.8h
+ ldr q0, [x2], #0x10
+ mul v29.8h, v29.8h, v5.8h
+ uzp2 v18.8h, v2.8h, v4.8h
+ str q26, [x0], #0x10
+ sqrdmulh v17.8h, v18.8h, v19.8h
+ mls v29.8h, v16.8h, v6.h[0]
+ mul v26.8h, v18.8h, v0.8h
+ mls v26.8h, v17.8h, v6.h[0]
+ str q29, [x0], #0x10
+ str q26, [x0], #0x10
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_mulcache_compute_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_reduce_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_reduce_asm.S
index b14447f0bc..28666853b4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_reduce_asm.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_reduce_asm.S
@@ -3,95 +3,148 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
+/*yaml
+ Name: poly_reduce_asm
+ Description: Barrett reduction of polynomial coefficients
+ Signature: void mlk_poly_reduce_asm(int16_t p[256])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: read/write
+ c_parameter: int16_t p[256]
+ description: Input/output polynomial
+ Stack:
+ bytes: 0
+*/
+
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/poly_reduce_asm.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(poly_reduce_asm)
MLK_ASM_FN_SYMBOL(poly_reduce_asm)
- mov w2, #0xd01 // =3329
- dup v3.8h, w2
- mov w2, #0x4ebf // =20159
- dup v4.8h, w2
- mov x1, #0x8 // =8
- ldr q21, [x0, #0x20]
- ldr q23, [x0, #0x30]
- sqdmulh v7.8h, v21.8h, v4.h[0]
- sqdmulh v30.8h, v23.8h, v4.h[0]
- srshr v7.8h, v7.8h, #0xb
- srshr v30.8h, v30.8h, #0xb
- mls v21.8h, v7.8h, v3.h[0]
- mls v23.8h, v30.8h, v3.h[0]
- ldr q5, [x0, #0x10]
- sshr v7.8h, v21.8h, #0xf
- sshr v30.8h, v23.8h, #0xf
- and v7.16b, v3.16b, v7.16b
- add v21.8h, v21.8h, v7.8h
- and v7.16b, v3.16b, v30.16b
- add v16.8h, v23.8h, v7.8h
- sub x1, x1, #0x1
+ .cfi_startproc
+ mov w2, #0xd01 // =3329
+ dup v3.8h, w2
+ mov w2, #0x4ebf // =20159
+ dup v4.8h, w2
+ mov x1, #0x8 // =8
+ ldr q21, [x0], #0x40
+ ldur q18, [x0, #-0x20]
+ ldur q0, [x0, #-0x30]
+ ldur q5, [x0, #-0x10]
+ ldr q26, [x0], #0x40
+ sqdmulh v17.8h, v21.8h, v4.h[0]
+ sqdmulh v27.8h, v18.8h, v4.h[0]
+ sqdmulh v22.8h, v0.8h, v4.h[0]
+ srshr v17.8h, v17.8h, #0xb
+ sqdmulh v23.8h, v5.8h, v4.h[0]
+ srshr v29.8h, v27.8h, #0xb
+ mls v21.8h, v17.8h, v3.h[0]
+ srshr v17.8h, v22.8h, #0xb
+ mls v18.8h, v29.8h, v3.h[0]
+ srshr v22.8h, v23.8h, #0xb
+ mls v0.8h, v17.8h, v3.h[0]
+ sshr v2.8h, v21.8h, #0xf
+ mls v5.8h, v22.8h, v3.h[0]
+ sshr v29.8h, v18.8h, #0xf
+ and v19.16b, v3.16b, v2.16b
+ sqdmulh v2.8h, v26.8h, v4.h[0]
+ sshr v31.8h, v0.8h, #0xf
+ add v17.8h, v21.8h, v19.8h
+ and v21.16b, v3.16b, v29.16b
+ and v31.16b, v3.16b, v31.16b
+ sub x1, x1, #0x2
-poly_reduce_loop:
- ldr q6, [x0], #0x40
- ldr q30, [x0, #0x20]
- sqdmulh v31.8h, v6.8h, v4.h[0]
- sqdmulh v29.8h, v5.8h, v4.h[0]
- sqdmulh v22.8h, v30.8h, v4.h[0]
- stur q16, [x0, #-0x10]
- srshr v20.8h, v31.8h, #0xb
- srshr v28.8h, v29.8h, #0xb
- stur q21, [x0, #-0x20]
- mls v6.8h, v20.8h, v3.h[0]
- mls v5.8h, v28.8h, v3.h[0]
- ldr q2, [x0, #0x30]
- sshr v31.8h, v6.8h, #0xf
- srshr v19.8h, v22.8h, #0xb
- and v22.16b, v3.16b, v31.16b
- add v0.8h, v6.8h, v22.8h
- mls v30.8h, v19.8h, v3.h[0]
- sshr v26.8h, v5.8h, #0xf
- sqdmulh v25.8h, v2.8h, v4.h[0]
- and v17.16b, v3.16b, v26.16b
- add v1.8h, v5.8h, v17.8h
- sshr v31.8h, v30.8h, #0xf
- srshr v25.8h, v25.8h, #0xb
- stur q1, [x0, #-0x30]
- and v18.16b, v3.16b, v31.16b
- mls v2.8h, v25.8h, v3.h[0]
- add v21.8h, v30.8h, v18.8h
- ldr q5, [x0, #0x10]
- sshr v18.8h, v2.8h, #0xf
- stur q0, [x0, #-0x40]
- and v27.16b, v3.16b, v18.16b
- add v16.8h, v2.8h, v27.8h
- sub x1, x1, #0x1
- cbnz x1, poly_reduce_loop
- sqdmulh v20.8h, v5.8h, v4.h[0]
- ldr q24, [x0], #0x40
- stur q21, [x0, #-0x20]
- srshr v20.8h, v20.8h, #0xb
- sqdmulh v25.8h, v24.8h, v4.h[0]
- stur q16, [x0, #-0x10]
- mls v5.8h, v20.8h, v3.h[0]
- srshr v20.8h, v25.8h, #0xb
- sshr v2.8h, v5.8h, #0xf
- mls v24.8h, v20.8h, v3.h[0]
- and v20.16b, v3.16b, v2.16b
- add v31.8h, v5.8h, v20.8h
- sshr v20.8h, v24.8h, #0xf
- stur q31, [x0, #-0x30]
- and v31.16b, v3.16b, v20.16b
- add v24.8h, v24.8h, v31.8h
- stur q24, [x0, #-0x40]
+Lpoly_reduce_loop_start:
+ add v21.8h, v18.8h, v21.8h
+ ldur q18, [x0, #-0x20]
+ add v25.8h, v0.8h, v31.8h
+ ldur q0, [x0, #-0x30]
+ stur q21, [x0, #-0x60]
+ sshr v28.8h, v5.8h, #0xf
+ stur q17, [x0, #-0x80]
+ srshr v23.8h, v2.8h, #0xb
+ sqdmulh v30.8h, v18.8h, v4.h[0]
+ stur q25, [x0, #-0x70]
+ and v22.16b, v3.16b, v28.16b
+ sqdmulh v7.8h, v0.8h, v4.h[0]
+ add v16.8h, v5.8h, v22.8h
+ ldur q5, [x0, #-0x10]
+ mls v26.8h, v23.8h, v3.h[0]
+ stur q16, [x0, #-0x50]
+ srshr v6.8h, v30.8h, #0xb
+ srshr v1.8h, v7.8h, #0xb
+ sqdmulh v19.8h, v5.8h, v4.h[0]
+ mls v18.8h, v6.8h, v3.h[0]
+ sshr v24.8h, v26.8h, #0xf
+ mls v0.8h, v1.8h, v3.h[0]
+ and v27.16b, v3.16b, v24.16b
+ srshr v29.8h, v19.8h, #0xb
+ add v17.8h, v26.8h, v27.8h
+ ldr q26, [x0], #0x40
+ sshr v1.8h, v18.8h, #0xf
+ mls v5.8h, v29.8h, v3.h[0]
+ sshr v20.8h, v0.8h, #0xf
+ and v21.16b, v3.16b, v1.16b
+ and v31.16b, v3.16b, v20.16b
+ sqdmulh v2.8h, v26.8h, v4.h[0]
+ subs x1, x1, #0x1
+ cbnz x1, Lpoly_reduce_loop_start
+ add v28.8h, v0.8h, v31.8h
+ ldur q29, [x0, #-0x10]
+ add v21.8h, v18.8h, v21.8h
+ srshr v18.8h, v2.8h, #0xb
+ sshr v2.8h, v5.8h, #0xf
+ ldur q16, [x0, #-0x20]
+ stur q17, [x0, #-0x80]
+ ldur q0, [x0, #-0x30]
+ and v2.16b, v3.16b, v2.16b
+ sqdmulh v24.8h, v29.8h, v4.h[0]
+ stur q28, [x0, #-0x70]
+ stur q21, [x0, #-0x60]
+ add v31.8h, v5.8h, v2.8h
+ sqdmulh v6.8h, v16.8h, v4.h[0]
+ stur q31, [x0, #-0x50]
+ sqdmulh v17.8h, v0.8h, v4.h[0]
+ srshr v22.8h, v24.8h, #0xb
+ mls v26.8h, v18.8h, v3.h[0]
+ srshr v31.8h, v6.8h, #0xb
+ mls v29.8h, v22.8h, v3.h[0]
+ srshr v19.8h, v17.8h, #0xb
+ mls v16.8h, v31.8h, v3.h[0]
+ sshr v7.8h, v26.8h, #0xf
+ mls v0.8h, v19.8h, v3.h[0]
+ and v5.16b, v3.16b, v7.16b
+ sshr v22.8h, v29.8h, #0xf
+ add v27.8h, v26.8h, v5.8h
+ and v26.16b, v3.16b, v22.16b
+ sshr v20.8h, v16.8h, #0xf
+ stur q27, [x0, #-0x40]
+ and v2.16b, v3.16b, v20.16b
+ sshr v23.8h, v0.8h, #0xf
+ add v18.8h, v29.8h, v26.8h
+ add v31.8h, v16.8h, v2.8h
+ and v29.16b, v3.16b, v23.16b
+ stur q18, [x0, #-0x10]
+ add v25.8h, v0.8h, v29.8h
+ stur q31, [x0, #-0x20]
+ stur q25, [x0, #-0x30]
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_reduce_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_tobytes_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_tobytes_asm.S
index 6afb25986a..2d8f01cc10 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_tobytes_asm.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_tobytes_asm.S
@@ -3,6 +3,27 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
+/*yaml
+ Name: poly_tobytes_asm
+ Description: Convert polynomial to byte representation
+ Signature: void mlk_poly_tobytes_asm(uint8_t r[384], const int16_t a[256])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 384
+ permissions: write-only
+ c_parameter: uint8_t r[384]
+ description: Output byte array
+ x1:
+ type: buffer
+ size_bytes: 512
+ permissions: read-only
+ c_parameter: const int16_t a[256]
+ description: Input polynomial
+ Stack:
+ bytes: 0
+*/
+
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
@@ -11,99 +32,86 @@
* dev/aarch64_opt/src/poly_tobytes_asm.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(poly_tobytes_asm)
MLK_ASM_FN_SYMBOL(poly_tobytes_asm)
- mov x2, #0x10 // =16
- ldr q6, [x1], #0x20
- ldur q24, [x1, #-0x10]
- ldr q30, [x1], #0x20
- ldur q22, [x1, #-0x10]
- ldr q5, [x1], #0x20
- ldur q17, [x1, #-0x10]
- ldr q19, [x1], #0x20
- ldur q4, [x1, #-0x10]
- lsr x2, x2, #2
- sub x2, x2, #0x1
+ .cfi_startproc
+ mov x2, #0x10 // =16
+ ldr q5, [x1, #0x10]
+ ldr q3, [x1], #0x20
+ ldr q29, [x1], #0x20
+ ldur q2, [x1, #-0x10]
+ ldr q27, [x1, #0x10]
+ ldr q23, [x1, #0x30]
+ ldr q17, [x1], #0x20
+ ldr q16, [x1], #0x20
+ uzp2 v26.8h, v3.8h, v5.8h
+ uzp1 v19.8h, v3.8h, v5.8h
+ uzp2 v0.8h, v29.8h, v2.8h
+ uzp1 v1.8h, v29.8h, v2.8h
+ xtn v5.8b, v26.8h
+ shrn v3.8b, v19.8h, #0x8
+ shrn v4.8b, v26.8h, #0x4
+ xtn v18.8b, v0.8h
+ shrn v30.8b, v0.8h, #0x4
+ xtn v28.8b, v1.8h
+ shrn v29.8b, v1.8h, #0x8
+ sli v3.8b, v5.8b, #0x4
+ xtn v2.8b, v19.8h
+ sli v29.8b, v18.8b, #0x4
+ lsr x2, x2, #1
+ sub x2, x2, #0x2
-poly_tobytes_loop_start:
- uzp1 v25.8h, v6.8h, v24.8h
- uzp2 v6.8h, v6.8h, v24.8h
- xtn v24.8b, v25.8h
- shrn v25.8b, v25.8h, #0x8
- xtn v18.8b, v6.8h
- shrn v26.8b, v6.8h, #0x4
- sli v25.8b, v18.8b, #0x4
- st3 { v24.8b, v25.8b, v26.8b }, [x0], #24
- uzp1 v25.8h, v30.8h, v22.8h
- uzp2 v6.8h, v30.8h, v22.8h
- xtn v24.8b, v25.8h
- xtn v18.8b, v6.8h
- uzp1 v30.8h, v5.8h, v17.8h
- uzp2 v22.8h, v5.8h, v17.8h
- xtn v5.8b, v30.8h
- xtn v17.8b, v22.8h
- uzp1 v28.8h, v19.8h, v4.8h
- uzp2 v19.8h, v19.8h, v4.8h
- xtn v4.8b, v28.8h
- xtn v20.8b, v19.8h
- shrn v25.8b, v25.8h, #0x8
- sli v25.8b, v18.8b, #0x4
- shrn v26.8b, v6.8h, #0x4
- st3 { v24.8b, v25.8b, v26.8b }, [x0], #24
- shrn v6.8b, v30.8h, #0x8
- sli v6.8b, v17.8b, #0x4
- shrn v7.8b, v22.8h, #0x4
- st3 { v5.8b, v6.8b, v7.8b }, [x0], #24
- shrn v5.8b, v28.8h, #0x8
- shrn v6.8b, v19.8h, #0x4
- sli v5.8b, v20.8b, #0x4
- st3 { v4.8b, v5.8b, v6.8b }, [x0], #24
- ldr q6, [x1], #0x20
- ldur q24, [x1, #-0x10]
- ldr q30, [x1], #0x20
- ldur q22, [x1, #-0x10]
- ldr q5, [x1], #0x20
- ldur q17, [x1, #-0x10]
- ldr q19, [x1], #0x20
- ldur q4, [x1, #-0x10]
- sub x2, x2, #0x1
- cbnz x2, poly_tobytes_loop_start
- uzp1 v25.8h, v30.8h, v22.8h
- uzp2 v18.8h, v30.8h, v22.8h
- uzp1 v30.8h, v6.8h, v24.8h
- uzp2 v6.8h, v6.8h, v24.8h
- uzp1 v24.8h, v5.8h, v17.8h
- uzp2 v22.8h, v5.8h, v17.8h
- uzp1 v5.8h, v19.8h, v4.8h
- uzp2 v17.8h, v19.8h, v4.8h
- xtn v19.8b, v25.8h
- shrn v20.8b, v25.8h, #0x8
- xtn v25.8b, v18.8h
- shrn v21.8b, v18.8h, #0x4
- xtn v28.8b, v30.8h
- shrn v29.8b, v30.8h, #0x8
- xtn v18.8b, v6.8h
- shrn v30.8b, v6.8h, #0x4
- xtn v1.8b, v24.8h
- shrn v2.8b, v24.8h, #0x8
- xtn v6.8b, v22.8h
- shrn v3.8b, v22.8h, #0x4
- xtn v22.8b, v5.8h
- shrn v23.8b, v5.8h, #0x8
- xtn v5.8b, v17.8h
- shrn v24.8b, v17.8h, #0x4
- sli v20.8b, v25.8b, #0x4
- sli v29.8b, v18.8b, #0x4
- st3 { v28.8b, v29.8b, v30.8b }, [x0], #24
- st3 { v19.8b, v20.8b, v21.8b }, [x0], #24
- sli v2.8b, v6.8b, #0x4
- st3 { v1.8b, v2.8b, v3.8b }, [x0], #24
- sli v23.8b, v5.8b, #0x4
- st3 { v22.8b, v23.8b, v24.8b }, [x0], #24
+Lpoly_tobytes_loop_start:
+ uzp1 v25.8h, v17.8h, v27.8h
+ uzp2 v31.8h, v17.8h, v27.8h
+ uzp1 v24.8h, v16.8h, v23.8h
+ uzp2 v6.8h, v16.8h, v23.8h
+ st3 { v2.8b, v3.8b, v4.8b }, [x0], #24
+ shrn v3.8b, v25.8h, #0x8
+ ldr q17, [x1], #0x20
+ shrn v4.8b, v31.8h, #0x4
+ xtn v21.8b, v6.8h
+ ldr q23, [x1, #0x10]
+ st3 { v28.8b, v29.8b, v30.8b }, [x0], #24
+ shrn v29.8b, v24.8h, #0x8
+ ldur q27, [x1, #-0x10]
+ xtn v20.8b, v31.8h
+ ldr q16, [x1], #0x20
+ sli v29.8b, v21.8b, #0x4
+ xtn v2.8b, v25.8h
+ sli v3.8b, v20.8b, #0x4
+ xtn v28.8b, v24.8h
+ shrn v30.8b, v6.8h, #0x4
+ subs x2, x2, #0x1
+ cbnz x2, Lpoly_tobytes_loop_start
+ uzp2 v7.8h, v17.8h, v27.8h
+ uzp1 v25.8h, v17.8h, v27.8h
+ uzp2 v0.8h, v16.8h, v23.8h
+ st3 { v2.8b, v3.8b, v4.8b }, [x0], #24
+ st3 { v28.8b, v29.8b, v30.8b }, [x0], #24
+ shrn v21.8b, v25.8h, #0x8
+ uzp1 v2.8h, v16.8h, v23.8h
+ shrn v22.8b, v7.8h, #0x4
+ shrn v4.8b, v0.8h, #0x4
+ xtn v28.8b, v7.8h
+ xtn v27.8b, v0.8h
+ shrn v3.8b, v2.8h, #0x8
+ sli v21.8b, v28.8b, #0x4
+ xtn v2.8b, v2.8h
+ sli v3.8b, v27.8b, #0x4
+ xtn v20.8b, v25.8h
+ st3 { v20.8b, v21.8b, v22.8b }, [x0], #24
+ st3 { v2.8b, v3.8b, v4.8b }, [x0], #24
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_tobytes_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_tomont_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_tomont_asm.S
index f7a427f4e5..7deb2c812e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_tomont_asm.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/poly_tomont_asm.S
@@ -3,75 +3,96 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
+/*yaml
+ Name: poly_tomont_asm
+ Description: Convert polynomial to Montgomery domain
+ Signature: void mlk_poly_tomont_asm(int16_t p[256])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: read/write
+ c_parameter: int16_t p[256]
+ description: Input/output polynomial
+ Stack:
+ bytes: 0
+*/
+
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/poly_tomont_asm.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(poly_tomont_asm)
MLK_ASM_FN_SYMBOL(poly_tomont_asm)
- mov w2, #0xd01 // =3329
- dup v4.8h, w2
- mov w2, #0x4ebf // =20159
- dup v5.8h, w2
- mov w2, #-0x414 // =-1044
- dup v2.8h, w2
- mov w2, #-0x2824 // =-10276
- dup v3.8h, w2
- mov x1, #0x8 // =8
- ldr q26, [x0, #0x30]
- ldr q23, [x0, #0x10]
- mul v17.8h, v26.8h, v2.8h
- sqrdmulh v7.8h, v26.8h, v3.8h
- ldr q27, [x0, #0x20]
- sub x1, x1, #0x1
+ .cfi_startproc
+ mov w2, #0xd01 // =3329
+ dup v4.8h, w2
+ mov w2, #0x4ebf // =20159
+ dup v5.8h, w2
+ mov w2, #-0x414 // =-1044
+ dup v2.8h, w2
+ mov w2, #-0x2824 // =-10276
+ dup v3.8h, w2
+ mov x1, #0x8 // =8
+ ldr q18, [x0, #0x20]
+ ldr q0, [x0, #0x10]
+ ldr q16, [x0], #0x40
+ sqrdmulh v23.8h, v0.8h, v3.8h
+ mul v26.8h, v0.8h, v2.8h
+ sqrdmulh v19.8h, v16.8h, v3.8h
+ mls v26.8h, v23.8h, v4.h[0]
+ mul v29.8h, v16.8h, v2.8h
+ ldur q16, [x0, #-0x10]
+ mls v29.8h, v19.8h, v4.h[0]
+ stur q26, [x0, #-0x30]
+ sqrdmulh v26.8h, v18.8h, v3.8h
+ mul v18.8h, v18.8h, v2.8h
+ stur q29, [x0, #-0x40]
+ sqrdmulh v29.8h, v16.8h, v3.8h
+ mls v18.8h, v26.8h, v4.h[0]
+ sub x1, x1, #0x1
-poly_tomont_loop:
- mls v17.8h, v7.8h, v4.h[0]
- sqrdmulh v5.8h, v23.8h, v3.8h
- ldr q7, [x0], #0x40
- stur q17, [x0, #-0x10]
- sqrdmulh v29.8h, v27.8h, v3.8h
- sqrdmulh v19.8h, v7.8h, v3.8h
- mul v25.8h, v23.8h, v2.8h
- mul v0.8h, v7.8h, v2.8h
- mul v26.8h, v27.8h, v2.8h
- ldr q7, [x0, #0x30]
- mls v25.8h, v5.8h, v4.h[0]
- ldr q23, [x0, #0x10]
- mls v26.8h, v29.8h, v4.h[0]
- mls v0.8h, v19.8h, v4.h[0]
- stur q25, [x0, #-0x30]
- mul v17.8h, v7.8h, v2.8h
- sqrdmulh v7.8h, v7.8h, v3.8h
- stur q0, [x0, #-0x40]
- ldr q27, [x0, #0x20]
- stur q26, [x0, #-0x20]
- sub x1, x1, #0x1
- cbnz x1, poly_tomont_loop
- mls v17.8h, v7.8h, v4.h[0]
- sqrdmulh v7.8h, v23.8h, v3.8h
- mul v26.8h, v23.8h, v2.8h
- sqrdmulh v25.8h, v27.8h, v3.8h
- ldr q23, [x0], #0x40
- mul v27.8h, v27.8h, v2.8h
- mls v26.8h, v7.8h, v4.h[0]
- sqrdmulh v7.8h, v23.8h, v3.8h
- mul v23.8h, v23.8h, v2.8h
- stur q17, [x0, #-0x10]
- mls v27.8h, v25.8h, v4.h[0]
- stur q26, [x0, #-0x30]
- mls v23.8h, v7.8h, v4.h[0]
- stur q27, [x0, #-0x20]
- stur q23, [x0, #-0x40]
+Lpoly_tomont_loop:
+ ldr q19, [x0, #0x10]
+ mul v26.8h, v16.8h, v2.8h
+ ldr q23, [x0, #0x20]
+ ldr q17, [x0], #0x40
+ mls v26.8h, v29.8h, v4.h[0]
+ ldur q16, [x0, #-0x10]
+ sqrdmulh v28.8h, v19.8h, v3.8h
+ stur q18, [x0, #-0x60]
+ mul v0.8h, v19.8h, v2.8h
+ stur q26, [x0, #-0x50]
+ sqrdmulh v24.8h, v23.8h, v3.8h
+ mul v18.8h, v23.8h, v2.8h
+ sqrdmulh v22.8h, v17.8h, v3.8h
+ mul v26.8h, v17.8h, v2.8h
+ mls v0.8h, v28.8h, v4.h[0]
+ mls v26.8h, v22.8h, v4.h[0]
+ sqrdmulh v29.8h, v16.8h, v3.8h
+ stur q0, [x0, #-0x30]
+ mls v18.8h, v24.8h, v4.h[0]
+ stur q26, [x0, #-0x40]
+ sub x1, x1, #0x1
+ cbnz x1, Lpoly_tomont_loop
+ mul v16.8h, v16.8h, v2.8h
+ stur q18, [x0, #-0x20]
+ mls v16.8h, v29.8h, v4.h[0]
+ stur q16, [x0, #-0x10]
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_tomont_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S
index 25ed53fd6b..b8bca5fdeb 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S
@@ -12,192 +12,250 @@
* https://eprint.iacr.org/2021/986
*/
+/*yaml
+ Name: polyvec_basemul_acc_montgomery_cached_asm_k2
+ Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=2
+ Signature: void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(int16_t r[256], const int16_t a[512], const int16_t b[512], const int16_t b_cache[256])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: write-only
+ c_parameter: int16_t r[256]
+ description: Output polynomial
+ x1:
+ type: buffer
+ size_bytes: 1024
+ permissions: read-only
+ c_parameter: const int16_t a[512]
+ description: Input polynomial vector a
+ x2:
+ type: buffer
+ size_bytes: 1024
+ permissions: read-only
+ c_parameter: const int16_t b[512]
+ description: Input polynomial vector b
+ x3:
+ type: buffer
+ size_bytes: 512
+ permissions: read-only
+ c_parameter: const int16_t b_cache[256]
+ description: Cached values for b
+ Stack:
+ bytes: 64
+ description: saving callee-saved Neon registers
+*/
+
/* Re-implementation of asymmetric base multiplication following @[NeonNTT] */
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
- (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k2)
- sub sp, sp, #0x40
- stp d8, d9, [sp]
- stp d10, d11, [sp, #0x10]
- stp d12, d13, [sp, #0x20]
- stp d14, d15, [sp, #0x30]
- mov w14, #0xd01 // =3329
- dup v0.8h, w14
- mov w14, #0xcff // =3327
- dup v2.8h, w14
- add x4, x1, #0x200
- add x5, x2, #0x200
- add x6, x3, #0x100
- mov x13, #0x10 // =16
- ldr q9, [x4], #0x20
- ldur q5, [x4, #-0x10]
- ldr q11, [x5], #0x20
- uzp1 v23.8h, v9.8h, v5.8h
- uzp2 v9.8h, v9.8h, v5.8h
- ldr q5, [x2], #0x20
- ldur q7, [x5, #-0x10]
- ldur q21, [x2, #-0x10]
- uzp2 v10.8h, v11.8h, v7.8h
- uzp1 v11.8h, v11.8h, v7.8h
- uzp1 v7.8h, v5.8h, v21.8h
- uzp2 v5.8h, v5.8h, v21.8h
- ldr q21, [x1], #0x20
- ldur q25, [x1, #-0x10]
- ld1 { v6.8h }, [x3], #16
- uzp1 v26.8h, v21.8h, v25.8h
- uzp2 v21.8h, v21.8h, v25.8h
- smull v25.4s, v26.4h, v5.4h
- smull2 v5.4s, v26.8h, v5.8h
- smull v19.4s, v26.4h, v7.4h
- smull2 v26.4s, v26.8h, v7.8h
- smlal v25.4s, v21.4h, v7.4h
- smlal2 v5.4s, v21.8h, v7.8h
- smlal v19.4s, v21.4h, v6.4h
- smlal2 v26.4s, v21.8h, v6.8h
- smlal v25.4s, v23.4h, v10.4h
- smlal2 v5.4s, v23.8h, v10.8h
- smlal v19.4s, v23.4h, v11.4h
- smlal2 v26.4s, v23.8h, v11.8h
- ld1 { v23.8h }, [x6], #16
- smlal v25.4s, v9.4h, v11.4h
- smlal2 v5.4s, v9.8h, v11.8h
- smlal2 v26.4s, v9.8h, v23.8h
- smlal v19.4s, v9.4h, v23.4h
- ldr q9, [x4], #0x20
- uzp1 v11.8h, v25.8h, v5.8h
- uzp1 v23.8h, v19.8h, v26.8h
- mul v11.8h, v11.8h, v2.8h
- mul v23.8h, v23.8h, v2.8h
- ldr q7, [x5], #0x20
- smlal2 v5.4s, v11.8h, v0.8h
- smlal v25.4s, v11.4h, v0.4h
- ldr q11, [x2], #0x20
- ldur q21, [x2, #-0x10]
- ldur q6, [x4, #-0x10]
- uzp1 v17.8h, v11.8h, v21.8h
- ldr q10, [x1], #0x20
- ldur q29, [x1, #-0x10]
- uzp2 v11.8h, v11.8h, v21.8h
- uzp1 v13.8h, v9.8h, v6.8h
- uzp1 v3.8h, v10.8h, v29.8h
- uzp2 v10.8h, v10.8h, v29.8h
- smull v12.4s, v3.4h, v11.4h
- smull2 v11.4s, v3.8h, v11.8h
- ldur q21, [x5, #-0x10]
- smlal v12.4s, v10.4h, v17.4h
- smlal2 v11.4s, v10.8h, v17.8h
- uzp2 v29.8h, v7.8h, v21.8h
- uzp1 v15.8h, v7.8h, v21.8h
- smlal v12.4s, v13.4h, v29.4h
- smlal2 v11.4s, v13.8h, v29.8h
- uzp2 v28.8h, v9.8h, v6.8h
- smlal2 v26.4s, v23.8h, v0.8h
- smlal v12.4s, v28.4h, v15.4h
- smlal2 v11.4s, v28.8h, v15.8h
- smlal v19.4s, v23.4h, v0.4h
- uzp2 v27.8h, v25.8h, v5.8h
- smull v23.4s, v3.4h, v17.4h
- uzp1 v9.8h, v12.8h, v11.8h
- uzp2 v19.8h, v19.8h, v26.8h
- mul v14.8h, v9.8h, v2.8h
- ld1 { v22.8h }, [x6], #16
- zip2 v9.8h, v19.8h, v27.8h
- smlal2 v11.4s, v14.8h, v0.8h
- ld1 { v4.8h }, [x3], #16
- sub x13, x13, #0x2
+ .cfi_startproc
+ sub sp, sp, #0x40
+ .cfi_adjust_cfa_offset 0x40
+ stp d8, d9, [sp]
+ .cfi_rel_offset d8, 0x0
+ .cfi_rel_offset d9, 0x8
+ stp d10, d11, [sp, #0x10]
+ .cfi_rel_offset d10, 0x10
+ .cfi_rel_offset d11, 0x18
+ stp d12, d13, [sp, #0x20]
+ .cfi_rel_offset d12, 0x20
+ .cfi_rel_offset d13, 0x28
+ stp d14, d15, [sp, #0x30]
+ .cfi_rel_offset d14, 0x30
+ .cfi_rel_offset d15, 0x38
+ mov w14, #0xd01 // =3329
+ dup v0.8h, w14
+ mov w14, #0xcff // =3327
+ dup v2.8h, w14
+ add x4, x1, #0x200
+ add x5, x2, #0x200
+ add x6, x3, #0x100
+ mov x13, #0x10 // =16
+ ldr q12, [x1], #0x20
+ ldur q9, [x1, #-0x10]
+ ldr q22, [x2], #0x20
+ ldur q30, [x2, #-0x10]
+ ldr q6, [x5], #0x20
+ ldr q7, [x4, #0x10]
+ ldr q8, [x4], #0x20
+ ldur q23, [x5, #-0x10]
+ uzp1 v16.8h, v12.8h, v9.8h
+ uzp2 v14.8h, v12.8h, v9.8h
+ uzp2 v13.8h, v22.8h, v30.8h
+ uzp1 v18.8h, v22.8h, v30.8h
+ ld1 { v27.8h }, [x3], #16
+ ld1 { v17.8h }, [x6], #16
+ smull2 v4.4s, v16.8h, v18.8h
+ ldr q31, [x1, #0x10]
+ smull v19.4s, v16.4h, v13.4h
+ ldr q24, [x1], #0x20
+ smlal v19.4s, v14.4h, v18.4h
+ ldr q22, [x2], #0x20
+ smlal2 v4.4s, v14.8h, v27.8h
+ uzp2 v5.8h, v6.8h, v23.8h
+ smull2 v29.4s, v16.8h, v13.8h
+ uzp2 v26.8h, v8.8h, v7.8h
+ smlal2 v29.4s, v14.8h, v18.8h
+ uzp1 v30.8h, v24.8h, v31.8h
+ uzp1 v8.8h, v8.8h, v7.8h
+ smull v11.4s, v16.4h, v18.4h
+ smlal v11.4s, v14.4h, v27.4h
+ ldur q1, [x2, #-0x10]
+ uzp1 v28.8h, v6.8h, v23.8h
+ smlal2 v29.4s, v8.8h, v5.8h
+ ldr q25, [x5], #0x20
+ smlal v19.4s, v8.4h, v5.4h
+ ldr q3, [x4, #0x10]
+ smlal2 v29.4s, v26.8h, v28.8h
+ uzp1 v27.8h, v22.8h, v1.8h
+ smlal v19.4s, v26.4h, v28.4h
+ ldr q12, [x4], #0x20
+ smlal2 v4.4s, v8.8h, v28.8h
+ ldur q21, [x5, #-0x10]
+ smlal2 v4.4s, v26.8h, v17.8h
+ smlal v11.4s, v8.4h, v28.4h
+ ld1 { v15.8h }, [x6], #16
+ smlal v11.4s, v26.4h, v17.4h
+ ld1 { v20.8h }, [x3], #16
+ uzp1 v28.8h, v19.8h, v29.8h
+ smull2 v23.4s, v30.8h, v27.8h
+ smull v26.4s, v30.4h, v27.4h
+ uzp2 v16.8h, v22.8h, v1.8h
+ mul v28.8h, v28.8h, v2.8h
+ uzp1 v10.8h, v11.8h, v4.8h
+ smull2 v8.4s, v30.8h, v16.8h
+ mul v13.8h, v10.8h, v2.8h
+ smlal v19.4s, v28.4h, v0.4h
+ smlal2 v29.4s, v28.8h, v0.8h
+ smull v18.4s, v30.4h, v16.4h
+ uzp1 v30.8h, v25.8h, v21.8h
+ smlal v11.4s, v13.4h, v0.4h
+ uzp2 v6.8h, v24.8h, v31.8h
+ uzp1 v16.8h, v12.8h, v3.8h
+ smlal2 v4.4s, v13.8h, v0.8h
+ uzp2 v17.8h, v25.8h, v21.8h
+ smlal2 v8.4s, v6.8h, v27.8h
+ uzp2 v12.8h, v12.8h, v3.8h
+ smlal v18.4s, v6.4h, v27.4h
+ uzp2 v9.8h, v19.8h, v29.8h
+ smlal2 v8.4s, v16.8h, v17.8h
+ smlal2 v8.4s, v12.8h, v30.8h
+ uzp2 v19.8h, v11.8h, v4.8h
+ sub x13, x13, #0x2
-polyvec_basemul_acc_montgomery_cached_k2_loop:
- smull2 v20.4s, v3.8h, v17.8h
- ldr q18, [x4], #0x20
- ldr q30, [x5], #0x20
- smlal2 v20.4s, v10.8h, v4.8h
- smlal v12.4s, v14.4h, v0.4h
- smlal v23.4s, v10.4h, v4.4h
- str q9, [x0, #0x10]
- smlal2 v20.4s, v13.8h, v15.8h
- ldr q8, [x2], #0x20
- smlal v23.4s, v13.4h, v15.4h
- smlal2 v20.4s, v28.8h, v22.8h
- zip1 v26.8h, v19.8h, v27.8h
- ldur q9, [x2, #-0x10]
- smlal v23.4s, v28.4h, v22.4h
- uzp2 v27.8h, v12.8h, v11.8h
- uzp1 v17.8h, v8.8h, v9.8h
- uzp2 v4.8h, v8.8h, v9.8h
- uzp1 v5.8h, v23.8h, v20.8h
- str q26, [x0], #0x20
- mul v31.8h, v5.8h, v2.8h
- ldur q19, [x4, #-0x10]
- ldr q29, [x1], #0x20
- ldur q12, [x1, #-0x10]
- smlal2 v20.4s, v31.8h, v0.8h
- uzp1 v13.8h, v18.8h, v19.8h
- uzp1 v3.8h, v29.8h, v12.8h
- uzp2 v10.8h, v29.8h, v12.8h
- smull v12.4s, v3.4h, v4.4h
- smull2 v11.4s, v3.8h, v4.8h
- ldur q5, [x5, #-0x10]
- smlal v12.4s, v10.4h, v17.4h
- smlal2 v11.4s, v10.8h, v17.8h
- uzp2 v14.8h, v30.8h, v5.8h
- uzp1 v15.8h, v30.8h, v5.8h
- smlal v12.4s, v13.4h, v14.4h
- smlal2 v11.4s, v13.8h, v14.8h
- uzp2 v28.8h, v18.8h, v19.8h
- smlal v23.4s, v31.4h, v0.4h
- smlal v12.4s, v28.4h, v15.4h
- smlal2 v11.4s, v28.8h, v15.8h
- ld1 { v22.8h }, [x6], #16
- uzp2 v19.8h, v23.8h, v20.8h
- uzp1 v1.8h, v12.8h, v11.8h
- smull v23.4s, v3.4h, v17.4h
- mul v14.8h, v1.8h, v2.8h
- zip2 v9.8h, v19.8h, v27.8h
- ld1 { v4.8h }, [x3], #16
- smlal2 v11.4s, v14.8h, v0.8h
- sub x13, x13, #0x1
- cbnz x13, polyvec_basemul_acc_montgomery_cached_k2_loop
- smull2 v5.4s, v3.8h, v17.8h
- smlal v12.4s, v14.4h, v0.4h
- smlal v23.4s, v10.4h, v4.4h
- str q9, [x0, #0x10]
- smlal2 v5.4s, v10.8h, v4.8h
- uzp2 v11.8h, v12.8h, v11.8h
- zip1 v9.8h, v19.8h, v27.8h
- smlal v23.4s, v13.4h, v15.4h
- smlal2 v5.4s, v13.8h, v15.8h
- str q9, [x0], #0x20
- smlal v23.4s, v28.4h, v22.4h
- smlal2 v5.4s, v28.8h, v22.8h
- uzp1 v9.8h, v23.8h, v5.8h
- mul v9.8h, v9.8h, v2.8h
- smlal2 v5.4s, v9.8h, v0.8h
- smlal v23.4s, v9.4h, v0.4h
- uzp2 v9.8h, v23.8h, v5.8h
- zip2 v5.8h, v9.8h, v11.8h
- zip1 v9.8h, v9.8h, v11.8h
- str q5, [x0, #0x10]
- str q9, [x0], #0x20
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
- ldp d12, d13, [sp, #0x20]
- ldp d14, d15, [sp, #0x30]
- add sp, sp, #0x40
+Lpolyvec_basemul_acc_montgomery_cached_k2_loop_start:
+ smlal v18.4s, v16.4h, v17.4h
+ ldr q7, [x4], #0x20
+ ldr q10, [x2, #0x10]
+ smlal v18.4s, v12.4h, v30.4h
+ smlal2 v23.4s, v6.8h, v20.8h
+ ldr q14, [x2], #0x20
+ smlal2 v23.4s, v16.8h, v30.8h
+ zip1 v25.8h, v19.8h, v9.8h
+ zip2 v3.8h, v19.8h, v9.8h
+ smlal2 v23.4s, v12.8h, v15.8h
+ smlal v26.4s, v6.4h, v20.4h
+ uzp1 v5.8h, v18.8h, v8.8h
+ uzp2 v21.8h, v14.8h, v10.8h
+ smlal v26.4s, v16.4h, v30.4h
+ str q25, [x0], #0x20
+ mul v29.8h, v5.8h, v2.8h
+ uzp1 v24.8h, v14.8h, v10.8h
+ stur q3, [x0, #-0x10]
+ smlal v26.4s, v12.4h, v15.4h
+ ld1 { v15.8h }, [x6], #16
+ ldr q28, [x1, #0x10]
+ ldr q11, [x1], #0x20
+ ldr q13, [x5], #0x20
+ ldur q27, [x4, #-0x10]
+ smlal2 v8.4s, v29.8h, v0.8h
+ ldur q22, [x5, #-0x10]
+ smlal v18.4s, v29.4h, v0.4h
+ uzp1 v4.8h, v26.8h, v23.8h
+ uzp1 v1.8h, v11.8h, v28.8h
+ uzp2 v6.8h, v11.8h, v28.8h
+ uzp1 v16.8h, v7.8h, v27.8h
+ mul v31.8h, v4.8h, v2.8h
+ uzp2 v17.8h, v13.8h, v22.8h
+ ld1 { v20.8h }, [x3], #16
+ uzp2 v9.8h, v18.8h, v8.8h
+ smull2 v8.4s, v1.8h, v21.8h
+ uzp1 v30.8h, v13.8h, v22.8h
+ smlal2 v8.4s, v6.8h, v24.8h
+ smlal2 v8.4s, v16.8h, v17.8h
+ uzp2 v12.8h, v7.8h, v27.8h
+ smlal v26.4s, v31.4h, v0.4h
+ smlal2 v23.4s, v31.8h, v0.8h
+ smull v18.4s, v1.4h, v21.4h
+ smlal v18.4s, v6.4h, v24.4h
+ smlal2 v8.4s, v12.8h, v30.8h
+ uzp2 v19.8h, v26.8h, v23.8h
+ smull2 v23.4s, v1.8h, v24.8h
+ smull v26.4s, v1.4h, v24.4h
+ subs x13, x13, #0x1
+ cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k2_loop_start
+ smlal v26.4s, v6.4h, v20.4h
+ smlal2 v23.4s, v6.8h, v20.8h
+ smlal v26.4s, v16.4h, v30.4h
+ smlal2 v23.4s, v16.8h, v30.8h
+ smlal v26.4s, v12.4h, v15.4h
+ smlal2 v23.4s, v12.8h, v15.8h
+ smlal v18.4s, v16.4h, v17.4h
+ smlal v18.4s, v12.4h, v30.4h
+ zip1 v12.8h, v19.8h, v9.8h
+ str q12, [x0], #0x20
+ uzp1 v12.8h, v26.8h, v23.8h
+ mul v6.8h, v12.8h, v2.8h
+ uzp1 v12.8h, v18.8h, v8.8h
+ mul v12.8h, v12.8h, v2.8h
+ smlal v26.4s, v6.4h, v0.4h
+ smlal2 v23.4s, v6.8h, v0.8h
+ smlal2 v8.4s, v12.8h, v0.8h
+ smlal v18.4s, v12.4h, v0.4h
+ zip2 v12.8h, v19.8h, v9.8h
+ uzp2 v6.8h, v26.8h, v23.8h
+ stur q12, [x0, #-0x10]
+ uzp2 v12.8h, v18.8h, v8.8h
+ zip2 v1.8h, v6.8h, v12.8h
+ zip1 v12.8h, v6.8h, v12.8h
+ str q1, [x0, #0x10]
+ str q12, [x0], #0x20
+ ldp d8, d9, [sp]
+ .cfi_restore d8
+ .cfi_restore d9
+ ldp d10, d11, [sp, #0x10]
+ .cfi_restore d10
+ .cfi_restore d11
+ ldp d12, d13, [sp, #0x20]
+ .cfi_restore d12
+ .cfi_restore d13
+ ldp d14, d15, [sp, #0x30]
+ .cfi_restore d14
+ .cfi_restore d15
+ add sp, sp, #0x40
+ .cfi_adjust_cfa_offset -0x40
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k2)
+
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED && \
(MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S
index 9a80e1d9be..885b765eea 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S
@@ -12,246 +12,303 @@
* https://eprint.iacr.org/2021/986
*/
+/*yaml
+ Name: polyvec_basemul_acc_montgomery_cached_asm_k3
+ Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=3
+ Signature: void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(int16_t r[256], const int16_t a[768], const int16_t b[768], const int16_t b_cache[384])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: write-only
+ c_parameter: int16_t r[256]
+ description: Output polynomial
+ x1:
+ type: buffer
+ size_bytes: 1536
+ permissions: read-only
+ c_parameter: const int16_t a[768]
+ description: Input polynomial vector a
+ x2:
+ type: buffer
+ size_bytes: 1536
+ permissions: read-only
+ c_parameter: const int16_t b[768]
+ description: Input polynomial vector b
+ x3:
+ type: buffer
+ size_bytes: 768
+ permissions: read-only
+ c_parameter: const int16_t b_cache[384]
+ description: Cached values for b
+ Stack:
+ bytes: 64
+ description: saving callee-saved Neon registers
+*/
+
/* Re-implementation of asymmetric base multiplication following @[NeonNTT] */
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
- (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k3)
- sub sp, sp, #0x40
- stp d8, d9, [sp]
- stp d10, d11, [sp, #0x10]
- stp d12, d13, [sp, #0x20]
- stp d14, d15, [sp, #0x30]
- mov w14, #0xd01 // =3329
- dup v0.8h, w14
- mov w14, #0xcff // =3327
- dup v2.8h, w14
- add x4, x1, #0x200
- add x5, x2, #0x200
- add x6, x3, #0x100
- add x7, x1, #0x400
- add x8, x2, #0x400
- add x9, x3, #0x200
- mov x13, #0x10 // =16
- ldr q7, [x2, #0x10]
- ldr q20, [x2], #0x20
- ldr q15, [x1, #0x10]
- uzp1 v8.8h, v20.8h, v7.8h
- uzp2 v7.8h, v20.8h, v7.8h
- ld1 { v20.8h }, [x3], #16
- ldr q30, [x1], #0x20
- ldr q11, [x4], #0x20
- uzp1 v16.8h, v30.8h, v15.8h
- uzp2 v15.8h, v30.8h, v15.8h
- smull v30.4s, v16.4h, v7.4h
- smull2 v7.4s, v16.8h, v7.8h
- smull v9.4s, v16.4h, v8.4h
- smull2 v16.4s, v16.8h, v8.8h
- smlal v30.4s, v15.4h, v8.4h
- smlal2 v7.4s, v15.8h, v8.8h
- smlal v9.4s, v15.4h, v20.4h
- smlal2 v16.4s, v15.8h, v20.8h
- ldur q20, [x4, #-0x10]
- ldr q15, [x5], #0x20
- uzp1 v8.8h, v11.8h, v20.8h
- uzp2 v20.8h, v11.8h, v20.8h
- ldur q11, [x5, #-0x10]
- ld1 { v27.8h }, [x6], #16
- uzp1 v10.8h, v15.8h, v11.8h
- uzp2 v15.8h, v15.8h, v11.8h
- smlal v9.4s, v8.4h, v10.4h
- smlal2 v16.4s, v8.8h, v10.8h
- smlal v30.4s, v8.4h, v15.4h
- smlal2 v7.4s, v8.8h, v15.8h
- smlal v9.4s, v20.4h, v27.4h
- smlal2 v16.4s, v20.8h, v27.8h
- smlal v30.4s, v20.4h, v10.4h
- smlal2 v7.4s, v20.8h, v10.8h
- ldr q20, [x7], #0x20
- ldur q15, [x7, #-0x10]
- ldr q8, [x8], #0x20
- uzp1 v11.8h, v20.8h, v15.8h
- uzp2 v20.8h, v20.8h, v15.8h
- ldur q15, [x8, #-0x10]
- ld1 { v27.8h }, [x9], #16
- uzp1 v10.8h, v8.8h, v15.8h
- uzp2 v15.8h, v8.8h, v15.8h
- smlal v9.4s, v11.4h, v10.4h
- smlal2 v16.4s, v11.8h, v10.8h
- smlal v30.4s, v11.4h, v15.4h
- smlal2 v7.4s, v11.8h, v15.8h
- smlal v9.4s, v20.4h, v27.4h
- smlal2 v16.4s, v20.8h, v27.8h
- smlal v30.4s, v20.4h, v10.4h
- smlal2 v7.4s, v20.8h, v10.8h
- ldr q15, [x2], #0x20
- uzp1 v20.8h, v9.8h, v16.8h
- uzp1 v8.8h, v30.8h, v7.8h
- mul v20.8h, v20.8h, v2.8h
- mul v8.8h, v8.8h, v2.8h
- ldr q21, [x4], #0x20
- smlal v9.4s, v20.4h, v0.4h
- smlal2 v16.4s, v20.8h, v0.8h
- smlal v30.4s, v8.4h, v0.4h
- smlal2 v7.4s, v8.8h, v0.8h
- ldur q6, [x4, #-0x10]
- uzp2 v27.8h, v9.8h, v16.8h
- uzp2 v10.8h, v30.8h, v7.8h
- ldur q16, [x2, #-0x10]
- ldr q30, [x1, #0x10]
- ld1 { v9.8h }, [x3], #16
- ldr q1, [x5], #0x20
- ldur q12, [x5, #-0x10]
- ld1 { v24.8h }, [x6], #16
- ldr q19, [x7], #0x20
- ldur q31, [x7, #-0x10]
- ldr q17, [x8], #0x20
- ldur q18, [x8, #-0x10]
- ld1 { v25.8h }, [x9], #16
- sub x13, x13, #0x2
+ .cfi_startproc
+ sub sp, sp, #0x40
+ .cfi_adjust_cfa_offset 0x40
+ stp d8, d9, [sp]
+ .cfi_rel_offset d8, 0x0
+ .cfi_rel_offset d9, 0x8
+ stp d10, d11, [sp, #0x10]
+ .cfi_rel_offset d10, 0x10
+ .cfi_rel_offset d11, 0x18
+ stp d12, d13, [sp, #0x20]
+ .cfi_rel_offset d12, 0x20
+ .cfi_rel_offset d13, 0x28
+ stp d14, d15, [sp, #0x30]
+ .cfi_rel_offset d14, 0x30
+ .cfi_rel_offset d15, 0x38
+ mov w14, #0xd01 // =3329
+ dup v0.8h, w14
+ mov w14, #0xcff // =3327
+ dup v2.8h, w14
+ add x4, x1, #0x200
+ add x5, x2, #0x200
+ add x6, x3, #0x100
+ add x7, x1, #0x400
+ add x8, x2, #0x400
+ add x9, x3, #0x200
+ mov x13, #0x10 // =16
+ ldr q6, [x7], #0x20
+ ldr q19, [x2, #0x10]
+ ldr q23, [x1], #0x20
+ ldur q14, [x1, #-0x10]
+ ldr q17, [x2], #0x20
+ ldr q11, [x4, #0x10]
+ ldur q28, [x7, #-0x10]
+ ld1 { v30.8h }, [x3], #16
+ ldr q26, [x4], #0x20
+ ldr q16, [x8, #0x10]
+ uzp1 v8.8h, v23.8h, v14.8h
+ ldr q22, [x5, #0x10]
+ ldr q18, [x5], #0x20
+ uzp1 v20.8h, v17.8h, v19.8h
+ uzp2 v24.8h, v23.8h, v14.8h
+ ldr q31, [x8], #0x20
+ smull2 v4.4s, v8.8h, v20.8h
+ uzp1 v25.8h, v26.8h, v11.8h
+ smull v13.4s, v8.4h, v20.4h
+ ld1 { v23.8h }, [x6], #16
+ uzp1 v1.8h, v18.8h, v22.8h
+ smlal v13.4s, v24.4h, v30.4h
+ smlal2 v4.4s, v24.8h, v30.8h
+ uzp2 v5.8h, v26.8h, v11.8h
+ smlal2 v4.4s, v25.8h, v1.8h
+ uzp1 v29.8h, v6.8h, v28.8h
+ smlal2 v4.4s, v5.8h, v23.8h
+ ld1 { v7.8h }, [x9], #16
+ smlal v13.4s, v25.4h, v1.4h
+ uzp2 v17.8h, v17.8h, v19.8h
+ uzp1 v27.8h, v31.8h, v16.8h
+ smlal v13.4s, v5.4h, v23.4h
+ uzp2 v22.8h, v18.8h, v22.8h
+ smull v18.4s, v8.4h, v17.4h
+ uzp2 v28.8h, v6.8h, v28.8h
+ smlal v13.4s, v29.4h, v27.4h
+ smlal2 v4.4s, v29.8h, v27.8h
+ uzp2 v26.8h, v31.8h, v16.8h
+ smlal2 v4.4s, v28.8h, v7.8h
+ ldr q3, [x7, #0x10]
+ smlal v13.4s, v28.4h, v7.4h
+ ldr q7, [x1], #0x20
+ smlal v18.4s, v24.4h, v20.4h
+ ldr q15, [x2], #0x20
+ smlal v18.4s, v25.4h, v22.4h
+ smull2 v8.4s, v8.8h, v17.8h
+ ldur q17, [x1, #-0x10]
+ uzp1 v23.8h, v13.8h, v4.8h
+ smlal v18.4s, v5.4h, v1.4h
+ smlal2 v8.4s, v24.8h, v20.8h
+ ld1 { v16.8h }, [x3], #16
+ mul v23.8h, v23.8h, v2.8h
+ ldr q19, [x5, #0x10]
+ ldr q14, [x4, #0x10]
+ ldr q11, [x4], #0x20
+ ldur q20, [x2, #-0x10]
+ smlal2 v8.4s, v25.8h, v22.8h
+ smlal2 v8.4s, v5.8h, v1.8h
+ ldr q22, [x5], #0x20
+ uzp1 v1.8h, v7.8h, v17.8h
+ smlal v18.4s, v29.4h, v26.4h
+ smlal v13.4s, v23.4h, v0.4h
+ uzp2 v31.8h, v11.8h, v14.8h
+ uzp1 v21.8h, v15.8h, v20.8h
+ smlal2 v4.4s, v23.8h, v0.8h
+ ld1 { v9.8h }, [x6], #16
+ smlal v18.4s, v28.4h, v27.4h
+ smlal2 v8.4s, v29.8h, v26.8h
+ ldr q25, [x7], #0x20
+ smull v26.4s, v1.4h, v21.4h
+ uzp1 v24.8h, v22.8h, v19.8h
+ smlal2 v8.4s, v28.8h, v27.8h
+ uzp2 v28.8h, v7.8h, v17.8h
+ uzp1 v29.8h, v11.8h, v14.8h
+ smull2 v23.4s, v1.8h, v21.8h
+ ldr q27, [x8], #0x20
+ smlal2 v23.4s, v28.8h, v16.8h
+ ldur q11, [x8, #-0x10]
+ smlal2 v23.4s, v29.8h, v24.8h
+ uzp2 v7.8h, v13.8h, v4.8h
+ uzp2 v19.8h, v22.8h, v19.8h
+ ld1 { v4.8h }, [x9], #16
+ smlal2 v23.4s, v31.8h, v9.8h
+ uzp1 v13.8h, v25.8h, v3.8h
+ uzp1 v14.8h, v18.8h, v8.8h
+ smlal v26.4s, v28.4h, v16.4h
+ uzp2 v17.8h, v27.8h, v11.8h
+ uzp2 v20.8h, v15.8h, v20.8h
+ mul v14.8h, v14.8h, v2.8h
+ sub x13, x13, #0x2
-polyvec_basemul_acc_montgomery_cached_k3_loop:
- ldr q20, [x1], #0x20
- uzp1 v7.8h, v15.8h, v16.8h
- uzp2 v15.8h, v15.8h, v16.8h
- uzp1 v8.8h, v20.8h, v30.8h
- uzp2 v20.8h, v20.8h, v30.8h
- smull v30.4s, v8.4h, v15.4h
- smull2 v15.4s, v8.8h, v15.8h
- smull v11.4s, v8.4h, v7.4h
- smull2 v8.4s, v8.8h, v7.8h
- smlal v30.4s, v20.4h, v7.4h
- smlal2 v15.4s, v20.8h, v7.8h
- smlal v11.4s, v20.4h, v9.4h
- smlal2 v8.4s, v20.8h, v9.8h
- uzp1 v7.8h, v21.8h, v6.8h
- uzp2 v20.8h, v21.8h, v6.8h
- uzp1 v16.8h, v1.8h, v12.8h
- uzp2 v9.8h, v1.8h, v12.8h
- smlal v11.4s, v7.4h, v16.4h
- smlal2 v8.4s, v7.8h, v16.8h
- smlal v30.4s, v7.4h, v9.4h
- smlal2 v15.4s, v7.8h, v9.8h
- smlal v11.4s, v20.4h, v24.4h
- smlal2 v8.4s, v20.8h, v24.8h
- smlal v30.4s, v20.4h, v16.4h
- smlal2 v15.4s, v20.8h, v16.8h
- uzp1 v7.8h, v19.8h, v31.8h
- uzp2 v20.8h, v19.8h, v31.8h
- uzp1 v16.8h, v17.8h, v18.8h
- uzp2 v9.8h, v17.8h, v18.8h
- smlal v11.4s, v7.4h, v16.4h
- smlal2 v8.4s, v7.8h, v16.8h
- smlal v30.4s, v7.4h, v9.4h
- smlal2 v15.4s, v7.8h, v9.8h
- smlal v11.4s, v20.4h, v25.4h
- smlal2 v8.4s, v20.8h, v25.8h
- smlal v30.4s, v20.4h, v16.4h
- smlal2 v15.4s, v20.8h, v16.8h
- ldr q16, [x2, #0x10]
- uzp1 v7.8h, v11.8h, v8.8h
- uzp1 v20.8h, v30.8h, v15.8h
- mul v7.8h, v7.8h, v2.8h
- mul v20.8h, v20.8h, v2.8h
- zip2 v9.8h, v27.8h, v10.8h
- zip1 v27.8h, v27.8h, v10.8h
- smlal v11.4s, v7.4h, v0.4h
- smlal2 v8.4s, v7.8h, v0.8h
- smlal v30.4s, v20.4h, v0.4h
- smlal2 v15.4s, v20.8h, v0.8h
- str q27, [x0], #0x20
- uzp2 v27.8h, v11.8h, v8.8h
- stur q9, [x0, #-0x10]
- uzp2 v10.8h, v30.8h, v15.8h
- ldr q30, [x1, #0x10]
- ldr q15, [x2], #0x20
- ld1 { v9.8h }, [x3], #16
- ldr q21, [x4], #0x20
- ldur q6, [x4, #-0x10]
- ldr q1, [x5], #0x20
- ldur q12, [x5, #-0x10]
- ld1 { v24.8h }, [x6], #16
- ldr q19, [x7], #0x20
- ldur q31, [x7, #-0x10]
- ldr q17, [x8], #0x20
- ldur q18, [x8, #-0x10]
- ld1 { v25.8h }, [x9], #16
- sub x13, x13, #0x1
- cbnz x13, polyvec_basemul_acc_montgomery_cached_k3_loop
- ldr q7, [x1], #0x20
- uzp1 v20.8h, v15.8h, v16.8h
- uzp2 v15.8h, v15.8h, v16.8h
- uzp1 v23.8h, v7.8h, v30.8h
- uzp2 v11.8h, v7.8h, v30.8h
- smull2 v8.4s, v23.8h, v20.8h
- smull v5.4s, v23.4h, v20.4h
- smull2 v30.4s, v23.8h, v15.8h
- uzp1 v28.8h, v1.8h, v12.8h
- smlal2 v8.4s, v11.8h, v9.8h
- smlal v5.4s, v11.4h, v9.4h
- uzp1 v3.8h, v21.8h, v6.8h
- smull v16.4s, v23.4h, v15.4h
- smlal2 v8.4s, v3.8h, v28.8h
- smlal v5.4s, v3.4h, v28.4h
- uzp2 v29.8h, v21.8h, v6.8h
- uzp1 v7.8h, v17.8h, v18.8h
- smlal2 v8.4s, v29.8h, v24.8h
- uzp1 v14.8h, v19.8h, v31.8h
- smlal v16.4s, v11.4h, v20.4h
- smlal2 v30.4s, v11.8h, v20.8h
- smlal2 v8.4s, v14.8h, v7.8h
- uzp2 v20.8h, v1.8h, v12.8h
- uzp2 v21.8h, v19.8h, v31.8h
- smlal2 v30.4s, v3.8h, v20.8h
- smlal v16.4s, v3.4h, v20.4h
- smlal v5.4s, v29.4h, v24.4h
- uzp2 v9.8h, v17.8h, v18.8h
- smlal2 v30.4s, v29.8h, v28.8h
- smlal v16.4s, v29.4h, v28.4h
- smlal v5.4s, v14.4h, v7.4h
- smlal2 v8.4s, v21.8h, v25.8h
- smlal2 v30.4s, v14.8h, v9.8h
- smlal v16.4s, v14.4h, v9.4h
- smlal v5.4s, v21.4h, v25.4h
- zip1 v20.8h, v27.8h, v10.8h
- smlal2 v30.4s, v21.8h, v7.8h
- smlal v16.4s, v21.4h, v7.4h
- uzp1 v7.8h, v5.8h, v8.8h
- str q20, [x0], #0x20
- mul v15.8h, v7.8h, v2.8h
- uzp1 v7.8h, v16.8h, v30.8h
- zip2 v31.8h, v27.8h, v10.8h
- mul v20.8h, v7.8h, v2.8h
- smlal v5.4s, v15.4h, v0.4h
- smlal2 v8.4s, v15.8h, v0.8h
- stur q31, [x0, #-0x10]
- smlal2 v30.4s, v20.8h, v0.8h
- smlal v16.4s, v20.4h, v0.4h
- uzp2 v15.8h, v5.8h, v8.8h
- uzp2 v20.8h, v16.8h, v30.8h
- zip1 v7.8h, v15.8h, v20.8h
- zip2 v20.8h, v15.8h, v20.8h
- str q7, [x0], #0x20
- stur q20, [x0, #-0x10]
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
- ldp d12, d13, [sp, #0x20]
- ldp d14, d15, [sp, #0x30]
- add sp, sp, #0x40
+Lpolyvec_basemul_acc_montgomery_cached_k3_loop_start:
+ uzp1 v6.8h, v27.8h, v11.8h
+ smlal v26.4s, v29.4h, v24.4h
+ uzp2 v16.8h, v25.8h, v3.8h
+ smlal v26.4s, v31.4h, v9.4h
+ ldr q3, [x7, #0x10]
+ smlal v26.4s, v13.4h, v6.4h
+ smlal2 v8.4s, v14.8h, v0.8h
+ ldr q27, [x8], #0x20
+ smlal v18.4s, v14.4h, v0.4h
+ ldr q25, [x7], #0x20
+ smlal2 v23.4s, v13.8h, v6.8h
+ ldr q11, [x1], #0x20
+ smlal2 v23.4s, v16.8h, v4.8h
+ smlal v26.4s, v16.4h, v4.4h
+ ldur q22, [x1, #-0x10]
+ uzp2 v30.8h, v18.8h, v8.8h
+ smull v18.4s, v1.4h, v20.4h
+ smlal v18.4s, v28.4h, v21.4h
+ ldr q14, [x2], #0x20
+ smlal v18.4s, v29.4h, v19.4h
+ zip1 v5.8h, v7.8h, v30.8h
+ uzp1 v4.8h, v26.8h, v23.8h
+ smull2 v8.4s, v1.8h, v20.8h
+ zip2 v10.8h, v7.8h, v30.8h
+ smlal v18.4s, v31.4h, v24.4h
+ mul v12.8h, v4.8h, v2.8h
+ ldr q4, [x5, #0x10]
+ ldr q20, [x4, #0x10]
+ ldr q1, [x4], #0x20
+ ldur q30, [x2, #-0x10]
+ smlal2 v8.4s, v28.8h, v21.8h
+ smlal2 v8.4s, v29.8h, v19.8h
+ ldr q19, [x5], #0x20
+ smlal2 v8.4s, v31.8h, v24.8h
+ ld1 { v15.8h }, [x3], #16
+ uzp2 v31.8h, v1.8h, v20.8h
+ smlal v26.4s, v12.4h, v0.4h
+ smlal2 v23.4s, v12.8h, v0.8h
+ uzp1 v21.8h, v14.8h, v30.8h
+ uzp1 v29.8h, v1.8h, v20.8h
+ uzp1 v1.8h, v11.8h, v22.8h
+ smlal2 v8.4s, v13.8h, v17.8h
+ ld1 { v9.8h }, [x6], #16
+ smlal v18.4s, v13.4h, v17.4h
+ uzp1 v24.8h, v19.8h, v4.8h
+ uzp2 v7.8h, v26.8h, v23.8h
+ smull v26.4s, v1.4h, v21.4h
+ smlal v18.4s, v16.4h, v6.4h
+ uzp2 v19.8h, v19.8h, v4.8h
+ smlal2 v8.4s, v16.8h, v6.8h
+ uzp2 v28.8h, v11.8h, v22.8h
+ smull2 v23.4s, v1.8h, v21.8h
+ uzp1 v13.8h, v25.8h, v3.8h
+ smlal2 v23.4s, v28.8h, v15.8h
+ ldur q11, [x8, #-0x10]
+ smlal2 v23.4s, v29.8h, v24.8h
+ ld1 { v4.8h }, [x9], #16
+ smlal2 v23.4s, v31.8h, v9.8h
+ uzp1 v12.8h, v18.8h, v8.8h
+ uzp2 v20.8h, v14.8h, v30.8h
+ smlal v26.4s, v28.4h, v15.4h
+ str q5, [x0], #0x20
+ mul v14.8h, v12.8h, v2.8h
+ stur q10, [x0, #-0x10]
+ uzp2 v17.8h, v27.8h, v11.8h
+ subs x13, x13, #0x1
+ cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k3_loop_start
+ uzp2 v3.8h, v25.8h, v3.8h
+ smull2 v16.4s, v1.8h, v20.8h
+ smull v25.4s, v1.4h, v20.4h
+ uzp1 v22.8h, v27.8h, v11.8h
+ smlal2 v16.4s, v28.8h, v21.8h
+ smlal v25.4s, v28.4h, v21.4h
+ smlal2 v16.4s, v29.8h, v19.8h
+ smlal v25.4s, v29.4h, v19.4h
+ smlal2 v16.4s, v31.8h, v24.8h
+ smlal v25.4s, v31.4h, v24.4h
+ smlal v25.4s, v13.4h, v17.4h
+ smlal2 v16.4s, v13.8h, v17.8h
+ smlal2 v16.4s, v3.8h, v22.8h
+ smlal v25.4s, v3.4h, v22.4h
+ smlal2 v23.4s, v13.8h, v22.8h
+ smlal v26.4s, v29.4h, v24.4h
+ smlal v26.4s, v31.4h, v9.4h
+ smlal v26.4s, v13.4h, v22.4h
+ uzp1 v10.8h, v25.8h, v16.8h
+ smlal2 v23.4s, v3.8h, v4.8h
+ smlal v26.4s, v3.4h, v4.4h
+ mul v13.8h, v10.8h, v2.8h
+ smlal v18.4s, v14.4h, v0.4h
+ smlal2 v8.4s, v14.8h, v0.8h
+ uzp1 v3.8h, v26.8h, v23.8h
+ mul v24.8h, v3.8h, v2.8h
+ uzp2 v17.8h, v18.8h, v8.8h
+ smlal v25.4s, v13.4h, v0.4h
+ smlal2 v16.4s, v13.8h, v0.8h
+ zip1 v21.8h, v7.8h, v17.8h
+ zip2 v20.8h, v7.8h, v17.8h
+ smlal2 v23.4s, v24.8h, v0.8h
+ str q21, [x0], #0x20
+ smlal v26.4s, v24.4h, v0.4h
+ uzp2 v13.8h, v25.8h, v16.8h
+ stur q20, [x0, #-0x10]
+ uzp2 v23.8h, v26.8h, v23.8h
+ zip1 v18.8h, v23.8h, v13.8h
+ zip2 v13.8h, v23.8h, v13.8h
+ str q18, [x0], #0x20
+ stur q13, [x0, #-0x10]
+ ldp d8, d9, [sp]
+ .cfi_restore d8
+ .cfi_restore d9
+ ldp d10, d11, [sp, #0x10]
+ .cfi_restore d10
+ .cfi_restore d11
+ ldp d12, d13, [sp, #0x20]
+ .cfi_restore d12
+ .cfi_restore d13
+ ldp d14, d15, [sp, #0x30]
+ .cfi_restore d14
+ .cfi_restore d15
+ add sp, sp, #0x40
+ .cfi_adjust_cfa_offset -0x40
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k3)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED && \
(MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S
index 78f8693774..7c09167b17 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S
@@ -12,300 +12,357 @@
* https://eprint.iacr.org/2021/986
*/
+/*yaml
+ Name: polyvec_basemul_acc_montgomery_cached_asm_k4
+ Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=4
+ Signature: void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(int16_t r[256], const int16_t a[1024], const int16_t b[1024], const int16_t b_cache[512])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: write-only
+ c_parameter: int16_t r[256]
+ description: Output polynomial
+ x1:
+ type: buffer
+ size_bytes: 2048
+ permissions: read-only
+ c_parameter: const int16_t a[1024]
+ description: Input polynomial vector a
+ x2:
+ type: buffer
+ size_bytes: 2048
+ permissions: read-only
+ c_parameter: const int16_t b[1024]
+ description: Input polynomial vector b
+ x3:
+ type: buffer
+ size_bytes: 1024
+ permissions: read-only
+ c_parameter: const int16_t b_cache[512]
+ description: Cached values for b
+ Stack:
+ bytes: 64
+ description: saving callee-saved Neon registers
+*/
+
/* Re-implementation of asymmetric base multiplication following @[NeonNTT] */
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
- (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k4)
- sub sp, sp, #0x40
- stp d8, d9, [sp]
- stp d10, d11, [sp, #0x10]
- stp d12, d13, [sp, #0x20]
- stp d14, d15, [sp, #0x30]
- mov w14, #0xd01 // =3329
- dup v0.8h, w14
- mov w14, #0xcff // =3327
- dup v2.8h, w14
- add x4, x1, #0x200
- add x5, x2, #0x200
- add x6, x3, #0x100
- add x7, x1, #0x400
- add x8, x2, #0x400
- add x9, x3, #0x200
- add x10, x1, #0x600
- add x11, x2, #0x600
- add x12, x3, #0x300
- mov x13, #0x10 // =16
- ldr q23, [x2, #0x10]
- ldr q19, [x2], #0x20
- ldr q17, [x5], #0x20
- uzp2 v13.8h, v19.8h, v23.8h
- uzp1 v19.8h, v19.8h, v23.8h
- ldur q23, [x5, #-0x10]
- ldr q30, [x1, #0x10]
- uzp2 v9.8h, v17.8h, v23.8h
- uzp1 v23.8h, v17.8h, v23.8h
- ldr q17, [x1], #0x20
- ldr q10, [x7, #0x10]
- uzp1 v12.8h, v17.8h, v30.8h
- uzp2 v17.8h, v17.8h, v30.8h
- smull2 v30.4s, v12.8h, v13.8h
- smull v13.4s, v12.4h, v13.4h
- smull2 v22.4s, v12.8h, v19.8h
- smull v12.4s, v12.4h, v19.4h
- smlal2 v30.4s, v17.8h, v19.8h
- smlal v13.4s, v17.4h, v19.4h
- ldr q19, [x4], #0x20
- ldur q16, [x4, #-0x10]
- ld1 { v8.8h }, [x3], #16
- uzp1 v26.8h, v19.8h, v16.8h
- uzp2 v19.8h, v19.8h, v16.8h
- smlal2 v30.4s, v26.8h, v9.8h
- smlal v13.4s, v26.4h, v9.4h
- smlal2 v22.4s, v17.8h, v8.8h
- smlal v12.4s, v17.4h, v8.4h
- smlal2 v30.4s, v19.8h, v23.8h
- smlal v13.4s, v19.4h, v23.4h
- smlal2 v22.4s, v26.8h, v23.8h
- smlal v12.4s, v26.4h, v23.4h
- ldr q23, [x7], #0x20
- ldr q17, [x8, #0x10]
- uzp1 v9.8h, v23.8h, v10.8h
- uzp2 v23.8h, v23.8h, v10.8h
- ldr q10, [x10], #0x20
- ldur q16, [x10, #-0x10]
- ld1 { v8.8h }, [x12], #16
- uzp1 v26.8h, v10.8h, v16.8h
- uzp2 v10.8h, v10.8h, v16.8h
- ld1 { v16.8h }, [x6], #16
- ldr q3, [x11, #0x10]
- smlal2 v22.4s, v19.8h, v16.8h
- smlal v12.4s, v19.4h, v16.4h
- ldr q19, [x11], #0x20
- ld1 { v16.8h }, [x9], #16
- uzp1 v4.8h, v19.8h, v3.8h
- uzp2 v19.8h, v19.8h, v3.8h
- ldr q3, [x8], #0x20
- ldr q31, [x2], #0x20
- uzp1 v6.8h, v3.8h, v17.8h
- uzp2 v17.8h, v3.8h, v17.8h
- smlal2 v22.4s, v9.8h, v6.8h
- smlal2 v30.4s, v9.8h, v17.8h
- smlal v13.4s, v9.4h, v17.4h
- smlal v12.4s, v9.4h, v6.4h
- smlal2 v22.4s, v23.8h, v16.8h
- smlal2 v30.4s, v23.8h, v6.8h
- smlal v13.4s, v23.4h, v6.4h
- smlal v12.4s, v23.4h, v16.4h
- smlal2 v22.4s, v26.8h, v4.8h
- smlal2 v30.4s, v26.8h, v19.8h
- smlal v13.4s, v26.4h, v19.4h
- smlal v12.4s, v26.4h, v4.4h
- smlal2 v22.4s, v10.8h, v8.8h
- smlal2 v30.4s, v10.8h, v4.8h
- smlal v13.4s, v10.4h, v4.4h
- smlal v12.4s, v10.4h, v8.4h
- ldur q19, [x2, #-0x10]
- uzp1 v23.8h, v13.8h, v30.8h
- uzp1 v17.8h, v12.8h, v22.8h
- mul v23.8h, v23.8h, v2.8h
- uzp2 v21.8h, v31.8h, v19.8h
- uzp1 v19.8h, v31.8h, v19.8h
- mul v17.8h, v17.8h, v2.8h
- smlal v13.4s, v23.4h, v0.4h
- smlal2 v30.4s, v23.8h, v0.8h
- ldr q23, [x5], #0x20
- smlal2 v22.4s, v17.8h, v0.8h
- uzp2 v15.8h, v13.8h, v30.8h
- smlal v12.4s, v17.4h, v0.4h
- ldur q17, [x5, #-0x10]
- ldr q13, [x1, #0x10]
- uzp2 v27.8h, v23.8h, v17.8h
- uzp1 v28.8h, v23.8h, v17.8h
- uzp2 v7.8h, v12.8h, v22.8h
- ldr q23, [x1], #0x20
- zip1 v5.8h, v7.8h, v15.8h
- ldr q3, [x7, #0x10]
- uzp1 v31.8h, v23.8h, v13.8h
- uzp2 v16.8h, v23.8h, v13.8h
- smull2 v24.4s, v31.8h, v21.8h
- ldr q6, [x8, #0x10]
- ldr q23, [x10], #0x20
- smlal2 v24.4s, v16.8h, v19.8h
- ldur q17, [x10, #-0x10]
- ld1 { v22.8h }, [x12], #16
- uzp1 v30.8h, v23.8h, v17.8h
- uzp2 v11.8h, v23.8h, v17.8h
- ldr q23, [x4], #0x20
- ldur q17, [x4, #-0x10]
- ldr q4, [x7], #0x20
- uzp1 v20.8h, v23.8h, v17.8h
- uzp2 v26.8h, v23.8h, v17.8h
- uzp1 v9.8h, v4.8h, v3.8h
- smlal2 v24.4s, v20.8h, v27.8h
- ld1 { v8.8h }, [x6], #16
- ldr q25, [x11, #0x10]
- ldr q29, [x11], #0x20
- ld1 { v12.8h }, [x9], #16
- uzp1 v10.8h, v29.8h, v25.8h
- ldr q14, [x8], #0x20
- ld1 { v23.8h }, [x3], #16
- sub x13, x13, #0x2
+ .cfi_startproc
+ sub sp, sp, #0x40
+ .cfi_adjust_cfa_offset 0x40
+ stp d8, d9, [sp]
+ .cfi_rel_offset d8, 0x0
+ .cfi_rel_offset d9, 0x8
+ stp d10, d11, [sp, #0x10]
+ .cfi_rel_offset d10, 0x10
+ .cfi_rel_offset d11, 0x18
+ stp d12, d13, [sp, #0x20]
+ .cfi_rel_offset d12, 0x20
+ .cfi_rel_offset d13, 0x28
+ stp d14, d15, [sp, #0x30]
+ .cfi_rel_offset d14, 0x30
+ .cfi_rel_offset d15, 0x38
+ mov w14, #0xd01 // =3329
+ dup v0.8h, w14
+ mov w14, #0xcff // =3327
+ dup v2.8h, w14
+ add x4, x1, #0x200
+ add x5, x2, #0x200
+ add x6, x3, #0x100
+ add x7, x1, #0x400
+ add x8, x2, #0x400
+ add x9, x3, #0x200
+ add x10, x1, #0x600
+ add x11, x2, #0x600
+ add x12, x3, #0x300
+ mov x13, #0x10 // =16
+ ldr q28, [x1], #0x20
+ ldur q5, [x1, #-0x10]
+ ldr q31, [x2], #0x20
+ ldur q27, [x2, #-0x10]
+ ldr q7, [x5], #0x20
+ ldr q10, [x4], #0x20
+ ldur q18, [x5, #-0x10]
+ ldur q9, [x4, #-0x10]
+ uzp1 v11.8h, v28.8h, v5.8h
+ uzp2 v19.8h, v28.8h, v5.8h
+ uzp2 v4.8h, v31.8h, v27.8h
+ uzp1 v1.8h, v31.8h, v27.8h
+ ldr q29, [x7], #0x20
+ ldr q28, [x8, #0x10]
+ uzp1 v24.8h, v10.8h, v9.8h
+ uzp1 v17.8h, v7.8h, v18.8h
+ uzp2 v7.8h, v7.8h, v18.8h
+ ldr q21, [x8], #0x20
+ uzp2 v27.8h, v10.8h, v9.8h
+ ldur q6, [x7, #-0x10]
+ smull v18.4s, v11.4h, v4.4h
+ ld1 { v9.8h }, [x3], #16
+ smull2 v8.4s, v11.8h, v4.8h
+ ldr q16, [x11], #0x20
+ smlal2 v8.4s, v19.8h, v1.8h
+ ldur q14, [x11, #-0x10]
+ smlal v18.4s, v19.4h, v1.4h
+ uzp1 v10.8h, v21.8h, v28.8h
+ smlal v18.4s, v24.4h, v7.4h
+ ldr q4, [x10], #0x20
+ smlal2 v8.4s, v24.8h, v7.8h
+ ld1 { v12.8h }, [x6], #16
+ smull2 v23.4s, v11.8h, v1.8h
+ uzp2 v13.8h, v29.8h, v6.8h
+ smull v26.4s, v11.4h, v1.4h
+ uzp1 v29.8h, v29.8h, v6.8h
+ smlal v26.4s, v19.4h, v9.4h
+ ldur q15, [x10, #-0x10]
+ smlal2 v23.4s, v19.8h, v9.8h
+ uzp2 v9.8h, v21.8h, v28.8h
+ smlal v18.4s, v27.4h, v17.4h
+ uzp2 v6.8h, v16.8h, v14.8h
+ uzp1 v21.8h, v16.8h, v14.8h
+ smlal2 v8.4s, v27.8h, v17.8h
+ smlal2 v8.4s, v29.8h, v9.8h
+ uzp1 v30.8h, v4.8h, v15.8h
+ uzp2 v16.8h, v4.8h, v15.8h
+ smlal v18.4s, v29.4h, v9.4h
+ smlal2 v8.4s, v13.8h, v10.8h
+ ld1 { v15.8h }, [x9], #16
+ smlal v18.4s, v13.4h, v10.4h
+ ldr q11, [x4], #0x20
+ smlal v18.4s, v30.4h, v6.4h
+ ldr q7, [x2], #0x20
+ smlal2 v8.4s, v30.8h, v6.8h
+ ld1 { v9.8h }, [x12], #16
+ smlal2 v23.4s, v24.8h, v17.8h
+ ldur q4, [x2, #-0x10]
+ smlal v26.4s, v24.4h, v17.4h
+ ldur q25, [x4, #-0x10]
+ smlal2 v8.4s, v16.8h, v21.8h
+ ldr q5, [x5], #0x20
+ smlal v18.4s, v16.4h, v21.4h
+ ldur q22, [x5, #-0x10]
+ smlal v26.4s, v27.4h, v12.4h
+ ldr q19, [x1, #0x10]
+ smlal v26.4s, v29.4h, v10.4h
+ ld1 { v20.8h }, [x3], #16
+ smlal v26.4s, v13.4h, v15.4h
+ uzp1 v24.8h, v7.8h, v4.8h
+ smlal2 v23.4s, v27.8h, v12.8h
+ uzp1 v28.8h, v18.8h, v8.8h
+ smlal v26.4s, v30.4h, v21.4h
+ uzp2 v27.8h, v11.8h, v25.8h
+ smlal2 v23.4s, v29.8h, v10.8h
+ uzp2 v31.8h, v7.8h, v4.8h
+ smlal2 v23.4s, v13.8h, v15.8h
+ uzp1 v14.8h, v5.8h, v22.8h
+ uzp1 v17.8h, v11.8h, v25.8h
+ smlal v26.4s, v16.4h, v9.4h
+ mul v29.8h, v28.8h, v2.8h
+ sub x13, x13, #0x2
-polyvec_basemul_acc_montgomery_cached_k4_loop:
- smlal2 v24.4s, v26.8h, v28.8h
- uzp2 v4.8h, v4.8h, v3.8h
- smull2 v13.4s, v31.8h, v19.8h
- ldr q3, [x2], #0x20
- uzp2 v1.8h, v29.8h, v25.8h
- smlal2 v13.4s, v16.8h, v23.8h
- ldur q17, [x2, #-0x10]
- smull v18.4s, v31.4h, v19.4h
- smlal2 v13.4s, v20.8h, v28.8h
- smull v29.4s, v31.4h, v21.4h
- ldr q21, [x5], #0x20
- smlal2 v13.4s, v26.8h, v8.8h
- smlal v29.4s, v16.4h, v19.4h
- ldur q19, [x5, #-0x10]
- smlal v18.4s, v16.4h, v23.4h
- smlal v29.4s, v20.4h, v27.4h
- uzp1 v31.8h, v14.8h, v6.8h
- uzp2 v27.8h, v21.8h, v19.8h
- smlal v18.4s, v20.4h, v28.4h
- ldr q25, [x1, #0x10]
- smlal v29.4s, v26.4h, v28.4h
- smlal v18.4s, v26.4h, v8.4h
- uzp2 v26.8h, v14.8h, v6.8h
- smlal2 v13.4s, v9.8h, v31.8h
- smlal2 v24.4s, v9.8h, v26.8h
- smlal v29.4s, v9.4h, v26.4h
- smlal v18.4s, v9.4h, v31.4h
- smlal2 v13.4s, v4.8h, v12.8h
- smlal2 v24.4s, v4.8h, v31.8h
- smlal v29.4s, v4.4h, v31.4h
- smlal v18.4s, v4.4h, v12.4h
- smlal2 v13.4s, v30.8h, v10.8h
- smlal2 v24.4s, v30.8h, v1.8h
- smlal v29.4s, v30.4h, v1.4h
- smlal v18.4s, v30.4h, v10.4h
- smlal2 v13.4s, v11.8h, v22.8h
- smlal2 v24.4s, v11.8h, v10.8h
- smlal v29.4s, v11.4h, v10.4h
- smlal v18.4s, v11.4h, v22.4h
- ldr q22, [x1], #0x20
- uzp1 v31.8h, v29.8h, v24.8h
- uzp1 v28.8h, v21.8h, v19.8h
- mul v19.8h, v31.8h, v2.8h
- uzp1 v31.8h, v22.8h, v25.8h
- uzp2 v16.8h, v22.8h, v25.8h
- uzp2 v21.8h, v3.8h, v17.8h
- smlal v29.4s, v19.4h, v0.4h
- smlal2 v24.4s, v19.8h, v0.8h
- uzp1 v19.8h, v3.8h, v17.8h
- uzp1 v26.8h, v18.8h, v13.8h
- zip2 v14.8h, v7.8h, v15.8h
- mul v23.8h, v26.8h, v2.8h
- uzp2 v15.8h, v29.8h, v24.8h
- smull2 v24.4s, v31.8h, v21.8h
- str q14, [x0, #0x10]
- ldr q3, [x7, #0x10]
- ldr q6, [x8, #0x10]
- ldr q8, [x10], #0x20
- ldur q26, [x10, #-0x10]
- ld1 { v22.8h }, [x12], #16
- uzp1 v30.8h, v8.8h, v26.8h
- uzp2 v11.8h, v8.8h, v26.8h
- ldr q8, [x4], #0x20
- ldur q26, [x4, #-0x10]
- ldr q4, [x7], #0x20
- uzp1 v20.8h, v8.8h, v26.8h
- uzp2 v26.8h, v8.8h, v26.8h
- ld1 { v8.8h }, [x6], #16
- uzp1 v9.8h, v4.8h, v3.8h
- ldr q25, [x11, #0x10]
- ldr q29, [x11], #0x20
- ld1 { v12.8h }, [x9], #16
- ldr q14, [x8], #0x20
- smlal2 v24.4s, v16.8h, v19.8h
- smlal2 v13.4s, v23.8h, v0.8h
- smlal v18.4s, v23.4h, v0.4h
- ld1 { v23.8h }, [x3], #16
- smlal2 v24.4s, v20.8h, v27.8h
- uzp2 v7.8h, v18.8h, v13.8h
- uzp1 v10.8h, v29.8h, v25.8h
- str q5, [x0], #0x20
- zip1 v5.8h, v7.8h, v15.8h
- sub x13, x13, #0x1
- cbnz x13, polyvec_basemul_acc_montgomery_cached_k4_loop
- smull2 v17.4s, v31.8h, v19.8h
- uzp2 v1.8h, v14.8h, v6.8h
- smull v18.4s, v31.4h, v21.4h
- smlal2 v24.4s, v26.8h, v28.8h
- smlal2 v17.4s, v16.8h, v23.8h
- smull v21.4s, v31.4h, v19.4h
- smlal v18.4s, v16.4h, v19.4h
- uzp2 v31.8h, v4.8h, v3.8h
- uzp1 v3.8h, v14.8h, v6.8h
- smlal v21.4s, v16.4h, v23.4h
- smlal v18.4s, v20.4h, v27.4h
- uzp2 v14.8h, v29.8h, v25.8h
- smlal2 v17.4s, v20.8h, v28.8h
- smlal v21.4s, v20.4h, v28.4h
- smlal v18.4s, v26.4h, v28.4h
- smlal2 v24.4s, v9.8h, v1.8h
- smlal2 v17.4s, v26.8h, v8.8h
- smlal v21.4s, v26.4h, v8.4h
- smlal v18.4s, v9.4h, v1.4h
- smlal2 v24.4s, v31.8h, v3.8h
- smlal2 v17.4s, v9.8h, v3.8h
- smlal v21.4s, v9.4h, v3.4h
- smlal v18.4s, v31.4h, v3.4h
- smlal2 v24.4s, v30.8h, v14.8h
- smlal2 v17.4s, v31.8h, v12.8h
- smlal v21.4s, v31.4h, v12.4h
- smlal v18.4s, v30.4h, v14.4h
- smlal2 v24.4s, v11.8h, v10.8h
- smlal2 v17.4s, v30.8h, v10.8h
- smlal v21.4s, v30.4h, v10.4h
- smlal v18.4s, v11.4h, v10.4h
- zip2 v19.8h, v7.8h, v15.8h
- smlal2 v17.4s, v11.8h, v22.8h
- smlal v21.4s, v11.4h, v22.4h
- uzp1 v23.8h, v18.8h, v24.8h
- str q19, [x0, #0x10]
- mul v19.8h, v23.8h, v2.8h
- uzp1 v23.8h, v21.8h, v17.8h
- str q5, [x0], #0x20
- mul v26.8h, v23.8h, v2.8h
- smlal v18.4s, v19.4h, v0.4h
- smlal2 v24.4s, v19.8h, v0.8h
- smlal v21.4s, v26.4h, v0.4h
- smlal2 v17.4s, v26.8h, v0.8h
- uzp2 v13.8h, v18.8h, v24.8h
- uzp2 v19.8h, v21.8h, v17.8h
- zip1 v23.8h, v19.8h, v13.8h
- zip2 v19.8h, v19.8h, v13.8h
- str q23, [x0], #0x20
- stur q19, [x0, #-0x10]
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
- ldp d12, d13, [sp, #0x20]
- ldp d14, d15, [sp, #0x30]
- add sp, sp, #0x40
+Lpolyvec_basemul_acc_montgomery_cached_k4_loop_start:
+ smlal2 v23.4s, v30.8h, v21.8h
+ ldr q11, [x1], #0x20
+ uzp2 v15.8h, v5.8h, v22.8h
+ smlal v18.4s, v29.4h, v0.4h
+ ldr q12, [x7], #0x20
+ smlal2 v8.4s, v29.8h, v0.8h
+ ldur q3, [x7, #-0x10]
+ ldr q21, [x8], #0x20
+ uzp1 v29.8h, v11.8h, v19.8h
+ ldur q13, [x8, #-0x10]
+ uzp2 v5.8h, v11.8h, v19.8h
+ smlal2 v23.4s, v16.8h, v9.8h
+ uzp2 v28.8h, v18.8h, v8.8h
+ smull2 v8.4s, v29.8h, v31.8h
+ smlal2 v8.4s, v5.8h, v24.8h
+ uzp1 v7.8h, v12.8h, v3.8h
+ smlal2 v8.4s, v17.8h, v15.8h
+ uzp2 v11.8h, v21.8h, v13.8h
+ uzp1 v4.8h, v26.8h, v23.8h
+ smlal2 v8.4s, v27.8h, v14.8h
+ smlal2 v8.4s, v7.8h, v11.8h
+ mul v6.8h, v4.8h, v2.8h
+ ldr q19, [x11], #0x20
+ uzp2 v25.8h, v12.8h, v3.8h
+ ldr q12, [x10], #0x20
+ smull v18.4s, v29.4h, v31.4h
+ ldur q3, [x10, #-0x10]
+ smlal v18.4s, v5.4h, v24.4h
+ uzp1 v4.8h, v21.8h, v13.8h
+ smlal v18.4s, v17.4h, v15.4h
+ ldur q13, [x11, #-0x10]
+ ld1 { v1.8h }, [x6], #16
+ smlal v26.4s, v6.4h, v0.4h
+ smlal2 v23.4s, v6.8h, v0.8h
+ ld1 { v10.8h }, [x9], #16
+ smlal v18.4s, v27.4h, v14.4h
+ uzp1 v30.8h, v12.8h, v3.8h
+ smlal2 v8.4s, v25.8h, v4.8h
+ uzp2 v31.8h, v19.8h, v13.8h
+ smlal v18.4s, v7.4h, v11.4h
+ ld1 { v9.8h }, [x12], #16
+ smlal v18.4s, v25.4h, v4.4h
+ uzp1 v21.8h, v19.8h, v13.8h
+ uzp2 v16.8h, v12.8h, v3.8h
+ smlal v18.4s, v30.4h, v31.4h
+ smlal2 v8.4s, v30.8h, v31.8h
+ uzp2 v31.8h, v26.8h, v23.8h
+ smlal2 v8.4s, v16.8h, v21.8h
+ smlal v18.4s, v16.4h, v21.4h
+ zip1 v15.8h, v31.8h, v28.8h
+ ldr q19, [x1, #0x10]
+ smull2 v23.4s, v29.8h, v24.8h
+ smull v26.4s, v29.4h, v24.4h
+ ldr q3, [x2, #0x10]
+ smlal v26.4s, v5.4h, v20.4h
+ ldr q11, [x2], #0x20
+ uzp1 v6.8h, v18.8h, v8.8h
+ smlal v26.4s, v17.4h, v14.4h
+ smlal v26.4s, v27.4h, v1.4h
+ zip2 v13.8h, v31.8h, v28.8h
+ smlal v26.4s, v7.4h, v4.4h
+ str q15, [x0], #0x20
+ smlal v26.4s, v25.4h, v10.4h
+ stur q13, [x0, #-0x10]
+ mul v29.8h, v6.8h, v2.8h
+ uzp1 v24.8h, v11.8h, v3.8h
+ uzp2 v31.8h, v11.8h, v3.8h
+ ldr q11, [x4], #0x20
+ smlal2 v23.4s, v5.8h, v20.8h
+ ldur q28, [x4, #-0x10]
+ smlal2 v23.4s, v17.8h, v14.8h
+ ldr q5, [x5], #0x20
+ smlal2 v23.4s, v27.8h, v1.8h
+ ldur q22, [x5, #-0x10]
+ smlal v26.4s, v30.4h, v21.4h
+ ld1 { v20.8h }, [x3], #16
+ smlal v26.4s, v16.4h, v9.4h
+ uzp1 v17.8h, v11.8h, v28.8h
+ smlal2 v23.4s, v7.8h, v4.8h
+ uzp2 v27.8h, v11.8h, v28.8h
+ smlal2 v23.4s, v25.8h, v10.8h
+ uzp1 v14.8h, v5.8h, v22.8h
+ subs x13, x13, #0x1
+ cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k4_loop_start
+ smlal v18.4s, v29.4h, v0.4h
+ ldr q11, [x1], #0x20
+ uzp2 v28.8h, v5.8h, v22.8h
+ smlal2 v23.4s, v30.8h, v21.8h
+ smlal2 v8.4s, v29.8h, v0.8h
+ ldr q15, [x8, #0x10]
+ smlal2 v23.4s, v16.8h, v9.8h
+ ldr q21, [x8], #0x20
+ uzp1 v22.8h, v11.8h, v19.8h
+ uzp2 v12.8h, v11.8h, v19.8h
+ ldr q1, [x7, #0x10]
+ ld1 { v6.8h }, [x6], #16
+ uzp2 v3.8h, v18.8h, v8.8h
+ smull v9.4s, v22.4h, v31.4h
+ smull2 v18.4s, v22.8h, v31.8h
+ ldr q16, [x7], #0x20
+ smull v19.4s, v22.4h, v24.4h
+ uzp1 v30.8h, v21.8h, v15.8h
+ uzp2 v25.8h, v21.8h, v15.8h
+ smull2 v8.4s, v22.8h, v24.8h
+ smlal v19.4s, v12.4h, v20.4h
+ ldr q13, [x10, #0x10]
+ smlal2 v8.4s, v12.8h, v20.8h
+ uzp1 v29.8h, v16.8h, v1.8h
+ smlal2 v18.4s, v12.8h, v24.8h
+ ldr q5, [x10], #0x20
+ smlal v9.4s, v12.4h, v24.4h
+ ldr q4, [x11], #0x20
+ smlal v9.4s, v17.4h, v28.4h
+ ldur q22, [x11, #-0x10]
+ smlal2 v18.4s, v17.8h, v28.8h
+ uzp2 v16.8h, v16.8h, v1.8h
+ smlal v19.4s, v17.4h, v14.4h
+ ld1 { v28.8h }, [x9], #16
+ smlal2 v8.4s, v17.8h, v14.8h
+ uzp1 v7.8h, v5.8h, v13.8h
+ smlal v9.4s, v27.4h, v14.4h
+ uzp1 v17.8h, v4.8h, v22.8h
+ smlal2 v18.4s, v27.8h, v14.8h
+ uzp2 v12.8h, v5.8h, v13.8h
+ uzp2 v21.8h, v4.8h, v22.8h
+ smlal v19.4s, v27.4h, v6.4h
+ smlal2 v8.4s, v27.8h, v6.8h
+ ld1 { v15.8h }, [x12], #16
+ smlal v19.4s, v29.4h, v30.4h
+ uzp1 v20.8h, v26.8h, v23.8h
+ smlal v9.4s, v29.4h, v25.4h
+ smlal2 v18.4s, v29.8h, v25.8h
+ smlal2 v8.4s, v29.8h, v30.8h
+ smlal v19.4s, v16.4h, v28.4h
+ smlal2 v8.4s, v16.8h, v28.8h
+ smlal2 v18.4s, v16.8h, v30.8h
+ smlal v9.4s, v16.4h, v30.4h
+ smlal v9.4s, v7.4h, v21.4h
+ smlal2 v18.4s, v7.8h, v21.8h
+ smlal2 v8.4s, v7.8h, v17.8h
+ smlal v19.4s, v7.4h, v17.4h
+ smlal v19.4s, v12.4h, v15.4h
+ smlal2 v8.4s, v12.8h, v15.8h
+ smlal2 v18.4s, v12.8h, v17.8h
+ smlal v9.4s, v12.4h, v17.4h
+ mul v6.8h, v20.8h, v2.8h
+ uzp1 v4.8h, v19.8h, v8.8h
+ mul v17.8h, v4.8h, v2.8h
+ uzp1 v12.8h, v9.8h, v18.8h
+ smlal v26.4s, v6.4h, v0.4h
+ mul v21.8h, v12.8h, v2.8h
+ smlal2 v23.4s, v6.8h, v0.8h
+ smlal2 v8.4s, v17.8h, v0.8h
+ smlal v19.4s, v17.4h, v0.4h
+ smlal2 v18.4s, v21.8h, v0.8h
+ uzp2 v23.8h, v26.8h, v23.8h
+ smlal v9.4s, v21.4h, v0.4h
+ zip2 v12.8h, v23.8h, v3.8h
+ zip1 v22.8h, v23.8h, v3.8h
+ uzp2 v14.8h, v19.8h, v8.8h
+ uzp2 v18.8h, v9.8h, v18.8h
+ str q12, [x0, #0x10]
+ str q22, [x0], #0x20
+ zip2 v24.8h, v14.8h, v18.8h
+ zip1 v21.8h, v14.8h, v18.8h
+ str q24, [x0, #0x10]
+ str q21, [x0], #0x20
+ ldp d8, d9, [sp]
+ .cfi_restore d8
+ .cfi_restore d9
+ ldp d10, d11, [sp, #0x10]
+ .cfi_restore d10
+ .cfi_restore d11
+ ldp d12, d13, [sp, #0x20]
+ .cfi_restore d12
+ .cfi_restore d13
+ ldp d14, d15, [sp, #0x30]
+ .cfi_restore d14
+ .cfi_restore d15
+ add sp, sp, #0x40
+ .cfi_adjust_cfa_offset -0x40
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k4)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED && \
(MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/rej_uniform_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/rej_uniform_asm.S
index 6bf3b0c958..c1ad796c23 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/rej_uniform_asm.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/rej_uniform_asm.S
@@ -3,21 +3,39 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
-/*************************************************
- * Name: mlk_rej_uniform_asm
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- * uniform random integers mod q
- *
- * Arguments: - int16_t *r: pointer to output buffer of MLKEM_N
- * 16-bit coefficients.
- * - const uint8_t *buf: pointer to input buffer
- * (assumed to be uniform random bytes)
- * - unsigned buflen: length of input buffer in bytes.
- * Must be a multiple of 24.
- *
- * Returns number of sampled 16-bit integers (at most MLKEM_N).
- **************************************************/
+/*yaml
+ Name: rej_uniform_asm
+ Description: Run rejection sampling on uniform random bytes to generate uniform random integers mod q
+ Signature: uint64_t mlk_rej_uniform_asm(int16_t r[256], const uint8_t *buf, unsigned buflen, const uint8_t table[2048])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: write-only
+ c_parameter: int16_t r[256]
+ description: Output buffer
+ x1:
+ type: buffer
+ size_bytes: x2
+ permissions: read-only
+ c_parameter: const uint8_t *buf
+ description: Input buffer
+ x2:
+ type: scalar
+ c_parameter: unsigned buflen
+ description: Length of input buffer (must be multiple of 24)
+ test_with: 504 # MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE
+ x3:
+ type: buffer
+ size_bytes: 2048
+ permissions: read-only
+ c_parameter: const uint8_t table[2048]
+ description: Lookup table
+ Stack:
+ bytes: 576
+ description: register preservation and temporary storage
+*/
+
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_AARCH64) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
@@ -27,173 +45,182 @@
* dev/aarch64_opt/src/rej_uniform_asm.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(rej_uniform_asm)
MLK_ASM_FN_SYMBOL(rej_uniform_asm)
- sub sp, sp, #0x240
- mov x7, #0x1 // =1
- movk x7, #0x2, lsl #16
- movk x7, #0x4, lsl #32
- movk x7, #0x8, lsl #48
- mov v31.d[0], x7
- mov x7, #0x10 // =16
- movk x7, #0x20, lsl #16
- movk x7, #0x40, lsl #32
- movk x7, #0x80, lsl #48
- mov v31.d[1], x7
- mov w11, #0xd01 // =3329
- dup v30.8h, w11
- mov x8, sp
- mov x7, x8
- mov x11, #0x0 // =0
- eor v16.16b, v16.16b, v16.16b
+ .cfi_startproc
+ sub sp, sp, #0x240
+ .cfi_adjust_cfa_offset 0x240
+ mov x7, #0x1 // =1
+ movk x7, #0x2, lsl #16
+ movk x7, #0x4, lsl #32
+ movk x7, #0x8, lsl #48
+ mov v31.d[0], x7
+ mov x7, #0x10 // =16
+ movk x7, #0x20, lsl #16
+ movk x7, #0x40, lsl #32
+ movk x7, #0x80, lsl #48
+ mov v31.d[1], x7
+ mov w11, #0xd01 // =3329
+ dup v30.8h, w11
+ mov x8, sp
+ mov x7, x8
+ mov x11, #0x0 // =0
+ eor v16.16b, v16.16b, v16.16b
-rej_uniform_initial_zero:
- str q16, [x7], #0x40
- stur q16, [x7, #-0x30]
- stur q16, [x7, #-0x20]
- stur q16, [x7, #-0x10]
- add x11, x11, #0x20
- cmp x11, #0x100
- b.lt rej_uniform_initial_zero
- mov x7, x8
- mov x9, #0x0 // =0
- mov x4, #0x100 // =256
- cmp x2, #0x30
- b.lo rej_uniform_loop48_end
+Lrej_uniform_initial_zero:
+ str q16, [x7], #0x40
+ stur q16, [x7, #-0x30]
+ stur q16, [x7, #-0x20]
+ stur q16, [x7, #-0x10]
+ add x11, x11, #0x20
+ cmp x11, #0x100
+ b.lt Lrej_uniform_initial_zero
+ mov x7, x8
+ mov x9, #0x0 // =0
+ mov x4, #0x100 // =256
+ cmp x2, #0x30
+ b.lo Lrej_uniform_loop48_end
-rej_uniform_loop48:
- cmp x9, x4
- b.hs rej_uniform_memory_copy
- sub x2, x2, #0x30
- ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48
- zip1 v4.16b, v0.16b, v1.16b
- zip2 v5.16b, v0.16b, v1.16b
- zip1 v6.16b, v1.16b, v2.16b
- zip2 v7.16b, v1.16b, v2.16b
- bic v4.8h, #0xf0, lsl #8
- bic v5.8h, #0xf0, lsl #8
- ushr v6.8h, v6.8h, #0x4
- ushr v7.8h, v7.8h, #0x4
- zip1 v16.8h, v4.8h, v6.8h
- zip2 v17.8h, v4.8h, v6.8h
- zip1 v18.8h, v5.8h, v7.8h
- zip2 v19.8h, v5.8h, v7.8h
- cmhi v4.8h, v30.8h, v16.8h
- cmhi v5.8h, v30.8h, v17.8h
- cmhi v6.8h, v30.8h, v18.8h
- cmhi v7.8h, v30.8h, v19.8h
- and v4.16b, v4.16b, v31.16b
- and v5.16b, v5.16b, v31.16b
- and v6.16b, v6.16b, v31.16b
- and v7.16b, v7.16b, v31.16b
- uaddlv s20, v4.8h
- uaddlv s21, v5.8h
- uaddlv s22, v6.8h
- uaddlv s23, v7.8h
- fmov w12, s20
- fmov w13, s21
- fmov w14, s22
- fmov w15, s23
- ldr q24, [x3, x12, lsl #4]
- ldr q25, [x3, x13, lsl #4]
- ldr q26, [x3, x14, lsl #4]
- ldr q27, [x3, x15, lsl #4]
- cnt v4.16b, v4.16b
- cnt v5.16b, v5.16b
- cnt v6.16b, v6.16b
- cnt v7.16b, v7.16b
- uaddlv s20, v4.8h
- uaddlv s21, v5.8h
- uaddlv s22, v6.8h
- uaddlv s23, v7.8h
- fmov w12, s20
- fmov w13, s21
- fmov w14, s22
- fmov w15, s23
- tbl v16.16b, { v16.16b }, v24.16b
- tbl v17.16b, { v17.16b }, v25.16b
- tbl v18.16b, { v18.16b }, v26.16b
- tbl v19.16b, { v19.16b }, v27.16b
- str q16, [x7]
- add x7, x7, x12, lsl #1
- str q17, [x7]
- add x7, x7, x13, lsl #1
- str q18, [x7]
- add x7, x7, x14, lsl #1
- str q19, [x7]
- add x7, x7, x15, lsl #1
- add x12, x12, x13
- add x14, x14, x15
- add x9, x9, x12
- add x9, x9, x14
- cmp x2, #0x30
- b.hs rej_uniform_loop48
+Lrej_uniform_loop48:
+ cmp x9, x4
+ b.hs Lrej_uniform_memory_copy
+ sub x2, x2, #0x30
+ ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48
+ zip1 v4.16b, v0.16b, v1.16b
+ zip2 v5.16b, v0.16b, v1.16b
+ zip1 v6.16b, v1.16b, v2.16b
+ zip2 v7.16b, v1.16b, v2.16b
+ bic v4.8h, #0xf0, lsl #8
+ bic v5.8h, #0xf0, lsl #8
+ ushr v6.8h, v6.8h, #0x4
+ ushr v7.8h, v7.8h, #0x4
+ zip1 v16.8h, v4.8h, v6.8h
+ zip2 v17.8h, v4.8h, v6.8h
+ zip1 v18.8h, v5.8h, v7.8h
+ zip2 v19.8h, v5.8h, v7.8h
+ cmhi v4.8h, v30.8h, v16.8h
+ cmhi v5.8h, v30.8h, v17.8h
+ cmhi v6.8h, v30.8h, v18.8h
+ cmhi v7.8h, v30.8h, v19.8h
+ and v4.16b, v4.16b, v31.16b
+ and v5.16b, v5.16b, v31.16b
+ and v6.16b, v6.16b, v31.16b
+ and v7.16b, v7.16b, v31.16b
+ uaddlv s20, v4.8h
+ uaddlv s21, v5.8h
+ uaddlv s22, v6.8h
+ uaddlv s23, v7.8h
+ fmov w12, s20
+ fmov w13, s21
+ fmov w14, s22
+ fmov w15, s23
+ ldr q24, [x3, x12, lsl #4]
+ ldr q25, [x3, x13, lsl #4]
+ ldr q26, [x3, x14, lsl #4]
+ ldr q27, [x3, x15, lsl #4]
+ cnt v4.16b, v4.16b
+ cnt v5.16b, v5.16b
+ cnt v6.16b, v6.16b
+ cnt v7.16b, v7.16b
+ uaddlv s20, v4.8h
+ uaddlv s21, v5.8h
+ uaddlv s22, v6.8h
+ uaddlv s23, v7.8h
+ fmov w12, s20
+ fmov w13, s21
+ fmov w14, s22
+ fmov w15, s23
+ tbl v16.16b, { v16.16b }, v24.16b
+ tbl v17.16b, { v17.16b }, v25.16b
+ tbl v18.16b, { v18.16b }, v26.16b
+ tbl v19.16b, { v19.16b }, v27.16b
+ st1 { v16.8h }, [x7]
+ add x7, x7, x12, lsl #1
+ st1 { v17.8h }, [x7]
+ add x7, x7, x13, lsl #1
+ st1 { v18.8h }, [x7]
+ add x7, x7, x14, lsl #1
+ st1 { v19.8h }, [x7]
+ add x7, x7, x15, lsl #1
+ add x12, x12, x13
+ add x14, x14, x15
+ add x9, x9, x12
+ add x9, x9, x14
+ cmp x2, #0x30
+ b.hs Lrej_uniform_loop48
-rej_uniform_loop48_end:
- cmp x9, x4
- b.hs rej_uniform_memory_copy
- cmp x2, #0x18
- b.lo rej_uniform_memory_copy
- sub x2, x2, #0x18
- ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24
- zip1 v4.16b, v0.16b, v1.16b
- zip1 v5.16b, v1.16b, v2.16b
- bic v4.8h, #0xf0, lsl #8
- ushr v5.8h, v5.8h, #0x4
- zip1 v16.8h, v4.8h, v5.8h
- zip2 v17.8h, v4.8h, v5.8h
- cmhi v4.8h, v30.8h, v16.8h
- cmhi v5.8h, v30.8h, v17.8h
- and v4.16b, v4.16b, v31.16b
- and v5.16b, v5.16b, v31.16b
- uaddlv s20, v4.8h
- uaddlv s21, v5.8h
- fmov w12, s20
- fmov w13, s21
- ldr q24, [x3, x12, lsl #4]
- ldr q25, [x3, x13, lsl #4]
- cnt v4.16b, v4.16b
- cnt v5.16b, v5.16b
- uaddlv s20, v4.8h
- uaddlv s21, v5.8h
- fmov w12, s20
- fmov w13, s21
- tbl v16.16b, { v16.16b }, v24.16b
- tbl v17.16b, { v17.16b }, v25.16b
- str q16, [x7]
- add x7, x7, x12, lsl #1
- str q17, [x7]
- add x7, x7, x13, lsl #1
- add x9, x9, x12
- add x9, x9, x13
+Lrej_uniform_loop48_end:
+ cmp x9, x4
+ b.hs Lrej_uniform_memory_copy
+ cmp x2, #0x18
+ b.lo Lrej_uniform_memory_copy
+ sub x2, x2, #0x18
+ ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24
+ zip1 v4.16b, v0.16b, v1.16b
+ zip1 v5.16b, v1.16b, v2.16b
+ bic v4.8h, #0xf0, lsl #8
+ ushr v5.8h, v5.8h, #0x4
+ zip1 v16.8h, v4.8h, v5.8h
+ zip2 v17.8h, v4.8h, v5.8h
+ cmhi v4.8h, v30.8h, v16.8h
+ cmhi v5.8h, v30.8h, v17.8h
+ and v4.16b, v4.16b, v31.16b
+ and v5.16b, v5.16b, v31.16b
+ uaddlv s20, v4.8h
+ uaddlv s21, v5.8h
+ fmov w12, s20
+ fmov w13, s21
+ ldr q24, [x3, x12, lsl #4]
+ ldr q25, [x3, x13, lsl #4]
+ cnt v4.16b, v4.16b
+ cnt v5.16b, v5.16b
+ uaddlv s20, v4.8h
+ uaddlv s21, v5.8h
+ fmov w12, s20
+ fmov w13, s21
+ tbl v16.16b, { v16.16b }, v24.16b
+ tbl v17.16b, { v17.16b }, v25.16b
+ st1 { v16.8h }, [x7]
+ add x7, x7, x12, lsl #1
+ st1 { v17.8h }, [x7]
+ add x7, x7, x13, lsl #1
+ add x9, x9, x12
+ add x9, x9, x13
-rej_uniform_memory_copy:
- cmp x9, x4
- csel x9, x9, x4, lo
- mov x11, #0x0 // =0
- mov x7, x8
+Lrej_uniform_memory_copy:
+ cmp x9, x4
+ csel x9, x9, x4, lo
+ mov x11, #0x0 // =0
+ mov x7, x8
-rej_uniform_final_copy:
- ldr q16, [x7], #0x40
- ldur q17, [x7, #-0x30]
- ldur q18, [x7, #-0x20]
- ldur q19, [x7, #-0x10]
- str q16, [x0], #0x40
- stur q17, [x0, #-0x30]
- stur q18, [x0, #-0x20]
- stur q19, [x0, #-0x10]
- add x11, x11, #0x20
- cmp x11, #0x100
- b.lt rej_uniform_final_copy
- mov x0, x9
- b rej_uniform_return
+Lrej_uniform_final_copy:
+ ldr q16, [x7], #0x40
+ ldur q17, [x7, #-0x30]
+ ldur q18, [x7, #-0x20]
+ ldur q19, [x7, #-0x10]
+ str q16, [x0], #0x40
+ stur q17, [x0, #-0x30]
+ stur q18, [x0, #-0x20]
+ stur q19, [x0, #-0x10]
+ add x11, x11, #0x20
+ cmp x11, #0x100
+ b.lt Lrej_uniform_final_copy
+ mov x0, x9
+ b Lrej_uniform_return
-rej_uniform_return:
- add sp, sp, #0x240
+Lrej_uniform_return:
+ add sp, sp, #0x240
+ .cfi_adjust_cfa_offset -0x240
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(rej_uniform_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/rej_uniform_table.c
index 74a931bc4a..9a7bc210a4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/aarch64/src/rej_uniform_table.c
@@ -5,6 +5,7 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
@@ -13,7 +14,6 @@
#if defined(MLK_ARITH_BACKEND_AARCH64) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
#include "arith_native_aarch64.h"
/*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/api.h
index aea28a3af4..0308f2bd51 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/api.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/api.h
@@ -17,10 +17,18 @@
* and run sanity checks.
*/
-#include
#include "../cbmc.h"
#include "../common.h"
+/* Backends must return MLK_NATIVE_FUNC_SUCCESS upon success. */
+#define MLK_NATIVE_FUNC_SUCCESS (0)
+/* Backends may return MLK_NATIVE_FUNC_FALLBACK to signal to the frontend that
+ * the target/parameters are unsupported; typically, this would be because of
+ * dependencies on CPU features not detected on the host CPU. In this case,
+ * the frontend falls back to the default C implementation. */
+#define MLK_NATIVE_FUNC_FALLBACK (-1)
+
+
/* Absolute exclusive upper bound for the output of the inverse NTT
*
* NOTE: This is the same bound as in poly.h and has to be kept
@@ -74,12 +82,16 @@
*
* Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_ntt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_ntt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_NTT */
@@ -140,11 +152,14 @@ __contract__(
*
* Arguments: - uint16_t *a: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_intt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_intt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_INTT */
@@ -156,11 +171,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_reduce_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_reduce_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
@@ -173,11 +191,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_tomont_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tomont_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
@@ -203,13 +224,15 @@ __contract__(
* OUTPUT
* - cache: pointer to multiplication cache
**************************************************/
-static MLK_INLINE void mlk_poly_mulcache_compute_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_mulcache_compute_native(
int16_t cache[MLKEM_N / 2], const int16_t mlk_poly[MLKEM_N])
__contract__(
requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
- assigns(object_whole(cache))
- ensures(array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
+ assigns(memory_slice(cache, sizeof(int16_t) * (MLKEM_N / 2)))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
@@ -234,7 +257,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
__contract__(
@@ -244,6 +268,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 2 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 2 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
@@ -267,7 +292,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
__contract__(
@@ -277,6 +303,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 3 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 3 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
@@ -300,7 +327,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
__contract__(
@@ -310,6 +338,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 4 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 4 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
@@ -324,18 +353,20 @@ __contract__(
*
* Arguments: INPUT:
* - a: const pointer to input polynomial,
- * with each coefficient in the range -Q+1 .. Q-1
+ * with each coefficient in the range 0 .. Q-1
* OUTPUT
* - r: pointer to output byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+ const int16_t a[MLKEM_N])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
);
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
@@ -353,13 +384,15 @@ __contract__(
* - a: const pointer to input byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_frombytes_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_frombytes_native(
int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(a, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
);
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
@@ -381,6 +414,7 @@ __contract__(
* Otherwise, returns non-negative number of sampled 16-bit integers (at most
* len).
**************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
const uint8_t *buf,
unsigned buflen)
@@ -389,8 +423,10 @@ __contract__(
requires(memory_no_alias(r, sizeof(int16_t) * len))
requires(memory_no_alias(buf, buflen))
assigns(memory_slice(r, sizeof(int16_t) * len))
- ensures(return_value == -1 || (0 <= return_value && return_value <= len))
- ensures(return_value != -1 ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> (0 <= return_value && return_value <= len))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
@@ -408,8 +444,15 @@ __contract__(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d4_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d4_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
@@ -425,8 +468,15 @@ static MLK_INLINE void mlk_poly_compress_d4_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d10_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d10_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
@@ -444,8 +494,15 @@ static MLK_INLINE void mlk_poly_compress_d10_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d4_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d4_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
@@ -463,8 +520,15 @@ static MLK_INLINE void mlk_poly_decompress_d4_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d10_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d10_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
@@ -482,8 +546,15 @@ static MLK_INLINE void mlk_poly_decompress_d10_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d5_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d5_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
@@ -499,8 +570,15 @@ static MLK_INLINE void mlk_poly_compress_d5_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d11_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d11_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
@@ -518,8 +596,15 @@ static MLK_INLINE void mlk_poly_compress_d11_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d5_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d5_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
@@ -537,8 +622,15 @@ static MLK_INLINE void mlk_poly_decompress_d5_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d11_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d11_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/meta.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/meta.h
index f2b9b848b7..4291d629b1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/meta.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/native/meta.h
@@ -18,4 +18,8 @@
#include "x86_64/meta.h"
#endif
+#if defined(MLK_SYS_RISCV64_RVV)
+#include "riscv64/meta.h"
+#endif
+
#endif /* !MLK_NATIVE_META_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/params.h
index 3f81bb0e2e..04598539c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/params.h
@@ -5,12 +5,6 @@
#ifndef MLK_PARAMS_H
#define MLK_PARAMS_H
-#if defined(MLK_CONFIG_FILE)
-#include MLK_CONFIG_FILE
-#else
-#include "config.h"
-#endif
-
#if !defined(MLK_CONFIG_PARAMETER_SET)
#error MLK_CONFIG_PARAMETER_SET is not defined
#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly.c
index 40d29948c8..564d5d712b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly.c
@@ -20,8 +20,7 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
-#include
+
#include "cbmc.h"
#include "debug.h"
#include "poly.h"
@@ -29,9 +28,6 @@
#include "symmetric.h"
#include "verify.h"
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT) || \
- !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_fqmul
*
@@ -68,10 +64,7 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_TOMONT || !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE \
- || !MLK_USE_NATIVE_NTT || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_barrett_reduce
*
@@ -107,7 +100,7 @@ __contract__(
* Here, we assume it's sign-preserving "arithmetic" shift right.
* See (C99 6.5.7 (5))
*/
- const int32_t t = (magic * a + (1 << 25)) >> 26;
+ const int32_t t = (magic * a + ((int32_t)1 << 25)) >> 26;
/*
* t is in -10 .. +10, so we need 32-bit math to
@@ -118,12 +111,14 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q_HALF);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_REDUCE || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT)
/* Reference: `poly_tomont()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_tomont(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_tomont_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+)
{
unsigned i;
const int16_t f = 1353; /* check-magic: 1353 == signed_mod(2^32, MLKEM_Q) */
@@ -137,16 +132,23 @@ void mlk_poly_tomont(mlk_poly *r)
mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_TOMONT */
+
MLK_INTERNAL_API
void mlk_poly_tomont(mlk_poly *r)
{
- mlk_poly_tomont_native(r->coeffs);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_TOMONT)
+ int ret;
+ ret = mlk_poly_tomont_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE)
+ mlk_poly_tomont_c(r);
+}
+
/************************************************************
* Name: mlk_scalar_signed_to_unsigned_q
*
@@ -162,7 +164,7 @@ void mlk_poly_tomont(mlk_poly *r)
* - Used here to implement different semantics of `poly_reduce()`;
* see below. in the reference implementation @[REF], this logic is
* part of all compression functions (see `compress.c`). */
-static MLK_INLINE uint16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
+static MLK_INLINE int16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
__contract__(
requires(c > -MLKEM_Q && c < MLKEM_Q)
ensures(return_value >= 0 && return_value < MLKEM_Q)
@@ -170,12 +172,14 @@ __contract__(
{
mlk_assert_abs_bound(&c, 1, MLKEM_Q);
- /* Add Q if c is negative, but in constant time */
- c = mlk_ct_sel_int16(c + MLKEM_Q, c, mlk_ct_cmask_neg_i16(c));
+ /* Add MLKEM_Q if c is negative, but in constant time.
+ *
+ * Note that c + MLKEM_Q does not overflow in int16_t,
+ * so the cast to uint16_t is safe. */
+ c = mlk_ct_sel_int16((int16_t)(c + MLKEM_Q), c, mlk_ct_cmask_neg_i16(c));
- /* and therefore cast to uint16_t is safe. */
mlk_assert_bound(&c, 1, 0, MLKEM_Q);
- return (uint16_t)c;
+ return c;
}
/* Reference: `poly_reduce()` in the reference implementation @[REF]
@@ -185,10 +189,15 @@ __contract__(
* here to go from signed to unsigned representatives.
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
-MLK_INTERNAL_API
-void mlk_poly_reduce(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_reduce_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
+
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
@@ -202,15 +211,23 @@ void mlk_poly_reduce(mlk_poly *r)
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_REDUCE */
+
MLK_INTERNAL_API
void mlk_poly_reduce(mlk_poly *r)
{
- mlk_poly_reduce_native(r->coeffs);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_REDUCE)
+ int ret;
+ ret = mlk_poly_reduce_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
+ mlk_poly_reduce_c(r);
+}
+
/* Reference: `poly_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
@@ -224,7 +241,8 @@ void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] + b->coeffs[i];
+ /* The preconditions imply that the addition stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] + b->coeffs[i]);
}
}
@@ -241,24 +259,24 @@ void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
+ /* The preconditions imply that the subtraction stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] - b->coeffs[i]);
}
}
-/* Include zeta table unless NTT, invNTT and mulcache computation
- * have been replaced by native implementations. */
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
#include "zetas.inc"
-#endif
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
-MLK_INTERNAL_API
-void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_mulcache_compute_c(mlk_poly_mulcache *x,
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 4; i++)
@@ -266,8 +284,11 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
invariant(i <= MLKEM_N / 4)
invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
{
- x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
- x->coeffs[2 * i + 1] = mlk_fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
+ x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], mlk_zetas[64 + i]);
+ /* The values in zeta table are <= MLKEM_Q in absolute value,
+ * so the negation in int16_t is safe. */
+ x->coeffs[2 * i + 1] =
+ mlk_fqmul(a->coeffs[4 * i + 3], (int16_t)(-mlk_zetas[64 + i]));
}
/*
@@ -278,15 +299,22 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
*/
mlk_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
MLK_INTERNAL_API
void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
{
- mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
-}
+#if defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+ int ret;
+ ret = mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-#if !defined(MLK_USE_NATIVE_NTT)
+ mlk_poly_mulcache_compute_c(x, a);
+}
+
/*
* Computes a block CT butterflies with a fixed twiddle factor,
* using Montgomery multiplication.
@@ -316,7 +344,8 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
static void mlk_ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
- unsigned start, unsigned len, int bound)
+ unsigned start, unsigned len,
+ unsigned bound)
__contract__(
requires(start < MLKEM_N)
requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
@@ -346,8 +375,9 @@ __contract__(
{
int16_t t;
t = mlk_fqmul(r[j + len], zeta);
- r[j + len] = r[j] - t;
- r[j] = r[j] + t;
+ /* The precondition implies that the arithmetic does not overflow. */
+ r[j + len] = (int16_t)(r[j] - t);
+ r[j] = (int16_t)(r[j] + t);
}
}
@@ -370,7 +400,7 @@ __contract__(
unsigned start, k, len;
/* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */
k = 1u << (layer - 1);
- len = MLKEM_N >> layer;
+ len = (unsigned)MLKEM_N >> layer;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
invariant(start < MLKEM_N + 2 * len)
@@ -378,7 +408,7 @@ __contract__(
invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
{
- int16_t zeta = zetas[k++];
+ int16_t zeta = mlk_zetas[k++];
mlk_ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
}
}
@@ -395,12 +425,19 @@ __contract__(
/* Reference: `ntt()` in the reference implementation @[REF].
* - Iterate over `layer` instead of `len` in the outer loop
* to simplify computation of zeta index. */
-MLK_INTERNAL_API
-void mlk_poly_ntt(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_ntt_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ requires(array_abs_bound(p->coeffs, 0, MLKEM_N, MLKEM_Q))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_NTT_BOUND))
+)
{
unsigned layer;
int16_t *r;
+
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+
r = p->coeffs;
for (layer = 1; layer <= 7; layer++)
@@ -414,18 +451,24 @@ void mlk_poly_ntt(mlk_poly *p)
/* Check the stronger bound */
mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_NTT */
MLK_INTERNAL_API
void mlk_poly_ntt(mlk_poly *p)
{
+#if defined(MLK_USE_NATIVE_NTT)
+ int ret;
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
- mlk_ntt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
-}
+ ret = mlk_ntt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_NTT */
-#if !defined(MLK_USE_NATIVE_INTT)
+ mlk_poly_ntt_c(p);
+}
+
/* Compute one layer of inverse NTT */
@@ -439,7 +482,7 @@ __contract__(
ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
unsigned start, k, len;
- len = (MLKEM_N >> layer);
+ len = (unsigned)MLKEM_N >> layer;
k = (1u << layer) - 1;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
@@ -449,7 +492,7 @@ __contract__(
invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
{
unsigned j;
- int16_t zeta = zetas[k--];
+ int16_t zeta = mlk_zetas[k--];
for (j = start; j < start + len; j++)
__loop__(
invariant(start <= j && j <= start + len)
@@ -457,8 +500,9 @@ __contract__(
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
int16_t t = r[j];
- r[j] = mlk_barrett_reduce(t + r[j + len]);
- r[j + len] = r[j + len] - t;
+ /* The preconditions imply that the arithmetic does not overflow. */
+ r[j] = mlk_barrett_reduce((int16_t)(t + r[j + len]));
+ r[j + len] = (int16_t)(r[j + len] - t);
r[j + len] = mlk_fqmul(r[j + len], zeta);
}
}
@@ -469,18 +513,22 @@ __contract__(
* while the reference implementation normalizes at
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
-MLK_INTERNAL_API
-void mlk_poly_invntt_tomont(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_invntt_tomont_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
+)
{
+ unsigned j, layer;
+ const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
+ int16_t *r = p->coeffs;
+
/*
* Scale input polynomial to account for Montgomery factor
* and NTT twist. This also brings coefficients down to
* absolute value < MLKEM_Q.
*/
- unsigned j, layer;
- const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
- int16_t *r = p->coeffs;
-
for (j = 0; j < MLKEM_N; j++)
__loop__(
invariant(j <= MLKEM_N)
@@ -500,16 +548,23 @@ void mlk_poly_invntt_tomont(mlk_poly *p)
mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_INTT */
MLK_INTERNAL_API
void mlk_poly_invntt_tomont(mlk_poly *p)
{
- mlk_intt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
-}
+#if defined(MLK_USE_NATIVE_INTT)
+ int ret;
+ ret = mlk_intt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_INTT */
+ mlk_poly_invntt_tomont_c(p);
+}
+
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(mlk_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly.h
index 20fb65e720..587062cce5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly.h
@@ -15,8 +15,7 @@
#ifndef MLK_POLY_H
#define MLK_POLY_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -46,34 +45,6 @@ typedef struct
int16_t coeffs[MLKEM_N >> 1];
} MLK_ALIGN mlk_poly_mulcache;
-/*************************************************
- * Name: mlk_cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- * input x in 0 .. 32767: returns value unchanged
- * input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
-{
- /*
- * PORTABILITY: This relies on uint16_t -> int16_t
- * being implemented as the inverse of int16_t -> uint16_t,
- * which is implementation-defined (C99 6.3.1.3 (3))
- * CBMC (correctly) fails to prove this conversion is OK,
- * so we have to suppress that check here
- */
- return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_montgomery_reduce
*
@@ -90,7 +61,7 @@ static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
static MLK_ALWAYS_INLINE int16_t mlk_montgomery_reduce(int32_t a)
__contract__(
requires(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
- a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
+ a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
/* We don't attempt to express an input-dependent output bound
* as the post-condition here. There are two call-sites for this
* function:
@@ -102,8 +73,8 @@ __contract__(
/* check-magic: 62209 == unsigned_mod(pow(MLKEM_Q, -1, 2^16), 2^16) */
const uint32_t QINV = 62209;
- /* Compute a*q^{-1} mod 2^16 in unsigned representatives */
- const uint16_t a_reduced = a & UINT16_MAX;
+ /* Compute a*q^{-1} mod 2^16 in unsigned representatives. */
+ const uint16_t a_reduced = mlk_cast_int32_to_uint16(a);
const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
/* Lift to signed canonical representative mod 2^16. */
@@ -187,7 +158,7 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_poly)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
);
#define mlk_poly_reduce MLK_NAMESPACE(poly_reduce)
@@ -280,7 +251,7 @@ __contract__(
requires(forall(k0, 0, MLKEM_N, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
requires(forall(k1, 0, MLKEM_N, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
ensures(forall(k, 0, MLKEM_N, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_poly_ntt MLK_NAMESPACE(poly_ntt)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly_k.c
index f15ab96ce7..32b214ee04 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly_k.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly_k.c
@@ -22,12 +22,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
+#include "poly_k.h"
-#include "compress.h"
#include "debug.h"
-#include "poly_k.h"
#include "sampling.h"
#include "symmetric.h"
@@ -37,6 +34,8 @@
* within a single compilation unit. */
#define mlk_poly_cbd_eta1 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta1)
#define mlk_poly_cbd_eta2 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta2)
+#define mlk_polyvec_basemul_acc_montgomery_cached_c \
+ MLK_ADD_PARAM_SET(mlk_polyvec_basemul_acc_montgomery_cached_c)
/* End of parameter set namespacing */
/* Reference: `polyvec_compress()` in the reference implementation @[REF]
@@ -46,29 +45,29 @@
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a[i]);
+ mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
}
}
/* Reference: `polyvec_decompress()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_decompress_du(&r[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+ mlk_poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_tobytes()` in the reference implementation @[REF].
@@ -77,41 +76,45 @@ void mlk_polyvec_decompress_du(mlk_polyvec r,
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, MLKEM_POLYVECBYTES))
+ invariant(i <= MLKEM_K)
+ )
{
- mlk_poly_tobytes(r + i * MLKEM_POLYBYTES, &a[i]);
+ mlk_poly_tobytes(&r[i * MLKEM_POLYBYTES], &a->vec[i]);
}
}
/* Reference: `polyvec_frombytes()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_frombytes(&r[i], a + i * MLKEM_POLYBYTES);
+ mlk_poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
/* Reference: `polyvec_ntt()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_ntt(&r[i]);
+ mlk_poly_ntt(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
}
/* Reference: `polyvec_invntt_tomont()` in the reference implementation @[REF].
@@ -120,18 +123,17 @@ void mlk_polyvec_ntt(mlk_polyvec r)
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_invntt_tomont(&r[i]);
+ mlk_poly_invntt_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
}
-#if !defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
/* Reference: `polyvec_basemul_acc_montgomery()` in the
* reference implementation @[REF].
* - We use a multiplication cache ('mulcache') here
@@ -143,13 +145,22 @@ void mlk_polyvec_invntt_tomont(mlk_polyvec r)
* at the end. The reference implementation uses 2 * MLKEM_K
* more modular reductions since it reduces after every modular
* multiplication. */
-MLK_INTERNAL_API
-void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+MLK_STATIC_TESTABLE void mlk_polyvec_basemul_acc_montgomery_cached_c(
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
+ requires(forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
@@ -163,53 +174,59 @@ void mlk_polyvec_basemul_acc_montgomery_cached(
t[1] <= ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
t[1] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768)))
{
- t[0] += (int32_t)a[k].coeffs[2 * i + 1] * b_cache[k].coeffs[i];
- t[0] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i];
- t[1] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i + 1];
- t[1] += (int32_t)a[k].coeffs[2 * i + 1] * b[k].coeffs[2 * i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b_cache->vec[k].coeffs[i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i + 1];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b->vec[k].coeffs[2 * i];
}
r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t[0]);
r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t[1]);
}
}
-#else /* !MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
{
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
- /* Omitting bounds assertion for cache since native implementations may
- * decide not to use a mulcache. Note that the C backend implementation
- * of poly_basemul_montgomery_cached() does still include the check. */
+#if defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+ {
+ int ret;
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
#if MLKEM_K == 2
- mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 3
- mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 4
- mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#endif
-}
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
+ }
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+ mlk_polyvec_basemul_acc_montgomery_cached_c(r, a, b, b_cache);
+}
+
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_mulcache_compute(&x[i], &a[i]);
+ mlk_poly_mulcache_compute(&x->vec[i], &a->vec[i]);
}
}
@@ -221,41 +238,53 @@ void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_reduce(&r[i]);
+ mlk_poly_reduce(&r->vec[i]);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(j0, i, MLKEM_K,
+ forall(k0, 0, MLKEM_N,
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX) &&
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] >= INT16_MIN))))
+ invariant(forall(j2, 0, i,
+ forall(k2, 0, MLKEM_N,
+ (r->vec[j2].coeffs[k2] <= INT16_MAX) &&
+ (r->vec[j2].coeffs[k2] >= INT16_MIN))))
+ )
{
- mlk_poly_add(&r[i], &b[i]);
+ mlk_poly_add(&r->vec[i], &b->vec[i]);
}
}
/* Reference: `polyvec_tomont()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_tomont(&r[i]);
+ mlk_poly_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLKEM_Q);
}
@@ -306,24 +335,41 @@ void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
{
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
+#else
+ mlk_prf_eta1(buf[0], extkey[0]);
+ mlk_prf_eta1(buf[1], extkey[1]);
+ mlk_prf_eta1(buf[2], extkey[2]);
+ if (r3 != NULL)
+ {
+ mlk_prf_eta1(buf[3], extkey[3]);
+ }
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
+
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
mlk_poly_cbd_eta1(r2, buf[2]);
- mlk_poly_cbd_eta1(r3, buf[3]);
+ if (r3 != NULL)
+ {
+ mlk_poly_cbd_eta1(r3, buf[3]);
+ mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+ }
mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
- mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -364,7 +410,7 @@ __contract__(
#endif
}
-/* Reference: `poly_getnoise_eta1()` in the reference implementation @[REF].
+/* Reference: `poly_getnoise_eta2()` in the reference implementation @[REF].
* - We include buffer zeroization. */
MLK_INTERNAL_API
void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
@@ -373,13 +419,13 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
MLK_ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
MLK_ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
- memcpy(extkey, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey, seed, MLKEM_SYMBYTES);
extkey[MLKEM_SYMBYTES] = nonce;
mlk_prf_eta2(buf, extkey);
mlk_poly_cbd_eta2(r, buf);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -391,7 +437,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
#if MLKEM_K == 2
/* Reference: Does not exist in the reference implementation @[REF].
* - This implements a x4-batched version of `poly_getnoise_eta1()`
- * and `poly_getnoise_eta1()` from the reference implementation,
+ * and `poly_getnoise_eta2()` from the reference implementation,
* leveraging batched Keccak-f1600.
* - If a x4-batched Keccak-f1600 is available, we squeeze
* more random data than needed for the eta2 calls, to be
@@ -409,10 +455,10 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
@@ -421,14 +467,16 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
/* On systems with fast batched Keccak, we use 4-fold batched PRF,
* even though that means generating more random data in buf[2] and buf[3]
* than necessary. */
-#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION)
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
#else
mlk_prf_eta1(buf[0], extkey[0]);
mlk_prf_eta1(buf[1], extkey[1]);
mlk_prf_eta2(buf[2], extkey[2]);
mlk_prf_eta2(buf[3], extkey[3]);
-#endif /* FIPS202_X4_DEFAULT_IMPLEMENTATION */
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
@@ -451,3 +499,4 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef mlk_poly_cbd_eta1
#undef mlk_poly_cbd_eta2
+#undef mlk_polyvec_basemul_acc_montgomery_cached_c
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly_k.h
index f7a40ff5f9..9089a8e431 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly_k.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/poly_k.h
@@ -15,7 +15,6 @@
#ifndef MLK_POLY_K_H
#define MLK_POLY_K_H
-#include
#include "common.h"
#include "compress.h"
#include "poly.h"
@@ -29,9 +28,20 @@
#define mlk_polyvec_mulcache MLK_ADD_PARAM_SET(mlk_polyvec_mulcache)
/* End of parameter set namespacing */
-typedef mlk_poly mlk_polyvec[MLKEM_K];
-typedef mlk_poly mlk_polymat[MLKEM_K * MLKEM_K];
-typedef mlk_poly_mulcache mlk_polyvec_mulcache[MLKEM_K];
+typedef struct
+{
+ mlk_poly vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec;
+
+typedef struct
+{
+ mlk_polyvec vec[MLKEM_K];
+} MLK_ALIGN mlk_polymat;
+
+typedef struct
+{
+ mlk_poly_mulcache vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec_mulcache;
#define mlk_poly_compress_du MLK_NAMESPACE_K(poly_compress_du)
/*************************************************
@@ -131,7 +141,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r)))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DV)))
{
#if MLKEM_DV == 4
mlk_poly_compress_d4(r, a);
@@ -168,7 +178,7 @@ static MLK_INLINE void mlk_poly_decompress_dv(
__contract__(
requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
{
#if MLKEM_DV == 4
@@ -200,13 +210,13 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
);
#define mlk_polyvec_decompress_du MLK_NAMESPACE_K(polyvec_decompress_du)
@@ -228,14 +238,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
__contract__(
requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_tobytes MLK_NAMESPACE_K(polyvec_tobytes)
@@ -256,13 +266,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECBYTES))
);
#define mlk_polyvec_frombytes MLK_NAMESPACE_K(polyvec_frombytes)
@@ -284,13 +294,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
);
#define mlk_polyvec_ntt MLK_NAMESPACE_K(polyvec_ntt)
@@ -313,14 +323,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
- assigns(object_whole(r))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
);
#define mlk_polyvec_invntt_tomont MLK_NAMESPACE_K(polyvec_invntt_tomont)
@@ -344,12 +354,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
);
#define mlk_polyvec_basemul_acc_montgomery_cached \
@@ -380,16 +390,16 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
requires(forall(k1, 0, MLKEM_K,
- array_bound(a[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(r))
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_polyvec_mulcache_compute MLK_NAMESPACE_K(polyvec_mulcache_compute)
@@ -423,11 +433,11 @@ __contract__(
* higher level safety proofs, and thus not part of the spec.
*/
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_polyvec_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_polyvec_mulcache)))
);
#define mlk_polyvec_reduce MLK_NAMESPACE_K(polyvec_reduce)
@@ -436,7 +446,7 @@ __contract__(
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials;
- * for details of the Barrett reduction see comments in reduce.c
+ * for details of the Barrett reduction see comments in poly.c
*
* Arguments: - mlk_polyvec r: pointer to input/output polynomial
*
@@ -453,12 +463,12 @@ __contract__(
* use of mlk_poly_reduce() in the context of (de)serialization.
*/
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_add MLK_NAMESPACE_K(polyvec_add)
@@ -485,17 +495,17 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(forall(j0, 0, MLKEM_K,
forall(k0, 0, MLKEM_N,
- (int32_t)r[j0].coeffs[k0] + b[j0].coeffs[k0] <= INT16_MAX)))
+ (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
requires(forall(j1, 0, MLKEM_K,
forall(k1, 0, MLKEM_N,
- (int32_t)r[j1].coeffs[k1] + b[j1].coeffs[k1] >= INT16_MIN)))
- assigns(object_whole(r))
+ (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
);
#define mlk_polyvec_tomont MLK_NAMESPACE_K(polyvec_tomont)
@@ -514,13 +524,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
assigns(memory_slice(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
);
#define mlk_poly_getnoise_eta1_4x MLK_NAMESPACE_K(poly_getnoise_eta1_4x)
@@ -531,7 +540,8 @@ __contract__(
* and nonces, with output polynomials close to centered binomial
* distribution with parameter MLKEM_ETA1.
*
- * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial
+ * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial. The last
+ * polynomial pointer may be NULL.
* - const uint8_t *seed: pointer to input seed
* (of length MLKEM_SYMBYTES bytes)
* - uint8_t nonce{0,1,2,3}: one-byte input nonce
@@ -555,16 +565,15 @@ __contract__(
requires(memory_no_alias(r0, sizeof(mlk_poly)))
requires(memory_no_alias(r1, sizeof(mlk_poly)))
requires(memory_no_alias(r2, sizeof(mlk_poly)))
- requires(memory_no_alias(r3, sizeof(mlk_poly)))
+ requires(r3 == NULL || memory_no_alias(r3, sizeof(mlk_poly)))
assigns(memory_slice(r0, sizeof(mlk_poly)))
assigns(memory_slice(r1, sizeof(mlk_poly)))
assigns(memory_slice(r2, sizeof(mlk_poly)))
- assigns(memory_slice(r3, sizeof(mlk_poly)))
- ensures(
- array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+ assigns(r3 != NULL: memory_slice(r3, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(r3 != NULL ==> array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
);
#if MLKEM_ETA1 == MLKEM_ETA2
@@ -604,7 +613,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
@@ -640,15 +649,19 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
uint8_t nonce0, uint8_t nonce1,
uint8_t nonce2, uint8_t nonce3)
__contract__(
- requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(mlk_poly)) && memory_no_alias(r2, 2 * sizeof(mlk_poly)) &&
- r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+ requires(memory_no_alias(r0, sizeof(mlk_poly)))
+ requires(memory_no_alias(r1, sizeof(mlk_poly)))
+ requires(memory_no_alias(r2, sizeof(mlk_poly)))
+ requires(memory_no_alias(r3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+ assigns(memory_slice(r0, sizeof(mlk_poly)))
+ assigns(memory_slice(r1, sizeof(mlk_poly)))
+ assigns(memory_slice(r2, sizeof(mlk_poly)))
+ assigns(memory_slice(r3, sizeof(mlk_poly)))
ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+ && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+ && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+ && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/randombytes.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/randombytes.h
index 132d920afb..3e841d26ca 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/randombytes.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/randombytes.h
@@ -5,18 +5,56 @@
#ifndef MLK_RANDOMBYTES_H
#define MLK_RANDOMBYTES_H
-#include
-#include
#include "cbmc.h"
#include "common.h"
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
#if !defined(MLK_CONFIG_CUSTOM_RANDOMBYTES)
-void randombytes(uint8_t *out, size_t outlen);
-static MLK_INLINE void mlk_randombytes(uint8_t *out, size_t outlen)
+/*************************************************
+ * Name: randombytes
+ *
+ * Description: Fill a buffer with cryptographically secure random bytes.
+ *
+ * mlkem-native does not provide an implementation of this
+ * function. It must be provided by the consumer.
+ *
+ * To use a custom random byte source with a different name
+ * or signature, set MLK_CONFIG_CUSTOM_RANDOMBYTES and define
+ * mlk_randombytes directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+int randombytes(uint8_t *out, size_t outlen);
+
+/*************************************************
+ * Name: mlk_randombytes
+ *
+ * Description: Internal wrapper around randombytes().
+ *
+ * Fill a buffer with cryptographically secure random bytes.
+ *
+ * This function can be replaced by setting
+ * MLK_CONFIG_CUSTOM_RANDOMBYTES and defining mlk_randombytes
+ * directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_randombytes(uint8_t *out, size_t outlen)
__contract__(
requires(memory_no_alias(out, outlen))
- assigns(memory_slice(out, outlen))) { randombytes(out, outlen); }
+ assigns(memory_slice(out, outlen))) { return randombytes(out, outlen); }
#endif /* !MLK_CONFIG_CUSTOM_RANDOMBYTES */
-
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
#endif /* !MLK_RANDOMBYTES_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sampling.c
index be5d931a79..945d12ed3d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sampling.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sampling.c
@@ -29,9 +29,10 @@
* in that it adds the offset and always expects the base of the
* target buffer. This avoids shifting the buffer base in the
* caller, which appears tricky to reason about. */
-static unsigned mlk_rej_uniform_scalar(int16_t *r, unsigned target,
- unsigned offset, const uint8_t *buf,
- unsigned buflen)
+MLK_STATIC_TESTABLE unsigned mlk_rej_uniform_c(int16_t *r, unsigned target,
+ unsigned offset,
+ const uint8_t *buf,
+ unsigned buflen)
__contract__(
requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
requires(memory_no_alias(r, sizeof(int16_t) * target))
@@ -39,11 +40,10 @@ __contract__(
requires(array_bound(r, 0, offset, 0, MLKEM_Q))
assigns(memory_slice(r, sizeof(int16_t) * target))
ensures(offset <= return_value && return_value <= target)
- ensures(array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
+ ensures(array_bound(r, 0, return_value, 0, MLKEM_Q)))
{
unsigned ctr, pos;
- uint16_t val0, val1;
+ int16_t val0, val1;
mlk_assert_bound(r, offset, 0, MLKEM_Q);
@@ -55,8 +55,8 @@ __contract__(
invariant(offset <= ctr && ctr <= target && pos <= buflen)
invariant(array_bound(r, 0, ctr, 0, MLKEM_Q)))
{
- val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
- val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+ val0 = ((buf[pos + 0] >> 0) | (buf[pos + 1] << 8)) & 0xFFF;
+ val1 = ((buf[pos + 1] >> 4) | (buf[pos + 2] << 4)) & 0xFFF;
pos += 3;
if (val0 < MLKEM_Q)
@@ -93,7 +93,7 @@ __contract__(
* Must be a multiple of 3.
*
* Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 128 is somewhat arbitrary but sufficient for all
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
* uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
*
* Returns the new offset of sampled 16-bit integers, at most target,
@@ -124,8 +124,9 @@ __contract__(
#if defined(MLK_USE_NATIVE_REJ_UNIFORM)
if (offset == 0)
{
- int ret = mlk_rej_uniform_native(r, target, buf, buflen);
- if (ret != -1)
+ int ret;
+ ret = mlk_rej_uniform_native(r, target, buf, buflen);
+ if (ret != MLK_NATIVE_FUNC_FALLBACK)
{
unsigned res = (unsigned)ret;
mlk_assert_bound(r, res, 0, MLKEM_Q);
@@ -134,19 +135,22 @@ __contract__(
}
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
- return mlk_rej_uniform_scalar(r, target, offset, buf, buflen);
+ return mlk_rej_uniform_c(r, target, offset, buf, buflen);
}
#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
- ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + MLK_XOF_RATE) / MLK_XOF_RATE)
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+ ((12 * MLKEM_N / 8 * ((uint32_t)1 << 12) / MLKEM_Q + MLK_XOF_RATE) / \
+ MLK_XOF_RATE)
#endif
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Reference: Does not exist in the reference implementation @[REF].
* - x4-batched version of `rej_uniform()` from the
* reference implementation, leveraging x4-batched Keccak-f1600. */
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
{
/* Temporary buffers for XOF output before rejection sampling */
@@ -167,10 +171,10 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
*/
mlk_xof_x4_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &statex);
buflen = MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE;
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, 0, buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, 0, buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, 0, buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, 0, buf[3], buflen);
/*
* So long as not all matrix entries have been generated, squeeze
@@ -180,20 +184,24 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
ctr[3] < MLKEM_N)
__loop__(
- assigns(ctr, statex, memory_slice(vec, sizeof(mlk_poly) * 4), object_whole(buf[0]),
- object_whole(buf[1]), object_whole(buf[2]), object_whole(buf[3]))
+ assigns(ctr, statex,
+ memory_slice(vec0, sizeof(mlk_poly)),
+ memory_slice(vec1, sizeof(mlk_poly)),
+ memory_slice(vec2, sizeof(mlk_poly)),
+ memory_slice(vec3, sizeof(mlk_poly)),
+ object_whole(buf))
invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
- invariant(array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
- invariant(array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
- invariant(array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
- invariant(array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+ invariant(array_bound(vec0->coeffs, 0, ctr[0], 0, MLKEM_Q))
+ invariant(array_bound(vec1->coeffs, 0, ctr[1], 0, MLKEM_Q))
+ invariant(array_bound(vec2->coeffs, 0, ctr[2], 0, MLKEM_Q))
+ invariant(array_bound(vec3->coeffs, 0, ctr[3], 0, MLKEM_Q)))
{
mlk_xof_x4_squeezeblocks(buf, 1, &statex);
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, ctr[0], buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, ctr[1], buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, ctr[2], buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, ctr[3], buf[3], buflen);
}
mlk_xof_x4_release(&statex);
@@ -202,6 +210,7 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
}
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
MLK_INTERNAL_API
void mlk_poly_rej_uniform(mlk_poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
@@ -284,7 +293,7 @@ void mlk_poly_cbd2(mlk_poly *r, const uint8_t buf[2 * MLKEM_N / 4])
{
const int16_t a = (d >> (4 * j + 0)) & 0x3;
const int16_t b = (d >> (4 * j + 2)) & 0x3;
- r->coeffs[8 * i + j] = a - b;
+ r->coeffs[8 * i + j] = (int16_t)(a - b);
}
}
}
@@ -336,7 +345,7 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4])
{
const int16_t a = (d >> (6 * j + 0)) & 0x7;
const int16_t b = (d >> (6 * j + 3)) & 0x7;
- r->coeffs[4 * i + j] = a - b;
+ r->coeffs[4 * i + j] = (int16_t)(a - b);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sampling.h
index 2cf43c889b..24c26b34a5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sampling.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sampling.h
@@ -15,8 +15,6 @@
#ifndef MLK_SAMPLING_H
#define MLK_SAMPLING_H
-#include <stddef.h>
-#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include "poly.h"
@@ -58,6 +56,7 @@ MLK_INTERNAL_API
void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_ETA1 == 3 */
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#define mlk_poly_rej_uniform_x4 MLK_NAMESPACE(poly_rej_uniform_x4)
/*************************************************
* Name: mlk_poly_rej_uniform_x4
@@ -65,8 +64,8 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
* Description: Generate four polynomials using rejection sampling
* on (pseudo-)uniformly random bytes sampled from a seed.
*
- * Arguments: - mlk_poly *vec:
- * Pointer to an array of 4 polynomials to be sampled.
+ * Arguments: - mlk_poly *vec0, *vec1, *vec2, *vec3:
+ * Pointers to 4 polynomials to be sampled.
* - uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)]:
* Pointer consecutive array of seed buffers of size
* MLKEM_SYMBYTES + 2 each, plus padding for alignment.
@@ -75,16 +74,24 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
*
**************************************************/
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
__contract__(
- requires(memory_no_alias(vec, sizeof(mlk_poly) * 4))
+ requires(memory_no_alias(vec0, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec1, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec2, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, 4 * MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)))
- assigns(memory_slice(vec, sizeof(mlk_poly) * 4))
- ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+ assigns(memory_slice(vec0, sizeof(mlk_poly)))
+ assigns(memory_slice(vec1, sizeof(mlk_poly)))
+ assigns(memory_slice(vec2, sizeof(mlk_poly)))
+ assigns(memory_slice(vec3, sizeof(mlk_poly)))
+ ensures(array_bound(vec0->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec1->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec2->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec3->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
#define mlk_poly_rej_uniform MLK_NAMESPACE(poly_rej_uniform)
/*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/symmetric.h
index 985bfeab37..68d7e1a0cd 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/symmetric.h
@@ -15,12 +15,13 @@
#ifndef MLK_SYMMETRIC_H
#define MLK_SYMMETRIC_H
-#include <stddef.h>
-#include <stdint.h>
+
#include "cbmc.h"
#include "common.h"
#include MLK_FIPS202_HEADER_FILE
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#include MLK_FIPS202X4_HEADER_FILE
+#endif
/* Macros denoting FIPS 203 specific Hash functions */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sys.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sys.h
index 8f690cc553..0ab8947318 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sys.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/sys.h
@@ -20,6 +20,15 @@
#error "__BYTE_ORDER__ defined, but don't recognize value."
#endif
#endif /* __BYTE_ORDER__ */
+
+/* MSVC does not define __BYTE_ORDER__. However, MSVC only supports
+ * little endian x86, x86_64, and AArch64. It is, hence, safe to assume
+ * little endian. */
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || \
+ defined(_M_IX86) || defined(_M_ARM64))
+#define MLK_SYS_LITTLE_ENDIAN
+#endif
+
#endif /* !MLK_SYS_LITTLE_ENDIAN && !MLK_SYS_BIG_ENDIAN */
/* Check if we're running on an AArch64 little endian system. _M_ARM64 is set by
@@ -33,6 +42,11 @@
#define MLK_SYS_AARCH64_EB
#endif
+/* Check if we're running on an Armv8.1-M system with MVE */
+#if defined(__ARM_ARCH_8_1M_MAIN__) || defined(__ARM_FEATURE_MVE)
+#define MLK_SYS_ARMV81M_MVE
+#endif
+
#if defined(__x86_64__)
#define MLK_SYS_X86_64
#if defined(__AVX2__)
@@ -48,6 +62,11 @@
#define MLK_SYS_RISCV64
#endif
+#if defined(MLK_SYS_RISCV64) && defined(__riscv_vector) && \
+ defined(__riscv_v_intrinsic)
+#define MLK_SYS_RISCV64_RVV
+#endif
+
#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 32
#define MLK_SYS_RISCV32
#endif
@@ -56,6 +75,14 @@
#define MLK_SYS_WINDOWS
#endif
+#if defined(__linux__)
+#define MLK_SYS_LINUX
+#endif
+
+#if defined(__APPLE__)
+#define MLK_SYS_APPLE
+#endif
+
#if defined(MLK_FORCE_AARCH64) && !defined(MLK_SYS_AARCH64)
#error "MLK_FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
#endif
@@ -82,34 +109,46 @@
#endif
/*
- * C90 does not have the inline compiler directive yet.
- * We don't use it in C90 builds.
- * However, in that case the compiler warns about some inline functions in
- * header files not being used in every compilation unit that includes that
- * header. To work around it we silence that warning in that case using
- * __attribute__((unused)).
+ * MLK_INLINE: Hint for inlining.
+ * - MSVC: __inline
+ * - C99+: inline
+ * - GCC/Clang C90: __attribute__((unused)) to silence warnings
+ * - Other C90: empty
*/
-
-/* Do not use inline for C90 builds*/
#if !defined(MLK_INLINE)
-#if !defined(inline)
#if defined(_MSC_VER)
#define MLK_INLINE __inline
-/* Don't combine __inline and __forceinline */
-#define MLK_ALWAYS_INLINE __forceinline
-#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#elif defined(inline) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
#define MLK_INLINE inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define MLK_INLINE __attribute__((unused))
+#else
+#define MLK_INLINE
+#endif
+#endif /* !MLK_INLINE */
+
+/*
+ * MLK_ALWAYS_INLINE: Force inlining.
+ * - MSVC: __forceinline
+ * - GCC/Clang C99+: MLK_INLINE __attribute__((always_inline))
+ * - Other: MLK_INLINE (no forced inlining)
+ */
+#if !defined(MLK_ALWAYS_INLINE)
+#if defined(_MSC_VER)
+#define MLK_ALWAYS_INLINE __forceinline
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+ (defined(inline) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L))
#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
#else
-#define MLK_INLINE __attribute__((unused))
#define MLK_ALWAYS_INLINE MLK_INLINE
#endif
+#endif /* !MLK_ALWAYS_INLINE */
-#else /* !inline */
-#define MLK_INLINE inline
-#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
-#endif /* inline */
-#endif /* !MLK_INLINE */
+#ifndef MLK_STATIC_TESTABLE
+#define MLK_STATIC_TESTABLE static
+#endif
/*
* C90 does not have the restrict compiler directive yet.
@@ -181,10 +220,41 @@
} while (0)
#endif /* !(MLK_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__) */
-#if defined(__GNUC__) || defined(clang)
+#if defined(__GNUC__) || defined(__clang__)
#define MLK_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
#else
#define MLK_MUST_CHECK_RETURN_VALUE
#endif
+#if !defined(__ASSEMBLER__)
+/* System capability enumeration */
+typedef enum
+{
+ /* x86_64 */
+ MLK_SYS_CAP_AVX2,
+ /* AArch64 */
+ MLK_SYS_CAP_SHA3
+} mlk_sys_cap;
+
+#if !defined(MLK_CONFIG_CUSTOM_CAPABILITY_FUNC)
+#include "cbmc.h"
+
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_sys_check_capability(mlk_sys_cap cap)
+__contract__(
+ ensures(return_value == 0 || return_value == 1)
+)
+{
+ /* By default, we rely on compile-time feature detection/specification:
+ * If a feature is enabled at compile-time, we assume it is supported by
+ * the host that the resulting library/binary will be built on.
+ * If this assumption is not true, you MUST overwrite this function.
+ * See the documentation of MLK_CONFIG_CUSTOM_CAPABILITY_FUNC in
+ * mlkem_native_config.h for more information. */
+ (void)cap;
+ return 1;
+}
+#endif /* !MLK_CONFIG_CUSTOM_CAPABILITY_FUNC */
+#endif /* !__ASSEMBLER__ */
+
#endif /* !MLK_SYS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/verify.h
index 85626c15ea..a9bdeaab30 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/verify.h
@@ -30,9 +30,7 @@
#ifndef MLK_VERIFY_H
#define MLK_VERIFY_H
-#include <limits.h>
-#include <stdint.h>
-#include <string.h>
+
#include "cbmc.h"
#include "common.h"
@@ -115,92 +113,83 @@ __contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_u8(
static MLK_INLINE uint32_t mlk_value_barrier_u32(uint32_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
static MLK_INLINE int32_t mlk_value_barrier_i32(int32_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
static MLK_INLINE uint8_t mlk_value_barrier_u8(uint8_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
#endif /* MLK_USE_ASM_VALUE_BARRIER */
-/*
- * The ct_cmask_nonzero_xxx functions below make deliberate use of unsigned
- * overflow, which is fully defined behaviour in C. It is thus safe to disable
- * this warning.
- */
#ifdef CBMC
#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
+#pragma CPROVER check disable "conversion"
#endif
-
/*************************************************
- * Name: mlk_ct_cmask_nonzero_u16
+ * Name: mlk_cast_uint16_to_int16
*
- * Description: Return 0 if input is zero, and -1 otherwise.
+ * Description: Cast uint16 value to int16
*
- * Arguments: uint16_t x: Value to be converted into a mask
+ * Returns: For uint16_t x, the unique y in int16_t
+ * so that x == y mod 2^16.
+ *
+ * Concretely:
+ * - x < 32768: returns x
+ * - x >= 32768: returns x - 65536
*
**************************************************/
-
-/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
- * - Use value barrier and shift instead of `b = -b` to
- * convert condition into mask. */
-static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
-__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
{
- uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
- tmp >>= 16;
- return tmp;
+ /*
+ * PORTABILITY: This relies on uint16_t -> int16_t
+ * being implemented as the inverse of int16_t -> uint16_t,
+ * which is implementation-defined (C99 6.3.1.3 (3))
+ * CBMC (correctly) fails to prove this conversion is OK,
+ * so we have to suppress that check here
+ */
+ return (int16_t)x;
}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
/*************************************************
- * Name: mlk_ct_cmask_nonzero_u8
+ * Name: mlk_cast_int32_to_uint16
*
- * Description: Return 0 if input is zero, and -1 otherwise.
- *
- * Arguments: uint8_t x: Value to be converted into a mask
+ * Description: Cast int32 value to uint16 as per C standard.
*
+ * Returns: For int32_t x, the unique y in uint16_t
+ * so that x == y mod 2^16.
**************************************************/
-
-/* Reference: Embedded in `verify()` and `cmov()` in the
- * reference implementation @[REF].
- * - We include a value barrier not present in the
- * reference implementation, to prevent the compiler
- * from realizing that this function returns a mask. */
-static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
-__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int32_to_uint16(int32_t x)
{
- uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
- tmp >>= 24;
- return tmp;
+ return (uint16_t)(x & (int32_t)UINT16_MAX);
}
-/* Put unsigned overflow warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*
- * The mlk_ct_cmask_neg_i16 function below makes deliberate use of
- * signed to unsigned integer conversion, which is fully defined
- * behaviour in C. It is thus safe to disable this warning.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
+/*************************************************
+ * Name: mlk_cast_int16_to_uint16
+ *
+ * Description: Cast int16 value to uint16 as per C standard.
+ *
+ * Returns: For int16_t x, the unique y in uint16_t
+ * so that x == y mod 2^16.
+ **************************************************/
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int16_to_uint16(int16_t x)
+{
+ return mlk_cast_int32_to_uint16((int32_t)x);
+}
/*************************************************
* Name: mlk_ct_cmask_neg_i16
@@ -225,24 +214,49 @@ __contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
{
int32_t tmp = mlk_value_barrier_i32((int32_t)x);
tmp >>= 16;
- return (int16_t)tmp;
+ return mlk_cast_int32_to_uint16(tmp);
}
-/* Put unsigned-to-signed warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
+/*************************************************
+ * Name: mlk_ct_cmask_nonzero_u16
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments: uint16_t x: Value to be converted into a mask
+ *
+ **************************************************/
-/*
- * The ct_csel_xxx functions below make deliberate use of unsigned
- * to signed integer conversion, which is implementation-defined
- * behaviour. Here, we assume that uint16_t -> int16_t is inverse
- * to int16_t -> uint16_t.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
+/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
+ * - Use value barrier and shift instead of `b = -b` to
+ * convert condition into mask. */
+static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+{
+ int32_t tmp = mlk_value_barrier_i32(-((int32_t)x));
+ tmp >>= 16;
+ return mlk_cast_int32_to_uint16(tmp);
+}
+
+/*************************************************
+ * Name: mlk_ct_cmask_nonzero_u8
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments: uint8_t x: Value to be converted into a mask
+ *
+ **************************************************/
+
+/* Reference: Embedded in `verify()` and `cmov()` in the
+ * reference implementation @[REF].
+ * - We include a value barrier not present in the
+ * reference implementation, to prevent the compiler
+ * from realizing that this function returns a mask. */
+static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+{
+ uint16_t mask = mlk_ct_cmask_nonzero_u16((uint16_t)x);
+ return (uint8_t)(mask & 0xFF);
+}
/*************************************************
* Name: mlk_ct_sel_int16
@@ -280,16 +294,12 @@ __contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
static MLK_INLINE int16_t mlk_ct_sel_int16(int16_t a, int16_t b, uint16_t cond)
__contract__(ensures(return_value == (cond ? a : b)))
{
- uint16_t au = a, bu = b;
+ uint16_t au = mlk_cast_int16_to_uint16(a);
+ uint16_t bu = mlk_cast_int16_to_uint16(b);
uint16_t res = bu ^ (mlk_ct_cmask_nonzero_u16(cond) & (au ^ bu));
- return (int16_t)res;
+ return mlk_cast_uint16_to_int16(res);
}
-/* Put unsigned-to-signed warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_ct_sel_uint8
*
@@ -318,9 +328,11 @@ __contract__(ensures(return_value == (cond ? a : b)))
*
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
- * size_t len: length of the byte arrays
+ * size_t len: length of the byte arrays, upper-bounded
+ * to UINT16_MAX to control proof complexity
+ * only.
*
- * Returns 0 if the byte arrays are equal, a non-zero value otherwise
+ * Returns 0 if the byte arrays are equal, 0xFF otherwise.
*
* Specification:
* - Used to securely compute conditional move in
@@ -338,9 +350,10 @@ __contract__(ensures(return_value == (cond ? a : b)))
static MLK_INLINE uint8_t mlk_ct_memcmp(const uint8_t *a, const uint8_t *b,
const size_t len)
__contract__(
+ requires(len <= UINT16_MAX)
requires(memory_no_alias(a, len))
requires(memory_no_alias(b, len))
- requires(len <= INT_MAX)
+ ensures((return_value == 0) || (return_value == 0xFF))
ensures((return_value == 0) == forall(i, 0, len, (a[i] == b[i]))))
{
uint8_t r = 0, s = 0;
@@ -391,13 +404,17 @@ __contract__(
static MLK_INLINE void mlk_ct_cmov_zero(uint8_t *r, const uint8_t *x,
size_t len, uint8_t b)
__contract__(
+ requires(len <= MLK_MAX_BUFFER_SIZE)
requires(memory_no_alias(r, len))
requires(memory_no_alias(x, len))
- assigns(memory_slice(r, len)))
+ assigns(memory_slice(r, len))
+ ensures(forall(i, 0, len, (r[i] == (b == 0 ? x[i] : old(r)[i])))))
{
size_t i;
for (i = 0; i < len; i++)
- __loop__(invariant(i <= len))
+ __loop__(
+ invariant(i <= len)
+ invariant(forall(k, 0, i, r[k] == (b == 0 ? x[k] : loop_entry(r)[k]))))
{
r[i] = mlk_ct_sel_uint8(r[i], x[i], b);
}
@@ -431,13 +448,13 @@ __contract__(
requires(memory_no_alias(ptr, len))
assigns(memory_slice(ptr, len)))
{
- memset(ptr, 0, len);
+ mlk_memset(ptr, 0, len);
/* This follows OpenSSL and seems sufficient to prevent the compiler
* from optimizing away the memset.
*
* If there was a reliable way to detect availability of memset_s(),
* that would be preferred. */
- __asm__ __volatile__("" : : "r"(ptr) : "memory");
+ __asm__ volatile("" : : "r"(ptr) : "memory");
}
#else /* !MLK_SYS_WINDOWS && MLK_HAVE_INLINE_ASM */
#error No plausibly-secure implementation of mlk_zeroize available. Please provide your own using MLK_CONFIG_CUSTOM_ZEROIZE.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/zetas.inc b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/zetas.inc
index 0c00b5b905..00316daf67 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/zetas.inc
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/mlkem/src/zetas.inc
@@ -5,16 +5,16 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
-#include <stdint.h>
/*
* Table of zeta values used in the reference NTT and inverse NTT.
* See autogen for details.
*/
-static MLK_ALIGN const int16_t zetas[128] = {
+static MLK_ALIGN const int16_t mlk_zetas[128] = {
-1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577,
182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458,
-1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/integration/liboqs/config_c.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/integration/liboqs/config_c.h
index b546e2686d..9b1eef321a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/integration/liboqs/config_c.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/integration/liboqs/config_c.h
@@ -8,13 +8,24 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*/
#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_C_H
#define MLK_INTEGRATION_LIBOQS_CONFIG_C_H
+/* Enable valgrind-based assertions in mlkem-native through macro
+ * from libOQS. */
+#if !defined(__ASSEMBLER__)
+#include <oqs/common.h>
+#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
+#define MLK_CONFIG_CT_TESTING_ENABLED
+#endif
+#endif /* !__ASSEMBLER__ */
+
+
/******************************************************************************
* Name: MLK_CONFIG_PARAMETER_SET
*
@@ -134,7 +145,7 @@
* consumer.
*
* If this option is not set, mlkem-native expects a function
- * void randombytes(uint8_t *out, size_t outlen).
+ * int randombytes(uint8_t *out, size_t outlen).
*
* Set this option and define `mlk_randombytes` if you want to
* use a custom method to sample randombytes with a different name
@@ -146,9 +157,10 @@
 #include <oqs/rand.h>
 #include <stdint.h>
#include "../../mlkem/src/sys.h"
-static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
+static MLK_INLINE int mlk_randombytes(uint8_t *ptr, size_t len)
{
OQS_randombytes(ptr, len);
+ return 0;
}
#endif /* !__ASSEMBLER__ */
@@ -212,13 +224,4 @@ static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
#endif
*/
-/* Enable valgrind-based assertions in mlkem-native through macro
- * from libOQS. */
-#if !defined(__ASSEMBLER__)
-#include <oqs/common.h>
-#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
-#define MLK_CONFIG_CT_TESTING_ENABLED
-#endif
-#endif /* !__ASSEMBLER__ */
-
#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_C_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/cbmc.h
index 650d32b95b..80e1a36fc7 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/cbmc.h
@@ -8,7 +8,6 @@
/***************************************************
* Basic replacements for __CPROVER_XXX contracts
***************************************************/
-
#ifndef CBMC
#define __contract__(x)
@@ -16,6 +15,7 @@
#else /* !CBMC */
+
#define __contract__(x) x
#define __loop__(x) x
@@ -49,7 +49,6 @@
*/
#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
-#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
/*
* Pointer-related predicates
@@ -59,6 +58,17 @@
#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+/* Maximum supported buffer size
+ *
+ * Larger buffers may be supported, but due to internal modeling constraints
+ * in CBMC, the proofs of memory- and type-safety won't be able to run.
+ *
+ * If you find yourself in need for a buffer size larger than this,
+ * please contact the maintainers, so we can prioritize work to relax
+ * this somewhat artificial bound.
+ */
+#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12)
+
/*
* History variables
* https://diffblue.github.io/cbmc/contracts-history-variables.html
@@ -83,7 +93,7 @@
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate) \
}
-#define EXISTS(qvar, qvar_lb, qvar_ub, predicate) \
+#define exists(qvar, qvar_lb, qvar_ub, predicate) \
__CPROVER_exists \
{ \
unsigned qvar; \
@@ -118,13 +128,35 @@
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
- (((int)(value_lb) <= ((array_var)[(qvar)])) && \
- (((array_var)[(qvar)]) < (int)(value_ub))) \
+ (((int)(value_lb) <= ((array_var)[(qvar)])) && \
+ (((array_var)[(qvar)]) < (int)(value_ub))) \
}
-#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
- array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb), \
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
+ array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb), \
(qvar_ub), (array_var), (value_lb), (value_ub))
+
+#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (int16_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged(array_var, N) \
+ array_unchanged_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
+
+#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (uint64_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged_u64(array_var, N) \
+ array_unchanged_u64_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
/* clang-format on */
/* Wrapper around array_bound operating on absolute values.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/common.h
index 9de9875556..bc4e9ed72c 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/common.h
@@ -5,10 +5,16 @@
#ifndef MLK_COMMON_H
#define MLK_COMMON_H
+#ifndef __ASSEMBLER__
+#include <stdint.h>
+#endif
+
+#define MLK_BUILD_INTERNAL
+
#if defined(MLK_CONFIG_FILE)
#include MLK_CONFIG_FILE
#else
-#include "config.h"
+#include "mlkem_native_config.h"
#endif
#include "params.h"
@@ -28,15 +34,11 @@
#define MLK_EXTERNAL_API MLK_CONFIG_EXTERNAL_API_QUALIFIER
#endif
-#if defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) || \
- defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED)
-#define MLK_MULTILEVEL_BUILD
-#endif
-
#define MLK_CONCAT_(x1, x2) x1##x2
#define MLK_CONCAT(x1, x2) MLK_CONCAT_(x1, x2)
-#if defined(MLK_MULTILEVEL_BUILD)
+#if (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || \
+ defined(MLK_CONFIG_MULTILEVEL_NO_SHARED))
#define MLK_ADD_PARAM_SET(s) MLK_CONCAT(s, MLK_CONFIG_PARAMETER_SET)
#else
#define MLK_ADD_PARAM_SET(s) s
@@ -49,7 +51,7 @@
/* Functions are prefixed by MLK_CONFIG_NAMESPACE_PREFIX.
*
* If multiple parameter sets are used, functions depending on the parameter
- * set are additionally prefixed with 512/768/1024. See config.h.
+ * set are additionally prefixed with 512/768/1024. See mlkem_native_config.h.
*
* Example: If MLK_CONFIG_NAMESPACE_PREFIX is mlkem, then
* MLK_NAMESPACE_K(enc) becomes mlkem512_enc/mlkem768_enc/mlkem1024_enc.
@@ -73,8 +75,24 @@
*/
#if defined(MLK_SYS_X86_64)
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) : MLK_CET_ENDBR
-#else
+#elif defined(MLK_SYS_ARMV81M_MVE)
+/* clang-format off */
+#define MLK_ASM_FN_SYMBOL(sym) \
+ .type MLK_ASM_NAMESPACE(sym), %function; \
+ MLK_ASM_NAMESPACE(sym) :
+/* clang-format on */
+#else /* !MLK_SYS_X86_64 && MLK_SYS_ARMV81M_MVE */
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) :
+#endif /* !MLK_SYS_X86_64 && !MLK_SYS_ARMV81M_MVE */
+
+/*
+ * Output the size of an assembly function.
+ */
+#if defined(__ELF__)
+#define MLK_ASM_FN_SIZE(sym) \
+ .size MLK_ASM_NAMESPACE(sym), .- MLK_ASM_NAMESPACE(sym)
+#else
+#define MLK_ASM_FN_SIZE(sym)
#endif
/* We aim to simplify the user's life by supporting builds where
@@ -99,6 +117,10 @@
#error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLK_CONFIG_FIPS202_BACKEND_FILE is not.
#endif
+#if defined(MLK_CONFIG_NO_RANDOMIZED_API) && defined(MLK_CONFIG_KEYGEN_PCT)
+#error Bad configuration: MLK_CONFIG_NO_RANDOMIZED_API is incompatible with MLK_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_kem_enc()
+#endif
+
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
#include MLK_CONFIG_ARITH_BACKEND_FILE
/* Include to enforce consistency of API and implementation,
@@ -135,20 +157,118 @@
#define MLK_FIPS202X4_HEADER_FILE MLK_CONFIG_FIPS202X4_CUSTOM_HEADER
#endif
-/* Just in case we want to include mlkem_native.h, set the configuration
- * for that header in accordance with the configuration used here. */
+/* Standard library function replacements */
+#if !defined(__ASSEMBLER__)
+#if !defined(MLK_CONFIG_CUSTOM_MEMCPY)
+#include <string.h>
+#define mlk_memcpy memcpy
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_MEMSET)
+#include <string.h>
+#define mlk_memset memset
+#endif
+
+
+/* Allocation macros for large local structures
+ *
+ * MLK_ALLOC(v, T, N) declares T *v and attempts to point it to an T[N]
+ * MLK_FREE(v, T, N) zeroizes and frees the allocation
+ *
+ * Default implementation uses stack allocation.
+ * Can be overridden by setting the config option MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * and defining MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE.
+ */
+#if defined(MLK_CONFIG_CUSTOM_ALLOC_FREE) != \
+ (defined(MLK_CUSTOM_ALLOC) && defined(MLK_CUSTOM_FREE))
+#error Bad configuration: MLK_CONFIG_CUSTOM_ALLOC_FREE must be set together with MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE
+#endif
+
+/*
+ * If the integration wants to provide a context parameter for use in
+ * platform-specific hooks, then it should define this parameter.
+ *
+ * The MLK_CONTEXT_PARAMETERS_n macros are intended to be used with macros
+ * defining the function names and expand to either pass or discard the context
+ * argument as required by the current build. If there is no context parameter
+ * requested then these are removed from the prototypes and from all calls.
+ */
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+#define MLK_CONTEXT_PARAMETERS_0(context) (context)
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0, context)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1, context)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) \
+ (arg0, arg1, arg2, context)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3, context)
+#else /* MLK_CONFIG_CONTEXT_PARAMETER */
+#define MLK_CONTEXT_PARAMETERS_0(context) ()
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) (arg0, arg1, arg2)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3)
+#endif /* !MLK_CONFIG_CONTEXT_PARAMETER */
+
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER_TYPE) != \
+ defined(MLK_CONFIG_CONTEXT_PARAMETER)
+#error MLK_CONFIG_CONTEXT_PARAMETER_TYPE must be defined if and only if MLK_CONFIG_CONTEXT_PARAMETER is defined
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_ALLOC_FREE)
+/* Default: stack allocation */
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_ALIGN T mlk_alloc_##v[N]; \
+ T *v = mlk_alloc_##v
+
+/* TODO: This leads to a circular dependency between common and verify.h
+ * It just works out before we're at the end of the file, but it's still
+ * prone to issues in the future. */
+#include "verify.h"
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ mlk_zeroize(mlk_alloc_##v, sizeof(mlk_alloc_##v)); \
+ (v) = NULL; \
+ } while (0)
+
+#else /* !MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/* Custom allocation */
+
+/*
+ * The indirection here is necessary to use MLK_CONTEXT_PARAMETERS_3 here.
+ */
+#define MLK_APPLY(f, args) f args
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_APPLY(MLK_CUSTOM_ALLOC, MLK_CONTEXT_PARAMETERS_3(v, T, N, context))
+
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ if (v != NULL) \
+ { \
+ mlk_zeroize(v, sizeof(T) * (N)); \
+ MLK_APPLY(MLK_CUSTOM_FREE, MLK_CONTEXT_PARAMETERS_3(v, T, N, context)); \
+ v = NULL; \
+ } \
+ } while (0)
+
+#endif /* MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/****************************** Error codes ***********************************/
-/* Double-check that this is not conflicting with pre-existing definitions. */
-#if defined(MLK_CONFIG_API_PARAMETER_SET) || \
- defined(MLK_CONFIG_API_NAMESPACE_PREFIX) || \
- defined(MLK_CONFIG_API_NO_SUPERCOP) || \
- defined(MLK_CONFIG_API_CONSTANTS_ONLY)
-#error Pre-existing MLK_CONFIG_API_XXX configuration is neither useful nor allowed during an mlkem-native build
-#endif /* MLK_CONFIG_API_PARAMETER_SET || MLK_CONFIG_API_NAMESPACE_PREFIX || \
- MLK_CONFIG_API_NO_SUPERCOP || MLK_CONFIG_API_CONSTANTS_ONLY */
+/* Generic failure condition */
+#define MLK_ERR_FAIL -1
+/* An allocation failed. This can only happen if MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * is defined and the provided MLK_CUSTOM_ALLOC can fail. */
+#define MLK_ERR_OUT_OF_MEMORY -2
+/* An rng failure occured. Might be due to insufficient entropy or
+ * system misconfiguration. */
+#define MLK_ERR_RNG_FAIL -3
-#define MLK_CONFIG_API_PARAMETER_SET MLK_CONFIG_PARAMETER_SET
-#define MLK_CONFIG_API_NAMESPACE_PREFIX \
- MLK_ADD_PARAM_SET(MLK_CONFIG_NAMESPACE_PREFIX)
+#endif /* !__ASSEMBLER__ */
#endif /* !MLK_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/compress.c
index d7ff2bbe7a..50da36d0e4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/compress.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/compress.c
@@ -20,24 +20,27 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include <stdint.h>
-#include <string.h>
+
#include "cbmc.h"
#include "compress.h"
#include "debug.h"
#include "verify.h"
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d4_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -55,32 +58,51 @@ void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
t[j] = mlk_scalar_compress_d4(a->coeffs[8 * i + j]);
}
- r[i * 4] = t[0] | (t[1] << 4);
- r[i * 4 + 1] = t[2] | (t[3] << 4);
- r[i * 4 + 2] = t[4] | (t[5] << 4);
- r[i * 4 + 3] = t[6] | (t[7] << 4);
+ /* All t[i] are 4-bit wide, so the truncations don't alter the value. */
+ r[i * 4] = (uint8_t)(t[0] | (t[1] << 4));
+ r[i * 4 + 1] = (uint8_t)(t[2] | (t[3] << 4));
+ r[i * 4 + 2] = (uint8_t)(t[4] | (t[5] << 4));
+ r[i * 4 + 3] = (uint8_t)(t[6] | (t[7] << 4));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d4_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d4_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ mlk_poly_compress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d10_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -101,29 +123,47 @@ void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 10-bit in size.
*/
- r[5 * j + 0] = (t[0] >> 0) & 0xFF;
- r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
- r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
- r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
- r[5 * j + 4] = (t[3] >> 2);
+ r[5 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 2) & 0xFF));
+ r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] << 4) & 0xFF));
+ r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] << 6) & 0xFF));
+ r[5 * j + 4] = (uint8_t)(t[3] >> 2);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d10_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d10_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ mlk_poly_compress_d10_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d4(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d4_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -137,22 +177,40 @@ void mlk_poly_decompress_d4(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d4(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d4_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ int ret;
+ ret = mlk_poly_decompress_d4_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ mlk_poly_decompress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d10(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d10_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 4; j++)
@@ -180,28 +238,46 @@ void mlk_poly_decompress_d10(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d10(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d10_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ int ret;
+ ret = mlk_poly_decompress_d10_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
+ mlk_poly_decompress_d10_c(r, a);
+}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d5_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -219,38 +295,51 @@ void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
t[j] = mlk_scalar_compress_d5(a->coeffs[8 * i + j]);
}
- /*
- * Explicitly truncate to avoid warning about
- * implicit truncation in CBMC, and use array indexing into
- * r rather than pointer-arithmetic to simplify verification
- */
- r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
- r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
- r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
- r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+ r[i * 5] = (uint8_t)(0xFF & ((t[0] >> 0) | (t[1] << 5)));
+ r[i * 5 + 1] = (uint8_t)(0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)));
+ r[i * 5 + 2] = (uint8_t)(0xFF & ((t[3] >> 1) | (t[4] << 4)));
+ r[i * 5 + 3] = (uint8_t)(0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)));
+ r[i * 5 + 4] = (uint8_t)(0xFF & ((t[6] >> 2) | (t[7] << 3)));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d5_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d5_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ mlk_poly_compress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d11_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -272,35 +361,53 @@ void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 11-bit in size.
*/
- r[11 * j + 0] = (t[0] >> 0) & 0xFF;
- r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
- r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
- r[11 * j + 3] = (t[2] >> 2) & 0xFF;
- r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
- r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
- r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
- r[11 * j + 7] = (t[5] >> 1) & 0xFF;
- r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
- r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
- r[11 * j + 10] = (t[7] >> 3);
+ r[11 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 3) & 0xFF));
+ r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] << 6) & 0xFF));
+ r[11 * j + 3] = (uint8_t)((t[2] >> 2) & 0xFF);
+ r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] << 1) & 0xFF));
+ r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] << 4) & 0xFF));
+ r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] << 7) & 0xFF));
+ r[11 * j + 7] = (uint8_t)((t[5] >> 1) & 0xFF);
+ r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] << 2) & 0xFF));
+ r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] << 5) & 0xFF));
+ r[11 * j + 10] = (uint8_t)(t[7] >> 3);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d11_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d11_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ mlk_poly_compress_d11_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d5(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d5_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 8; i++)
@@ -342,22 +449,40 @@ void mlk_poly_decompress_d5(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d5(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d5_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ int ret;
+ ret = mlk_poly_decompress_d5_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ mlk_poly_decompress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d11(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d11_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 8; j++)
@@ -390,26 +515,45 @@ void mlk_poly_decompress_d11(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d11(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d11_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ int ret;
+ ret = mlk_poly_decompress_d11_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+ mlk_poly_decompress_d11_c(r, a);
+}
+
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-#if !defined(MLK_USE_NATIVE_POLY_TOBYTES)
/* Reference: `poly_tobytes()` in the reference implementation @[REF].
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_tobytes_c(uint8_t r[MLKEM_POLYBYTES],
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYBYTES))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -417,8 +561,10 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
- const uint16_t t0 = a->coeffs[2 * i];
- const uint16_t t1 = a->coeffs[2 * i + 1];
+ /* The conversion to uint16_t is safe since we assume that
+ * the coefficients of `a` are non-negative. */
+ const uint16_t t0 = (uint16_t)a->coeffs[2 * i];
+ const uint16_t t1 = (uint16_t)a->coeffs[2 * i + 1];
/*
* t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
* significant data, so these can be packed into 24 bits or exactly
@@ -426,32 +572,48 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
*/
/* Least significant bits 0 - 7 of t0. */
- r[3 * i + 0] = t0 & 0xFF;
+ r[3 * i + 0] = (uint8_t)(t0 & 0xFF);
/*
* Most significant bits 8 - 11 of t0 become the least significant
* nibble of the second byte. The least significant 4 bits
* of t1 become the upper nibble of the second byte.
+ *
+ * The conversion to uint8_t does not alter the value.
*/
- r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+ r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 << 4) & 0xF0));
- /* Bits 4 - 11 of t1 become the third byte. */
- r[3 * i + 2] = t1 >> 4;
+ /* Bits 4 - 11 of t1 become the third byte. The conversion to uint8_t
+ * does not alter the value because t1 is 12-bit wide. */
+ r[3 * i + 2] = (uint8_t)(t1 >> 4);
}
}
-#else /* !MLK_USE_NATIVE_POLY_TOBYTES */
+
MLK_INTERNAL_API
void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
{
+#if defined(MLK_USE_NATIVE_POLY_TOBYTES)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_tobytes_native(r, a->coeffs);
-}
+ ret = mlk_poly_tobytes_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
-#if !defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ mlk_poly_tobytes_c(r, a);
+}
+
/* Reference: `poly_frombytes()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
+MLK_STATIC_TESTABLE void mlk_poly_frombytes_c(mlk_poly *r,
+ const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+ requires(memory_no_alias(a, MLKEM_POLYBYTES))
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -462,21 +624,29 @@ void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
const uint8_t t0 = a[3 * i + 0];
const uint8_t t1 = a[3 * i + 1];
const uint8_t t2 = a[3 * i + 2];
- r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
- r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+ r->coeffs[2 * i + 0] = (int16_t)(t0 | ((t1 << 8) & 0xFFF));
+ r->coeffs[2 * i + 1] = (int16_t)((t1 >> 4) | (t2 << 4));
}
/* Note that the coefficients are not canonical */
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
-#else /* !MLK_USE_NATIVE_POLY_FROMBYTES */
+
MLK_INTERNAL_API
void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
{
- mlk_poly_frombytes_native(r->coeffs, a);
-}
+#if defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ int ret;
+ ret = mlk_poly_frombytes_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
+ mlk_poly_frombytes_c(r, a);
+}
+
/* Reference: `poly_frommsg()` in the reference implementation @[REF].
* - We use a value barrier around the bit-selection mask to
* reduce the risk of compiler-introduced branches.
@@ -506,7 +676,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
* as per @[FIPS203, Eq (4.8)]. */
/* Prevent the compiler from recognizing this as a bit selection */
- uint8_t mask = mlk_value_barrier_u8(1u << j);
+ uint8_t mask = mlk_value_barrier_u8((uint8_t)(1u << j));
r->coeffs[8 * i + j] = mlk_ct_sel_int16(MLKEM_Q_HALF, 0, msg[i] & mask);
}
}
@@ -535,7 +705,7 @@ void mlk_poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const mlk_poly *a)
invariant(i <= MLKEM_N / 8 && j <= 8))
{
uint32_t t = mlk_scalar_compress_d1(a->coeffs[8 * i + j]);
- msg[i] |= t << j;
+ msg[i] |= (uint8_t)(t << j);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/compress.h
index f0789d42d6..b16b0889b5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/compress.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/compress.h
@@ -20,8 +20,7 @@
#ifndef MLK_COMPRESS_H
#define MLK_COMPRESS_H
-#include <stdint.h>
-#include <stddef.h>
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -50,9 +49,9 @@
#endif
/* Reference: Part of poly_tomsg() in the reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d1(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d1(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 2)
ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) )
{
@@ -65,7 +64,8 @@ __contract__(
*/
/* check-magic: 1290168 == 2*round(2^31 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290168;
- return (d0 + (1u << 30)) >> 31;
+ /* Unsigned shifting by 31 positions leaves only the top bit. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 30)) >> 31);
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -93,9 +93,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d4(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d4(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 16)
ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
{
@@ -108,7 +108,8 @@ __contract__(
*/
/* check-magic: 1290160 == 16 * round(2^28 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290160;
- return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */
+ /* The return value is < 16, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 27)) >> 28); /* round(d0/2^28) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -128,11 +129,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d4(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d4(uint8_t u)
__contract__(
requires(0 <= u && u < 16)
ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) >> 4; }
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 8) >> 4);
+}
/************************************************************
* Name: mlk_scalar_compress_d5
@@ -156,9 +162,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d5(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d5(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 32)
ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) )
{
@@ -171,7 +177,8 @@ __contract__(
*/
/* check-magic: 1290176 == 2^5 * round(2^27 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290176;
- return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */
+ /* The return value is < 32, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 26)) >> 27); /* round(d0/2^27) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -191,11 +198,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d5(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d5(uint8_t u)
__contract__(
requires(0 <= u && u < 32)
- ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) >> 5; }
+ ensures(0 <= return_value && return_value <= MLKEM_Q - 1)
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 16) >> 5);
+}
/************************************************************
* Name: mlk_scalar_compress_d10
@@ -219,9 +231,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d10(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d10(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 10))
ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
{
@@ -255,11 +267,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d10(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d10(uint16_t u)
__contract__(
requires(0 <= u && u < 1024)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) >> 10; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 512) >> 10);
+}
/************************************************************
* Name: mlk_scalar_compress_d11
@@ -283,9 +300,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d11(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d11(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 11))
ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
{
@@ -319,11 +336,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d11(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d11(uint16_t u)
__contract__(
requires(0 <= u && u < 2048)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) >> 11; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 1024) >> 11);
+}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
#define mlk_poly_compress_d4 MLK_NAMESPACE(poly_compress_d4)
@@ -575,7 +597,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
);
@@ -631,7 +653,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
__contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
);
@@ -660,7 +682,7 @@ __contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(msg))
+ assigns(memory_slice(msg, MLKEM_INDCPA_MSGBYTES))
);
#endif /* !MLK_COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/debug.h
index 01f7c88ccf..47c864bd36 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/debug.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/debug.h
@@ -7,7 +7,6 @@
#include "common.h"
#if defined(MLKEM_DEBUG)
-#include <stdint.h>
/*************************************************
* Name: mlk_assert
@@ -89,14 +88,14 @@ void mlk_debug_check_bounds(const char *file, int line, const int16_t *ptr,
/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
* just use a single flattened array_bound(...) here. */
-#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
- cassert(forall(kN, 0, (M), \
- array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
+ cassert(forall(kN, 0, (M), \
+ array_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_lb), (value_ub))))
-#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
- cassert(forall(kN, 0, (M), \
- array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
+ cassert(forall(kN, 0, (M), \
+ array_abs_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_abs_bd))))
#else /* !MLKEM_DEBUG && CBMC */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/indcpa.c
index 85d4f595a9..e03b16c38b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/indcpa.c
@@ -17,15 +17,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
+#include "indcpa.h"
-#include "cbmc.h"
#include "debug.h"
-#include "indcpa.h"
-#include "poly.h"
-#include "poly_k.h"
#include "randombytes.h"
#include "sampling.h"
#include "symmetric.h"
@@ -41,6 +35,10 @@
#define mlk_pack_ciphertext MLK_ADD_PARAM_SET(mlk_pack_ciphertext)
#define mlk_unpack_ciphertext MLK_ADD_PARAM_SET(mlk_unpack_ciphertext)
#define mlk_matvec_mul MLK_ADD_PARAM_SET(mlk_matvec_mul)
+#define mlk_polyvec_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polyvec_permute_bitrev_to_custom)
+#define mlk_polymat_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polymat_permute_bitrev_to_custom)
/* End of parameter set namespacing */
/*************************************************
@@ -59,12 +57,13 @@
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L19]
*
**************************************************/
-static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
+static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const mlk_polyvec *pk,
const uint8_t seed[MLKEM_SYMBYTES])
{
- mlk_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(pk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, pk);
- memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
}
/*************************************************
@@ -83,11 +82,11 @@ static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L2-3]
*
**************************************************/
-static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
+static void mlk_unpack_pk(mlk_polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
{
mlk_polyvec_frombytes(pk, packedpk);
- memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
/* NOTE: If a modulus check was conducted on the PK, we know at this
* point that the coefficients of `pk` are unsigned canonical. The
@@ -108,9 +107,10 @@ static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L20]
*
**************************************************/
-static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
+static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES],
+ const mlk_polyvec *sk)
{
- mlk_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(sk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, sk);
}
@@ -128,7 +128,7 @@ static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L5]
*
**************************************************/
-static void mlk_unpack_sk(mlk_polyvec sk,
+static void mlk_unpack_sk(mlk_polyvec *sk,
const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
{
mlk_polyvec_frombytes(sk, packedsk);
@@ -149,8 +149,8 @@ static void mlk_unpack_sk(mlk_polyvec sk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22-23]
*
**************************************************/
-static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
- mlk_poly *v)
+static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES],
+ const mlk_polyvec *b, mlk_poly *v)
{
mlk_polyvec_compress_du(r, b);
mlk_poly_compress_dv(r + MLKEM_POLYVECCOMPRESSEDBYTES_DU, v);
@@ -170,28 +170,69 @@ static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L1-4]
*
**************************************************/
-static void mlk_unpack_ciphertext(mlk_polyvec b, mlk_poly *v,
+static void mlk_unpack_ciphertext(mlk_polyvec *b, mlk_poly *v,
const uint8_t c[MLKEM_INDCPA_BYTES])
{
mlk_polyvec_decompress_du(b, c);
mlk_poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
}
-#if !defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
-/* This namespacing is not done at the top to avoid a naming conflict
- * with native backends, which are currently not yet namespaced. */
-#define mlk_poly_permute_bitrev_to_custom \
- MLK_ADD_PARAM_SET(mlk_poly_permute_bitrev_to_custom)
-
-static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
+/* Helper function to ensure that the polynomial entries in the output
+ * of gen_matrix use the standard (bitreversed) ordering of coefficients.
+ * No-op unless a native backend with a custom ordering is used.
+ *
+ * We don't inline this into gen_matrix to avoid having to split the CBMC
+ * proof for gen_matrix based on MLK_USE_NATIVE_NTT_CUSTOM_ORDER. */
+static void mlk_polyvec_permute_bitrev_to_custom(mlk_polyvec *v)
__contract__(
/* We don't specify that this should be a permutation, but only
* that it does not change the bound established at the end of mlk_gen_matrix. */
- requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
- requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(memory_slice(data, sizeof(mlk_poly)))
- ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+ requires(memory_no_alias(v, sizeof(mlk_polyvec)))
+ requires(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(v, sizeof(mlk_polyvec)))
+ ensures(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+{
+#if defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(v, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ {
+ mlk_poly_permute_bitrev_to_custom(v->vec[i].coeffs);
+ }
+#else /* MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+ /* Nothing to do */
+ (void)v;
#endif /* !MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+static void mlk_polymat_permute_bitrev_to_custom(mlk_polymat *a)
+__contract__(
+ /* We don't specify that this should be a permutation, but only
+ * that it does not change the bound established at the end of mlk_gen_matrix. */
+ requires(memory_no_alias(a, sizeof(mlk_polymat)))
+ requires(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+{
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(a, sizeof(mlk_polymat)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+ {
+ mlk_polyvec_permute_bitrev_to_custom(&a->vec[i]);
+ }
+}
/* Reference: `gen_matrix()` in the reference implementation @[REF].
* - We use a special subroutine to generate 4 polynomials
@@ -201,32 +242,27 @@ __contract__(
*
* Not static for benchmarking */
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
{
unsigned i, j;
- /*
- * We generate four separate seed arrays rather than a single one to work
- * around limitations in CBMC function contracts dealing with disjoint slices
- * of the same parent object.
- */
-
MLK_ALIGN uint8_t seed_ext[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)];
for (j = 0; j < 4; j++)
{
- memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
}
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Sample 4 matrix entries a time. */
for (i = 0; i < (MLKEM_K * MLKEM_K / 4) * 4; i += 4)
{
- uint8_t x, y;
-
for (j = 0; j < 4; j++)
{
- x = (i + j) / MLKEM_K;
- y = (i + j) % MLKEM_K;
+ uint8_t x, y;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)((i + j) / MLKEM_K);
+ y = (uint8_t)((i + j) % MLKEM_K);
if (transposed)
{
seed_ext[j][MLKEM_SYMBYTES + 0] = x;
@@ -239,19 +275,26 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
}
}
- /*
- * This call writes across mlk_polyvec boundaries for K=2 and K=3.
- * This is intentional and safe.
- */
- mlk_poly_rej_uniform_x4(&a[i], seed_ext);
+ mlk_poly_rej_uniform_x4(&a->vec[i / MLKEM_K].vec[i % MLKEM_K],
+ &a->vec[(i + 1) / MLKEM_K].vec[(i + 1) % MLKEM_K],
+ &a->vec[(i + 2) / MLKEM_K].vec[(i + 2) % MLKEM_K],
+ &a->vec[(i + 3) / MLKEM_K].vec[(i + 3) % MLKEM_K],
+ seed_ext);
}
-
- /* For MLKEM_K == 3, sample the last entry individually. */
- if (i < MLKEM_K * MLKEM_K)
+#else /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
+ /* When using serial FIPS202, sample all entries individually. */
+ i = 0;
+#endif /* MLK_CONFIG_SERIAL_FIPS202_ONLY */
+
+ /* For MLKEM_K == 3, sample the last entry individually.
+ * When MLK_CONFIG_SERIAL_FIPS202_ONLY is set, sample all entries
+ * individually. */
+ for (; i < MLKEM_K * MLKEM_K; i++)
{
uint8_t x, y;
- x = i / MLKEM_K;
- y = i % MLKEM_K;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)(i / MLKEM_K);
+ y = (uint8_t)(i % MLKEM_K);
if (transposed)
{
@@ -264,8 +307,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
seed_ext[0][MLKEM_SYMBYTES + 1] = x;
}
- mlk_poly_rej_uniform(&a[i], seed_ext[0]);
- i++;
+ mlk_poly_rej_uniform(&a->vec[i / MLKEM_K].vec[i % MLKEM_K], seed_ext[0]);
}
mlk_assert(i == MLKEM_K * MLKEM_K);
@@ -274,10 +316,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* The public matrix is generated in NTT domain. If the native backend
* uses a custom order in NTT domain, permute A accordingly.
*/
- for (i = 0; i < MLKEM_K * MLKEM_K; i++)
- {
- mlk_poly_permute_bitrev_to_custom(a[i].coeffs);
- }
+ mlk_polymat_permute_bitrev_to_custom(a);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -301,24 +340,25 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* Specification: Implements @[FIPS203, Section 2.4.7, Eq (2.12), (2.13)]
*
**************************************************/
-static void mlk_matvec_mul(mlk_polyvec out, const mlk_polymat a,
- const mlk_polyvec v, const mlk_polyvec_mulcache vc)
+static void mlk_matvec_mul(mlk_polyvec *out, const mlk_polymat *a,
+ const mlk_polyvec *v, const mlk_polyvec_mulcache *vc)
__contract__(
requires(memory_no_alias(out, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(v, sizeof(mlk_polyvec)))
requires(memory_no_alias(vc, sizeof(mlk_polyvec_mulcache)))
- requires(forall(k0, 0, MLKEM_K * MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(out)))
+ requires(forall(k0, 0, MLKEM_K,
+ forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k0].vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))))
+ assigns(memory_slice(out, sizeof(mlk_polyvec))))
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
__loop__(
- assigns(i, object_whole(out))
+ assigns(i, memory_slice(out, sizeof(mlk_polyvec)))
invariant(i <= MLKEM_K))
{
- mlk_polyvec_basemul_acc_montgomery_cached(&out[i], &a[MLKEM_K * i], v, vc);
+ mlk_polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a->vec[i], v, vc);
}
}
@@ -331,20 +371,34 @@ __contract__(
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- const uint8_t *publicseed = buf;
- const uint8_t *noiseseed = buf + MLKEM_SYMBYTES;
- mlk_polymat a;
- mlk_polyvec e, pkpv, skpv;
- mlk_polyvec_mulcache skpv_cache;
-
- MLK_ALIGN uint8_t coins_with_domain_separator[MLKEM_SYMBYTES + 1];
+ int ret = 0;
+ const uint8_t *publicseed;
+ const uint8_t *noiseseed;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_ALLOC(a, mlk_polymat, 1, context);
+ MLK_ALLOC(e, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (buf == NULL || coins_with_domain_separator == NULL || a == NULL ||
+ e == NULL || pkpv == NULL || skpv == NULL || skpv_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ publicseed = buf;
+ noiseseed = buf + MLKEM_SYMBYTES;
+
/* Concatenate coins with MLKEM_K for domain separation of security levels */
- memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
mlk_hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
@@ -360,24 +414,24 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_gen_matrix(a, publicseed, 0 /* no transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &e[0], &e[1], noiseseed, 0, 1,
- 2, 3);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &e->vec[0],
+ &e->vec[1], noiseseed, 0, 1, 2, 3);
#elif MLKEM_K == 3
/*
* Only the first three output buffers are needed.
* The laster parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2],
- &pkpv[0] /* irrelevant */, noiseseed, 0, 1, 2,
- 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2], NULL,
+ noiseseed, 0, 1, 2, 0xFF /* irrelevant */);
/* Same here */
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &pkpv[0] /* irrelevant */,
- noiseseed, 3, 4, 5, 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], NULL, noiseseed,
+ 3, 4, 5, 0xFF /* irrelevant */);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2], &skpv[3], noiseseed,
- 0, 1, 2, 3);
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &e[3], noiseseed, 4, 5, 6, 7);
-#endif
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2],
+ &skpv->vec[3], noiseseed, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], &e->vec[3],
+ noiseseed, 4, 5, 6, 7);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(skpv);
mlk_polyvec_ntt(e);
@@ -393,14 +447,17 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_pack_sk(sk, skpv);
mlk_pack_pk(pk, pkpv, publicseed);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(coins_with_domain_separator, sizeof(coins_with_domain_separator));
- mlk_zeroize(a, sizeof(a));
- mlk_zeroize(&e, sizeof(e));
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&skpv_cache, sizeof(skpv_cache));
+ MLK_FREE(skpv_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(e, mlk_polyvec, 1, context);
+ MLK_FREE(a, mlk_polymat, 1, context);
+ MLK_FREE(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_enc()` in the reference implementation @[REF].
@@ -412,19 +469,33 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t seed[MLKEM_SYMBYTES];
- mlk_polymat at;
- mlk_polyvec sp, pkpv, ep, b;
- mlk_poly v, k, epp;
- mlk_polyvec_mulcache sp_cache;
+ int ret = 0;
+ MLK_ALLOC(seed, uint8_t, MLKEM_SYMBYTES, context);
+ MLK_ALLOC(at, mlk_polymat, 1, context);
+ MLK_ALLOC(sp, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(ep, mlk_polyvec, 1, context);
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(k, mlk_poly, 1, context);
+ MLK_ALLOC(epp, mlk_poly, 1, context);
+ MLK_ALLOC(sp_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (seed == NULL || at == NULL || sp == NULL || pkpv == NULL || ep == NULL ||
+ b == NULL || v == NULL || k == NULL || epp == NULL || sp_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_unpack_pk(pkpv, seed, pk);
- mlk_poly_frommsg(&k, m);
+ mlk_poly_frommsg(k, m);
/*
* Declassify the public seed.
@@ -437,87 +508,105 @@ void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
mlk_gen_matrix(at, seed, 1 /* transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1122_4x(&sp[0], &sp[1], &ep[0], &ep[1], coins, 0, 1, 2,
- 3);
- mlk_poly_getnoise_eta2(&epp, coins, 4);
+ mlk_poly_getnoise_eta1122_4x(&sp->vec[0], &sp->vec[1], &ep->vec[0],
+ &ep->vec[1], coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2(epp, coins, 4);
#elif MLKEM_K == 3
/*
* In this call, only the first three output buffers are needed.
* The last parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &b[0], coins, 0, 1, 2,
- 0xFF);
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], NULL, coins,
+ 0, 1, 2, 0xFF /* irrelevant */);
/* The fourth output buffer in this call _is_ used. */
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &epp, coins, 3, 4, 5, 6);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], epp, coins,
+ 3, 4, 5, 6);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &sp[3], coins, 0, 1, 2, 3);
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &ep[3], coins, 4, 5, 6, 7);
- mlk_poly_getnoise_eta2(&epp, coins, 8);
-#endif
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], &sp->vec[3],
+ coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], &ep->vec[3],
+ coins, 4, 5, 6, 7);
+ mlk_poly_getnoise_eta2(epp, coins, 8);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(sp);
mlk_polyvec_mulcache_compute(sp_cache, sp);
mlk_matvec_mul(b, at, sp, sp_cache);
- mlk_polyvec_basemul_acc_montgomery_cached(&v, pkpv, sp, sp_cache);
+ mlk_polyvec_basemul_acc_montgomery_cached(v, pkpv, sp, sp_cache);
mlk_polyvec_invntt_tomont(b);
- mlk_poly_invntt_tomont(&v);
+ mlk_poly_invntt_tomont(v);
mlk_polyvec_add(b, ep);
- mlk_poly_add(&v, &epp);
- mlk_poly_add(&v, &k);
+ mlk_poly_add(v, epp);
+ mlk_poly_add(v, k);
mlk_polyvec_reduce(b);
- mlk_poly_reduce(&v);
+ mlk_poly_reduce(v);
- mlk_pack_ciphertext(c, b, &v);
+ mlk_pack_ciphertext(c, b, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(seed, sizeof(seed));
- mlk_zeroize(&sp, sizeof(sp));
- mlk_zeroize(&sp_cache, sizeof(sp_cache));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(at, sizeof(at));
- mlk_zeroize(&k, sizeof(k));
- mlk_zeroize(&ep, sizeof(ep));
- mlk_zeroize(&epp, sizeof(epp));
+ MLK_FREE(sp_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(epp, mlk_poly, 1, context);
+ MLK_FREE(k, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ MLK_FREE(ep, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(sp, mlk_polyvec, 1, context);
+ MLK_FREE(at, mlk_polymat, 1, context);
+ MLK_FREE(seed, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_dec()` in the reference implementation @[REF].
* - We use a mulcache for the scalar product.
* - We include buffer zeroization. */
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_polyvec b, skpv;
- mlk_poly v, sb;
- mlk_polyvec_mulcache b_cache;
+ int ret = 0;
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(sb, mlk_poly, 1, context);
+ MLK_ALLOC(b_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (b == NULL || skpv == NULL || v == NULL || sb == NULL || b_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- mlk_unpack_ciphertext(b, &v, c);
+ mlk_unpack_ciphertext(b, v, c);
mlk_unpack_sk(skpv, sk);
mlk_polyvec_ntt(b);
mlk_polyvec_mulcache_compute(b_cache, b);
- mlk_polyvec_basemul_acc_montgomery_cached(&sb, skpv, b, b_cache);
- mlk_poly_invntt_tomont(&sb);
+ mlk_polyvec_basemul_acc_montgomery_cached(sb, skpv, b, b_cache);
+ mlk_poly_invntt_tomont(sb);
- mlk_poly_sub(&v, &sb);
- mlk_poly_reduce(&v);
+ mlk_poly_sub(v, sb);
+ mlk_poly_reduce(v);
- mlk_poly_tomsg(m, &v);
+ mlk_poly_tomsg(m, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&b_cache, sizeof(b_cache));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(&sb, sizeof(sb));
+ MLK_FREE(b_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(sb, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
@@ -529,4 +618,5 @@ void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
#undef mlk_pack_ciphertext
#undef mlk_unpack_ciphertext
#undef mlk_matvec_mul
-#undef mlk_poly_permute_bitrev_to_custom
+#undef mlk_polyvec_permute_bitrev_to_custom
+#undef mlk_polymat_permute_bitrev_to_custom
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/indcpa.h
index 4c44d0d411..b31756dcb6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/indcpa.h
@@ -15,7 +15,6 @@
#ifndef MLK_INDCPA_H
#define MLK_INDCPA_H
-#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include "poly_k.h"
@@ -39,18 +38,19 @@
*
**************************************************/
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
requires(transposed == 0 || transposed == 1)
- assigns(object_whole(a))
- ensures(forall(x, 0, MLKEM_K * MLKEM_K,
- array_bound(a[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
);
-#define mlk_indcpa_keypair_derand MLK_NAMESPACE_K(indcpa_keypair_derand)
+#define mlk_indcpa_keypair_derand \
+ MLK_NAMESPACE_K(indcpa_keypair_derand) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_keypair_derand
*
@@ -68,18 +68,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
-#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc)
+#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc) MLK_CONTEXT_PARAMETERS_4
/*************************************************
* Name: mlk_indcpa_enc
*
@@ -100,19 +105,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(c))
+ assigns(memory_slice(c, MLKEM_INDCPA_BYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
-#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec)
+#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_dec
*
@@ -130,14 +139,18 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
- assigns(object_whole(m))
+ assigns(memory_slice(m, MLKEM_INDCPA_MSGBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_INDCPA_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/kem.c
index d6f4e83628..3c82d6df70 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/kem.c
@@ -8,7 +8,8 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*
* - [FIPS203]
@@ -22,12 +23,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
+#include "kem.h"
#include "indcpa.h"
-#include "kem.h"
#include "randombytes.h"
#include "symmetric.h"
#include "verify.h"
@@ -36,44 +34,24 @@
* This is to facilitate building multiple instances
* of mlkem-native (e.g. with varying security levels)
* within a single compilation unit. */
-#define mlk_check_pk MLK_ADD_PARAM_SET(mlk_check_pk)
-#define mlk_check_sk MLK_ADD_PARAM_SET(mlk_check_sk)
-#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct)
+#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct) MLK_CONTEXT_PARAMETERS_2
/* End of parameter set namespacing */
-#if defined(CBMC)
-/* Redeclaration with contract needed for CBMC only */
-int memcmp(const void *str1, const void *str2, size_t n)
-__contract__(
- requires(memory_no_alias(str1, n))
- requires(memory_no_alias(str2, n))
-);
-#endif /* CBMC */
-
-/*************************************************
- * Name: mlk_check_pk
- *
- * Description: Implements modulus check mandated by FIPS 203,
- * i.e., ensures that coefficients are in [0,q-1].
- *
- * Arguments: - const uint8_t *pk: pointer to input public key
- * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
- *
- **************************************************/
-
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- mlk_polyvec p;
- uint8_t p_reencoded[MLKEM_POLYVECBYTES];
+ int ret = 0;
+ MLK_ALLOC(p, mlk_polyvec, 1, context);
+ MLK_ALLOC(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+
+ if (p == NULL || p_reencoded == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_polyvec_frombytes(p, pk);
mlk_polyvec_reduce(p);
@@ -81,39 +59,32 @@ static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
/* We use a constant-time memcmp here to avoid having to
* declassify the PK before the PCT has succeeded. */
- res = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? -1 : 0;
+ ret = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? MLK_ERR_FAIL : 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(p_reencoded, sizeof(p_reencoded));
- mlk_zeroize(&p, sizeof(p));
- return res;
+ MLK_FREE(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+ MLK_FREE(p, mlk_polyvec, 1, context);
+ return ret;
}
-/*************************************************
- * Name: mlk_check_sk
- *
- * Description: Implements public key hash check mandated by FIPS 203,
- * i.e., ensures that
- * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
- *
- * Arguments: - const uint8_t *sk: pointer to input private key
- * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
- *
- **************************************************/
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t test[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(test, uint8_t, MLKEM_SYMBYTES, context);
+
+ if (test == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
/*
* The parts of `sk` being hashed and compared here are public, so
* no public information is leaked through the runtime or the return value
@@ -128,23 +99,32 @@ static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
mlk_hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES,
MLKEM_INDCCA_PUBLICKEYBYTES);
- res = memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, test,
- MLKEM_SYMBYTES)
- ? -1
+ /* This doesn't have to be a constant-time memcmp, but it's the only place
+ * in the library where a normal memcmp would be used otherwise, so for sake
+ * of minimizing stdlib dependency, we use our constant-time one anyway. */
+ ret = mlk_ct_memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ test, MLKEM_SYMBYTES)
+ ? MLK_ERR_FAIL
: 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(test, sizeof(test));
- return res;
+ MLK_FREE(test, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES)));
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
+);
#if defined(MLK_CONFIG_KEYGEN_PCT)
/* Specification:
@@ -152,21 +132,30 @@ __contract__(
* @[FIPS203, Section 7.1, Pairwise Consistency]. */
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES];
- uint8_t ss_enc[MLKEM_SSBYTES], ss_dec[MLKEM_SSBYTES];
+ int ret = 0;
+ MLK_ALLOC(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ MLK_ALLOC(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_ALLOC(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+
+ if (ct == NULL || ss_enc == NULL || ss_dec == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- res = crypto_kem_enc(ct, ss_enc, pk);
- if (res != 0)
+ ret = mlk_kem_enc(ct, ss_enc, pk, context);
+ if (ret != 0)
{
goto cleanup;
}
- res = crypto_kem_dec(ss_dec, ct, sk);
- if (res != 0)
+ ret = mlk_kem_dec(ss_dec, ct, sk, context);
+ if (ret != 0)
{
goto cleanup;
}
@@ -179,26 +168,36 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
}
#endif /* MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST */
- res = mlk_ct_memcmp(ss_enc, ss_dec, sizeof(ss_dec));
+ ret = mlk_ct_memcmp(ss_enc, ss_dec, MLKEM_SSBYTES);
+ /* The result of the PCT is public. */
+ MLK_CT_TESTING_DECLASSIFY(&ret, sizeof(ret));
+
+ if (ret != 0)
+ {
+ ret = MLK_ERR_FAIL;
+ }
cleanup:
- /* The result of the PCT is public. */
- MLK_CT_TESTING_DECLASSIFY(&res, sizeof(res));
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(ct, sizeof(ct));
- mlk_zeroize(ss_enc, sizeof(ss_enc));
- mlk_zeroize(ss_dec, sizeof(ss_dec));
- return res;
+ MLK_FREE(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ return ret;
}
-#else /* MLK_CONFIG_KEYGEN_PCT */
+#else /* MLK_CONFIG_KEYGEN_PCT */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
/* Skip PCT */
((void)pk);
((void)sk);
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER)
+ ((void)context);
+#endif
return 0;
}
#endif /* !MLK_CONFIG_KEYGEN_PCT */
@@ -208,164 +207,240 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
* - We optionally include PCT which is not present in
* the reference code. */
MLK_EXTERNAL_API
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_indcpa_keypair_derand(pk, sk, coins);
- memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ int ret;
+
+ ret = mlk_indcpa_keypair_derand(pk, sk, coins, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
+ mlk_memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_h(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, pk,
MLKEM_INDCCA_PUBLICKEYBYTES);
/* Value z for pseudo-random output on reject */
- memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
/* Declassify public key */
MLK_CT_TESTING_DECLASSIFY(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
/* Pairwise Consistency Test (PCT) @[FIPS140_3_IG, p.87] */
- if (mlk_check_pct(pk, sk))
+ ret = mlk_check_pct(pk, sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- return 0;
+cleanup:
+ if (ret != 0)
+ {
+ mlk_zeroize(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ mlk_zeroize(sk, MLKEM_INDCCA_SECRETKEYBYTES);
+ }
+
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_keypair()` in the reference implementation @[REF]
* - We zeroize the stack buffer */
MLK_EXTERNAL_API
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Acquire necessary randomness, and mark it as secret. */
- mlk_randombytes(coins, 2 * MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (mlk_randombytes(coins, 2 * MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, 2 * MLKEM_SYMBYTES);
- res = crypto_kem_keypair_derand(pk, sk, coins);
+ ret = mlk_kem_keypair_derand(pk, sk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_enc_derand()` in the reference implementation @[REF]
* - We include public key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (buf == NULL || kr == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.2, Modulus check] */
- if (mlk_check_pk(pk))
+ ret = mlk_kem_check_pk(pk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- memcpy(buf, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(buf, coins, MLKEM_SYMBYTES);
/* Multitarget countermeasure for coins + contributory KEM */
mlk_hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
- memcpy(ss, kr, MLKEM_SYMBYTES);
+ mlk_memcpy(ss, kr, MLKEM_SYMBYTES);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
-
- return 0;
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_enc()` in the reference implementation @[REF]
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, MLKEM_SYMBYTES, context);
- mlk_randombytes(coins, MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ if (mlk_randombytes(coins, MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, MLKEM_SYMBYTES);
- res = crypto_kem_enc_derand(ct, ss, pk, coins);
+ ret = mlk_kem_enc_derand(ct, ss, pk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_dec()` in the reference implementation @[REF]
* - We include secret key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
+ int ret = 0;
uint8_t fail;
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
- MLK_ALIGN uint8_t tmp[MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES];
-
const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+
+ if (buf == NULL || kr == NULL || tmp == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.3, Hash check] */
- if (mlk_check_sk(sk))
+ ret = mlk_kem_check_sk(sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- mlk_indcpa_dec(buf, ct, sk);
+ ret = mlk_indcpa_dec(buf, ct, sk, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
/* Multitarget countermeasure for coins + contributory KEM */
- memcpy(buf + MLKEM_SYMBYTES,
- sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(buf + MLKEM_SYMBYTES,
+ sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* Recompute and compare ciphertext */
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
fail = mlk_ct_memcmp(ct, tmp, MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Compute rejection key */
- memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- MLKEM_SYMBYTES);
- memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
- mlk_hash_j(ss, tmp, sizeof(tmp));
+ mlk_memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
+ mlk_memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
+ mlk_hash_j(ss, tmp, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Copy true key to return buffer if fail is 0 */
mlk_ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
- mlk_zeroize(tmp, sizeof(tmp));
+ MLK_FREE(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
- return 0;
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef mlk_check_pk
-#undef mlk_check_sk
#undef mlk_check_pct
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/kem.h
index d3e5f50ce6..0502715c39 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/kem.h
@@ -10,12 +10,16 @@
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ * CRYSTALS-Kyber C reference implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/ref
*/
#ifndef MLK_KEM_H
#define MLK_KEM_H
-#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include "sys.h"
@@ -23,9 +27,7 @@
#if defined(MLK_CHECK_APIS)
/* Include to ensure consistency between internal kem.h
* and external mlkem_native.h. */
-#define MLK_CONFIG_API_NO_SUPERCOP
#include "mlkem_native.h"
-#undef MLK_CONFIG_API_NO_SUPERCOP
#if MLKEM_INDCCA_SECRETKEYBYTES != \
MLKEM_SECRETKEYBYTES(MLK_CONFIG_PARAMETER_SET)
@@ -44,14 +46,79 @@
#endif /* MLK_CHECK_APIS */
-#define crypto_kem_keypair_derand MLK_NAMESPACE_K(keypair_derand)
-#define crypto_kem_keypair MLK_NAMESPACE_K(keypair)
-#define crypto_kem_enc_derand MLK_NAMESPACE_K(enc_derand)
-#define crypto_kem_enc MLK_NAMESPACE_K(enc)
-#define crypto_kem_dec MLK_NAMESPACE_K(dec)
+#define mlk_kem_keypair_derand \
+ MLK_NAMESPACE_K(keypair_derand) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_keypair MLK_NAMESPACE_K(keypair) MLK_CONTEXT_PARAMETERS_2
+#define mlk_kem_enc_derand MLK_NAMESPACE_K(enc_derand) MLK_CONTEXT_PARAMETERS_4
+#define mlk_kem_enc MLK_NAMESPACE_K(enc) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_dec MLK_NAMESPACE_K(dec) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_check_pk MLK_NAMESPACE_K(check_pk) MLK_CONTEXT_PARAMETERS_1
+#define mlk_kem_check_sk MLK_NAMESPACE_K(check_sk) MLK_CONTEXT_PARAMETERS_1
+
+/*************************************************
+ * Name: mlk_kem_check_pk
+ *
+ * Description: Implements modulus check mandated by FIPS 203,
+ * i.e., ensures that coefficients are in [0,q-1].
+ *
+ * Arguments: - const uint8_t *pk: pointer to input public key
+ * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the modulus check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+
+/*************************************************
+ * Name: mlk_kem_check_sk
+ *
+ * Description: Implements public key hash check mandated by FIPS 203,
+ * i.e., ensures that
+ * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
+ *
+ * Arguments: - const uint8_t *sk: pointer to input private key
+ * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the public key hash check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
/*************************************************
- * Name: crypto_kem_keypair_derand
+ * Name: mlk_kem_keypair_derand
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -67,26 +134,33 @@
* random bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 16, ML-KEM.KeyGen_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_keypair
+ * Name: mlk_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -99,24 +173,32 @@ __contract__(
* bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
*
* Specification: Implements @[FIPS203, Algorithm 19, ML-KEM.KeyGen]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_enc_derand
+ * Name: mlk_kem_enc_derand
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -134,29 +216,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 17, ML-KEM.Encaps_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
/*************************************************
- * Name: crypto_kem_enc
+ * Name: mlk_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -171,27 +258,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
*
* Specification: Implements @[FIPS203, Algorithm 20, ML-KEM.Encaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_dec
+ * Name: mlk_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
@@ -206,22 +300,27 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'hash check' @[FIPS203, Section 7.3]
- * for the secret key fails.
+ * - MLK_ERR_FAIL: If the 'hash check' @[FIPS203, Section 7.3]
+ * for the secret key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 21, ML-KEM.Decaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(ss))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_KEM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/native/api.h
index aea28a3af4..0308f2bd51 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/native/api.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/native/api.h
@@ -17,10 +17,18 @@
* and run sanity checks.
*/
-#include <stdint.h>
#include "../cbmc.h"
#include "../common.h"
+/* Backends must return MLK_NATIVE_FUNC_SUCCESS upon success. */
+#define MLK_NATIVE_FUNC_SUCCESS (0)
+/* Backends may return MLK_NATIVE_FUNC_FALLBACK to signal to the frontend that
+ * the target/parameters are unsupported; typically, this would be because of
+ * dependencies on CPU features not detected on the host CPU. In this case,
+ * the frontend falls back to the default C implementation. */
+#define MLK_NATIVE_FUNC_FALLBACK (-1)
+
+
/* Absolute exclusive upper bound for the output of the inverse NTT
*
* NOTE: This is the same bound as in poly.h and has to be kept
@@ -74,12 +82,16 @@
*
* Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_ntt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_ntt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_NTT */
@@ -140,11 +152,14 @@ __contract__(
*
* Arguments: - uint16_t *a: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_intt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_intt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_INTT */
@@ -156,11 +171,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_reduce_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_reduce_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
@@ -173,11 +191,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_tomont_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tomont_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
@@ -203,13 +224,15 @@ __contract__(
* OUTPUT
* - cache: pointer to multiplication cache
**************************************************/
-static MLK_INLINE void mlk_poly_mulcache_compute_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_mulcache_compute_native(
int16_t cache[MLKEM_N / 2], const int16_t mlk_poly[MLKEM_N])
__contract__(
requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
- assigns(object_whole(cache))
- ensures(array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
+ assigns(memory_slice(cache, sizeof(int16_t) * (MLKEM_N / 2)))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
@@ -234,7 +257,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
__contract__(
@@ -244,6 +268,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 2 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 2 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
@@ -267,7 +292,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
__contract__(
@@ -277,6 +303,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 3 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 3 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
@@ -300,7 +327,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
__contract__(
@@ -310,6 +338,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 4 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 4 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
@@ -324,18 +353,20 @@ __contract__(
*
* Arguments: INPUT:
* - a: const pointer to input polynomial,
- * with each coefficient in the range -Q+1 .. Q-1
+ * with each coefficient in the range 0 .. Q-1
* OUTPUT
* - r: pointer to output byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+ const int16_t a[MLKEM_N])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
);
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
@@ -353,13 +384,15 @@ __contract__(
* - a: const pointer to input byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_frombytes_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_frombytes_native(
int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(a, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
);
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
@@ -381,6 +414,7 @@ __contract__(
* Otherwise, returns non-negative number of sampled 16-bit integers (at most
* len).
**************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
const uint8_t *buf,
unsigned buflen)
@@ -389,8 +423,10 @@ __contract__(
requires(memory_no_alias(r, sizeof(int16_t) * len))
requires(memory_no_alias(buf, buflen))
assigns(memory_slice(r, sizeof(int16_t) * len))
- ensures(return_value == -1 || (0 <= return_value && return_value <= len))
- ensures(return_value != -1 ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> (0 <= return_value && return_value <= len))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
@@ -408,8 +444,15 @@ __contract__(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d4_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d4_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
@@ -425,8 +468,15 @@ static MLK_INLINE void mlk_poly_compress_d4_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d10_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d10_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
@@ -444,8 +494,15 @@ static MLK_INLINE void mlk_poly_compress_d10_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d4_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d4_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
@@ -463,8 +520,15 @@ static MLK_INLINE void mlk_poly_decompress_d4_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d10_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d10_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
@@ -482,8 +546,15 @@ static MLK_INLINE void mlk_poly_decompress_d10_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d5_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d5_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
@@ -499,8 +570,15 @@ static MLK_INLINE void mlk_poly_compress_d5_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d11_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d11_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
@@ -518,8 +596,15 @@ static MLK_INLINE void mlk_poly_compress_d11_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d5_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d5_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
@@ -537,8 +622,15 @@ static MLK_INLINE void mlk_poly_decompress_d5_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d11_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d11_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/native/meta.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/native/meta.h
index f2b9b848b7..4291d629b1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/native/meta.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/native/meta.h
@@ -18,4 +18,8 @@
#include "x86_64/meta.h"
#endif
+#if defined(MLK_SYS_RISCV64_RVV)
+#include "riscv64/meta.h"
+#endif
+
#endif /* !MLK_NATIVE_META_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/params.h
index 3f81bb0e2e..04598539c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/params.h
@@ -5,12 +5,6 @@
#ifndef MLK_PARAMS_H
#define MLK_PARAMS_H
-#if defined(MLK_CONFIG_FILE)
-#include MLK_CONFIG_FILE
-#else
-#include "config.h"
-#endif
-
#if !defined(MLK_CONFIG_PARAMETER_SET)
#error MLK_CONFIG_PARAMETER_SET is not defined
#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly.c
index 40d29948c8..564d5d712b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly.c
@@ -20,8 +20,7 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
-#include
+
#include "cbmc.h"
#include "debug.h"
#include "poly.h"
@@ -29,9 +28,6 @@
#include "symmetric.h"
#include "verify.h"
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT) || \
- !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_fqmul
*
@@ -68,10 +64,7 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_TOMONT || !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE \
- || !MLK_USE_NATIVE_NTT || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_barrett_reduce
*
@@ -107,7 +100,7 @@ __contract__(
* Here, we assume it's sign-preserving "arithmetic" shift right.
* See (C99 6.5.7 (5))
*/
- const int32_t t = (magic * a + (1 << 25)) >> 26;
+ const int32_t t = (magic * a + ((int32_t)1 << 25)) >> 26;
/*
* t is in -10 .. +10, so we need 32-bit math to
@@ -118,12 +111,14 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q_HALF);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_REDUCE || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT)
/* Reference: `poly_tomont()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_tomont(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_tomont_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+)
{
unsigned i;
const int16_t f = 1353; /* check-magic: 1353 == signed_mod(2^32, MLKEM_Q) */
@@ -137,16 +132,23 @@ void mlk_poly_tomont(mlk_poly *r)
mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_TOMONT */
+
MLK_INTERNAL_API
void mlk_poly_tomont(mlk_poly *r)
{
- mlk_poly_tomont_native(r->coeffs);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_TOMONT)
+ int ret;
+ ret = mlk_poly_tomont_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE)
+ mlk_poly_tomont_c(r);
+}
+
/************************************************************
* Name: mlk_scalar_signed_to_unsigned_q
*
@@ -162,7 +164,7 @@ void mlk_poly_tomont(mlk_poly *r)
* - Used here to implement different semantics of `poly_reduce()`;
* see below. in the reference implementation @[REF], this logic is
* part of all compression functions (see `compress.c`). */
-static MLK_INLINE uint16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
+static MLK_INLINE int16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
__contract__(
requires(c > -MLKEM_Q && c < MLKEM_Q)
ensures(return_value >= 0 && return_value < MLKEM_Q)
@@ -170,12 +172,14 @@ __contract__(
{
mlk_assert_abs_bound(&c, 1, MLKEM_Q);
- /* Add Q if c is negative, but in constant time */
- c = mlk_ct_sel_int16(c + MLKEM_Q, c, mlk_ct_cmask_neg_i16(c));
+ /* Add MLKEM_Q if c is negative, but in constant time.
+ *
+ * Note that c + MLKEM_Q does not overflow in int16_t,
+ * so the cast to uint16_t is safe. */
+ c = mlk_ct_sel_int16((int16_t)(c + MLKEM_Q), c, mlk_ct_cmask_neg_i16(c));
- /* and therefore cast to uint16_t is safe. */
mlk_assert_bound(&c, 1, 0, MLKEM_Q);
- return (uint16_t)c;
+ return c;
}
/* Reference: `poly_reduce()` in the reference implementation @[REF]
@@ -185,10 +189,15 @@ __contract__(
* here to go from signed to unsigned representatives.
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
-MLK_INTERNAL_API
-void mlk_poly_reduce(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_reduce_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
+
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
@@ -202,15 +211,23 @@ void mlk_poly_reduce(mlk_poly *r)
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_REDUCE */
+
MLK_INTERNAL_API
void mlk_poly_reduce(mlk_poly *r)
{
- mlk_poly_reduce_native(r->coeffs);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_REDUCE)
+ int ret;
+ ret = mlk_poly_reduce_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
+ mlk_poly_reduce_c(r);
+}
+
/* Reference: `poly_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
@@ -224,7 +241,8 @@ void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] + b->coeffs[i];
+ /* The preconditions imply that the addition stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] + b->coeffs[i]);
}
}
@@ -241,24 +259,24 @@ void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
+ /* The preconditions imply that the subtraction stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] - b->coeffs[i]);
}
}
-/* Include zeta table unless NTT, invNTT and mulcache computation
- * have been replaced by native implementations. */
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
#include "zetas.inc"
-#endif
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
-MLK_INTERNAL_API
-void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_mulcache_compute_c(mlk_poly_mulcache *x,
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 4; i++)
@@ -266,8 +284,11 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
invariant(i <= MLKEM_N / 4)
invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
{
- x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
- x->coeffs[2 * i + 1] = mlk_fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
+ x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], mlk_zetas[64 + i]);
+ /* The values in zeta table are <= MLKEM_Q in absolute value,
+ * so the negation in int16_t is safe. */
+ x->coeffs[2 * i + 1] =
+ mlk_fqmul(a->coeffs[4 * i + 3], (int16_t)(-mlk_zetas[64 + i]));
}
/*
@@ -278,15 +299,22 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
*/
mlk_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
MLK_INTERNAL_API
void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
{
- mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
-}
+#if defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+ int ret;
+ ret = mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-#if !defined(MLK_USE_NATIVE_NTT)
+ mlk_poly_mulcache_compute_c(x, a);
+}
+
/*
* Computes a block CT butterflies with a fixed twiddle factor,
* using Montgomery multiplication.
@@ -316,7 +344,8 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
static void mlk_ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
- unsigned start, unsigned len, int bound)
+ unsigned start, unsigned len,
+ unsigned bound)
__contract__(
requires(start < MLKEM_N)
requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
@@ -346,8 +375,9 @@ __contract__(
{
int16_t t;
t = mlk_fqmul(r[j + len], zeta);
- r[j + len] = r[j] - t;
- r[j] = r[j] + t;
+ /* The precondition implies that the arithmetic does not overflow. */
+ r[j + len] = (int16_t)(r[j] - t);
+ r[j] = (int16_t)(r[j] + t);
}
}
@@ -370,7 +400,7 @@ __contract__(
unsigned start, k, len;
/* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */
k = 1u << (layer - 1);
- len = MLKEM_N >> layer;
+ len = (unsigned)MLKEM_N >> layer;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
invariant(start < MLKEM_N + 2 * len)
@@ -378,7 +408,7 @@ __contract__(
invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
{
- int16_t zeta = zetas[k++];
+ int16_t zeta = mlk_zetas[k++];
mlk_ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
}
}
@@ -395,12 +425,19 @@ __contract__(
/* Reference: `ntt()` in the reference implementation @[REF].
* - Iterate over `layer` instead of `len` in the outer loop
* to simplify computation of zeta index. */
-MLK_INTERNAL_API
-void mlk_poly_ntt(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_ntt_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ requires(array_abs_bound(p->coeffs, 0, MLKEM_N, MLKEM_Q))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_NTT_BOUND))
+)
{
unsigned layer;
int16_t *r;
+
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+
r = p->coeffs;
for (layer = 1; layer <= 7; layer++)
@@ -414,18 +451,24 @@ void mlk_poly_ntt(mlk_poly *p)
/* Check the stronger bound */
mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_NTT */
MLK_INTERNAL_API
void mlk_poly_ntt(mlk_poly *p)
{
+#if defined(MLK_USE_NATIVE_NTT)
+ int ret;
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
- mlk_ntt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
-}
+ ret = mlk_ntt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_NTT */
-#if !defined(MLK_USE_NATIVE_INTT)
+ mlk_poly_ntt_c(p);
+}
+
/* Compute one layer of inverse NTT */
@@ -439,7 +482,7 @@ __contract__(
ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
unsigned start, k, len;
- len = (MLKEM_N >> layer);
+ len = (unsigned)MLKEM_N >> layer;
k = (1u << layer) - 1;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
@@ -449,7 +492,7 @@ __contract__(
invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
{
unsigned j;
- int16_t zeta = zetas[k--];
+ int16_t zeta = mlk_zetas[k--];
for (j = start; j < start + len; j++)
__loop__(
invariant(start <= j && j <= start + len)
@@ -457,8 +500,9 @@ __contract__(
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
int16_t t = r[j];
- r[j] = mlk_barrett_reduce(t + r[j + len]);
- r[j + len] = r[j + len] - t;
+ /* The preconditions imply that the arithmetic does not overflow. */
+ r[j] = mlk_barrett_reduce((int16_t)(t + r[j + len]));
+ r[j + len] = (int16_t)(r[j + len] - t);
r[j + len] = mlk_fqmul(r[j + len], zeta);
}
}
@@ -469,18 +513,22 @@ __contract__(
* while the reference implementation normalizes at
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
-MLK_INTERNAL_API
-void mlk_poly_invntt_tomont(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_invntt_tomont_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
+)
{
+ unsigned j, layer;
+ const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
+ int16_t *r = p->coeffs;
+
/*
* Scale input polynomial to account for Montgomery factor
* and NTT twist. This also brings coefficients down to
* absolute value < MLKEM_Q.
*/
- unsigned j, layer;
- const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
- int16_t *r = p->coeffs;
-
for (j = 0; j < MLKEM_N; j++)
__loop__(
invariant(j <= MLKEM_N)
@@ -500,16 +548,23 @@ void mlk_poly_invntt_tomont(mlk_poly *p)
mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_INTT */
MLK_INTERNAL_API
void mlk_poly_invntt_tomont(mlk_poly *p)
{
- mlk_intt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
-}
+#if defined(MLK_USE_NATIVE_INTT)
+ int ret;
+ ret = mlk_intt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_INTT */
+ mlk_poly_invntt_tomont_c(p);
+}
+
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(mlk_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly.h
index 20fb65e720..587062cce5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly.h
@@ -15,8 +15,7 @@
#ifndef MLK_POLY_H
#define MLK_POLY_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -46,34 +45,6 @@ typedef struct
int16_t coeffs[MLKEM_N >> 1];
} MLK_ALIGN mlk_poly_mulcache;
-/*************************************************
- * Name: mlk_cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- * input x in 0 .. 32767: returns value unchanged
- * input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
-{
- /*
- * PORTABILITY: This relies on uint16_t -> int16_t
- * being implemented as the inverse of int16_t -> uint16_t,
- * which is implementation-defined (C99 6.3.1.3 (3))
- * CBMC (correctly) fails to prove this conversion is OK,
- * so we have to suppress that check here
- */
- return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_montgomery_reduce
*
@@ -90,7 +61,7 @@ static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
static MLK_ALWAYS_INLINE int16_t mlk_montgomery_reduce(int32_t a)
__contract__(
requires(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
- a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
+ a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
/* We don't attempt to express an input-dependent output bound
* as the post-condition here. There are two call-sites for this
* function:
@@ -102,8 +73,8 @@ __contract__(
/* check-magic: 62209 == unsigned_mod(pow(MLKEM_Q, -1, 2^16), 2^16) */
const uint32_t QINV = 62209;
- /* Compute a*q^{-1} mod 2^16 in unsigned representatives */
- const uint16_t a_reduced = a & UINT16_MAX;
+ /* Compute a*q^{-1} mod 2^16 in unsigned representatives. */
+ const uint16_t a_reduced = mlk_cast_int32_to_uint16(a);
const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
/* Lift to signed canonical representative mod 2^16. */
@@ -187,7 +158,7 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_poly)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
);
#define mlk_poly_reduce MLK_NAMESPACE(poly_reduce)
@@ -280,7 +251,7 @@ __contract__(
requires(forall(k0, 0, MLKEM_N, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
requires(forall(k1, 0, MLKEM_N, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
ensures(forall(k, 0, MLKEM_N, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_poly_ntt MLK_NAMESPACE(poly_ntt)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly_k.c
index f15ab96ce7..32b214ee04 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly_k.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly_k.c
@@ -22,12 +22,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include <stdint.h>
-#include <string.h>
+#include "poly_k.h"
-#include "compress.h"
#include "debug.h"
-#include "poly_k.h"
#include "sampling.h"
#include "symmetric.h"
@@ -37,6 +34,8 @@
* within a single compilation unit. */
#define mlk_poly_cbd_eta1 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta1)
#define mlk_poly_cbd_eta2 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta2)
+#define mlk_polyvec_basemul_acc_montgomery_cached_c \
+ MLK_ADD_PARAM_SET(mlk_polyvec_basemul_acc_montgomery_cached_c)
/* End of parameter set namespacing */
/* Reference: `polyvec_compress()` in the reference implementation @[REF]
@@ -46,29 +45,29 @@
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a[i]);
+ mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
}
}
/* Reference: `polyvec_decompress()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_decompress_du(&r[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+ mlk_poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_tobytes()` in the reference implementation @[REF].
@@ -77,41 +76,45 @@ void mlk_polyvec_decompress_du(mlk_polyvec r,
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, MLKEM_POLYVECBYTES))
+ invariant(i <= MLKEM_K)
+ )
{
- mlk_poly_tobytes(r + i * MLKEM_POLYBYTES, &a[i]);
+ mlk_poly_tobytes(&r[i * MLKEM_POLYBYTES], &a->vec[i]);
}
}
/* Reference: `polyvec_frombytes()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_frombytes(&r[i], a + i * MLKEM_POLYBYTES);
+ mlk_poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
/* Reference: `polyvec_ntt()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_ntt(&r[i]);
+ mlk_poly_ntt(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
}
/* Reference: `polyvec_invntt_tomont()` in the reference implementation @[REF].
@@ -120,18 +123,17 @@ void mlk_polyvec_ntt(mlk_polyvec r)
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_invntt_tomont(&r[i]);
+ mlk_poly_invntt_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
}
-#if !defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
/* Reference: `polyvec_basemul_acc_montgomery()` in the
* reference implementation @[REF].
* - We use a multiplication cache ('mulcache') here
@@ -143,13 +145,22 @@ void mlk_polyvec_invntt_tomont(mlk_polyvec r)
* at the end. The reference implementation uses 2 * MLKEM_K
* more modular reductions since it reduces after every modular
* multiplication. */
-MLK_INTERNAL_API
-void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+MLK_STATIC_TESTABLE void mlk_polyvec_basemul_acc_montgomery_cached_c(
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
+ requires(forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
@@ -163,53 +174,59 @@ void mlk_polyvec_basemul_acc_montgomery_cached(
t[1] <= ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
t[1] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768)))
{
- t[0] += (int32_t)a[k].coeffs[2 * i + 1] * b_cache[k].coeffs[i];
- t[0] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i];
- t[1] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i + 1];
- t[1] += (int32_t)a[k].coeffs[2 * i + 1] * b[k].coeffs[2 * i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b_cache->vec[k].coeffs[i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i + 1];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b->vec[k].coeffs[2 * i];
}
r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t[0]);
r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t[1]);
}
}
-#else /* !MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
{
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
- /* Omitting bounds assertion for cache since native implementations may
- * decide not to use a mulcache. Note that the C backend implementation
- * of poly_basemul_montgomery_cached() does still include the check. */
+#if defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+ {
+ int ret;
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
#if MLKEM_K == 2
- mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 3
- mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 4
- mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#endif
-}
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
+ }
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+ mlk_polyvec_basemul_acc_montgomery_cached_c(r, a, b, b_cache);
+}
+
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_mulcache_compute(&x[i], &a[i]);
+ mlk_poly_mulcache_compute(&x->vec[i], &a->vec[i]);
}
}
@@ -221,41 +238,53 @@ void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_reduce(&r[i]);
+ mlk_poly_reduce(&r->vec[i]);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(j0, i, MLKEM_K,
+ forall(k0, 0, MLKEM_N,
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX) &&
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] >= INT16_MIN))))
+ invariant(forall(j2, 0, i,
+ forall(k2, 0, MLKEM_N,
+ (r->vec[j2].coeffs[k2] <= INT16_MAX) &&
+ (r->vec[j2].coeffs[k2] >= INT16_MIN))))
+ )
{
- mlk_poly_add(&r[i], &b[i]);
+ mlk_poly_add(&r->vec[i], &b->vec[i]);
}
}
/* Reference: `polyvec_tomont()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_tomont(&r[i]);
+ mlk_poly_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLKEM_Q);
}
@@ -306,24 +335,41 @@ void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
{
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
+#else
+ mlk_prf_eta1(buf[0], extkey[0]);
+ mlk_prf_eta1(buf[1], extkey[1]);
+ mlk_prf_eta1(buf[2], extkey[2]);
+ if (r3 != NULL)
+ {
+ mlk_prf_eta1(buf[3], extkey[3]);
+ }
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
+
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
mlk_poly_cbd_eta1(r2, buf[2]);
- mlk_poly_cbd_eta1(r3, buf[3]);
+ if (r3 != NULL)
+ {
+ mlk_poly_cbd_eta1(r3, buf[3]);
+ mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+ }
mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
- mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -364,7 +410,7 @@ __contract__(
#endif
}
-/* Reference: `poly_getnoise_eta1()` in the reference implementation @[REF].
+/* Reference: `poly_getnoise_eta2()` in the reference implementation @[REF].
* - We include buffer zeroization. */
MLK_INTERNAL_API
void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
@@ -373,13 +419,13 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
MLK_ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
MLK_ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
- memcpy(extkey, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey, seed, MLKEM_SYMBYTES);
extkey[MLKEM_SYMBYTES] = nonce;
mlk_prf_eta2(buf, extkey);
mlk_poly_cbd_eta2(r, buf);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -391,7 +437,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
#if MLKEM_K == 2
/* Reference: Does not exist in the reference implementation @[REF].
* - This implements a x4-batched version of `poly_getnoise_eta1()`
- * and `poly_getnoise_eta1()` from the reference implementation,
+ * and `poly_getnoise_eta2()` from the reference implementation,
* leveraging batched Keccak-f1600.
* - If a x4-batched Keccak-f1600 is available, we squeeze
* more random data than needed for the eta2 calls, to be
@@ -409,10 +455,10 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
@@ -421,14 +467,16 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
/* On systems with fast batched Keccak, we use 4-fold batched PRF,
* even though that means generating more random data in buf[2] and buf[3]
* than necessary. */
-#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION)
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
#else
mlk_prf_eta1(buf[0], extkey[0]);
mlk_prf_eta1(buf[1], extkey[1]);
mlk_prf_eta2(buf[2], extkey[2]);
mlk_prf_eta2(buf[3], extkey[3]);
-#endif /* FIPS202_X4_DEFAULT_IMPLEMENTATION */
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
@@ -451,3 +499,4 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef mlk_poly_cbd_eta1
#undef mlk_poly_cbd_eta2
+#undef mlk_polyvec_basemul_acc_montgomery_cached_c
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly_k.h
index f7a40ff5f9..9089a8e431 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly_k.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/poly_k.h
@@ -15,7 +15,6 @@
#ifndef MLK_POLY_K_H
#define MLK_POLY_K_H
-#include <stdint.h>
#include "common.h"
#include "compress.h"
#include "poly.h"
@@ -29,9 +28,20 @@
#define mlk_polyvec_mulcache MLK_ADD_PARAM_SET(mlk_polyvec_mulcache)
/* End of parameter set namespacing */
-typedef mlk_poly mlk_polyvec[MLKEM_K];
-typedef mlk_poly mlk_polymat[MLKEM_K * MLKEM_K];
-typedef mlk_poly_mulcache mlk_polyvec_mulcache[MLKEM_K];
+typedef struct
+{
+ mlk_poly vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec;
+
+typedef struct
+{
+ mlk_polyvec vec[MLKEM_K];
+} MLK_ALIGN mlk_polymat;
+
+typedef struct
+{
+ mlk_poly_mulcache vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec_mulcache;
#define mlk_poly_compress_du MLK_NAMESPACE_K(poly_compress_du)
/*************************************************
@@ -131,7 +141,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r)))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DV)))
{
#if MLKEM_DV == 4
mlk_poly_compress_d4(r, a);
@@ -168,7 +178,7 @@ static MLK_INLINE void mlk_poly_decompress_dv(
__contract__(
requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
{
#if MLKEM_DV == 4
@@ -200,13 +210,13 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
);
#define mlk_polyvec_decompress_du MLK_NAMESPACE_K(polyvec_decompress_du)
@@ -228,14 +238,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
__contract__(
requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_tobytes MLK_NAMESPACE_K(polyvec_tobytes)
@@ -256,13 +266,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECBYTES))
);
#define mlk_polyvec_frombytes MLK_NAMESPACE_K(polyvec_frombytes)
@@ -284,13 +294,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
);
#define mlk_polyvec_ntt MLK_NAMESPACE_K(polyvec_ntt)
@@ -313,14 +323,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
- assigns(object_whole(r))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
);
#define mlk_polyvec_invntt_tomont MLK_NAMESPACE_K(polyvec_invntt_tomont)
@@ -344,12 +354,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
);
#define mlk_polyvec_basemul_acc_montgomery_cached \
@@ -380,16 +390,16 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
requires(forall(k1, 0, MLKEM_K,
- array_bound(a[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(r))
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_polyvec_mulcache_compute MLK_NAMESPACE_K(polyvec_mulcache_compute)
@@ -423,11 +433,11 @@ __contract__(
* higher level safety proofs, and thus not part of the spec.
*/
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_polyvec_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_polyvec_mulcache)))
);
#define mlk_polyvec_reduce MLK_NAMESPACE_K(polyvec_reduce)
@@ -436,7 +446,7 @@ __contract__(
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials;
- * for details of the Barrett reduction see comments in reduce.c
+ * for details of the Barrett reduction see comments in poly.c
*
* Arguments: - mlk_polyvec r: pointer to input/output polynomial
*
@@ -453,12 +463,12 @@ __contract__(
* use of mlk_poly_reduce() in the context of (de)serialization.
*/
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_add MLK_NAMESPACE_K(polyvec_add)
@@ -485,17 +495,17 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(forall(j0, 0, MLKEM_K,
forall(k0, 0, MLKEM_N,
- (int32_t)r[j0].coeffs[k0] + b[j0].coeffs[k0] <= INT16_MAX)))
+ (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
requires(forall(j1, 0, MLKEM_K,
forall(k1, 0, MLKEM_N,
- (int32_t)r[j1].coeffs[k1] + b[j1].coeffs[k1] >= INT16_MIN)))
- assigns(object_whole(r))
+ (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
);
#define mlk_polyvec_tomont MLK_NAMESPACE_K(polyvec_tomont)
@@ -514,13 +524,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
assigns(memory_slice(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
);
#define mlk_poly_getnoise_eta1_4x MLK_NAMESPACE_K(poly_getnoise_eta1_4x)
@@ -531,7 +540,8 @@ __contract__(
* and nonces, with output polynomials close to centered binomial
* distribution with parameter MLKEM_ETA1.
*
- * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial
+ * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial. The last
+ * polynomial pointer may be NULL.
* - const uint8_t *seed: pointer to input seed
* (of length MLKEM_SYMBYTES bytes)
* - uint8_t nonce{0,1,2,3}: one-byte input nonce
@@ -555,16 +565,15 @@ __contract__(
requires(memory_no_alias(r0, sizeof(mlk_poly)))
requires(memory_no_alias(r1, sizeof(mlk_poly)))
requires(memory_no_alias(r2, sizeof(mlk_poly)))
- requires(memory_no_alias(r3, sizeof(mlk_poly)))
+ requires(r3 == NULL || memory_no_alias(r3, sizeof(mlk_poly)))
assigns(memory_slice(r0, sizeof(mlk_poly)))
assigns(memory_slice(r1, sizeof(mlk_poly)))
assigns(memory_slice(r2, sizeof(mlk_poly)))
- assigns(memory_slice(r3, sizeof(mlk_poly)))
- ensures(
- array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+ assigns(r3 != NULL: memory_slice(r3, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(r3 != NULL ==> array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
);
#if MLKEM_ETA1 == MLKEM_ETA2
@@ -604,7 +613,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
@@ -640,15 +649,19 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
uint8_t nonce0, uint8_t nonce1,
uint8_t nonce2, uint8_t nonce3)
__contract__(
- requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(mlk_poly)) && memory_no_alias(r2, 2 * sizeof(mlk_poly)) &&
- r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+ requires(memory_no_alias(r0, sizeof(mlk_poly)))
+ requires(memory_no_alias(r1, sizeof(mlk_poly)))
+ requires(memory_no_alias(r2, sizeof(mlk_poly)))
+ requires(memory_no_alias(r3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+ assigns(memory_slice(r0, sizeof(mlk_poly)))
+ assigns(memory_slice(r1, sizeof(mlk_poly)))
+ assigns(memory_slice(r2, sizeof(mlk_poly)))
+ assigns(memory_slice(r3, sizeof(mlk_poly)))
ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+ && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+ && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+ && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/randombytes.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/randombytes.h
index 132d920afb..3e841d26ca 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/randombytes.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/randombytes.h
@@ -5,18 +5,56 @@
#ifndef MLK_RANDOMBYTES_H
#define MLK_RANDOMBYTES_H
-#include <stddef.h>
-#include <stdint.h>
#include "cbmc.h"
#include "common.h"
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
#if !defined(MLK_CONFIG_CUSTOM_RANDOMBYTES)
-void randombytes(uint8_t *out, size_t outlen);
-static MLK_INLINE void mlk_randombytes(uint8_t *out, size_t outlen)
+/*************************************************
+ * Name: randombytes
+ *
+ * Description: Fill a buffer with cryptographically secure random bytes.
+ *
+ * mlkem-native does not provide an implementation of this
+ * function. It must be provided by the consumer.
+ *
+ * To use a custom random byte source with a different name
+ * or signature, set MLK_CONFIG_CUSTOM_RANDOMBYTES and define
+ * mlk_randombytes directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+int randombytes(uint8_t *out, size_t outlen);
+
+/*************************************************
+ * Name: mlk_randombytes
+ *
+ * Description: Internal wrapper around randombytes().
+ *
+ * Fill a buffer with cryptographically secure random bytes.
+ *
+ * This function can be replaced by setting
+ * MLK_CONFIG_CUSTOM_RANDOMBYTES and defining mlk_randombytes
+ * directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_randombytes(uint8_t *out, size_t outlen)
__contract__(
requires(memory_no_alias(out, outlen))
- assigns(memory_slice(out, outlen))) { randombytes(out, outlen); }
+ assigns(memory_slice(out, outlen))) { return randombytes(out, outlen); }
#endif /* !MLK_CONFIG_CUSTOM_RANDOMBYTES */
-
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
#endif /* !MLK_RANDOMBYTES_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sampling.c
index be5d931a79..945d12ed3d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sampling.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sampling.c
@@ -29,9 +29,10 @@
* in that it adds the offset and always expects the base of the
* target buffer. This avoids shifting the buffer base in the
* caller, which appears tricky to reason about. */
-static unsigned mlk_rej_uniform_scalar(int16_t *r, unsigned target,
- unsigned offset, const uint8_t *buf,
- unsigned buflen)
+MLK_STATIC_TESTABLE unsigned mlk_rej_uniform_c(int16_t *r, unsigned target,
+ unsigned offset,
+ const uint8_t *buf,
+ unsigned buflen)
__contract__(
requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
requires(memory_no_alias(r, sizeof(int16_t) * target))
@@ -39,11 +40,10 @@ __contract__(
requires(array_bound(r, 0, offset, 0, MLKEM_Q))
assigns(memory_slice(r, sizeof(int16_t) * target))
ensures(offset <= return_value && return_value <= target)
- ensures(array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
+ ensures(array_bound(r, 0, return_value, 0, MLKEM_Q)))
{
unsigned ctr, pos;
- uint16_t val0, val1;
+ int16_t val0, val1;
mlk_assert_bound(r, offset, 0, MLKEM_Q);
@@ -55,8 +55,8 @@ __contract__(
invariant(offset <= ctr && ctr <= target && pos <= buflen)
invariant(array_bound(r, 0, ctr, 0, MLKEM_Q)))
{
- val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
- val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+ val0 = ((buf[pos + 0] >> 0) | (buf[pos + 1] << 8)) & 0xFFF;
+ val1 = ((buf[pos + 1] >> 4) | (buf[pos + 2] << 4)) & 0xFFF;
pos += 3;
if (val0 < MLKEM_Q)
@@ -93,7 +93,7 @@ __contract__(
* Must be a multiple of 3.
*
* Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 128 is somewhat arbitrary but sufficient for all
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
* uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
*
* Returns the new offset of sampled 16-bit integers, at most target,
@@ -124,8 +124,9 @@ __contract__(
#if defined(MLK_USE_NATIVE_REJ_UNIFORM)
if (offset == 0)
{
- int ret = mlk_rej_uniform_native(r, target, buf, buflen);
- if (ret != -1)
+ int ret;
+ ret = mlk_rej_uniform_native(r, target, buf, buflen);
+ if (ret != MLK_NATIVE_FUNC_FALLBACK)
{
unsigned res = (unsigned)ret;
mlk_assert_bound(r, res, 0, MLKEM_Q);
@@ -134,19 +135,22 @@ __contract__(
}
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
- return mlk_rej_uniform_scalar(r, target, offset, buf, buflen);
+ return mlk_rej_uniform_c(r, target, offset, buf, buflen);
}
#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
- ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + MLK_XOF_RATE) / MLK_XOF_RATE)
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+ ((12 * MLKEM_N / 8 * ((uint32_t)1 << 12) / MLKEM_Q + MLK_XOF_RATE) / \
+ MLK_XOF_RATE)
#endif
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Reference: Does not exist in the reference implementation @[REF].
* - x4-batched version of `rej_uniform()` from the
* reference implementation, leveraging x4-batched Keccak-f1600. */
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
{
/* Temporary buffers for XOF output before rejection sampling */
@@ -167,10 +171,10 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
*/
mlk_xof_x4_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &statex);
buflen = MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE;
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, 0, buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, 0, buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, 0, buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, 0, buf[3], buflen);
/*
* So long as not all matrix entries have been generated, squeeze
@@ -180,20 +184,24 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
ctr[3] < MLKEM_N)
__loop__(
- assigns(ctr, statex, memory_slice(vec, sizeof(mlk_poly) * 4), object_whole(buf[0]),
- object_whole(buf[1]), object_whole(buf[2]), object_whole(buf[3]))
+ assigns(ctr, statex,
+ memory_slice(vec0, sizeof(mlk_poly)),
+ memory_slice(vec1, sizeof(mlk_poly)),
+ memory_slice(vec2, sizeof(mlk_poly)),
+ memory_slice(vec3, sizeof(mlk_poly)),
+ object_whole(buf))
invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
- invariant(array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
- invariant(array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
- invariant(array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
- invariant(array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+ invariant(array_bound(vec0->coeffs, 0, ctr[0], 0, MLKEM_Q))
+ invariant(array_bound(vec1->coeffs, 0, ctr[1], 0, MLKEM_Q))
+ invariant(array_bound(vec2->coeffs, 0, ctr[2], 0, MLKEM_Q))
+ invariant(array_bound(vec3->coeffs, 0, ctr[3], 0, MLKEM_Q)))
{
mlk_xof_x4_squeezeblocks(buf, 1, &statex);
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, ctr[0], buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, ctr[1], buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, ctr[2], buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, ctr[3], buf[3], buflen);
}
mlk_xof_x4_release(&statex);
@@ -202,6 +210,7 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
}
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
MLK_INTERNAL_API
void mlk_poly_rej_uniform(mlk_poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
@@ -284,7 +293,7 @@ void mlk_poly_cbd2(mlk_poly *r, const uint8_t buf[2 * MLKEM_N / 4])
{
const int16_t a = (d >> (4 * j + 0)) & 0x3;
const int16_t b = (d >> (4 * j + 2)) & 0x3;
- r->coeffs[8 * i + j] = a - b;
+ r->coeffs[8 * i + j] = (int16_t)(a - b);
}
}
}
@@ -336,7 +345,7 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4])
{
const int16_t a = (d >> (6 * j + 0)) & 0x7;
const int16_t b = (d >> (6 * j + 3)) & 0x7;
- r->coeffs[4 * i + j] = a - b;
+ r->coeffs[4 * i + j] = (int16_t)(a - b);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sampling.h
index 2cf43c889b..24c26b34a5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sampling.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sampling.h
@@ -15,8 +15,6 @@
#ifndef MLK_SAMPLING_H
#define MLK_SAMPLING_H
-#include
-#include
#include "cbmc.h"
#include "common.h"
#include "poly.h"
@@ -58,6 +56,7 @@ MLK_INTERNAL_API
void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_ETA1 == 3 */
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#define mlk_poly_rej_uniform_x4 MLK_NAMESPACE(poly_rej_uniform_x4)
/*************************************************
* Name: mlk_poly_rej_uniform_x4
@@ -65,8 +64,8 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
* Description: Generate four polynomials using rejection sampling
* on (pseudo-)uniformly random bytes sampled from a seed.
*
- * Arguments: - mlk_poly *vec:
- * Pointer to an array of 4 polynomials to be sampled.
+ * Arguments: - mlk_poly *vec0, *vec1, *vec2, *vec3:
+ * Pointers to 4 polynomials to be sampled.
* - uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)]:
* Pointer consecutive array of seed buffers of size
* MLKEM_SYMBYTES + 2 each, plus padding for alignment.
@@ -75,16 +74,24 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
*
**************************************************/
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
__contract__(
- requires(memory_no_alias(vec, sizeof(mlk_poly) * 4))
+ requires(memory_no_alias(vec0, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec1, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec2, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, 4 * MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)))
- assigns(memory_slice(vec, sizeof(mlk_poly) * 4))
- ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+ assigns(memory_slice(vec0, sizeof(mlk_poly)))
+ assigns(memory_slice(vec1, sizeof(mlk_poly)))
+ assigns(memory_slice(vec2, sizeof(mlk_poly)))
+ assigns(memory_slice(vec3, sizeof(mlk_poly)))
+ ensures(array_bound(vec0->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec1->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec2->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec3->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
#define mlk_poly_rej_uniform MLK_NAMESPACE(poly_rej_uniform)
/*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/symmetric.h
index 985bfeab37..68d7e1a0cd 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/symmetric.h
@@ -15,12 +15,13 @@
#ifndef MLK_SYMMETRIC_H
#define MLK_SYMMETRIC_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include MLK_FIPS202_HEADER_FILE
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#include MLK_FIPS202X4_HEADER_FILE
+#endif
/* Macros denoting FIPS 203 specific Hash functions */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sys.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sys.h
index 8f690cc553..0ab8947318 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sys.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/sys.h
@@ -20,6 +20,15 @@
#error "__BYTE_ORDER__ defined, but don't recognize value."
#endif
#endif /* __BYTE_ORDER__ */
+
+/* MSVC does not define __BYTE_ORDER__. However, MSVC only supports
+ * little endian x86, x86_64, and AArch64. It is, hence, safe to assume
+ * little endian. */
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || \
+ defined(_M_IX86) || defined(_M_ARM64))
+#define MLK_SYS_LITTLE_ENDIAN
+#endif
+
#endif /* !MLK_SYS_LITTLE_ENDIAN && !MLK_SYS_BIG_ENDIAN */
/* Check if we're running on an AArch64 little endian system. _M_ARM64 is set by
@@ -33,6 +42,11 @@
#define MLK_SYS_AARCH64_EB
#endif
+/* Check if we're running on an Armv8.1-M system with MVE */
+#if defined(__ARM_ARCH_8_1M_MAIN__) || defined(__ARM_FEATURE_MVE)
+#define MLK_SYS_ARMV81M_MVE
+#endif
+
#if defined(__x86_64__)
#define MLK_SYS_X86_64
#if defined(__AVX2__)
@@ -48,6 +62,11 @@
#define MLK_SYS_RISCV64
#endif
+#if defined(MLK_SYS_RISCV64) && defined(__riscv_vector) && \
+ defined(__riscv_v_intrinsic)
+#define MLK_SYS_RISCV64_RVV
+#endif
+
#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 32
#define MLK_SYS_RISCV32
#endif
@@ -56,6 +75,14 @@
#define MLK_SYS_WINDOWS
#endif
+#if defined(__linux__)
+#define MLK_SYS_LINUX
+#endif
+
+#if defined(__APPLE__)
+#define MLK_SYS_APPLE
+#endif
+
#if defined(MLK_FORCE_AARCH64) && !defined(MLK_SYS_AARCH64)
#error "MLK_FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
#endif
@@ -82,34 +109,46 @@
#endif
/*
- * C90 does not have the inline compiler directive yet.
- * We don't use it in C90 builds.
- * However, in that case the compiler warns about some inline functions in
- * header files not being used in every compilation unit that includes that
- * header. To work around it we silence that warning in that case using
- * __attribute__((unused)).
+ * MLK_INLINE: Hint for inlining.
+ * - MSVC: __inline
+ * - C99+: inline
+ * - GCC/Clang C90: __attribute__((unused)) to silence warnings
+ * - Other C90: empty
*/
-
-/* Do not use inline for C90 builds*/
#if !defined(MLK_INLINE)
-#if !defined(inline)
#if defined(_MSC_VER)
#define MLK_INLINE __inline
-/* Don't combine __inline and __forceinline */
-#define MLK_ALWAYS_INLINE __forceinline
-#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#elif defined(inline) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
#define MLK_INLINE inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define MLK_INLINE __attribute__((unused))
+#else
+#define MLK_INLINE
+#endif
+#endif /* !MLK_INLINE */
+
+/*
+ * MLK_ALWAYS_INLINE: Force inlining.
+ * - MSVC: __forceinline
+ * - GCC/Clang C99+: MLK_INLINE __attribute__((always_inline))
+ * - Other: MLK_INLINE (no forced inlining)
+ */
+#if !defined(MLK_ALWAYS_INLINE)
+#if defined(_MSC_VER)
+#define MLK_ALWAYS_INLINE __forceinline
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+ (defined(inline) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L))
#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
#else
-#define MLK_INLINE __attribute__((unused))
#define MLK_ALWAYS_INLINE MLK_INLINE
#endif
+#endif /* !MLK_ALWAYS_INLINE */
-#else /* !inline */
-#define MLK_INLINE inline
-#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
-#endif /* inline */
-#endif /* !MLK_INLINE */
+#ifndef MLK_STATIC_TESTABLE
+#define MLK_STATIC_TESTABLE static
+#endif
/*
* C90 does not have the restrict compiler directive yet.
@@ -181,10 +220,41 @@
} while (0)
#endif /* !(MLK_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__) */
-#if defined(__GNUC__) || defined(clang)
+#if defined(__GNUC__) || defined(__clang__)
#define MLK_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
#else
#define MLK_MUST_CHECK_RETURN_VALUE
#endif
+#if !defined(__ASSEMBLER__)
+/* System capability enumeration */
+typedef enum
+{
+ /* x86_64 */
+ MLK_SYS_CAP_AVX2,
+ /* AArch64 */
+ MLK_SYS_CAP_SHA3
+} mlk_sys_cap;
+
+#if !defined(MLK_CONFIG_CUSTOM_CAPABILITY_FUNC)
+#include "cbmc.h"
+
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_sys_check_capability(mlk_sys_cap cap)
+__contract__(
+ ensures(return_value == 0 || return_value == 1)
+)
+{
+ /* By default, we rely on compile-time feature detection/specification:
+ * If a feature is enabled at compile-time, we assume it is supported by
+ * the host that the resulting library/binary will be built on.
+ * If this assumption is not true, you MUST overwrite this function.
+ * See the documentation of MLK_CONFIG_CUSTOM_CAPABILITY_FUNC in
+ * mlkem_native_config.h for more information. */
+ (void)cap;
+ return 1;
+}
+#endif /* !MLK_CONFIG_CUSTOM_CAPABILITY_FUNC */
+#endif /* !__ASSEMBLER__ */
+
#endif /* !MLK_SYS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/verify.h
index 85626c15ea..a9bdeaab30 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/verify.h
@@ -30,9 +30,7 @@
#ifndef MLK_VERIFY_H
#define MLK_VERIFY_H
-#include
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
@@ -115,92 +113,83 @@ __contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_u8(
static MLK_INLINE uint32_t mlk_value_barrier_u32(uint32_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
static MLK_INLINE int32_t mlk_value_barrier_i32(int32_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
static MLK_INLINE uint8_t mlk_value_barrier_u8(uint8_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
#endif /* MLK_USE_ASM_VALUE_BARRIER */
-/*
- * The ct_cmask_nonzero_xxx functions below make deliberate use of unsigned
- * overflow, which is fully defined behaviour in C. It is thus safe to disable
- * this warning.
- */
#ifdef CBMC
#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
+#pragma CPROVER check disable "conversion"
#endif
-
/*************************************************
- * Name: mlk_ct_cmask_nonzero_u16
+ * Name: mlk_cast_uint16_to_int16
*
- * Description: Return 0 if input is zero, and -1 otherwise.
+ * Description: Cast uint16 value to int16
*
- * Arguments: uint16_t x: Value to be converted into a mask
+ * Returns: For uint16_t x, the unique y in int16_t
+ * so that x == y mod 2^16.
+ *
+ * Concretely:
+ * - x < 32768: returns x
+ * - x >= 32768: returns x - 65536
*
**************************************************/
-
-/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
- * - Use value barrier and shift instead of `b = -b` to
- * convert condition into mask. */
-static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
-__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
{
- uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
- tmp >>= 16;
- return tmp;
+ /*
+ * PORTABILITY: This relies on uint16_t -> int16_t
+ * being implemented as the inverse of int16_t -> uint16_t,
+ * which is implementation-defined (C99 6.3.1.3 (3))
+ * CBMC (correctly) fails to prove this conversion is OK,
+ * so we have to suppress that check here
+ */
+ return (int16_t)x;
}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
/*************************************************
- * Name: mlk_ct_cmask_nonzero_u8
+ * Name: mlk_cast_int32_to_uint16
*
- * Description: Return 0 if input is zero, and -1 otherwise.
- *
- * Arguments: uint8_t x: Value to be converted into a mask
+ * Description: Cast int32 value to uint16 as per C standard.
*
+ * Returns: For int32_t x, the unique y in uint16_t
+ * so that x == y mod 2^16.
**************************************************/
-
-/* Reference: Embedded in `verify()` and `cmov()` in the
- * reference implementation @[REF].
- * - We include a value barrier not present in the
- * reference implementation, to prevent the compiler
- * from realizing that this function returns a mask. */
-static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
-__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int32_to_uint16(int32_t x)
{
- uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
- tmp >>= 24;
- return tmp;
+ return (uint16_t)(x & (int32_t)UINT16_MAX);
}
-/* Put unsigned overflow warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*
- * The mlk_ct_cmask_neg_i16 function below makes deliberate use of
- * signed to unsigned integer conversion, which is fully defined
- * behaviour in C. It is thus safe to disable this warning.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
+/*************************************************
+ * Name: mlk_cast_int16_to_uint16
+ *
+ * Description: Cast int16 value to uint16 as per C standard.
+ *
+ * Returns: For int16_t x, the unique y in uint16_t
+ * so that x == y mod 2^16.
+ **************************************************/
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int16_to_uint16(int16_t x)
+{
+ return mlk_cast_int32_to_uint16((int32_t)x);
+}
/*************************************************
* Name: mlk_ct_cmask_neg_i16
@@ -225,24 +214,49 @@ __contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
{
int32_t tmp = mlk_value_barrier_i32((int32_t)x);
tmp >>= 16;
- return (int16_t)tmp;
+ return mlk_cast_int32_to_uint16(tmp);
}
-/* Put unsigned-to-signed warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
+/*************************************************
+ * Name: mlk_ct_cmask_nonzero_u16
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments: uint16_t x: Value to be converted into a mask
+ *
+ **************************************************/
-/*
- * The ct_csel_xxx functions below make deliberate use of unsigned
- * to signed integer conversion, which is implementation-defined
- * behaviour. Here, we assume that uint16_t -> int16_t is inverse
- * to int16_t -> uint16_t.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
+/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
+ * - Use value barrier and shift instead of `b = -b` to
+ * convert condition into mask. */
+static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+{
+ int32_t tmp = mlk_value_barrier_i32(-((int32_t)x));
+ tmp >>= 16;
+ return mlk_cast_int32_to_uint16(tmp);
+}
+
+/*************************************************
+ * Name: mlk_ct_cmask_nonzero_u8
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments: uint8_t x: Value to be converted into a mask
+ *
+ **************************************************/
+
+/* Reference: Embedded in `verify()` and `cmov()` in the
+ * reference implementation @[REF].
+ * - We include a value barrier not present in the
+ * reference implementation, to prevent the compiler
+ * from realizing that this function returns a mask. */
+static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+{
+ uint16_t mask = mlk_ct_cmask_nonzero_u16((uint16_t)x);
+ return (uint8_t)(mask & 0xFF);
+}
/*************************************************
* Name: mlk_ct_sel_int16
@@ -280,16 +294,12 @@ __contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
static MLK_INLINE int16_t mlk_ct_sel_int16(int16_t a, int16_t b, uint16_t cond)
__contract__(ensures(return_value == (cond ? a : b)))
{
- uint16_t au = a, bu = b;
+ uint16_t au = mlk_cast_int16_to_uint16(a);
+ uint16_t bu = mlk_cast_int16_to_uint16(b);
uint16_t res = bu ^ (mlk_ct_cmask_nonzero_u16(cond) & (au ^ bu));
- return (int16_t)res;
+ return mlk_cast_uint16_to_int16(res);
}
-/* Put unsigned-to-signed warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_ct_sel_uint8
*
@@ -318,9 +328,11 @@ __contract__(ensures(return_value == (cond ? a : b)))
*
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
- * size_t len: length of the byte arrays
+ * size_t len: length of the byte arrays, upper-bounded
+ * to UINT16_MAX to control proof complexity
+ * only.
*
- * Returns 0 if the byte arrays are equal, a non-zero value otherwise
+ * Returns 0 if the byte arrays are equal, 0xFF otherwise.
*
* Specification:
* - Used to securely compute conditional move in
@@ -338,9 +350,10 @@ __contract__(ensures(return_value == (cond ? a : b)))
static MLK_INLINE uint8_t mlk_ct_memcmp(const uint8_t *a, const uint8_t *b,
const size_t len)
__contract__(
+ requires(len <= UINT16_MAX)
requires(memory_no_alias(a, len))
requires(memory_no_alias(b, len))
- requires(len <= INT_MAX)
+ ensures((return_value == 0) || (return_value == 0xFF))
ensures((return_value == 0) == forall(i, 0, len, (a[i] == b[i]))))
{
uint8_t r = 0, s = 0;
@@ -391,13 +404,17 @@ __contract__(
static MLK_INLINE void mlk_ct_cmov_zero(uint8_t *r, const uint8_t *x,
size_t len, uint8_t b)
__contract__(
+ requires(len <= MLK_MAX_BUFFER_SIZE)
requires(memory_no_alias(r, len))
requires(memory_no_alias(x, len))
- assigns(memory_slice(r, len)))
+ assigns(memory_slice(r, len))
+ ensures(forall(i, 0, len, (r[i] == (b == 0 ? x[i] : old(r)[i])))))
{
size_t i;
for (i = 0; i < len; i++)
- __loop__(invariant(i <= len))
+ __loop__(
+ invariant(i <= len)
+ invariant(forall(k, 0, i, r[k] == (b == 0 ? x[k] : loop_entry(r)[k]))))
{
r[i] = mlk_ct_sel_uint8(r[i], x[i], b);
}
@@ -431,13 +448,13 @@ __contract__(
requires(memory_no_alias(ptr, len))
assigns(memory_slice(ptr, len)))
{
- memset(ptr, 0, len);
+ mlk_memset(ptr, 0, len);
/* This follows OpenSSL and seems sufficient to prevent the compiler
* from optimizing away the memset.
*
* If there was a reliable way to detect availability of memset_s(),
* that would be preferred. */
- __asm__ __volatile__("" : : "r"(ptr) : "memory");
+ __asm__ volatile("" : : "r"(ptr) : "memory");
}
#else /* !MLK_SYS_WINDOWS && MLK_HAVE_INLINE_ASM */
#error No plausibly-secure implementation of mlk_zeroize available. Please provide your own using MLK_CONFIG_CUSTOM_ZEROIZE.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/zetas.inc b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/zetas.inc
index 0c00b5b905..00316daf67 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/zetas.inc
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/mlkem/src/zetas.inc
@@ -5,16 +5,16 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
-#include
/*
* Table of zeta values used in the reference NTT and inverse NTT.
* See autogen for details.
*/
-static MLK_ALIGN const int16_t zetas[128] = {
+static MLK_ALIGN const int16_t mlk_zetas[128] = {
-1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577,
182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458,
-1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/integration/liboqs/config_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/integration/liboqs/config_x86_64.h
index c818bcc980..b82f3dd434 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/integration/liboqs/config_x86_64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/integration/liboqs/config_x86_64.h
@@ -8,13 +8,23 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*/
#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_X86_64_H
#define MLK_INTEGRATION_LIBOQS_CONFIG_X86_64_H
+/* Enable valgrind-based assertions in mlkem-native through macro
+ * from libOQS. */
+#if !defined(__ASSEMBLER__)
+#include <oqs/oqsconfig.h>
+#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
+#define MLK_CONFIG_CT_TESTING_ENABLED
+#endif
+#endif /* !__ASSEMBLER__ */
+
/******************************************************************************
* Name: MLK_CONFIG_PARAMETER_SET
*
@@ -172,7 +182,7 @@
* consumer.
*
* If this option is not set, mlkem-native expects a function
- * void randombytes(uint8_t *out, size_t outlen).
+ * int randombytes(uint8_t *out, size_t outlen).
*
* Set this option and define `mlk_randombytes` if you want to
* use a custom method to sample randombytes with a different name
@@ -184,9 +194,10 @@
#include
#include
#include "../../mlkem/src/sys.h"
-static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
+static MLK_INLINE int mlk_randombytes(uint8_t *ptr, size_t len)
{
OQS_randombytes(ptr, len);
+ return 0;
}
#endif /* !__ASSEMBLER__ */
@@ -251,13 +262,4 @@ static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
#endif
*/
-/* Enable valgrind-based assertions in mlkem-native through macro
- * from libOQS. */
-#if !defined(__ASSEMBLER__)
-#include <oqs/oqsconfig.h>
-#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
-#define MLK_CONFIG_CT_TESTING_ENABLED
-#endif
-#endif /* !__ASSEMBLER__ */
-
#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_X86_64_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/cbmc.h
index 650d32b95b..80e1a36fc7 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/cbmc.h
@@ -8,7 +8,6 @@
/***************************************************
* Basic replacements for __CPROVER_XXX contracts
***************************************************/
-
#ifndef CBMC
#define __contract__(x)
@@ -16,6 +15,7 @@
#else /* !CBMC */
+
#define __contract__(x) x
#define __loop__(x) x
@@ -49,7 +49,6 @@
*/
#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
-#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
/*
* Pointer-related predicates
@@ -59,6 +58,17 @@
#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+/* Maximum supported buffer size
+ *
+ * Larger buffers may be supported, but due to internal modeling constraints
+ * in CBMC, the proofs of memory- and type-safety won't be able to run.
+ *
+ * If you find yourself in need for a buffer size larger than this,
+ * please contact the maintainers, so we can prioritize work to relax
+ * this somewhat artificial bound.
+ */
+#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12)
+
/*
* History variables
* https://diffblue.github.io/cbmc/contracts-history-variables.html
@@ -83,7 +93,7 @@
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate) \
}
-#define EXISTS(qvar, qvar_lb, qvar_ub, predicate) \
+#define exists(qvar, qvar_lb, qvar_ub, predicate) \
__CPROVER_exists \
{ \
unsigned qvar; \
@@ -118,13 +128,35 @@
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
- (((int)(value_lb) <= ((array_var)[(qvar)])) && \
- (((array_var)[(qvar)]) < (int)(value_ub))) \
+ (((int)(value_lb) <= ((array_var)[(qvar)])) && \
+ (((array_var)[(qvar)]) < (int)(value_ub))) \
}
-#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
- array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb), \
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
+ array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb), \
(qvar_ub), (array_var), (value_lb), (value_ub))
+
+#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (int16_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged(array_var, N) \
+ array_unchanged_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
+
+#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (uint64_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged_u64(array_var, N) \
+ array_unchanged_u64_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
/* clang-format on */
/* Wrapper around array_bound operating on absolute values.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/common.h
index 9de9875556..bc4e9ed72c 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/common.h
@@ -5,10 +5,16 @@
#ifndef MLK_COMMON_H
#define MLK_COMMON_H
+#ifndef __ASSEMBLER__
+#include
+#endif
+
+#define MLK_BUILD_INTERNAL
+
#if defined(MLK_CONFIG_FILE)
#include MLK_CONFIG_FILE
#else
-#include "config.h"
+#include "mlkem_native_config.h"
#endif
#include "params.h"
@@ -28,15 +34,11 @@
#define MLK_EXTERNAL_API MLK_CONFIG_EXTERNAL_API_QUALIFIER
#endif
-#if defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) || \
- defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED)
-#define MLK_MULTILEVEL_BUILD
-#endif
-
#define MLK_CONCAT_(x1, x2) x1##x2
#define MLK_CONCAT(x1, x2) MLK_CONCAT_(x1, x2)
-#if defined(MLK_MULTILEVEL_BUILD)
+#if (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || \
+ defined(MLK_CONFIG_MULTILEVEL_NO_SHARED))
#define MLK_ADD_PARAM_SET(s) MLK_CONCAT(s, MLK_CONFIG_PARAMETER_SET)
#else
#define MLK_ADD_PARAM_SET(s) s
@@ -49,7 +51,7 @@
/* Functions are prefixed by MLK_CONFIG_NAMESPACE_PREFIX.
*
* If multiple parameter sets are used, functions depending on the parameter
- * set are additionally prefixed with 512/768/1024. See config.h.
+ * set are additionally prefixed with 512/768/1024. See mlkem_native_config.h.
*
* Example: If MLK_CONFIG_NAMESPACE_PREFIX is mlkem, then
* MLK_NAMESPACE_K(enc) becomes mlkem512_enc/mlkem768_enc/mlkem1024_enc.
@@ -73,8 +75,24 @@
*/
#if defined(MLK_SYS_X86_64)
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) : MLK_CET_ENDBR
-#else
+#elif defined(MLK_SYS_ARMV81M_MVE)
+/* clang-format off */
+#define MLK_ASM_FN_SYMBOL(sym) \
+ .type MLK_ASM_NAMESPACE(sym), %function; \
+ MLK_ASM_NAMESPACE(sym) :
+/* clang-format on */
+#else /* !MLK_SYS_X86_64 && MLK_SYS_ARMV81M_MVE */
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) :
+#endif /* !MLK_SYS_X86_64 && !MLK_SYS_ARMV81M_MVE */
+
+/*
+ * Output the size of an assembly function.
+ */
+#if defined(__ELF__)
+#define MLK_ASM_FN_SIZE(sym) \
+ .size MLK_ASM_NAMESPACE(sym), .- MLK_ASM_NAMESPACE(sym)
+#else
+#define MLK_ASM_FN_SIZE(sym)
#endif
/* We aim to simplify the user's life by supporting builds where
@@ -99,6 +117,10 @@
#error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLK_CONFIG_FIPS202_BACKEND_FILE is not.
#endif
+#if defined(MLK_CONFIG_NO_RANDOMIZED_API) && defined(MLK_CONFIG_KEYGEN_PCT)
+#error Bad configuration: MLK_CONFIG_NO_RANDOMIZED_API is incompatible with MLK_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_kem_enc()
+#endif
+
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
#include MLK_CONFIG_ARITH_BACKEND_FILE
/* Include to enforce consistency of API and implementation,
@@ -135,20 +157,118 @@
#define MLK_FIPS202X4_HEADER_FILE MLK_CONFIG_FIPS202X4_CUSTOM_HEADER
#endif
-/* Just in case we want to include mlkem_native.h, set the configuration
- * for that header in accordance with the configuration used here. */
+/* Standard library function replacements */
+#if !defined(__ASSEMBLER__)
+#if !defined(MLK_CONFIG_CUSTOM_MEMCPY)
+#include <string.h>
+#define mlk_memcpy memcpy
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_MEMSET)
+#include
+#define mlk_memset memset
+#endif
+
+
+/* Allocation macros for large local structures
+ *
+ * MLK_ALLOC(v, T, N) declares T *v and attempts to point it to an T[N]
+ * MLK_FREE(v, T, N) zeroizes and frees the allocation
+ *
+ * Default implementation uses stack allocation.
+ * Can be overridden by setting the config option MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * and defining MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE.
+ */
+#if defined(MLK_CONFIG_CUSTOM_ALLOC_FREE) != \
+ (defined(MLK_CUSTOM_ALLOC) && defined(MLK_CUSTOM_FREE))
+#error Bad configuration: MLK_CONFIG_CUSTOM_ALLOC_FREE must be set together with MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE
+#endif
+
+/*
+ * If the integration wants to provide a context parameter for use in
+ * platform-specific hooks, then it should define this parameter.
+ *
+ * The MLK_CONTEXT_PARAMETERS_n macros are intended to be used with macros
+ * defining the function names and expand to either pass or discard the context
+ * argument as required by the current build. If there is no context parameter
+ * requested then these are removed from the prototypes and from all calls.
+ */
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+#define MLK_CONTEXT_PARAMETERS_0(context) (context)
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0, context)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1, context)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) \
+ (arg0, arg1, arg2, context)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3, context)
+#else /* MLK_CONFIG_CONTEXT_PARAMETER */
+#define MLK_CONTEXT_PARAMETERS_0(context) ()
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) (arg0, arg1, arg2)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3)
+#endif /* !MLK_CONFIG_CONTEXT_PARAMETER */
+
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER_TYPE) != \
+ defined(MLK_CONFIG_CONTEXT_PARAMETER)
+#error MLK_CONFIG_CONTEXT_PARAMETER_TYPE must be defined if and only if MLK_CONFIG_CONTEXT_PARAMETER is defined
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_ALLOC_FREE)
+/* Default: stack allocation */
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_ALIGN T mlk_alloc_##v[N]; \
+ T *v = mlk_alloc_##v
+
+/* TODO: This leads to a circular dependency between common and verify.h
+ * It just works out before we're at the end of the file, but it's still
+ * prone to issues in the future. */
+#include "verify.h"
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ mlk_zeroize(mlk_alloc_##v, sizeof(mlk_alloc_##v)); \
+ (v) = NULL; \
+ } while (0)
+
+#else /* !MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/* Custom allocation */
+
+/*
+ * The indirection here is necessary to use MLK_CONTEXT_PARAMETERS_3 here.
+ */
+#define MLK_APPLY(f, args) f args
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_APPLY(MLK_CUSTOM_ALLOC, MLK_CONTEXT_PARAMETERS_3(v, T, N, context))
+
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ if (v != NULL) \
+ { \
+ mlk_zeroize(v, sizeof(T) * (N)); \
+ MLK_APPLY(MLK_CUSTOM_FREE, MLK_CONTEXT_PARAMETERS_3(v, T, N, context)); \
+ v = NULL; \
+ } \
+ } while (0)
+
+#endif /* MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/****************************** Error codes ***********************************/
-/* Double-check that this is not conflicting with pre-existing definitions. */
-#if defined(MLK_CONFIG_API_PARAMETER_SET) || \
- defined(MLK_CONFIG_API_NAMESPACE_PREFIX) || \
- defined(MLK_CONFIG_API_NO_SUPERCOP) || \
- defined(MLK_CONFIG_API_CONSTANTS_ONLY)
-#error Pre-existing MLK_CONFIG_API_XXX configuration is neither useful nor allowed during an mlkem-native build
-#endif /* MLK_CONFIG_API_PARAMETER_SET || MLK_CONFIG_API_NAMESPACE_PREFIX || \
- MLK_CONFIG_API_NO_SUPERCOP || MLK_CONFIG_API_CONSTANTS_ONLY */
+/* Generic failure condition */
+#define MLK_ERR_FAIL -1
+/* An allocation failed. This can only happen if MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * is defined and the provided MLK_CUSTOM_ALLOC can fail. */
+#define MLK_ERR_OUT_OF_MEMORY -2
+/* An rng failure occured. Might be due to insufficient entropy or
+ * system misconfiguration. */
+#define MLK_ERR_RNG_FAIL -3
-#define MLK_CONFIG_API_PARAMETER_SET MLK_CONFIG_PARAMETER_SET
-#define MLK_CONFIG_API_NAMESPACE_PREFIX \
- MLK_ADD_PARAM_SET(MLK_CONFIG_NAMESPACE_PREFIX)
+#endif /* !__ASSEMBLER__ */
#endif /* !MLK_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/compress.c
index d7ff2bbe7a..50da36d0e4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/compress.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/compress.c
@@ -20,24 +20,27 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include <stdint.h>
-#include <string.h>
+
#include "cbmc.h"
#include "compress.h"
#include "debug.h"
#include "verify.h"
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d4_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -55,32 +58,51 @@ void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
t[j] = mlk_scalar_compress_d4(a->coeffs[8 * i + j]);
}
- r[i * 4] = t[0] | (t[1] << 4);
- r[i * 4 + 1] = t[2] | (t[3] << 4);
- r[i * 4 + 2] = t[4] | (t[5] << 4);
- r[i * 4 + 3] = t[6] | (t[7] << 4);
+ /* All t[i] are 4-bit wide, so the truncations don't alter the value. */
+ r[i * 4] = (uint8_t)(t[0] | (t[1] << 4));
+ r[i * 4 + 1] = (uint8_t)(t[2] | (t[3] << 4));
+ r[i * 4 + 2] = (uint8_t)(t[4] | (t[5] << 4));
+ r[i * 4 + 3] = (uint8_t)(t[6] | (t[7] << 4));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d4_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d4_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ mlk_poly_compress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d10_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -101,29 +123,47 @@ void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 10-bit in size.
*/
- r[5 * j + 0] = (t[0] >> 0) & 0xFF;
- r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
- r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
- r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
- r[5 * j + 4] = (t[3] >> 2);
+ r[5 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 2) & 0xFF));
+ r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] << 4) & 0xFF));
+ r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] << 6) & 0xFF));
+ r[5 * j + 4] = (uint8_t)(t[3] >> 2);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d10_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d10_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ mlk_poly_compress_d10_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d4(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d4_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -137,22 +177,40 @@ void mlk_poly_decompress_d4(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d4(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d4_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ int ret;
+ ret = mlk_poly_decompress_d4_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ mlk_poly_decompress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d10(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d10_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 4; j++)
@@ -180,28 +238,46 @@ void mlk_poly_decompress_d10(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d10(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d10_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ int ret;
+ ret = mlk_poly_decompress_d10_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
+ mlk_poly_decompress_d10_c(r, a);
+}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d5_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -219,38 +295,51 @@ void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
t[j] = mlk_scalar_compress_d5(a->coeffs[8 * i + j]);
}
- /*
- * Explicitly truncate to avoid warning about
- * implicit truncation in CBMC, and use array indexing into
- * r rather than pointer-arithmetic to simplify verification
- */
- r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
- r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
- r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
- r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+ r[i * 5] = (uint8_t)(0xFF & ((t[0] >> 0) | (t[1] << 5)));
+ r[i * 5 + 1] = (uint8_t)(0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)));
+ r[i * 5 + 2] = (uint8_t)(0xFF & ((t[3] >> 1) | (t[4] << 4)));
+ r[i * 5 + 3] = (uint8_t)(0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)));
+ r[i * 5 + 4] = (uint8_t)(0xFF & ((t[6] >> 2) | (t[7] << 3)));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d5_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d5_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ mlk_poly_compress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d11_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -272,35 +361,53 @@ void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 11-bit in size.
*/
- r[11 * j + 0] = (t[0] >> 0) & 0xFF;
- r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
- r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
- r[11 * j + 3] = (t[2] >> 2) & 0xFF;
- r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
- r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
- r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
- r[11 * j + 7] = (t[5] >> 1) & 0xFF;
- r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
- r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
- r[11 * j + 10] = (t[7] >> 3);
+ r[11 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 3) & 0xFF));
+ r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] << 6) & 0xFF));
+ r[11 * j + 3] = (uint8_t)((t[2] >> 2) & 0xFF);
+ r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] << 1) & 0xFF));
+ r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] << 4) & 0xFF));
+ r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] << 7) & 0xFF));
+ r[11 * j + 7] = (uint8_t)((t[5] >> 1) & 0xFF);
+ r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] << 2) & 0xFF));
+ r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] << 5) & 0xFF));
+ r[11 * j + 10] = (uint8_t)(t[7] >> 3);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d11_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d11_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ mlk_poly_compress_d11_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d5(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d5_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 8; i++)
@@ -342,22 +449,40 @@ void mlk_poly_decompress_d5(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d5(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d5_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ int ret;
+ ret = mlk_poly_decompress_d5_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ mlk_poly_decompress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d11(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d11_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 8; j++)
@@ -390,26 +515,45 @@ void mlk_poly_decompress_d11(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d11(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d11_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ int ret;
+ ret = mlk_poly_decompress_d11_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+ mlk_poly_decompress_d11_c(r, a);
+}
+
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-#if !defined(MLK_USE_NATIVE_POLY_TOBYTES)
/* Reference: `poly_tobytes()` in the reference implementation @[REF].
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_tobytes_c(uint8_t r[MLKEM_POLYBYTES],
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYBYTES))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -417,8 +561,10 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
- const uint16_t t0 = a->coeffs[2 * i];
- const uint16_t t1 = a->coeffs[2 * i + 1];
+ /* The conversion to uint16_t is safe since we assume that
+ * the coefficients of `a` are non-negative. */
+ const uint16_t t0 = (uint16_t)a->coeffs[2 * i];
+ const uint16_t t1 = (uint16_t)a->coeffs[2 * i + 1];
/*
* t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
* significant data, so these can be packed into 24 bits or exactly
@@ -426,32 +572,48 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
*/
/* Least significant bits 0 - 7 of t0. */
- r[3 * i + 0] = t0 & 0xFF;
+ r[3 * i + 0] = (uint8_t)(t0 & 0xFF);
/*
* Most significant bits 8 - 11 of t0 become the least significant
* nibble of the second byte. The least significant 4 bits
* of t1 become the upper nibble of the second byte.
+ *
+ * The conversion to uint8_t does not alter the value.
*/
- r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+ r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 << 4) & 0xF0));
- /* Bits 4 - 11 of t1 become the third byte. */
- r[3 * i + 2] = t1 >> 4;
+ /* Bits 4 - 11 of t1 become the third byte. The conversion to uint8_t
+ * does not alter the value because t1 is 12-bit wide. */
+ r[3 * i + 2] = (uint8_t)(t1 >> 4);
}
}
-#else /* !MLK_USE_NATIVE_POLY_TOBYTES */
+
MLK_INTERNAL_API
void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
{
+#if defined(MLK_USE_NATIVE_POLY_TOBYTES)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_tobytes_native(r, a->coeffs);
-}
+ ret = mlk_poly_tobytes_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
-#if !defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ mlk_poly_tobytes_c(r, a);
+}
+
/* Reference: `poly_frombytes()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
+MLK_STATIC_TESTABLE void mlk_poly_frombytes_c(mlk_poly *r,
+ const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+ requires(memory_no_alias(a, MLKEM_POLYBYTES))
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -462,21 +624,29 @@ void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
const uint8_t t0 = a[3 * i + 0];
const uint8_t t1 = a[3 * i + 1];
const uint8_t t2 = a[3 * i + 2];
- r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
- r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+ r->coeffs[2 * i + 0] = (int16_t)(t0 | ((t1 << 8) & 0xFFF));
+ r->coeffs[2 * i + 1] = (int16_t)((t1 >> 4) | (t2 << 4));
}
/* Note that the coefficients are not canonical */
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
-#else /* !MLK_USE_NATIVE_POLY_FROMBYTES */
+
MLK_INTERNAL_API
void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
{
- mlk_poly_frombytes_native(r->coeffs, a);
-}
+#if defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ int ret;
+ ret = mlk_poly_frombytes_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
+ mlk_poly_frombytes_c(r, a);
+}
+
/* Reference: `poly_frommsg()` in the reference implementation @[REF].
* - We use a value barrier around the bit-selection mask to
* reduce the risk of compiler-introduced branches.
@@ -506,7 +676,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
* as per @[FIPS203, Eq (4.8)]. */
/* Prevent the compiler from recognizing this as a bit selection */
- uint8_t mask = mlk_value_barrier_u8(1u << j);
+ uint8_t mask = mlk_value_barrier_u8((uint8_t)(1u << j));
r->coeffs[8 * i + j] = mlk_ct_sel_int16(MLKEM_Q_HALF, 0, msg[i] & mask);
}
}
@@ -535,7 +705,7 @@ void mlk_poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const mlk_poly *a)
invariant(i <= MLKEM_N / 8 && j <= 8))
{
uint32_t t = mlk_scalar_compress_d1(a->coeffs[8 * i + j]);
- msg[i] |= t << j;
+ msg[i] |= (uint8_t)(t << j);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/compress.h
index f0789d42d6..b16b0889b5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/compress.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/compress.h
@@ -20,8 +20,7 @@
#ifndef MLK_COMPRESS_H
#define MLK_COMPRESS_H
-#include <stdint.h>
-#include <stdlib.h>
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -50,9 +49,9 @@
#endif
/* Reference: Part of poly_tomsg() in the reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d1(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d1(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 2)
ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) )
{
@@ -65,7 +64,8 @@ __contract__(
*/
/* check-magic: 1290168 == 2*round(2^31 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290168;
- return (d0 + (1u << 30)) >> 31;
+ /* Unsigned shifting by 31 positions leaves only the top bit. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 30)) >> 31);
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -93,9 +93,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d4(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d4(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 16)
ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
{
@@ -108,7 +108,8 @@ __contract__(
*/
/* check-magic: 1290160 == 16 * round(2^28 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290160;
- return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */
+ /* The return value is < 16, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 27)) >> 28); /* round(d0/2^28) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -128,11 +129,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d4(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d4(uint8_t u)
__contract__(
requires(0 <= u && u < 16)
ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) >> 4; }
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 8) >> 4);
+}
/************************************************************
* Name: mlk_scalar_compress_d5
@@ -156,9 +162,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d5(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d5(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 32)
ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) )
{
@@ -171,7 +177,8 @@ __contract__(
*/
/* check-magic: 1290176 == 2^5 * round(2^27 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290176;
- return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */
+ /* The return value is < 32, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 26)) >> 27); /* round(d0/2^27) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -191,11 +198,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d5(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d5(uint8_t u)
__contract__(
requires(0 <= u && u < 32)
- ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) >> 5; }
+ ensures(0 <= return_value && return_value <= MLKEM_Q - 1)
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 16) >> 5);
+}
/************************************************************
* Name: mlk_scalar_compress_d10
@@ -219,9 +231,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d10(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d10(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 10))
ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
{
@@ -255,11 +267,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d10(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d10(uint16_t u)
__contract__(
requires(0 <= u && u < 1024)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) >> 10; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 512) >> 10);
+}
/************************************************************
* Name: mlk_scalar_compress_d11
@@ -283,9 +300,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d11(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d11(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 11))
ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
{
@@ -319,11 +336,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d11(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d11(uint16_t u)
__contract__(
requires(0 <= u && u < 2048)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) >> 11; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 1024) >> 11);
+}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
#define mlk_poly_compress_d4 MLK_NAMESPACE(poly_compress_d4)
@@ -575,7 +597,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
);
@@ -631,7 +653,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
__contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
);
@@ -660,7 +682,7 @@ __contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(msg))
+ assigns(memory_slice(msg, MLKEM_INDCPA_MSGBYTES))
);
#endif /* !MLK_COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/debug.h
index 01f7c88ccf..47c864bd36 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/debug.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/debug.h
@@ -7,7 +7,6 @@
#include "common.h"
#if defined(MLKEM_DEBUG)
-#include
/*************************************************
* Name: mlk_assert
@@ -89,14 +88,14 @@ void mlk_debug_check_bounds(const char *file, int line, const int16_t *ptr,
/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
* just use a single flattened array_bound(...) here. */
-#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
- cassert(forall(kN, 0, (M), \
- array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
+ cassert(forall(kN, 0, (M), \
+ array_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_lb), (value_ub))))
-#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
- cassert(forall(kN, 0, (M), \
- array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
+ cassert(forall(kN, 0, (M), \
+ array_abs_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_abs_bd))))
#else /* !MLKEM_DEBUG && CBMC */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/indcpa.c
index 85d4f595a9..e03b16c38b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/indcpa.c
@@ -17,15 +17,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
-#include
+#include "indcpa.h"
-#include "cbmc.h"
#include "debug.h"
-#include "indcpa.h"
-#include "poly.h"
-#include "poly_k.h"
#include "randombytes.h"
#include "sampling.h"
#include "symmetric.h"
@@ -41,6 +35,10 @@
#define mlk_pack_ciphertext MLK_ADD_PARAM_SET(mlk_pack_ciphertext)
#define mlk_unpack_ciphertext MLK_ADD_PARAM_SET(mlk_unpack_ciphertext)
#define mlk_matvec_mul MLK_ADD_PARAM_SET(mlk_matvec_mul)
+#define mlk_polyvec_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polyvec_permute_bitrev_to_custom)
+#define mlk_polymat_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polymat_permute_bitrev_to_custom)
/* End of parameter set namespacing */
/*************************************************
@@ -59,12 +57,13 @@
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L19]
*
**************************************************/
-static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
+static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const mlk_polyvec *pk,
const uint8_t seed[MLKEM_SYMBYTES])
{
- mlk_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(pk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, pk);
- memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
}
/*************************************************
@@ -83,11 +82,11 @@ static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L2-3]
*
**************************************************/
-static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
+static void mlk_unpack_pk(mlk_polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
{
mlk_polyvec_frombytes(pk, packedpk);
- memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
/* NOTE: If a modulus check was conducted on the PK, we know at this
* point that the coefficients of `pk` are unsigned canonical. The
@@ -108,9 +107,10 @@ static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L20]
*
**************************************************/
-static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
+static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES],
+ const mlk_polyvec *sk)
{
- mlk_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(sk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, sk);
}
@@ -128,7 +128,7 @@ static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L5]
*
**************************************************/
-static void mlk_unpack_sk(mlk_polyvec sk,
+static void mlk_unpack_sk(mlk_polyvec *sk,
const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
{
mlk_polyvec_frombytes(sk, packedsk);
@@ -149,8 +149,8 @@ static void mlk_unpack_sk(mlk_polyvec sk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22-23]
*
**************************************************/
-static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
- mlk_poly *v)
+static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES],
+ const mlk_polyvec *b, mlk_poly *v)
{
mlk_polyvec_compress_du(r, b);
mlk_poly_compress_dv(r + MLKEM_POLYVECCOMPRESSEDBYTES_DU, v);
@@ -170,28 +170,69 @@ static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L1-4]
*
**************************************************/
-static void mlk_unpack_ciphertext(mlk_polyvec b, mlk_poly *v,
+static void mlk_unpack_ciphertext(mlk_polyvec *b, mlk_poly *v,
const uint8_t c[MLKEM_INDCPA_BYTES])
{
mlk_polyvec_decompress_du(b, c);
mlk_poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
}
-#if !defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
-/* This namespacing is not done at the top to avoid a naming conflict
- * with native backends, which are currently not yet namespaced. */
-#define mlk_poly_permute_bitrev_to_custom \
- MLK_ADD_PARAM_SET(mlk_poly_permute_bitrev_to_custom)
-
-static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
+/* Helper function to ensure that the polynomial entries in the output
+ * of gen_matrix use the standard (bitreversed) ordering of coefficients.
+ * No-op unless a native backend with a custom ordering is used.
+ *
+ * We don't inline this into gen_matrix to avoid having to split the CBMC
+ * proof for gen_matrix based on MLK_USE_NATIVE_NTT_CUSTOM_ORDER. */
+static void mlk_polyvec_permute_bitrev_to_custom(mlk_polyvec *v)
__contract__(
/* We don't specify that this should be a permutation, but only
* that it does not change the bound established at the end of mlk_gen_matrix. */
- requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
- requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(memory_slice(data, sizeof(mlk_poly)))
- ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+ requires(memory_no_alias(v, sizeof(mlk_polyvec)))
+ requires(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(v, sizeof(mlk_polyvec)))
+ ensures(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+{
+#if defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(v, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ {
+ mlk_poly_permute_bitrev_to_custom(v->vec[i].coeffs);
+ }
+#else /* MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+ /* Nothing to do */
+ (void)v;
#endif /* !MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+static void mlk_polymat_permute_bitrev_to_custom(mlk_polymat *a)
+__contract__(
+ /* We don't specify that this should be a permutation, but only
+ * that it does not change the bound established at the end of mlk_gen_matrix. */
+ requires(memory_no_alias(a, sizeof(mlk_polymat)))
+ requires(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+{
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(a, sizeof(mlk_polymat)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+ {
+ mlk_polyvec_permute_bitrev_to_custom(&a->vec[i]);
+ }
+}
/* Reference: `gen_matrix()` in the reference implementation @[REF].
* - We use a special subroutine to generate 4 polynomials
@@ -201,32 +242,27 @@ __contract__(
*
* Not static for benchmarking */
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
{
unsigned i, j;
- /*
- * We generate four separate seed arrays rather than a single one to work
- * around limitations in CBMC function contracts dealing with disjoint slices
- * of the same parent object.
- */
-
MLK_ALIGN uint8_t seed_ext[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)];
for (j = 0; j < 4; j++)
{
- memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
}
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Sample 4 matrix entries a time. */
for (i = 0; i < (MLKEM_K * MLKEM_K / 4) * 4; i += 4)
{
- uint8_t x, y;
-
for (j = 0; j < 4; j++)
{
- x = (i + j) / MLKEM_K;
- y = (i + j) % MLKEM_K;
+ uint8_t x, y;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)((i + j) / MLKEM_K);
+ y = (uint8_t)((i + j) % MLKEM_K);
if (transposed)
{
seed_ext[j][MLKEM_SYMBYTES + 0] = x;
@@ -239,19 +275,26 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
}
}
- /*
- * This call writes across mlk_polyvec boundaries for K=2 and K=3.
- * This is intentional and safe.
- */
- mlk_poly_rej_uniform_x4(&a[i], seed_ext);
+ mlk_poly_rej_uniform_x4(&a->vec[i / MLKEM_K].vec[i % MLKEM_K],
+ &a->vec[(i + 1) / MLKEM_K].vec[(i + 1) % MLKEM_K],
+ &a->vec[(i + 2) / MLKEM_K].vec[(i + 2) % MLKEM_K],
+ &a->vec[(i + 3) / MLKEM_K].vec[(i + 3) % MLKEM_K],
+ seed_ext);
}
-
- /* For MLKEM_K == 3, sample the last entry individually. */
- if (i < MLKEM_K * MLKEM_K)
+#else /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
+ /* When using serial FIPS202, sample all entries individually. */
+ i = 0;
+#endif /* MLK_CONFIG_SERIAL_FIPS202_ONLY */
+
+ /* For MLKEM_K == 3, sample the last entry individually.
+ * When MLK_CONFIG_SERIAL_FIPS202_ONLY is set, sample all entries
+ * individually. */
+ for (; i < MLKEM_K * MLKEM_K; i++)
{
uint8_t x, y;
- x = i / MLKEM_K;
- y = i % MLKEM_K;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)(i / MLKEM_K);
+ y = (uint8_t)(i % MLKEM_K);
if (transposed)
{
@@ -264,8 +307,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
seed_ext[0][MLKEM_SYMBYTES + 1] = x;
}
- mlk_poly_rej_uniform(&a[i], seed_ext[0]);
- i++;
+ mlk_poly_rej_uniform(&a->vec[i / MLKEM_K].vec[i % MLKEM_K], seed_ext[0]);
}
mlk_assert(i == MLKEM_K * MLKEM_K);
@@ -274,10 +316,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* The public matrix is generated in NTT domain. If the native backend
* uses a custom order in NTT domain, permute A accordingly.
*/
- for (i = 0; i < MLKEM_K * MLKEM_K; i++)
- {
- mlk_poly_permute_bitrev_to_custom(a[i].coeffs);
- }
+ mlk_polymat_permute_bitrev_to_custom(a);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -301,24 +340,25 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* Specification: Implements @[FIPS203, Section 2.4.7, Eq (2.12), (2.13)]
*
**************************************************/
-static void mlk_matvec_mul(mlk_polyvec out, const mlk_polymat a,
- const mlk_polyvec v, const mlk_polyvec_mulcache vc)
+static void mlk_matvec_mul(mlk_polyvec *out, const mlk_polymat *a,
+ const mlk_polyvec *v, const mlk_polyvec_mulcache *vc)
__contract__(
requires(memory_no_alias(out, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(v, sizeof(mlk_polyvec)))
requires(memory_no_alias(vc, sizeof(mlk_polyvec_mulcache)))
- requires(forall(k0, 0, MLKEM_K * MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(out)))
+ requires(forall(k0, 0, MLKEM_K,
+ forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k0].vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))))
+ assigns(memory_slice(out, sizeof(mlk_polyvec))))
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
__loop__(
- assigns(i, object_whole(out))
+ assigns(i, memory_slice(out, sizeof(mlk_polyvec)))
invariant(i <= MLKEM_K))
{
- mlk_polyvec_basemul_acc_montgomery_cached(&out[i], &a[MLKEM_K * i], v, vc);
+ mlk_polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a->vec[i], v, vc);
}
}
@@ -331,20 +371,34 @@ __contract__(
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- const uint8_t *publicseed = buf;
- const uint8_t *noiseseed = buf + MLKEM_SYMBYTES;
- mlk_polymat a;
- mlk_polyvec e, pkpv, skpv;
- mlk_polyvec_mulcache skpv_cache;
-
- MLK_ALIGN uint8_t coins_with_domain_separator[MLKEM_SYMBYTES + 1];
+ int ret = 0;
+ const uint8_t *publicseed;
+ const uint8_t *noiseseed;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_ALLOC(a, mlk_polymat, 1, context);
+ MLK_ALLOC(e, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (buf == NULL || coins_with_domain_separator == NULL || a == NULL ||
+ e == NULL || pkpv == NULL || skpv == NULL || skpv_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ publicseed = buf;
+ noiseseed = buf + MLKEM_SYMBYTES;
+
/* Concatenate coins with MLKEM_K for domain separation of security levels */
- memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
mlk_hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
@@ -360,24 +414,24 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_gen_matrix(a, publicseed, 0 /* no transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &e[0], &e[1], noiseseed, 0, 1,
- 2, 3);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &e->vec[0],
+ &e->vec[1], noiseseed, 0, 1, 2, 3);
#elif MLKEM_K == 3
/*
* Only the first three output buffers are needed.
* The laster parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2],
- &pkpv[0] /* irrelevant */, noiseseed, 0, 1, 2,
- 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2], NULL,
+ noiseseed, 0, 1, 2, 0xFF /* irrelevant */);
/* Same here */
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &pkpv[0] /* irrelevant */,
- noiseseed, 3, 4, 5, 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], NULL, noiseseed,
+ 3, 4, 5, 0xFF /* irrelevant */);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2], &skpv[3], noiseseed,
- 0, 1, 2, 3);
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &e[3], noiseseed, 4, 5, 6, 7);
-#endif
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2],
+ &skpv->vec[3], noiseseed, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], &e->vec[3],
+ noiseseed, 4, 5, 6, 7);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(skpv);
mlk_polyvec_ntt(e);
@@ -393,14 +447,17 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_pack_sk(sk, skpv);
mlk_pack_pk(pk, pkpv, publicseed);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(coins_with_domain_separator, sizeof(coins_with_domain_separator));
- mlk_zeroize(a, sizeof(a));
- mlk_zeroize(&e, sizeof(e));
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&skpv_cache, sizeof(skpv_cache));
+ MLK_FREE(skpv_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(e, mlk_polyvec, 1, context);
+ MLK_FREE(a, mlk_polymat, 1, context);
+ MLK_FREE(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_enc()` in the reference implementation @[REF].
@@ -412,19 +469,33 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t seed[MLKEM_SYMBYTES];
- mlk_polymat at;
- mlk_polyvec sp, pkpv, ep, b;
- mlk_poly v, k, epp;
- mlk_polyvec_mulcache sp_cache;
+ int ret = 0;
+ MLK_ALLOC(seed, uint8_t, MLKEM_SYMBYTES, context);
+ MLK_ALLOC(at, mlk_polymat, 1, context);
+ MLK_ALLOC(sp, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(ep, mlk_polyvec, 1, context);
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(k, mlk_poly, 1, context);
+ MLK_ALLOC(epp, mlk_poly, 1, context);
+ MLK_ALLOC(sp_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (seed == NULL || at == NULL || sp == NULL || pkpv == NULL || ep == NULL ||
+ b == NULL || v == NULL || k == NULL || epp == NULL || sp_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_unpack_pk(pkpv, seed, pk);
- mlk_poly_frommsg(&k, m);
+ mlk_poly_frommsg(k, m);
/*
* Declassify the public seed.
@@ -437,87 +508,105 @@ void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
mlk_gen_matrix(at, seed, 1 /* transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1122_4x(&sp[0], &sp[1], &ep[0], &ep[1], coins, 0, 1, 2,
- 3);
- mlk_poly_getnoise_eta2(&epp, coins, 4);
+ mlk_poly_getnoise_eta1122_4x(&sp->vec[0], &sp->vec[1], &ep->vec[0],
+ &ep->vec[1], coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2(epp, coins, 4);
#elif MLKEM_K == 3
/*
* In this call, only the first three output buffers are needed.
* The last parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &b[0], coins, 0, 1, 2,
- 0xFF);
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], NULL, coins,
+ 0, 1, 2, 0xFF /* irrelevant */);
/* The fourth output buffer in this call _is_ used. */
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &epp, coins, 3, 4, 5, 6);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], epp, coins,
+ 3, 4, 5, 6);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &sp[3], coins, 0, 1, 2, 3);
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &ep[3], coins, 4, 5, 6, 7);
- mlk_poly_getnoise_eta2(&epp, coins, 8);
-#endif
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], &sp->vec[3],
+ coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], &ep->vec[3],
+ coins, 4, 5, 6, 7);
+ mlk_poly_getnoise_eta2(epp, coins, 8);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(sp);
mlk_polyvec_mulcache_compute(sp_cache, sp);
mlk_matvec_mul(b, at, sp, sp_cache);
- mlk_polyvec_basemul_acc_montgomery_cached(&v, pkpv, sp, sp_cache);
+ mlk_polyvec_basemul_acc_montgomery_cached(v, pkpv, sp, sp_cache);
mlk_polyvec_invntt_tomont(b);
- mlk_poly_invntt_tomont(&v);
+ mlk_poly_invntt_tomont(v);
mlk_polyvec_add(b, ep);
- mlk_poly_add(&v, &epp);
- mlk_poly_add(&v, &k);
+ mlk_poly_add(v, epp);
+ mlk_poly_add(v, k);
mlk_polyvec_reduce(b);
- mlk_poly_reduce(&v);
+ mlk_poly_reduce(v);
- mlk_pack_ciphertext(c, b, &v);
+ mlk_pack_ciphertext(c, b, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(seed, sizeof(seed));
- mlk_zeroize(&sp, sizeof(sp));
- mlk_zeroize(&sp_cache, sizeof(sp_cache));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(at, sizeof(at));
- mlk_zeroize(&k, sizeof(k));
- mlk_zeroize(&ep, sizeof(ep));
- mlk_zeroize(&epp, sizeof(epp));
+ MLK_FREE(sp_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(epp, mlk_poly, 1, context);
+ MLK_FREE(k, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ MLK_FREE(ep, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(sp, mlk_polyvec, 1, context);
+ MLK_FREE(at, mlk_polymat, 1, context);
+ MLK_FREE(seed, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_dec()` in the reference implementation @[REF].
* - We use a mulcache for the scalar product.
* - We include buffer zeroization. */
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_polyvec b, skpv;
- mlk_poly v, sb;
- mlk_polyvec_mulcache b_cache;
+ int ret = 0;
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(sb, mlk_poly, 1, context);
+ MLK_ALLOC(b_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (b == NULL || skpv == NULL || v == NULL || sb == NULL || b_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- mlk_unpack_ciphertext(b, &v, c);
+ mlk_unpack_ciphertext(b, v, c);
mlk_unpack_sk(skpv, sk);
mlk_polyvec_ntt(b);
mlk_polyvec_mulcache_compute(b_cache, b);
- mlk_polyvec_basemul_acc_montgomery_cached(&sb, skpv, b, b_cache);
- mlk_poly_invntt_tomont(&sb);
+ mlk_polyvec_basemul_acc_montgomery_cached(sb, skpv, b, b_cache);
+ mlk_poly_invntt_tomont(sb);
- mlk_poly_sub(&v, &sb);
- mlk_poly_reduce(&v);
+ mlk_poly_sub(v, sb);
+ mlk_poly_reduce(v);
- mlk_poly_tomsg(m, &v);
+ mlk_poly_tomsg(m, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&b_cache, sizeof(b_cache));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(&sb, sizeof(sb));
+ MLK_FREE(b_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(sb, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
@@ -529,4 +618,5 @@ void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
#undef mlk_pack_ciphertext
#undef mlk_unpack_ciphertext
#undef mlk_matvec_mul
-#undef mlk_poly_permute_bitrev_to_custom
+#undef mlk_polyvec_permute_bitrev_to_custom
+#undef mlk_polymat_permute_bitrev_to_custom
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/indcpa.h
index 4c44d0d411..b31756dcb6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/indcpa.h
@@ -15,7 +15,6 @@
#ifndef MLK_INDCPA_H
#define MLK_INDCPA_H
-#include
#include "cbmc.h"
#include "common.h"
#include "poly_k.h"
@@ -39,18 +38,19 @@
*
**************************************************/
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
requires(transposed == 0 || transposed == 1)
- assigns(object_whole(a))
- ensures(forall(x, 0, MLKEM_K * MLKEM_K,
- array_bound(a[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
);
-#define mlk_indcpa_keypair_derand MLK_NAMESPACE_K(indcpa_keypair_derand)
+#define mlk_indcpa_keypair_derand \
+ MLK_NAMESPACE_K(indcpa_keypair_derand) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_keypair_derand
*
@@ -68,18 +68,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
-#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc)
+#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc) MLK_CONTEXT_PARAMETERS_4
/*************************************************
* Name: mlk_indcpa_enc
*
@@ -100,19 +105,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(c))
+ assigns(memory_slice(c, MLKEM_INDCPA_BYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
-#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec)
+#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_dec
*
@@ -130,14 +139,18 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
- assigns(object_whole(m))
+ assigns(memory_slice(m, MLKEM_INDCPA_MSGBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_INDCPA_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/kem.c
index d6f4e83628..3c82d6df70 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/kem.c
@@ -8,7 +8,8 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*
* - [FIPS203]
@@ -22,12 +23,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
+#include "kem.h"
#include "indcpa.h"
-#include "kem.h"
#include "randombytes.h"
#include "symmetric.h"
#include "verify.h"
@@ -36,44 +34,24 @@
* This is to facilitate building multiple instances
* of mlkem-native (e.g. with varying security levels)
* within a single compilation unit. */
-#define mlk_check_pk MLK_ADD_PARAM_SET(mlk_check_pk)
-#define mlk_check_sk MLK_ADD_PARAM_SET(mlk_check_sk)
-#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct)
+#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct) MLK_CONTEXT_PARAMETERS_2
/* End of parameter set namespacing */
-#if defined(CBMC)
-/* Redeclaration with contract needed for CBMC only */
-int memcmp(const void *str1, const void *str2, size_t n)
-__contract__(
- requires(memory_no_alias(str1, n))
- requires(memory_no_alias(str2, n))
-);
-#endif /* CBMC */
-
-/*************************************************
- * Name: mlk_check_pk
- *
- * Description: Implements modulus check mandated by FIPS 203,
- * i.e., ensures that coefficients are in [0,q-1].
- *
- * Arguments: - const uint8_t *pk: pointer to input public key
- * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
- *
- **************************************************/
-
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- mlk_polyvec p;
- uint8_t p_reencoded[MLKEM_POLYVECBYTES];
+ int ret = 0;
+ MLK_ALLOC(p, mlk_polyvec, 1, context);
+ MLK_ALLOC(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+
+ if (p == NULL || p_reencoded == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_polyvec_frombytes(p, pk);
mlk_polyvec_reduce(p);
@@ -81,39 +59,32 @@ static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
/* We use a constant-time memcmp here to avoid having to
* declassify the PK before the PCT has succeeded. */
- res = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? -1 : 0;
+ ret = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? MLK_ERR_FAIL : 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(p_reencoded, sizeof(p_reencoded));
- mlk_zeroize(&p, sizeof(p));
- return res;
+ MLK_FREE(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+ MLK_FREE(p, mlk_polyvec, 1, context);
+ return ret;
}
-/*************************************************
- * Name: mlk_check_sk
- *
- * Description: Implements public key hash check mandated by FIPS 203,
- * i.e., ensures that
- * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
- *
- * Arguments: - const uint8_t *sk: pointer to input private key
- * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
- *
- **************************************************/
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t test[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(test, uint8_t, MLKEM_SYMBYTES, context);
+
+ if (test == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
/*
* The parts of `sk` being hashed and compared here are public, so
* no public information is leaked through the runtime or the return value
@@ -128,23 +99,32 @@ static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
mlk_hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES,
MLKEM_INDCCA_PUBLICKEYBYTES);
- res = memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, test,
- MLKEM_SYMBYTES)
- ? -1
+ /* This doesn't have to be a constant-time memcmp, but it's the only place
+ * in the library where a normal memcmp would be used otherwise, so for sake
+ * of minimizing stdlib dependency, we use our constant-time one anyway. */
+ ret = mlk_ct_memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ test, MLKEM_SYMBYTES)
+ ? MLK_ERR_FAIL
: 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(test, sizeof(test));
- return res;
+ MLK_FREE(test, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES)));
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
+);
#if defined(MLK_CONFIG_KEYGEN_PCT)
/* Specification:
@@ -152,21 +132,30 @@ __contract__(
* @[FIPS203, Section 7.1, Pairwise Consistency]. */
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES];
- uint8_t ss_enc[MLKEM_SSBYTES], ss_dec[MLKEM_SSBYTES];
+ int ret = 0;
+ MLK_ALLOC(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ MLK_ALLOC(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_ALLOC(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+
+ if (ct == NULL || ss_enc == NULL || ss_dec == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- res = crypto_kem_enc(ct, ss_enc, pk);
- if (res != 0)
+ ret = mlk_kem_enc(ct, ss_enc, pk, context);
+ if (ret != 0)
{
goto cleanup;
}
- res = crypto_kem_dec(ss_dec, ct, sk);
- if (res != 0)
+ ret = mlk_kem_dec(ss_dec, ct, sk, context);
+ if (ret != 0)
{
goto cleanup;
}
@@ -179,26 +168,36 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
}
#endif /* MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST */
- res = mlk_ct_memcmp(ss_enc, ss_dec, sizeof(ss_dec));
+ ret = mlk_ct_memcmp(ss_enc, ss_dec, MLKEM_SSBYTES);
+ /* The result of the PCT is public. */
+ MLK_CT_TESTING_DECLASSIFY(&ret, sizeof(ret));
+
+ if (ret != 0)
+ {
+ ret = MLK_ERR_FAIL;
+ }
cleanup:
- /* The result of the PCT is public. */
- MLK_CT_TESTING_DECLASSIFY(&res, sizeof(res));
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(ct, sizeof(ct));
- mlk_zeroize(ss_enc, sizeof(ss_enc));
- mlk_zeroize(ss_dec, sizeof(ss_dec));
- return res;
+ MLK_FREE(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ return ret;
}
-#else /* MLK_CONFIG_KEYGEN_PCT */
+#else /* MLK_CONFIG_KEYGEN_PCT */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
/* Skip PCT */
((void)pk);
((void)sk);
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER)
+ ((void)context);
+#endif
return 0;
}
#endif /* !MLK_CONFIG_KEYGEN_PCT */
@@ -208,164 +207,240 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
* - We optionally include PCT which is not present in
* the reference code. */
MLK_EXTERNAL_API
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_indcpa_keypair_derand(pk, sk, coins);
- memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ int ret;
+
+ ret = mlk_indcpa_keypair_derand(pk, sk, coins, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
+ mlk_memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_h(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, pk,
MLKEM_INDCCA_PUBLICKEYBYTES);
/* Value z for pseudo-random output on reject */
- memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
/* Declassify public key */
MLK_CT_TESTING_DECLASSIFY(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
/* Pairwise Consistency Test (PCT) @[FIPS140_3_IG, p.87] */
- if (mlk_check_pct(pk, sk))
+ ret = mlk_check_pct(pk, sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- return 0;
+cleanup:
+ if (ret != 0)
+ {
+ mlk_zeroize(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ mlk_zeroize(sk, MLKEM_INDCCA_SECRETKEYBYTES);
+ }
+
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_keypair()` in the reference implementation @[REF]
* - We zeroize the stack buffer */
MLK_EXTERNAL_API
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Acquire necessary randomness, and mark it as secret. */
- mlk_randombytes(coins, 2 * MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (mlk_randombytes(coins, 2 * MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, 2 * MLKEM_SYMBYTES);
- res = crypto_kem_keypair_derand(pk, sk, coins);
+ ret = mlk_kem_keypair_derand(pk, sk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_enc_derand()` in the reference implementation @[REF]
* - We include public key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (buf == NULL || kr == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.2, Modulus check] */
- if (mlk_check_pk(pk))
+ ret = mlk_kem_check_pk(pk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- memcpy(buf, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(buf, coins, MLKEM_SYMBYTES);
/* Multitarget countermeasure for coins + contributory KEM */
mlk_hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
- memcpy(ss, kr, MLKEM_SYMBYTES);
+ mlk_memcpy(ss, kr, MLKEM_SYMBYTES);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
-
- return 0;
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_enc()` in the reference implementation @[REF]
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, MLKEM_SYMBYTES, context);
- mlk_randombytes(coins, MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ if (mlk_randombytes(coins, MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, MLKEM_SYMBYTES);
- res = crypto_kem_enc_derand(ct, ss, pk, coins);
+ ret = mlk_kem_enc_derand(ct, ss, pk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_dec()` in the reference implementation @[REF]
* - We include secret key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
+ int ret = 0;
uint8_t fail;
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
- MLK_ALIGN uint8_t tmp[MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES];
-
const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+
+ if (buf == NULL || kr == NULL || tmp == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.3, Hash check] */
- if (mlk_check_sk(sk))
+ ret = mlk_kem_check_sk(sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- mlk_indcpa_dec(buf, ct, sk);
+ ret = mlk_indcpa_dec(buf, ct, sk, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
/* Multitarget countermeasure for coins + contributory KEM */
- memcpy(buf + MLKEM_SYMBYTES,
- sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(buf + MLKEM_SYMBYTES,
+ sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* Recompute and compare ciphertext */
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
fail = mlk_ct_memcmp(ct, tmp, MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Compute rejection key */
- memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- MLKEM_SYMBYTES);
- memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
- mlk_hash_j(ss, tmp, sizeof(tmp));
+ mlk_memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
+ mlk_memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
+ mlk_hash_j(ss, tmp, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Copy true key to return buffer if fail is 0 */
mlk_ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
- mlk_zeroize(tmp, sizeof(tmp));
+ MLK_FREE(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
- return 0;
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef mlk_check_pk
-#undef mlk_check_sk
#undef mlk_check_pct
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/kem.h
index d3e5f50ce6..0502715c39 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/kem.h
@@ -10,12 +10,16 @@
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ * CRYSTALS-Kyber C reference implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/ref
*/
#ifndef MLK_KEM_H
#define MLK_KEM_H
-#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include "sys.h"
@@ -23,9 +27,7 @@
#if defined(MLK_CHECK_APIS)
/* Include to ensure consistency between internal kem.h
* and external mlkem_native.h. */
-#define MLK_CONFIG_API_NO_SUPERCOP
#include "mlkem_native.h"
-#undef MLK_CONFIG_API_NO_SUPERCOP
#if MLKEM_INDCCA_SECRETKEYBYTES != \
MLKEM_SECRETKEYBYTES(MLK_CONFIG_PARAMETER_SET)
@@ -44,14 +46,79 @@
#endif /* MLK_CHECK_APIS */
-#define crypto_kem_keypair_derand MLK_NAMESPACE_K(keypair_derand)
-#define crypto_kem_keypair MLK_NAMESPACE_K(keypair)
-#define crypto_kem_enc_derand MLK_NAMESPACE_K(enc_derand)
-#define crypto_kem_enc MLK_NAMESPACE_K(enc)
-#define crypto_kem_dec MLK_NAMESPACE_K(dec)
+#define mlk_kem_keypair_derand \
+ MLK_NAMESPACE_K(keypair_derand) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_keypair MLK_NAMESPACE_K(keypair) MLK_CONTEXT_PARAMETERS_2
+#define mlk_kem_enc_derand MLK_NAMESPACE_K(enc_derand) MLK_CONTEXT_PARAMETERS_4
+#define mlk_kem_enc MLK_NAMESPACE_K(enc) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_dec MLK_NAMESPACE_K(dec) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_check_pk MLK_NAMESPACE_K(check_pk) MLK_CONTEXT_PARAMETERS_1
+#define mlk_kem_check_sk MLK_NAMESPACE_K(check_sk) MLK_CONTEXT_PARAMETERS_1
+
+/*************************************************
+ * Name: mlk_kem_check_pk
+ *
+ * Description: Implements modulus check mandated by FIPS 203,
+ * i.e., ensures that coefficients are in [0,q-1].
+ *
+ * Arguments: - const uint8_t *pk: pointer to input public key
+ * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the modulus check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+
+/*************************************************
+ * Name: mlk_kem_check_sk
+ *
+ * Description: Implements public key hash check mandated by FIPS 203,
+ * i.e., ensures that
+ * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
+ *
+ * Arguments: - const uint8_t *sk: pointer to input private key
+ * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the public key hash check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
/*************************************************
- * Name: crypto_kem_keypair_derand
+ * Name: mlk_kem_keypair_derand
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -67,26 +134,33 @@
* random bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 16, ML-KEM.KeyGen_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_keypair
+ * Name: mlk_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -99,24 +173,32 @@ __contract__(
* bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
*
* Specification: Implements @[FIPS203, Algorithm 19, ML-KEM.KeyGen]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_enc_derand
+ * Name: mlk_kem_enc_derand
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -134,29 +216,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 17, ML-KEM.Encaps_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
/*************************************************
- * Name: crypto_kem_enc
+ * Name: mlk_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -171,27 +258,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
*
* Specification: Implements @[FIPS203, Algorithm 20, ML-KEM.Encaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_dec
+ * Name: mlk_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
@@ -206,22 +300,27 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'hash check' @[FIPS203, Section 7.3]
- * for the secret key fails.
+ * - MLK_ERR_FAIL: If the 'hash check' @[FIPS203, Section 7.3]
+ * for the secret key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 21, ML-KEM.Decaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(ss))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_KEM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/api.h
index aea28a3af4..0308f2bd51 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/api.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/api.h
@@ -17,10 +17,18 @@
* and run sanity checks.
*/
-#include <stdint.h>
#include "../cbmc.h"
#include "../common.h"
+/* Backends must return MLK_NATIVE_FUNC_SUCCESS upon success. */
+#define MLK_NATIVE_FUNC_SUCCESS (0)
+/* Backends may return MLK_NATIVE_FUNC_FALLBACK to signal to the frontend that
+ * the target/parameters are unsupported; typically, this would be because of
+ * dependencies on CPU features not detected on the host CPU. In this case,
+ * the frontend falls back to the default C implementation. */
+#define MLK_NATIVE_FUNC_FALLBACK (-1)
+
+
/* Absolute exclusive upper bound for the output of the inverse NTT
*
* NOTE: This is the same bound as in poly.h and has to be kept
@@ -74,12 +82,16 @@
*
* Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_ntt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_ntt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_NTT */
@@ -140,11 +152,14 @@ __contract__(
*
* Arguments: - uint16_t *a: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_intt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_intt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_INTT */
@@ -156,11 +171,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_reduce_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_reduce_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
@@ -173,11 +191,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_tomont_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tomont_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
@@ -203,13 +224,15 @@ __contract__(
* OUTPUT
* - cache: pointer to multiplication cache
**************************************************/
-static MLK_INLINE void mlk_poly_mulcache_compute_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_mulcache_compute_native(
int16_t cache[MLKEM_N / 2], const int16_t mlk_poly[MLKEM_N])
__contract__(
requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
- assigns(object_whole(cache))
- ensures(array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
+ assigns(memory_slice(cache, sizeof(int16_t) * (MLKEM_N / 2)))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
@@ -234,7 +257,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
__contract__(
@@ -244,6 +268,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 2 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 2 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
@@ -267,7 +292,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
__contract__(
@@ -277,6 +303,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 3 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 3 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
@@ -300,7 +327,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
__contract__(
@@ -310,6 +338,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 4 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 4 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
@@ -324,18 +353,20 @@ __contract__(
*
* Arguments: INPUT:
* - a: const pointer to input polynomial,
- * with each coefficient in the range -Q+1 .. Q-1
+ * with each coefficient in the range 0 .. Q-1
* OUTPUT
* - r: pointer to output byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+ const int16_t a[MLKEM_N])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
);
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
@@ -353,13 +384,15 @@ __contract__(
* - a: const pointer to input byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_frombytes_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_frombytes_native(
int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(a, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
);
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
@@ -381,6 +414,7 @@ __contract__(
* Otherwise, returns non-negative number of sampled 16-bit integers (at most
* len).
**************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
const uint8_t *buf,
unsigned buflen)
@@ -389,8 +423,10 @@ __contract__(
requires(memory_no_alias(r, sizeof(int16_t) * len))
requires(memory_no_alias(buf, buflen))
assigns(memory_slice(r, sizeof(int16_t) * len))
- ensures(return_value == -1 || (0 <= return_value && return_value <= len))
- ensures(return_value != -1 ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> (0 <= return_value && return_value <= len))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
@@ -408,8 +444,15 @@ __contract__(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d4_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d4_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
@@ -425,8 +468,15 @@ static MLK_INLINE void mlk_poly_compress_d4_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d10_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d10_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
@@ -444,8 +494,15 @@ static MLK_INLINE void mlk_poly_compress_d10_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d4_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d4_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
@@ -463,8 +520,15 @@ static MLK_INLINE void mlk_poly_decompress_d4_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d10_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d10_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
@@ -482,8 +546,15 @@ static MLK_INLINE void mlk_poly_decompress_d10_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d5_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d5_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
@@ -499,8 +570,15 @@ static MLK_INLINE void mlk_poly_compress_d5_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d11_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d11_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
@@ -518,8 +596,15 @@ static MLK_INLINE void mlk_poly_compress_d11_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d5_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d5_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
@@ -537,8 +622,15 @@ static MLK_INLINE void mlk_poly_decompress_d5_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d11_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d11_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/meta.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/meta.h
index f2b9b848b7..4291d629b1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/meta.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/meta.h
@@ -18,4 +18,8 @@
#include "x86_64/meta.h"
#endif
+#if defined(MLK_SYS_RISCV64_RVV)
+#include "riscv64/meta.h"
+#endif
+
#endif /* !MLK_NATIVE_META_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/meta.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/meta.h
index d8459ec6fc..39fa04c2b3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/meta.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/meta.h
@@ -30,143 +30,272 @@
#define MLK_USE_NATIVE_POLY_DECOMPRESS_D11
#if !defined(__ASSEMBLER__)
-#include <immintrin.h>
#include "../../common.h"
+#include "../api.h"
#include "src/arith_native_x86_64.h"
+#include "src/compress_consts.h"
static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
{
- mlk_nttunpack_avx2((__m256i *)(data));
+ if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ mlk_nttunpack_avx2(data);
+ }
}
+MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
const uint8_t *buf,
unsigned buflen)
{
- /* AVX2 implementation assumes specific buffer lengths */
- if (len != MLKEM_N || buflen != MLK_AVX2_REJ_UNIFORM_BUFLEN)
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2) || len != MLKEM_N ||
+ buflen % 12 != 0)
{
- return -1;
+ return MLK_NATIVE_FUNC_FALLBACK;
}
-
- return (int)mlk_rej_uniform_avx2(r, buf);
+ return (int)mlk_rej_uniform_asm(r, buf, buflen, mlk_rej_uniform_table);
}
-static MLK_INLINE void mlk_ntt_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N])
{
- mlk_ntt_avx2((__m256i *)data, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_ntt_avx2(data, mlk_qdata);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_intt_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N])
{
- mlk_invntt_avx2((__m256i *)data, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_invntt_avx2(data, mlk_qdata);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_reduce_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N])
{
- mlk_reduce_avx2((__m256i *)data, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_reduce_avx2(data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_tomont_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N])
{
- mlk_tomont_avx2((__m256i *)data, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_tomont_avx2(data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_mulcache_compute_native(
- int16_t x[MLKEM_N / 2], const int16_t y[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+ const int16_t y[MLKEM_N])
{
- mlk_poly_mulcache_compute_avx2((__m256i *)x, (const __m256i *)y,
- mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_mulcache_compute_avx2(x, y, mlk_qdata);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
{
- mlk_polyvec_basemul_acc_montgomery_cached_avx2(2, r, a, b, b_cache);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
{
- mlk_polyvec_basemul_acc_montgomery_cached_avx2(3, r, a, b, b_cache);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
{
- mlk_polyvec_basemul_acc_montgomery_cached_avx2(4, r, a, b, b_cache);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+ const int16_t a[MLKEM_N])
{
- mlk_ntttobytes_avx2(r, (const __m256i *)a, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_ntttobytes_avx2(r, a);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_frombytes_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_frombytes_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYBYTES])
{
- mlk_nttfrombytes_avx2((__m256i *)r, a, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_nttfrombytes_avx2(r, a);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-static MLK_INLINE void mlk_poly_compress_d4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d4_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N])
{
- mlk_poly_compress_d4_avx2(r, (const __m256i *)a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_compress_d4_avx2(r, a, mlk_compress_d4_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_compress_d10_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d10_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N])
{
- mlk_poly_compress_d10_avx2(r, (const __m256i *)a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_compress_d10_avx2(r, a, mlk_compress_d10_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_decompress_d4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d4_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
{
- mlk_poly_decompress_d4_avx2((__m256i *)r, a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_decompress_d4_avx2(r, a, mlk_decompress_d4_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_decompress_d10_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d10_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
{
- mlk_poly_decompress_d10_avx2((__m256i *)r, a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_decompress_d10_avx2(r, a, mlk_decompress_d10_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-static MLK_INLINE void mlk_poly_compress_d5_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d5_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N])
{
- mlk_poly_compress_d5_avx2(r, (const __m256i *)a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_compress_d5_avx2(r, a, mlk_compress_d5_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_compress_d11_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d11_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N])
{
- mlk_poly_compress_d11_avx2(r, (const __m256i *)a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_compress_d11_avx2(r, a, mlk_compress_d11_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_decompress_d5_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d5_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
{
- mlk_poly_decompress_d5_avx2((__m256i *)r, a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_decompress_d5_avx2(r, a, mlk_decompress_d5_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_decompress_d11_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d11_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
{
- mlk_poly_decompress_d11_avx2((__m256i *)r, a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_decompress_d11_avx2(r, a, mlk_decompress_d11_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/align.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/align.h
deleted file mode 100644
index 5086f69864..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/align.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
-#ifndef MLK_NATIVE_X86_64_SRC_ALIGN_H
-#define MLK_NATIVE_X86_64_SRC_ALIGN_H
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-#include <immintrin.h>
-#include <stdint.h>
-
-#define MLK_ALIGNED_INT16(N) \
- union \
- { \
- int16_t coeffs[N]; \
- __m256i vec[(N + 15) / 16]; \
- }
-
-#endif /* !MLK_NATIVE_X86_64_SRC_ALIGN_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/arith_native_x86_64.h
index 2e8d6849a3..d73ba4346e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/arith_native_x86_64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/arith_native_x86_64.h
@@ -7,81 +7,303 @@
#include "../../../common.h"
-#include <immintrin.h>
#include <stdint.h>
+#include "compress_consts.h"
#include "consts.h"
#define MLK_AVX2_REJ_UNIFORM_BUFLEN \
(3 * 168) /* REJ_UNIFORM_NBLOCKS * SHAKE128_RATE */
-#define mlk_rej_uniform_avx2 MLK_NAMESPACE(rej_uniform_avx2)
-unsigned mlk_rej_uniform_avx2(int16_t *r, const uint8_t *buf);
-
#define mlk_rej_uniform_table MLK_NAMESPACE(rej_uniform_table)
-extern const uint8_t mlk_rej_uniform_table[256][8];
+extern const uint8_t mlk_rej_uniform_table[];
+
+#define mlk_rej_uniform_asm MLK_NAMESPACE(rej_uniform_asm)
+MLK_MUST_CHECK_RETURN_VALUE
+uint64_t mlk_rej_uniform_asm(int16_t *r, const uint8_t *buf, unsigned buflen,
+ const uint8_t *table)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_rej_uniform.ml. */
+__contract__(
+ requires(buflen % 12 == 0)
+ requires(memory_no_alias(buf, buflen))
+ requires(table == mlk_rej_uniform_table)
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value <= MLKEM_N)
+ ensures(array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
+);
#define mlk_ntt_avx2 MLK_NAMESPACE(ntt_avx2)
-void mlk_ntt_avx2(__m256i *r, const __m256i *mlk_qdata);
+void mlk_ntt_avx2(int16_t *r, const int16_t *qdata)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_ntt.ml */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(array_abs_bound(r, 0, MLKEM_N, 8192))
+ requires(qdata == mlk_qdata)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ /* check-magic: off */
+ ensures(array_abs_bound(r, 0, MLKEM_N, 23595))
+ /* check-magic: on */
+);
#define mlk_invntt_avx2 MLK_NAMESPACE(invntt_avx2)
-void mlk_invntt_avx2(__m256i *r, const __m256i *mlk_qdata);
+void mlk_invntt_avx2(int16_t *r, const int16_t *qdata)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_intt.ml */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(qdata == mlk_qdata)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ /* check-magic: off */
+ ensures(array_abs_bound(r, 0, MLKEM_N, 26632))
+ /* check-magic: on */
+);
#define mlk_nttunpack_avx2 MLK_NAMESPACE(nttunpack_avx2)
-void mlk_nttunpack_avx2(__m256i *r);
+void mlk_nttunpack_avx2(int16_t *r)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_unpack.ml */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ /* Output is a permutation of input: every output coefficient
+ * is some input coefficient */
+ ensures(forall(i, 0, MLKEM_N, exists(j, 0, MLKEM_N,
+ r[i] == old(*(int16_t (*)[MLKEM_N])r)[j])))
+);
#define mlk_reduce_avx2 MLK_NAMESPACE(reduce_avx2)
-void mlk_reduce_avx2(__m256i *r, const __m256i *mlk_qdata);
-
-#define mlk_basemul_avx2 MLK_NAMESPACE(basemul_avx2)
-void mlk_basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b,
- const __m256i *b_cache, const __m256i *mlk_qdata);
+void mlk_reduce_avx2(int16_t *r)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_reduce.ml */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+);
#define mlk_poly_mulcache_compute_avx2 MLK_NAMESPACE(poly_mulcache_compute_avx2)
-void mlk_poly_mulcache_compute_avx2(__m256i *out, const __m256i *in,
- const __m256i *mlk_qdata);
+void mlk_poly_mulcache_compute_avx2(int16_t *out, const int16_t *in,
+ const int16_t *qdata)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_mulcache_compute.ml */
+__contract__(
+ requires(memory_no_alias(out, sizeof(int16_t) * (MLKEM_N / 2)))
+ requires(memory_no_alias(in, sizeof(int16_t) * MLKEM_N))
+ requires(qdata == mlk_qdata)
+ assigns(memory_slice(out, sizeof(int16_t) * (MLKEM_N / 2)))
+ ensures(array_abs_bound(out, 0, MLKEM_N/2, MLKEM_Q))
+);
-#define mlk_polyvec_basemul_acc_montgomery_cached_avx2 \
- MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_avx2)
-void mlk_polyvec_basemul_acc_montgomery_cached_avx2(unsigned k,
- int16_t r[MLKEM_N],
- const int16_t *a,
- const int16_t *b,
- const int16_t *kb_cache);
+#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k2 \
+ MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, sizeof(int16_t) * 2 * MLKEM_N))
+ requires(memory_no_alias(b, sizeof(int16_t) * 2 * MLKEM_N))
+ requires(memory_no_alias(b_cache, sizeof(int16_t) * 2 * (MLKEM_N / 2)))
+ requires(array_abs_bound(a, 0, 2 * MLKEM_N, MLKEM_UINT12_LIMIT + 1))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+);
+
+#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k3 \
+ MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, sizeof(int16_t) * 3 * MLKEM_N))
+ requires(memory_no_alias(b, sizeof(int16_t) * 3 * MLKEM_N))
+ requires(memory_no_alias(b_cache, sizeof(int16_t) * 3 * (MLKEM_N / 2)))
+ requires(array_abs_bound(a, 0, 3 * MLKEM_N, MLKEM_UINT12_LIMIT + 1))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+);
+
+#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k4 \
+ MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, sizeof(int16_t) * 4 * MLKEM_N))
+ requires(memory_no_alias(b, sizeof(int16_t) * 4 * MLKEM_N))
+ requires(memory_no_alias(b_cache, sizeof(int16_t) * 4 * (MLKEM_N / 2)))
+ requires(array_abs_bound(a, 0, 4 * MLKEM_N, MLKEM_UINT12_LIMIT + 1))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+);
#define mlk_ntttobytes_avx2 MLK_NAMESPACE(ntttobytes_avx2)
-void mlk_ntttobytes_avx2(uint8_t *r, const __m256i *a,
- const __m256i *mlk_qdata);
+void mlk_ntttobytes_avx2(uint8_t *r, const int16_t *a)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_tobytes.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYBYTES))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+);
#define mlk_nttfrombytes_avx2 MLK_NAMESPACE(nttfrombytes_avx2)
-void mlk_nttfrombytes_avx2(__m256i *r, const uint8_t *a,
- const __m256i *mlk_qdata);
+void mlk_nttfrombytes_avx2(int16_t *r, const uint8_t *a)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_frombytes.ml.
+ */
+__contract__(
+ requires(memory_no_alias(a, MLKEM_POLYBYTES))
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+);
#define mlk_tomont_avx2 MLK_NAMESPACE(tomont_avx2)
-void mlk_tomont_avx2(__m256i *r, const __m256i *mlk_qdata);
+void mlk_tomont_avx2(int16_t *r)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_tomont.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+);
#define mlk_poly_compress_d4_avx2 MLK_NAMESPACE(poly_compress_d4_avx2)
void mlk_poly_compress_d4_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
- const __m256i *MLK_RESTRICT a);
+ const int16_t *MLK_RESTRICT a,
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_compress_d4.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ requires(data == mlk_compress_d4_data)
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+);
+
#define mlk_poly_decompress_d4_avx2 MLK_NAMESPACE(poly_decompress_d4_avx2)
-void mlk_poly_decompress_d4_avx2(__m256i *MLK_RESTRICT r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-#define mlk_poly_compress_d10_avx2 MLK_NAMESPACE(poly_compress10_avx2)
+void mlk_poly_decompress_d4_avx2(int16_t *MLK_RESTRICT r,
+ const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4],
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_decompress_d4.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(data == mlk_decompress_d4_data)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define mlk_poly_compress_d10_avx2 MLK_NAMESPACE(poly_compress_d10_avx2)
void mlk_poly_compress_d10_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
- const __m256i *MLK_RESTRICT a);
-#define mlk_poly_decompress_d10_avx2 MLK_NAMESPACE(poly_decompress10_avx2)
+ const int16_t *MLK_RESTRICT a,
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_compress_d10.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ requires(data == mlk_compress_d10_data)
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+);
+
+#define mlk_poly_decompress_d10_avx2 MLK_NAMESPACE(poly_decompress_d10_avx2)
void mlk_poly_decompress_d10_avx2(
- __m256i *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+ int16_t *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10],
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_decompress_d10.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(data == mlk_decompress_d10_data)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
#define mlk_poly_compress_d5_avx2 MLK_NAMESPACE(poly_compress_d5_avx2)
void mlk_poly_compress_d5_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
- const __m256i *MLK_RESTRICT a);
+ const int16_t *MLK_RESTRICT a,
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_compress_d5.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ requires(data == mlk_compress_d5_data)
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+);
+
#define mlk_poly_decompress_d5_avx2 MLK_NAMESPACE(poly_decompress_d5_avx2)
-void mlk_poly_decompress_d5_avx2(__m256i *MLK_RESTRICT r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-#define mlk_poly_compress_d11_avx2 MLK_NAMESPACE(poly_compress11_avx2)
+void mlk_poly_decompress_d5_avx2(int16_t *MLK_RESTRICT r,
+ const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5],
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_decompress_d5.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(data == mlk_decompress_d5_data)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define mlk_poly_compress_d11_avx2 MLK_NAMESPACE(poly_compress_d11_avx2)
void mlk_poly_compress_d11_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
- const __m256i *MLK_RESTRICT a);
-#define mlk_poly_decompress_d11_avx2 MLK_NAMESPACE(poly_decompress11_avx2)
+ const int16_t *MLK_RESTRICT a,
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_compress_d11.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ requires(data == mlk_compress_d11_data)
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+);
+
+#define mlk_poly_decompress_d11_avx2 MLK_NAMESPACE(poly_decompress_d11_avx2)
void mlk_poly_decompress_d11_avx2(
- __m256i *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+ int16_t *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11],
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_decompress_d11.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(data == mlk_decompress_d11_data)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+);
#endif /* !MLK_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/basemul.S
deleted file mode 100644
index fbe5a8e91f..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/basemul.S
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- *
- * The main difference is the use of a mulcache.
- */
-
-#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-
-/*
- * WARNING: This file is auto-derived from the mlkem-native source file
- * dev/x86_64/src/basemul.S using scripts/simpasm. Do not modify it directly.
- */
-
-
-.text
-.balign 4
-.global MLK_ASM_NAMESPACE(basemul_avx2)
-MLK_ASM_FN_SYMBOL(basemul_avx2)
-
- movq %rsp, %r11
- andq $-0x20, %rsp
- subq $0x20, %rsp
- vmovdqa 0x20(%r8), %ymm0
- vmovdqa (%rsi), %ymm1
- vmovdqa 0x20(%rsi), %ymm2
- vmovdqa 0x40(%rsi), %ymm3
- vmovdqa 0x60(%rsi), %ymm4
- vpmullw %ymm0, %ymm1, %ymm9
- vpmullw %ymm0, %ymm2, %ymm10
- vpmullw %ymm0, %ymm3, %ymm11
- vpmullw %ymm0, %ymm4, %ymm12
- vmovdqa (%rdx), %ymm5
- vmovdqa 0x20(%rdx), %ymm6
- vpmulhw %ymm5, %ymm1, %ymm13
- vpmulhw %ymm6, %ymm1, %ymm1
- vpmulhw %ymm5, %ymm2, %ymm14
- vmovdqa 0x40(%rdx), %ymm7
- vmovdqa 0x60(%rdx), %ymm8
- vpmulhw %ymm7, %ymm3, %ymm15
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm7, %ymm4, %ymm0
- vmovdqa %ymm13, (%rsp)
- vpmullw %ymm5, %ymm9, %ymm13
- vpmullw %ymm6, %ymm9, %ymm9
- vpmullw %ymm5, %ymm10, %ymm5
- vpmullw %ymm7, %ymm11, %ymm6
- vpmullw %ymm8, %ymm11, %ymm11
- vpmullw %ymm7, %ymm12, %ymm7
- vmovdqa (%rcx), %ymm8
- vpmulhw %ymm8, %ymm2, %ymm2
- vpmullw %ymm8, %ymm10, %ymm10
- vmovdqa 0x20(%rcx), %ymm8
- vpmulhw %ymm8, %ymm4, %ymm4
- vpmullw %ymm8, %ymm12, %ymm12
- vmovdqa (%r8), %ymm8
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm9, %ymm9
- vpmulhw %ymm8, %ymm5, %ymm5
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm6, %ymm6
- vpmulhw %ymm8, %ymm11, %ymm11
- vpmulhw %ymm8, %ymm7, %ymm7
- vpmulhw %ymm8, %ymm12, %ymm12
- vpsubw (%rsp), %ymm13, %ymm13
- vpsubw %ymm9, %ymm1, %ymm9
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm10, %ymm2, %ymm10
- vpsubw %ymm6, %ymm15, %ymm6
- vpsubw %ymm11, %ymm3, %ymm11
- vpsubw %ymm7, %ymm0, %ymm7
- vpsubw %ymm12, %ymm4, %ymm12
- vpaddw %ymm5, %ymm9, %ymm9
- vpaddw %ymm7, %ymm11, %ymm11
- vpsubw %ymm13, %ymm10, %ymm13
- vpsubw %ymm12, %ymm6, %ymm6
- vmovdqa %ymm13, (%rdi)
- vmovdqa %ymm9, 0x20(%rdi)
- vmovdqa %ymm6, 0x40(%rdi)
- vmovdqa %ymm11, 0x60(%rdi)
- vmovdqa 0x20(%r8), %ymm0
- vmovdqa 0x80(%rsi), %ymm1
- vmovdqa 0xa0(%rsi), %ymm2
- vmovdqa 0xc0(%rsi), %ymm3
- vmovdqa 0xe0(%rsi), %ymm4
- vpmullw %ymm0, %ymm1, %ymm9
- vpmullw %ymm0, %ymm2, %ymm10
- vpmullw %ymm0, %ymm3, %ymm11
- vpmullw %ymm0, %ymm4, %ymm12
- vmovdqa 0x80(%rdx), %ymm5
- vmovdqa 0xa0(%rdx), %ymm6
- vpmulhw %ymm5, %ymm1, %ymm13
- vpmulhw %ymm6, %ymm1, %ymm1
- vpmulhw %ymm5, %ymm2, %ymm14
- vmovdqa 0xc0(%rdx), %ymm7
- vmovdqa 0xe0(%rdx), %ymm8
- vpmulhw %ymm7, %ymm3, %ymm15
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm7, %ymm4, %ymm0
- vmovdqa %ymm13, (%rsp)
- vpmullw %ymm5, %ymm9, %ymm13
- vpmullw %ymm6, %ymm9, %ymm9
- vpmullw %ymm5, %ymm10, %ymm5
- vpmullw %ymm7, %ymm11, %ymm6
- vpmullw %ymm8, %ymm11, %ymm11
- vpmullw %ymm7, %ymm12, %ymm7
- vmovdqa 0x40(%rcx), %ymm8
- vpmulhw %ymm8, %ymm2, %ymm2
- vpmullw %ymm8, %ymm10, %ymm10
- vmovdqa 0x60(%rcx), %ymm8
- vpmulhw %ymm8, %ymm4, %ymm4
- vpmullw %ymm8, %ymm12, %ymm12
- vmovdqa (%r8), %ymm8
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm9, %ymm9
- vpmulhw %ymm8, %ymm5, %ymm5
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm6, %ymm6
- vpmulhw %ymm8, %ymm11, %ymm11
- vpmulhw %ymm8, %ymm7, %ymm7
- vpmulhw %ymm8, %ymm12, %ymm12
- vpsubw (%rsp), %ymm13, %ymm13
- vpsubw %ymm9, %ymm1, %ymm9
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm10, %ymm2, %ymm10
- vpsubw %ymm6, %ymm15, %ymm6
- vpsubw %ymm11, %ymm3, %ymm11
- vpsubw %ymm7, %ymm0, %ymm7
- vpsubw %ymm12, %ymm4, %ymm12
- vpaddw %ymm5, %ymm9, %ymm9
- vpaddw %ymm7, %ymm11, %ymm11
- vpsubw %ymm13, %ymm10, %ymm13
- vpsubw %ymm12, %ymm6, %ymm6
- vmovdqa %ymm13, 0x80(%rdi)
- vmovdqa %ymm9, 0xa0(%rdi)
- vmovdqa %ymm6, 0xc0(%rdi)
- vmovdqa %ymm11, 0xe0(%rdi)
- vmovdqa 0x20(%r8), %ymm0
- vmovdqa 0x100(%rsi), %ymm1
- vmovdqa 0x120(%rsi), %ymm2
- vmovdqa 0x140(%rsi), %ymm3
- vmovdqa 0x160(%rsi), %ymm4
- vpmullw %ymm0, %ymm1, %ymm9
- vpmullw %ymm0, %ymm2, %ymm10
- vpmullw %ymm0, %ymm3, %ymm11
- vpmullw %ymm0, %ymm4, %ymm12
- vmovdqa 0x100(%rdx), %ymm5
- vmovdqa 0x120(%rdx), %ymm6
- vpmulhw %ymm5, %ymm1, %ymm13
- vpmulhw %ymm6, %ymm1, %ymm1
- vpmulhw %ymm5, %ymm2, %ymm14
- vmovdqa 0x140(%rdx), %ymm7
- vmovdqa 0x160(%rdx), %ymm8
- vpmulhw %ymm7, %ymm3, %ymm15
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm7, %ymm4, %ymm0
- vmovdqa %ymm13, (%rsp)
- vpmullw %ymm5, %ymm9, %ymm13
- vpmullw %ymm6, %ymm9, %ymm9
- vpmullw %ymm5, %ymm10, %ymm5
- vpmullw %ymm7, %ymm11, %ymm6
- vpmullw %ymm8, %ymm11, %ymm11
- vpmullw %ymm7, %ymm12, %ymm7
- vmovdqa 0x80(%rcx), %ymm8
- vpmulhw %ymm8, %ymm2, %ymm2
- vpmullw %ymm8, %ymm10, %ymm10
- vmovdqa 0xa0(%rcx), %ymm8
- vpmulhw %ymm8, %ymm4, %ymm4
- vpmullw %ymm8, %ymm12, %ymm12
- vmovdqa (%r8), %ymm8
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm9, %ymm9
- vpmulhw %ymm8, %ymm5, %ymm5
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm6, %ymm6
- vpmulhw %ymm8, %ymm11, %ymm11
- vpmulhw %ymm8, %ymm7, %ymm7
- vpmulhw %ymm8, %ymm12, %ymm12
- vpsubw (%rsp), %ymm13, %ymm13
- vpsubw %ymm9, %ymm1, %ymm9
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm10, %ymm2, %ymm10
- vpsubw %ymm6, %ymm15, %ymm6
- vpsubw %ymm11, %ymm3, %ymm11
- vpsubw %ymm7, %ymm0, %ymm7
- vpsubw %ymm12, %ymm4, %ymm12
- vpaddw %ymm5, %ymm9, %ymm9
- vpaddw %ymm7, %ymm11, %ymm11
- vpsubw %ymm13, %ymm10, %ymm13
- vpsubw %ymm12, %ymm6, %ymm6
- vmovdqa %ymm13, 0x100(%rdi)
- vmovdqa %ymm9, 0x120(%rdi)
- vmovdqa %ymm6, 0x140(%rdi)
- vmovdqa %ymm11, 0x160(%rdi)
- vmovdqa 0x20(%r8), %ymm0
- vmovdqa 0x180(%rsi), %ymm1
- vmovdqa 0x1a0(%rsi), %ymm2
- vmovdqa 0x1c0(%rsi), %ymm3
- vmovdqa 0x1e0(%rsi), %ymm4
- vpmullw %ymm0, %ymm1, %ymm9
- vpmullw %ymm0, %ymm2, %ymm10
- vpmullw %ymm0, %ymm3, %ymm11
- vpmullw %ymm0, %ymm4, %ymm12
- vmovdqa 0x180(%rdx), %ymm5
- vmovdqa 0x1a0(%rdx), %ymm6
- vpmulhw %ymm5, %ymm1, %ymm13
- vpmulhw %ymm6, %ymm1, %ymm1
- vpmulhw %ymm5, %ymm2, %ymm14
- vmovdqa 0x1c0(%rdx), %ymm7
- vmovdqa 0x1e0(%rdx), %ymm8
- vpmulhw %ymm7, %ymm3, %ymm15
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm7, %ymm4, %ymm0
- vmovdqa %ymm13, (%rsp)
- vpmullw %ymm5, %ymm9, %ymm13
- vpmullw %ymm6, %ymm9, %ymm9
- vpmullw %ymm5, %ymm10, %ymm5
- vpmullw %ymm7, %ymm11, %ymm6
- vpmullw %ymm8, %ymm11, %ymm11
- vpmullw %ymm7, %ymm12, %ymm7
- vmovdqa 0xc0(%rcx), %ymm8
- vpmulhw %ymm8, %ymm2, %ymm2
- vpmullw %ymm8, %ymm10, %ymm10
- vmovdqa 0xe0(%rcx), %ymm8
- vpmulhw %ymm8, %ymm4, %ymm4
- vpmullw %ymm8, %ymm12, %ymm12
- vmovdqa (%r8), %ymm8
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm9, %ymm9
- vpmulhw %ymm8, %ymm5, %ymm5
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm6, %ymm6
- vpmulhw %ymm8, %ymm11, %ymm11
- vpmulhw %ymm8, %ymm7, %ymm7
- vpmulhw %ymm8, %ymm12, %ymm12
- vpsubw (%rsp), %ymm13, %ymm13
- vpsubw %ymm9, %ymm1, %ymm9
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm10, %ymm2, %ymm10
- vpsubw %ymm6, %ymm15, %ymm6
- vpsubw %ymm11, %ymm3, %ymm11
- vpsubw %ymm7, %ymm0, %ymm7
- vpsubw %ymm12, %ymm4, %ymm12
- vpaddw %ymm5, %ymm9, %ymm9
- vpaddw %ymm7, %ymm11, %ymm11
- vpsubw %ymm13, %ymm10, %ymm13
- vpsubw %ymm12, %ymm6, %ymm6
- vmovdqa %ymm13, 0x180(%rdi)
- vmovdqa %ymm9, 0x1a0(%rdi)
- vmovdqa %ymm6, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
- movq %r11, %rsp
- retq
-
-#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
- */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/basemul.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/basemul.c
deleted file mode 100644
index 24f6231101..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/basemul.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [FIPS203]
- * FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
- * National Institute of Standards and Technology
- * https://csrc.nist.gov/pubs/fips/203/final
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-#include "../../../common.h"
-
-#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-
-#include "../../../verify.h"
-#include "arith_native_x86_64.h"
-#include "consts.h"
-
-static void poly_basemul_montgomery_avx2(int16_t r[MLKEM_N],
- const int16_t a[MLKEM_N],
- const int16_t b[MLKEM_N],
- const int16_t b_cache[MLKEM_N / 2])
-{
- mlk_basemul_avx2((__m256i *)r, (const __m256i *)a, (const __m256i *)b,
- (const __m256i *)b_cache, mlk_qdata.vec);
-}
-
-static void poly_add_avx2(int16_t r[MLKEM_N], const int16_t a[MLKEM_N],
- const int16_t b[MLKEM_N])
-{
- unsigned i;
- __m256i f0, f1;
-
- for (i = 0; i < MLKEM_N; i += 16)
- {
- f0 = _mm256_load_si256((const __m256i *)&a[i]);
- f1 = _mm256_load_si256((const __m256i *)&b[i]);
- f0 = _mm256_add_epi16(f0, f1);
- _mm256_store_si256((__m256i *)&r[i], f0);
- }
-}
-
-void mlk_polyvec_basemul_acc_montgomery_cached_avx2(unsigned k,
- int16_t r[MLKEM_N],
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache)
-{
- unsigned i;
- int16_t t[MLKEM_N] MLK_ALIGN;
-
- /* Coefficient-wise bound of each basemul is 2q.
- * Since we are accumulating at most 4 times, the
- * overall bound is 8q < INT16_MAX. */
- poly_basemul_montgomery_avx2(r, &a[0], &b[0], &b_cache[0]);
- for (i = 1; i < k; i++)
- {
- poly_basemul_montgomery_avx2(t, &a[i * MLKEM_N], &b[i * MLKEM_N],
- &b_cache[i * (MLKEM_N / 2)]);
- poly_add_avx2(r, r, t);
- }
-
- /* Specification: Partially implements
- * @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(t, sizeof(t));
-}
-
-#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
- */
-
-MLK_EMPTY_CU(avx2_basemul)
-
-#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
- !MLK_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_avx2.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_avx2.c
deleted file mode 100644
index c9827099d0..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_avx2.c
+++ /dev/null
@@ -1,387 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-#include "../../../common.h"
-
-#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-
-#include
-#include
-#include
-#include "arith_native_x86_64.h"
-#include "consts.h"
-
-#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-void mlk_poly_compress_d4_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
- const __m256i *MLK_RESTRICT a)
-{
- unsigned int i;
- __m256i f0, f1, f2, f3;
- const __m256i v =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XV / 16]);
- const __m256i shift1 = _mm256_set1_epi16(1 << 9);
- const __m256i mask = _mm256_set1_epi16(15);
- const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1);
- const __m256i permdidx = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-
- for (i = 0; i < MLKEM_N / 64; i++)
- {
- f0 = _mm256_load_si256(&a[4 * i + 0]);
- f1 = _mm256_load_si256(&a[4 * i + 1]);
- f2 = _mm256_load_si256(&a[4 * i + 2]);
- f3 = _mm256_load_si256(&a[4 * i + 3]);
- f0 = _mm256_mulhi_epi16(f0, v);
- f1 = _mm256_mulhi_epi16(f1, v);
- f2 = _mm256_mulhi_epi16(f2, v);
- f3 = _mm256_mulhi_epi16(f3, v);
- f0 = _mm256_mulhrs_epi16(f0, shift1);
- f1 = _mm256_mulhrs_epi16(f1, shift1);
- f2 = _mm256_mulhrs_epi16(f2, shift1);
- f3 = _mm256_mulhrs_epi16(f3, shift1);
- f0 = _mm256_and_si256(f0, mask);
- f1 = _mm256_and_si256(f1, mask);
- f2 = _mm256_and_si256(f2, mask);
- f3 = _mm256_and_si256(f3, mask);
- f0 = _mm256_packus_epi16(f0, f1);
- f2 = _mm256_packus_epi16(f2, f3);
- f0 = _mm256_maddubs_epi16(f0, shift2);
- f2 = _mm256_maddubs_epi16(f2, shift2);
- f0 = _mm256_packus_epi16(f0, f2);
- f0 = _mm256_permutevar8x32_epi32(f0, permdidx);
- _mm256_storeu_si256((__m256i *)&r[32 * i], f0);
- }
-}
-
-void mlk_poly_decompress_d4_avx2(__m256i *MLK_RESTRICT r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
- unsigned int i;
- __m128i t;
- __m256i f;
- const __m256i q =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ / 16]);
- const __m256i shufbidx =
- _mm256_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3,
- 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
- const __m256i mask = _mm256_set1_epi32(0x00F0000F);
- const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048);
-
- for (i = 0; i < MLKEM_N / 16; i++)
- {
- t = _mm_loadl_epi64((__m128i *)&a[8 * i]);
- f = _mm256_broadcastsi128_si256(t);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mullo_epi16(f, shift);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
- }
-}
-
-void mlk_poly_compress_d10_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
- const __m256i *MLK_RESTRICT a)
-{
- unsigned int i;
- __m256i f0, f1, f2;
- __m128i t0, t1;
- const __m256i v =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XV / 16]);
- const __m256i v8 = _mm256_slli_epi16(v, 3);
- const __m256i off = _mm256_set1_epi16(15);
- const __m256i shift1 = _mm256_set1_epi16(1 << 12);
- const __m256i mask = _mm256_set1_epi16(1023);
- const __m256i shift2 =
- _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1);
- const __m256i sllvdidx = _mm256_set1_epi64x(12);
- const __m256i shufbidx =
- _mm256_set_epi8(8, 4, 3, 2, 1, 0, -1, -1, -1, -1, -1, -1, 12, 11, 10, 9,
- -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0);
-
- for (i = 0; i < MLKEM_N / 16; i++)
- {
- f0 = _mm256_load_si256(&a[i]);
- f1 = _mm256_mullo_epi16(f0, v8);
- f2 = _mm256_add_epi16(f0, off);
- f0 = _mm256_slli_epi16(f0, 3);
- f0 = _mm256_mulhi_epi16(f0, v);
- f2 = _mm256_sub_epi16(f1, f2);
- f1 = _mm256_andnot_si256(f1, f2);
- f1 = _mm256_srli_epi16(f1, 15);
- f0 = _mm256_sub_epi16(f0, f1);
- f0 = _mm256_mulhrs_epi16(f0, shift1);
- f0 = _mm256_and_si256(f0, mask);
- f0 = _mm256_madd_epi16(f0, shift2);
- f0 = _mm256_sllv_epi32(f0, sllvdidx);
- f0 = _mm256_srli_epi64(f0, 12);
- f0 = _mm256_shuffle_epi8(f0, shufbidx);
- t0 = _mm256_castsi256_si128(f0);
- t1 = _mm256_extracti128_si256(f0, 1);
- t0 = _mm_blend_epi16(t0, t1, 0xE0);
- _mm_storeu_si128((__m128i *)&r[20 * i + 0], t0);
- memcpy(&r[20 * i + 16], &t1, 4);
- }
-}
-
-void mlk_poly_decompress_d10_avx2(
- __m256i *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
-{
- unsigned int i;
- __m256i f;
- const __m256i q = _mm256_set1_epi32((MLKEM_Q << 16) + 4 * MLKEM_Q);
- const __m256i shufbidx =
- _mm256_set_epi8(11, 10, 10, 9, 9, 8, 8, 7, 6, 5, 5, 4, 4, 3, 3, 2, 9, 8,
- 8, 7, 7, 6, 6, 5, 4, 3, 3, 2, 2, 1, 1, 0);
- const __m256i sllvdidx = _mm256_set1_epi64x(4);
- /* TODO: Explain magic values */
- /* check-magic: off */
- const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184);
- /* check-magic: on */
-
- for (i = 0; i < (MLKEM_N / 16) - 1; i++)
- {
- f = _mm256_loadu_si256((__m256i *)&a[20 * i]);
- f = _mm256_permute4x64_epi64(f, 0x94);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_sllv_epi32(f, sllvdidx);
- f = _mm256_srli_epi16(f, 1);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
- }
-
- /* Handle load in last iteration especially to avoid buffer overflow */
- memcpy(&f, &a[20 * i], 20);
- /* The rest is the same */
- f = _mm256_permute4x64_epi64(f, 0x94);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_sllv_epi32(f, sllvdidx);
- f = _mm256_srli_epi16(f, 1);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
-}
-
-#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
-
-#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-void mlk_poly_compress_d5_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
- const __m256i *MLK_RESTRICT a)
-{
- unsigned int i;
- __m256i f0, f1;
- __m128i t0, t1;
- const __m256i v =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XV / 16]);
- const __m256i shift1 = _mm256_set1_epi16(1 << 10);
- const __m256i mask = _mm256_set1_epi16(31);
- const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);
- const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1);
- const __m256i sllvdidx = _mm256_set1_epi64x(12);
- const __m256i shufbidx =
- _mm256_set_epi8(8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0, -1, 12, 11, 10, 9,
- -1, 12, 11, 10, 9, 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0);
-
- for (i = 0; i < MLKEM_N / 32; i++)
- {
- f0 = _mm256_load_si256(&a[2 * i + 0]);
- f1 = _mm256_load_si256(&a[2 * i + 1]);
- f0 = _mm256_mulhi_epi16(f0, v);
- f1 = _mm256_mulhi_epi16(f1, v);
- f0 = _mm256_mulhrs_epi16(f0, shift1);
- f1 = _mm256_mulhrs_epi16(f1, shift1);
- f0 = _mm256_and_si256(f0, mask);
- f1 = _mm256_and_si256(f1, mask);
- f0 = _mm256_packus_epi16(f0, f1);
- f0 = _mm256_maddubs_epi16(
- f0, shift2); /* a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 */
- f0 = _mm256_madd_epi16(f0, shift3); /* a0 a1 b0 b1 a2 a3 b2 b3 */
- f0 = _mm256_sllv_epi32(f0, sllvdidx);
- f0 = _mm256_srlv_epi64(f0, sllvdidx);
- f0 = _mm256_shuffle_epi8(f0, shufbidx);
- t0 = _mm256_castsi256_si128(f0);
- t1 = _mm256_extracti128_si256(f0, 1);
- t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
- _mm_storeu_si128((__m128i *)&r[20 * i + 0], t0);
- memcpy(&r[20 * i + 16], &t1, 4);
- }
-}
-
-void mlk_poly_decompress_d5_avx2(__m256i *MLK_RESTRICT r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
-{
- unsigned int i;
- __m128i t;
- __m256i f;
- int16_t ti;
- const __m256i q =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ / 16]);
- const __m256i shufbidx =
- _mm256_set_epi8(9, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5, 4, 4, 4,
- 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0);
- /* TODO: Document those magic values */
- /* check-magic: off */
- const __m256i mask = _mm256_set_epi16(248, 1984, 62, 496, 3968, 124, 992, 31,
- 248, 1984, 62, 496, 3968, 124, 992, 31);
- const __m256i shift = _mm256_set_epi16(128, 16, 512, 64, 8, 256, 32, 1024,
- 128, 16, 512, 64, 8, 256, 32, 1024);
- /* check-magic: on */
- for (i = 0; i < MLKEM_N / 16; i++)
- {
- t = _mm_loadl_epi64((__m128i *)&a[10 * i + 0]);
- memcpy(&ti, &a[10 * i + 8], 2);
- t = _mm_insert_epi16(t, ti, 4);
- f = _mm256_broadcastsi128_si256(t);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mullo_epi16(f, shift);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
- }
-}
-
-void mlk_poly_compress_d11_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
- const __m256i *MLK_RESTRICT a)
-{
- unsigned int i;
- __m256i f0, f1, f2;
- __m128i t0, t1;
- const __m256i v =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XV / 16]);
- const __m256i v8 = _mm256_slli_epi16(v, 3);
- const __m256i off = _mm256_set1_epi16(36);
- const __m256i shift1 = _mm256_set1_epi16(1 << 13);
- const __m256i mask = _mm256_set1_epi16(2047);
- const __m256i shift2 =
- _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1);
- const __m256i sllvdidx = _mm256_set1_epi64x(10);
- const __m256i srlvqidx = _mm256_set_epi64x(30, 10, 30, 10);
- const __m256i shufbidx =
- _mm256_set_epi8(4, 3, 2, 1, 0, 0, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, -1,
- -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-
- for (i = 0; i < (MLKEM_N / 16) - 1; i++)
- {
- f0 = _mm256_load_si256(&a[i]);
- f1 = _mm256_mullo_epi16(f0, v8);
- f2 = _mm256_add_epi16(f0, off);
- f0 = _mm256_slli_epi16(f0, 3);
- f0 = _mm256_mulhi_epi16(f0, v);
- f2 = _mm256_sub_epi16(f1, f2);
- f1 = _mm256_andnot_si256(f1, f2);
- f1 = _mm256_srli_epi16(f1, 15);
- f0 = _mm256_sub_epi16(f0, f1);
- f0 = _mm256_mulhrs_epi16(f0, shift1);
- f0 = _mm256_and_si256(f0, mask);
- f0 = _mm256_madd_epi16(f0, shift2);
- f0 = _mm256_sllv_epi32(f0, sllvdidx);
- f1 = _mm256_bsrli_epi128(f0, 8);
- f0 = _mm256_srlv_epi64(f0, srlvqidx);
- f1 = _mm256_slli_epi64(f1, 34);
- f0 = _mm256_add_epi64(f0, f1);
- f0 = _mm256_shuffle_epi8(f0, shufbidx);
- t0 = _mm256_castsi256_si128(f0);
- t1 = _mm256_extracti128_si256(f0, 1);
- t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
- _mm_storeu_si128((__m128i *)&r[22 * i + 0], t0);
- _mm_storel_epi64((__m128i *)&r[22 * i + 16], t1);
- }
-
- f0 = _mm256_load_si256(&a[i]);
- f1 = _mm256_mullo_epi16(f0, v8);
- f2 = _mm256_add_epi16(f0, off);
- f0 = _mm256_slli_epi16(f0, 3);
- f0 = _mm256_mulhi_epi16(f0, v);
- f2 = _mm256_sub_epi16(f1, f2);
- f1 = _mm256_andnot_si256(f1, f2);
- f1 = _mm256_srli_epi16(f1, 15);
- f0 = _mm256_sub_epi16(f0, f1);
- f0 = _mm256_mulhrs_epi16(f0, shift1);
- f0 = _mm256_and_si256(f0, mask);
- f0 = _mm256_madd_epi16(f0, shift2);
- f0 = _mm256_sllv_epi32(f0, sllvdidx);
- f1 = _mm256_bsrli_epi128(f0, 8);
- f0 = _mm256_srlv_epi64(f0, srlvqidx);
- f1 = _mm256_slli_epi64(f1, 34);
- f0 = _mm256_add_epi64(f0, f1);
- f0 = _mm256_shuffle_epi8(f0, shufbidx);
- t0 = _mm256_castsi256_si128(f0);
- t1 = _mm256_extracti128_si256(f0, 1);
- t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
- _mm_storeu_si128((__m128i *)&r[22 * i + 0], t0);
- /* Handle store in last iteration especially to avoid overflow */
- memcpy(&r[22 * i + 16], &t1, 6);
-}
-
-void mlk_poly_decompress_d11_avx2(
- __m256i *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
-{
- unsigned int i;
- __m256i f;
- const __m256i q =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ / 16]);
- const __m256i shufbidx =
- _mm256_set_epi8(13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 5, 5, 4, 4, 3, 10,
- 9, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 2, 1, 1, 0);
- const __m256i srlvdidx = _mm256_set_epi32(0, 0, 1, 0, 0, 0, 1, 0);
- const __m256i srlvqidx = _mm256_set_epi64x(2, 0, 2, 0);
- const __m256i shift =
- _mm256_set_epi16(4, 32, 1, 8, 32, 1, 4, 32, 4, 32, 1, 8, 32, 1, 4, 32);
- /* TODO: Explain magic constant */
- /* check-magic: off */
- const __m256i mask = _mm256_set1_epi16(32752);
- /* check-magic: on */
-
- for (i = 0; i < (MLKEM_N / 16) - 1; i++)
- {
- f = _mm256_loadu_si256((__m256i *)&a[22 * i]);
- f = _mm256_permute4x64_epi64(f, 0x94);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_srlv_epi32(f, srlvdidx);
- f = _mm256_srlv_epi64(f, srlvqidx);
- f = _mm256_mullo_epi16(f, shift);
- f = _mm256_srli_epi16(f, 1);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
- }
-
- /* Handle load of last iteration especially */
- memcpy(&f, &a[22 * i], 22);
- /* The rest of the iteration is the same */
- f = _mm256_permute4x64_epi64(f, 0x94);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_srlv_epi32(f, srlvdidx);
- f = _mm256_srlv_epi64(f, srlvqidx);
- f = _mm256_mullo_epi16(f, shift);
- f = _mm256_srli_epi16(f, 1);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
-}
-
-#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-
-#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
- */
-
-MLK_EMPTY_CU(avx2_poly_compress)
-
-#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
- !MLK_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_consts.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_consts.c
new file mode 100644
index 0000000000..dcfa127582
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_consts.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
+ * Do not modify it directly.
+ */
+
+#include "../../../common.h"
+
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT)
+
+#include "compress_consts.h"
+
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2 || \
+ MLKEM_K == 3)
+
+MLK_ALIGN const uint8_t mlk_compress_d4_data[32] = {
+ 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0,
+ 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, /* permdidx */
+};
+
+MLK_ALIGN const uint8_t mlk_decompress_d4_data[32] = {
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
+ 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, /* shufbidx */
+};
+
+MLK_ALIGN const uint8_t mlk_compress_d10_data[32] = {
+ 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 255,
+ 255, 255, 255, 255, 255, 9, 10, 11, 12, 255, 255,
+ 255, 255, 255, 255, 0, 1, 2, 3, 4, 8, /* shufbidx */
+};
+
+MLK_ALIGN const uint8_t mlk_decompress_d10_data[32] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 6, 7, 7, 8, 8, 9,
+ 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, /* shufbidx */
+};
+
+#endif /* !MLK_CONFIG_MULTILEVEL_NO_SHARED && \
+ (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3) \
+ */
+
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+MLK_ALIGN const uint8_t
+ mlk_compress_d5_data[32] = {
+ 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 8,
+ 9, 10, 11, 12, 255, 9, 10, 11, 12, 255, 0,
+ 1, 2, 3, 4, 255, 255, 255, 255, 255, 8, /* shufbidx */
+};
+
+/* shufbidx[0:32], mask[32:64], shift[64:96] */
+MLK_ALIGN const uint8_t mlk_decompress_d5_data[96] = {
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5,
+ 5, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 9, 9, 9, /* shufbidx */
+ 31, 0, 224, 3, 124, 0, 128, 15, 240, 1, 62, 0, 192, 7, 248, 0, 31, 0,
+ 224, 3, 124, 0, 128, 15, 240, 1, 62, 0, 192, 7, 248, 0, /* mask */
+ 0, 4, 32, 0, 0, 1, 8, 0, 64, 0, 0, 2, 16, 0, 128, 0, 0, 4,
+ 32, 0, 0, 1, 8, 0, 64, 0, 0, 2, 16, 0, 128, 0, /* shift */
+};
+
+/* srlvqidx[0:32], shufbidx[32:64] */
+MLK_ALIGN const uint8_t mlk_compress_d11_data[64] = {
+ 10, 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, /* srlvqidx */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 5,
+ 6, 7, 8, 9, 10, 255, 255, 255, 255, 0, 0, 1, 2, 3, 4, /* shufbidx */
+};
+
+/* shufbidx[0:32], srlvdidx[32:64], srlvqidx[64:96], shift[96:128] */
+MLK_ALIGN const uint8_t mlk_decompress_d11_data[128] = {
+ 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10,
+ 3, 4, 4, 5, 5, 6, 7, 8, 8, 9, 9, 10, 11, 12, 12, 13, /* shufbidx */
+ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* srlvdidx */
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, /* srlvqidx */
+ 32, 0, 4, 0, 1, 0, 32, 0, 8, 0, 1, 0, 32, 0, 4, 0,
+ 32, 0, 4, 0, 1, 0, 32, 0, 8, 0, 1, 0, 32, 0, 4, 0, /* shift */
+};
+
+#endif /* !MLK_CONFIG_MULTILEVEL_NO_SHARED && \
+ (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
+
+#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT */
+
+MLK_EMPTY_CU(avx2_compress_consts)
+
+#endif /* !MLK_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_consts.h
new file mode 100644
index 0000000000..6e13d05805
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/compress_consts.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
+ * Do not modify it directly.
+ */
+
+#ifndef MLK_NATIVE_X86_64_SRC_COMPRESS_CONSTS_H
+#define MLK_NATIVE_X86_64_SRC_COMPRESS_CONSTS_H
+
+#include "../../../common.h"
+
+#ifndef __ASSEMBLER__
+
+#define mlk_compress_d4_data MLK_NAMESPACE(compress_d4_data)
+extern const uint8_t mlk_compress_d4_data[32];
+
+#define mlk_decompress_d4_data MLK_NAMESPACE(decompress_d4_data)
+extern const uint8_t mlk_decompress_d4_data[32];
+
+#define mlk_compress_d10_data MLK_NAMESPACE(compress_d10_data)
+extern const uint8_t mlk_compress_d10_data[32];
+
+#define mlk_decompress_d10_data MLK_NAMESPACE(decompress_d10_data)
+extern const uint8_t mlk_decompress_d10_data[32];
+
+#define mlk_compress_d5_data MLK_NAMESPACE(compress_d5_data)
+extern const uint8_t mlk_compress_d5_data[32];
+
+#define mlk_decompress_d5_data MLK_NAMESPACE(decompress_d5_data)
+extern const uint8_t mlk_decompress_d5_data[96];
+
+#define mlk_compress_d11_data MLK_NAMESPACE(compress_d11_data)
+extern const uint8_t mlk_compress_d11_data[64];
+
+#define mlk_decompress_d11_data MLK_NAMESPACE(decompress_d11_data)
+extern const uint8_t mlk_decompress_d11_data[128];
+
+#endif /* !__ASSEMBLER__ */
+
+#endif /* !MLK_NATIVE_X86_64_SRC_COMPRESS_CONSTS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/consts.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/consts.c
index 204e98d459..17877423e5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/consts.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/consts.c
@@ -3,18 +3,10 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
+ * WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
+ * Do not modify it directly.
*/
#include "../../../common.h"
@@ -22,234 +14,84 @@
#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include "align.h"
#include "consts.h"
-#define MLK_AVX2_Q MLKEM_Q
-/* check-magic: -1044 == pow(2,16,MLKEM_Q) */
-#define MLK_AVX2_MONT -1044
-/* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */
-#define MLK_AVX2_QINV -3327
-/* check-magic: 20159 == round(2^26/MLKEM_Q) */
-#define MLK_AVX2_V 20159
-/* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */
-#define MLK_AVX2_FHI 1441
-/* check-magic: -10079 == signed_mod(MLK_AVX2_QINV*MLK_AVX2_FHI,2^16) */
-#define MLK_AVX2_FLO -10079
-/* check-magic: 1353 == pow(2, 32, MLKEM_Q) */
-#define MLK_AVX2_MONTSQHI 1353
-/* check-magic: 20553 == signed_mod(MLK_AVX2_QINV*MLK_AVX2_MONTSQHI,2^16) */
-#define MLK_AVX2_MONTSQLO 20553
-#define MLK_AVX2_MASK 4095
-#define MLK_AVX2_SHIFT 32
-
-const qdata_t mlk_qdata = {{
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQ 0
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV 16
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XV 32
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFLO 48
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFHI 64
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO 80
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI 96
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK 112
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB 128
- /* TODO: Explain these numbers */
- /* check-magic: off */
- 3854,
- 3340,
- 2826,
- 2312,
- 1798,
- 1284,
- 770,
- 256,
- 3854,
- 3340,
- 2826,
- 2312,
- 1798,
- 1284,
- 770,
- 256,
-/* check-magic: on */
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD 144
- 7,
- 0,
- 6,
- 0,
- 5,
- 0,
- 4,
- 0,
- 3,
- 0,
- 2,
- 0,
- 1,
- 0,
- 0,
- 0,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP 160
-#include "x86_64_zetas.i"
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XSHIFT 624
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
-#define MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES 640
-#include "x86_64_mulcache_twiddles.i"
-}};
+/*
+ * Table of zeta values used in the AVX2 NTTs
+ * See autogen for details.
+ */
+MLK_ALIGN const int16_t mlk_qdata[624] = {
+ 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, 3854,
+ 3340, 2826, 2312, 1798, 1284, 770, 256, 7, 0,
+ 6, 0, 5, 0, 4, 0, 3, 0, 2,
+ 0, 1, 0, 0, 0, 31498, 31498, 31498, 31498,
+ -758, -758, -758, -758, 0, 0, 0, 0, 0,
+ 0, 0, 0, 14745, 14745, 14745, 14745, 14745, 14745,
+ 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
+ 14745, -359, -359, -359, -359, -359, -359, -359, -359,
+ -359, -359, -359, -359, -359, -359, -359, -359, 13525,
+ 13525, 13525, 13525, 13525, 13525, 13525, 13525, -12402, -12402,
+ -12402, -12402, -12402, -12402, -12402, -12402, 1493, 1493, 1493,
+ 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422,
+ 1422, 1422, 1422, 1422, -20907, -20907, -20907, -20907, 27758,
+ 27758, 27758, 27758, -3799, -3799, -3799, -3799, -15690, -15690,
+ -15690, -15690, -171, -171, -171, -171, 622, 622, 622,
+ 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182,
+ -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, 5571,
+ 5571, -1102, -1102, 21438, 21438, -26242, -26242, 573, 573,
+ -1325, -1325, 264, 264, 383, 383, -829, -829, 1458,
+ 1458, -1602, -1602, -130, -130, -5689, -6516, 1496, 30967,
+ -23565, 20179, 20710, 25080, -12796, 26616, 16064, -12442, 9134,
+ -650, -25986, 27837, 1223, 652, -552, 1015, -1293, 1491,
+ -282, -1544, 516, -8, -320, -666, -1618, -1162, 126,
+ 1469, -335, -11477, -32227, 20494, -27738, 945, -14883, 6182,
+ 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, -1103,
+ 555, -1251, 1550, 422, 177, -291, 1574, -246, 1159,
+ -777, -602, -1590, -872, 418, -156, 11182, 13387, -14233,
+ -21655, 13131, -4587, 23092, 5493, -32502, 30317, -18741, 12639,
+ 20100, 18525, 19529, -12619, 430, 843, 871, 105, 587,
+ -235, -460, 1653, 778, -147, 1483, 1119, 644, 349,
+ 329, -75, 787, 787, 787, 787, 787, 787, 787,
+ 787, 787, 787, 787, 787, 787, 787, 787, 787,
+ -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
+ -1517, -1517, -1517, -1517, -1517, -1517, -1517, 28191, 28191,
+ 28191, 28191, 28191, 28191, 28191, 28191, -16694, -16694, -16694,
+ -16694, -16694, -16694, -16694, -16694, 287, 287, 287, 287,
+ 287, 287, 287, 287, 202, 202, 202, 202, 202,
+ 202, 202, 202, 10690, 10690, 10690, 10690, 1358, 1358,
+ 1358, 1358, -11202, -11202, -11202, -11202, 31164, 31164, 31164,
+ 31164, 962, 962, 962, 962, -1202, -1202, -1202, -1202,
+ -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, -28073,
+ -28073, 24313, 24313, -10532, -10532, 8800, 8800, 18426, 18426,
+ 8859, 8859, 26675, 26675, -16163, -16163, -681, -681, 1017,
+ 1017, 732, 732, 608, 608, -1542, -1542, 411, 411,
+ -205, -205, -1571, -1571, 19883, -28250, -15887, -8898, -28309,
+ 9075, -30199, 18249, 13426, 14017, -29156, -12757, 16832, 4311,
+ -24155, -17915, -853, -90, -271, 830, 107, -1421, -247,
+ -951, -398, 961, -1508, -725, 448, -1065, 677, -1275,
+ -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, 10335,
+ -7934, -22502, 10906, 31636, 28644, 23998, -17422, 817, 603,
+ 1322, -1465, -1215, 1218, -874, -1187, -1185, -1278, -1510,
+ -870, -108, 996, 958, 1522, 20297, 2146, 15355, -32384,
+ -6280, -14903, -11044, 14469, -21498, -20198, 23210, -17442, -23860,
+ -20257, 7756, 23132, 1097, 610, -1285, 384, -136, -1335,
+ 220, -1659, -1530, 794, -854, 478, -308, 991, -1460,
+ 1628, -1103, 555, -1251, 1550, 422, 177, -291, 1574,
+ -246, 1159, -777, -602, -1590, -872, 418, -156, 430,
+ 843, 871, 105, 587, -235, -460, 1653, 778, -147,
+ 1483, 1119, 644, 349, 329, -75, 817, 603, 1322,
+ -1465, -1215, 1218, -874, -1187, -1185, -1278, -1510, -870,
+ -108, 996, 958, 1522, 1097, 610, -1285, 384, -136,
+ -1335, 220, -1659, -1530, 794, -854, 478, -308, 991,
+ -1460, 1628, -335, -11477, -32227, 20494, -27738, 945, -14883,
+ 6182, 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276,
+ 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, -32502,
+ 30317, -18741, 12639, 20100, 18525, 19529, -12619, -31183, 25435,
+ -7382, 24391, -20927, 10946, 24214, 16989, 10335, -7934, -22502,
+ 10906, 31636, 28644, 23998, -17422, 20297, 2146, 15355, -32384,
+ -6280, -14903, -11044, 14469, -21498, -20198, 23210, -17442, -23860,
+ -20257, 7756, 23132,
+};
#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
@@ -258,30 +100,3 @@ MLK_EMPTY_CU(avx2_consts)
#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
!MLK_CONFIG_MULTILEVEL_NO_SHARED) */
-
-/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
- * Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef MLK_AVX2_Q
-#undef MLK_AVX2_MONT
-#undef MLK_AVX2_QINV
-#undef MLK_AVX2_V
-#undef MLK_AVX2_FHI
-#undef MLK_AVX2_FLO
-#undef MLK_AVX2_MONTSQHI
-#undef MLK_AVX2_MONTSQLO
-#undef MLK_AVX2_MASK
-#undef MLK_AVX2_SHIFT
-/* Some macros are kept because they are also defined in a header. */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XQ (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XV (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XFLO (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XFHI (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XSHIFT (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES (consts.h) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/consts.h
index 9dedfc4999..0d0c7a9993 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/consts.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/consts.h
@@ -3,43 +3,23 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
+ * WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
+ * Do not modify it directly.
*/
#ifndef MLK_NATIVE_X86_64_SRC_CONSTS_H
#define MLK_NATIVE_X86_64_SRC_CONSTS_H
#include "../../../common.h"
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQ 0
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV 16
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XV 32
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFLO 48
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFHI 64
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO 80
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI 96
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK 112
-#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB 128
-#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD 144
-#define MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP 160
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XSHIFT 624
-#define MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES 640
+#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB 0
+#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD 16
+#define MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP 32
+#define MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES 496
#ifndef __ASSEMBLER__
-#include "align.h"
-typedef MLK_ALIGNED_INT16(768) qdata_t;
#define mlk_qdata MLK_NAMESPACE(qdata)
-extern const qdata_t mlk_qdata;
-#endif /* !__ASSEMBLER__ */
+extern const int16_t mlk_qdata[624];
+#endif
#endif /* !MLK_NATIVE_X86_64_SRC_CONSTS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/fq.inc b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/fq.inc
deleted file mode 100644
index 647011e208..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/fq.inc
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-.macro red16 r,rs=0,x=12
-vpmulhw %ymm1,%ymm\r,%ymm\x
-.if \rs
-vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
-.else
-vpsraw $10,%ymm\x,%ymm\x
-.endif
-vpmullw %ymm0,%ymm\x,%ymm\x
-vpsubw %ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro csubq r,x=12
-vpsubw %ymm0,%ymm\r,%ymm\r
-vpsraw $15,%ymm\r,%ymm\x
-vpand %ymm0,%ymm\x,%ymm\x
-vpaddw %ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro caddq r,x=12
-vpsraw $15,%ymm\r,%ymm\x
-vpand %ymm0,%ymm\x,%ymm\x
-vpaddw %ymm\x,%ymm\r,%ymm\r
-.endm
-
-/* Montgomery multiplication between b and ah,
- * with Montgomery twist of ah in al. */
-.macro fqmulprecomp al,ah,b,x=12
-vpmullw %ymm\al,%ymm\b,%ymm\x
-vpmulhw %ymm\ah,%ymm\b,%ymm\b
-vpmulhw %ymm0,%ymm\x,%ymm\x
-vpsubw %ymm\x,%ymm\b,%ymm\b
-.endm
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/intt.S
index 088adbc766..08d0bd7eb0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/intt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/intt.S
@@ -37,662 +37,683 @@
* dev/x86_64/src/intt.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(invntt_avx2)
MLK_ASM_FN_SYMBOL(invntt_avx2)
- vmovdqa (%rsi), %ymm0
- vmovdqa 0x60(%rsi), %ymm2
- vmovdqa 0x80(%rsi), %ymm3
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x40(%rdi), %ymm6
- vmovdqa 0x20(%rdi), %ymm5
- vmovdqa 0x60(%rdi), %ymm7
- vpmullw %ymm2, %ymm4, %ymm12
- vpmulhw %ymm3, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm4, %ymm4
- vpmullw %ymm2, %ymm6, %ymm12
- vpmulhw %ymm3, %ymm6, %ymm6
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm6, %ymm6
- vpmullw %ymm2, %ymm5, %ymm12
- vpmulhw %ymm3, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm5, %ymm5
- vpmullw %ymm2, %ymm7, %ymm12
- vpmulhw %ymm3, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm7, %ymm7
- vmovdqa 0x80(%rdi), %ymm8
- vmovdqa 0xc0(%rdi), %ymm10
- vmovdqa 0xa0(%rdi), %ymm9
- vmovdqa 0xe0(%rdi), %ymm11
- vpmullw %ymm2, %ymm8, %ymm12
- vpmulhw %ymm3, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm8, %ymm8
- vpmullw %ymm2, %ymm10, %ymm12
- vpmulhw %ymm3, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm10, %ymm10
- vpmullw %ymm2, %ymm9, %ymm12
- vpmulhw %ymm3, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vpmullw %ymm2, %ymm11, %ymm12
- vpmulhw %ymm3, %ymm11, %ymm11
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm11, %ymm11
- vpermq $0x4e, 0x4a0(%rsi), %ymm15 # ymm15 = mem[2,3,0,1]
- vpermq $0x4e, 0x460(%rsi), %ymm1 # ymm1 = mem[2,3,0,1]
- vpermq $0x4e, 0x4c0(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x480(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
- vmovdqa 0x100(%rsi), %ymm12
- vpshufb %ymm12, %ymm15, %ymm15
- vpshufb %ymm12, %ymm1, %ymm1
- vpshufb %ymm12, %ymm2, %ymm2
- vpshufb %ymm12, %ymm3, %ymm3
- vpsubw %ymm4, %ymm6, %ymm12
- vpaddw %ymm6, %ymm4, %ymm4
- vpsubw %ymm5, %ymm7, %ymm13
- vpmullw %ymm15, %ymm12, %ymm6
- vpaddw %ymm7, %ymm5, %ymm5
- vpsubw %ymm8, %ymm10, %ymm14
- vpmullw %ymm15, %ymm13, %ymm7
- vpaddw %ymm10, %ymm8, %ymm8
- vpsubw %ymm9, %ymm11, %ymm15
- vpmullw %ymm1, %ymm14, %ymm10
- vpaddw %ymm11, %ymm9, %ymm9
- vpmullw %ymm1, %ymm15, %ymm11
- vpmulhw %ymm2, %ymm12, %ymm12
- vpmulhw %ymm2, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm6, %ymm6
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm6, %ymm12, %ymm6
- vpsubw %ymm7, %ymm13, %ymm7
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vpermq $0x4e, 0x420(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x440(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
- vmovdqa 0x100(%rsi), %ymm1
- vpshufb %ymm1, %ymm2, %ymm2
- vpshufb %ymm1, %ymm3, %ymm3
- vpsubw %ymm4, %ymm8, %ymm12
- vpaddw %ymm8, %ymm4, %ymm4
- vpsubw %ymm5, %ymm9, %ymm13
- vpmullw %ymm2, %ymm12, %ymm8
- vpaddw %ymm9, %ymm5, %ymm5
- vpsubw %ymm6, %ymm10, %ymm14
- vpmullw %ymm2, %ymm13, %ymm9
- vpaddw %ymm10, %ymm6, %ymm6
- vpsubw %ymm7, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm10
- vpaddw %ymm11, %ymm7, %ymm7
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm3, %ymm12, %ymm12
- vpmulhw %ymm3, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm8, %ymm12, %ymm8
- vpsubw %ymm9, %ymm13, %ymm9
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vpslld $0x10, %ymm5, %ymm3
- vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
- vpslld $0x10, %ymm7, %ymm4
- vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
- vpslld $0x10, %ymm9, %ymm6
- vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
- vpsrld $0x10, %ymm8, %ymm8
- vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
- vpslld $0x10, %ymm11, %ymm8
- vpblendw $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7],ymm10[8],ymm8[9],ymm10[10],ymm8[11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
- vpsrld $0x10, %ymm10, %ymm10
- vpblendw $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7],ymm10[8],ymm11[9],ymm10[10],ymm11[11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
- vmovdqa 0x120(%rsi), %ymm12
- vpermd 0x3e0(%rsi), %ymm12, %ymm2
- vpermd 0x400(%rsi), %ymm12, %ymm10
- vpsubw %ymm3, %ymm5, %ymm12
- vpaddw %ymm5, %ymm3, %ymm3
- vpsubw %ymm4, %ymm7, %ymm13
- vpmullw %ymm2, %ymm12, %ymm5
- vpaddw %ymm7, %ymm4, %ymm4
- vpsubw %ymm6, %ymm9, %ymm14
- vpmullw %ymm2, %ymm13, %ymm7
- vpaddw %ymm9, %ymm6, %ymm6
- vpsubw %ymm8, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm9
- vpaddw %ymm11, %ymm8, %ymm8
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm10, %ymm12, %ymm12
- vpmulhw %ymm10, %ymm13, %ymm13
- vpmulhw %ymm10, %ymm14, %ymm14
- vpmulhw %ymm10, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm5, %ymm12, %ymm5
- vpsubw %ymm7, %ymm13, %ymm7
- vpsubw %ymm9, %ymm14, %ymm9
- vpsubw %ymm11, %ymm15, %ymm11
- vmovdqa 0x40(%rsi), %ymm1
- vpmulhw %ymm1, %ymm3, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm3, %ymm3
- vmovsldup %ymm4, %ymm10 # ymm10 = ymm4[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
- vmovsldup %ymm8, %ymm3 # ymm3 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
- vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
- vpsrlq $0x20, %ymm5, %ymm5
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
- vmovsldup %ymm11, %ymm5 # ymm5 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
- vpsrlq $0x20, %ymm9, %ymm9
- vpblendd $0xaa, %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4],ymm11[5],ymm9[6],ymm11[7]
- vpermq $0x1b, 0x3a0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
- vpermq $0x1b, 0x3c0(%rsi), %ymm9 # ymm9 = mem[3,2,1,0]
- vpsubw %ymm10, %ymm4, %ymm12
- vpaddw %ymm4, %ymm10, %ymm10
- vpsubw %ymm3, %ymm8, %ymm13
- vpmullw %ymm2, %ymm12, %ymm4
- vpaddw %ymm8, %ymm3, %ymm3
- vpsubw %ymm6, %ymm7, %ymm14
- vpmullw %ymm2, %ymm13, %ymm8
- vpaddw %ymm7, %ymm6, %ymm6
- vpsubw %ymm5, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm7
- vpaddw %ymm11, %ymm5, %ymm5
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm9, %ymm12, %ymm12
- vpmulhw %ymm9, %ymm13, %ymm13
- vpmulhw %ymm9, %ymm14, %ymm14
- vpmulhw %ymm9, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm4, %ymm12, %ymm4
- vpsubw %ymm8, %ymm13, %ymm8
- vpsubw %ymm7, %ymm14, %ymm7
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm10, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm10, %ymm10
- vpunpcklqdq %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
- vpunpckhqdq %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
- vpunpcklqdq %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
- vpunpcklqdq %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
- vpunpckhqdq %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
- vpunpcklqdq %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
- vpermq $0x4e, 0x360(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x380(%rsi), %ymm7 # ymm7 = mem[2,3,0,1]
- vpsubw %ymm9, %ymm3, %ymm12
- vpaddw %ymm3, %ymm9, %ymm9
- vpsubw %ymm10, %ymm5, %ymm13
- vpmullw %ymm2, %ymm12, %ymm3
- vpaddw %ymm5, %ymm10, %ymm10
- vpsubw %ymm6, %ymm8, %ymm14
- vpmullw %ymm2, %ymm13, %ymm5
- vpaddw %ymm8, %ymm6, %ymm6
- vpsubw %ymm4, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm8
- vpaddw %ymm11, %ymm4, %ymm4
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm7, %ymm12, %ymm12
- vpmulhw %ymm7, %ymm13, %ymm13
- vpmulhw %ymm7, %ymm14, %ymm14
- vpmulhw %ymm7, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm3, %ymm3
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm3, %ymm12, %ymm3
- vpsubw %ymm5, %ymm13, %ymm5
- vpsubw %ymm8, %ymm14, %ymm8
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm9, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vperm2i128 $0x20, %ymm10, %ymm9, %ymm7 # ymm7 = ymm9[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm4, %ymm6, %ymm9 # ymm9 = ymm6[0,1],ymm4[0,1]
- vperm2i128 $0x31, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[2,3],ymm4[2,3]
- vperm2i128 $0x20, %ymm5, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm5[0,1]
- vperm2i128 $0x31, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[2,3],ymm5[2,3]
- vperm2i128 $0x20, %ymm11, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm8, %ymm11 # ymm11 = ymm8[2,3],ymm11[2,3]
- vmovdqa 0x320(%rsi), %ymm2
- vmovdqa 0x340(%rsi), %ymm8
- vpsubw %ymm7, %ymm10, %ymm12
- vpaddw %ymm10, %ymm7, %ymm7
- vpsubw %ymm9, %ymm4, %ymm13
- vpmullw %ymm2, %ymm12, %ymm10
- vpaddw %ymm4, %ymm9, %ymm9
- vpsubw %ymm6, %ymm5, %ymm14
- vpmullw %ymm2, %ymm13, %ymm4
- vpaddw %ymm5, %ymm6, %ymm6
- vpsubw %ymm3, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm5
- vpaddw %ymm11, %ymm3, %ymm3
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm8, %ymm12, %ymm12
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm14, %ymm14
- vpmulhw %ymm8, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm10, %ymm12, %ymm10
- vpsubw %ymm4, %ymm13, %ymm4
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm7, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm7, %ymm7
- vmovdqa %ymm7, (%rdi)
- vmovdqa %ymm9, 0x20(%rdi)
- vmovdqa %ymm6, 0x40(%rdi)
- vmovdqa %ymm3, 0x60(%rdi)
- vmovdqa %ymm10, 0x80(%rdi)
- vmovdqa %ymm4, 0xa0(%rdi)
- vmovdqa %ymm5, 0xc0(%rdi)
- vmovdqa %ymm11, 0xe0(%rdi)
- vmovdqa 0x60(%rsi), %ymm2
- vmovdqa 0x80(%rsi), %ymm3
- vmovdqa 0x100(%rdi), %ymm4
- vmovdqa 0x140(%rdi), %ymm6
- vmovdqa 0x120(%rdi), %ymm5
- vmovdqa 0x160(%rdi), %ymm7
- vpmullw %ymm2, %ymm4, %ymm12
- vpmulhw %ymm3, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm4, %ymm4
- vpmullw %ymm2, %ymm6, %ymm12
- vpmulhw %ymm3, %ymm6, %ymm6
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm6, %ymm6
- vpmullw %ymm2, %ymm5, %ymm12
- vpmulhw %ymm3, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm5, %ymm5
- vpmullw %ymm2, %ymm7, %ymm12
- vpmulhw %ymm3, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm7, %ymm7
- vmovdqa 0x180(%rdi), %ymm8
- vmovdqa 0x1c0(%rdi), %ymm10
- vmovdqa 0x1a0(%rdi), %ymm9
- vmovdqa 0x1e0(%rdi), %ymm11
- vpmullw %ymm2, %ymm8, %ymm12
- vpmulhw %ymm3, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm8, %ymm8
- vpmullw %ymm2, %ymm10, %ymm12
- vpmulhw %ymm3, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm10, %ymm10
- vpmullw %ymm2, %ymm9, %ymm12
- vpmulhw %ymm3, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vpmullw %ymm2, %ymm11, %ymm12
- vpmulhw %ymm3, %ymm11, %ymm11
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm11, %ymm11
- vpermq $0x4e, 0x2e0(%rsi), %ymm15 # ymm15 = mem[2,3,0,1]
- vpermq $0x4e, 0x2a0(%rsi), %ymm1 # ymm1 = mem[2,3,0,1]
- vpermq $0x4e, 0x300(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x2c0(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
- vmovdqa 0x100(%rsi), %ymm12
- vpshufb %ymm12, %ymm15, %ymm15
- vpshufb %ymm12, %ymm1, %ymm1
- vpshufb %ymm12, %ymm2, %ymm2
- vpshufb %ymm12, %ymm3, %ymm3
- vpsubw %ymm4, %ymm6, %ymm12
- vpaddw %ymm6, %ymm4, %ymm4
- vpsubw %ymm5, %ymm7, %ymm13
- vpmullw %ymm15, %ymm12, %ymm6
- vpaddw %ymm7, %ymm5, %ymm5
- vpsubw %ymm8, %ymm10, %ymm14
- vpmullw %ymm15, %ymm13, %ymm7
- vpaddw %ymm10, %ymm8, %ymm8
- vpsubw %ymm9, %ymm11, %ymm15
- vpmullw %ymm1, %ymm14, %ymm10
- vpaddw %ymm11, %ymm9, %ymm9
- vpmullw %ymm1, %ymm15, %ymm11
- vpmulhw %ymm2, %ymm12, %ymm12
- vpmulhw %ymm2, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm6, %ymm6
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm6, %ymm12, %ymm6
- vpsubw %ymm7, %ymm13, %ymm7
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vpermq $0x4e, 0x260(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x280(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
- vmovdqa 0x100(%rsi), %ymm1
- vpshufb %ymm1, %ymm2, %ymm2
- vpshufb %ymm1, %ymm3, %ymm3
- vpsubw %ymm4, %ymm8, %ymm12
- vpaddw %ymm8, %ymm4, %ymm4
- vpsubw %ymm5, %ymm9, %ymm13
- vpmullw %ymm2, %ymm12, %ymm8
- vpaddw %ymm9, %ymm5, %ymm5
- vpsubw %ymm6, %ymm10, %ymm14
- vpmullw %ymm2, %ymm13, %ymm9
- vpaddw %ymm10, %ymm6, %ymm6
- vpsubw %ymm7, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm10
- vpaddw %ymm11, %ymm7, %ymm7
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm3, %ymm12, %ymm12
- vpmulhw %ymm3, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm8, %ymm12, %ymm8
- vpsubw %ymm9, %ymm13, %ymm9
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vpslld $0x10, %ymm5, %ymm3
- vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
- vpslld $0x10, %ymm7, %ymm4
- vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
- vpslld $0x10, %ymm9, %ymm6
- vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
- vpsrld $0x10, %ymm8, %ymm8
- vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
- vpslld $0x10, %ymm11, %ymm8
- vpblendw $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7],ymm10[8],ymm8[9],ymm10[10],ymm8[11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
- vpsrld $0x10, %ymm10, %ymm10
- vpblendw $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7],ymm10[8],ymm11[9],ymm10[10],ymm11[11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
- vmovdqa 0x120(%rsi), %ymm12
- vpermd 0x220(%rsi), %ymm12, %ymm2
- vpermd 0x240(%rsi), %ymm12, %ymm10
- vpsubw %ymm3, %ymm5, %ymm12
- vpaddw %ymm5, %ymm3, %ymm3
- vpsubw %ymm4, %ymm7, %ymm13
- vpmullw %ymm2, %ymm12, %ymm5
- vpaddw %ymm7, %ymm4, %ymm4
- vpsubw %ymm6, %ymm9, %ymm14
- vpmullw %ymm2, %ymm13, %ymm7
- vpaddw %ymm9, %ymm6, %ymm6
- vpsubw %ymm8, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm9
- vpaddw %ymm11, %ymm8, %ymm8
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm10, %ymm12, %ymm12
- vpmulhw %ymm10, %ymm13, %ymm13
- vpmulhw %ymm10, %ymm14, %ymm14
- vpmulhw %ymm10, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm5, %ymm12, %ymm5
- vpsubw %ymm7, %ymm13, %ymm7
- vpsubw %ymm9, %ymm14, %ymm9
- vpsubw %ymm11, %ymm15, %ymm11
- vmovdqa 0x40(%rsi), %ymm1
- vpmulhw %ymm1, %ymm3, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm3, %ymm3
- vmovsldup %ymm4, %ymm10 # ymm10 = ymm4[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
- vmovsldup %ymm8, %ymm3 # ymm3 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
- vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
- vpsrlq $0x20, %ymm5, %ymm5
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
- vmovsldup %ymm11, %ymm5 # ymm5 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
- vpsrlq $0x20, %ymm9, %ymm9
- vpblendd $0xaa, %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4],ymm11[5],ymm9[6],ymm11[7]
- vpermq $0x1b, 0x1e0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
- vpermq $0x1b, 0x200(%rsi), %ymm9 # ymm9 = mem[3,2,1,0]
- vpsubw %ymm10, %ymm4, %ymm12
- vpaddw %ymm4, %ymm10, %ymm10
- vpsubw %ymm3, %ymm8, %ymm13
- vpmullw %ymm2, %ymm12, %ymm4
- vpaddw %ymm8, %ymm3, %ymm3
- vpsubw %ymm6, %ymm7, %ymm14
- vpmullw %ymm2, %ymm13, %ymm8
- vpaddw %ymm7, %ymm6, %ymm6
- vpsubw %ymm5, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm7
- vpaddw %ymm11, %ymm5, %ymm5
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm9, %ymm12, %ymm12
- vpmulhw %ymm9, %ymm13, %ymm13
- vpmulhw %ymm9, %ymm14, %ymm14
- vpmulhw %ymm9, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm4, %ymm12, %ymm4
- vpsubw %ymm8, %ymm13, %ymm8
- vpsubw %ymm7, %ymm14, %ymm7
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm10, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm10, %ymm10
- vpunpcklqdq %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
- vpunpckhqdq %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
- vpunpcklqdq %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
- vpunpcklqdq %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
- vpunpckhqdq %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
- vpunpcklqdq %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
- vpermq $0x4e, 0x1a0(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x1c0(%rsi), %ymm7 # ymm7 = mem[2,3,0,1]
- vpsubw %ymm9, %ymm3, %ymm12
- vpaddw %ymm3, %ymm9, %ymm9
- vpsubw %ymm10, %ymm5, %ymm13
- vpmullw %ymm2, %ymm12, %ymm3
- vpaddw %ymm5, %ymm10, %ymm10
- vpsubw %ymm6, %ymm8, %ymm14
- vpmullw %ymm2, %ymm13, %ymm5
- vpaddw %ymm8, %ymm6, %ymm6
- vpsubw %ymm4, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm8
- vpaddw %ymm11, %ymm4, %ymm4
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm7, %ymm12, %ymm12
- vpmulhw %ymm7, %ymm13, %ymm13
- vpmulhw %ymm7, %ymm14, %ymm14
- vpmulhw %ymm7, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm3, %ymm3
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm3, %ymm12, %ymm3
- vpsubw %ymm5, %ymm13, %ymm5
- vpsubw %ymm8, %ymm14, %ymm8
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm9, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vperm2i128 $0x20, %ymm10, %ymm9, %ymm7 # ymm7 = ymm9[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm4, %ymm6, %ymm9 # ymm9 = ymm6[0,1],ymm4[0,1]
- vperm2i128 $0x31, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[2,3],ymm4[2,3]
- vperm2i128 $0x20, %ymm5, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm5[0,1]
- vperm2i128 $0x31, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[2,3],ymm5[2,3]
- vperm2i128 $0x20, %ymm11, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm8, %ymm11 # ymm11 = ymm8[2,3],ymm11[2,3]
- vmovdqa 0x160(%rsi), %ymm2
- vmovdqa 0x180(%rsi), %ymm8
- vpsubw %ymm7, %ymm10, %ymm12
- vpaddw %ymm10, %ymm7, %ymm7
- vpsubw %ymm9, %ymm4, %ymm13
- vpmullw %ymm2, %ymm12, %ymm10
- vpaddw %ymm4, %ymm9, %ymm9
- vpsubw %ymm6, %ymm5, %ymm14
- vpmullw %ymm2, %ymm13, %ymm4
- vpaddw %ymm5, %ymm6, %ymm6
- vpsubw %ymm3, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm5
- vpaddw %ymm11, %ymm3, %ymm3
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm8, %ymm12, %ymm12
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm14, %ymm14
- vpmulhw %ymm8, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm10, %ymm12, %ymm10
- vpsubw %ymm4, %ymm13, %ymm4
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm7, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm7, %ymm7
- vmovdqa %ymm7, 0x100(%rdi)
- vmovdqa %ymm9, 0x120(%rdi)
- vmovdqa %ymm6, 0x140(%rdi)
- vmovdqa %ymm3, 0x160(%rdi)
- vmovdqa %ymm10, 0x180(%rdi)
- vmovdqa %ymm4, 0x1a0(%rdi)
- vmovdqa %ymm5, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x100(%rdi), %ymm8
- vmovdqa 0x20(%rdi), %ymm5
- vmovdqa 0x120(%rdi), %ymm9
- vpbroadcastq 0x140(%rsi), %ymm2
- vmovdqa 0x40(%rdi), %ymm6
- vmovdqa 0x140(%rdi), %ymm10
- vmovdqa 0x60(%rdi), %ymm7
- vmovdqa 0x160(%rdi), %ymm11
- vpbroadcastq 0x148(%rsi), %ymm3
- vpsubw %ymm4, %ymm8, %ymm12
- vpaddw %ymm8, %ymm4, %ymm4
- vpsubw %ymm5, %ymm9, %ymm13
- vpmullw %ymm2, %ymm12, %ymm8
- vpaddw %ymm9, %ymm5, %ymm5
- vpsubw %ymm6, %ymm10, %ymm14
- vpmullw %ymm2, %ymm13, %ymm9
- vpaddw %ymm10, %ymm6, %ymm6
- vpsubw %ymm7, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm10
- vpaddw %ymm11, %ymm7, %ymm7
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm3, %ymm12, %ymm12
- vpmulhw %ymm3, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm8, %ymm12, %ymm8
- vpsubw %ymm9, %ymm13, %ymm9
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vmovdqa %ymm4, (%rdi)
- vmovdqa %ymm5, 0x20(%rdi)
- vmovdqa %ymm6, 0x40(%rdi)
- vmovdqa %ymm7, 0x60(%rdi)
- vmovdqa %ymm8, 0x100(%rdi)
- vmovdqa %ymm9, 0x120(%rdi)
- vmovdqa %ymm10, 0x140(%rdi)
- vmovdqa %ymm11, 0x160(%rdi)
- vmovdqa 0x80(%rdi), %ymm4
- vmovdqa 0x180(%rdi), %ymm8
- vmovdqa 0xa0(%rdi), %ymm5
- vmovdqa 0x1a0(%rdi), %ymm9
- vpbroadcastq 0x140(%rsi), %ymm2
- vmovdqa 0xc0(%rdi), %ymm6
- vmovdqa 0x1c0(%rdi), %ymm10
- vmovdqa 0xe0(%rdi), %ymm7
- vmovdqa 0x1e0(%rdi), %ymm11
- vpbroadcastq 0x148(%rsi), %ymm3
- vpsubw %ymm4, %ymm8, %ymm12
- vpaddw %ymm8, %ymm4, %ymm4
- vpsubw %ymm5, %ymm9, %ymm13
- vpmullw %ymm2, %ymm12, %ymm8
- vpaddw %ymm9, %ymm5, %ymm5
- vpsubw %ymm6, %ymm10, %ymm14
- vpmullw %ymm2, %ymm13, %ymm9
- vpaddw %ymm10, %ymm6, %ymm6
- vpsubw %ymm7, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm10
- vpaddw %ymm11, %ymm7, %ymm7
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm3, %ymm12, %ymm12
- vpmulhw %ymm3, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm8, %ymm12, %ymm8
- vpsubw %ymm9, %ymm13, %ymm9
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vmovdqa %ymm4, 0x80(%rdi)
- vmovdqa %ymm5, 0xa0(%rdi)
- vmovdqa %ymm6, 0xc0(%rdi)
- vmovdqa %ymm7, 0xe0(%rdi)
- vmovdqa %ymm8, 0x180(%rdi)
- vmovdqa %ymm9, 0x1a0(%rdi)
- vmovdqa %ymm10, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0xd8a1d8a1, %eax # imm = 0xD8A1D8A1
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x5a105a1, %eax # imm = 0x5A105A1
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x60(%rdi), %ymm7
+ vpmullw %ymm2, %ymm4, %ymm12
+ vpmulhw %ymm3, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm6, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpmullw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm5, %ymm5
+ vpmullw %ymm2, %ymm7, %ymm12
+ vpmulhw %ymm3, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xc0(%rdi), %ymm10
+ vmovdqa 0xa0(%rdi), %ymm9
+ vmovdqa 0xe0(%rdi), %ymm11
+ vpmullw %ymm2, %ymm8, %ymm12
+ vpmulhw %ymm3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpmullw %ymm2, %ymm10, %ymm12
+ vpmulhw %ymm3, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm10, %ymm10
+ vpmullw %ymm2, %ymm9, %ymm12
+ vpmulhw %ymm3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpmullw %ymm2, %ymm11, %ymm12
+ vpmulhw %ymm3, %ymm11, %ymm11
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm11, %ymm11
+ vpermq $0x4e, 0x3a0(%rsi), %ymm15 # ymm15 = mem[2,3,0,1]
+ vpermq $0x4e, 0x360(%rsi), %ymm1 # ymm1 = mem[2,3,0,1]
+ vpermq $0x4e, 0x3c0(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0x380(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
+ vmovdqa (%rsi), %ymm12
+ vpshufb %ymm12, %ymm15, %ymm15
+ vpshufb %ymm12, %ymm1, %ymm1
+ vpshufb %ymm12, %ymm2, %ymm2
+ vpshufb %ymm12, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm6, %ymm12
+ vpaddw %ymm6, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm7, %ymm13
+ vpmullw %ymm15, %ymm12, %ymm6
+ vpaddw %ymm7, %ymm5, %ymm5
+ vpsubw %ymm8, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm13, %ymm7
+ vpaddw %ymm10, %ymm8, %ymm8
+ vpsubw %ymm9, %ymm11, %ymm15
+ vpmullw %ymm1, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm9, %ymm9
+ vpmullw %ymm1, %ymm15, %ymm11
+ vpmulhw %ymm2, %ymm12, %ymm12
+ vpmulhw %ymm2, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm6, %ymm12, %ymm6
+ vpsubw %ymm7, %ymm13, %ymm7
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpermq $0x4e, 0x320(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0x340(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
+ vmovdqa (%rsi), %ymm1
+ vpshufb %ymm1, %ymm2, %ymm2
+ vpshufb %ymm1, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm8, %ymm12
+ vpaddw %ymm8, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm9, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm5
+ vpsubw %ymm6, %ymm10, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm6
+ vpsubw %ymm7, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm7
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm3, %ymm12, %ymm12
+ vpmulhw %ymm3, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm8, %ymm12, %ymm8
+ vpsubw %ymm9, %ymm13, %ymm9
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpslld $0x10, %ymm5, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+ vpslld $0x10, %ymm7, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpslld $0x10, %ymm9, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vpslld $0x10, %ymm11, %ymm8
+ vpblendw $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7],ymm10[8],ymm8[9],ymm10[10],ymm8[11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
+ vpsrld $0x10, %ymm10, %ymm10
+ vpblendw $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7],ymm10[8],ymm11[9],ymm10[10],ymm11[11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
+ vmovdqa 0x20(%rsi), %ymm12
+ vpermd 0x2e0(%rsi), %ymm12, %ymm2
+ vpermd 0x300(%rsi), %ymm12, %ymm10
+ vpsubw %ymm3, %ymm5, %ymm12
+ vpaddw %ymm5, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm7, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm5
+ vpaddw %ymm7, %ymm4, %ymm4
+ vpsubw %ymm6, %ymm9, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm7
+ vpaddw %ymm9, %ymm6, %ymm6
+ vpsubw %ymm8, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm9
+ vpaddw %ymm11, %ymm8, %ymm8
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm10, %ymm12, %ymm12
+ vpmulhw %ymm10, %ymm13, %ymm13
+ vpmulhw %ymm10, %ymm14, %ymm14
+ vpmulhw %ymm10, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm5, %ymm12, %ymm5
+ vpsubw %ymm7, %ymm13, %ymm7
+ vpsubw %ymm9, %ymm14, %ymm9
+ vpsubw %ymm11, %ymm15, %ymm11
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vpmulhw %ymm1, %ymm3, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm3, %ymm3
+ vmovsldup %ymm4, %ymm10 # ymm10 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm8, %ymm3 # ymm3 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
+ vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+ vpsrlq $0x20, %ymm5, %ymm5
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
+ vmovsldup %ymm11, %ymm5 # ymm5 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
+ vpsrlq $0x20, %ymm9, %ymm9
+ vpblendd $0xaa, %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4],ymm11[5],ymm9[6],ymm11[7]
+ vpermq $0x1b, 0x2a0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
+ vpermq $0x1b, 0x2c0(%rsi), %ymm9 # ymm9 = mem[3,2,1,0]
+ vpsubw %ymm10, %ymm4, %ymm12
+ vpaddw %ymm4, %ymm10, %ymm10
+ vpsubw %ymm3, %ymm8, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm4
+ vpaddw %ymm8, %ymm3, %ymm3
+ vpsubw %ymm6, %ymm7, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm8
+ vpaddw %ymm7, %ymm6, %ymm6
+ vpsubw %ymm5, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm7
+ vpaddw %ymm11, %ymm5, %ymm5
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm9, %ymm12, %ymm12
+ vpmulhw %ymm9, %ymm13, %ymm13
+ vpmulhw %ymm9, %ymm14, %ymm14
+ vpmulhw %ymm9, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm4, %ymm12, %ymm4
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm7, %ymm14, %ymm7
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm10, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm10, %ymm10
+ vpunpcklqdq %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
+ vpunpckhqdq %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
+ vpunpcklqdq %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+ vpunpcklqdq %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
+ vpunpckhqdq %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
+ vpunpcklqdq %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
+ vpermq $0x4e, 0x260(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0x280(%rsi), %ymm7 # ymm7 = mem[2,3,0,1]
+ vpsubw %ymm9, %ymm3, %ymm12
+ vpaddw %ymm3, %ymm9, %ymm9
+ vpsubw %ymm10, %ymm5, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm3
+ vpaddw %ymm5, %ymm10, %ymm10
+ vpsubw %ymm6, %ymm8, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm5
+ vpaddw %ymm8, %ymm6, %ymm6
+ vpsubw %ymm4, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm8
+ vpaddw %ymm11, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm7, %ymm12, %ymm12
+ vpmulhw %ymm7, %ymm13, %ymm13
+ vpmulhw %ymm7, %ymm14, %ymm14
+ vpmulhw %ymm7, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm3, %ymm3
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm3, %ymm12, %ymm3
+ vpsubw %ymm5, %ymm13, %ymm5
+ vpsubw %ymm8, %ymm14, %ymm8
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm9, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vperm2i128 $0x20, %ymm10, %ymm9, %ymm7 # ymm7 = ymm9[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm4, %ymm6, %ymm9 # ymm9 = ymm6[0,1],ymm4[0,1]
+ vperm2i128 $0x31, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[2,3],ymm4[2,3]
+ vperm2i128 $0x20, %ymm5, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm5[0,1]
+ vperm2i128 $0x31, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[2,3],ymm5[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm8, %ymm11 # ymm11 = ymm8[2,3],ymm11[2,3]
+ vmovdqa 0x220(%rsi), %ymm2
+ vmovdqa 0x240(%rsi), %ymm8
+ vpsubw %ymm7, %ymm10, %ymm12
+ vpaddw %ymm10, %ymm7, %ymm7
+ vpsubw %ymm9, %ymm4, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm10
+ vpaddw %ymm4, %ymm9, %ymm9
+ vpsubw %ymm6, %ymm5, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm4
+ vpaddw %ymm5, %ymm6, %ymm6
+ vpsubw %ymm3, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm5
+ vpaddw %ymm11, %ymm3, %ymm3
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm8, %ymm12, %ymm12
+ vpmulhw %ymm8, %ymm13, %ymm13
+ vpmulhw %ymm8, %ymm14, %ymm14
+ vpmulhw %ymm8, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm10, %ymm12, %ymm10
+ vpsubw %ymm4, %ymm13, %ymm4
+ vpsubw %ymm5, %ymm14, %ymm5
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm7, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa %ymm6, 0x40(%rdi)
+ vmovdqa %ymm3, 0x60(%rdi)
+ vmovdqa %ymm10, 0x80(%rdi)
+ vmovdqa %ymm4, 0xa0(%rdi)
+ vmovdqa %ymm5, 0xc0(%rdi)
+ vmovdqa %ymm11, 0xe0(%rdi)
+ movl $0xd8a1d8a1, %eax # imm = 0xD8A1D8A1
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x5a105a1, %eax # imm = 0x5A105A1
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ vmovdqa 0x100(%rdi), %ymm4
+ vmovdqa 0x140(%rdi), %ymm6
+ vmovdqa 0x120(%rdi), %ymm5
+ vmovdqa 0x160(%rdi), %ymm7
+ vpmullw %ymm2, %ymm4, %ymm12
+ vpmulhw %ymm3, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm6, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpmullw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm5, %ymm5
+ vpmullw %ymm2, %ymm7, %ymm12
+ vpmulhw %ymm3, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vpmullw %ymm2, %ymm8, %ymm12
+ vpmulhw %ymm3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpmullw %ymm2, %ymm10, %ymm12
+ vpmulhw %ymm3, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm10, %ymm10
+ vpmullw %ymm2, %ymm9, %ymm12
+ vpmulhw %ymm3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpmullw %ymm2, %ymm11, %ymm12
+ vpmulhw %ymm3, %ymm11, %ymm11
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm11, %ymm11
+ vpermq $0x4e, 0x1e0(%rsi), %ymm15 # ymm15 = mem[2,3,0,1]
+ vpermq $0x4e, 0x1a0(%rsi), %ymm1 # ymm1 = mem[2,3,0,1]
+ vpermq $0x4e, 0x200(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0x1c0(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
+ vmovdqa (%rsi), %ymm12
+ vpshufb %ymm12, %ymm15, %ymm15
+ vpshufb %ymm12, %ymm1, %ymm1
+ vpshufb %ymm12, %ymm2, %ymm2
+ vpshufb %ymm12, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm6, %ymm12
+ vpaddw %ymm6, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm7, %ymm13
+ vpmullw %ymm15, %ymm12, %ymm6
+ vpaddw %ymm7, %ymm5, %ymm5
+ vpsubw %ymm8, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm13, %ymm7
+ vpaddw %ymm10, %ymm8, %ymm8
+ vpsubw %ymm9, %ymm11, %ymm15
+ vpmullw %ymm1, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm9, %ymm9
+ vpmullw %ymm1, %ymm15, %ymm11
+ vpmulhw %ymm2, %ymm12, %ymm12
+ vpmulhw %ymm2, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm6, %ymm12, %ymm6
+ vpsubw %ymm7, %ymm13, %ymm7
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpermq $0x4e, 0x160(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0x180(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
+ vmovdqa (%rsi), %ymm1
+ vpshufb %ymm1, %ymm2, %ymm2
+ vpshufb %ymm1, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm8, %ymm12
+ vpaddw %ymm8, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm9, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm5
+ vpsubw %ymm6, %ymm10, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm6
+ vpsubw %ymm7, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm7
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm3, %ymm12, %ymm12
+ vpmulhw %ymm3, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm8, %ymm12, %ymm8
+ vpsubw %ymm9, %ymm13, %ymm9
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpslld $0x10, %ymm5, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+ vpslld $0x10, %ymm7, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpslld $0x10, %ymm9, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vpslld $0x10, %ymm11, %ymm8
+ vpblendw $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7],ymm10[8],ymm8[9],ymm10[10],ymm8[11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
+ vpsrld $0x10, %ymm10, %ymm10
+ vpblendw $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7],ymm10[8],ymm11[9],ymm10[10],ymm11[11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
+ vmovdqa 0x20(%rsi), %ymm12
+ vpermd 0x120(%rsi), %ymm12, %ymm2
+ vpermd 0x140(%rsi), %ymm12, %ymm10
+ vpsubw %ymm3, %ymm5, %ymm12
+ vpaddw %ymm5, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm7, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm5
+ vpaddw %ymm7, %ymm4, %ymm4
+ vpsubw %ymm6, %ymm9, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm7
+ vpaddw %ymm9, %ymm6, %ymm6
+ vpsubw %ymm8, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm9
+ vpaddw %ymm11, %ymm8, %ymm8
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm10, %ymm12, %ymm12
+ vpmulhw %ymm10, %ymm13, %ymm13
+ vpmulhw %ymm10, %ymm14, %ymm14
+ vpmulhw %ymm10, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm5, %ymm12, %ymm5
+ vpsubw %ymm7, %ymm13, %ymm7
+ vpsubw %ymm9, %ymm14, %ymm9
+ vpsubw %ymm11, %ymm15, %ymm11
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vpmulhw %ymm1, %ymm3, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm3, %ymm3
+ vmovsldup %ymm4, %ymm10 # ymm10 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm8, %ymm3 # ymm3 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
+ vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+ vpsrlq $0x20, %ymm5, %ymm5
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
+ vmovsldup %ymm11, %ymm5 # ymm5 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
+ vpsrlq $0x20, %ymm9, %ymm9
+ vpblendd $0xaa, %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4],ymm11[5],ymm9[6],ymm11[7]
+ vpermq $0x1b, 0xe0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
+ vpermq $0x1b, 0x100(%rsi), %ymm9 # ymm9 = mem[3,2,1,0]
+ vpsubw %ymm10, %ymm4, %ymm12
+ vpaddw %ymm4, %ymm10, %ymm10
+ vpsubw %ymm3, %ymm8, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm4
+ vpaddw %ymm8, %ymm3, %ymm3
+ vpsubw %ymm6, %ymm7, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm8
+ vpaddw %ymm7, %ymm6, %ymm6
+ vpsubw %ymm5, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm7
+ vpaddw %ymm11, %ymm5, %ymm5
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm9, %ymm12, %ymm12
+ vpmulhw %ymm9, %ymm13, %ymm13
+ vpmulhw %ymm9, %ymm14, %ymm14
+ vpmulhw %ymm9, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm4, %ymm12, %ymm4
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm7, %ymm14, %ymm7
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm10, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm10, %ymm10
+ vpunpcklqdq %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
+ vpunpckhqdq %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
+ vpunpcklqdq %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+ vpunpcklqdq %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
+ vpunpckhqdq %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
+ vpunpcklqdq %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
+ vpermq $0x4e, 0xa0(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0xc0(%rsi), %ymm7 # ymm7 = mem[2,3,0,1]
+ vpsubw %ymm9, %ymm3, %ymm12
+ vpaddw %ymm3, %ymm9, %ymm9
+ vpsubw %ymm10, %ymm5, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm3
+ vpaddw %ymm5, %ymm10, %ymm10
+ vpsubw %ymm6, %ymm8, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm5
+ vpaddw %ymm8, %ymm6, %ymm6
+ vpsubw %ymm4, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm8
+ vpaddw %ymm11, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm7, %ymm12, %ymm12
+ vpmulhw %ymm7, %ymm13, %ymm13
+ vpmulhw %ymm7, %ymm14, %ymm14
+ vpmulhw %ymm7, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm3, %ymm3
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm3, %ymm12, %ymm3
+ vpsubw %ymm5, %ymm13, %ymm5
+ vpsubw %ymm8, %ymm14, %ymm8
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm9, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vperm2i128 $0x20, %ymm10, %ymm9, %ymm7 # ymm7 = ymm9[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm4, %ymm6, %ymm9 # ymm9 = ymm6[0,1],ymm4[0,1]
+ vperm2i128 $0x31, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[2,3],ymm4[2,3]
+ vperm2i128 $0x20, %ymm5, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm5[0,1]
+ vperm2i128 $0x31, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[2,3],ymm5[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm8, %ymm11 # ymm11 = ymm8[2,3],ymm11[2,3]
+ vmovdqa 0x60(%rsi), %ymm2
+ vmovdqa 0x80(%rsi), %ymm8
+ vpsubw %ymm7, %ymm10, %ymm12
+ vpaddw %ymm10, %ymm7, %ymm7
+ vpsubw %ymm9, %ymm4, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm10
+ vpaddw %ymm4, %ymm9, %ymm9
+ vpsubw %ymm6, %ymm5, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm4
+ vpaddw %ymm5, %ymm6, %ymm6
+ vpsubw %ymm3, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm5
+ vpaddw %ymm11, %ymm3, %ymm3
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm8, %ymm12, %ymm12
+ vpmulhw %ymm8, %ymm13, %ymm13
+ vpmulhw %ymm8, %ymm14, %ymm14
+ vpmulhw %ymm8, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm10, %ymm12, %ymm10
+ vpsubw %ymm4, %ymm13, %ymm4
+ vpsubw %ymm5, %ymm14, %ymm5
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm7, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa %ymm6, 0x140(%rdi)
+ vmovdqa %ymm3, 0x160(%rdi)
+ vmovdqa %ymm10, 0x180(%rdi)
+ vmovdqa %ymm4, 0x1a0(%rdi)
+ vmovdqa %ymm5, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x120(%rdi), %ymm9
+ vpbroadcastq 0x40(%rsi), %ymm2
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x140(%rdi), %ymm10
+ vmovdqa 0x60(%rdi), %ymm7
+ vmovdqa 0x160(%rdi), %ymm11
+ vpbroadcastq 0x48(%rsi), %ymm3
+ vpsubw %ymm4, %ymm8, %ymm12
+ vpaddw %ymm8, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm9, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm5
+ vpsubw %ymm6, %ymm10, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm6
+ vpsubw %ymm7, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm7
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm3, %ymm12, %ymm12
+ vpmulhw %ymm3, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm8, %ymm12, %ymm8
+ vpsubw %ymm9, %ymm13, %ymm9
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vmovdqa %ymm4, (%rdi)
+ vmovdqa %ymm5, 0x20(%rdi)
+ vmovdqa %ymm6, 0x40(%rdi)
+ vmovdqa %ymm7, 0x60(%rdi)
+ vmovdqa %ymm8, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa %ymm10, 0x140(%rdi)
+ vmovdqa %ymm11, 0x160(%rdi)
+ vmovdqa 0x80(%rdi), %ymm4
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm5
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vpbroadcastq 0x40(%rsi), %ymm2
+ vmovdqa 0xc0(%rdi), %ymm6
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0xe0(%rdi), %ymm7
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vpbroadcastq 0x48(%rsi), %ymm3
+ vpsubw %ymm4, %ymm8, %ymm12
+ vpaddw %ymm8, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm9, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm5
+ vpsubw %ymm6, %ymm10, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm6
+ vpsubw %ymm7, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm7
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm3, %ymm12, %ymm12
+ vpmulhw %ymm3, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm8, %ymm12, %ymm8
+ vpsubw %ymm9, %ymm13, %ymm9
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vmovdqa %ymm4, 0x80(%rdi)
+ vmovdqa %ymm5, 0xa0(%rdi)
+ vmovdqa %ymm6, 0xc0(%rdi)
+ vmovdqa %ymm7, 0xe0(%rdi)
+ vmovdqa %ymm8, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa %ymm10, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(invntt_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S
index c8bde382ec..ee7a12c6fe 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S
@@ -12,70 +12,79 @@
* dev/x86_64/src/mulcache_compute.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(poly_mulcache_compute_avx2)
MLK_ASM_FN_SYMBOL(poly_mulcache_compute_avx2)
- vmovdqa (%rdx), %ymm0
- vmovdqa 0x20(%rsi), %ymm2
- vmovdqa 0x60(%rsi), %ymm3
- vmovdqa 0x500(%rdx), %ymm4
- vmovdqa 0x580(%rdx), %ymm1
- vpmullw %ymm2, %ymm1, %ymm5
- vpmullw %ymm3, %ymm1, %ymm6
- vpmulhw %ymm2, %ymm4, %ymm7
- vpmulhw %ymm3, %ymm4, %ymm8
- vpmulhw %ymm5, %ymm0, %ymm9
- vpmulhw %ymm6, %ymm0, %ymm10
- vpsubw %ymm9, %ymm7, %ymm7
- vpsubw %ymm10, %ymm8, %ymm8
- vmovdqa %ymm7, (%rdi)
- vmovdqa %ymm8, 0x20(%rdi)
- vmovdqa 0xa0(%rsi), %ymm2
- vmovdqa 0xe0(%rsi), %ymm3
- vmovdqa 0x520(%rdx), %ymm4
- vmovdqa 0x5a0(%rdx), %ymm1
- vpmullw %ymm2, %ymm1, %ymm5
- vpmullw %ymm3, %ymm1, %ymm6
- vpmulhw %ymm2, %ymm4, %ymm7
- vpmulhw %ymm3, %ymm4, %ymm8
- vpmulhw %ymm5, %ymm0, %ymm9
- vpmulhw %ymm6, %ymm0, %ymm10
- vpsubw %ymm9, %ymm7, %ymm7
- vpsubw %ymm10, %ymm8, %ymm8
- vmovdqa %ymm7, 0x40(%rdi)
- vmovdqa %ymm8, 0x60(%rdi)
- vmovdqa 0x120(%rsi), %ymm2
- vmovdqa 0x160(%rsi), %ymm3
- vmovdqa 0x540(%rdx), %ymm4
- vmovdqa 0x5c0(%rdx), %ymm1
- vpmullw %ymm2, %ymm1, %ymm5
- vpmullw %ymm3, %ymm1, %ymm6
- vpmulhw %ymm2, %ymm4, %ymm7
- vpmulhw %ymm3, %ymm4, %ymm8
- vpmulhw %ymm5, %ymm0, %ymm9
- vpmulhw %ymm6, %ymm0, %ymm10
- vpsubw %ymm9, %ymm7, %ymm7
- vpsubw %ymm10, %ymm8, %ymm8
- vmovdqa %ymm7, 0x80(%rdi)
- vmovdqa %ymm8, 0xa0(%rdi)
- vmovdqa 0x1a0(%rsi), %ymm2
- vmovdqa 0x1e0(%rsi), %ymm3
- vmovdqa 0x560(%rdx), %ymm4
- vmovdqa 0x5e0(%rdx), %ymm1
- vpmullw %ymm2, %ymm1, %ymm5
- vpmullw %ymm3, %ymm1, %ymm6
- vpmulhw %ymm2, %ymm4, %ymm7
- vpmulhw %ymm3, %ymm4, %ymm8
- vpmulhw %ymm5, %ymm0, %ymm9
- vpmulhw %ymm6, %ymm0, %ymm10
- vpsubw %ymm9, %ymm7, %ymm7
- vpsubw %ymm10, %ymm8, %ymm8
- vmovdqa %ymm7, 0xc0(%rdi)
- vmovdqa %ymm8, 0xe0(%rdi)
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vmovdqa 0x20(%rsi), %ymm2
+ vmovdqa 0x60(%rsi), %ymm3
+ vmovdqa 0x3e0(%rdx), %ymm4
+ vmovdqa 0x460(%rdx), %ymm1
+ vpmullw %ymm2, %ymm1, %ymm5
+ vpmullw %ymm3, %ymm1, %ymm6
+ vpmulhw %ymm2, %ymm4, %ymm7
+ vpmulhw %ymm3, %ymm4, %ymm8
+ vpmulhw %ymm5, %ymm0, %ymm9
+ vpmulhw %ymm6, %ymm0, %ymm10
+ vpsubw %ymm9, %ymm7, %ymm7
+ vpsubw %ymm10, %ymm8, %ymm8
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm8, 0x20(%rdi)
+ vmovdqa 0xa0(%rsi), %ymm2
+ vmovdqa 0xe0(%rsi), %ymm3
+ vmovdqa 0x400(%rdx), %ymm4
+ vmovdqa 0x480(%rdx), %ymm1
+ vpmullw %ymm2, %ymm1, %ymm5
+ vpmullw %ymm3, %ymm1, %ymm6
+ vpmulhw %ymm2, %ymm4, %ymm7
+ vpmulhw %ymm3, %ymm4, %ymm8
+ vpmulhw %ymm5, %ymm0, %ymm9
+ vpmulhw %ymm6, %ymm0, %ymm10
+ vpsubw %ymm9, %ymm7, %ymm7
+ vpsubw %ymm10, %ymm8, %ymm8
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm8, 0x60(%rdi)
+ vmovdqa 0x120(%rsi), %ymm2
+ vmovdqa 0x160(%rsi), %ymm3
+ vmovdqa 0x420(%rdx), %ymm4
+ vmovdqa 0x4a0(%rdx), %ymm1
+ vpmullw %ymm2, %ymm1, %ymm5
+ vpmullw %ymm3, %ymm1, %ymm6
+ vpmulhw %ymm2, %ymm4, %ymm7
+ vpmulhw %ymm3, %ymm4, %ymm8
+ vpmulhw %ymm5, %ymm0, %ymm9
+ vpmulhw %ymm6, %ymm0, %ymm10
+ vpsubw %ymm9, %ymm7, %ymm7
+ vpsubw %ymm10, %ymm8, %ymm8
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm8, 0xa0(%rdi)
+ vmovdqa 0x1a0(%rsi), %ymm2
+ vmovdqa 0x1e0(%rsi), %ymm3
+ vmovdqa 0x440(%rdx), %ymm4
+ vmovdqa 0x4c0(%rdx), %ymm1
+ vpmullw %ymm2, %ymm1, %ymm5
+ vpmullw %ymm3, %ymm1, %ymm6
+ vpmulhw %ymm2, %ymm4, %ymm7
+ vpmulhw %ymm3, %ymm4, %ymm8
+ vpmulhw %ymm5, %ymm0, %ymm9
+ vpmulhw %ymm6, %ymm0, %ymm10
+ vpsubw %ymm9, %ymm7, %ymm7
+ vpsubw %ymm10, %ymm8, %ymm8
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm8, 0xe0(%rdi)
retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_mulcache_compute_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntt.S
index 948f963c8a..24f075e494 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntt.S
@@ -33,598 +33,607 @@
* dev/x86_64/src/ntt.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(ntt_avx2)
MLK_ASM_FN_SYMBOL(ntt_avx2)
- vmovdqa (%rsi), %ymm0
- vpbroadcastq 0x140(%rsi), %ymm15
- vmovdqa 0x100(%rdi), %ymm8
- vmovdqa 0x120(%rdi), %ymm9
- vmovdqa 0x140(%rdi), %ymm10
- vmovdqa 0x160(%rdi), %ymm11
- vpbroadcastq 0x148(%rsi), %ymm2
- vpmullw %ymm15, %ymm8, %ymm12
- vpmullw %ymm15, %ymm9, %ymm13
- vpmullw %ymm15, %ymm10, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x20(%rdi), %ymm5
- vmovdqa 0x40(%rdi), %ymm6
- vmovdqa 0x60(%rdi), %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm8, %ymm4, %ymm3
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm9, %ymm5, %ymm4
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm10, %ymm6, %ymm5
- vpsubw %ymm10, %ymm6, %ymm10
- vpaddw %ymm11, %ymm7, %ymm6
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm8, %ymm8
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm9, %ymm9
- vpsubw %ymm14, %ymm5, %ymm5
- vpaddw %ymm14, %ymm10, %ymm10
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa %ymm3, (%rdi)
- vmovdqa %ymm4, 0x20(%rdi)
- vmovdqa %ymm5, 0x40(%rdi)
- vmovdqa %ymm6, 0x60(%rdi)
- vmovdqa %ymm8, 0x100(%rdi)
- vmovdqa %ymm9, 0x120(%rdi)
- vmovdqa %ymm10, 0x140(%rdi)
- vmovdqa %ymm11, 0x160(%rdi)
- vpbroadcastq 0x140(%rsi), %ymm15
- vmovdqa 0x180(%rdi), %ymm8
- vmovdqa 0x1a0(%rdi), %ymm9
- vmovdqa 0x1c0(%rdi), %ymm10
- vmovdqa 0x1e0(%rdi), %ymm11
- vpbroadcastq 0x148(%rsi), %ymm2
- vpmullw %ymm15, %ymm8, %ymm12
- vpmullw %ymm15, %ymm9, %ymm13
- vpmullw %ymm15, %ymm10, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovdqa 0x80(%rdi), %ymm4
- vmovdqa 0xa0(%rdi), %ymm5
- vmovdqa 0xc0(%rdi), %ymm6
- vmovdqa 0xe0(%rdi), %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm8, %ymm4, %ymm3
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm9, %ymm5, %ymm4
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm10, %ymm6, %ymm5
- vpsubw %ymm10, %ymm6, %ymm10
- vpaddw %ymm11, %ymm7, %ymm6
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm8, %ymm8
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm9, %ymm9
- vpsubw %ymm14, %ymm5, %ymm5
- vpaddw %ymm14, %ymm10, %ymm10
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa %ymm3, 0x80(%rdi)
- vmovdqa %ymm4, 0xa0(%rdi)
- vmovdqa %ymm5, 0xc0(%rdi)
- vmovdqa %ymm6, 0xe0(%rdi)
- vmovdqa %ymm8, 0x180(%rdi)
- vmovdqa %ymm9, 0x1a0(%rdi)
- vmovdqa %ymm10, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
- vmovdqa 0x160(%rsi), %ymm15
- vmovdqa 0x80(%rdi), %ymm8
- vmovdqa 0xa0(%rdi), %ymm9
- vmovdqa 0xc0(%rdi), %ymm10
- vmovdqa 0xe0(%rdi), %ymm11
- vmovdqa 0x180(%rsi), %ymm2
- vpmullw %ymm15, %ymm8, %ymm12
- vpmullw %ymm15, %ymm9, %ymm13
- vpmullw %ymm15, %ymm10, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x20(%rdi), %ymm5
- vmovdqa 0x40(%rdi), %ymm6
- vmovdqa 0x60(%rdi), %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm8, %ymm4, %ymm3
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm9, %ymm5, %ymm4
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm10, %ymm6, %ymm5
- vpsubw %ymm10, %ymm6, %ymm10
- vpaddw %ymm11, %ymm7, %ymm6
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm8, %ymm8
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm9, %ymm9
- vpsubw %ymm14, %ymm5, %ymm5
- vpaddw %ymm14, %ymm10, %ymm10
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vperm2i128 $0x20, %ymm10, %ymm5, %ymm7 # ymm7 = ymm5[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm5, %ymm10 # ymm10 = ymm5[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm11, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[2,3],ymm11[2,3]
- vmovdqa 0x1a0(%rsi), %ymm15
- vmovdqa 0x1c0(%rsi), %ymm2
- vpmullw %ymm15, %ymm7, %ymm12
- vpmullw %ymm15, %ymm10, %ymm13
- vpmullw %ymm15, %ymm5, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm11, %ymm11
- vperm2i128 $0x20, %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm9, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm7, %ymm6, %ymm4
- vpsubw %ymm7, %ymm6, %ymm7
- vpaddw %ymm10, %ymm8, %ymm6
- vpsubw %ymm10, %ymm8, %ymm10
- vpaddw %ymm5, %ymm3, %ymm8
- vpsubw %ymm5, %ymm3, %ymm5
- vpaddw %ymm11, %ymm9, %ymm3
- vpsubw %ymm11, %ymm9, %ymm11
- vpsubw %ymm12, %ymm4, %ymm4
- vpaddw %ymm12, %ymm7, %ymm7
- vpsubw %ymm13, %ymm6, %ymm6
- vpaddw %ymm13, %ymm10, %ymm10
- vpsubw %ymm14, %ymm8, %ymm8
- vpaddw %ymm14, %ymm5, %ymm5
- vpsubw %ymm15, %ymm3, %ymm3
- vpaddw %ymm15, %ymm11, %ymm11
- vpunpcklqdq %ymm5, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm5[0],ymm8[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm8, %ymm5 # ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3]
- vpunpcklqdq %ymm11, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm3, %ymm11 # ymm11 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
- vmovdqa 0x1e0(%rsi), %ymm15
- vmovdqa 0x200(%rsi), %ymm2
- vpmullw %ymm15, %ymm9, %ymm12
- vpmullw %ymm15, %ymm5, %ymm13
- vpmullw %ymm15, %ymm8, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm11, %ymm11
- vpunpcklqdq %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
- vpunpckhqdq %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
- vpunpcklqdq %ymm10, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
- vpunpckhqdq %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm9, %ymm3, %ymm6
- vpsubw %ymm9, %ymm3, %ymm9
- vpaddw %ymm5, %ymm7, %ymm3
- vpsubw %ymm5, %ymm7, %ymm5
- vpaddw %ymm8, %ymm4, %ymm7
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm11, %ymm10, %ymm4
- vpsubw %ymm11, %ymm10, %ymm11
- vpsubw %ymm12, %ymm6, %ymm6
- vpaddw %ymm12, %ymm9, %ymm9
- vpsubw %ymm13, %ymm3, %ymm3
- vpaddw %ymm13, %ymm5, %ymm5
- vpsubw %ymm14, %ymm7, %ymm7
- vpaddw %ymm14, %ymm8, %ymm8
- vpsubw %ymm15, %ymm4, %ymm4
- vpaddw %ymm15, %ymm11, %ymm11
- vmovsldup %ymm8, %ymm10 # ymm10 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm10, %ymm7, %ymm10 # ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4],ymm10[5],ymm7[6],ymm10[7]
- vpsrlq $0x20, %ymm7, %ymm7
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
- vmovsldup %ymm11, %ymm7 # ymm7 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7]
- vpsrlq $0x20, %ymm4, %ymm4
- vpblendd $0xaa, %ymm11, %ymm4, %ymm11 # ymm11 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4],ymm11[5],ymm4[6],ymm11[7]
- vmovdqa 0x220(%rsi), %ymm15
- vmovdqa 0x240(%rsi), %ymm2
- vpmullw %ymm15, %ymm10, %ymm12
- vpmullw %ymm15, %ymm8, %ymm13
- vpmullw %ymm15, %ymm7, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovsldup %ymm9, %ymm4 # ymm4 = ymm9[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7]
- vmovsldup %ymm5, %ymm6 # ymm6 = ymm5[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm10, %ymm4, %ymm3
- vpsubw %ymm10, %ymm4, %ymm10
- vpaddw %ymm8, %ymm9, %ymm4
- vpsubw %ymm8, %ymm9, %ymm8
- vpaddw %ymm7, %ymm6, %ymm9
- vpsubw %ymm7, %ymm6, %ymm7
- vpaddw %ymm11, %ymm5, %ymm6
- vpsubw %ymm11, %ymm5, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm10, %ymm10
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm8, %ymm8
- vpsubw %ymm14, %ymm9, %ymm9
- vpaddw %ymm14, %ymm7, %ymm7
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vpslld $0x10, %ymm7, %ymm5
- vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
- vpsrld $0x10, %ymm9, %ymm9
- vpblendw $0xaa, %ymm7, %ymm9, %ymm7 # ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4],ymm7[5],ymm9[6],ymm7[7],ymm9[8],ymm7[9],ymm9[10],ymm7[11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
- vpslld $0x10, %ymm11, %ymm9
- vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
- vmovdqa 0x260(%rsi), %ymm15
- vmovdqa 0x280(%rsi), %ymm2
- vpmullw %ymm15, %ymm5, %ymm12
- vpmullw %ymm15, %ymm7, %ymm13
- vpmullw %ymm15, %ymm9, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm11, %ymm11
- vpslld $0x10, %ymm10, %ymm6
- vpblendw $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7],ymm3[8],ymm6[9],ymm3[10],ymm6[11],ymm3[12],ymm6[13],ymm3[14],ymm6[15]
- vpsrld $0x10, %ymm3, %ymm3
- vpblendw $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7],ymm3[8],ymm10[9],ymm3[10],ymm10[11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
- vpslld $0x10, %ymm8, %ymm3
- vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7],ymm4[8],ymm8[9],ymm4[10],ymm8[11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm5, %ymm6, %ymm4
- vpsubw %ymm5, %ymm6, %ymm5
- vpaddw %ymm7, %ymm10, %ymm6
- vpsubw %ymm7, %ymm10, %ymm7
- vpaddw %ymm9, %ymm3, %ymm10
- vpsubw %ymm9, %ymm3, %ymm9
- vpaddw %ymm11, %ymm8, %ymm3
- vpsubw %ymm11, %ymm8, %ymm11
- vpsubw %ymm12, %ymm4, %ymm4
- vpaddw %ymm12, %ymm5, %ymm5
- vpsubw %ymm13, %ymm6, %ymm6
- vpaddw %ymm13, %ymm7, %ymm7
- vpsubw %ymm14, %ymm10, %ymm10
- vpaddw %ymm14, %ymm9, %ymm9
- vpsubw %ymm15, %ymm3, %ymm3
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa 0x2a0(%rsi), %ymm14
- vmovdqa 0x2e0(%rsi), %ymm15
- vmovdqa 0x2c0(%rsi), %ymm8
- vmovdqa 0x300(%rsi), %ymm2
- vpmullw %ymm14, %ymm10, %ymm12
- vpmullw %ymm14, %ymm3, %ymm13
- vpmullw %ymm15, %ymm9, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm11, %ymm11
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm10, %ymm4, %ymm8
- vpsubw %ymm10, %ymm4, %ymm10
- vpaddw %ymm3, %ymm6, %ymm4
- vpsubw %ymm3, %ymm6, %ymm3
- vpaddw %ymm9, %ymm5, %ymm6
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm11, %ymm7, %ymm5
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm8, %ymm8
- vpaddw %ymm12, %ymm10, %ymm10
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm3, %ymm3
- vpsubw %ymm14, %ymm6, %ymm6
- vpaddw %ymm14, %ymm9, %ymm9
- vpsubw %ymm15, %ymm5, %ymm5
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa %ymm8, (%rdi)
- vmovdqa %ymm4, 0x20(%rdi)
- vmovdqa %ymm10, 0x40(%rdi)
- vmovdqa %ymm3, 0x60(%rdi)
- vmovdqa %ymm6, 0x80(%rdi)
- vmovdqa %ymm5, 0xa0(%rdi)
- vmovdqa %ymm9, 0xc0(%rdi)
- vmovdqa %ymm11, 0xe0(%rdi)
- vmovdqa 0x320(%rsi), %ymm15
- vmovdqa 0x180(%rdi), %ymm8
- vmovdqa 0x1a0(%rdi), %ymm9
- vmovdqa 0x1c0(%rdi), %ymm10
- vmovdqa 0x1e0(%rdi), %ymm11
- vmovdqa 0x340(%rsi), %ymm2
- vpmullw %ymm15, %ymm8, %ymm12
- vpmullw %ymm15, %ymm9, %ymm13
- vpmullw %ymm15, %ymm10, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovdqa 0x100(%rdi), %ymm4
- vmovdqa 0x120(%rdi), %ymm5
- vmovdqa 0x140(%rdi), %ymm6
- vmovdqa 0x160(%rdi), %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm8, %ymm4, %ymm3
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm9, %ymm5, %ymm4
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm10, %ymm6, %ymm5
- vpsubw %ymm10, %ymm6, %ymm10
- vpaddw %ymm11, %ymm7, %ymm6
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm8, %ymm8
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm9, %ymm9
- vpsubw %ymm14, %ymm5, %ymm5
- vpaddw %ymm14, %ymm10, %ymm10
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vperm2i128 $0x20, %ymm10, %ymm5, %ymm7 # ymm7 = ymm5[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm5, %ymm10 # ymm10 = ymm5[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm11, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[2,3],ymm11[2,3]
- vmovdqa 0x360(%rsi), %ymm15
- vmovdqa 0x380(%rsi), %ymm2
- vpmullw %ymm15, %ymm7, %ymm12
- vpmullw %ymm15, %ymm10, %ymm13
- vpmullw %ymm15, %ymm5, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm11, %ymm11
- vperm2i128 $0x20, %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm9, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm7, %ymm6, %ymm4
- vpsubw %ymm7, %ymm6, %ymm7
- vpaddw %ymm10, %ymm8, %ymm6
- vpsubw %ymm10, %ymm8, %ymm10
- vpaddw %ymm5, %ymm3, %ymm8
- vpsubw %ymm5, %ymm3, %ymm5
- vpaddw %ymm11, %ymm9, %ymm3
- vpsubw %ymm11, %ymm9, %ymm11
- vpsubw %ymm12, %ymm4, %ymm4
- vpaddw %ymm12, %ymm7, %ymm7
- vpsubw %ymm13, %ymm6, %ymm6
- vpaddw %ymm13, %ymm10, %ymm10
- vpsubw %ymm14, %ymm8, %ymm8
- vpaddw %ymm14, %ymm5, %ymm5
- vpsubw %ymm15, %ymm3, %ymm3
- vpaddw %ymm15, %ymm11, %ymm11
- vpunpcklqdq %ymm5, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm5[0],ymm8[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm8, %ymm5 # ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3]
- vpunpcklqdq %ymm11, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm3, %ymm11 # ymm11 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
- vmovdqa 0x3a0(%rsi), %ymm15
- vmovdqa 0x3c0(%rsi), %ymm2
- vpmullw %ymm15, %ymm9, %ymm12
- vpmullw %ymm15, %ymm5, %ymm13
- vpmullw %ymm15, %ymm8, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm11, %ymm11
- vpunpcklqdq %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
- vpunpckhqdq %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
- vpunpcklqdq %ymm10, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
- vpunpckhqdq %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm9, %ymm3, %ymm6
- vpsubw %ymm9, %ymm3, %ymm9
- vpaddw %ymm5, %ymm7, %ymm3
- vpsubw %ymm5, %ymm7, %ymm5
- vpaddw %ymm8, %ymm4, %ymm7
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm11, %ymm10, %ymm4
- vpsubw %ymm11, %ymm10, %ymm11
- vpsubw %ymm12, %ymm6, %ymm6
- vpaddw %ymm12, %ymm9, %ymm9
- vpsubw %ymm13, %ymm3, %ymm3
- vpaddw %ymm13, %ymm5, %ymm5
- vpsubw %ymm14, %ymm7, %ymm7
- vpaddw %ymm14, %ymm8, %ymm8
- vpsubw %ymm15, %ymm4, %ymm4
- vpaddw %ymm15, %ymm11, %ymm11
- vmovsldup %ymm8, %ymm10 # ymm10 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm10, %ymm7, %ymm10 # ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4],ymm10[5],ymm7[6],ymm10[7]
- vpsrlq $0x20, %ymm7, %ymm7
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
- vmovsldup %ymm11, %ymm7 # ymm7 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7]
- vpsrlq $0x20, %ymm4, %ymm4
- vpblendd $0xaa, %ymm11, %ymm4, %ymm11 # ymm11 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4],ymm11[5],ymm4[6],ymm11[7]
- vmovdqa 0x3e0(%rsi), %ymm15
- vmovdqa 0x400(%rsi), %ymm2
- vpmullw %ymm15, %ymm10, %ymm12
- vpmullw %ymm15, %ymm8, %ymm13
- vpmullw %ymm15, %ymm7, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovsldup %ymm9, %ymm4 # ymm4 = ymm9[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7]
- vmovsldup %ymm5, %ymm6 # ymm6 = ymm5[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm10, %ymm4, %ymm3
- vpsubw %ymm10, %ymm4, %ymm10
- vpaddw %ymm8, %ymm9, %ymm4
- vpsubw %ymm8, %ymm9, %ymm8
- vpaddw %ymm7, %ymm6, %ymm9
- vpsubw %ymm7, %ymm6, %ymm7
- vpaddw %ymm11, %ymm5, %ymm6
- vpsubw %ymm11, %ymm5, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm10, %ymm10
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm8, %ymm8
- vpsubw %ymm14, %ymm9, %ymm9
- vpaddw %ymm14, %ymm7, %ymm7
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vpslld $0x10, %ymm7, %ymm5
- vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
- vpsrld $0x10, %ymm9, %ymm9
- vpblendw $0xaa, %ymm7, %ymm9, %ymm7 # ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4],ymm7[5],ymm9[6],ymm7[7],ymm9[8],ymm7[9],ymm9[10],ymm7[11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
- vpslld $0x10, %ymm11, %ymm9
- vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
- vmovdqa 0x420(%rsi), %ymm15
- vmovdqa 0x440(%rsi), %ymm2
- vpmullw %ymm15, %ymm5, %ymm12
- vpmullw %ymm15, %ymm7, %ymm13
- vpmullw %ymm15, %ymm9, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm11, %ymm11
- vpslld $0x10, %ymm10, %ymm6
- vpblendw $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7],ymm3[8],ymm6[9],ymm3[10],ymm6[11],ymm3[12],ymm6[13],ymm3[14],ymm6[15]
- vpsrld $0x10, %ymm3, %ymm3
- vpblendw $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7],ymm3[8],ymm10[9],ymm3[10],ymm10[11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
- vpslld $0x10, %ymm8, %ymm3
- vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7],ymm4[8],ymm8[9],ymm4[10],ymm8[11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm5, %ymm6, %ymm4
- vpsubw %ymm5, %ymm6, %ymm5
- vpaddw %ymm7, %ymm10, %ymm6
- vpsubw %ymm7, %ymm10, %ymm7
- vpaddw %ymm9, %ymm3, %ymm10
- vpsubw %ymm9, %ymm3, %ymm9
- vpaddw %ymm11, %ymm8, %ymm3
- vpsubw %ymm11, %ymm8, %ymm11
- vpsubw %ymm12, %ymm4, %ymm4
- vpaddw %ymm12, %ymm5, %ymm5
- vpsubw %ymm13, %ymm6, %ymm6
- vpaddw %ymm13, %ymm7, %ymm7
- vpsubw %ymm14, %ymm10, %ymm10
- vpaddw %ymm14, %ymm9, %ymm9
- vpsubw %ymm15, %ymm3, %ymm3
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa 0x460(%rsi), %ymm14
- vmovdqa 0x4a0(%rsi), %ymm15
- vmovdqa 0x480(%rsi), %ymm8
- vmovdqa 0x4c0(%rsi), %ymm2
- vpmullw %ymm14, %ymm10, %ymm12
- vpmullw %ymm14, %ymm3, %ymm13
- vpmullw %ymm15, %ymm9, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm11, %ymm11
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm10, %ymm4, %ymm8
- vpsubw %ymm10, %ymm4, %ymm10
- vpaddw %ymm3, %ymm6, %ymm4
- vpsubw %ymm3, %ymm6, %ymm3
- vpaddw %ymm9, %ymm5, %ymm6
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm11, %ymm7, %ymm5
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm8, %ymm8
- vpaddw %ymm12, %ymm10, %ymm10
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm3, %ymm3
- vpsubw %ymm14, %ymm6, %ymm6
- vpaddw %ymm14, %ymm9, %ymm9
- vpsubw %ymm15, %ymm5, %ymm5
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa %ymm8, 0x100(%rdi)
- vmovdqa %ymm4, 0x120(%rdi)
- vmovdqa %ymm10, 0x140(%rdi)
- vmovdqa %ymm3, 0x160(%rdi)
- vmovdqa %ymm6, 0x180(%rdi)
- vmovdqa %ymm5, 0x1a0(%rdi)
- vmovdqa %ymm9, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vpbroadcastq 0x40(%rsi), %ymm15
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm9
+ vmovdqa 0x140(%rdi), %ymm10
+ vmovdqa 0x160(%rdi), %ymm11
+ vpbroadcastq 0x48(%rsi), %ymm2
+ vpmullw %ymm15, %ymm8, %ymm12
+ vpmullw %ymm15, %ymm9, %ymm13
+ vpmullw %ymm15, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x60(%rdi), %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm8, %ymm4, %ymm3
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm4
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm5
+ vpsubw %ymm10, %ymm6, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm6
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm9, %ymm9
+ vpsubw %ymm14, %ymm5, %ymm5
+ vpaddw %ymm14, %ymm10, %ymm10
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa %ymm3, (%rdi)
+ vmovdqa %ymm4, 0x20(%rdi)
+ vmovdqa %ymm5, 0x40(%rdi)
+ vmovdqa %ymm6, 0x60(%rdi)
+ vmovdqa %ymm8, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa %ymm10, 0x140(%rdi)
+ vmovdqa %ymm11, 0x160(%rdi)
+ vpbroadcastq 0x40(%rsi), %ymm15
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vpbroadcastq 0x48(%rsi), %ymm2
+ vpmullw %ymm15, %ymm8, %ymm12
+ vpmullw %ymm15, %ymm9, %ymm13
+ vpmullw %ymm15, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovdqa 0x80(%rdi), %ymm4
+ vmovdqa 0xa0(%rdi), %ymm5
+ vmovdqa 0xc0(%rdi), %ymm6
+ vmovdqa 0xe0(%rdi), %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm8, %ymm4, %ymm3
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm4
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm5
+ vpsubw %ymm10, %ymm6, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm6
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm9, %ymm9
+ vpsubw %ymm14, %ymm5, %ymm5
+ vpaddw %ymm14, %ymm10, %ymm10
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa %ymm3, 0x80(%rdi)
+ vmovdqa %ymm4, 0xa0(%rdi)
+ vmovdqa %ymm5, 0xc0(%rdi)
+ vmovdqa %ymm6, 0xe0(%rdi)
+ vmovdqa %ymm8, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa %ymm10, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
+ vmovdqa 0x60(%rsi), %ymm15
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm9
+ vmovdqa 0xc0(%rdi), %ymm10
+ vmovdqa 0xe0(%rdi), %ymm11
+ vmovdqa 0x80(%rsi), %ymm2
+ vpmullw %ymm15, %ymm8, %ymm12
+ vpmullw %ymm15, %ymm9, %ymm13
+ vpmullw %ymm15, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x60(%rdi), %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm8, %ymm4, %ymm3
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm4
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm5
+ vpsubw %ymm10, %ymm6, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm6
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm9, %ymm9
+ vpsubw %ymm14, %ymm5, %ymm5
+ vpaddw %ymm14, %ymm10, %ymm10
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vperm2i128 $0x20, %ymm10, %ymm5, %ymm7 # ymm7 = ymm5[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm5, %ymm10 # ymm10 = ymm5[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[2,3],ymm11[2,3]
+ vmovdqa 0xa0(%rsi), %ymm15
+ vmovdqa 0xc0(%rsi), %ymm2
+ vpmullw %ymm15, %ymm7, %ymm12
+ vpmullw %ymm15, %ymm10, %ymm13
+ vpmullw %ymm15, %ymm5, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vperm2i128 $0x20, %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm7, %ymm6, %ymm4
+ vpsubw %ymm7, %ymm6, %ymm7
+ vpaddw %ymm10, %ymm8, %ymm6
+ vpsubw %ymm10, %ymm8, %ymm10
+ vpaddw %ymm5, %ymm3, %ymm8
+ vpsubw %ymm5, %ymm3, %ymm5
+ vpaddw %ymm11, %ymm9, %ymm3
+ vpsubw %ymm11, %ymm9, %ymm11
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpaddw %ymm12, %ymm7, %ymm7
+ vpsubw %ymm13, %ymm6, %ymm6
+ vpaddw %ymm13, %ymm10, %ymm10
+ vpsubw %ymm14, %ymm8, %ymm8
+ vpaddw %ymm14, %ymm5, %ymm5
+ vpsubw %ymm15, %ymm3, %ymm3
+ vpaddw %ymm15, %ymm11, %ymm11
+ vpunpcklqdq %ymm5, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm5[0],ymm8[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm8, %ymm5 # ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3]
+ vpunpcklqdq %ymm11, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm3, %ymm11 # ymm11 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
+ vmovdqa 0xe0(%rsi), %ymm15
+ vmovdqa 0x100(%rsi), %ymm2
+ vpmullw %ymm15, %ymm9, %ymm12
+ vpmullw %ymm15, %ymm5, %ymm13
+ vpmullw %ymm15, %ymm8, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpunpcklqdq %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
+ vpunpckhqdq %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
+ vpunpcklqdq %ymm10, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm9, %ymm3, %ymm6
+ vpsubw %ymm9, %ymm3, %ymm9
+ vpaddw %ymm5, %ymm7, %ymm3
+ vpsubw %ymm5, %ymm7, %ymm5
+ vpaddw %ymm8, %ymm4, %ymm7
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm11, %ymm10, %ymm4
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpaddw %ymm12, %ymm9, %ymm9
+ vpsubw %ymm13, %ymm3, %ymm3
+ vpaddw %ymm13, %ymm5, %ymm5
+ vpsubw %ymm14, %ymm7, %ymm7
+ vpaddw %ymm14, %ymm8, %ymm8
+ vpsubw %ymm15, %ymm4, %ymm4
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovsldup %ymm8, %ymm10 # ymm10 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm10, %ymm7, %ymm10 # ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4],ymm10[5],ymm7[6],ymm10[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
+ vmovsldup %ymm11, %ymm7 # ymm7 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7]
+ vpsrlq $0x20, %ymm4, %ymm4
+ vpblendd $0xaa, %ymm11, %ymm4, %ymm11 # ymm11 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4],ymm11[5],ymm4[6],ymm11[7]
+ vmovdqa 0x120(%rsi), %ymm15
+ vmovdqa 0x140(%rsi), %ymm2
+ vpmullw %ymm15, %ymm10, %ymm12
+ vpmullw %ymm15, %ymm8, %ymm13
+ vpmullw %ymm15, %ymm7, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovsldup %ymm9, %ymm4 # ymm4 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7]
+ vmovsldup %ymm5, %ymm6 # ymm6 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm10, %ymm4, %ymm3
+ vpsubw %ymm10, %ymm4, %ymm10
+ vpaddw %ymm8, %ymm9, %ymm4
+ vpsubw %ymm8, %ymm9, %ymm8
+ vpaddw %ymm7, %ymm6, %ymm9
+ vpsubw %ymm7, %ymm6, %ymm7
+ vpaddw %ymm11, %ymm5, %ymm6
+ vpsubw %ymm11, %ymm5, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm10, %ymm10
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm14, %ymm9, %ymm9
+ vpaddw %ymm14, %ymm7, %ymm7
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vpslld $0x10, %ymm7, %ymm5
+ vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
+ vpsrld $0x10, %ymm9, %ymm9
+ vpblendw $0xaa, %ymm7, %ymm9, %ymm7 # ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4],ymm7[5],ymm9[6],ymm7[7],ymm9[8],ymm7[9],ymm9[10],ymm7[11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
+ vpslld $0x10, %ymm11, %ymm9
+ vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+ vmovdqa 0x160(%rsi), %ymm15
+ vmovdqa 0x180(%rsi), %ymm2
+ vpmullw %ymm15, %ymm5, %ymm12
+ vpmullw %ymm15, %ymm7, %ymm13
+ vpmullw %ymm15, %ymm9, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpslld $0x10, %ymm10, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7],ymm3[8],ymm6[9],ymm3[10],ymm6[11],ymm3[12],ymm6[13],ymm3[14],ymm6[15]
+ vpsrld $0x10, %ymm3, %ymm3
+ vpblendw $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7],ymm3[8],ymm10[9],ymm3[10],ymm10[11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
+ vpslld $0x10, %ymm8, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7],ymm4[8],ymm8[9],ymm4[10],ymm8[11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm5, %ymm6, %ymm4
+ vpsubw %ymm5, %ymm6, %ymm5
+ vpaddw %ymm7, %ymm10, %ymm6
+ vpsubw %ymm7, %ymm10, %ymm7
+ vpaddw %ymm9, %ymm3, %ymm10
+ vpsubw %ymm9, %ymm3, %ymm9
+ vpaddw %ymm11, %ymm8, %ymm3
+ vpsubw %ymm11, %ymm8, %ymm11
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpaddw %ymm12, %ymm5, %ymm5
+ vpsubw %ymm13, %ymm6, %ymm6
+ vpaddw %ymm13, %ymm7, %ymm7
+ vpsubw %ymm14, %ymm10, %ymm10
+ vpaddw %ymm14, %ymm9, %ymm9
+ vpsubw %ymm15, %ymm3, %ymm3
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa 0x1a0(%rsi), %ymm14
+ vmovdqa 0x1e0(%rsi), %ymm15
+ vmovdqa 0x1c0(%rsi), %ymm8
+ vmovdqa 0x200(%rsi), %ymm2
+ vpmullw %ymm14, %ymm10, %ymm12
+ vpmullw %ymm14, %ymm3, %ymm13
+ vpmullw %ymm15, %ymm9, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm8, %ymm10, %ymm10
+ vpmulhw %ymm8, %ymm3, %ymm3
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm10, %ymm4, %ymm8
+ vpsubw %ymm10, %ymm4, %ymm10
+ vpaddw %ymm3, %ymm6, %ymm4
+ vpsubw %ymm3, %ymm6, %ymm3
+ vpaddw %ymm9, %ymm5, %ymm6
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm11, %ymm7, %ymm5
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpaddw %ymm12, %ymm10, %ymm10
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm3, %ymm3
+ vpsubw %ymm14, %ymm6, %ymm6
+ vpaddw %ymm14, %ymm9, %ymm9
+ vpsubw %ymm15, %ymm5, %ymm5
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa %ymm8, (%rdi)
+ vmovdqa %ymm4, 0x20(%rdi)
+ vmovdqa %ymm10, 0x40(%rdi)
+ vmovdqa %ymm3, 0x60(%rdi)
+ vmovdqa %ymm6, 0x80(%rdi)
+ vmovdqa %ymm5, 0xa0(%rdi)
+ vmovdqa %ymm9, 0xc0(%rdi)
+ vmovdqa %ymm11, 0xe0(%rdi)
+ vmovdqa 0x220(%rsi), %ymm15
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vmovdqa 0x240(%rsi), %ymm2
+ vpmullw %ymm15, %ymm8, %ymm12
+ vpmullw %ymm15, %ymm9, %ymm13
+ vpmullw %ymm15, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovdqa 0x100(%rdi), %ymm4
+ vmovdqa 0x120(%rdi), %ymm5
+ vmovdqa 0x140(%rdi), %ymm6
+ vmovdqa 0x160(%rdi), %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm8, %ymm4, %ymm3
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm4
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm5
+ vpsubw %ymm10, %ymm6, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm6
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm9, %ymm9
+ vpsubw %ymm14, %ymm5, %ymm5
+ vpaddw %ymm14, %ymm10, %ymm10
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vperm2i128 $0x20, %ymm10, %ymm5, %ymm7 # ymm7 = ymm5[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm5, %ymm10 # ymm10 = ymm5[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[2,3],ymm11[2,3]
+ vmovdqa 0x260(%rsi), %ymm15
+ vmovdqa 0x280(%rsi), %ymm2
+ vpmullw %ymm15, %ymm7, %ymm12
+ vpmullw %ymm15, %ymm10, %ymm13
+ vpmullw %ymm15, %ymm5, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vperm2i128 $0x20, %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm7, %ymm6, %ymm4
+ vpsubw %ymm7, %ymm6, %ymm7
+ vpaddw %ymm10, %ymm8, %ymm6
+ vpsubw %ymm10, %ymm8, %ymm10
+ vpaddw %ymm5, %ymm3, %ymm8
+ vpsubw %ymm5, %ymm3, %ymm5
+ vpaddw %ymm11, %ymm9, %ymm3
+ vpsubw %ymm11, %ymm9, %ymm11
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpaddw %ymm12, %ymm7, %ymm7
+ vpsubw %ymm13, %ymm6, %ymm6
+ vpaddw %ymm13, %ymm10, %ymm10
+ vpsubw %ymm14, %ymm8, %ymm8
+ vpaddw %ymm14, %ymm5, %ymm5
+ vpsubw %ymm15, %ymm3, %ymm3
+ vpaddw %ymm15, %ymm11, %ymm11
+ vpunpcklqdq %ymm5, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm5[0],ymm8[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm8, %ymm5 # ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3]
+ vpunpcklqdq %ymm11, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm3, %ymm11 # ymm11 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
+ vmovdqa 0x2a0(%rsi), %ymm15
+ vmovdqa 0x2c0(%rsi), %ymm2
+ vpmullw %ymm15, %ymm9, %ymm12
+ vpmullw %ymm15, %ymm5, %ymm13
+ vpmullw %ymm15, %ymm8, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpunpcklqdq %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
+ vpunpckhqdq %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
+ vpunpcklqdq %ymm10, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm9, %ymm3, %ymm6
+ vpsubw %ymm9, %ymm3, %ymm9
+ vpaddw %ymm5, %ymm7, %ymm3
+ vpsubw %ymm5, %ymm7, %ymm5
+ vpaddw %ymm8, %ymm4, %ymm7
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm11, %ymm10, %ymm4
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpaddw %ymm12, %ymm9, %ymm9
+ vpsubw %ymm13, %ymm3, %ymm3
+ vpaddw %ymm13, %ymm5, %ymm5
+ vpsubw %ymm14, %ymm7, %ymm7
+ vpaddw %ymm14, %ymm8, %ymm8
+ vpsubw %ymm15, %ymm4, %ymm4
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovsldup %ymm8, %ymm10 # ymm10 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm10, %ymm7, %ymm10 # ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4],ymm10[5],ymm7[6],ymm10[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
+ vmovsldup %ymm11, %ymm7 # ymm7 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7]
+ vpsrlq $0x20, %ymm4, %ymm4
+ vpblendd $0xaa, %ymm11, %ymm4, %ymm11 # ymm11 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4],ymm11[5],ymm4[6],ymm11[7]
+ vmovdqa 0x2e0(%rsi), %ymm15
+ vmovdqa 0x300(%rsi), %ymm2
+ vpmullw %ymm15, %ymm10, %ymm12
+ vpmullw %ymm15, %ymm8, %ymm13
+ vpmullw %ymm15, %ymm7, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovsldup %ymm9, %ymm4 # ymm4 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7]
+ vmovsldup %ymm5, %ymm6 # ymm6 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm10, %ymm4, %ymm3
+ vpsubw %ymm10, %ymm4, %ymm10
+ vpaddw %ymm8, %ymm9, %ymm4
+ vpsubw %ymm8, %ymm9, %ymm8
+ vpaddw %ymm7, %ymm6, %ymm9
+ vpsubw %ymm7, %ymm6, %ymm7
+ vpaddw %ymm11, %ymm5, %ymm6
+ vpsubw %ymm11, %ymm5, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm10, %ymm10
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm14, %ymm9, %ymm9
+ vpaddw %ymm14, %ymm7, %ymm7
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vpslld $0x10, %ymm7, %ymm5
+ vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
+ vpsrld $0x10, %ymm9, %ymm9
+ vpblendw $0xaa, %ymm7, %ymm9, %ymm7 # ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4],ymm7[5],ymm9[6],ymm7[7],ymm9[8],ymm7[9],ymm9[10],ymm7[11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
+ vpslld $0x10, %ymm11, %ymm9
+ vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+ vmovdqa 0x320(%rsi), %ymm15
+ vmovdqa 0x340(%rsi), %ymm2
+ vpmullw %ymm15, %ymm5, %ymm12
+ vpmullw %ymm15, %ymm7, %ymm13
+ vpmullw %ymm15, %ymm9, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpslld $0x10, %ymm10, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7],ymm3[8],ymm6[9],ymm3[10],ymm6[11],ymm3[12],ymm6[13],ymm3[14],ymm6[15]
+ vpsrld $0x10, %ymm3, %ymm3
+ vpblendw $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7],ymm3[8],ymm10[9],ymm3[10],ymm10[11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
+ vpslld $0x10, %ymm8, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7],ymm4[8],ymm8[9],ymm4[10],ymm8[11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm5, %ymm6, %ymm4
+ vpsubw %ymm5, %ymm6, %ymm5
+ vpaddw %ymm7, %ymm10, %ymm6
+ vpsubw %ymm7, %ymm10, %ymm7
+ vpaddw %ymm9, %ymm3, %ymm10
+ vpsubw %ymm9, %ymm3, %ymm9
+ vpaddw %ymm11, %ymm8, %ymm3
+ vpsubw %ymm11, %ymm8, %ymm11
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpaddw %ymm12, %ymm5, %ymm5
+ vpsubw %ymm13, %ymm6, %ymm6
+ vpaddw %ymm13, %ymm7, %ymm7
+ vpsubw %ymm14, %ymm10, %ymm10
+ vpaddw %ymm14, %ymm9, %ymm9
+ vpsubw %ymm15, %ymm3, %ymm3
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa 0x360(%rsi), %ymm14
+ vmovdqa 0x3a0(%rsi), %ymm15
+ vmovdqa 0x380(%rsi), %ymm8
+ vmovdqa 0x3c0(%rsi), %ymm2
+ vpmullw %ymm14, %ymm10, %ymm12
+ vpmullw %ymm14, %ymm3, %ymm13
+ vpmullw %ymm15, %ymm9, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm8, %ymm10, %ymm10
+ vpmulhw %ymm8, %ymm3, %ymm3
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm10, %ymm4, %ymm8
+ vpsubw %ymm10, %ymm4, %ymm10
+ vpaddw %ymm3, %ymm6, %ymm4
+ vpsubw %ymm3, %ymm6, %ymm3
+ vpaddw %ymm9, %ymm5, %ymm6
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm11, %ymm7, %ymm5
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpaddw %ymm12, %ymm10, %ymm10
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm3, %ymm3
+ vpsubw %ymm14, %ymm6, %ymm6
+ vpaddw %ymm14, %ymm9, %ymm9
+ vpsubw %ymm15, %ymm5, %ymm5
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa %ymm8, 0x100(%rdi)
+ vmovdqa %ymm4, 0x120(%rdi)
+ vmovdqa %ymm10, 0x140(%rdi)
+ vmovdqa %ymm3, 0x160(%rdi)
+ vmovdqa %ymm6, 0x180(%rdi)
+ vmovdqa %ymm5, 0x1a0(%rdi)
+ vmovdqa %ymm9, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(ntt_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S
index c4a174fa64..5ef95954ea 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S
@@ -27,93 +27,167 @@
* dev/x86_64/src/nttfrombytes.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(nttfrombytes_avx2)
MLK_ASM_FN_SYMBOL(nttfrombytes_avx2)
- vmovdqa 0xe0(%rdx), %ymm0
- callq nttfrombytes_avx2_core
- addq $0x100, %rdi # imm = 0x100
- addq $0xc0, %rsi
- callq nttfrombytes_avx2_core
+ .cfi_startproc
+ movl $0xfff0fff, %eax # imm = 0xFFF0FFF
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vmovdqu (%rsi), %ymm4
+ vmovdqu 0x20(%rsi), %ymm5
+ vmovdqu 0x40(%rsi), %ymm6
+ vmovdqu 0x60(%rsi), %ymm7
+ vmovdqu 0x80(%rsi), %ymm8
+ vmovdqu 0xa0(%rsi), %ymm9
+ vperm2i128 $0x20, %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm7[0,1]
+ vperm2i128 $0x31, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[2,3],ymm7[2,3]
+ vperm2i128 $0x20, %ymm8, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[2,3],ymm9[2,3]
+ vpunpcklqdq %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm8[0],ymm3[2],ymm8[2]
+ vpunpckhqdq %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[1],ymm8[1],ymm3[3],ymm8[3]
+ vpunpcklqdq %ymm5, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm7, %ymm5 # ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3]
+ vpunpcklqdq %ymm9, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm9[0],ymm4[2],ymm9[2]
+ vpunpckhqdq %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[1],ymm9[1],ymm4[3],ymm9[3]
+ vmovsldup %ymm5, %ymm4 # ymm4 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+ vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
+ vpsrlq $0x20, %ymm8, %ymm8
+ vpblendd $0xaa, %ymm7, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
+ vmovsldup %ymm9, %ymm8 # ymm8 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm9, %ymm3, %ymm9 # ymm9 = ymm3[0],ymm9[1],ymm3[2],ymm9[3],ymm3[4],ymm9[5],ymm3[6],ymm9[7]
+ vpslld $0x10, %ymm7, %ymm10
+ vpblendw $0xaa, %ymm10, %ymm4, %ymm10 # ymm10 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4],ymm10[5],ymm4[6],ymm10[7],ymm4[8],ymm10[9],ymm4[10],ymm10[11],ymm4[12],ymm10[13],ymm4[14],ymm10[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7],ymm4[8],ymm7[9],ymm4[10],ymm7[11],ymm4[12],ymm7[13],ymm4[14],ymm7[15]
+ vpslld $0x10, %ymm8, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+ vpsrld $0x10, %ymm5, %ymm5
+ vpblendw $0xaa, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7],ymm5[8],ymm8[9],ymm5[10],ymm8[11],ymm5[12],ymm8[13],ymm5[14],ymm8[15]
+ vpslld $0x10, %ymm9, %ymm5
+ vpblendw $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
+ vpsrlw $0xc, %ymm10, %ymm11
+ vpsllw $0x4, %ymm7, %ymm12
+ vpor %ymm11, %ymm12, %ymm11
+ vpand %ymm0, %ymm10, %ymm10
+ vpand %ymm0, %ymm11, %ymm11
+ vpsrlw $0x8, %ymm7, %ymm12
+ vpsllw $0x8, %ymm4, %ymm13
+ vpor %ymm12, %ymm13, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpsrlw $0x4, %ymm4, %ymm13
+ vpand %ymm0, %ymm13, %ymm13
+ vpsrlw $0xc, %ymm8, %ymm14
+ vpsllw $0x4, %ymm5, %ymm15
+ vpor %ymm14, %ymm15, %ymm14
+ vpand %ymm0, %ymm8, %ymm8
+ vpand %ymm0, %ymm14, %ymm14
+ vpsrlw $0x8, %ymm5, %ymm15
+ vpsllw $0x8, %ymm9, %ymm1
+ vpor %ymm15, %ymm1, %ymm15
+ vpand %ymm0, %ymm15, %ymm15
+ vpsrlw $0x4, %ymm9, %ymm1
+ vpand %ymm0, %ymm1, %ymm1
+ vmovdqa %ymm10, (%rdi)
+ vmovdqa %ymm11, 0x20(%rdi)
+ vmovdqa %ymm12, 0x40(%rdi)
+ vmovdqa %ymm13, 0x60(%rdi)
+ vmovdqa %ymm8, 0x80(%rdi)
+ vmovdqa %ymm14, 0xa0(%rdi)
+ vmovdqa %ymm15, 0xc0(%rdi)
+ vmovdqa %ymm1, 0xe0(%rdi)
+ vmovdqu 0xc0(%rsi), %ymm4
+ vmovdqu 0xe0(%rsi), %ymm5
+ vmovdqu 0x100(%rsi), %ymm6
+ vmovdqu 0x120(%rsi), %ymm7
+ vmovdqu 0x140(%rsi), %ymm8
+ vmovdqu 0x160(%rsi), %ymm9
+ vperm2i128 $0x20, %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm7[0,1]
+ vperm2i128 $0x31, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[2,3],ymm7[2,3]
+ vperm2i128 $0x20, %ymm8, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[2,3],ymm9[2,3]
+ vpunpcklqdq %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm8[0],ymm3[2],ymm8[2]
+ vpunpckhqdq %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[1],ymm8[1],ymm3[3],ymm8[3]
+ vpunpcklqdq %ymm5, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm7, %ymm5 # ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3]
+ vpunpcklqdq %ymm9, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm9[0],ymm4[2],ymm9[2]
+ vpunpckhqdq %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[1],ymm9[1],ymm4[3],ymm9[3]
+ vmovsldup %ymm5, %ymm4 # ymm4 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+ vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
+ vpsrlq $0x20, %ymm8, %ymm8
+ vpblendd $0xaa, %ymm7, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
+ vmovsldup %ymm9, %ymm8 # ymm8 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm9, %ymm3, %ymm9 # ymm9 = ymm3[0],ymm9[1],ymm3[2],ymm9[3],ymm3[4],ymm9[5],ymm3[6],ymm9[7]
+ vpslld $0x10, %ymm7, %ymm10
+ vpblendw $0xaa, %ymm10, %ymm4, %ymm10 # ymm10 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4],ymm10[5],ymm4[6],ymm10[7],ymm4[8],ymm10[9],ymm4[10],ymm10[11],ymm4[12],ymm10[13],ymm4[14],ymm10[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7],ymm4[8],ymm7[9],ymm4[10],ymm7[11],ymm4[12],ymm7[13],ymm4[14],ymm7[15]
+ vpslld $0x10, %ymm8, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+ vpsrld $0x10, %ymm5, %ymm5
+ vpblendw $0xaa, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7],ymm5[8],ymm8[9],ymm5[10],ymm8[11],ymm5[12],ymm8[13],ymm5[14],ymm8[15]
+ vpslld $0x10, %ymm9, %ymm5
+ vpblendw $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
+ vpsrlw $0xc, %ymm10, %ymm11
+ vpsllw $0x4, %ymm7, %ymm12
+ vpor %ymm11, %ymm12, %ymm11
+ vpand %ymm0, %ymm10, %ymm10
+ vpand %ymm0, %ymm11, %ymm11
+ vpsrlw $0x8, %ymm7, %ymm12
+ vpsllw $0x8, %ymm4, %ymm13
+ vpor %ymm12, %ymm13, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpsrlw $0x4, %ymm4, %ymm13
+ vpand %ymm0, %ymm13, %ymm13
+ vpsrlw $0xc, %ymm8, %ymm14
+ vpsllw $0x4, %ymm5, %ymm15
+ vpor %ymm14, %ymm15, %ymm14
+ vpand %ymm0, %ymm8, %ymm8
+ vpand %ymm0, %ymm14, %ymm14
+ vpsrlw $0x8, %ymm5, %ymm15
+ vpsllw $0x8, %ymm9, %ymm1
+ vpor %ymm15, %ymm1, %ymm15
+ vpand %ymm0, %ymm15, %ymm15
+ vpsrlw $0x4, %ymm9, %ymm1
+ vpand %ymm0, %ymm1, %ymm1
+ vmovdqa %ymm10, 0x100(%rdi)
+ vmovdqa %ymm11, 0x120(%rdi)
+ vmovdqa %ymm12, 0x140(%rdi)
+ vmovdqa %ymm13, 0x160(%rdi)
+ vmovdqa %ymm8, 0x180(%rdi)
+ vmovdqa %ymm14, 0x1a0(%rdi)
+ vmovdqa %ymm15, 0x1c0(%rdi)
+ vmovdqa %ymm1, 0x1e0(%rdi)
retq
+ .cfi_endproc
-nttfrombytes_avx2_core:
- vmovdqu (%rsi), %ymm4
- vmovdqu 0x20(%rsi), %ymm5
- vmovdqu 0x40(%rsi), %ymm6
- vmovdqu 0x60(%rsi), %ymm7
- vmovdqu 0x80(%rsi), %ymm8
- vmovdqu 0xa0(%rsi), %ymm9
- vperm2i128 $0x20, %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm7[0,1]
- vperm2i128 $0x31, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[2,3],ymm7[2,3]
- vperm2i128 $0x20, %ymm8, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm9, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[2,3],ymm9[2,3]
- vpunpcklqdq %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm8[0],ymm3[2],ymm8[2]
- vpunpckhqdq %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[1],ymm8[1],ymm3[3],ymm8[3]
- vpunpcklqdq %ymm5, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm7, %ymm5 # ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3]
- vpunpcklqdq %ymm9, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm9[0],ymm4[2],ymm9[2]
- vpunpckhqdq %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[1],ymm9[1],ymm4[3],ymm9[3]
- vmovsldup %ymm5, %ymm4 # ymm4 = ymm5[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
- vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
- vpsrlq $0x20, %ymm8, %ymm8
- vpblendd $0xaa, %ymm7, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
- vmovsldup %ymm9, %ymm8 # ymm8 = ymm9[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm9, %ymm3, %ymm9 # ymm9 = ymm3[0],ymm9[1],ymm3[2],ymm9[3],ymm3[4],ymm9[5],ymm3[6],ymm9[7]
- vpslld $0x10, %ymm7, %ymm10
- vpblendw $0xaa, %ymm10, %ymm4, %ymm10 # ymm10 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4],ymm10[5],ymm4[6],ymm10[7],ymm4[8],ymm10[9],ymm4[10],ymm10[11],ymm4[12],ymm10[13],ymm4[14],ymm10[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7],ymm4[8],ymm7[9],ymm4[10],ymm7[11],ymm4[12],ymm7[13],ymm4[14],ymm7[15]
- vpslld $0x10, %ymm8, %ymm4
- vpblendw $0xaa, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
- vpsrld $0x10, %ymm5, %ymm5
- vpblendw $0xaa, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7],ymm5[8],ymm8[9],ymm5[10],ymm8[11],ymm5[12],ymm8[13],ymm5[14],ymm8[15]
- vpslld $0x10, %ymm9, %ymm5
- vpblendw $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
- vpsrlw $0xc, %ymm10, %ymm11
- vpsllw $0x4, %ymm7, %ymm12
- vpor %ymm11, %ymm12, %ymm11
- vpand %ymm0, %ymm10, %ymm10
- vpand %ymm0, %ymm11, %ymm11
- vpsrlw $0x8, %ymm7, %ymm12
- vpsllw $0x8, %ymm4, %ymm13
- vpor %ymm12, %ymm13, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpsrlw $0x4, %ymm4, %ymm13
- vpand %ymm0, %ymm13, %ymm13
- vpsrlw $0xc, %ymm8, %ymm14
- vpsllw $0x4, %ymm5, %ymm15
- vpor %ymm14, %ymm15, %ymm14
- vpand %ymm0, %ymm8, %ymm8
- vpand %ymm0, %ymm14, %ymm14
- vpsrlw $0x8, %ymm5, %ymm15
- vpsllw $0x8, %ymm9, %ymm1
- vpor %ymm15, %ymm1, %ymm15
- vpand %ymm0, %ymm15, %ymm15
- vpsrlw $0x4, %ymm9, %ymm1
- vpand %ymm0, %ymm1, %ymm1
- vmovdqa %ymm10, (%rdi)
- vmovdqa %ymm11, 0x20(%rdi)
- vmovdqa %ymm12, 0x40(%rdi)
- vmovdqa %ymm13, 0x60(%rdi)
- vmovdqa %ymm8, 0x80(%rdi)
- vmovdqa %ymm14, 0xa0(%rdi)
- vmovdqa %ymm15, 0xc0(%rdi)
- vmovdqa %ymm1, 0xe0(%rdi)
- retq
+MLK_ASM_FN_SIZE(nttfrombytes_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S
index 9bbd39f00a..b4e043bff2 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S
@@ -27,87 +27,155 @@
* dev/x86_64/src/ntttobytes.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(ntttobytes_avx2)
MLK_ASM_FN_SYMBOL(ntttobytes_avx2)
- vmovdqa (%rdx), %ymm0
- callq ntttobytes_avx2_core
- addq $0x100, %rsi # imm = 0x100
- addq $0xc0, %rdi
- callq ntttobytes_avx2_core
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vmovdqa (%rsi), %ymm5
+ vmovdqa 0x20(%rsi), %ymm6
+ vmovdqa 0x40(%rsi), %ymm7
+ vmovdqa 0x60(%rsi), %ymm8
+ vmovdqa 0x80(%rsi), %ymm9
+ vmovdqa 0xa0(%rsi), %ymm10
+ vmovdqa 0xc0(%rsi), %ymm11
+ vmovdqa 0xe0(%rsi), %ymm12
+ vpsllw $0xc, %ymm6, %ymm4
+ vpor %ymm4, %ymm5, %ymm4
+ vpsrlw $0x4, %ymm6, %ymm5
+ vpsllw $0x8, %ymm7, %ymm6
+ vpor %ymm5, %ymm6, %ymm5
+ vpsrlw $0x8, %ymm7, %ymm6
+ vpsllw $0x4, %ymm8, %ymm7
+ vpor %ymm6, %ymm7, %ymm6
+ vpsllw $0xc, %ymm10, %ymm7
+ vpor %ymm7, %ymm9, %ymm7
+ vpsrlw $0x4, %ymm10, %ymm8
+ vpsllw $0x8, %ymm11, %ymm9
+ vpor %ymm8, %ymm9, %ymm8
+ vpsrlw $0x8, %ymm11, %ymm9
+ vpsllw $0x4, %ymm12, %ymm10
+ vpor %ymm9, %ymm10, %ymm9
+ vpslld $0x10, %ymm5, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+ vpslld $0x10, %ymm7, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpslld $0x10, %ymm9, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vmovsldup %ymm4, %ymm8 # ymm8 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+ vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vpunpcklqdq %ymm3, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+ vpunpckhqdq %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
+ vpunpcklqdq %ymm4, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+ vpunpckhqdq %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
+ vpunpcklqdq %ymm9, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
+ vpunpckhqdq %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
+ vperm2i128 $0x20, %ymm8, %ymm7, %ymm5 # ymm5 = ymm7[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm3, %ymm6, %ymm7 # ymm7 = ymm6[0,1],ymm3[0,1]
+ vperm2i128 $0x31, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[2,3],ymm3[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
+ vmovdqu %ymm5, (%rdi)
+ vmovdqu %ymm7, 0x20(%rdi)
+ vmovdqu %ymm6, 0x40(%rdi)
+ vmovdqu %ymm8, 0x60(%rdi)
+ vmovdqu %ymm3, 0x80(%rdi)
+ vmovdqu %ymm9, 0xa0(%rdi)
+ vmovdqa 0x100(%rsi), %ymm5
+ vmovdqa 0x120(%rsi), %ymm6
+ vmovdqa 0x140(%rsi), %ymm7
+ vmovdqa 0x160(%rsi), %ymm8
+ vmovdqa 0x180(%rsi), %ymm9
+ vmovdqa 0x1a0(%rsi), %ymm10
+ vmovdqa 0x1c0(%rsi), %ymm11
+ vmovdqa 0x1e0(%rsi), %ymm12
+ vpsllw $0xc, %ymm6, %ymm4
+ vpor %ymm4, %ymm5, %ymm4
+ vpsrlw $0x4, %ymm6, %ymm5
+ vpsllw $0x8, %ymm7, %ymm6
+ vpor %ymm5, %ymm6, %ymm5
+ vpsrlw $0x8, %ymm7, %ymm6
+ vpsllw $0x4, %ymm8, %ymm7
+ vpor %ymm6, %ymm7, %ymm6
+ vpsllw $0xc, %ymm10, %ymm7
+ vpor %ymm7, %ymm9, %ymm7
+ vpsrlw $0x4, %ymm10, %ymm8
+ vpsllw $0x8, %ymm11, %ymm9
+ vpor %ymm8, %ymm9, %ymm8
+ vpsrlw $0x8, %ymm11, %ymm9
+ vpsllw $0x4, %ymm12, %ymm10
+ vpor %ymm9, %ymm10, %ymm9
+ vpslld $0x10, %ymm5, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+ vpslld $0x10, %ymm7, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpslld $0x10, %ymm9, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vmovsldup %ymm4, %ymm8 # ymm8 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+ vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vpunpcklqdq %ymm3, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+ vpunpckhqdq %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
+ vpunpcklqdq %ymm4, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+ vpunpckhqdq %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
+ vpunpcklqdq %ymm9, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
+ vpunpckhqdq %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
+ vperm2i128 $0x20, %ymm8, %ymm7, %ymm5 # ymm5 = ymm7[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm3, %ymm6, %ymm7 # ymm7 = ymm6[0,1],ymm3[0,1]
+ vperm2i128 $0x31, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[2,3],ymm3[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
+ vmovdqu %ymm5, 0xc0(%rdi)
+ vmovdqu %ymm7, 0xe0(%rdi)
+ vmovdqu %ymm6, 0x100(%rdi)
+ vmovdqu %ymm8, 0x120(%rdi)
+ vmovdqu %ymm3, 0x140(%rdi)
+ vmovdqu %ymm9, 0x160(%rdi)
retq
+ .cfi_endproc
-ntttobytes_avx2_core:
- vmovdqa (%rsi), %ymm5
- vmovdqa 0x20(%rsi), %ymm6
- vmovdqa 0x40(%rsi), %ymm7
- vmovdqa 0x60(%rsi), %ymm8
- vmovdqa 0x80(%rsi), %ymm9
- vmovdqa 0xa0(%rsi), %ymm10
- vmovdqa 0xc0(%rsi), %ymm11
- vmovdqa 0xe0(%rsi), %ymm12
- vpsllw $0xc, %ymm6, %ymm4
- vpor %ymm4, %ymm5, %ymm4
- vpsrlw $0x4, %ymm6, %ymm5
- vpsllw $0x8, %ymm7, %ymm6
- vpor %ymm5, %ymm6, %ymm5
- vpsrlw $0x8, %ymm7, %ymm6
- vpsllw $0x4, %ymm8, %ymm7
- vpor %ymm6, %ymm7, %ymm6
- vpsllw $0xc, %ymm10, %ymm7
- vpor %ymm7, %ymm9, %ymm7
- vpsrlw $0x4, %ymm10, %ymm8
- vpsllw $0x8, %ymm11, %ymm9
- vpor %ymm8, %ymm9, %ymm8
- vpsrlw $0x8, %ymm11, %ymm9
- vpsllw $0x4, %ymm12, %ymm10
- vpor %ymm9, %ymm10, %ymm9
- vpslld $0x10, %ymm5, %ymm3
- vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
- vpslld $0x10, %ymm7, %ymm4
- vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
- vpslld $0x10, %ymm9, %ymm6
- vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
- vpsrld $0x10, %ymm8, %ymm8
- vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
- vmovsldup %ymm4, %ymm8 # ymm8 = ymm4[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
- vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
- vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
- vpsrlq $0x20, %ymm7, %ymm7
- vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
- vpunpcklqdq %ymm3, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
- vpunpckhqdq %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
- vpunpcklqdq %ymm4, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
- vpunpckhqdq %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
- vpunpcklqdq %ymm9, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
- vpunpckhqdq %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
- vperm2i128 $0x20, %ymm8, %ymm7, %ymm5 # ymm5 = ymm7[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm3, %ymm6, %ymm7 # ymm7 = ymm6[0,1],ymm3[0,1]
- vperm2i128 $0x31, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[2,3],ymm3[2,3]
- vperm2i128 $0x20, %ymm9, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
- vmovdqu %ymm5, (%rdi)
- vmovdqu %ymm7, 0x20(%rdi)
- vmovdqu %ymm6, 0x40(%rdi)
- vmovdqu %ymm8, 0x60(%rdi)
- vmovdqu %ymm3, 0x80(%rdi)
- vmovdqu %ymm9, 0xa0(%rdi)
- retq
+MLK_ASM_FN_SIZE(ntttobytes_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttunpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttunpack.S
index 6233b1b950..6e9dc76aa5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttunpack.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/nttunpack.S
@@ -27,83 +27,148 @@
* dev/x86_64/src/nttunpack.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(nttunpack_avx2)
MLK_ASM_FN_SYMBOL(nttunpack_avx2)
- callq nttunpack_avx2_core
- addq $0x100, %rdi # imm = 0x100
- callq nttunpack_avx2_core
+ .cfi_startproc
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x60(%rdi), %ymm7
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm9
+ vmovdqa 0xc0(%rdi), %ymm10
+ vmovdqa 0xe0(%rdi), %ymm11
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
+ vpsrlq $0x20, %ymm5, %ymm5
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
+ vpsrlq $0x20, %ymm10, %ymm10
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
+ vpslld $0x10, %ymm5, %ymm10
+ vpblendw $0xaa, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4],ymm10[5],ymm9[6],ymm10[7],ymm9[8],ymm10[9],ymm9[10],ymm10[11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
+ vpsrld $0x10, %ymm9, %ymm9
+ vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
+ vpslld $0x10, %ymm4, %ymm9
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm4, %ymm8, %ymm4 # ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7],ymm8[8],ymm4[9],ymm8[10],ymm4[11],ymm8[12],ymm4[13],ymm8[14],ymm4[15]
+ vpslld $0x10, %ymm3, %ymm8
+ vpblendw $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7],ymm7[8],ymm8[9],ymm7[10],ymm8[11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
+ vpsrld $0x10, %ymm7, %ymm7
+ vpblendw $0xaa, %ymm3, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7],ymm7[8],ymm3[9],ymm7[10],ymm3[11],ymm7[12],ymm3[13],ymm7[14],ymm3[15]
+ vpslld $0x10, %ymm11, %ymm7
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+ vmovdqa %ymm10, (%rdi)
+ vmovdqa %ymm5, 0x20(%rdi)
+ vmovdqa %ymm9, 0x40(%rdi)
+ vmovdqa %ymm4, 0x60(%rdi)
+ vmovdqa %ymm8, 0x80(%rdi)
+ vmovdqa %ymm3, 0xa0(%rdi)
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm11, 0xe0(%rdi)
+ vmovdqa 0x100(%rdi), %ymm4
+ vmovdqa 0x120(%rdi), %ymm5
+ vmovdqa 0x140(%rdi), %ymm6
+ vmovdqa 0x160(%rdi), %ymm7
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
+ vpsrlq $0x20, %ymm5, %ymm5
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
+ vpsrlq $0x20, %ymm10, %ymm10
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
+ vpslld $0x10, %ymm5, %ymm10
+ vpblendw $0xaa, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4],ymm10[5],ymm9[6],ymm10[7],ymm9[8],ymm10[9],ymm9[10],ymm10[11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
+ vpsrld $0x10, %ymm9, %ymm9
+ vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
+ vpslld $0x10, %ymm4, %ymm9
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm4, %ymm8, %ymm4 # ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7],ymm8[8],ymm4[9],ymm8[10],ymm4[11],ymm8[12],ymm4[13],ymm8[14],ymm4[15]
+ vpslld $0x10, %ymm3, %ymm8
+ vpblendw $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7],ymm7[8],ymm8[9],ymm7[10],ymm8[11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
+ vpsrld $0x10, %ymm7, %ymm7
+ vpblendw $0xaa, %ymm3, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7],ymm7[8],ymm3[9],ymm7[10],ymm3[11],ymm7[12],ymm3[13],ymm7[14],ymm3[15]
+ vpslld $0x10, %ymm11, %ymm7
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+ vmovdqa %ymm10, 0x100(%rdi)
+ vmovdqa %ymm5, 0x120(%rdi)
+ vmovdqa %ymm9, 0x140(%rdi)
+ vmovdqa %ymm4, 0x160(%rdi)
+ vmovdqa %ymm8, 0x180(%rdi)
+ vmovdqa %ymm3, 0x1a0(%rdi)
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
retq
+ .cfi_endproc
-nttunpack_avx2_core:
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x20(%rdi), %ymm5
- vmovdqa 0x40(%rdi), %ymm6
- vmovdqa 0x60(%rdi), %ymm7
- vmovdqa 0x80(%rdi), %ymm8
- vmovdqa 0xa0(%rdi), %ymm9
- vmovdqa 0xc0(%rdi), %ymm10
- vmovdqa 0xe0(%rdi), %ymm11
- vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
- vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
- vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
- vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
- vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
- vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
- vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
- vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
- vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
- vpsrlq $0x20, %ymm7, %ymm7
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
- vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
- vpsrlq $0x20, %ymm5, %ymm5
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
- vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
- vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
- vpsrlq $0x20, %ymm10, %ymm10
- vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
- vpslld $0x10, %ymm5, %ymm10
- vpblendw $0xaa, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4],ymm10[5],ymm9[6],ymm10[7],ymm9[8],ymm10[9],ymm9[10],ymm10[11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
- vpsrld $0x10, %ymm9, %ymm9
- vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
- vpslld $0x10, %ymm4, %ymm9
- vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
- vpsrld $0x10, %ymm8, %ymm8
- vpblendw $0xaa, %ymm4, %ymm8, %ymm4 # ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7],ymm8[8],ymm4[9],ymm8[10],ymm4[11],ymm8[12],ymm4[13],ymm8[14],ymm4[15]
- vpslld $0x10, %ymm3, %ymm8
- vpblendw $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7],ymm7[8],ymm8[9],ymm7[10],ymm8[11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
- vpsrld $0x10, %ymm7, %ymm7
- vpblendw $0xaa, %ymm3, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7],ymm7[8],ymm3[9],ymm7[10],ymm3[11],ymm7[12],ymm3[13],ymm7[14],ymm3[15]
- vpslld $0x10, %ymm11, %ymm7
- vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
- vmovdqa %ymm10, (%rdi)
- vmovdqa %ymm5, 0x20(%rdi)
- vmovdqa %ymm9, 0x40(%rdi)
- vmovdqa %ymm4, 0x60(%rdi)
- vmovdqa %ymm8, 0x80(%rdi)
- vmovdqa %ymm3, 0xa0(%rdi)
- vmovdqa %ymm7, 0xc0(%rdi)
- vmovdqa %ymm11, 0xe0(%rdi)
- retq
+MLK_ASM_FN_SIZE(nttunpack_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d10.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d10.S
new file mode 100644
index 0000000000..90b4cf8bf0
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d10.S
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_compress_d10_avx2
+ *
+ * Description: Compression of a polynomial to 10 bits per coefficient.
+ *
+ * Arguments: - uint8_t *r: pointer to output byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D10)
+ * - const int16_t *a: pointer to input polynomial
+ * - const uint8_t *data: pointer to shufbidx constant
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2 || MLKEM_K == 3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_compress_d10.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_compress_d10_avx2)
+MLK_ASM_FN_SYMBOL(poly_compress_d10_avx2)
+
+ .cfi_startproc
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vpsllw $0x3, %ymm0, %ymm1
+ movl $0xf000f, %eax # imm = 0xF000F
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x10001000, %eax # imm = 0x10001000
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ movl $0x3ff03ff, %eax # imm = 0x3FF03FF
+ vmovd %eax, %xmm4
+ vpbroadcastd %xmm4, %ymm4
+ movabsq $0x400000104000001, %rax # imm = 0x400000104000001
+ vmovq %rax, %xmm5
+ vpbroadcastq %xmm5, %ymm5
+ movl $0xc, %eax
+ vmovq %rax, %xmm6
+ vpbroadcastq %xmm6, %ymm6
+ vmovdqa (%rdx), %ymm7
+ vmovdqa (%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, (%rdi)
+ vmovd %xmm9, 0x10(%rdi)
+ vmovdqa 0x20(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x14(%rdi)
+ vmovd %xmm9, 0x24(%rdi)
+ vmovdqa 0x40(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x28(%rdi)
+ vmovd %xmm9, 0x38(%rdi)
+ vmovdqa 0x60(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x3c(%rdi)
+ vmovd %xmm9, 0x4c(%rdi)
+ vmovdqa 0x80(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x50(%rdi)
+ vmovd %xmm9, 0x60(%rdi)
+ vmovdqa 0xa0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x64(%rdi)
+ vmovd %xmm9, 0x74(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x78(%rdi)
+ vmovd %xmm9, 0x88(%rdi)
+ vmovdqa 0xe0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x8c(%rdi)
+ vmovd %xmm9, 0x9c(%rdi)
+ vmovdqa 0x100(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0xa0(%rdi)
+ vmovd %xmm9, 0xb0(%rdi)
+ vmovdqa 0x120(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0xb4(%rdi)
+ vmovd %xmm9, 0xc4(%rdi)
+ vmovdqa 0x140(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0xc8(%rdi)
+ vmovd %xmm9, 0xd8(%rdi)
+ vmovdqa 0x160(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0xdc(%rdi)
+ vmovd %xmm9, 0xec(%rdi)
+ vmovdqa 0x180(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0xf0(%rdi)
+ vmovd %xmm9, 0x100(%rdi)
+ vmovdqa 0x1a0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x104(%rdi)
+ vmovd %xmm9, 0x114(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x118(%rdi)
+ vmovd %xmm9, 0x128(%rdi)
+ vmovdqa 0x1e0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x12c(%rdi)
+ vmovd %xmm9, 0x13c(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_compress_d10_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == \
+ 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d11.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d11.S
new file mode 100644
index 0000000000..f26a420ec0
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d11.S
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_compress_d11_avx2
+ *
+ * Description: Compression of a polynomial to 11 bits per coefficient.
+ *
+ * Arguments: - uint8_t *r: pointer to output byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D11)
+ * - const int16_t *a: pointer to input polynomial
+ * - const uint8_t *data: pointer to constants
+ * (srlvqidx[0:32], shufbidx[32:64])
+ **************************************************/
+
+#include "../../../common.h"
+
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_compress_d11.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_compress_d11_avx2)
+MLK_ASM_FN_SYMBOL(poly_compress_d11_avx2)
+
+ .cfi_startproc
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vpsllw $0x3, %ymm0, %ymm1
+ movl $0x240024, %eax # imm = 0x240024
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x20002000, %eax # imm = 0x20002000
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ movl $0x7ff07ff, %eax # imm = 0x7FF07FF
+ vmovd %eax, %xmm4
+ vpbroadcastd %xmm4, %ymm4
+ movabsq $0x800000108000001, %rax # imm = 0x800000108000001
+ vmovq %rax, %xmm5
+ vpbroadcastq %xmm5, %ymm5
+ movl $0xa, %eax
+ vmovq %rax, %xmm6
+ vpbroadcastq %xmm6, %ymm6
+ vmovdqa (%rdx), %ymm7
+ vmovdqa 0x20(%rdx), %ymm8
+ vmovdqa (%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, (%rdi)
+ vmovd %xmm10, 0x10(%rdi)
+ vpextrw $0x2, %xmm10, 0x14(%rdi)
+ vmovdqa 0x20(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x16(%rdi)
+ vmovd %xmm10, 0x26(%rdi)
+ vpextrw $0x2, %xmm10, 0x2a(%rdi)
+ vmovdqa 0x40(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x2c(%rdi)
+ vmovd %xmm10, 0x3c(%rdi)
+ vpextrw $0x2, %xmm10, 0x40(%rdi)
+ vmovdqa 0x60(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x42(%rdi)
+ vmovd %xmm10, 0x52(%rdi)
+ vpextrw $0x2, %xmm10, 0x56(%rdi)
+ vmovdqa 0x80(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x58(%rdi)
+ vmovd %xmm10, 0x68(%rdi)
+ vpextrw $0x2, %xmm10, 0x6c(%rdi)
+ vmovdqa 0xa0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x6e(%rdi)
+ vmovd %xmm10, 0x7e(%rdi)
+ vpextrw $0x2, %xmm10, 0x82(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x84(%rdi)
+ vmovd %xmm10, 0x94(%rdi)
+ vpextrw $0x2, %xmm10, 0x98(%rdi)
+ vmovdqa 0xe0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x9a(%rdi)
+ vmovd %xmm10, 0xaa(%rdi)
+ vpextrw $0x2, %xmm10, 0xae(%rdi)
+ vmovdqa 0x100(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0xb0(%rdi)
+ vmovd %xmm10, 0xc0(%rdi)
+ vpextrw $0x2, %xmm10, 0xc4(%rdi)
+ vmovdqa 0x120(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0xc6(%rdi)
+ vmovd %xmm10, 0xd6(%rdi)
+ vpextrw $0x2, %xmm10, 0xda(%rdi)
+ vmovdqa 0x140(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0xdc(%rdi)
+ vmovd %xmm10, 0xec(%rdi)
+ vpextrw $0x2, %xmm10, 0xf0(%rdi)
+ vmovdqa 0x160(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0xf2(%rdi)
+ vmovd %xmm10, 0x102(%rdi)
+ vpextrw $0x2, %xmm10, 0x106(%rdi)
+ vmovdqa 0x180(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x108(%rdi)
+ vmovd %xmm10, 0x118(%rdi)
+ vpextrw $0x2, %xmm10, 0x11c(%rdi)
+ vmovdqa 0x1a0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x11e(%rdi)
+ vmovd %xmm10, 0x12e(%rdi)
+ vpextrw $0x2, %xmm10, 0x132(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x134(%rdi)
+ vmovd %xmm10, 0x144(%rdi)
+ vpextrw $0x2, %xmm10, 0x148(%rdi)
+ vmovdqa 0x1e0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x14a(%rdi)
+ vmovd %xmm10, 0x15a(%rdi)
+ vpextrw $0x2, %xmm10, 0x15e(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_compress_d11_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d4.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d4.S
new file mode 100644
index 0000000000..b4ca46e56b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d4.S
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_compress_d4_avx2
+ *
+ * Description: Compression of a polynomial to 4 bits per coefficient.
+ *
+ * Arguments: - uint8_t *r: pointer to output byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D4)
+ * - const int16_t *a: pointer to input polynomial
+ * - const uint8_t *data: pointer to permdidx constant
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2 || MLKEM_K == 3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_compress_d4.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_compress_d4_avx2)
+MLK_ASM_FN_SYMBOL(poly_compress_d4_avx2)
+
+ .cfi_startproc
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x2000200, %eax # imm = 0x2000200
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ movl $0xf000f, %eax # imm = 0xF000F
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x10011001, %eax # imm = 0x10011001
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ vmovdqa (%rdx), %ymm4
+ vmovdqa (%rsi), %ymm5
+ vmovdqa 0x20(%rsi), %ymm6
+ vmovdqa 0x40(%rsi), %ymm7
+ vmovdqa 0x60(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm5, %ymm5
+ vpmulhrsw %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm5, %ymm5
+ vpand %ymm2, %ymm6, %ymm6
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm6, %ymm5, %ymm5
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm5, %ymm5
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpackuswb %ymm7, %ymm5, %ymm5
+ vpermd %ymm5, %ymm4, %ymm5
+ vmovdqu %ymm5, (%rdi)
+ vmovdqa 0x80(%rsi), %ymm5
+ vmovdqa 0xa0(%rsi), %ymm6
+ vmovdqa 0xc0(%rsi), %ymm7
+ vmovdqa 0xe0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm5, %ymm5
+ vpmulhrsw %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm5, %ymm5
+ vpand %ymm2, %ymm6, %ymm6
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm6, %ymm5, %ymm5
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm5, %ymm5
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpackuswb %ymm7, %ymm5, %ymm5
+ vpermd %ymm5, %ymm4, %ymm5
+ vmovdqu %ymm5, 0x20(%rdi)
+ vmovdqa 0x100(%rsi), %ymm5
+ vmovdqa 0x120(%rsi), %ymm6
+ vmovdqa 0x140(%rsi), %ymm7
+ vmovdqa 0x160(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm5, %ymm5
+ vpmulhrsw %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm5, %ymm5
+ vpand %ymm2, %ymm6, %ymm6
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm6, %ymm5, %ymm5
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm5, %ymm5
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpackuswb %ymm7, %ymm5, %ymm5
+ vpermd %ymm5, %ymm4, %ymm5
+ vmovdqu %ymm5, 0x40(%rdi)
+ vmovdqa 0x180(%rsi), %ymm5
+ vmovdqa 0x1a0(%rsi), %ymm6
+ vmovdqa 0x1c0(%rsi), %ymm7
+ vmovdqa 0x1e0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm5, %ymm5
+ vpmulhrsw %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm5, %ymm5
+ vpand %ymm2, %ymm6, %ymm6
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm6, %ymm5, %ymm5
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm5, %ymm5
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpackuswb %ymm7, %ymm5, %ymm5
+ vpermd %ymm5, %ymm4, %ymm5
+ vmovdqu %ymm5, 0x60(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_compress_d4_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == \
+ 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d5.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d5.S
new file mode 100644
index 0000000000..c1cb3a6032
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_compress_d5.S
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_compress_d5_avx2
+ *
+ * Description: Compression of a polynomial to 5 bits per coefficient.
+ *
+ * Arguments: - uint8_t *r: pointer to output byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D5)
+ * - const int16_t *a: pointer to input polynomial
+ * - const uint8_t *data: pointer to shufbidx constant
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_compress_d5.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_compress_d5_avx2)
+MLK_ASM_FN_SYMBOL(poly_compress_d5_avx2)
+
+ .cfi_startproc
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x4000400, %eax # imm = 0x4000400
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ movl $0x1f001f, %eax # imm = 0x1F001F
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x20012001, %eax # imm = 0x20012001
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ movl $0x4000001, %eax # imm = 0x4000001
+ vmovd %eax, %xmm4
+ vpbroadcastd %xmm4, %ymm4
+ movl $0xc, %eax
+ vmovq %rax, %xmm5
+ vpbroadcastq %xmm5, %ymm5
+ vmovdqa (%rdx), %ymm6
+ vmovdqa (%rsi), %ymm7
+ vmovdqa 0x20(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, (%rdi)
+ vmovd %xmm8, 0x10(%rdi)
+ vmovdqa 0x40(%rsi), %ymm7
+ vmovdqa 0x60(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x14(%rdi)
+ vmovd %xmm8, 0x24(%rdi)
+ vmovdqa 0x80(%rsi), %ymm7
+ vmovdqa 0xa0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x28(%rdi)
+ vmovd %xmm8, 0x38(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm7
+ vmovdqa 0xe0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x3c(%rdi)
+ vmovd %xmm8, 0x4c(%rdi)
+ vmovdqa 0x100(%rsi), %ymm7
+ vmovdqa 0x120(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x50(%rdi)
+ vmovd %xmm8, 0x60(%rdi)
+ vmovdqa 0x140(%rsi), %ymm7
+ vmovdqa 0x160(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x64(%rdi)
+ vmovd %xmm8, 0x74(%rdi)
+ vmovdqa 0x180(%rsi), %ymm7
+ vmovdqa 0x1a0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x78(%rdi)
+ vmovd %xmm8, 0x88(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm7
+ vmovdqa 0x1e0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x8c(%rdi)
+ vmovd %xmm8, 0x9c(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_compress_d5_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d10.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d10.S
new file mode 100644
index 0000000000..27412b18c4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d10.S
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_decompress_d10_avx2
+ *
+ * Description: Decompression of a polynomial from 10 bits per coefficient.
+ *
+ * Arguments: - int16_t *r: pointer to output polynomial
+ * - const uint8_t *a: pointer to input byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D10)
+ * - const uint8_t *data: pointer to shufbidx constant
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2 || MLKEM_K == 3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_decompress_d10.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_decompress_d10_avx2)
+MLK_ASM_FN_SYMBOL(poly_decompress_d10_avx2)
+
+ .cfi_startproc
+ movl $0xd013404, %eax # imm = 0xD013404
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x4, %eax
+ vmovq %rax, %xmm1
+ vpbroadcastq %xmm1, %ymm1
+ movl $0x7fe01ff8, %eax # imm = 0x7FE01FF8
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ vmovdqa (%rdx), %ymm3
+ vmovdqu (%rsi), %xmm4
+ vmovd 0x10(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu 0x14(%rsi), %xmm4
+ vmovd 0x24(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x20(%rdi)
+ vmovdqu 0x28(%rsi), %xmm4
+ vmovd 0x38(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x40(%rdi)
+ vmovdqu 0x3c(%rsi), %xmm4
+ vmovd 0x4c(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x60(%rdi)
+ vmovdqu 0x50(%rsi), %xmm4
+ vmovd 0x60(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x80(%rdi)
+ vmovdqu 0x64(%rsi), %xmm4
+ vmovd 0x74(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xa0(%rdi)
+ vmovdqu 0x78(%rsi), %xmm4
+ vmovd 0x88(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xc0(%rdi)
+ vmovdqu 0x8c(%rsi), %xmm4
+ vmovd 0x9c(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xe0(%rdi)
+ vmovdqu 0xa0(%rsi), %xmm4
+ vmovd 0xb0(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x100(%rdi)
+ vmovdqu 0xb4(%rsi), %xmm4
+ vmovd 0xc4(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x120(%rdi)
+ vmovdqu 0xc8(%rsi), %xmm4
+ vmovd 0xd8(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x140(%rdi)
+ vmovdqu 0xdc(%rsi), %xmm4
+ vmovd 0xec(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x160(%rdi)
+ vmovdqu 0xf0(%rsi), %xmm4
+ vmovd 0x100(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x180(%rdi)
+ vmovdqu 0x104(%rsi), %xmm4
+ vmovd 0x114(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1a0(%rdi)
+ vmovdqu 0x118(%rsi), %xmm4
+ vmovd 0x128(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1c0(%rdi)
+ vmovdqu 0x12c(%rsi), %xmm4
+ vmovd 0x13c(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_decompress_d10_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == \
+ 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d11.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d11.S
new file mode 100644
index 0000000000..67ef58e225
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d11.S
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_decompress_d11_avx2
+ *
+ * Description: Decompression of a polynomial from 11 bits per coefficient.
+ *
+ * Arguments: - int16_t *r: pointer to output polynomial
+ * - const uint8_t *a: pointer to input byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D11)
+ * - const uint8_t *data: pointer to constants
+ * (shufbidx[0:32], srlvdidx[32:64],
+ * srlvqidx[64:96], shift[96:128])
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_decompress_d11.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_decompress_d11_avx2)
+MLK_ASM_FN_SYMBOL(poly_decompress_d11_avx2)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x7ff07ff0, %eax # imm = 0x7FF07FF0
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vmovdqa (%rdx), %ymm2
+ vmovdqa 0x20(%rdx), %ymm3
+ vmovdqa 0x40(%rdx), %ymm4
+ vmovdqa 0x60(%rdx), %ymm5
+ vmovdqu (%rsi), %xmm6
+ vmovd 0x10(%rsi), %xmm7
+ vpinsrw $0x2, 0x14(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, (%rdi)
+ vmovdqu 0x16(%rsi), %xmm6
+ vmovd 0x26(%rsi), %xmm7
+ vpinsrw $0x2, 0x2a(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x20(%rdi)
+ vmovdqu 0x2c(%rsi), %xmm6
+ vmovd 0x3c(%rsi), %xmm7
+ vpinsrw $0x2, 0x40(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x40(%rdi)
+ vmovdqu 0x42(%rsi), %xmm6
+ vmovd 0x52(%rsi), %xmm7
+ vpinsrw $0x2, 0x56(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x60(%rdi)
+ vmovdqu 0x58(%rsi), %xmm6
+ vmovd 0x68(%rsi), %xmm7
+ vpinsrw $0x2, 0x6c(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x80(%rdi)
+ vmovdqu 0x6e(%rsi), %xmm6
+ vmovd 0x7e(%rsi), %xmm7
+ vpinsrw $0x2, 0x82(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0xa0(%rdi)
+ vmovdqu 0x84(%rsi), %xmm6
+ vmovd 0x94(%rsi), %xmm7
+ vpinsrw $0x2, 0x98(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0xc0(%rdi)
+ vmovdqu 0x9a(%rsi), %xmm6
+ vmovd 0xaa(%rsi), %xmm7
+ vpinsrw $0x2, 0xae(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0xe0(%rdi)
+ vmovdqu 0xb0(%rsi), %xmm6
+ vmovd 0xc0(%rsi), %xmm7
+ vpinsrw $0x2, 0xc4(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x100(%rdi)
+ vmovdqu 0xc6(%rsi), %xmm6
+ vmovd 0xd6(%rsi), %xmm7
+ vpinsrw $0x2, 0xda(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x120(%rdi)
+ vmovdqu 0xdc(%rsi), %xmm6
+ vmovd 0xec(%rsi), %xmm7
+ vpinsrw $0x2, 0xf0(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x140(%rdi)
+ vmovdqu 0xf2(%rsi), %xmm6
+ vmovd 0x102(%rsi), %xmm7
+ vpinsrw $0x2, 0x106(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x160(%rdi)
+ vmovdqu 0x108(%rsi), %xmm6
+ vmovd 0x118(%rsi), %xmm7
+ vpinsrw $0x2, 0x11c(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x180(%rdi)
+ vmovdqu 0x11e(%rsi), %xmm6
+ vmovd 0x12e(%rsi), %xmm7
+ vpinsrw $0x2, 0x132(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x1a0(%rdi)
+ vmovdqu 0x134(%rsi), %xmm6
+ vmovd 0x144(%rsi), %xmm7
+ vpinsrw $0x2, 0x148(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x1c0(%rdi)
+ vmovdqu 0x14a(%rsi), %xmm6
+ vmovd 0x15a(%rsi), %xmm7
+ vpinsrw $0x2, 0x15e(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_decompress_d11_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d4.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d4.S
new file mode 100644
index 0000000000..765a850c22
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d4.S
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_decompress_d4_avx2
+ *
+ * Description: Decompression of a polynomial from 4 bits per coefficient.
+ *
+ * Arguments: - int16_t *r: pointer to output polynomial
+ * - const uint8_t *a: pointer to input byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D4)
+ * - const int8_t *data: pointer to shufbidx constant
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2 || MLKEM_K == 3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_decompress_d4.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_decompress_d4_avx2)
+MLK_ASM_FN_SYMBOL(poly_decompress_d4_avx2)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0xf0000f, %eax # imm = 0xF0000F
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ movl $0x800800, %eax # imm = 0x800800
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ vmovdqa (%rdx), %ymm3
+ vmovq (%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, (%rdi)
+ vmovq 0x8(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x20(%rdi)
+ vmovq 0x10(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x40(%rdi)
+ vmovq 0x18(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x60(%rdi)
+ vmovq 0x20(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x80(%rdi)
+ vmovq 0x28(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xa0(%rdi)
+ vmovq 0x30(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xc0(%rdi)
+ vmovq 0x38(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xe0(%rdi)
+ vmovq 0x40(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x100(%rdi)
+ vmovq 0x48(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x120(%rdi)
+ vmovq 0x50(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x140(%rdi)
+ vmovq 0x58(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x160(%rdi)
+ vmovq 0x60(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x180(%rdi)
+ vmovq 0x68(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1a0(%rdi)
+ vmovq 0x70(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1c0(%rdi)
+ vmovq 0x78(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_decompress_d4_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == \
+ 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d5.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d5.S
new file mode 100644
index 0000000000..3108d6b17e
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d5.S
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_decompress_d5_avx2
+ *
+ * Description: Decompression of a polynomial from 5 bits per coefficient.
+ *
+ * Arguments: - int16_t *r: pointer to output polynomial
+ * - const uint8_t *a: pointer to input byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D5)
+ * - const uint8_t *data: pointer to constants
+ * (shufbidx[0:32], mask[32:64], shift[64:96])
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_decompress_d5.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_decompress_d5_avx2)
+MLK_ASM_FN_SYMBOL(poly_decompress_d5_avx2)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vmovdqa (%rdx), %ymm1
+ vmovdqa 0x20(%rdx), %ymm2
+ vmovdqa 0x40(%rdx), %ymm3
+ vmovq (%rsi), %xmm4
+ vpinsrw $0x4, 0x8(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, (%rdi)
+ vmovq 0xa(%rsi), %xmm4
+ vpinsrw $0x4, 0x12(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x20(%rdi)
+ vmovq 0x14(%rsi), %xmm4
+ vpinsrw $0x4, 0x1c(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x40(%rdi)
+ vmovq 0x1e(%rsi), %xmm4
+ vpinsrw $0x4, 0x26(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x60(%rdi)
+ vmovq 0x28(%rsi), %xmm4
+ vpinsrw $0x4, 0x30(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x80(%rdi)
+ vmovq 0x32(%rsi), %xmm4
+ vpinsrw $0x4, 0x3a(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xa0(%rdi)
+ vmovq 0x3c(%rsi), %xmm4
+ vpinsrw $0x4, 0x44(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xc0(%rdi)
+ vmovq 0x46(%rsi), %xmm4
+ vpinsrw $0x4, 0x4e(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xe0(%rdi)
+ vmovq 0x50(%rsi), %xmm4
+ vpinsrw $0x4, 0x58(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x100(%rdi)
+ vmovq 0x5a(%rsi), %xmm4
+ vpinsrw $0x4, 0x62(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x120(%rdi)
+ vmovq 0x64(%rsi), %xmm4
+ vpinsrw $0x4, 0x6c(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x140(%rdi)
+ vmovq 0x6e(%rsi), %xmm4
+ vpinsrw $0x4, 0x76(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x160(%rdi)
+ vmovq 0x78(%rsi), %xmm4
+ vpinsrw $0x4, 0x80(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x180(%rdi)
+ vmovq 0x82(%rsi), %xmm4
+ vpinsrw $0x4, 0x8a(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1a0(%rdi)
+ vmovq 0x8c(%rsi), %xmm4
+ vpinsrw $0x4, 0x94(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1c0(%rdi)
+ vmovq 0x96(%rsi), %xmm4
+ vpinsrw $0x4, 0x9e(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_decompress_d5_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S
new file mode 100644
index 0000000000..af75ec5d3b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
+MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k2)
+
+        .cfi_startproc
+        movl $0xd010d01, %eax # imm = 0xD010D01; 0x0d01 = 3329 = ML-KEM modulus q, twice per dword
+        vmovd %eax, %xmm0
+        vpbroadcastd %xmm0, %ymm0 # ymm0 = q in every 16-bit lane
+        movl $0xf301f301, %eax # imm = 0xF301F301; 0xf301 = 62209 = q^-1 mod 2^16 (Montgomery qinv)
+        vmovd %eax, %xmm1
+        vpbroadcastd %xmm1, %ymm1 # ymm1 = qinv in every 16-bit lane
+        vmovdqa (%rsi), %ymm2 # poly 0, coeffs 0..31: a halves in ymm2/ymm3
+        vmovdqa 0x20(%rsi), %ymm3
+        vmovdqa (%rdx), %ymm4 # matching b halves in ymm4/ymm5
+        vmovdqa 0x20(%rdx), %ymm5
+        vmovdqa (%rcx), %ymm6 # b_cache: 16 precomputed twiddle*b words per 32-coeff block
+        vpmullw %ymm2, %ymm1, %ymm13 # lo(a*qinv) -- Montgomery pre-multiplication
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7 # m = lo(a*b*qinv) for each of the four products
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7 # correction term hi(m*q)
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11 # hi(a*b)
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7 # montgomery_reduce: hi(a*b) - hi(m*q)
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7 # combine the two basemul partial products
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, (%rdi)
+        vmovdqa %ymm9, 0x20(%rdi)
+        vmovdqa 0x40(%rsi), %ymm2 # poly 0, coeffs 32..63
+        vmovdqa 0x60(%rsi), %ymm3
+        vmovdqa 0x40(%rdx), %ymm4
+        vmovdqa 0x60(%rdx), %ymm5
+        vmovdqa 0x20(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8 # NOTE: operand order flipped vs previous block -- presumably the alternating twiddle sign of the basemul; the flip recurs on every second block below. Intentional, do not "fix".
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x40(%rdi)
+        vmovdqa %ymm9, 0x60(%rdi)
+        vmovdqa 0x80(%rsi), %ymm2 # poly 0, coeffs 64..95
+        vmovdqa 0xa0(%rsi), %ymm3
+        vmovdqa 0x80(%rdx), %ymm4
+        vmovdqa 0xa0(%rdx), %ymm5
+        vmovdqa 0x40(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x80(%rdi)
+        vmovdqa %ymm9, 0xa0(%rdi)
+        vmovdqa 0xc0(%rsi), %ymm2 # poly 0, coeffs 96..127
+        vmovdqa 0xe0(%rsi), %ymm3
+        vmovdqa 0xc0(%rdx), %ymm4
+        vmovdqa 0xe0(%rdx), %ymm5
+        vmovdqa 0x60(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0xc0(%rdi)
+        vmovdqa %ymm9, 0xe0(%rdi)
+        vmovdqa 0x100(%rsi), %ymm2 # poly 0, coeffs 128..159
+        vmovdqa 0x120(%rsi), %ymm3
+        vmovdqa 0x100(%rdx), %ymm4
+        vmovdqa 0x120(%rdx), %ymm5
+        vmovdqa 0x80(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x100(%rdi)
+        vmovdqa %ymm9, 0x120(%rdi)
+        vmovdqa 0x140(%rsi), %ymm2 # poly 0, coeffs 160..191
+        vmovdqa 0x160(%rsi), %ymm3
+        vmovdqa 0x140(%rdx), %ymm4
+        vmovdqa 0x160(%rdx), %ymm5
+        vmovdqa 0xa0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x140(%rdi)
+        vmovdqa %ymm9, 0x160(%rdi)
+        vmovdqa 0x180(%rsi), %ymm2 # poly 0, coeffs 192..223
+        vmovdqa 0x1a0(%rsi), %ymm3
+        vmovdqa 0x180(%rdx), %ymm4
+        vmovdqa 0x1a0(%rdx), %ymm5
+        vmovdqa 0xc0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x180(%rdi)
+        vmovdqa %ymm9, 0x1a0(%rdi)
+        vmovdqa 0x1c0(%rsi), %ymm2 # poly 0, coeffs 224..255
+        vmovdqa 0x1e0(%rsi), %ymm3
+        vmovdqa 0x1c0(%rdx), %ymm4
+        vmovdqa 0x1e0(%rdx), %ymm5
+        vmovdqa 0xe0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x1c0(%rdi)
+        vmovdqa %ymm9, 0x1e0(%rdi)
+        vmovdqa 0x200(%rsi), %ymm2 # poly 1, coeffs 0..31: from here on, accumulate into r
+        vmovdqa 0x220(%rsi), %ymm3
+        vmovdqa 0x200(%rdx), %ymm4
+        vmovdqa 0x220(%rdx), %ymm5
+        vmovdqa 0x100(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa (%rdi), %ymm8 # reload poly-0 result and add poly-1 contribution
+        vmovdqa 0x20(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, (%rdi)
+        vmovdqa %ymm9, 0x20(%rdi)
+        vmovdqa 0x240(%rsi), %ymm2 # poly 1, coeffs 32..63
+        vmovdqa 0x260(%rsi), %ymm3
+        vmovdqa 0x240(%rdx), %ymm4
+        vmovdqa 0x260(%rdx), %ymm5
+        vmovdqa 0x120(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x40(%rdi), %ymm8
+        vmovdqa 0x60(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x40(%rdi)
+        vmovdqa %ymm9, 0x60(%rdi)
+        vmovdqa 0x280(%rsi), %ymm2 # poly 1, coeffs 64..95
+        vmovdqa 0x2a0(%rsi), %ymm3
+        vmovdqa 0x280(%rdx), %ymm4
+        vmovdqa 0x2a0(%rdx), %ymm5
+        vmovdqa 0x140(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x80(%rdi), %ymm8
+        vmovdqa 0xa0(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x80(%rdi)
+        vmovdqa %ymm9, 0xa0(%rdi)
+        vmovdqa 0x2c0(%rsi), %ymm2 # poly 1, coeffs 96..127
+        vmovdqa 0x2e0(%rsi), %ymm3
+        vmovdqa 0x2c0(%rdx), %ymm4
+        vmovdqa 0x2e0(%rdx), %ymm5
+        vmovdqa 0x160(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0xc0(%rdi), %ymm8
+        vmovdqa 0xe0(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0xc0(%rdi)
+        vmovdqa %ymm9, 0xe0(%rdi)
+        vmovdqa 0x300(%rsi), %ymm2 # poly 1, coeffs 128..159
+        vmovdqa 0x320(%rsi), %ymm3
+        vmovdqa 0x300(%rdx), %ymm4
+        vmovdqa 0x320(%rdx), %ymm5
+        vmovdqa 0x180(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x100(%rdi), %ymm8
+        vmovdqa 0x120(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x100(%rdi)
+        vmovdqa %ymm9, 0x120(%rdi)
+        vmovdqa 0x340(%rsi), %ymm2 # poly 1, coeffs 160..191
+        vmovdqa 0x360(%rsi), %ymm3
+        vmovdqa 0x340(%rdx), %ymm4
+        vmovdqa 0x360(%rdx), %ymm5
+        vmovdqa 0x1a0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x140(%rdi), %ymm8
+        vmovdqa 0x160(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x140(%rdi)
+        vmovdqa %ymm9, 0x160(%rdi)
+        vmovdqa 0x380(%rsi), %ymm2 # poly 1, coeffs 192..223
+        vmovdqa 0x3a0(%rsi), %ymm3
+        vmovdqa 0x380(%rdx), %ymm4
+        vmovdqa 0x3a0(%rdx), %ymm5
+        vmovdqa 0x1c0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x180(%rdi), %ymm8
+        vmovdqa 0x1a0(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x180(%rdi)
+        vmovdqa %ymm9, 0x1a0(%rdi)
+        vmovdqa 0x3c0(%rsi), %ymm2 # poly 1, coeffs 224..255 (final block)
+        vmovdqa 0x3e0(%rsi), %ymm3
+        vmovdqa 0x3c0(%rdx), %ymm4
+        vmovdqa 0x3e0(%rdx), %ymm5
+        vmovdqa 0x1e0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x1c0(%rdi), %ymm8
+        vmovdqa 0x1e0(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x1c0(%rdi)
+        vmovdqa %ymm9, 0x1e0(%rdi)
+        retq
+        .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S
new file mode 100644
index 0000000000..931bfd63ea
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
+MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k3)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0xf301f301, %eax # imm = 0xF301F301
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vmovdqa (%rsi), %ymm2
+ vmovdqa 0x20(%rsi), %ymm3
+ vmovdqa (%rdx), %ymm4
+ vmovdqa 0x20(%rdx), %ymm5
+ vmovdqa (%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x40(%rsi), %ymm2
+ vmovdqa 0x60(%rsi), %ymm3
+ vmovdqa 0x40(%rdx), %ymm4
+ vmovdqa 0x60(%rdx), %ymm5
+ vmovdqa 0x20(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x80(%rsi), %ymm2
+ vmovdqa 0xa0(%rsi), %ymm3
+ vmovdqa 0x80(%rdx), %ymm4
+ vmovdqa 0xa0(%rdx), %ymm5
+ vmovdqa 0x40(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm2
+ vmovdqa 0xe0(%rsi), %ymm3
+ vmovdqa 0xc0(%rdx), %ymm4
+ vmovdqa 0xe0(%rdx), %ymm5
+ vmovdqa 0x60(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x100(%rsi), %ymm2
+ vmovdqa 0x120(%rsi), %ymm3
+ vmovdqa 0x100(%rdx), %ymm4
+ vmovdqa 0x120(%rdx), %ymm5
+ vmovdqa 0x80(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x140(%rsi), %ymm2
+ vmovdqa 0x160(%rsi), %ymm3
+ vmovdqa 0x140(%rdx), %ymm4
+ vmovdqa 0x160(%rdx), %ymm5
+ vmovdqa 0xa0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x180(%rsi), %ymm2
+ vmovdqa 0x1a0(%rsi), %ymm3
+ vmovdqa 0x180(%rdx), %ymm4
+ vmovdqa 0x1a0(%rdx), %ymm5
+ vmovdqa 0xc0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm2
+ vmovdqa 0x1e0(%rsi), %ymm3
+ vmovdqa 0x1c0(%rdx), %ymm4
+ vmovdqa 0x1e0(%rdx), %ymm5
+ vmovdqa 0xe0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x200(%rsi), %ymm2
+ vmovdqa 0x220(%rsi), %ymm3
+ vmovdqa 0x200(%rdx), %ymm4
+ vmovdqa 0x220(%rdx), %ymm5
+ vmovdqa 0x100(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x240(%rsi), %ymm2
+ vmovdqa 0x260(%rsi), %ymm3
+ vmovdqa 0x240(%rdx), %ymm4
+ vmovdqa 0x260(%rdx), %ymm5
+ vmovdqa 0x120(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x280(%rsi), %ymm2
+ vmovdqa 0x2a0(%rsi), %ymm3
+ vmovdqa 0x280(%rdx), %ymm4
+ vmovdqa 0x2a0(%rdx), %ymm5
+ vmovdqa 0x140(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x2c0(%rsi), %ymm2
+ vmovdqa 0x2e0(%rsi), %ymm3
+ vmovdqa 0x2c0(%rdx), %ymm4
+ vmovdqa 0x2e0(%rdx), %ymm5
+ vmovdqa 0x160(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x300(%rsi), %ymm2
+ vmovdqa 0x320(%rsi), %ymm3
+ vmovdqa 0x300(%rdx), %ymm4
+ vmovdqa 0x320(%rdx), %ymm5
+ vmovdqa 0x180(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x340(%rsi), %ymm2
+ vmovdqa 0x360(%rsi), %ymm3
+ vmovdqa 0x340(%rdx), %ymm4
+ vmovdqa 0x360(%rdx), %ymm5
+ vmovdqa 0x1a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x380(%rsi), %ymm2
+ vmovdqa 0x3a0(%rsi), %ymm3
+ vmovdqa 0x380(%rdx), %ymm4
+ vmovdqa 0x3a0(%rdx), %ymm5
+ vmovdqa 0x1c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x3c0(%rsi), %ymm2
+ vmovdqa 0x3e0(%rsi), %ymm3
+ vmovdqa 0x3c0(%rdx), %ymm4
+ vmovdqa 0x3e0(%rdx), %ymm5
+ vmovdqa 0x1e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x400(%rsi), %ymm2
+ vmovdqa 0x420(%rsi), %ymm3
+ vmovdqa 0x400(%rdx), %ymm4
+ vmovdqa 0x420(%rdx), %ymm5
+ vmovdqa 0x200(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x440(%rsi), %ymm2
+ vmovdqa 0x460(%rsi), %ymm3
+ vmovdqa 0x440(%rdx), %ymm4
+ vmovdqa 0x460(%rdx), %ymm5
+ vmovdqa 0x220(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x480(%rsi), %ymm2
+ vmovdqa 0x4a0(%rsi), %ymm3
+ vmovdqa 0x480(%rdx), %ymm4
+ vmovdqa 0x4a0(%rdx), %ymm5
+ vmovdqa 0x240(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x4c0(%rsi), %ymm2
+ vmovdqa 0x4e0(%rsi), %ymm3
+ vmovdqa 0x4c0(%rdx), %ymm4
+ vmovdqa 0x4e0(%rdx), %ymm5
+ vmovdqa 0x260(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x500(%rsi), %ymm2
+ vmovdqa 0x520(%rsi), %ymm3
+ vmovdqa 0x500(%rdx), %ymm4
+ vmovdqa 0x520(%rdx), %ymm5
+ vmovdqa 0x280(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x540(%rsi), %ymm2
+ vmovdqa 0x560(%rsi), %ymm3
+ vmovdqa 0x540(%rdx), %ymm4
+ vmovdqa 0x560(%rdx), %ymm5
+ vmovdqa 0x2a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x580(%rsi), %ymm2
+ vmovdqa 0x5a0(%rsi), %ymm3
+ vmovdqa 0x580(%rdx), %ymm4
+ vmovdqa 0x5a0(%rdx), %ymm5
+ vmovdqa 0x2c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x5c0(%rsi), %ymm2
+ vmovdqa 0x5e0(%rsi), %ymm3
+ vmovdqa 0x5c0(%rdx), %ymm4
+ vmovdqa 0x5e0(%rdx), %ymm5
+ vmovdqa 0x2e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k3)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S
new file mode 100644
index 0000000000..4f58578b9a
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S
@@ -0,0 +1,998 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
+MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k4)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0xf301f301, %eax # imm = 0xF301F301
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vmovdqa (%rsi), %ymm2
+ vmovdqa 0x20(%rsi), %ymm3
+ vmovdqa (%rdx), %ymm4
+ vmovdqa 0x20(%rdx), %ymm5
+ vmovdqa (%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x40(%rsi), %ymm2
+ vmovdqa 0x60(%rsi), %ymm3
+ vmovdqa 0x40(%rdx), %ymm4
+ vmovdqa 0x60(%rdx), %ymm5
+ vmovdqa 0x20(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x80(%rsi), %ymm2
+ vmovdqa 0xa0(%rsi), %ymm3
+ vmovdqa 0x80(%rdx), %ymm4
+ vmovdqa 0xa0(%rdx), %ymm5
+ vmovdqa 0x40(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm2
+ vmovdqa 0xe0(%rsi), %ymm3
+ vmovdqa 0xc0(%rdx), %ymm4
+ vmovdqa 0xe0(%rdx), %ymm5
+ vmovdqa 0x60(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x100(%rsi), %ymm2
+ vmovdqa 0x120(%rsi), %ymm3
+ vmovdqa 0x100(%rdx), %ymm4
+ vmovdqa 0x120(%rdx), %ymm5
+ vmovdqa 0x80(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x140(%rsi), %ymm2
+ vmovdqa 0x160(%rsi), %ymm3
+ vmovdqa 0x140(%rdx), %ymm4
+ vmovdqa 0x160(%rdx), %ymm5
+ vmovdqa 0xa0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x180(%rsi), %ymm2
+ vmovdqa 0x1a0(%rsi), %ymm3
+ vmovdqa 0x180(%rdx), %ymm4
+ vmovdqa 0x1a0(%rdx), %ymm5
+ vmovdqa 0xc0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm2
+ vmovdqa 0x1e0(%rsi), %ymm3
+ vmovdqa 0x1c0(%rdx), %ymm4
+ vmovdqa 0x1e0(%rdx), %ymm5
+ vmovdqa 0xe0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x200(%rsi), %ymm2
+ vmovdqa 0x220(%rsi), %ymm3
+ vmovdqa 0x200(%rdx), %ymm4
+ vmovdqa 0x220(%rdx), %ymm5
+ vmovdqa 0x100(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x240(%rsi), %ymm2
+ vmovdqa 0x260(%rsi), %ymm3
+ vmovdqa 0x240(%rdx), %ymm4
+ vmovdqa 0x260(%rdx), %ymm5
+ vmovdqa 0x120(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x280(%rsi), %ymm2
+ vmovdqa 0x2a0(%rsi), %ymm3
+ vmovdqa 0x280(%rdx), %ymm4
+ vmovdqa 0x2a0(%rdx), %ymm5
+ vmovdqa 0x140(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x2c0(%rsi), %ymm2
+ vmovdqa 0x2e0(%rsi), %ymm3
+ vmovdqa 0x2c0(%rdx), %ymm4
+ vmovdqa 0x2e0(%rdx), %ymm5
+ vmovdqa 0x160(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x300(%rsi), %ymm2
+ vmovdqa 0x320(%rsi), %ymm3
+ vmovdqa 0x300(%rdx), %ymm4
+ vmovdqa 0x320(%rdx), %ymm5
+ vmovdqa 0x180(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x340(%rsi), %ymm2
+ vmovdqa 0x360(%rsi), %ymm3
+ vmovdqa 0x340(%rdx), %ymm4
+ vmovdqa 0x360(%rdx), %ymm5
+ vmovdqa 0x1a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x380(%rsi), %ymm2
+ vmovdqa 0x3a0(%rsi), %ymm3
+ vmovdqa 0x380(%rdx), %ymm4
+ vmovdqa 0x3a0(%rdx), %ymm5
+ vmovdqa 0x1c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x3c0(%rsi), %ymm2
+ vmovdqa 0x3e0(%rsi), %ymm3
+ vmovdqa 0x3c0(%rdx), %ymm4
+ vmovdqa 0x3e0(%rdx), %ymm5
+ vmovdqa 0x1e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x400(%rsi), %ymm2
+ vmovdqa 0x420(%rsi), %ymm3
+ vmovdqa 0x400(%rdx), %ymm4
+ vmovdqa 0x420(%rdx), %ymm5
+ vmovdqa 0x200(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x440(%rsi), %ymm2
+ vmovdqa 0x460(%rsi), %ymm3
+ vmovdqa 0x440(%rdx), %ymm4
+ vmovdqa 0x460(%rdx), %ymm5
+ vmovdqa 0x220(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x480(%rsi), %ymm2
+ vmovdqa 0x4a0(%rsi), %ymm3
+ vmovdqa 0x480(%rdx), %ymm4
+ vmovdqa 0x4a0(%rdx), %ymm5
+ vmovdqa 0x240(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x4c0(%rsi), %ymm2
+ vmovdqa 0x4e0(%rsi), %ymm3
+ vmovdqa 0x4c0(%rdx), %ymm4
+ vmovdqa 0x4e0(%rdx), %ymm5
+ vmovdqa 0x260(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x500(%rsi), %ymm2
+ vmovdqa 0x520(%rsi), %ymm3
+ vmovdqa 0x500(%rdx), %ymm4
+ vmovdqa 0x520(%rdx), %ymm5
+ vmovdqa 0x280(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x540(%rsi), %ymm2
+ vmovdqa 0x560(%rsi), %ymm3
+ vmovdqa 0x540(%rdx), %ymm4
+ vmovdqa 0x560(%rdx), %ymm5
+ vmovdqa 0x2a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x580(%rsi), %ymm2
+ vmovdqa 0x5a0(%rsi), %ymm3
+ vmovdqa 0x580(%rdx), %ymm4
+ vmovdqa 0x5a0(%rdx), %ymm5
+ vmovdqa 0x2c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x5c0(%rsi), %ymm2
+ vmovdqa 0x5e0(%rsi), %ymm3
+ vmovdqa 0x5c0(%rdx), %ymm4
+ vmovdqa 0x5e0(%rdx), %ymm5
+ vmovdqa 0x2e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x600(%rsi), %ymm2
+ vmovdqa 0x620(%rsi), %ymm3
+ vmovdqa 0x600(%rdx), %ymm4
+ vmovdqa 0x620(%rdx), %ymm5
+ vmovdqa 0x300(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x640(%rsi), %ymm2
+ vmovdqa 0x660(%rsi), %ymm3
+ vmovdqa 0x640(%rdx), %ymm4
+ vmovdqa 0x660(%rdx), %ymm5
+ vmovdqa 0x320(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x680(%rsi), %ymm2
+ vmovdqa 0x6a0(%rsi), %ymm3
+ vmovdqa 0x680(%rdx), %ymm4
+ vmovdqa 0x6a0(%rdx), %ymm5
+ vmovdqa 0x340(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x6c0(%rsi), %ymm2
+ vmovdqa 0x6e0(%rsi), %ymm3
+ vmovdqa 0x6c0(%rdx), %ymm4
+ vmovdqa 0x6e0(%rdx), %ymm5
+ vmovdqa 0x360(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x700(%rsi), %ymm2
+ vmovdqa 0x720(%rsi), %ymm3
+ vmovdqa 0x700(%rdx), %ymm4
+ vmovdqa 0x720(%rdx), %ymm5
+ vmovdqa 0x380(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x740(%rsi), %ymm2
+ vmovdqa 0x760(%rsi), %ymm3
+ vmovdqa 0x740(%rdx), %ymm4
+ vmovdqa 0x760(%rdx), %ymm5
+ vmovdqa 0x3a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x780(%rsi), %ymm2
+ vmovdqa 0x7a0(%rsi), %ymm3
+ vmovdqa 0x780(%rdx), %ymm4
+ vmovdqa 0x7a0(%rdx), %ymm5
+ vmovdqa 0x3c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x7c0(%rsi), %ymm2
+ vmovdqa 0x7e0(%rsi), %ymm3
+ vmovdqa 0x7c0(%rdx), %ymm4
+ vmovdqa 0x7e0(%rdx), %ymm5
+ vmovdqa 0x3e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k4)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/reduce.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/reduce.S
index e550738705..76a249298f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/reduce.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/reduce.S
@@ -19,7 +19,8 @@
* Changes:
* - Add call to csub in reduce128_avx to produce outputs
* in [0,1,...,q-1] rather than [0,1,...,q], matching the
- * semantics of mlk_poly_reduce().
+ * semantics of mlk_poly_reduce(),
+ * - Use a macro instead of a local function call.
*/
#include "../../../common.h"
@@ -32,101 +33,186 @@
* dev/x86_64/src/reduce.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(reduce_avx2)
MLK_ASM_FN_SYMBOL(reduce_avx2)
- vmovdqa (%rsi), %ymm0
- vmovdqa 0x40(%rsi), %ymm1
- callq reduce_avx2_core
- addq $0x100, %rdi # imm = 0x100
- callq reduce_avx2_core
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vmovdqa (%rdi), %ymm2
+ vmovdqa 0x20(%rdi), %ymm3
+ vmovdqa 0x40(%rdi), %ymm4
+ vmovdqa 0x60(%rdi), %ymm5
+ vmovdqa 0x80(%rdi), %ymm6
+ vmovdqa 0xa0(%rdi), %ymm7
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm9
+ vpmulhw %ymm1, %ymm2, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm2, %ymm2
+ vpmulhw %ymm1, %ymm3, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpmulhw %ymm1, %ymm4, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmulhw %ymm1, %ymm5, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm5, %ymm5
+ vpmulhw %ymm1, %ymm6, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpmulhw %ymm1, %ymm7, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vpmulhw %ymm1, %ymm8, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpmulhw %ymm1, %ymm9, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpsubw %ymm0, %ymm2, %ymm2
+ vpsraw $0xf, %ymm2, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm2, %ymm2
+ vpsubw %ymm0, %ymm3, %ymm3
+ vpsraw $0xf, %ymm3, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm3, %ymm3
+ vpsubw %ymm0, %ymm4, %ymm4
+ vpsraw $0xf, %ymm4, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm4, %ymm4
+ vpsubw %ymm0, %ymm5, %ymm5
+ vpsraw $0xf, %ymm5, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm5, %ymm5
+ vpsubw %ymm0, %ymm6, %ymm6
+ vpsraw $0xf, %ymm6, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm6, %ymm6
+ vpsubw %ymm0, %ymm7, %ymm7
+ vpsraw $0xf, %ymm7, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm7, %ymm7
+ vpsubw %ymm0, %ymm8, %ymm8
+ vpsraw $0xf, %ymm8, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm0, %ymm9, %ymm9
+ vpsraw $0xf, %ymm9, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm9, %ymm9
+ vmovdqa %ymm2, (%rdi)
+ vmovdqa %ymm3, 0x20(%rdi)
+ vmovdqa %ymm4, 0x40(%rdi)
+ vmovdqa %ymm5, 0x60(%rdi)
+ vmovdqa %ymm6, 0x80(%rdi)
+ vmovdqa %ymm7, 0xa0(%rdi)
+ vmovdqa %ymm8, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x100(%rdi), %ymm2
+ vmovdqa 0x120(%rdi), %ymm3
+ vmovdqa 0x140(%rdi), %ymm4
+ vmovdqa 0x160(%rdi), %ymm5
+ vmovdqa 0x180(%rdi), %ymm6
+ vmovdqa 0x1a0(%rdi), %ymm7
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm9
+ vpmulhw %ymm1, %ymm2, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm2, %ymm2
+ vpmulhw %ymm1, %ymm3, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpmulhw %ymm1, %ymm4, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmulhw %ymm1, %ymm5, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm5, %ymm5
+ vpmulhw %ymm1, %ymm6, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpmulhw %ymm1, %ymm7, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vpmulhw %ymm1, %ymm8, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpmulhw %ymm1, %ymm9, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpsubw %ymm0, %ymm2, %ymm2
+ vpsraw $0xf, %ymm2, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm2, %ymm2
+ vpsubw %ymm0, %ymm3, %ymm3
+ vpsraw $0xf, %ymm3, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm3, %ymm3
+ vpsubw %ymm0, %ymm4, %ymm4
+ vpsraw $0xf, %ymm4, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm4, %ymm4
+ vpsubw %ymm0, %ymm5, %ymm5
+ vpsraw $0xf, %ymm5, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm5, %ymm5
+ vpsubw %ymm0, %ymm6, %ymm6
+ vpsraw $0xf, %ymm6, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm6, %ymm6
+ vpsubw %ymm0, %ymm7, %ymm7
+ vpsraw $0xf, %ymm7, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm7, %ymm7
+ vpsubw %ymm0, %ymm8, %ymm8
+ vpsraw $0xf, %ymm8, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm0, %ymm9, %ymm9
+ vpsraw $0xf, %ymm9, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm9, %ymm9
+ vmovdqa %ymm2, 0x100(%rdi)
+ vmovdqa %ymm3, 0x120(%rdi)
+ vmovdqa %ymm4, 0x140(%rdi)
+ vmovdqa %ymm5, 0x160(%rdi)
+ vmovdqa %ymm6, 0x180(%rdi)
+ vmovdqa %ymm7, 0x1a0(%rdi)
+ vmovdqa %ymm8, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
retq
+ .cfi_endproc
-reduce_avx2_core:
- vmovdqa (%rdi), %ymm2
- vmovdqa 0x20(%rdi), %ymm3
- vmovdqa 0x40(%rdi), %ymm4
- vmovdqa 0x60(%rdi), %ymm5
- vmovdqa 0x80(%rdi), %ymm6
- vmovdqa 0xa0(%rdi), %ymm7
- vmovdqa 0xc0(%rdi), %ymm8
- vmovdqa 0xe0(%rdi), %ymm9
- vpmulhw %ymm1, %ymm2, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm2, %ymm2
- vpmulhw %ymm1, %ymm3, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm3, %ymm3
- vpmulhw %ymm1, %ymm4, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm4, %ymm4
- vpmulhw %ymm1, %ymm5, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm5, %ymm5
- vpmulhw %ymm1, %ymm6, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm6, %ymm6
- vpmulhw %ymm1, %ymm7, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm7, %ymm7
- vpmulhw %ymm1, %ymm8, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm8, %ymm8
- vpmulhw %ymm1, %ymm9, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vpsubw %ymm0, %ymm2, %ymm2
- vpsraw $0xf, %ymm2, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm2, %ymm2
- vpsubw %ymm0, %ymm3, %ymm3
- vpsraw $0xf, %ymm3, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm3, %ymm3
- vpsubw %ymm0, %ymm4, %ymm4
- vpsraw $0xf, %ymm4, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm4, %ymm4
- vpsubw %ymm0, %ymm5, %ymm5
- vpsraw $0xf, %ymm5, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm5, %ymm5
- vpsubw %ymm0, %ymm6, %ymm6
- vpsraw $0xf, %ymm6, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm6, %ymm6
- vpsubw %ymm0, %ymm7, %ymm7
- vpsraw $0xf, %ymm7, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm7, %ymm7
- vpsubw %ymm0, %ymm8, %ymm8
- vpsraw $0xf, %ymm8, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm8, %ymm8
- vpsubw %ymm0, %ymm9, %ymm9
- vpsraw $0xf, %ymm9, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm9, %ymm9
- vmovdqa %ymm2, (%rdi)
- vmovdqa %ymm3, 0x20(%rdi)
- vmovdqa %ymm4, 0x40(%rdi)
- vmovdqa %ymm5, 0x60(%rdi)
- vmovdqa %ymm6, 0x80(%rdi)
- vmovdqa %ymm7, 0xa0(%rdi)
- vmovdqa %ymm8, 0xc0(%rdi)
- vmovdqa %ymm9, 0xe0(%rdi)
- retq
+MLK_ASM_FN_SIZE(reduce_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_asm.S
new file mode 100644
index 0000000000..8cbc9ee1b6
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_asm.S
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*************************************************
+ * Name: mlk_rej_uniform_asm
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ * uniform random integers mod q
+ *
+ * Arguments: - int16_t *r: pointer to output buffer of MLKEM_N
+ * 16-bit coefficients.
+ * - const uint8_t *buf: pointer to input buffer
+ * (assumed to be uniform random bytes)
+ * - unsigned buflen: length of input buffer in bytes.
+ * Must be a multiple of 12.
+ *
+ * Returns number of sampled 16-bit integers (at most MLKEM_N).
+ **************************************************/
+#include "../../../common.h"
+
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/rej_uniform_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(rej_uniform_asm)
+MLK_ASM_FN_SYMBOL(rej_uniform_asm)
+
+ .cfi_startproc
+ subq $0x210, %rsp # imm = 0x210
+ .cfi_adjust_cfa_offset 0x210
+ xorl %eax, %eax
+ testq %rdx, %rdx
+ je Lrej_uniform_asm_end
+ movabsq $0xd010d010d010d01, %rax # imm = 0xD010D010D010D01
+ movq %rax, %xmm0
+ pinsrq $0x1, %rax, %xmm0
+ movabsq $0xfff0fff0fff0fff, %rax # imm = 0xFFF0FFF0FFF0FFF
+ movq %rax, %xmm5
+ pinsrq $0x1, %rax, %xmm5
+ movabsq $0x504040302010100, %rax # imm = 0x504040302010100
+ movq %rax, %xmm4
+ movabsq $0xb0a0a0908070706, %rax # imm = 0xB0A0A0908070706
+ pinsrq $0x1, %rax, %xmm4
+ movq $0x0, %rax
+ movq $0x0, %r8
+ movq $0x5555, %r9 # imm = 0x5555
+
+Lrej_uniform_asm_loop_start:
+ movq (%rsi,%r8), %xmm2
+ pinsrd $0x2, 0x8(%rsi,%r8), %xmm2
+ pshufb %xmm4, %xmm2
+ movdqa %xmm2, %xmm3
+ psrlw $0x4, %xmm3
+ pblendw $0xaa, %xmm3, %xmm2 # xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+ pand %xmm5, %xmm2
+ movdqa %xmm0, %xmm1
+ pcmpgtw %xmm2, %xmm1
+ pmovmskb %xmm1, %r11d
+ pextq %r9, %r11, %r11
+ movq %r11, %r10
+ shlq $0x4, %r10
+ movdqu (%rcx,%r10), %xmm3
+ pshufb %xmm3, %xmm2
+ movdqu %xmm2, (%rsp,%rax,2)
+ popcntq %r11, %r11
+ addq %r11, %rax
+ cmpq $0x100, %rax # imm = 0x100
+ jae Lrej_uniform_asm_final_copy
+ addq $0xc, %r8
+ cmpq %r8, %rdx
+ ja Lrej_uniform_asm_loop_start
+
+Lrej_uniform_asm_final_copy:
+ movq $0x100, %rcx # imm = 0x100
+ cmpq $0x100, %rax # imm = 0x100
+ cmovaq %rcx, %rax
+ movq %rsp, %rsi
+ movq %rax, %rcx
+ shlq %rcx
+ rep movsb (%rsi), %es:(%rdi)
+
+Lrej_uniform_asm_end:
+ addq $0x210, %rsp # imm = 0x210
+ .cfi_adjust_cfa_offset -0x210
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(rej_uniform_asm)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_avx2.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_avx2.c
deleted file mode 100644
index 9c22e5403d..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_avx2.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-#include "../../../common.h"
-
-#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-
-#include
-#include
-#include
-#include "arith_native_x86_64.h"
-#include "consts.h"
-
-unsigned mlk_rej_uniform_avx2(int16_t *MLK_RESTRICT r, const uint8_t *buf)
-{
- unsigned ctr, pos;
- uint16_t val0, val1;
- uint32_t good;
- const __m256i bound =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ / 16]);
- const __m256i ones = _mm256_set1_epi8(1);
- const __m256i mask = _mm256_set1_epi16(0xFFF);
- const __m256i idx8 =
- _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 6, 5, 5, 4,
- 11, 10, 10, 9, 8, 7, 7, 6, 5, 4, 4, 3, 2, 1, 1, 0);
- __m256i f0, f1, g0, g1, g2, g3;
- __m128i f, t, pilo, pihi;
-
- ctr = pos = 0;
- while (ctr <= MLKEM_N - 32 && pos <= MLK_AVX2_REJ_UNIFORM_BUFLEN - 48)
- {
- f0 = _mm256_loadu_si256((__m256i *)&buf[pos]);
- /* Don't load from offset 24, as this would over-read the buffer */
- f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 16]);
- f0 = _mm256_permute4x64_epi64(f0, 0x94 /* 0b10010100 ~= (2,1,1,0) */);
- f1 = _mm256_permute4x64_epi64(f1, 0xe9 /* 0x11101001 ~= (3,2,2,1) */);
- f0 = _mm256_shuffle_epi8(f0, idx8);
- f1 = _mm256_shuffle_epi8(f1, idx8);
- g0 = _mm256_srli_epi16(f0, 4);
- g1 = _mm256_srli_epi16(f1, 4);
- f0 = _mm256_blend_epi16(f0, g0, 0xAA);
- f1 = _mm256_blend_epi16(f1, g1, 0xAA);
- f0 = _mm256_and_si256(f0, mask);
- f1 = _mm256_and_si256(f1, mask);
- pos += 48;
-
- g0 = _mm256_cmpgt_epi16(bound, f0);
- g1 = _mm256_cmpgt_epi16(bound, f1);
-
- g0 = _mm256_packs_epi16(g0, g1);
- good = _mm256_movemask_epi8(g0);
-
- g0 = _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)&mlk_rej_uniform_table[(good >> 0) & 0xFF]));
- g1 = _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)&mlk_rej_uniform_table[(good >> 8) & 0xFF]));
- g0 = _mm256_inserti128_si256(
- g0,
- _mm_loadl_epi64((__m128i *)&mlk_rej_uniform_table[(good >> 16) & 0xFF]),
- 1);
- g1 = _mm256_inserti128_si256(
- g1,
- _mm_loadl_epi64((__m128i *)&mlk_rej_uniform_table[(good >> 24) & 0xFF]),
- 1);
-
- g2 = _mm256_add_epi8(g0, ones);
- g3 = _mm256_add_epi8(g1, ones);
- g0 = _mm256_unpacklo_epi8(g0, g2);
- g1 = _mm256_unpacklo_epi8(g1, g3);
-
- f0 = _mm256_shuffle_epi8(f0, g0);
- f1 = _mm256_shuffle_epi8(f1, g1);
-
- _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0));
- ctr += _mm_popcnt_u32((good >> 0) & 0xFF);
- _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1));
- ctr += _mm_popcnt_u32((good >> 16) & 0xFF);
- _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1));
- ctr += _mm_popcnt_u32((good >> 8) & 0xFF);
- _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1));
- ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
- }
-
- while (ctr <= MLKEM_N - 8 && pos <= MLK_AVX2_REJ_UNIFORM_BUFLEN - 24)
- {
- f = _mm_loadu_si128((__m128i *)&buf[pos]);
- f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8));
- t = _mm_srli_epi16(f, 4);
- f = _mm_blend_epi16(f, t, 0xAA);
- f = _mm_and_si128(f, _mm256_castsi256_si128(mask));
- pos += 12;
-
- t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f);
- good = _mm_movemask_epi8(t);
-
- good = _pext_u32(good, 0x5555);
- pilo = _mm_loadl_epi64((__m128i *)&mlk_rej_uniform_table[good]);
-
- pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
- pilo = _mm_unpacklo_epi8(pilo, pihi);
- f = _mm_shuffle_epi8(f, pilo);
- _mm_storeu_si128((__m128i *)&r[ctr], f);
- ctr += _mm_popcnt_u32(good);
- }
-
- while (ctr < MLKEM_N && pos <= MLK_AVX2_REJ_UNIFORM_BUFLEN - 3)
- {
- val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
- val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4));
- pos += 3;
-
- if (val0 < MLKEM_Q)
- {
- r[ctr++] = val0;
- }
- if (val1 < MLKEM_Q && ctr < MLKEM_N)
- {
- r[ctr++] = val1;
- }
- }
-
- return ctr;
-}
-
-#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
- */
-
-MLK_EMPTY_CU(avx2_rej_uniform)
-
-#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
- !MLK_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c
index 5ab9a83179..5d5b25b866 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c
@@ -5,6 +5,7 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
@@ -13,142 +14,525 @@
#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
#include "arith_native_x86_64.h"
/*
* Lookup table used by rejection sampling of the public matrix.
* See autogen for details.
*/
-MLK_ALIGN const uint8_t mlk_rej_uniform_table[256][8] = {
- {-1, -1, -1, -1, -1, -1, -1, -1}, {0, -1, -1, -1, -1, -1, -1, -1},
- {2, -1, -1, -1, -1, -1, -1, -1}, {0, 2, -1, -1, -1, -1, -1, -1},
- {4, -1, -1, -1, -1, -1, -1, -1}, {0, 4, -1, -1, -1, -1, -1, -1},
- {2, 4, -1, -1, -1, -1, -1, -1}, {0, 2, 4, -1, -1, -1, -1, -1},
- {6, -1, -1, -1, -1, -1, -1, -1}, {0, 6, -1, -1, -1, -1, -1, -1},
- {2, 6, -1, -1, -1, -1, -1, -1}, {0, 2, 6, -1, -1, -1, -1, -1},
- {4, 6, -1, -1, -1, -1, -1, -1}, {0, 4, 6, -1, -1, -1, -1, -1},
- {2, 4, 6, -1, -1, -1, -1, -1}, {0, 2, 4, 6, -1, -1, -1, -1},
- {8, -1, -1, -1, -1, -1, -1, -1}, {0, 8, -1, -1, -1, -1, -1, -1},
- {2, 8, -1, -1, -1, -1, -1, -1}, {0, 2, 8, -1, -1, -1, -1, -1},
- {4, 8, -1, -1, -1, -1, -1, -1}, {0, 4, 8, -1, -1, -1, -1, -1},
- {2, 4, 8, -1, -1, -1, -1, -1}, {0, 2, 4, 8, -1, -1, -1, -1},
- {6, 8, -1, -1, -1, -1, -1, -1}, {0, 6, 8, -1, -1, -1, -1, -1},
- {2, 6, 8, -1, -1, -1, -1, -1}, {0, 2, 6, 8, -1, -1, -1, -1},
- {4, 6, 8, -1, -1, -1, -1, -1}, {0, 4, 6, 8, -1, -1, -1, -1},
- {2, 4, 6, 8, -1, -1, -1, -1}, {0, 2, 4, 6, 8, -1, -1, -1},
- {10, -1, -1, -1, -1, -1, -1, -1}, {0, 10, -1, -1, -1, -1, -1, -1},
- {2, 10, -1, -1, -1, -1, -1, -1}, {0, 2, 10, -1, -1, -1, -1, -1},
- {4, 10, -1, -1, -1, -1, -1, -1}, {0, 4, 10, -1, -1, -1, -1, -1},
- {2, 4, 10, -1, -1, -1, -1, -1}, {0, 2, 4, 10, -1, -1, -1, -1},
- {6, 10, -1, -1, -1, -1, -1, -1}, {0, 6, 10, -1, -1, -1, -1, -1},
- {2, 6, 10, -1, -1, -1, -1, -1}, {0, 2, 6, 10, -1, -1, -1, -1},
- {4, 6, 10, -1, -1, -1, -1, -1}, {0, 4, 6, 10, -1, -1, -1, -1},
- {2, 4, 6, 10, -1, -1, -1, -1}, {0, 2, 4, 6, 10, -1, -1, -1},
- {8, 10, -1, -1, -1, -1, -1, -1}, {0, 8, 10, -1, -1, -1, -1, -1},
- {2, 8, 10, -1, -1, -1, -1, -1}, {0, 2, 8, 10, -1, -1, -1, -1},
- {4, 8, 10, -1, -1, -1, -1, -1}, {0, 4, 8, 10, -1, -1, -1, -1},
- {2, 4, 8, 10, -1, -1, -1, -1}, {0, 2, 4, 8, 10, -1, -1, -1},
- {6, 8, 10, -1, -1, -1, -1, -1}, {0, 6, 8, 10, -1, -1, -1, -1},
- {2, 6, 8, 10, -1, -1, -1, -1}, {0, 2, 6, 8, 10, -1, -1, -1},
- {4, 6, 8, 10, -1, -1, -1, -1}, {0, 4, 6, 8, 10, -1, -1, -1},
- {2, 4, 6, 8, 10, -1, -1, -1}, {0, 2, 4, 6, 8, 10, -1, -1},
- {12, -1, -1, -1, -1, -1, -1, -1}, {0, 12, -1, -1, -1, -1, -1, -1},
- {2, 12, -1, -1, -1, -1, -1, -1}, {0, 2, 12, -1, -1, -1, -1, -1},
- {4, 12, -1, -1, -1, -1, -1, -1}, {0, 4, 12, -1, -1, -1, -1, -1},
- {2, 4, 12, -1, -1, -1, -1, -1}, {0, 2, 4, 12, -1, -1, -1, -1},
- {6, 12, -1, -1, -1, -1, -1, -1}, {0, 6, 12, -1, -1, -1, -1, -1},
- {2, 6, 12, -1, -1, -1, -1, -1}, {0, 2, 6, 12, -1, -1, -1, -1},
- {4, 6, 12, -1, -1, -1, -1, -1}, {0, 4, 6, 12, -1, -1, -1, -1},
- {2, 4, 6, 12, -1, -1, -1, -1}, {0, 2, 4, 6, 12, -1, -1, -1},
- {8, 12, -1, -1, -1, -1, -1, -1}, {0, 8, 12, -1, -1, -1, -1, -1},
- {2, 8, 12, -1, -1, -1, -1, -1}, {0, 2, 8, 12, -1, -1, -1, -1},
- {4, 8, 12, -1, -1, -1, -1, -1}, {0, 4, 8, 12, -1, -1, -1, -1},
- {2, 4, 8, 12, -1, -1, -1, -1}, {0, 2, 4, 8, 12, -1, -1, -1},
- {6, 8, 12, -1, -1, -1, -1, -1}, {0, 6, 8, 12, -1, -1, -1, -1},
- {2, 6, 8, 12, -1, -1, -1, -1}, {0, 2, 6, 8, 12, -1, -1, -1},
- {4, 6, 8, 12, -1, -1, -1, -1}, {0, 4, 6, 8, 12, -1, -1, -1},
- {2, 4, 6, 8, 12, -1, -1, -1}, {0, 2, 4, 6, 8, 12, -1, -1},
- {10, 12, -1, -1, -1, -1, -1, -1}, {0, 10, 12, -1, -1, -1, -1, -1},
- {2, 10, 12, -1, -1, -1, -1, -1}, {0, 2, 10, 12, -1, -1, -1, -1},
- {4, 10, 12, -1, -1, -1, -1, -1}, {0, 4, 10, 12, -1, -1, -1, -1},
- {2, 4, 10, 12, -1, -1, -1, -1}, {0, 2, 4, 10, 12, -1, -1, -1},
- {6, 10, 12, -1, -1, -1, -1, -1}, {0, 6, 10, 12, -1, -1, -1, -1},
- {2, 6, 10, 12, -1, -1, -1, -1}, {0, 2, 6, 10, 12, -1, -1, -1},
- {4, 6, 10, 12, -1, -1, -1, -1}, {0, 4, 6, 10, 12, -1, -1, -1},
- {2, 4, 6, 10, 12, -1, -1, -1}, {0, 2, 4, 6, 10, 12, -1, -1},
- {8, 10, 12, -1, -1, -1, -1, -1}, {0, 8, 10, 12, -1, -1, -1, -1},
- {2, 8, 10, 12, -1, -1, -1, -1}, {0, 2, 8, 10, 12, -1, -1, -1},
- {4, 8, 10, 12, -1, -1, -1, -1}, {0, 4, 8, 10, 12, -1, -1, -1},
- {2, 4, 8, 10, 12, -1, -1, -1}, {0, 2, 4, 8, 10, 12, -1, -1},
- {6, 8, 10, 12, -1, -1, -1, -1}, {0, 6, 8, 10, 12, -1, -1, -1},
- {2, 6, 8, 10, 12, -1, -1, -1}, {0, 2, 6, 8, 10, 12, -1, -1},
- {4, 6, 8, 10, 12, -1, -1, -1}, {0, 4, 6, 8, 10, 12, -1, -1},
- {2, 4, 6, 8, 10, 12, -1, -1}, {0, 2, 4, 6, 8, 10, 12, -1},
- {14, -1, -1, -1, -1, -1, -1, -1}, {0, 14, -1, -1, -1, -1, -1, -1},
- {2, 14, -1, -1, -1, -1, -1, -1}, {0, 2, 14, -1, -1, -1, -1, -1},
- {4, 14, -1, -1, -1, -1, -1, -1}, {0, 4, 14, -1, -1, -1, -1, -1},
- {2, 4, 14, -1, -1, -1, -1, -1}, {0, 2, 4, 14, -1, -1, -1, -1},
- {6, 14, -1, -1, -1, -1, -1, -1}, {0, 6, 14, -1, -1, -1, -1, -1},
- {2, 6, 14, -1, -1, -1, -1, -1}, {0, 2, 6, 14, -1, -1, -1, -1},
- {4, 6, 14, -1, -1, -1, -1, -1}, {0, 4, 6, 14, -1, -1, -1, -1},
- {2, 4, 6, 14, -1, -1, -1, -1}, {0, 2, 4, 6, 14, -1, -1, -1},
- {8, 14, -1, -1, -1, -1, -1, -1}, {0, 8, 14, -1, -1, -1, -1, -1},
- {2, 8, 14, -1, -1, -1, -1, -1}, {0, 2, 8, 14, -1, -1, -1, -1},
- {4, 8, 14, -1, -1, -1, -1, -1}, {0, 4, 8, 14, -1, -1, -1, -1},
- {2, 4, 8, 14, -1, -1, -1, -1}, {0, 2, 4, 8, 14, -1, -1, -1},
- {6, 8, 14, -1, -1, -1, -1, -1}, {0, 6, 8, 14, -1, -1, -1, -1},
- {2, 6, 8, 14, -1, -1, -1, -1}, {0, 2, 6, 8, 14, -1, -1, -1},
- {4, 6, 8, 14, -1, -1, -1, -1}, {0, 4, 6, 8, 14, -1, -1, -1},
- {2, 4, 6, 8, 14, -1, -1, -1}, {0, 2, 4, 6, 8, 14, -1, -1},
- {10, 14, -1, -1, -1, -1, -1, -1}, {0, 10, 14, -1, -1, -1, -1, -1},
- {2, 10, 14, -1, -1, -1, -1, -1}, {0, 2, 10, 14, -1, -1, -1, -1},
- {4, 10, 14, -1, -1, -1, -1, -1}, {0, 4, 10, 14, -1, -1, -1, -1},
- {2, 4, 10, 14, -1, -1, -1, -1}, {0, 2, 4, 10, 14, -1, -1, -1},
- {6, 10, 14, -1, -1, -1, -1, -1}, {0, 6, 10, 14, -1, -1, -1, -1},
- {2, 6, 10, 14, -1, -1, -1, -1}, {0, 2, 6, 10, 14, -1, -1, -1},
- {4, 6, 10, 14, -1, -1, -1, -1}, {0, 4, 6, 10, 14, -1, -1, -1},
- {2, 4, 6, 10, 14, -1, -1, -1}, {0, 2, 4, 6, 10, 14, -1, -1},
- {8, 10, 14, -1, -1, -1, -1, -1}, {0, 8, 10, 14, -1, -1, -1, -1},
- {2, 8, 10, 14, -1, -1, -1, -1}, {0, 2, 8, 10, 14, -1, -1, -1},
- {4, 8, 10, 14, -1, -1, -1, -1}, {0, 4, 8, 10, 14, -1, -1, -1},
- {2, 4, 8, 10, 14, -1, -1, -1}, {0, 2, 4, 8, 10, 14, -1, -1},
- {6, 8, 10, 14, -1, -1, -1, -1}, {0, 6, 8, 10, 14, -1, -1, -1},
- {2, 6, 8, 10, 14, -1, -1, -1}, {0, 2, 6, 8, 10, 14, -1, -1},
- {4, 6, 8, 10, 14, -1, -1, -1}, {0, 4, 6, 8, 10, 14, -1, -1},
- {2, 4, 6, 8, 10, 14, -1, -1}, {0, 2, 4, 6, 8, 10, 14, -1},
- {12, 14, -1, -1, -1, -1, -1, -1}, {0, 12, 14, -1, -1, -1, -1, -1},
- {2, 12, 14, -1, -1, -1, -1, -1}, {0, 2, 12, 14, -1, -1, -1, -1},
- {4, 12, 14, -1, -1, -1, -1, -1}, {0, 4, 12, 14, -1, -1, -1, -1},
- {2, 4, 12, 14, -1, -1, -1, -1}, {0, 2, 4, 12, 14, -1, -1, -1},
- {6, 12, 14, -1, -1, -1, -1, -1}, {0, 6, 12, 14, -1, -1, -1, -1},
- {2, 6, 12, 14, -1, -1, -1, -1}, {0, 2, 6, 12, 14, -1, -1, -1},
- {4, 6, 12, 14, -1, -1, -1, -1}, {0, 4, 6, 12, 14, -1, -1, -1},
- {2, 4, 6, 12, 14, -1, -1, -1}, {0, 2, 4, 6, 12, 14, -1, -1},
- {8, 12, 14, -1, -1, -1, -1, -1}, {0, 8, 12, 14, -1, -1, -1, -1},
- {2, 8, 12, 14, -1, -1, -1, -1}, {0, 2, 8, 12, 14, -1, -1, -1},
- {4, 8, 12, 14, -1, -1, -1, -1}, {0, 4, 8, 12, 14, -1, -1, -1},
- {2, 4, 8, 12, 14, -1, -1, -1}, {0, 2, 4, 8, 12, 14, -1, -1},
- {6, 8, 12, 14, -1, -1, -1, -1}, {0, 6, 8, 12, 14, -1, -1, -1},
- {2, 6, 8, 12, 14, -1, -1, -1}, {0, 2, 6, 8, 12, 14, -1, -1},
- {4, 6, 8, 12, 14, -1, -1, -1}, {0, 4, 6, 8, 12, 14, -1, -1},
- {2, 4, 6, 8, 12, 14, -1, -1}, {0, 2, 4, 6, 8, 12, 14, -1},
- {10, 12, 14, -1, -1, -1, -1, -1}, {0, 10, 12, 14, -1, -1, -1, -1},
- {2, 10, 12, 14, -1, -1, -1, -1}, {0, 2, 10, 12, 14, -1, -1, -1},
- {4, 10, 12, 14, -1, -1, -1, -1}, {0, 4, 10, 12, 14, -1, -1, -1},
- {2, 4, 10, 12, 14, -1, -1, -1}, {0, 2, 4, 10, 12, 14, -1, -1},
- {6, 10, 12, 14, -1, -1, -1, -1}, {0, 6, 10, 12, 14, -1, -1, -1},
- {2, 6, 10, 12, 14, -1, -1, -1}, {0, 2, 6, 10, 12, 14, -1, -1},
- {4, 6, 10, 12, 14, -1, -1, -1}, {0, 4, 6, 10, 12, 14, -1, -1},
- {2, 4, 6, 10, 12, 14, -1, -1}, {0, 2, 4, 6, 10, 12, 14, -1},
- {8, 10, 12, 14, -1, -1, -1, -1}, {0, 8, 10, 12, 14, -1, -1, -1},
- {2, 8, 10, 12, 14, -1, -1, -1}, {0, 2, 8, 10, 12, 14, -1, -1},
- {4, 8, 10, 12, 14, -1, -1, -1}, {0, 4, 8, 10, 12, 14, -1, -1},
- {2, 4, 8, 10, 12, 14, -1, -1}, {0, 2, 4, 8, 10, 12, 14, -1},
- {6, 8, 10, 12, 14, -1, -1, -1}, {0, 6, 8, 10, 12, 14, -1, -1},
- {2, 6, 8, 10, 12, 14, -1, -1}, {0, 2, 6, 8, 10, 12, 14, -1},
- {4, 6, 8, 10, 12, 14, -1, -1}, {0, 4, 6, 8, 10, 12, 14, -1},
- {2, 4, 6, 8, 10, 12, 14, -1}, {0, 2, 4, 6, 8, 10, 12, 14},
+MLK_ALIGN const uint8_t mlk_rej_uniform_table[] = {
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 0 */,
+ 0, 1, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 1 */,
+ 2, 3, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 2 */,
+ 0, 1, 2, 3, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 3 */,
+ 4, 5, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 4 */,
+ 0, 1, 4, 5, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 5 */,
+ 2, 3, 4, 5, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 6 */,
+ 0, 1, 2, 3, 4, 5, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 7 */,
+ 6, 7, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 8 */,
+ 0, 1, 6, 7, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 9 */,
+ 2, 3, 6, 7, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 10 */,
+ 0, 1, 2, 3, 6, 7, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 11 */,
+ 4, 5, 6, 7, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 12 */,
+ 0, 1, 4, 5, 6, 7, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 13 */,
+ 2, 3, 4, 5, 6, 7, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 14 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 15 */,
+ 8, 9, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 16 */,
+ 0, 1, 8, 9, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 17 */,
+ 2, 3, 8, 9, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 18 */,
+ 0, 1, 2, 3, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 19 */,
+ 4, 5, 8, 9, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 20 */,
+ 0, 1, 4, 5, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 21 */,
+ 2, 3, 4, 5, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 22 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 23 */,
+ 6, 7, 8, 9, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 24 */,
+ 0, 1, 6, 7, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 25 */,
+ 2, 3, 6, 7, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 26 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 27 */,
+ 4, 5, 6, 7, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 28 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 29 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 30 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 255, 255, 255, 255, 255, 255 /* 31 */,
+ 10, 11, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 32 */,
+ 0, 1, 10, 11, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 33 */,
+ 2, 3, 10, 11, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 34 */,
+ 0, 1, 2, 3, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 35 */,
+ 4, 5, 10, 11, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 36 */,
+ 0, 1, 4, 5, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 37 */,
+ 2, 3, 4, 5, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 38 */,
+ 0, 1, 2, 3, 4, 5, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 39 */,
+ 6, 7, 10, 11, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 40 */,
+ 0, 1, 6, 7, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 41 */,
+ 2, 3, 6, 7, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 42 */,
+ 0, 1, 2, 3, 6, 7, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 43 */,
+ 4, 5, 6, 7, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 44 */,
+ 0, 1, 4, 5, 6, 7, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 45 */,
+ 2, 3, 4, 5, 6, 7, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 46 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 10, 11, 255, 255, 255, 255, 255, 255 /* 47 */,
+ 8, 9, 10, 11, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 48 */,
+ 0, 1, 8, 9, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 49 */,
+ 2, 3, 8, 9, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 50 */,
+ 0, 1, 2, 3, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 51 */,
+ 4, 5, 8, 9, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 52 */,
+ 0, 1, 4, 5, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 53 */,
+ 2, 3, 4, 5, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 54 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 10, 11, 255, 255, 255, 255, 255, 255 /* 55 */,
+ 6, 7, 8, 9, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 56 */,
+ 0, 1, 6, 7, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 57 */,
+ 2, 3, 6, 7, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 58 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 10, 11, 255, 255, 255, 255, 255, 255 /* 59 */,
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 60 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 10, 11, 255, 255, 255, 255, 255, 255 /* 61 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 255, 255, 255, 255, 255, 255 /* 62 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 255, 255, 255, 255 /* 63 */,
+ 12, 13, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 64 */,
+ 0, 1, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 65 */,
+ 2, 3, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 66 */,
+ 0, 1, 2, 3, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 67 */,
+ 4, 5, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 68 */,
+ 0, 1, 4, 5, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 69 */,
+ 2, 3, 4, 5, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 70 */,
+ 0, 1, 2, 3, 4, 5, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 71 */,
+ 6, 7, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 72 */,
+ 0, 1, 6, 7, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 73 */,
+ 2, 3, 6, 7, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 74 */,
+ 0, 1, 2, 3, 6, 7, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 75 */,
+ 4, 5, 6, 7, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 76 */,
+ 0, 1, 4, 5, 6, 7, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 77 */,
+ 2, 3, 4, 5, 6, 7, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 78 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 79 */,
+ 8, 9, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 80 */,
+ 0, 1, 8, 9, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 81 */,
+ 2, 3, 8, 9, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 82 */,
+ 0, 1, 2, 3, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 83 */,
+ 4, 5, 8, 9, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 84 */,
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 85 */,
+ 2, 3, 4, 5, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 86 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 87 */,
+ 6, 7, 8, 9, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 88 */,
+ 0, 1, 6, 7, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 89 */,
+ 2, 3, 6, 7, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 90 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 91 */,
+ 4, 5, 6, 7, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 92 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 93 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 94 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 12, 13, 255, 255, 255, 255 /* 95 */,
+ 10, 11, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 96 */,
+ 0, 1, 10, 11, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 97 */,
+ 2, 3, 10, 11, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 98 */,
+ 0, 1, 2, 3, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 99 */,
+ 4, 5, 10, 11, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 100 */,
+ 0, 1, 4, 5, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 101 */,
+ 2, 3, 4, 5, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 102 */,
+ 0, 1, 2, 3, 4, 5, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 103 */,
+ 6, 7, 10, 11, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 104 */,
+ 0, 1, 6, 7, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 105 */,
+ 2, 3, 6, 7, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 106 */,
+ 0, 1, 2, 3, 6, 7, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 107 */,
+ 4, 5, 6, 7, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 108 */,
+ 0, 1, 4, 5, 6, 7, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 109 */,
+ 2, 3, 4, 5, 6, 7, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 110 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 10, 11, 12, 13, 255, 255, 255, 255 /* 111 */,
+ 8, 9, 10, 11, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 112 */,
+ 0, 1, 8, 9, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 113 */,
+ 2, 3, 8, 9, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 114 */,
+ 0, 1, 2, 3, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 115 */,
+ 4, 5, 8, 9, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 116 */,
+ 0, 1, 4, 5, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 117 */,
+ 2, 3, 4, 5, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 118 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 10, 11, 12, 13, 255, 255, 255, 255 /* 119 */,
+ 6, 7, 8, 9, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 120 */,
+ 0, 1, 6, 7, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 121 */,
+ 2, 3, 6, 7, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 122 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 10, 11, 12, 13, 255, 255, 255, 255 /* 123 */,
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 124 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 255, 255, 255, 255 /* 125 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 255, 255, 255, 255 /* 126 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 255, 255 /* 127 */,
+ 14, 15, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 128 */,
+ 0, 1, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 129 */,
+ 2, 3, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 130 */,
+ 0, 1, 2, 3, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 131 */,
+ 4, 5, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 132 */,
+ 0, 1, 4, 5, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 133 */,
+ 2, 3, 4, 5, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 134 */,
+ 0, 1, 2, 3, 4, 5, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 135 */,
+ 6, 7, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 136 */,
+ 0, 1, 6, 7, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 137 */,
+ 2, 3, 6, 7, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 138 */,
+ 0, 1, 2, 3, 6, 7, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 139 */,
+ 4, 5, 6, 7, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 140 */,
+ 0, 1, 4, 5, 6, 7, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 141 */,
+ 2, 3, 4, 5, 6, 7, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 142 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 143 */,
+ 8, 9, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 144 */,
+ 0, 1, 8, 9, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 145 */,
+ 2, 3, 8, 9, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 146 */,
+ 0, 1, 2, 3, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 147 */,
+ 4, 5, 8, 9, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 148 */,
+ 0, 1, 4, 5, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 149 */,
+ 2, 3, 4, 5, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 150 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 151 */,
+ 6, 7, 8, 9, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 152 */,
+ 0, 1, 6, 7, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 153 */,
+ 2, 3, 6, 7, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 154 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 155 */,
+ 4, 5, 6, 7, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 156 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 157 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 158 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 14, 15, 255, 255, 255, 255 /* 159 */,
+ 10, 11, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 160 */,
+ 0, 1, 10, 11, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 161 */,
+ 2, 3, 10, 11, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 162 */,
+ 0, 1, 2, 3, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 163 */,
+ 4, 5, 10, 11, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 164 */,
+ 0, 1, 4, 5, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 165 */,
+ 2, 3, 4, 5, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 166 */,
+ 0, 1, 2, 3, 4, 5, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 167 */,
+ 6, 7, 10, 11, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 168 */,
+ 0, 1, 6, 7, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 169 */,
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 170 */,
+ 0, 1, 2, 3, 6, 7, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 171 */,
+ 4, 5, 6, 7, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 172 */,
+ 0, 1, 4, 5, 6, 7, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 173 */,
+ 2, 3, 4, 5, 6, 7, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 174 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 10, 11, 14, 15, 255, 255, 255, 255 /* 175 */,
+ 8, 9, 10, 11, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 176 */,
+ 0, 1, 8, 9, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 177 */,
+ 2, 3, 8, 9, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 178 */,
+ 0, 1, 2, 3, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 179 */,
+ 4, 5, 8, 9, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 180 */,
+ 0, 1, 4, 5, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 181 */,
+ 2, 3, 4, 5, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 182 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 10, 11, 14, 15, 255, 255, 255, 255 /* 183 */,
+ 6, 7, 8, 9, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 184 */,
+ 0, 1, 6, 7, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 185 */,
+ 2, 3, 6, 7, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 186 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 10, 11, 14, 15, 255, 255, 255, 255 /* 187 */,
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 188 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 10, 11, 14, 15, 255, 255, 255, 255 /* 189 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 14, 15, 255, 255, 255, 255 /* 190 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 14, 15, 255, 255 /* 191 */,
+ 12, 13, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 192 */,
+ 0, 1, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 193 */,
+ 2, 3, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 194 */,
+ 0, 1, 2, 3, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 195 */,
+ 4, 5, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 196 */,
+ 0, 1, 4, 5, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 197 */,
+ 2, 3, 4, 5, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 198 */,
+ 0, 1, 2, 3, 4, 5, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 199 */,
+ 6, 7, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 200 */,
+ 0, 1, 6, 7, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 201 */,
+ 2, 3, 6, 7, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 202 */,
+ 0, 1, 2, 3, 6, 7, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 203 */,
+ 4, 5, 6, 7, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 204 */,
+ 0, 1, 4, 5, 6, 7, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 205 */,
+ 2, 3, 4, 5, 6, 7, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 206 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 207 */,
+ 8, 9, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 208 */,
+ 0, 1, 8, 9, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 209 */,
+ 2, 3, 8, 9, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 210 */,
+ 0, 1, 2, 3, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 211 */,
+ 4, 5, 8, 9, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 212 */,
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 213 */,
+ 2, 3, 4, 5, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 214 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 215 */,
+ 6, 7, 8, 9, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 216 */,
+ 0, 1, 6, 7, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 217 */,
+ 2, 3, 6, 7, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 218 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 219 */,
+ 4, 5, 6, 7, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 220 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 221 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 222 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 12, 13, 14, 15, 255, 255 /* 223 */,
+ 10, 11, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 224 */,
+ 0, 1, 10, 11, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 225 */,
+ 2, 3, 10, 11, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 226 */,
+ 0, 1, 2, 3, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 227 */,
+ 4, 5, 10, 11, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 228 */,
+ 0, 1, 4, 5, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 229 */,
+ 2, 3, 4, 5, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 230 */,
+ 0, 1, 2, 3, 4, 5, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 231 */,
+ 6, 7, 10, 11, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 232 */,
+ 0, 1, 6, 7, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 233 */,
+ 2, 3, 6, 7, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 234 */,
+ 0, 1, 2, 3, 6, 7, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 235 */,
+ 4, 5, 6, 7, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 236 */,
+ 0, 1, 4, 5, 6, 7, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 237 */,
+ 2, 3, 4, 5, 6, 7, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 238 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 10, 11, 12, 13, 14, 15, 255, 255 /* 239 */,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 240 */,
+ 0, 1, 8, 9, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 241 */,
+ 2, 3, 8, 9, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 242 */,
+ 0, 1, 2, 3, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 243 */,
+ 4, 5, 8, 9, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 244 */,
+ 0, 1, 4, 5, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 245 */,
+ 2, 3, 4, 5, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 246 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 10, 11, 12, 13, 14, 15, 255, 255 /* 247 */,
+ 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 248 */,
+ 0, 1, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 249 */,
+ 2, 3, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 250 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 255, 255 /* 251 */,
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 252 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 255, 255 /* 253 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 255, 255 /* 254 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15 /* 255 */,
};
#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/shuffle.inc b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/shuffle.inc
deleted file mode 100644
index c03a82bdb5..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/shuffle.inc
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-.macro shuffle8 r0,r1,r2,r3
-vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
-vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle4 r0,r1,r2,r3
-vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
-vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-/* Shuffle r0=(a0,b0,c0,d0,...), r1=(a1,b1,c1,d1,...) into */
-/* r2 = (a0,b0,a1,b1,e0,f0,e1,f1,...) */
-/* r3 = (c0,d0,c1,d1,g0,h0,g1,h1,...) */
-.macro shuffle2 r0,r1,r2,r3
-/* r2=(a1,b1,a1,b1,e1,f1,e1,f1,...) */
-vmovsldup %ymm\r1,%ymm\r2
-/* Conditional move */
-/* 0xAA = 0b10101010 */
-/* r2=(a0,b0,a1,b1,e0,f0,e1,f1,...) */
-vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-/* r0=(c0,d0,0,0,g0,h0,0,0,...) */
-vpsrlq $32,%ymm\r0,%ymm\r0
-/* r3=(c0,d0,c1,d1,g0,h0,g1,h1,...) */
-vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle1 r0,r1,r2,r3
-vpslld $16,%ymm\r1,%ymm\r2
-vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-vpsrld $16,%ymm\r0,%ymm\r0
-vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/tomont.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/tomont.S
index 13e45bdc63..8d7403c22a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/tomont.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/tomont.S
@@ -18,7 +18,8 @@
* Changes:
* - Add call to csub in reduce128_avx to produce outputs
* in [0,1,...,q-1] rather than [0,1,...,q], matching the
- * semantics of mlk_poly_reduce().
+ * semantics of mlk_poly_reduce(),
+ * - Use a macro instead of a local function call.
*/
#include "../../../common.h"
@@ -30,70 +31,125 @@
* dev/x86_64/src/tomont.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(tomont_avx2)
MLK_ASM_FN_SYMBOL(tomont_avx2)
- vmovdqa (%rsi), %ymm0
- vmovdqa 0xa0(%rsi), %ymm1
- vmovdqa 0xc0(%rsi), %ymm2
- callq tomont_avx2_core
- addq $0x100, %rdi # imm = 0x100
- callq tomont_avx2_core
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x50495049, %eax # imm = 0x50495049
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ movl $0x5490549, %eax # imm = 0x5490549
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ vmovdqa (%rdi), %ymm3
+ vmovdqa 0x20(%rdi), %ymm4
+ vmovdqa 0x40(%rdi), %ymm5
+ vmovdqa 0x60(%rdi), %ymm6
+ vmovdqa 0x80(%rdi), %ymm7
+ vmovdqa 0xa0(%rdi), %ymm8
+ vmovdqa 0xc0(%rdi), %ymm9
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpmullw %ymm1, %ymm3, %ymm11
+ vpmulhw %ymm2, %ymm3, %ymm3
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm11, %ymm3, %ymm3
+ vpmullw %ymm1, %ymm4, %ymm12
+ vpmulhw %ymm2, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmullw %ymm1, %ymm5, %ymm13
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpsubw %ymm13, %ymm5, %ymm5
+ vpmullw %ymm1, %ymm6, %ymm14
+ vpmulhw %ymm2, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpsubw %ymm14, %ymm6, %ymm6
+ vpmullw %ymm1, %ymm7, %ymm15
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpsubw %ymm15, %ymm7, %ymm7
+ vpmullw %ymm1, %ymm8, %ymm11
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm11, %ymm8, %ymm8
+ vpmullw %ymm1, %ymm9, %ymm12
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpmullw %ymm1, %ymm10, %ymm13
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpsubw %ymm13, %ymm10, %ymm10
+ vmovdqa %ymm3, (%rdi)
+ vmovdqa %ymm4, 0x20(%rdi)
+ vmovdqa %ymm5, 0x40(%rdi)
+ vmovdqa %ymm6, 0x60(%rdi)
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm8, 0xa0(%rdi)
+ vmovdqa %ymm9, 0xc0(%rdi)
+ vmovdqa %ymm10, 0xe0(%rdi)
+ vmovdqa 0x100(%rdi), %ymm3
+ vmovdqa 0x120(%rdi), %ymm4
+ vmovdqa 0x140(%rdi), %ymm5
+ vmovdqa 0x160(%rdi), %ymm6
+ vmovdqa 0x180(%rdi), %ymm7
+ vmovdqa 0x1a0(%rdi), %ymm8
+ vmovdqa 0x1c0(%rdi), %ymm9
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpmullw %ymm1, %ymm3, %ymm11
+ vpmulhw %ymm2, %ymm3, %ymm3
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm11, %ymm3, %ymm3
+ vpmullw %ymm1, %ymm4, %ymm12
+ vpmulhw %ymm2, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmullw %ymm1, %ymm5, %ymm13
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpsubw %ymm13, %ymm5, %ymm5
+ vpmullw %ymm1, %ymm6, %ymm14
+ vpmulhw %ymm2, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpsubw %ymm14, %ymm6, %ymm6
+ vpmullw %ymm1, %ymm7, %ymm15
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpsubw %ymm15, %ymm7, %ymm7
+ vpmullw %ymm1, %ymm8, %ymm11
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm11, %ymm8, %ymm8
+ vpmullw %ymm1, %ymm9, %ymm12
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpmullw %ymm1, %ymm10, %ymm13
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpsubw %ymm13, %ymm10, %ymm10
+ vmovdqa %ymm3, 0x100(%rdi)
+ vmovdqa %ymm4, 0x120(%rdi)
+ vmovdqa %ymm5, 0x140(%rdi)
+ vmovdqa %ymm6, 0x160(%rdi)
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm8, 0x1a0(%rdi)
+ vmovdqa %ymm9, 0x1c0(%rdi)
+ vmovdqa %ymm10, 0x1e0(%rdi)
retq
+ .cfi_endproc
-tomont_avx2_core:
- vmovdqa (%rdi), %ymm3
- vmovdqa 0x20(%rdi), %ymm4
- vmovdqa 0x40(%rdi), %ymm5
- vmovdqa 0x60(%rdi), %ymm6
- vmovdqa 0x80(%rdi), %ymm7
- vmovdqa 0xa0(%rdi), %ymm8
- vmovdqa 0xc0(%rdi), %ymm9
- vmovdqa 0xe0(%rdi), %ymm10
- vpmullw %ymm1, %ymm3, %ymm11
- vpmulhw %ymm2, %ymm3, %ymm3
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm11, %ymm3, %ymm3
- vpmullw %ymm1, %ymm4, %ymm12
- vpmulhw %ymm2, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm4, %ymm4
- vpmullw %ymm1, %ymm5, %ymm13
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm13, %ymm13
- vpsubw %ymm13, %ymm5, %ymm5
- vpmullw %ymm1, %ymm6, %ymm14
- vpmulhw %ymm2, %ymm6, %ymm6
- vpmulhw %ymm0, %ymm14, %ymm14
- vpsubw %ymm14, %ymm6, %ymm6
- vpmullw %ymm1, %ymm7, %ymm15
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm15, %ymm15
- vpsubw %ymm15, %ymm7, %ymm7
- vpmullw %ymm1, %ymm8, %ymm11
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm11, %ymm8, %ymm8
- vpmullw %ymm1, %ymm9, %ymm12
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vpmullw %ymm1, %ymm10, %ymm13
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm13, %ymm13
- vpsubw %ymm13, %ymm10, %ymm10
- vmovdqa %ymm3, (%rdi)
- vmovdqa %ymm4, 0x20(%rdi)
- vmovdqa %ymm5, 0x40(%rdi)
- vmovdqa %ymm6, 0x60(%rdi)
- vmovdqa %ymm7, 0x80(%rdi)
- vmovdqa %ymm8, 0xa0(%rdi)
- vmovdqa %ymm9, 0xc0(%rdi)
- vmovdqa %ymm10, 0xe0(%rdi)
- retq
+MLK_ASM_FN_SIZE(tomont_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/x86_64_mulcache_twiddles.i b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/x86_64_mulcache_twiddles.i
deleted file mode 100644
index 51aeb01122..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/x86_64_mulcache_twiddles.i
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/*
- * WARNING: This file is auto-generated from scripts/autogen
- * Do not modify it directly.
- */
-
-/*
- * Table of twiddle values used in the AVX2 mulcache
- * See autogen for details.
- */
-
-- 1103,
- 555, -1251, 1550, 422, 177, -291, 1574, -246, 1159, -777, -602, -1590, -872, 418, -156, 430,
- 843, 871, 105, 587, -235, -460, 1653, 778, -147, 1483, 1119, 644, 349, 329, -75, 817, 603, 1322,
- -1465, -1215, 1218, -874, -1187, -1185, -1278, -1510, -870, -108, 996, 958, 1522, 1097, 610,
- -1285, 384, -136, -1335, 220, -1659, -1530, 794, -854, 478, -308, 991, -1460, 1628, -335,
- -11477, -32227, 20494, -27738, 945, -14883, 6182, 32010, 10631, 29175, -28762, -18486, 17560,
- -14430, -5276, 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, -32502, 30317, -18741,
- 12639, 20100, 18525, 19529, -12619, -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989,
- 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, 20297, 2146, 15355, -32384, -6280,
- -14903, -11044, 14469, -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/x86_64_zetas.i b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/x86_64_zetas.i
deleted file mode 100644
index c93ae01433..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/native/x86_64/src/x86_64_zetas.i
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/*
- * WARNING: This file is auto-generated from scripts/autogen
- * Do not modify it directly.
- */
-
-/*
- * Table of zeta values used in the AVX2 NTTs
- * See autogen for details.
- */
-
-31498, 31498, 31498, 31498, -758, -758, -758, -758, 0, 0, 0, 0, 0, 0, 0, 0,
- 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
- 14745, 14745, 14745, 14745, 14745, -359, -359, -359, -359, -359, -359, -359,
- -359, -359, -359, -359, -359, -359, -359, -359, -359, 13525, 13525, 13525,
- 13525, 13525, 13525, 13525, 13525, -12402, -12402, -12402, -12402, -12402,
- -12402, -12402, -12402, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493,
- 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, -20907, -20907, -20907,
- -20907, 27758, 27758, 27758, 27758, -3799, -3799, -3799, -3799, -15690,
- -15690, -15690, -15690, -171, -171, -171, -171, 622, 622, 622, 622, 1577,
- 1577, 1577, 1577, 182, 182, 182, 182, -5827, -5827, 17363, 17363, -26360,
- -26360, -29057, -29057, 5571, 5571, -1102, -1102, 21438, 21438, -26242,
- -26242, 573, 573, -1325, -1325, 264, 264, 383, 383, -829, -829, 1458, 1458,
- -1602, -1602, -130, -130, -5689, -6516, 1496, 30967, -23565, 20179, 20710,
- 25080, -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, 1223, 652,
- -552, 1015, -1293, 1491, -282, -1544, 516, -8, -320, -666, -1618, -1162,
- 126, 1469, -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, 32010,
- 10631, 29175, -28762, -18486, 17560, -14430, -5276, -1103, 555, -1251, 1550,
- 422, 177, -291, 1574, -246, 1159, -777, -602, -1590, -872, 418, -156, 11182,
- 13387, -14233, -21655, 13131, -4587, 23092, 5493, -32502, 30317, -18741,
- 12639, 20100, 18525, 19529, -12619, 430, 843, 871, 105, 587, -235, -460,
- 1653, 778, -147, 1483, 1119, 644, 349, 329, -75, 787, 787, 787, 787, 787,
- 787, 787, 787, 787, 787, 787, 787, 787, 787, 787, 787, -1517, -1517, -1517,
- -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
- -1517, -1517, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191,
- -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, 287, 287,
- 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690,
- 10690, 10690, 10690, 1358, 1358, 1358, 1358, -11202, -11202, -11202, -11202,
- 31164, 31164, 31164, 31164, 962, 962, 962, 962, -1202, -1202, -1202, -1202,
- -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, -28073, -28073, 24313,
- 24313, -10532, -10532, 8800, 8800, 18426, 18426, 8859, 8859, 26675, 26675,
- -16163, -16163, -681, -681, 1017, 1017, 732, 732, 608, 608, -1542, -1542,
- 411, 411, -205, -205, -1571, -1571, 19883, -28250, -15887, -8898, -28309,
- 9075, -30199, 18249, 13426, 14017, -29156, -12757, 16832, 4311, -24155,
- -17915, -853, -90, -271, 830, 107, -1421, -247, -951, -398, 961, -1508,
- -725, 448, -1065, 677, -1275, -31183, 25435, -7382, 24391, -20927, 10946,
- 24214, 16989, 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, 817,
- 603, 1322, -1465, -1215, 1218, -874, -1187, -1185, -1278, -1510, -870, -108,
- 996, 958, 1522, 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469,
- -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, 1097, 610,
- -1285, 384, -136, -1335, 220, -1659, -1530, 794, -854, 478, -308, 991,
- -1460, 1628,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/params.h
index 3f81bb0e2e..04598539c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/params.h
@@ -5,12 +5,6 @@
#ifndef MLK_PARAMS_H
#define MLK_PARAMS_H
-#if defined(MLK_CONFIG_FILE)
-#include MLK_CONFIG_FILE
-#else
-#include "config.h"
-#endif
-
#if !defined(MLK_CONFIG_PARAMETER_SET)
#error MLK_CONFIG_PARAMETER_SET is not defined
#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly.c
index 40d29948c8..564d5d712b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly.c
@@ -20,8 +20,7 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
-#include
+
#include "cbmc.h"
#include "debug.h"
#include "poly.h"
@@ -29,9 +28,6 @@
#include "symmetric.h"
#include "verify.h"
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT) || \
- !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_fqmul
*
@@ -68,10 +64,7 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_TOMONT || !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE \
- || !MLK_USE_NATIVE_NTT || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_barrett_reduce
*
@@ -107,7 +100,7 @@ __contract__(
* Here, we assume it's sign-preserving "arithmetic" shift right.
* See (C99 6.5.7 (5))
*/
- const int32_t t = (magic * a + (1 << 25)) >> 26;
+ const int32_t t = (magic * a + ((int32_t)1 << 25)) >> 26;
/*
* t is in -10 .. +10, so we need 32-bit math to
@@ -118,12 +111,14 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q_HALF);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_REDUCE || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT)
/* Reference: `poly_tomont()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_tomont(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_tomont_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+)
{
unsigned i;
const int16_t f = 1353; /* check-magic: 1353 == signed_mod(2^32, MLKEM_Q) */
@@ -137,16 +132,23 @@ void mlk_poly_tomont(mlk_poly *r)
mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_TOMONT */
+
MLK_INTERNAL_API
void mlk_poly_tomont(mlk_poly *r)
{
- mlk_poly_tomont_native(r->coeffs);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_TOMONT)
+ int ret;
+ ret = mlk_poly_tomont_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE)
+ mlk_poly_tomont_c(r);
+}
+
/************************************************************
* Name: mlk_scalar_signed_to_unsigned_q
*
@@ -162,7 +164,7 @@ void mlk_poly_tomont(mlk_poly *r)
* - Used here to implement different semantics of `poly_reduce()`;
* see below. in the reference implementation @[REF], this logic is
* part of all compression functions (see `compress.c`). */
-static MLK_INLINE uint16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
+static MLK_INLINE int16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
__contract__(
requires(c > -MLKEM_Q && c < MLKEM_Q)
ensures(return_value >= 0 && return_value < MLKEM_Q)
@@ -170,12 +172,14 @@ __contract__(
{
mlk_assert_abs_bound(&c, 1, MLKEM_Q);
- /* Add Q if c is negative, but in constant time */
- c = mlk_ct_sel_int16(c + MLKEM_Q, c, mlk_ct_cmask_neg_i16(c));
+ /* Add MLKEM_Q if c is negative, but in constant time.
+ *
+ * Note that c + MLKEM_Q does not overflow in int16_t,
+ * so the cast to uint16_t is safe. */
+ c = mlk_ct_sel_int16((int16_t)(c + MLKEM_Q), c, mlk_ct_cmask_neg_i16(c));
- /* and therefore cast to uint16_t is safe. */
mlk_assert_bound(&c, 1, 0, MLKEM_Q);
- return (uint16_t)c;
+ return c;
}
/* Reference: `poly_reduce()` in the reference implementation @[REF]
@@ -185,10 +189,15 @@ __contract__(
* here to go from signed to unsigned representatives.
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
-MLK_INTERNAL_API
-void mlk_poly_reduce(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_reduce_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
+
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
@@ -202,15 +211,23 @@ void mlk_poly_reduce(mlk_poly *r)
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_REDUCE */
+
MLK_INTERNAL_API
void mlk_poly_reduce(mlk_poly *r)
{
- mlk_poly_reduce_native(r->coeffs);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_REDUCE)
+ int ret;
+ ret = mlk_poly_reduce_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
+ mlk_poly_reduce_c(r);
+}
+
/* Reference: `poly_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
@@ -224,7 +241,8 @@ void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] + b->coeffs[i];
+ /* The preconditions imply that the addition stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] + b->coeffs[i]);
}
}
@@ -241,24 +259,24 @@ void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
+ /* The preconditions imply that the subtraction stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] - b->coeffs[i]);
}
}
-/* Include zeta table unless NTT, invNTT and mulcache computation
- * have been replaced by native implementations. */
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
#include "zetas.inc"
-#endif
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
-MLK_INTERNAL_API
-void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_mulcache_compute_c(mlk_poly_mulcache *x,
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 4; i++)
@@ -266,8 +284,11 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
invariant(i <= MLKEM_N / 4)
invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
{
- x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
- x->coeffs[2 * i + 1] = mlk_fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
+ x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], mlk_zetas[64 + i]);
+ /* The values in zeta table are <= MLKEM_Q in absolute value,
+ * so the negation in int16_t is safe. */
+ x->coeffs[2 * i + 1] =
+ mlk_fqmul(a->coeffs[4 * i + 3], (int16_t)(-mlk_zetas[64 + i]));
}
/*
@@ -278,15 +299,22 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
*/
mlk_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
MLK_INTERNAL_API
void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
{
- mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
-}
+#if defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+ int ret;
+ ret = mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-#if !defined(MLK_USE_NATIVE_NTT)
+ mlk_poly_mulcache_compute_c(x, a);
+}
+
/*
* Computes a block CT butterflies with a fixed twiddle factor,
* using Montgomery multiplication.
@@ -316,7 +344,8 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
static void mlk_ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
- unsigned start, unsigned len, int bound)
+ unsigned start, unsigned len,
+ unsigned bound)
__contract__(
requires(start < MLKEM_N)
requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
@@ -346,8 +375,9 @@ __contract__(
{
int16_t t;
t = mlk_fqmul(r[j + len], zeta);
- r[j + len] = r[j] - t;
- r[j] = r[j] + t;
+ /* The precondition implies that the arithmetic does not overflow. */
+ r[j + len] = (int16_t)(r[j] - t);
+ r[j] = (int16_t)(r[j] + t);
}
}
@@ -370,7 +400,7 @@ __contract__(
unsigned start, k, len;
/* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */
k = 1u << (layer - 1);
- len = MLKEM_N >> layer;
+ len = (unsigned)MLKEM_N >> layer;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
invariant(start < MLKEM_N + 2 * len)
@@ -378,7 +408,7 @@ __contract__(
invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
{
- int16_t zeta = zetas[k++];
+ int16_t zeta = mlk_zetas[k++];
mlk_ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
}
}
@@ -395,12 +425,19 @@ __contract__(
/* Reference: `ntt()` in the reference implementation @[REF].
* - Iterate over `layer` instead of `len` in the outer loop
* to simplify computation of zeta index. */
-MLK_INTERNAL_API
-void mlk_poly_ntt(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_ntt_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ requires(array_abs_bound(p->coeffs, 0, MLKEM_N, MLKEM_Q))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_NTT_BOUND))
+)
{
unsigned layer;
int16_t *r;
+
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+
r = p->coeffs;
for (layer = 1; layer <= 7; layer++)
@@ -414,18 +451,24 @@ void mlk_poly_ntt(mlk_poly *p)
/* Check the stronger bound */
mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_NTT */
MLK_INTERNAL_API
void mlk_poly_ntt(mlk_poly *p)
{
+#if defined(MLK_USE_NATIVE_NTT)
+ int ret;
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
- mlk_ntt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
-}
+ ret = mlk_ntt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_NTT */
-#if !defined(MLK_USE_NATIVE_INTT)
+ mlk_poly_ntt_c(p);
+}
+
/* Compute one layer of inverse NTT */
@@ -439,7 +482,7 @@ __contract__(
ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
unsigned start, k, len;
- len = (MLKEM_N >> layer);
+ len = (unsigned)MLKEM_N >> layer;
k = (1u << layer) - 1;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
@@ -449,7 +492,7 @@ __contract__(
invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
{
unsigned j;
- int16_t zeta = zetas[k--];
+ int16_t zeta = mlk_zetas[k--];
for (j = start; j < start + len; j++)
__loop__(
invariant(start <= j && j <= start + len)
@@ -457,8 +500,9 @@ __contract__(
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
int16_t t = r[j];
- r[j] = mlk_barrett_reduce(t + r[j + len]);
- r[j + len] = r[j + len] - t;
+ /* The preconditions imply that the arithmetic does not overflow. */
+ r[j] = mlk_barrett_reduce((int16_t)(t + r[j + len]));
+ r[j + len] = (int16_t)(r[j + len] - t);
r[j + len] = mlk_fqmul(r[j + len], zeta);
}
}
@@ -469,18 +513,22 @@ __contract__(
* while the reference implementation normalizes at
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
-MLK_INTERNAL_API
-void mlk_poly_invntt_tomont(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_invntt_tomont_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
+)
{
+ unsigned j, layer;
+ const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
+ int16_t *r = p->coeffs;
+
/*
* Scale input polynomial to account for Montgomery factor
* and NTT twist. This also brings coefficients down to
* absolute value < MLKEM_Q.
*/
- unsigned j, layer;
- const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
- int16_t *r = p->coeffs;
-
for (j = 0; j < MLKEM_N; j++)
__loop__(
invariant(j <= MLKEM_N)
@@ -500,16 +548,23 @@ void mlk_poly_invntt_tomont(mlk_poly *p)
mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_INTT */
MLK_INTERNAL_API
void mlk_poly_invntt_tomont(mlk_poly *p)
{
- mlk_intt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
-}
+#if defined(MLK_USE_NATIVE_INTT)
+ int ret;
+ ret = mlk_intt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_INTT */
+ mlk_poly_invntt_tomont_c(p);
+}
+
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(mlk_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly.h
index 20fb65e720..587062cce5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly.h
@@ -15,8 +15,7 @@
#ifndef MLK_POLY_H
#define MLK_POLY_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -46,34 +45,6 @@ typedef struct
int16_t coeffs[MLKEM_N >> 1];
} MLK_ALIGN mlk_poly_mulcache;
-/*************************************************
- * Name: mlk_cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- * input x in 0 .. 32767: returns value unchanged
- * input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
-{
- /*
- * PORTABILITY: This relies on uint16_t -> int16_t
- * being implemented as the inverse of int16_t -> uint16_t,
- * which is implementation-defined (C99 6.3.1.3 (3))
- * CBMC (correctly) fails to prove this conversion is OK,
- * so we have to suppress that check here
- */
- return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_montgomery_reduce
*
@@ -90,7 +61,7 @@ static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
static MLK_ALWAYS_INLINE int16_t mlk_montgomery_reduce(int32_t a)
__contract__(
requires(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
- a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
+ a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
/* We don't attempt to express an input-dependent output bound
* as the post-condition here. There are two call-sites for this
* function:
@@ -102,8 +73,8 @@ __contract__(
/* check-magic: 62209 == unsigned_mod(pow(MLKEM_Q, -1, 2^16), 2^16) */
const uint32_t QINV = 62209;
- /* Compute a*q^{-1} mod 2^16 in unsigned representatives */
- const uint16_t a_reduced = a & UINT16_MAX;
+ /* Compute a*q^{-1} mod 2^16 in unsigned representatives. */
+ const uint16_t a_reduced = mlk_cast_int32_to_uint16(a);
const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
/* Lift to signed canonical representative mod 2^16. */
@@ -187,7 +158,7 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_poly)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
);
#define mlk_poly_reduce MLK_NAMESPACE(poly_reduce)
@@ -280,7 +251,7 @@ __contract__(
requires(forall(k0, 0, MLKEM_N, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
requires(forall(k1, 0, MLKEM_N, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
ensures(forall(k, 0, MLKEM_N, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_poly_ntt MLK_NAMESPACE(poly_ntt)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly_k.c
index f15ab96ce7..32b214ee04 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly_k.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly_k.c
@@ -22,12 +22,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
+#include "poly_k.h"
-#include "compress.h"
#include "debug.h"
-#include "poly_k.h"
#include "sampling.h"
#include "symmetric.h"
@@ -37,6 +34,8 @@
* within a single compilation unit. */
#define mlk_poly_cbd_eta1 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta1)
#define mlk_poly_cbd_eta2 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta2)
+#define mlk_polyvec_basemul_acc_montgomery_cached_c \
+ MLK_ADD_PARAM_SET(mlk_polyvec_basemul_acc_montgomery_cached_c)
/* End of parameter set namespacing */
/* Reference: `polyvec_compress()` in the reference implementation @[REF]
@@ -46,29 +45,29 @@
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a[i]);
+ mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
}
}
/* Reference: `polyvec_decompress()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_decompress_du(&r[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+ mlk_poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_tobytes()` in the reference implementation @[REF].
@@ -77,41 +76,45 @@ void mlk_polyvec_decompress_du(mlk_polyvec r,
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, MLKEM_POLYVECBYTES))
+ invariant(i <= MLKEM_K)
+ )
{
- mlk_poly_tobytes(r + i * MLKEM_POLYBYTES, &a[i]);
+ mlk_poly_tobytes(&r[i * MLKEM_POLYBYTES], &a->vec[i]);
}
}
/* Reference: `polyvec_frombytes()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_frombytes(&r[i], a + i * MLKEM_POLYBYTES);
+ mlk_poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
/* Reference: `polyvec_ntt()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_ntt(&r[i]);
+ mlk_poly_ntt(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
}
/* Reference: `polyvec_invntt_tomont()` in the reference implementation @[REF].
@@ -120,18 +123,17 @@ void mlk_polyvec_ntt(mlk_polyvec r)
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_invntt_tomont(&r[i]);
+ mlk_poly_invntt_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
}
-#if !defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
/* Reference: `polyvec_basemul_acc_montgomery()` in the
* reference implementation @[REF].
* - We use a multiplication cache ('mulcache') here
@@ -143,13 +145,22 @@ void mlk_polyvec_invntt_tomont(mlk_polyvec r)
* at the end. The reference implementation uses 2 * MLKEM_K
* more modular reductions since it reduces after every modular
* multiplication. */
-MLK_INTERNAL_API
-void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+MLK_STATIC_TESTABLE void mlk_polyvec_basemul_acc_montgomery_cached_c(
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
+ requires(forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
@@ -163,53 +174,59 @@ void mlk_polyvec_basemul_acc_montgomery_cached(
t[1] <= ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
t[1] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768)))
{
- t[0] += (int32_t)a[k].coeffs[2 * i + 1] * b_cache[k].coeffs[i];
- t[0] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i];
- t[1] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i + 1];
- t[1] += (int32_t)a[k].coeffs[2 * i + 1] * b[k].coeffs[2 * i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b_cache->vec[k].coeffs[i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i + 1];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b->vec[k].coeffs[2 * i];
}
r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t[0]);
r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t[1]);
}
}
-#else /* !MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
{
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
- /* Omitting bounds assertion for cache since native implementations may
- * decide not to use a mulcache. Note that the C backend implementation
- * of poly_basemul_montgomery_cached() does still include the check. */
+#if defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+ {
+ int ret;
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
#if MLKEM_K == 2
- mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 3
- mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 4
- mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#endif
-}
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
+ }
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+ mlk_polyvec_basemul_acc_montgomery_cached_c(r, a, b, b_cache);
+}
+
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_mulcache_compute(&x[i], &a[i]);
+ mlk_poly_mulcache_compute(&x->vec[i], &a->vec[i]);
}
}
@@ -221,41 +238,53 @@ void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_reduce(&r[i]);
+ mlk_poly_reduce(&r->vec[i]);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(j0, i, MLKEM_K,
+ forall(k0, 0, MLKEM_N,
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX) &&
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] >= INT16_MIN))))
+ invariant(forall(j2, 0, i,
+ forall(k2, 0, MLKEM_N,
+ (r->vec[j2].coeffs[k2] <= INT16_MAX) &&
+ (r->vec[j2].coeffs[k2] >= INT16_MIN))))
+ )
{
- mlk_poly_add(&r[i], &b[i]);
+ mlk_poly_add(&r->vec[i], &b->vec[i]);
}
}
/* Reference: `polyvec_tomont()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_tomont(&r[i]);
+ mlk_poly_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLKEM_Q);
}
@@ -306,24 +335,41 @@ void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
{
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
+#else
+ mlk_prf_eta1(buf[0], extkey[0]);
+ mlk_prf_eta1(buf[1], extkey[1]);
+ mlk_prf_eta1(buf[2], extkey[2]);
+ if (r3 != NULL)
+ {
+ mlk_prf_eta1(buf[3], extkey[3]);
+ }
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
+
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
mlk_poly_cbd_eta1(r2, buf[2]);
- mlk_poly_cbd_eta1(r3, buf[3]);
+ if (r3 != NULL)
+ {
+ mlk_poly_cbd_eta1(r3, buf[3]);
+ mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+ }
mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
- mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -364,7 +410,7 @@ __contract__(
#endif
}
-/* Reference: `poly_getnoise_eta1()` in the reference implementation @[REF].
+/* Reference: `poly_getnoise_eta2()` in the reference implementation @[REF].
* - We include buffer zeroization. */
MLK_INTERNAL_API
void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
@@ -373,13 +419,13 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
MLK_ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
MLK_ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
- memcpy(extkey, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey, seed, MLKEM_SYMBYTES);
extkey[MLKEM_SYMBYTES] = nonce;
mlk_prf_eta2(buf, extkey);
mlk_poly_cbd_eta2(r, buf);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -391,7 +437,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
#if MLKEM_K == 2
/* Reference: Does not exist in the reference implementation @[REF].
* - This implements a x4-batched version of `poly_getnoise_eta1()`
- * and `poly_getnoise_eta1()` from the reference implementation,
+ * and `poly_getnoise_eta2()` from the reference implementation,
* leveraging batched Keccak-f1600.
* - If a x4-batched Keccak-f1600 is available, we squeeze
* more random data than needed for the eta2 calls, to be
@@ -409,10 +455,10 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
@@ -421,14 +467,16 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
/* On systems with fast batched Keccak, we use 4-fold batched PRF,
* even though that means generating more random data in buf[2] and buf[3]
* than necessary. */
-#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION)
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
#else
mlk_prf_eta1(buf[0], extkey[0]);
mlk_prf_eta1(buf[1], extkey[1]);
mlk_prf_eta2(buf[2], extkey[2]);
mlk_prf_eta2(buf[3], extkey[3]);
-#endif /* FIPS202_X4_DEFAULT_IMPLEMENTATION */
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
@@ -451,3 +499,4 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef mlk_poly_cbd_eta1
#undef mlk_poly_cbd_eta2
+#undef mlk_polyvec_basemul_acc_montgomery_cached_c
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly_k.h
index f7a40ff5f9..9089a8e431 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly_k.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/poly_k.h
@@ -15,7 +15,6 @@
#ifndef MLK_POLY_K_H
#define MLK_POLY_K_H
-#include
#include "common.h"
#include "compress.h"
#include "poly.h"
@@ -29,9 +28,20 @@
#define mlk_polyvec_mulcache MLK_ADD_PARAM_SET(mlk_polyvec_mulcache)
/* End of parameter set namespacing */
-typedef mlk_poly mlk_polyvec[MLKEM_K];
-typedef mlk_poly mlk_polymat[MLKEM_K * MLKEM_K];
-typedef mlk_poly_mulcache mlk_polyvec_mulcache[MLKEM_K];
+typedef struct
+{
+ mlk_poly vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec;
+
+typedef struct
+{
+ mlk_polyvec vec[MLKEM_K];
+} MLK_ALIGN mlk_polymat;
+
+typedef struct
+{
+ mlk_poly_mulcache vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec_mulcache;
#define mlk_poly_compress_du MLK_NAMESPACE_K(poly_compress_du)
/*************************************************
@@ -131,7 +141,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r)))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DV)))
{
#if MLKEM_DV == 4
mlk_poly_compress_d4(r, a);
@@ -168,7 +178,7 @@ static MLK_INLINE void mlk_poly_decompress_dv(
__contract__(
requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
{
#if MLKEM_DV == 4
@@ -200,13 +210,13 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
);
#define mlk_polyvec_decompress_du MLK_NAMESPACE_K(polyvec_decompress_du)
@@ -228,14 +238,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
__contract__(
requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_tobytes MLK_NAMESPACE_K(polyvec_tobytes)
@@ -256,13 +266,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECBYTES))
);
#define mlk_polyvec_frombytes MLK_NAMESPACE_K(polyvec_frombytes)
@@ -284,13 +294,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
);
#define mlk_polyvec_ntt MLK_NAMESPACE_K(polyvec_ntt)
@@ -313,14 +323,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
- assigns(object_whole(r))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
);
#define mlk_polyvec_invntt_tomont MLK_NAMESPACE_K(polyvec_invntt_tomont)
@@ -344,12 +354,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
);
#define mlk_polyvec_basemul_acc_montgomery_cached \
@@ -380,16 +390,16 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
requires(forall(k1, 0, MLKEM_K,
- array_bound(a[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(r))
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_polyvec_mulcache_compute MLK_NAMESPACE_K(polyvec_mulcache_compute)
@@ -423,11 +433,11 @@ __contract__(
* higher level safety proofs, and thus not part of the spec.
*/
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_polyvec_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_polyvec_mulcache)))
);
#define mlk_polyvec_reduce MLK_NAMESPACE_K(polyvec_reduce)
@@ -436,7 +446,7 @@ __contract__(
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials;
- * for details of the Barrett reduction see comments in reduce.c
+ * for details of the Barrett reduction see comments in poly.c
*
* Arguments: - mlk_polyvec r: pointer to input/output polynomial
*
@@ -453,12 +463,12 @@ __contract__(
* use of mlk_poly_reduce() in the context of (de)serialization.
*/
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_add MLK_NAMESPACE_K(polyvec_add)
@@ -485,17 +495,17 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(forall(j0, 0, MLKEM_K,
forall(k0, 0, MLKEM_N,
- (int32_t)r[j0].coeffs[k0] + b[j0].coeffs[k0] <= INT16_MAX)))
+ (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
requires(forall(j1, 0, MLKEM_K,
forall(k1, 0, MLKEM_N,
- (int32_t)r[j1].coeffs[k1] + b[j1].coeffs[k1] >= INT16_MIN)))
- assigns(object_whole(r))
+ (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
);
#define mlk_polyvec_tomont MLK_NAMESPACE_K(polyvec_tomont)
@@ -514,13 +524,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
assigns(memory_slice(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
);
#define mlk_poly_getnoise_eta1_4x MLK_NAMESPACE_K(poly_getnoise_eta1_4x)
@@ -531,7 +540,8 @@ __contract__(
* and nonces, with output polynomials close to centered binomial
* distribution with parameter MLKEM_ETA1.
*
- * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial
+ * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial. The last
+ * polynomial pointer may be NULL.
* - const uint8_t *seed: pointer to input seed
* (of length MLKEM_SYMBYTES bytes)
* - uint8_t nonce{0,1,2,3}: one-byte input nonce
@@ -555,16 +565,15 @@ __contract__(
requires(memory_no_alias(r0, sizeof(mlk_poly)))
requires(memory_no_alias(r1, sizeof(mlk_poly)))
requires(memory_no_alias(r2, sizeof(mlk_poly)))
- requires(memory_no_alias(r3, sizeof(mlk_poly)))
+ requires(r3 == NULL || memory_no_alias(r3, sizeof(mlk_poly)))
assigns(memory_slice(r0, sizeof(mlk_poly)))
assigns(memory_slice(r1, sizeof(mlk_poly)))
assigns(memory_slice(r2, sizeof(mlk_poly)))
- assigns(memory_slice(r3, sizeof(mlk_poly)))
- ensures(
- array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+ assigns(r3 != NULL: memory_slice(r3, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(r3 != NULL ==> array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
);
#if MLKEM_ETA1 == MLKEM_ETA2
@@ -604,7 +613,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
@@ -640,15 +649,19 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
uint8_t nonce0, uint8_t nonce1,
uint8_t nonce2, uint8_t nonce3)
__contract__(
- requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(mlk_poly)) && memory_no_alias(r2, 2 * sizeof(mlk_poly)) &&
- r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+ requires(memory_no_alias(r0, sizeof(mlk_poly)))
+ requires(memory_no_alias(r1, sizeof(mlk_poly)))
+ requires(memory_no_alias(r2, sizeof(mlk_poly)))
+ requires(memory_no_alias(r3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+ assigns(memory_slice(r0, sizeof(mlk_poly)))
+ assigns(memory_slice(r1, sizeof(mlk_poly)))
+ assigns(memory_slice(r2, sizeof(mlk_poly)))
+ assigns(memory_slice(r3, sizeof(mlk_poly)))
ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+ && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+ && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+ && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/randombytes.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/randombytes.h
index 132d920afb..3e841d26ca 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/randombytes.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/randombytes.h
@@ -5,18 +5,56 @@
#ifndef MLK_RANDOMBYTES_H
#define MLK_RANDOMBYTES_H
-#include
-#include
#include "cbmc.h"
#include "common.h"
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
#if !defined(MLK_CONFIG_CUSTOM_RANDOMBYTES)
-void randombytes(uint8_t *out, size_t outlen);
-static MLK_INLINE void mlk_randombytes(uint8_t *out, size_t outlen)
+/*************************************************
+ * Name: randombytes
+ *
+ * Description: Fill a buffer with cryptographically secure random bytes.
+ *
+ * mlkem-native does not provide an implementation of this
+ * function. It must be provided by the consumer.
+ *
+ * To use a custom random byte source with a different name
+ * or signature, set MLK_CONFIG_CUSTOM_RANDOMBYTES and define
+ * mlk_randombytes directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+int randombytes(uint8_t *out, size_t outlen);
+
+/*************************************************
+ * Name: mlk_randombytes
+ *
+ * Description: Internal wrapper around randombytes().
+ *
+ * Fill a buffer with cryptographically secure random bytes.
+ *
+ * This function can be replaced by setting
+ * MLK_CONFIG_CUSTOM_RANDOMBYTES and defining mlk_randombytes
+ * directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_randombytes(uint8_t *out, size_t outlen)
__contract__(
requires(memory_no_alias(out, outlen))
- assigns(memory_slice(out, outlen))) { randombytes(out, outlen); }
+ assigns(memory_slice(out, outlen))) { return randombytes(out, outlen); }
#endif /* !MLK_CONFIG_CUSTOM_RANDOMBYTES */
-
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
#endif /* !MLK_RANDOMBYTES_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sampling.c
index be5d931a79..945d12ed3d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sampling.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sampling.c
@@ -29,9 +29,10 @@
* in that it adds the offset and always expects the base of the
* target buffer. This avoids shifting the buffer base in the
* caller, which appears tricky to reason about. */
-static unsigned mlk_rej_uniform_scalar(int16_t *r, unsigned target,
- unsigned offset, const uint8_t *buf,
- unsigned buflen)
+MLK_STATIC_TESTABLE unsigned mlk_rej_uniform_c(int16_t *r, unsigned target,
+ unsigned offset,
+ const uint8_t *buf,
+ unsigned buflen)
__contract__(
requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
requires(memory_no_alias(r, sizeof(int16_t) * target))
@@ -39,11 +40,10 @@ __contract__(
requires(array_bound(r, 0, offset, 0, MLKEM_Q))
assigns(memory_slice(r, sizeof(int16_t) * target))
ensures(offset <= return_value && return_value <= target)
- ensures(array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
+ ensures(array_bound(r, 0, return_value, 0, MLKEM_Q)))
{
unsigned ctr, pos;
- uint16_t val0, val1;
+ int16_t val0, val1;
mlk_assert_bound(r, offset, 0, MLKEM_Q);
@@ -55,8 +55,8 @@ __contract__(
invariant(offset <= ctr && ctr <= target && pos <= buflen)
invariant(array_bound(r, 0, ctr, 0, MLKEM_Q)))
{
- val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
- val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+ val0 = ((buf[pos + 0] >> 0) | (buf[pos + 1] << 8)) & 0xFFF;
+ val1 = ((buf[pos + 1] >> 4) | (buf[pos + 2] << 4)) & 0xFFF;
pos += 3;
if (val0 < MLKEM_Q)
@@ -93,7 +93,7 @@ __contract__(
* Must be a multiple of 3.
*
* Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 128 is somewhat arbitrary but sufficient for all
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
* uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
*
* Returns the new offset of sampled 16-bit integers, at most target,
@@ -124,8 +124,9 @@ __contract__(
#if defined(MLK_USE_NATIVE_REJ_UNIFORM)
if (offset == 0)
{
- int ret = mlk_rej_uniform_native(r, target, buf, buflen);
- if (ret != -1)
+ int ret;
+ ret = mlk_rej_uniform_native(r, target, buf, buflen);
+ if (ret != MLK_NATIVE_FUNC_FALLBACK)
{
unsigned res = (unsigned)ret;
mlk_assert_bound(r, res, 0, MLKEM_Q);
@@ -134,19 +135,22 @@ __contract__(
}
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
- return mlk_rej_uniform_scalar(r, target, offset, buf, buflen);
+ return mlk_rej_uniform_c(r, target, offset, buf, buflen);
}
#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
- ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + MLK_XOF_RATE) / MLK_XOF_RATE)
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+ ((12 * MLKEM_N / 8 * ((uint32_t)1 << 12) / MLKEM_Q + MLK_XOF_RATE) / \
+ MLK_XOF_RATE)
#endif
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Reference: Does not exist in the reference implementation @[REF].
* - x4-batched version of `rej_uniform()` from the
* reference implementation, leveraging x4-batched Keccak-f1600. */
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
{
/* Temporary buffers for XOF output before rejection sampling */
@@ -167,10 +171,10 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
*/
mlk_xof_x4_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &statex);
buflen = MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE;
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, 0, buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, 0, buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, 0, buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, 0, buf[3], buflen);
/*
* So long as not all matrix entries have been generated, squeeze
@@ -180,20 +184,24 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
ctr[3] < MLKEM_N)
__loop__(
- assigns(ctr, statex, memory_slice(vec, sizeof(mlk_poly) * 4), object_whole(buf[0]),
- object_whole(buf[1]), object_whole(buf[2]), object_whole(buf[3]))
+ assigns(ctr, statex,
+ memory_slice(vec0, sizeof(mlk_poly)),
+ memory_slice(vec1, sizeof(mlk_poly)),
+ memory_slice(vec2, sizeof(mlk_poly)),
+ memory_slice(vec3, sizeof(mlk_poly)),
+ object_whole(buf))
invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
- invariant(array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
- invariant(array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
- invariant(array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
- invariant(array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+ invariant(array_bound(vec0->coeffs, 0, ctr[0], 0, MLKEM_Q))
+ invariant(array_bound(vec1->coeffs, 0, ctr[1], 0, MLKEM_Q))
+ invariant(array_bound(vec2->coeffs, 0, ctr[2], 0, MLKEM_Q))
+ invariant(array_bound(vec3->coeffs, 0, ctr[3], 0, MLKEM_Q)))
{
mlk_xof_x4_squeezeblocks(buf, 1, &statex);
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, ctr[0], buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, ctr[1], buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, ctr[2], buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, ctr[3], buf[3], buflen);
}
mlk_xof_x4_release(&statex);
@@ -202,6 +210,7 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
}
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
MLK_INTERNAL_API
void mlk_poly_rej_uniform(mlk_poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
@@ -284,7 +293,7 @@ void mlk_poly_cbd2(mlk_poly *r, const uint8_t buf[2 * MLKEM_N / 4])
{
const int16_t a = (d >> (4 * j + 0)) & 0x3;
const int16_t b = (d >> (4 * j + 2)) & 0x3;
- r->coeffs[8 * i + j] = a - b;
+ r->coeffs[8 * i + j] = (int16_t)(a - b);
}
}
}
@@ -336,7 +345,7 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4])
{
const int16_t a = (d >> (6 * j + 0)) & 0x7;
const int16_t b = (d >> (6 * j + 3)) & 0x7;
- r->coeffs[4 * i + j] = a - b;
+ r->coeffs[4 * i + j] = (int16_t)(a - b);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sampling.h
index 2cf43c889b..24c26b34a5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sampling.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sampling.h
@@ -15,8 +15,6 @@
#ifndef MLK_SAMPLING_H
#define MLK_SAMPLING_H
-#include <stddef.h>
-#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include "poly.h"
@@ -58,6 +56,7 @@ MLK_INTERNAL_API
void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_ETA1 == 3 */
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#define mlk_poly_rej_uniform_x4 MLK_NAMESPACE(poly_rej_uniform_x4)
/*************************************************
* Name: mlk_poly_rej_uniform_x4
@@ -65,8 +64,8 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
* Description: Generate four polynomials using rejection sampling
* on (pseudo-)uniformly random bytes sampled from a seed.
*
- * Arguments: - mlk_poly *vec:
- * Pointer to an array of 4 polynomials to be sampled.
+ * Arguments: - mlk_poly *vec0, *vec1, *vec2, *vec3:
+ * Pointers to 4 polynomials to be sampled.
* - uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)]:
* Pointer consecutive array of seed buffers of size
* MLKEM_SYMBYTES + 2 each, plus padding for alignment.
@@ -75,16 +74,24 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
*
**************************************************/
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
__contract__(
- requires(memory_no_alias(vec, sizeof(mlk_poly) * 4))
+ requires(memory_no_alias(vec0, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec1, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec2, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, 4 * MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)))
- assigns(memory_slice(vec, sizeof(mlk_poly) * 4))
- ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+ assigns(memory_slice(vec0, sizeof(mlk_poly)))
+ assigns(memory_slice(vec1, sizeof(mlk_poly)))
+ assigns(memory_slice(vec2, sizeof(mlk_poly)))
+ assigns(memory_slice(vec3, sizeof(mlk_poly)))
+ ensures(array_bound(vec0->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec1->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec2->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec3->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
#define mlk_poly_rej_uniform MLK_NAMESPACE(poly_rej_uniform)
/*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/symmetric.h
index 985bfeab37..68d7e1a0cd 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/symmetric.h
@@ -15,12 +15,13 @@
#ifndef MLK_SYMMETRIC_H
#define MLK_SYMMETRIC_H
-#include <stddef.h>
-#include <stdint.h>
+
#include "cbmc.h"
#include "common.h"
#include MLK_FIPS202_HEADER_FILE
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#include MLK_FIPS202X4_HEADER_FILE
+#endif
/* Macros denoting FIPS 203 specific Hash functions */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sys.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sys.h
index 8f690cc553..0ab8947318 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sys.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/sys.h
@@ -20,6 +20,15 @@
#error "__BYTE_ORDER__ defined, but don't recognize value."
#endif
#endif /* __BYTE_ORDER__ */
+
+/* MSVC does not define __BYTE_ORDER__. However, MSVC only supports
+ * little endian x86, x86_64, and AArch64. It is, hence, safe to assume
+ * little endian. */
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || \
+ defined(_M_IX86) || defined(_M_ARM64))
+#define MLK_SYS_LITTLE_ENDIAN
+#endif
+
#endif /* !MLK_SYS_LITTLE_ENDIAN && !MLK_SYS_BIG_ENDIAN */
/* Check if we're running on an AArch64 little endian system. _M_ARM64 is set by
@@ -33,6 +42,11 @@
#define MLK_SYS_AARCH64_EB
#endif
+/* Check if we're running on an Armv8.1-M system with MVE */
+#if defined(__ARM_ARCH_8_1M_MAIN__) || defined(__ARM_FEATURE_MVE)
+#define MLK_SYS_ARMV81M_MVE
+#endif
+
#if defined(__x86_64__)
#define MLK_SYS_X86_64
#if defined(__AVX2__)
@@ -48,6 +62,11 @@
#define MLK_SYS_RISCV64
#endif
+#if defined(MLK_SYS_RISCV64) && defined(__riscv_vector) && \
+ defined(__riscv_v_intrinsic)
+#define MLK_SYS_RISCV64_RVV
+#endif
+
#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 32
#define MLK_SYS_RISCV32
#endif
@@ -56,6 +75,14 @@
#define MLK_SYS_WINDOWS
#endif
+#if defined(__linux__)
+#define MLK_SYS_LINUX
+#endif
+
+#if defined(__APPLE__)
+#define MLK_SYS_APPLE
+#endif
+
#if defined(MLK_FORCE_AARCH64) && !defined(MLK_SYS_AARCH64)
#error "MLK_FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
#endif
@@ -82,34 +109,46 @@
#endif
/*
- * C90 does not have the inline compiler directive yet.
- * We don't use it in C90 builds.
- * However, in that case the compiler warns about some inline functions in
- * header files not being used in every compilation unit that includes that
- * header. To work around it we silence that warning in that case using
- * __attribute__((unused)).
+ * MLK_INLINE: Hint for inlining.
+ * - MSVC: __inline
+ * - C99+: inline
+ * - GCC/Clang C90: __attribute__((unused)) to silence warnings
+ * - Other C90: empty
*/
-
-/* Do not use inline for C90 builds*/
#if !defined(MLK_INLINE)
-#if !defined(inline)
#if defined(_MSC_VER)
#define MLK_INLINE __inline
-/* Don't combine __inline and __forceinline */
-#define MLK_ALWAYS_INLINE __forceinline
-#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#elif defined(inline) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
#define MLK_INLINE inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define MLK_INLINE __attribute__((unused))
+#else
+#define MLK_INLINE
+#endif
+#endif /* !MLK_INLINE */
+
+/*
+ * MLK_ALWAYS_INLINE: Force inlining.
+ * - MSVC: __forceinline
+ * - GCC/Clang C99+: MLK_INLINE __attribute__((always_inline))
+ * - Other: MLK_INLINE (no forced inlining)
+ */
+#if !defined(MLK_ALWAYS_INLINE)
+#if defined(_MSC_VER)
+#define MLK_ALWAYS_INLINE __forceinline
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+ (defined(inline) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L))
#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
#else
-#define MLK_INLINE __attribute__((unused))
#define MLK_ALWAYS_INLINE MLK_INLINE
#endif
+#endif /* !MLK_ALWAYS_INLINE */
-#else /* !inline */
-#define MLK_INLINE inline
-#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
-#endif /* inline */
-#endif /* !MLK_INLINE */
+#ifndef MLK_STATIC_TESTABLE
+#define MLK_STATIC_TESTABLE static
+#endif
/*
* C90 does not have the restrict compiler directive yet.
@@ -181,10 +220,41 @@
} while (0)
#endif /* !(MLK_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__) */
-#if defined(__GNUC__) || defined(clang)
+#if defined(__GNUC__) || defined(__clang__)
#define MLK_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
#else
#define MLK_MUST_CHECK_RETURN_VALUE
#endif
+#if !defined(__ASSEMBLER__)
+/* System capability enumeration */
+typedef enum
+{
+ /* x86_64 */
+ MLK_SYS_CAP_AVX2,
+ /* AArch64 */
+ MLK_SYS_CAP_SHA3
+} mlk_sys_cap;
+
+#if !defined(MLK_CONFIG_CUSTOM_CAPABILITY_FUNC)
+#include "cbmc.h"
+
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_sys_check_capability(mlk_sys_cap cap)
+__contract__(
+ ensures(return_value == 0 || return_value == 1)
+)
+{
+ /* By default, we rely on compile-time feature detection/specification:
+ * If a feature is enabled at compile-time, we assume it is supported by
+ * the host that the resulting library/binary will be built on.
+ * If this assumption is not true, you MUST overwrite this function.
+ * See the documentation of MLK_CONFIG_CUSTOM_CAPABILITY_FUNC in
+ * mlkem_native_config.h for more information. */
+ (void)cap;
+ return 1;
+}
+#endif /* !MLK_CONFIG_CUSTOM_CAPABILITY_FUNC */
+#endif /* !__ASSEMBLER__ */
+
#endif /* !MLK_SYS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/verify.h
index 85626c15ea..a9bdeaab30 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/verify.h
@@ -30,9 +30,7 @@
#ifndef MLK_VERIFY_H
#define MLK_VERIFY_H
-#include <limits.h>
-#include <stdint.h>
-#include <string.h>
+
#include "cbmc.h"
#include "common.h"
@@ -115,92 +113,83 @@ __contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_u8(
static MLK_INLINE uint32_t mlk_value_barrier_u32(uint32_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
static MLK_INLINE int32_t mlk_value_barrier_i32(int32_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
static MLK_INLINE uint8_t mlk_value_barrier_u8(uint8_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
#endif /* MLK_USE_ASM_VALUE_BARRIER */
-/*
- * The ct_cmask_nonzero_xxx functions below make deliberate use of unsigned
- * overflow, which is fully defined behaviour in C. It is thus safe to disable
- * this warning.
- */
#ifdef CBMC
#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
+#pragma CPROVER check disable "conversion"
#endif
-
/*************************************************
- * Name: mlk_ct_cmask_nonzero_u16
+ * Name: mlk_cast_uint16_to_int16
*
- * Description: Return 0 if input is zero, and -1 otherwise.
+ * Description: Cast uint16 value to int16
*
- * Arguments: uint16_t x: Value to be converted into a mask
+ * Returns: For uint16_t x, the unique y in int16_t
+ * so that x == y mod 2^16.
+ *
+ * Concretely:
+ * - x < 32768: returns x
+ * - x >= 32768: returns x - 65536
*
**************************************************/
-
-/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
- * - Use value barrier and shift instead of `b = -b` to
- * convert condition into mask. */
-static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
-__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
{
- uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
- tmp >>= 16;
- return tmp;
+ /*
+ * PORTABILITY: This relies on uint16_t -> int16_t
+ * being implemented as the inverse of int16_t -> uint16_t,
+ * which is implementation-defined (C99 6.3.1.3 (3))
+ * CBMC (correctly) fails to prove this conversion is OK,
+ * so we have to suppress that check here
+ */
+ return (int16_t)x;
}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
/*************************************************
- * Name: mlk_ct_cmask_nonzero_u8
+ * Name: mlk_cast_int32_to_uint16
*
- * Description: Return 0 if input is zero, and -1 otherwise.
- *
- * Arguments: uint8_t x: Value to be converted into a mask
+ * Description: Cast int32 value to uint16 as per C standard.
*
+ * Returns: For int32_t x, the unique y in uint16_t
+ * so that x == y mod 2^16.
**************************************************/
-
-/* Reference: Embedded in `verify()` and `cmov()` in the
- * reference implementation @[REF].
- * - We include a value barrier not present in the
- * reference implementation, to prevent the compiler
- * from realizing that this function returns a mask. */
-static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
-__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int32_to_uint16(int32_t x)
{
- uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
- tmp >>= 24;
- return tmp;
+ return (uint16_t)(x & (int32_t)UINT16_MAX);
}
-/* Put unsigned overflow warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*
- * The mlk_ct_cmask_neg_i16 function below makes deliberate use of
- * signed to unsigned integer conversion, which is fully defined
- * behaviour in C. It is thus safe to disable this warning.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
+/*************************************************
+ * Name: mlk_cast_int16_to_uint16
+ *
+ * Description: Cast int16 value to uint16 as per C standard.
+ *
+ * Returns: For int16_t x, the unique y in uint16_t
+ * so that x == y mod 2^16.
+ **************************************************/
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int16_to_uint16(int32_t x)
+{
+ return mlk_cast_int32_to_uint16((int32_t)x);
+}
/*************************************************
* Name: mlk_ct_cmask_neg_i16
@@ -225,24 +214,49 @@ __contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
{
int32_t tmp = mlk_value_barrier_i32((int32_t)x);
tmp >>= 16;
- return (int16_t)tmp;
+ return mlk_cast_int32_to_uint16(tmp);
}
-/* Put unsigned-to-signed warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
+/*************************************************
+ * Name: mlk_ct_cmask_nonzero_u16
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments: uint16_t x: Value to be converted into a mask
+ *
+ **************************************************/
-/*
- * The ct_csel_xxx functions below make deliberate use of unsigned
- * to signed integer conversion, which is implementation-defined
- * behaviour. Here, we assume that uint16_t -> int16_t is inverse
- * to int16_t -> uint16_t.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
+/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
+ * - Use value barrier and shift instead of `b = -b` to
+ * convert condition into mask. */
+static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+{
+ int32_t tmp = mlk_value_barrier_i32(-((int32_t)x));
+ tmp >>= 16;
+ return mlk_cast_int32_to_uint16(tmp);
+}
+
+/*************************************************
+ * Name: mlk_ct_cmask_nonzero_u8
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments: uint8_t x: Value to be converted into a mask
+ *
+ **************************************************/
+
+/* Reference: Embedded in `verify()` and `cmov()` in the
+ * reference implementation @[REF].
+ * - We include a value barrier not present in the
+ * reference implementation, to prevent the compiler
+ * from realizing that this function returns a mask. */
+static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+{
+ uint16_t mask = mlk_ct_cmask_nonzero_u16((uint16_t)x);
+ return (uint8_t)(mask & 0xFF);
+}
/*************************************************
* Name: mlk_ct_sel_int16
@@ -280,16 +294,12 @@ __contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
static MLK_INLINE int16_t mlk_ct_sel_int16(int16_t a, int16_t b, uint16_t cond)
__contract__(ensures(return_value == (cond ? a : b)))
{
- uint16_t au = a, bu = b;
+ uint16_t au = mlk_cast_int16_to_uint16(a);
+ uint16_t bu = mlk_cast_int16_to_uint16(b);
uint16_t res = bu ^ (mlk_ct_cmask_nonzero_u16(cond) & (au ^ bu));
- return (int16_t)res;
+ return mlk_cast_uint16_to_int16(res);
}
-/* Put unsigned-to-signed warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_ct_sel_uint8
*
@@ -318,9 +328,11 @@ __contract__(ensures(return_value == (cond ? a : b)))
*
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
- * size_t len: length of the byte arrays
+ * size_t len: length of the byte arrays, upper-bounded
+ * to UINT16_MAX to control proof complexity
+ * only.
*
- * Returns 0 if the byte arrays are equal, a non-zero value otherwise
+ * Returns 0 if the byte arrays are equal, 0xFF otherwise.
*
* Specification:
* - Used to securely compute conditional move in
@@ -338,9 +350,10 @@ __contract__(ensures(return_value == (cond ? a : b)))
static MLK_INLINE uint8_t mlk_ct_memcmp(const uint8_t *a, const uint8_t *b,
const size_t len)
__contract__(
+ requires(len <= UINT16_MAX)
requires(memory_no_alias(a, len))
requires(memory_no_alias(b, len))
- requires(len <= INT_MAX)
+ ensures((return_value == 0) || (return_value == 0xFF))
ensures((return_value == 0) == forall(i, 0, len, (a[i] == b[i]))))
{
uint8_t r = 0, s = 0;
@@ -391,13 +404,17 @@ __contract__(
static MLK_INLINE void mlk_ct_cmov_zero(uint8_t *r, const uint8_t *x,
size_t len, uint8_t b)
__contract__(
+ requires(len <= MLK_MAX_BUFFER_SIZE)
requires(memory_no_alias(r, len))
requires(memory_no_alias(x, len))
- assigns(memory_slice(r, len)))
+ assigns(memory_slice(r, len))
+ ensures(forall(i, 0, len, (r[i] == (b == 0 ? x[i] : old(r)[i])))))
{
size_t i;
for (i = 0; i < len; i++)
- __loop__(invariant(i <= len))
+ __loop__(
+ invariant(i <= len)
+ invariant(forall(k, 0, i, r[k] == (b == 0 ? x[k] : loop_entry(r)[k]))))
{
r[i] = mlk_ct_sel_uint8(r[i], x[i], b);
}
@@ -431,13 +448,13 @@ __contract__(
requires(memory_no_alias(ptr, len))
assigns(memory_slice(ptr, len)))
{
- memset(ptr, 0, len);
+ mlk_memset(ptr, 0, len);
/* This follows OpenSSL and seems sufficient to prevent the compiler
* from optimizing away the memset.
*
* If there was a reliable way to detect availability of memset_s(),
* that would be preferred. */
- __asm__ __volatile__("" : : "r"(ptr) : "memory");
+ __asm__ volatile("" : : "r"(ptr) : "memory");
}
#else /* !MLK_SYS_WINDOWS && MLK_HAVE_INLINE_ASM */
#error No plausibly-secure implementation of mlk_zeroize available. Please provide your own using MLK_CONFIG_CUSTOM_ZEROIZE.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/zetas.inc b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/zetas.inc
index 0c00b5b905..00316daf67 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/zetas.inc
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/mlkem/src/zetas.inc
@@ -5,16 +5,16 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
-#include <stdint.h>
/*
* Table of zeta values used in the reference NTT and inverse NTT.
* See autogen for details.
*/
-static MLK_ALIGN const int16_t zetas[128] = {
+static MLK_ALIGN const int16_t mlk_zetas[128] = {
-1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577,
182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458,
-1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/integration/liboqs/config_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/integration/liboqs/config_aarch64.h
index 65fe4bb4b7..29f27388a9 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/integration/liboqs/config_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/integration/liboqs/config_aarch64.h
@@ -8,13 +8,23 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*/
#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_AARCH64_H
#define MLK_INTEGRATION_LIBOQS_CONFIG_AARCH64_H
+/* Enable valgrind-based assertions in mlkem-native through macro
+ * from libOQS. */
+#if !defined(__ASSEMBLER__)
+#include <oqs/common.h>
+#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
+#define MLK_CONFIG_CT_TESTING_ENABLED
+#endif
+#endif /* !__ASSEMBLER__ */
+
/******************************************************************************
* Name: MLK_CONFIG_PARAMETER_SET
*
@@ -172,7 +182,7 @@
* consumer.
*
* If this option is not set, mlkem-native expects a function
- * void randombytes(uint8_t *out, size_t outlen).
+ * int randombytes(uint8_t *out, size_t outlen).
*
* Set this option and define `mlk_randombytes` if you want to
* use a custom method to sample randombytes with a different name
@@ -184,9 +194,10 @@
#include <oqs/rand.h>
#include <stdint.h>
#include "../../mlkem/src/sys.h"
-static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
+static MLK_INLINE int mlk_randombytes(uint8_t *ptr, size_t len)
{
OQS_randombytes(ptr, len);
+ return 0;
}
#endif /* !__ASSEMBLER__ */
@@ -250,13 +261,4 @@ static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
#endif
*/
-/* Enable valgrind-based assertions in mlkem-native through macro
- * from libOQS. */
-#if !defined(__ASSEMBLER__)
-#include <oqs/common.h>
-#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
-#define MLK_CONFIG_CT_TESTING_ENABLED
-#endif
-#endif /* !__ASSEMBLER__ */
-
#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_AARCH64_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/cbmc.h
index 650d32b95b..80e1a36fc7 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/cbmc.h
@@ -8,7 +8,6 @@
/***************************************************
* Basic replacements for __CPROVER_XXX contracts
***************************************************/
-
#ifndef CBMC
#define __contract__(x)
@@ -16,6 +15,7 @@
#else /* !CBMC */
+
#define __contract__(x) x
#define __loop__(x) x
@@ -49,7 +49,6 @@
*/
#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
-#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
/*
* Pointer-related predicates
@@ -59,6 +58,17 @@
#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+/* Maximum supported buffer size
+ *
+ * Larger buffers may be supported, but due to internal modeling constraints
+ * in CBMC, the proofs of memory- and type-safety won't be able to run.
+ *
+ * If you find yourself in need for a buffer size larger than this,
+ * please contact the maintainers, so we can prioritize work to relax
+ * this somewhat artificial bound.
+ */
+#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12)
+
/*
* History variables
* https://diffblue.github.io/cbmc/contracts-history-variables.html
@@ -83,7 +93,7 @@
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate) \
}
-#define EXISTS(qvar, qvar_lb, qvar_ub, predicate) \
+#define exists(qvar, qvar_lb, qvar_ub, predicate) \
__CPROVER_exists \
{ \
unsigned qvar; \
@@ -118,13 +128,35 @@
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
- (((int)(value_lb) <= ((array_var)[(qvar)])) && \
- (((array_var)[(qvar)]) < (int)(value_ub))) \
+ (((int)(value_lb) <= ((array_var)[(qvar)])) && \
+ (((array_var)[(qvar)]) < (int)(value_ub))) \
}
-#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
- array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb), \
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
+ array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb), \
(qvar_ub), (array_var), (value_lb), (value_ub))
+
+#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (int16_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged(array_var, N) \
+ array_unchanged_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
+
+#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (uint64_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged_u64(array_var, N) \
+ array_unchanged_u64_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
/* clang-format on */
/* Wrapper around array_bound operating on absolute values.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/common.h
index 9de9875556..bc4e9ed72c 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/common.h
@@ -5,10 +5,16 @@
#ifndef MLK_COMMON_H
#define MLK_COMMON_H
+#ifndef __ASSEMBLER__
+#include <stdint.h>
+#endif
+
+#define MLK_BUILD_INTERNAL
+
#if defined(MLK_CONFIG_FILE)
#include MLK_CONFIG_FILE
#else
-#include "config.h"
+#include "mlkem_native_config.h"
#endif
#include "params.h"
@@ -28,15 +34,11 @@
#define MLK_EXTERNAL_API MLK_CONFIG_EXTERNAL_API_QUALIFIER
#endif
-#if defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) || \
- defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED)
-#define MLK_MULTILEVEL_BUILD
-#endif
-
#define MLK_CONCAT_(x1, x2) x1##x2
#define MLK_CONCAT(x1, x2) MLK_CONCAT_(x1, x2)
-#if defined(MLK_MULTILEVEL_BUILD)
+#if (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || \
+ defined(MLK_CONFIG_MULTILEVEL_NO_SHARED))
#define MLK_ADD_PARAM_SET(s) MLK_CONCAT(s, MLK_CONFIG_PARAMETER_SET)
#else
#define MLK_ADD_PARAM_SET(s) s
@@ -49,7 +51,7 @@
/* Functions are prefixed by MLK_CONFIG_NAMESPACE_PREFIX.
*
* If multiple parameter sets are used, functions depending on the parameter
- * set are additionally prefixed with 512/768/1024. See config.h.
+ * set are additionally prefixed with 512/768/1024. See mlkem_native_config.h.
*
* Example: If MLK_CONFIG_NAMESPACE_PREFIX is mlkem, then
* MLK_NAMESPACE_K(enc) becomes mlkem512_enc/mlkem768_enc/mlkem1024_enc.
@@ -73,8 +75,24 @@
*/
#if defined(MLK_SYS_X86_64)
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) : MLK_CET_ENDBR
-#else
+#elif defined(MLK_SYS_ARMV81M_MVE)
+/* clang-format off */
+#define MLK_ASM_FN_SYMBOL(sym) \
+ .type MLK_ASM_NAMESPACE(sym), %function; \
+ MLK_ASM_NAMESPACE(sym) :
+/* clang-format on */
+#else /* !MLK_SYS_X86_64 && MLK_SYS_ARMV81M_MVE */
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) :
+#endif /* !MLK_SYS_X86_64 && !MLK_SYS_ARMV81M_MVE */
+
+/*
+ * Output the size of an assembly function.
+ */
+#if defined(__ELF__)
+#define MLK_ASM_FN_SIZE(sym) \
+ .size MLK_ASM_NAMESPACE(sym), .- MLK_ASM_NAMESPACE(sym)
+#else
+#define MLK_ASM_FN_SIZE(sym)
#endif
/* We aim to simplify the user's life by supporting builds where
@@ -99,6 +117,10 @@
#error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLK_CONFIG_FIPS202_BACKEND_FILE is not.
#endif
+#if defined(MLK_CONFIG_NO_RANDOMIZED_API) && defined(MLK_CONFIG_KEYGEN_PCT)
+#error Bad configuration: MLK_CONFIG_NO_RANDOMIZED_API is incompatible with MLK_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_kem_enc()
+#endif
+
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
#include MLK_CONFIG_ARITH_BACKEND_FILE
/* Include to enforce consistency of API and implementation,
@@ -135,20 +157,118 @@
#define MLK_FIPS202X4_HEADER_FILE MLK_CONFIG_FIPS202X4_CUSTOM_HEADER
#endif
-/* Just in case we want to include mlkem_native.h, set the configuration
- * for that header in accordance with the configuration used here. */
+/* Standard library function replacements */
+#if !defined(__ASSEMBLER__)
+#if !defined(MLK_CONFIG_CUSTOM_MEMCPY)
+#include <string.h>
+#define mlk_memcpy memcpy
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_MEMSET)
+#include <string.h>
+#define mlk_memset memset
+#endif
+
+
+/* Allocation macros for large local structures
+ *
+ * MLK_ALLOC(v, T, N) declares T *v and attempts to point it to an T[N]
+ * MLK_FREE(v, T, N) zeroizes and frees the allocation
+ *
+ * Default implementation uses stack allocation.
+ * Can be overridden by setting the config option MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * and defining MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE.
+ */
+#if defined(MLK_CONFIG_CUSTOM_ALLOC_FREE) != \
+ (defined(MLK_CUSTOM_ALLOC) && defined(MLK_CUSTOM_FREE))
+#error Bad configuration: MLK_CONFIG_CUSTOM_ALLOC_FREE must be set together with MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE
+#endif
+
+/*
+ * If the integration wants to provide a context parameter for use in
+ * platform-specific hooks, then it should define this parameter.
+ *
+ * The MLK_CONTEXT_PARAMETERS_n macros are intended to be used with macros
+ * defining the function names and expand to either pass or discard the context
+ * argument as required by the current build. If there is no context parameter
+ * requested then these are removed from the prototypes and from all calls.
+ */
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+#define MLK_CONTEXT_PARAMETERS_0(context) (context)
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0, context)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1, context)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) \
+ (arg0, arg1, arg2, context)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3, context)
+#else /* MLK_CONFIG_CONTEXT_PARAMETER */
+#define MLK_CONTEXT_PARAMETERS_0(context) ()
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) (arg0, arg1, arg2)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3)
+#endif /* !MLK_CONFIG_CONTEXT_PARAMETER */
+
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER_TYPE) != \
+ defined(MLK_CONFIG_CONTEXT_PARAMETER)
+#error MLK_CONFIG_CONTEXT_PARAMETER_TYPE must be defined if and only if MLK_CONFIG_CONTEXT_PARAMETER is defined
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_ALLOC_FREE)
+/* Default: stack allocation */
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_ALIGN T mlk_alloc_##v[N]; \
+ T *v = mlk_alloc_##v
+
+/* TODO: This leads to a circular dependency between common and verify.h
+ * It just works out before we're at the end of the file, but it's still
+ * prone to issues in the future. */
+#include "verify.h"
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ mlk_zeroize(mlk_alloc_##v, sizeof(mlk_alloc_##v)); \
+ (v) = NULL; \
+ } while (0)
+
+#else /* !MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/* Custom allocation */
+
+/*
+ * The indirection here is necessary to use MLK_CONTEXT_PARAMETERS_3 here.
+ */
+#define MLK_APPLY(f, args) f args
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_APPLY(MLK_CUSTOM_ALLOC, MLK_CONTEXT_PARAMETERS_3(v, T, N, context))
+
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ if (v != NULL) \
+ { \
+ mlk_zeroize(v, sizeof(T) * (N)); \
+ MLK_APPLY(MLK_CUSTOM_FREE, MLK_CONTEXT_PARAMETERS_3(v, T, N, context)); \
+ v = NULL; \
+ } \
+ } while (0)
+
+#endif /* MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/****************************** Error codes ***********************************/
-/* Double-check that this is not conflicting with pre-existing definitions. */
-#if defined(MLK_CONFIG_API_PARAMETER_SET) || \
- defined(MLK_CONFIG_API_NAMESPACE_PREFIX) || \
- defined(MLK_CONFIG_API_NO_SUPERCOP) || \
- defined(MLK_CONFIG_API_CONSTANTS_ONLY)
-#error Pre-existing MLK_CONFIG_API_XXX configuration is neither useful nor allowed during an mlkem-native build
-#endif /* MLK_CONFIG_API_PARAMETER_SET || MLK_CONFIG_API_NAMESPACE_PREFIX || \
- MLK_CONFIG_API_NO_SUPERCOP || MLK_CONFIG_API_CONSTANTS_ONLY */
+/* Generic failure condition */
+#define MLK_ERR_FAIL -1
+/* An allocation failed. This can only happen if MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * is defined and the provided MLK_CUSTOM_ALLOC can fail. */
+#define MLK_ERR_OUT_OF_MEMORY -2
+/* An RNG failure occurred. Might be due to insufficient entropy or
+ * system misconfiguration. */
+#define MLK_ERR_RNG_FAIL -3
-#define MLK_CONFIG_API_PARAMETER_SET MLK_CONFIG_PARAMETER_SET
-#define MLK_CONFIG_API_NAMESPACE_PREFIX \
- MLK_ADD_PARAM_SET(MLK_CONFIG_NAMESPACE_PREFIX)
+#endif /* !__ASSEMBLER__ */
#endif /* !MLK_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/compress.c
index d7ff2bbe7a..50da36d0e4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/compress.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/compress.c
@@ -20,24 +20,27 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
-#include
+
#include "cbmc.h"
#include "compress.h"
#include "debug.h"
#include "verify.h"
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d4_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -55,32 +58,51 @@ void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
t[j] = mlk_scalar_compress_d4(a->coeffs[8 * i + j]);
}
- r[i * 4] = t[0] | (t[1] << 4);
- r[i * 4 + 1] = t[2] | (t[3] << 4);
- r[i * 4 + 2] = t[4] | (t[5] << 4);
- r[i * 4 + 3] = t[6] | (t[7] << 4);
+ /* All t[i] are 4-bit wide, so the truncations don't alter the value. */
+ r[i * 4] = (uint8_t)(t[0] | (t[1] << 4));
+ r[i * 4 + 1] = (uint8_t)(t[2] | (t[3] << 4));
+ r[i * 4 + 2] = (uint8_t)(t[4] | (t[5] << 4));
+ r[i * 4 + 3] = (uint8_t)(t[6] | (t[7] << 4));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d4_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d4_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ mlk_poly_compress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d10_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -101,29 +123,47 @@ void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 10-bit in size.
*/
- r[5 * j + 0] = (t[0] >> 0) & 0xFF;
- r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
- r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
- r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
- r[5 * j + 4] = (t[3] >> 2);
+ r[5 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 2) & 0xFF));
+ r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] << 4) & 0xFF));
+ r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] << 6) & 0xFF));
+ r[5 * j + 4] = (uint8_t)(t[3] >> 2);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d10_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d10_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ mlk_poly_compress_d10_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d4(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d4_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -137,22 +177,40 @@ void mlk_poly_decompress_d4(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d4(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d4_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ int ret;
+ ret = mlk_poly_decompress_d4_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ mlk_poly_decompress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d10(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d10_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 4; j++)
@@ -180,28 +238,46 @@ void mlk_poly_decompress_d10(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d10(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d10_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ int ret;
+ ret = mlk_poly_decompress_d10_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
+ mlk_poly_decompress_d10_c(r, a);
+}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d5_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -219,38 +295,51 @@ void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
t[j] = mlk_scalar_compress_d5(a->coeffs[8 * i + j]);
}
- /*
- * Explicitly truncate to avoid warning about
- * implicit truncation in CBMC, and use array indexing into
- * r rather than pointer-arithmetic to simplify verification
- */
- r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
- r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
- r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
- r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+ r[i * 5] = (uint8_t)(0xFF & ((t[0] >> 0) | (t[1] << 5)));
+ r[i * 5 + 1] = (uint8_t)(0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)));
+ r[i * 5 + 2] = (uint8_t)(0xFF & ((t[3] >> 1) | (t[4] << 4)));
+ r[i * 5 + 3] = (uint8_t)(0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)));
+ r[i * 5 + 4] = (uint8_t)(0xFF & ((t[6] >> 2) | (t[7] << 3)));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d5_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d5_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ mlk_poly_compress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d11_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -272,35 +361,53 @@ void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 11-bit in size.
*/
- r[11 * j + 0] = (t[0] >> 0) & 0xFF;
- r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
- r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
- r[11 * j + 3] = (t[2] >> 2) & 0xFF;
- r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
- r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
- r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
- r[11 * j + 7] = (t[5] >> 1) & 0xFF;
- r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
- r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
- r[11 * j + 10] = (t[7] >> 3);
+ r[11 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 3) & 0xFF));
+ r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] << 6) & 0xFF));
+ r[11 * j + 3] = (uint8_t)((t[2] >> 2) & 0xFF);
+ r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] << 1) & 0xFF));
+ r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] << 4) & 0xFF));
+ r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] << 7) & 0xFF));
+ r[11 * j + 7] = (uint8_t)((t[5] >> 1) & 0xFF);
+ r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] << 2) & 0xFF));
+ r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] << 5) & 0xFF));
+ r[11 * j + 10] = (uint8_t)(t[7] >> 3);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d11_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d11_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ mlk_poly_compress_d11_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d5(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d5_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 8; i++)
@@ -342,22 +449,40 @@ void mlk_poly_decompress_d5(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d5(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d5_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ int ret;
+ ret = mlk_poly_decompress_d5_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ mlk_poly_decompress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d11(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d11_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 8; j++)
@@ -390,26 +515,45 @@ void mlk_poly_decompress_d11(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d11(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d11_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ int ret;
+ ret = mlk_poly_decompress_d11_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+ mlk_poly_decompress_d11_c(r, a);
+}
+
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-#if !defined(MLK_USE_NATIVE_POLY_TOBYTES)
/* Reference: `poly_tobytes()` in the reference implementation @[REF].
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_tobytes_c(uint8_t r[MLKEM_POLYBYTES],
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYBYTES))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -417,8 +561,10 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
- const uint16_t t0 = a->coeffs[2 * i];
- const uint16_t t1 = a->coeffs[2 * i + 1];
+ /* The conversion to uint16_t is safe since we assume that
+ * the coefficients of `a` are non-negative. */
+ const uint16_t t0 = (uint16_t)a->coeffs[2 * i];
+ const uint16_t t1 = (uint16_t)a->coeffs[2 * i + 1];
/*
* t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
* significant data, so these can be packed into 24 bits or exactly
@@ -426,32 +572,48 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
*/
/* Least significant bits 0 - 7 of t0. */
- r[3 * i + 0] = t0 & 0xFF;
+ r[3 * i + 0] = (uint8_t)(t0 & 0xFF);
/*
* Most significant bits 8 - 11 of t0 become the least significant
* nibble of the second byte. The least significant 4 bits
* of t1 become the upper nibble of the second byte.
+ *
+ * The conversion to uint8_t does not alter the value.
*/
- r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+ r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 << 4) & 0xF0));
- /* Bits 4 - 11 of t1 become the third byte. */
- r[3 * i + 2] = t1 >> 4;
+ /* Bits 4 - 11 of t1 become the third byte. The conversion to uint8_t
+ * does not alter the value because t1 is 12-bit wide. */
+ r[3 * i + 2] = (uint8_t)(t1 >> 4);
}
}
-#else /* !MLK_USE_NATIVE_POLY_TOBYTES */
+
MLK_INTERNAL_API
void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
{
+#if defined(MLK_USE_NATIVE_POLY_TOBYTES)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_tobytes_native(r, a->coeffs);
-}
+ ret = mlk_poly_tobytes_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
-#if !defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ mlk_poly_tobytes_c(r, a);
+}
+
/* Reference: `poly_frombytes()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
+MLK_STATIC_TESTABLE void mlk_poly_frombytes_c(mlk_poly *r,
+ const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+ requires(memory_no_alias(a, MLKEM_POLYBYTES))
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -462,21 +624,29 @@ void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
const uint8_t t0 = a[3 * i + 0];
const uint8_t t1 = a[3 * i + 1];
const uint8_t t2 = a[3 * i + 2];
- r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
- r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+ r->coeffs[2 * i + 0] = (int16_t)(t0 | ((t1 << 8) & 0xFFF));
+ r->coeffs[2 * i + 1] = (int16_t)((t1 >> 4) | (t2 << 4));
}
/* Note that the coefficients are not canonical */
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
-#else /* !MLK_USE_NATIVE_POLY_FROMBYTES */
+
MLK_INTERNAL_API
void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
{
- mlk_poly_frombytes_native(r->coeffs, a);
-}
+#if defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ int ret;
+ ret = mlk_poly_frombytes_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
+ mlk_poly_frombytes_c(r, a);
+}
+
/* Reference: `poly_frommsg()` in the reference implementation @[REF].
* - We use a value barrier around the bit-selection mask to
* reduce the risk of compiler-introduced branches.
@@ -506,7 +676,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
* as per @[FIPS203, Eq (4.8)]. */
/* Prevent the compiler from recognizing this as a bit selection */
- uint8_t mask = mlk_value_barrier_u8(1u << j);
+ uint8_t mask = mlk_value_barrier_u8((uint8_t)(1u << j));
r->coeffs[8 * i + j] = mlk_ct_sel_int16(MLKEM_Q_HALF, 0, msg[i] & mask);
}
}
@@ -535,7 +705,7 @@ void mlk_poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const mlk_poly *a)
invariant(i <= MLKEM_N / 8 && j <= 8))
{
uint32_t t = mlk_scalar_compress_d1(a->coeffs[8 * i + j]);
- msg[i] |= t << j;
+ msg[i] |= (uint8_t)(t << j);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/compress.h
index f0789d42d6..b16b0889b5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/compress.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/compress.h
@@ -20,8 +20,7 @@
#ifndef MLK_COMPRESS_H
#define MLK_COMPRESS_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -50,9 +49,9 @@
#endif
/* Reference: Part of poly_tomsg() in the reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d1(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d1(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 2)
ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) )
{
@@ -65,7 +64,8 @@ __contract__(
*/
/* check-magic: 1290168 == 2*round(2^31 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290168;
- return (d0 + (1u << 30)) >> 31;
+ /* Unsigned shifting by 31 positions leaves only the top bit. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 30)) >> 31);
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -93,9 +93,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d4(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d4(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 16)
ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
{
@@ -108,7 +108,8 @@ __contract__(
*/
/* check-magic: 1290160 == 16 * round(2^28 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290160;
- return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */
+ /* The return value is < 16, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 27)) >> 28); /* round(d0/2^28) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -128,11 +129,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d4(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d4(uint8_t u)
__contract__(
requires(0 <= u && u < 16)
ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) >> 4; }
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 8) >> 4);
+}
/************************************************************
* Name: mlk_scalar_compress_d5
@@ -156,9 +162,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d5(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d5(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 32)
ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) )
{
@@ -171,7 +177,8 @@ __contract__(
*/
/* check-magic: 1290176 == 2^5 * round(2^27 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290176;
- return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */
+ /* The return value is < 32, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 26)) >> 27); /* round(d0/2^27) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -191,11 +198,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d5(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d5(uint8_t u)
__contract__(
requires(0 <= u && u < 32)
- ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) >> 5; }
+ ensures(0 <= return_value && return_value <= MLKEM_Q - 1)
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 16) >> 5);
+}
/************************************************************
* Name: mlk_scalar_compress_d10
@@ -219,9 +231,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d10(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d10(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 10))
ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
{
@@ -255,11 +267,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d10(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d10(uint16_t u)
__contract__(
requires(0 <= u && u < 1024)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) >> 10; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 512) >> 10);
+}
/************************************************************
* Name: mlk_scalar_compress_d11
@@ -283,9 +300,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d11(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d11(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 11))
ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
{
@@ -319,11 +336,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d11(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d11(uint16_t u)
__contract__(
requires(0 <= u && u < 2048)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) >> 11; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 1024) >> 11);
+}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
#define mlk_poly_compress_d4 MLK_NAMESPACE(poly_compress_d4)
@@ -575,7 +597,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
);
@@ -631,7 +653,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
__contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
);
@@ -660,7 +682,7 @@ __contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(msg))
+ assigns(memory_slice(msg, MLKEM_INDCPA_MSGBYTES))
);
#endif /* !MLK_COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/debug.h
index 01f7c88ccf..47c864bd36 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/debug.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/debug.h
@@ -7,7 +7,6 @@
#include "common.h"
#if defined(MLKEM_DEBUG)
-#include
/*************************************************
* Name: mlk_assert
@@ -89,14 +88,14 @@ void mlk_debug_check_bounds(const char *file, int line, const int16_t *ptr,
/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
* just use a single flattened array_bound(...) here. */
-#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
- cassert(forall(kN, 0, (M), \
- array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
+ cassert(forall(kN, 0, (M), \
+ array_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_lb), (value_ub))))
-#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
- cassert(forall(kN, 0, (M), \
- array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
+ cassert(forall(kN, 0, (M), \
+ array_abs_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_abs_bd))))
#else /* !MLKEM_DEBUG && CBMC */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/indcpa.c
index 85d4f595a9..e03b16c38b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/indcpa.c
@@ -17,15 +17,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
-#include
+#include "indcpa.h"
-#include "cbmc.h"
#include "debug.h"
-#include "indcpa.h"
-#include "poly.h"
-#include "poly_k.h"
#include "randombytes.h"
#include "sampling.h"
#include "symmetric.h"
@@ -41,6 +35,10 @@
#define mlk_pack_ciphertext MLK_ADD_PARAM_SET(mlk_pack_ciphertext)
#define mlk_unpack_ciphertext MLK_ADD_PARAM_SET(mlk_unpack_ciphertext)
#define mlk_matvec_mul MLK_ADD_PARAM_SET(mlk_matvec_mul)
+#define mlk_polyvec_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polyvec_permute_bitrev_to_custom)
+#define mlk_polymat_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polymat_permute_bitrev_to_custom)
/* End of parameter set namespacing */
/*************************************************
@@ -59,12 +57,13 @@
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L19]
*
**************************************************/
-static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
+static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const mlk_polyvec *pk,
const uint8_t seed[MLKEM_SYMBYTES])
{
- mlk_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(pk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, pk);
- memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
}
/*************************************************
@@ -83,11 +82,11 @@ static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L2-3]
*
**************************************************/
-static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
+static void mlk_unpack_pk(mlk_polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
{
mlk_polyvec_frombytes(pk, packedpk);
- memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
/* NOTE: If a modulus check was conducted on the PK, we know at this
* point that the coefficients of `pk` are unsigned canonical. The
@@ -108,9 +107,10 @@ static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L20]
*
**************************************************/
-static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
+static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES],
+ const mlk_polyvec *sk)
{
- mlk_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(sk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, sk);
}
@@ -128,7 +128,7 @@ static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L5]
*
**************************************************/
-static void mlk_unpack_sk(mlk_polyvec sk,
+static void mlk_unpack_sk(mlk_polyvec *sk,
const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
{
mlk_polyvec_frombytes(sk, packedsk);
@@ -149,8 +149,8 @@ static void mlk_unpack_sk(mlk_polyvec sk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22-23]
*
**************************************************/
-static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
- mlk_poly *v)
+static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES],
+ const mlk_polyvec *b, mlk_poly *v)
{
mlk_polyvec_compress_du(r, b);
mlk_poly_compress_dv(r + MLKEM_POLYVECCOMPRESSEDBYTES_DU, v);
@@ -170,28 +170,69 @@ static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L1-4]
*
**************************************************/
-static void mlk_unpack_ciphertext(mlk_polyvec b, mlk_poly *v,
+static void mlk_unpack_ciphertext(mlk_polyvec *b, mlk_poly *v,
const uint8_t c[MLKEM_INDCPA_BYTES])
{
mlk_polyvec_decompress_du(b, c);
mlk_poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
}
-#if !defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
-/* This namespacing is not done at the top to avoid a naming conflict
- * with native backends, which are currently not yet namespaced. */
-#define mlk_poly_permute_bitrev_to_custom \
- MLK_ADD_PARAM_SET(mlk_poly_permute_bitrev_to_custom)
-
-static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
+/* Helper function to ensure that the polynomial entries in the output
+ * of gen_matrix use the standard (bitreversed) ordering of coefficients.
+ * No-op unless a native backend with a custom ordering is used.
+ *
+ * We don't inline this into gen_matrix to avoid having to split the CBMC
+ * proof for gen_matrix based on MLK_USE_NATIVE_NTT_CUSTOM_ORDER. */
+static void mlk_polyvec_permute_bitrev_to_custom(mlk_polyvec *v)
__contract__(
/* We don't specify that this should be a permutation, but only
* that it does not change the bound established at the end of mlk_gen_matrix. */
- requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
- requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(memory_slice(data, sizeof(mlk_poly)))
- ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+ requires(memory_no_alias(v, sizeof(mlk_polyvec)))
+ requires(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(v, sizeof(mlk_polyvec)))
+ ensures(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+{
+#if defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(v, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ {
+ mlk_poly_permute_bitrev_to_custom(v->vec[i].coeffs);
+ }
+#else /* MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+ /* Nothing to do */
+ (void)v;
#endif /* !MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+static void mlk_polymat_permute_bitrev_to_custom(mlk_polymat *a)
+__contract__(
+ /* We don't specify that this should be a permutation, but only
+ * that it does not change the bound established at the end of mlk_gen_matrix. */
+ requires(memory_no_alias(a, sizeof(mlk_polymat)))
+ requires(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+{
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(a, sizeof(mlk_polymat)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+ {
+ mlk_polyvec_permute_bitrev_to_custom(&a->vec[i]);
+ }
+}
/* Reference: `gen_matrix()` in the reference implementation @[REF].
* - We use a special subroutine to generate 4 polynomials
@@ -201,32 +242,27 @@ __contract__(
*
* Not static for benchmarking */
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
{
unsigned i, j;
- /*
- * We generate four separate seed arrays rather than a single one to work
- * around limitations in CBMC function contracts dealing with disjoint slices
- * of the same parent object.
- */
-
MLK_ALIGN uint8_t seed_ext[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)];
for (j = 0; j < 4; j++)
{
- memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
}
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Sample 4 matrix entries a time. */
for (i = 0; i < (MLKEM_K * MLKEM_K / 4) * 4; i += 4)
{
- uint8_t x, y;
-
for (j = 0; j < 4; j++)
{
- x = (i + j) / MLKEM_K;
- y = (i + j) % MLKEM_K;
+ uint8_t x, y;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)((i + j) / MLKEM_K);
+ y = (uint8_t)((i + j) % MLKEM_K);
if (transposed)
{
seed_ext[j][MLKEM_SYMBYTES + 0] = x;
@@ -239,19 +275,26 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
}
}
- /*
- * This call writes across mlk_polyvec boundaries for K=2 and K=3.
- * This is intentional and safe.
- */
- mlk_poly_rej_uniform_x4(&a[i], seed_ext);
+ mlk_poly_rej_uniform_x4(&a->vec[i / MLKEM_K].vec[i % MLKEM_K],
+ &a->vec[(i + 1) / MLKEM_K].vec[(i + 1) % MLKEM_K],
+ &a->vec[(i + 2) / MLKEM_K].vec[(i + 2) % MLKEM_K],
+ &a->vec[(i + 3) / MLKEM_K].vec[(i + 3) % MLKEM_K],
+ seed_ext);
}
-
- /* For MLKEM_K == 3, sample the last entry individually. */
- if (i < MLKEM_K * MLKEM_K)
+#else /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
+ /* When using serial FIPS202, sample all entries individually. */
+ i = 0;
+#endif /* MLK_CONFIG_SERIAL_FIPS202_ONLY */
+
+ /* For MLKEM_K == 3, sample the last entry individually.
+ * When MLK_CONFIG_SERIAL_FIPS202_ONLY is set, sample all entries
+ * individually. */
+ for (; i < MLKEM_K * MLKEM_K; i++)
{
uint8_t x, y;
- x = i / MLKEM_K;
- y = i % MLKEM_K;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)(i / MLKEM_K);
+ y = (uint8_t)(i % MLKEM_K);
if (transposed)
{
@@ -264,8 +307,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
seed_ext[0][MLKEM_SYMBYTES + 1] = x;
}
- mlk_poly_rej_uniform(&a[i], seed_ext[0]);
- i++;
+ mlk_poly_rej_uniform(&a->vec[i / MLKEM_K].vec[i % MLKEM_K], seed_ext[0]);
}
mlk_assert(i == MLKEM_K * MLKEM_K);
@@ -274,10 +316,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* The public matrix is generated in NTT domain. If the native backend
* uses a custom order in NTT domain, permute A accordingly.
*/
- for (i = 0; i < MLKEM_K * MLKEM_K; i++)
- {
- mlk_poly_permute_bitrev_to_custom(a[i].coeffs);
- }
+ mlk_polymat_permute_bitrev_to_custom(a);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -301,24 +340,25 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* Specification: Implements @[FIPS203, Section 2.4.7, Eq (2.12), (2.13)]
*
**************************************************/
-static void mlk_matvec_mul(mlk_polyvec out, const mlk_polymat a,
- const mlk_polyvec v, const mlk_polyvec_mulcache vc)
+static void mlk_matvec_mul(mlk_polyvec *out, const mlk_polymat *a,
+ const mlk_polyvec *v, const mlk_polyvec_mulcache *vc)
__contract__(
requires(memory_no_alias(out, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(v, sizeof(mlk_polyvec)))
requires(memory_no_alias(vc, sizeof(mlk_polyvec_mulcache)))
- requires(forall(k0, 0, MLKEM_K * MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(out)))
+ requires(forall(k0, 0, MLKEM_K,
+ forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k0].vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))))
+ assigns(memory_slice(out, sizeof(mlk_polyvec))))
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
__loop__(
- assigns(i, object_whole(out))
+ assigns(i, memory_slice(out, sizeof(mlk_polyvec)))
invariant(i <= MLKEM_K))
{
- mlk_polyvec_basemul_acc_montgomery_cached(&out[i], &a[MLKEM_K * i], v, vc);
+ mlk_polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a->vec[i], v, vc);
}
}
@@ -331,20 +371,34 @@ __contract__(
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- const uint8_t *publicseed = buf;
- const uint8_t *noiseseed = buf + MLKEM_SYMBYTES;
- mlk_polymat a;
- mlk_polyvec e, pkpv, skpv;
- mlk_polyvec_mulcache skpv_cache;
-
- MLK_ALIGN uint8_t coins_with_domain_separator[MLKEM_SYMBYTES + 1];
+ int ret = 0;
+ const uint8_t *publicseed;
+ const uint8_t *noiseseed;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_ALLOC(a, mlk_polymat, 1, context);
+ MLK_ALLOC(e, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (buf == NULL || coins_with_domain_separator == NULL || a == NULL ||
+ e == NULL || pkpv == NULL || skpv == NULL || skpv_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ publicseed = buf;
+ noiseseed = buf + MLKEM_SYMBYTES;
+
/* Concatenate coins with MLKEM_K for domain separation of security levels */
- memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
mlk_hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
@@ -360,24 +414,24 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_gen_matrix(a, publicseed, 0 /* no transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &e[0], &e[1], noiseseed, 0, 1,
- 2, 3);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &e->vec[0],
+ &e->vec[1], noiseseed, 0, 1, 2, 3);
#elif MLKEM_K == 3
/*
* Only the first three output buffers are needed.
* The laster parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2],
- &pkpv[0] /* irrelevant */, noiseseed, 0, 1, 2,
- 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2], NULL,
+ noiseseed, 0, 1, 2, 0xFF /* irrelevant */);
/* Same here */
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &pkpv[0] /* irrelevant */,
- noiseseed, 3, 4, 5, 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], NULL, noiseseed,
+ 3, 4, 5, 0xFF /* irrelevant */);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2], &skpv[3], noiseseed,
- 0, 1, 2, 3);
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &e[3], noiseseed, 4, 5, 6, 7);
-#endif
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2],
+ &skpv->vec[3], noiseseed, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], &e->vec[3],
+ noiseseed, 4, 5, 6, 7);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(skpv);
mlk_polyvec_ntt(e);
@@ -393,14 +447,17 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_pack_sk(sk, skpv);
mlk_pack_pk(pk, pkpv, publicseed);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(coins_with_domain_separator, sizeof(coins_with_domain_separator));
- mlk_zeroize(a, sizeof(a));
- mlk_zeroize(&e, sizeof(e));
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&skpv_cache, sizeof(skpv_cache));
+ MLK_FREE(skpv_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(e, mlk_polyvec, 1, context);
+ MLK_FREE(a, mlk_polymat, 1, context);
+ MLK_FREE(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_enc()` in the reference implementation @[REF].
@@ -412,19 +469,33 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t seed[MLKEM_SYMBYTES];
- mlk_polymat at;
- mlk_polyvec sp, pkpv, ep, b;
- mlk_poly v, k, epp;
- mlk_polyvec_mulcache sp_cache;
+ int ret = 0;
+ MLK_ALLOC(seed, uint8_t, MLKEM_SYMBYTES, context);
+ MLK_ALLOC(at, mlk_polymat, 1, context);
+ MLK_ALLOC(sp, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(ep, mlk_polyvec, 1, context);
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(k, mlk_poly, 1, context);
+ MLK_ALLOC(epp, mlk_poly, 1, context);
+ MLK_ALLOC(sp_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (seed == NULL || at == NULL || sp == NULL || pkpv == NULL || ep == NULL ||
+ b == NULL || v == NULL || k == NULL || epp == NULL || sp_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_unpack_pk(pkpv, seed, pk);
- mlk_poly_frommsg(&k, m);
+ mlk_poly_frommsg(k, m);
/*
* Declassify the public seed.
@@ -437,87 +508,105 @@ void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
mlk_gen_matrix(at, seed, 1 /* transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1122_4x(&sp[0], &sp[1], &ep[0], &ep[1], coins, 0, 1, 2,
- 3);
- mlk_poly_getnoise_eta2(&epp, coins, 4);
+ mlk_poly_getnoise_eta1122_4x(&sp->vec[0], &sp->vec[1], &ep->vec[0],
+ &ep->vec[1], coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2(epp, coins, 4);
#elif MLKEM_K == 3
/*
* In this call, only the first three output buffers are needed.
* The last parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &b[0], coins, 0, 1, 2,
- 0xFF);
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], NULL, coins,
+ 0, 1, 2, 0xFF /* irrelevant */);
/* The fourth output buffer in this call _is_ used. */
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &epp, coins, 3, 4, 5, 6);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], epp, coins,
+ 3, 4, 5, 6);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &sp[3], coins, 0, 1, 2, 3);
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &ep[3], coins, 4, 5, 6, 7);
- mlk_poly_getnoise_eta2(&epp, coins, 8);
-#endif
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], &sp->vec[3],
+ coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], &ep->vec[3],
+ coins, 4, 5, 6, 7);
+ mlk_poly_getnoise_eta2(epp, coins, 8);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(sp);
mlk_polyvec_mulcache_compute(sp_cache, sp);
mlk_matvec_mul(b, at, sp, sp_cache);
- mlk_polyvec_basemul_acc_montgomery_cached(&v, pkpv, sp, sp_cache);
+ mlk_polyvec_basemul_acc_montgomery_cached(v, pkpv, sp, sp_cache);
mlk_polyvec_invntt_tomont(b);
- mlk_poly_invntt_tomont(&v);
+ mlk_poly_invntt_tomont(v);
mlk_polyvec_add(b, ep);
- mlk_poly_add(&v, &epp);
- mlk_poly_add(&v, &k);
+ mlk_poly_add(v, epp);
+ mlk_poly_add(v, k);
mlk_polyvec_reduce(b);
- mlk_poly_reduce(&v);
+ mlk_poly_reduce(v);
- mlk_pack_ciphertext(c, b, &v);
+ mlk_pack_ciphertext(c, b, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(seed, sizeof(seed));
- mlk_zeroize(&sp, sizeof(sp));
- mlk_zeroize(&sp_cache, sizeof(sp_cache));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(at, sizeof(at));
- mlk_zeroize(&k, sizeof(k));
- mlk_zeroize(&ep, sizeof(ep));
- mlk_zeroize(&epp, sizeof(epp));
+ MLK_FREE(sp_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(epp, mlk_poly, 1, context);
+ MLK_FREE(k, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ MLK_FREE(ep, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(sp, mlk_polyvec, 1, context);
+ MLK_FREE(at, mlk_polymat, 1, context);
+ MLK_FREE(seed, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_dec()` in the reference implementation @[REF].
* - We use a mulcache for the scalar product.
* - We include buffer zeroization. */
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_polyvec b, skpv;
- mlk_poly v, sb;
- mlk_polyvec_mulcache b_cache;
+ int ret = 0;
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(sb, mlk_poly, 1, context);
+ MLK_ALLOC(b_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (b == NULL || skpv == NULL || v == NULL || sb == NULL || b_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- mlk_unpack_ciphertext(b, &v, c);
+ mlk_unpack_ciphertext(b, v, c);
mlk_unpack_sk(skpv, sk);
mlk_polyvec_ntt(b);
mlk_polyvec_mulcache_compute(b_cache, b);
- mlk_polyvec_basemul_acc_montgomery_cached(&sb, skpv, b, b_cache);
- mlk_poly_invntt_tomont(&sb);
+ mlk_polyvec_basemul_acc_montgomery_cached(sb, skpv, b, b_cache);
+ mlk_poly_invntt_tomont(sb);
- mlk_poly_sub(&v, &sb);
- mlk_poly_reduce(&v);
+ mlk_poly_sub(v, sb);
+ mlk_poly_reduce(v);
- mlk_poly_tomsg(m, &v);
+ mlk_poly_tomsg(m, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&b_cache, sizeof(b_cache));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(&sb, sizeof(sb));
+ MLK_FREE(b_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(sb, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
@@ -529,4 +618,5 @@ void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
#undef mlk_pack_ciphertext
#undef mlk_unpack_ciphertext
#undef mlk_matvec_mul
-#undef mlk_poly_permute_bitrev_to_custom
+#undef mlk_polyvec_permute_bitrev_to_custom
+#undef mlk_polymat_permute_bitrev_to_custom
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/indcpa.h
index 4c44d0d411..b31756dcb6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/indcpa.h
@@ -15,7 +15,6 @@
#ifndef MLK_INDCPA_H
#define MLK_INDCPA_H
-#include
#include "cbmc.h"
#include "common.h"
#include "poly_k.h"
@@ -39,18 +38,19 @@
*
**************************************************/
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
requires(transposed == 0 || transposed == 1)
- assigns(object_whole(a))
- ensures(forall(x, 0, MLKEM_K * MLKEM_K,
- array_bound(a[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
);
-#define mlk_indcpa_keypair_derand MLK_NAMESPACE_K(indcpa_keypair_derand)
+#define mlk_indcpa_keypair_derand \
+ MLK_NAMESPACE_K(indcpa_keypair_derand) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_keypair_derand
*
@@ -68,18 +68,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
-#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc)
+#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc) MLK_CONTEXT_PARAMETERS_4
/*************************************************
* Name: mlk_indcpa_enc
*
@@ -100,19 +105,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(c))
+ assigns(memory_slice(c, MLKEM_INDCPA_BYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
-#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec)
+#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_dec
*
@@ -130,14 +139,18 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
- assigns(object_whole(m))
+ assigns(memory_slice(m, MLKEM_INDCPA_MSGBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_INDCPA_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/kem.c
index d6f4e83628..3c82d6df70 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/kem.c
@@ -8,7 +8,8 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*
* - [FIPS203]
@@ -22,12 +23,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
-#include
+#include "kem.h"
#include "indcpa.h"
-#include "kem.h"
#include "randombytes.h"
#include "symmetric.h"
#include "verify.h"
@@ -36,44 +34,24 @@
* This is to facilitate building multiple instances
* of mlkem-native (e.g. with varying security levels)
* within a single compilation unit. */
-#define mlk_check_pk MLK_ADD_PARAM_SET(mlk_check_pk)
-#define mlk_check_sk MLK_ADD_PARAM_SET(mlk_check_sk)
-#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct)
+#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct) MLK_CONTEXT_PARAMETERS_2
/* End of parameter set namespacing */
-#if defined(CBMC)
-/* Redeclaration with contract needed for CBMC only */
-int memcmp(const void *str1, const void *str2, size_t n)
-__contract__(
- requires(memory_no_alias(str1, n))
- requires(memory_no_alias(str2, n))
-);
-#endif /* CBMC */
-
-/*************************************************
- * Name: mlk_check_pk
- *
- * Description: Implements modulus check mandated by FIPS 203,
- * i.e., ensures that coefficients are in [0,q-1].
- *
- * Arguments: - const uint8_t *pk: pointer to input public key
- * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
- *
- **************************************************/
-
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- mlk_polyvec p;
- uint8_t p_reencoded[MLKEM_POLYVECBYTES];
+ int ret = 0;
+ MLK_ALLOC(p, mlk_polyvec, 1, context);
+ MLK_ALLOC(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+
+ if (p == NULL || p_reencoded == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_polyvec_frombytes(p, pk);
mlk_polyvec_reduce(p);
@@ -81,39 +59,32 @@ static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
/* We use a constant-time memcmp here to avoid having to
* declassify the PK before the PCT has succeeded. */
- res = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? -1 : 0;
+ ret = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? MLK_ERR_FAIL : 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(p_reencoded, sizeof(p_reencoded));
- mlk_zeroize(&p, sizeof(p));
- return res;
+ MLK_FREE(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+ MLK_FREE(p, mlk_polyvec, 1, context);
+ return ret;
}
-/*************************************************
- * Name: mlk_check_sk
- *
- * Description: Implements public key hash check mandated by FIPS 203,
- * i.e., ensures that
- * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
- *
- * Arguments: - const uint8_t *sk: pointer to input private key
- * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
- *
- **************************************************/
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t test[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(test, uint8_t, MLKEM_SYMBYTES, context);
+
+ if (test == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
/*
* The parts of `sk` being hashed and compared here are public, so
* no public information is leaked through the runtime or the return value
@@ -128,23 +99,32 @@ static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
mlk_hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES,
MLKEM_INDCCA_PUBLICKEYBYTES);
- res = memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, test,
- MLKEM_SYMBYTES)
- ? -1
+ /* This doesn't have to be a constant-time memcmp, but it's the only place
+ * in the library where a normal memcmp would be used otherwise, so for sake
+ * of minimizing stdlib dependency, we use our constant-time one anyway. */
+ ret = mlk_ct_memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ test, MLKEM_SYMBYTES)
+ ? MLK_ERR_FAIL
: 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(test, sizeof(test));
- return res;
+ MLK_FREE(test, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES)));
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
+);
#if defined(MLK_CONFIG_KEYGEN_PCT)
/* Specification:
@@ -152,21 +132,30 @@ __contract__(
* @[FIPS203, Section 7.1, Pairwise Consistency]. */
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES];
- uint8_t ss_enc[MLKEM_SSBYTES], ss_dec[MLKEM_SSBYTES];
+ int ret = 0;
+ MLK_ALLOC(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ MLK_ALLOC(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_ALLOC(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+
+ if (ct == NULL || ss_enc == NULL || ss_dec == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- res = crypto_kem_enc(ct, ss_enc, pk);
- if (res != 0)
+ ret = mlk_kem_enc(ct, ss_enc, pk, context);
+ if (ret != 0)
{
goto cleanup;
}
- res = crypto_kem_dec(ss_dec, ct, sk);
- if (res != 0)
+ ret = mlk_kem_dec(ss_dec, ct, sk, context);
+ if (ret != 0)
{
goto cleanup;
}
@@ -179,26 +168,36 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
}
#endif /* MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST */
- res = mlk_ct_memcmp(ss_enc, ss_dec, sizeof(ss_dec));
+ ret = mlk_ct_memcmp(ss_enc, ss_dec, MLKEM_SSBYTES);
+ /* The result of the PCT is public. */
+ MLK_CT_TESTING_DECLASSIFY(&ret, sizeof(ret));
+
+ if (ret != 0)
+ {
+ ret = MLK_ERR_FAIL;
+ }
cleanup:
- /* The result of the PCT is public. */
- MLK_CT_TESTING_DECLASSIFY(&res, sizeof(res));
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(ct, sizeof(ct));
- mlk_zeroize(ss_enc, sizeof(ss_enc));
- mlk_zeroize(ss_dec, sizeof(ss_dec));
- return res;
+ MLK_FREE(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ return ret;
}
-#else /* MLK_CONFIG_KEYGEN_PCT */
+#else /* MLK_CONFIG_KEYGEN_PCT */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
/* Skip PCT */
((void)pk);
((void)sk);
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER)
+ ((void)context);
+#endif
return 0;
}
#endif /* !MLK_CONFIG_KEYGEN_PCT */
@@ -208,164 +207,240 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
* - We optionally include PCT which is not present in
* the reference code. */
MLK_EXTERNAL_API
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_indcpa_keypair_derand(pk, sk, coins);
- memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ int ret;
+
+ ret = mlk_indcpa_keypair_derand(pk, sk, coins, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
+ mlk_memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_h(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, pk,
MLKEM_INDCCA_PUBLICKEYBYTES);
/* Value z for pseudo-random output on reject */
- memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
/* Declassify public key */
MLK_CT_TESTING_DECLASSIFY(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
/* Pairwise Consistency Test (PCT) @[FIPS140_3_IG, p.87] */
- if (mlk_check_pct(pk, sk))
+ ret = mlk_check_pct(pk, sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- return 0;
+cleanup:
+ if (ret != 0)
+ {
+ mlk_zeroize(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ mlk_zeroize(sk, MLKEM_INDCCA_SECRETKEYBYTES);
+ }
+
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_keypair()` in the reference implementation @[REF]
* - We zeroize the stack buffer */
MLK_EXTERNAL_API
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Acquire necessary randomness, and mark it as secret. */
- mlk_randombytes(coins, 2 * MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (mlk_randombytes(coins, 2 * MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, 2 * MLKEM_SYMBYTES);
- res = crypto_kem_keypair_derand(pk, sk, coins);
+ ret = mlk_kem_keypair_derand(pk, sk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_enc_derand()` in the reference implementation @[REF]
* - We include public key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (buf == NULL || kr == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.2, Modulus check] */
- if (mlk_check_pk(pk))
+ ret = mlk_kem_check_pk(pk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- memcpy(buf, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(buf, coins, MLKEM_SYMBYTES);
/* Multitarget countermeasure for coins + contributory KEM */
mlk_hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
- memcpy(ss, kr, MLKEM_SYMBYTES);
+ mlk_memcpy(ss, kr, MLKEM_SYMBYTES);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
-
- return 0;
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_enc()` in the reference implementation @[REF]
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, MLKEM_SYMBYTES, context);
- mlk_randombytes(coins, MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ if (mlk_randombytes(coins, MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, MLKEM_SYMBYTES);
- res = crypto_kem_enc_derand(ct, ss, pk, coins);
+ ret = mlk_kem_enc_derand(ct, ss, pk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_dec()` in the reference implementation @[REF]
* - We include secret key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
+ int ret = 0;
uint8_t fail;
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
- MLK_ALIGN uint8_t tmp[MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES];
-
const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+
+ if (buf == NULL || kr == NULL || tmp == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.3, Hash check] */
- if (mlk_check_sk(sk))
+ ret = mlk_kem_check_sk(sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- mlk_indcpa_dec(buf, ct, sk);
+ ret = mlk_indcpa_dec(buf, ct, sk, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
/* Multitarget countermeasure for coins + contributory KEM */
- memcpy(buf + MLKEM_SYMBYTES,
- sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(buf + MLKEM_SYMBYTES,
+ sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* Recompute and compare ciphertext */
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
fail = mlk_ct_memcmp(ct, tmp, MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Compute rejection key */
- memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- MLKEM_SYMBYTES);
- memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
- mlk_hash_j(ss, tmp, sizeof(tmp));
+ mlk_memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
+ mlk_memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
+ mlk_hash_j(ss, tmp, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Copy true key to return buffer if fail is 0 */
mlk_ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
- mlk_zeroize(tmp, sizeof(tmp));
+ MLK_FREE(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
- return 0;
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef mlk_check_pk
-#undef mlk_check_sk
#undef mlk_check_pct
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/kem.h
index d3e5f50ce6..0502715c39 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/kem.h
@@ -10,12 +10,16 @@
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ * CRYSTALS-Kyber C reference implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/ref
*/
#ifndef MLK_KEM_H
#define MLK_KEM_H
-#include
#include "cbmc.h"
#include "common.h"
#include "sys.h"
@@ -23,9 +27,7 @@
#if defined(MLK_CHECK_APIS)
/* Include to ensure consistency between internal kem.h
* and external mlkem_native.h. */
-#define MLK_CONFIG_API_NO_SUPERCOP
#include "mlkem_native.h"
-#undef MLK_CONFIG_API_NO_SUPERCOP
#if MLKEM_INDCCA_SECRETKEYBYTES != \
MLKEM_SECRETKEYBYTES(MLK_CONFIG_PARAMETER_SET)
@@ -44,14 +46,79 @@
#endif /* MLK_CHECK_APIS */
-#define crypto_kem_keypair_derand MLK_NAMESPACE_K(keypair_derand)
-#define crypto_kem_keypair MLK_NAMESPACE_K(keypair)
-#define crypto_kem_enc_derand MLK_NAMESPACE_K(enc_derand)
-#define crypto_kem_enc MLK_NAMESPACE_K(enc)
-#define crypto_kem_dec MLK_NAMESPACE_K(dec)
+#define mlk_kem_keypair_derand \
+ MLK_NAMESPACE_K(keypair_derand) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_keypair MLK_NAMESPACE_K(keypair) MLK_CONTEXT_PARAMETERS_2
+#define mlk_kem_enc_derand MLK_NAMESPACE_K(enc_derand) MLK_CONTEXT_PARAMETERS_4
+#define mlk_kem_enc MLK_NAMESPACE_K(enc) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_dec MLK_NAMESPACE_K(dec) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_check_pk MLK_NAMESPACE_K(check_pk) MLK_CONTEXT_PARAMETERS_1
+#define mlk_kem_check_sk MLK_NAMESPACE_K(check_sk) MLK_CONTEXT_PARAMETERS_1
+
+/*************************************************
+ * Name: mlk_kem_check_pk
+ *
+ * Description: Implements modulus check mandated by FIPS 203,
+ * i.e., ensures that coefficients are in [0,q-1].
+ *
+ * Arguments: - const uint8_t *pk: pointer to input public key
+ * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the modulus check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+
+/*************************************************
+ * Name: mlk_kem_check_sk
+ *
+ * Description: Implements public key hash check mandated by FIPS 203,
+ * i.e., ensures that
+ * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
+ *
+ * Arguments: - const uint8_t *sk: pointer to input private key
+ * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the public key hash check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
/*************************************************
- * Name: crypto_kem_keypair_derand
+ * Name: mlk_kem_keypair_derand
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -67,26 +134,33 @@
* random bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 16, ML-KEM.KeyGen_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_keypair
+ * Name: mlk_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -99,24 +173,32 @@ __contract__(
* bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
*
* Specification: Implements @[FIPS203, Algorithm 19, ML-KEM.KeyGen]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_enc_derand
+ * Name: mlk_kem_enc_derand
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -134,29 +216,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 17, ML-KEM.Encaps_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
/*************************************************
- * Name: crypto_kem_enc
+ * Name: mlk_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -171,27 +258,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
*
* Specification: Implements @[FIPS203, Algorithm 20, ML-KEM.Encaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_dec
+ * Name: mlk_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
@@ -206,22 +300,27 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'hash check' @[FIPS203, Section 7.3]
- * for the secret key fails.
+ * - MLK_ERR_FAIL: If the 'hash check' @[FIPS203, Section 7.3]
+ * for the secret key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 21, ML-KEM.Decaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(ss))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_KEM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/meta.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/meta.h
index edcc8b02a1..e487e68b83 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/meta.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/meta.h
@@ -22,77 +22,98 @@
#if !defined(__ASSEMBLER__)
+#include "../api.h"
#include "src/arith_native_aarch64.h"
-static MLK_INLINE void mlk_ntt_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N])
{
mlk_ntt_asm(data, mlk_aarch64_ntt_zetas_layer12345,
mlk_aarch64_ntt_zetas_layer67);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_intt_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N])
{
mlk_intt_asm(data, mlk_aarch64_invntt_zetas_layer12345,
mlk_aarch64_invntt_zetas_layer67);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_reduce_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N])
{
mlk_poly_reduce_asm(data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_tomont_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N])
{
mlk_poly_tomont_asm(data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_mulcache_compute_native(
- int16_t x[MLKEM_N / 2], const int16_t y[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+ const int16_t y[MLKEM_N])
{
mlk_poly_mulcache_compute_asm(x, y, mlk_aarch64_zetas_mulcache_native,
mlk_aarch64_zetas_mulcache_twisted_native);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+ const int16_t a[MLKEM_N])
{
mlk_poly_tobytes_asm(r, a);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
+MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
const uint8_t *buf,
unsigned buflen)
{
- if (len != MLKEM_N || buflen % 24 != 0)
+ if (len != MLKEM_N ||
+ buflen % 24 != 0) /* NEON support is mandatory for AArch64 */
{
- return -1;
+ return MLK_NATIVE_FUNC_FALLBACK;
}
return (int)mlk_rej_uniform_asm(r, buf, buflen, mlk_rej_uniform_table);
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/aarch64_zetas.c
index 487f697481..4b3f0d86c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/aarch64_zetas.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/aarch64_zetas.c
@@ -5,6 +5,7 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
@@ -13,7 +14,6 @@
#if defined(MLK_ARITH_BACKEND_AARCH64) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
#include "arith_native_aarch64.h"
/*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/arith_native_aarch64.h
index 939fed7109..2941ecbd4b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/arith_native_aarch64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/arith_native_aarch64.h
@@ -5,7 +5,6 @@
#ifndef MLK_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H
#define MLK_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H
-#include
#include "../../../cbmc.h"
#include "../../../common.h"
@@ -31,10 +30,10 @@ extern const int16_t mlk_aarch64_zetas_mulcache_twisted_native[];
extern const uint8_t mlk_rej_uniform_table[];
#define mlk_ntt_asm MLK_NAMESPACE(ntt_asm)
-void mlk_ntt_asm(int16_t *p, const int16_t *twiddles12345,
- const int16_t *twiddles56)
+void mlk_ntt_asm(int16_t p[256], const int16_t twiddles12345[80],
+ const int16_t twiddles56[384])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_ntt.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_ntt.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(p, 0, MLKEM_N, 8192))
@@ -47,10 +46,10 @@ __contract__(
);
#define mlk_intt_asm MLK_NAMESPACE(intt_asm)
-void mlk_intt_asm(int16_t *p, const int16_t *twiddles12345,
- const int16_t *twiddles56)
+void mlk_intt_asm(int16_t p[256], const int16_t twiddles12345[80],
+ const int16_t twiddles56[384])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_intt.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_intt.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(twiddles12345 == mlk_aarch64_invntt_zetas_layer12345)
@@ -62,9 +61,9 @@ __contract__(
);
#define mlk_poly_reduce_asm MLK_NAMESPACE(poly_reduce_asm)
-void mlk_poly_reduce_asm(int16_t *p)
+void mlk_poly_reduce_asm(int16_t p[256])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_poly_reduce.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_poly_reduce.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
@@ -72,9 +71,9 @@ __contract__(
);
#define mlk_poly_tomont_asm MLK_NAMESPACE(poly_tomont_asm)
-void mlk_poly_tomont_asm(int16_t *p)
+void mlk_poly_tomont_asm(int16_t p[256])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_poly_tomont.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_poly_tomont.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
@@ -82,39 +81,39 @@ __contract__(
);
#define mlk_poly_mulcache_compute_asm MLK_NAMESPACE(poly_mulcache_compute_asm)
-void mlk_poly_mulcache_compute_asm(int16_t *cache, const int16_t *mlk_poly,
- const int16_t *zetas,
- const int16_t *zetas_twisted)
+void mlk_poly_mulcache_compute_asm(int16_t cache[128],
+ const int16_t mlk_poly[256],
+ const int16_t zetas[128],
+ const int16_t zetas_twisted[128])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_poly_mulcache_compute.ml */
__contract__(
requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
requires(zetas == mlk_aarch64_zetas_mulcache_native)
requires(zetas_twisted == mlk_aarch64_zetas_mulcache_twisted_native)
- assigns(object_whole(cache))
+ assigns(memory_slice(cache, sizeof(int16_t) * (MLKEM_N / 2)))
ensures(array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
);
#define mlk_poly_tobytes_asm MLK_NAMESPACE(poly_tobytes_asm)
-void mlk_poly_tobytes_asm(uint8_t *r, const int16_t *a)
+void mlk_poly_tobytes_asm(uint8_t r[384], const int16_t a[256])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_poly_tobytes.ml */
+ * in proofs/hol_light/aarch64/proofs/mlkem_poly_tobytes.ml */
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
);
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k2 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
-void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(int16_t *r,
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(
+ int16_t r[256], const int16_t a[512], const int16_t b[512],
+ const int16_t b_cache[256])
/* This must be kept in sync with the HOL-Light specification in
- * proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml.
+ * proofs/hol_light/aarch64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml.
*/
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
@@ -127,12 +126,11 @@ __contract__(
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k3 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
-void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(int16_t *r,
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(
+ int16_t r[256], const int16_t a[768], const int16_t b[768],
+ const int16_t b_cache[384])
/* This must be kept in sync with the HOL-Light specification in
- * proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml.
+ * proofs/hol_light/aarch64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml.
*/
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
@@ -145,12 +143,11 @@ __contract__(
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k4 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
-void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(int16_t *r,
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(
+ int16_t r[256], const int16_t a[1024], const int16_t b[1024],
+ const int16_t b_cache[512])
/* This must be kept in sync with the HOL-Light specification in
- * proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml.
+ * proofs/hol_light/aarch64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml.
*/
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
@@ -162,10 +159,11 @@ __contract__(
);
#define mlk_rej_uniform_asm MLK_NAMESPACE(rej_uniform_asm)
-uint64_t mlk_rej_uniform_asm(int16_t *r, const uint8_t *buf, unsigned buflen,
- const uint8_t *table)
+MLK_MUST_CHECK_RETURN_VALUE
+uint64_t mlk_rej_uniform_asm(int16_t r[256], const uint8_t *buf,
+ unsigned buflen, const uint8_t table[2048])
/* This must be kept in sync with the HOL-Light specification
- * in proofs/hol_light/arm/proofs/mlkem_rej_uniform.ml. */
+ * in proofs/hol_light/aarch64/proofs/mlkem_rej_uniform.ml. */
__contract__(
requires(buflen % 24 == 0)
requires(memory_no_alias(buf, buflen))
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/consts.h
deleted file mode 100644
index e9f877e831..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/consts.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-#ifndef MLK_NATIVE_AARCH64_SRC_CONSTS_H
-#define MLK_NATIVE_AARCH64_SRC_CONSTS_H
-
-#include
-#include "../../../common.h"
-
-#define mlk_zetas_mulcache_native MLK_NAMESPACE(zetas_mulcache_native)
-extern const int16_t mlk_zetas_mulcache_native[256];
-
-#define mlk_zetas_mulcache_twisted_native \
- MLK_NAMESPACE(zetas_mulcache_twisted_native)
-extern const int16_t mlk_zetas_mulcache_twisted_native[256];
-
-#endif /* !MLK_NATIVE_AARCH64_SRC_CONSTS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/intt.S
index fe5f1e2d14..8410ac9b30 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/intt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/intt.S
@@ -19,7 +19,33 @@
* https://eprint.iacr.org/2022/1303
*/
-/* AArch64 ML-KEM inverse NTT following @[NeonNTT] and @[SLOTHY_Paper]. */
+/*yaml
+ Name: intt_asm
+ Description: AArch64 ML-KEM inverse NTT following @[NeonNTT] and @[SLOTHY_Paper]
+ Signature: void mlk_intt_asm(int16_t p[256], const int16_t twiddles12345[80], const int16_t twiddles56[384])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: read/write
+ c_parameter: int16_t p[256]
+ description: Input/output polynomial
+ x1:
+ type: buffer
+ size_bytes: 160
+ permissions: read-only
+ c_parameter: const int16_t twiddles12345[80]
+ description: Twiddle factors for layers 1-5
+ x2:
+ type: buffer
+ size_bytes: 768
+ permissions: read-only
+ c_parameter: const int16_t twiddles56[384]
+ description: Twiddle factors for layers 6-7
+ Stack:
+ bytes: 64
+ description: saving callee-saved Neon registers
+*/
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
@@ -29,539 +55,574 @@
* dev/aarch64_opt/src/intt.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(intt_asm)
MLK_ASM_FN_SYMBOL(intt_asm)
- sub sp, sp, #0x40
- stp d8, d9, [sp]
- stp d10, d11, [sp, #0x10]
- stp d12, d13, [sp, #0x20]
- stp d14, d15, [sp, #0x30]
- mov w5, #0xd01 // =3329
- mov v7.h[0], w5
- mov w5, #0x4ebf // =20159
- mov v7.h[1], w5
- mov w5, #0x200 // =512
- dup v29.8h, w5
- mov w5, #0x13b0 // =5040
- dup v30.8h, w5
- mov x3, x0
- mov x4, #0x8 // =8
+ .cfi_startproc
+ sub sp, sp, #0x40
+ .cfi_adjust_cfa_offset 0x40
+ stp d8, d9, [sp]
+ .cfi_rel_offset d8, 0x0
+ .cfi_rel_offset d9, 0x8
+ stp d10, d11, [sp, #0x10]
+ .cfi_rel_offset d10, 0x10
+ .cfi_rel_offset d11, 0x18
+ stp d12, d13, [sp, #0x20]
+ .cfi_rel_offset d12, 0x20
+ .cfi_rel_offset d13, 0x28
+ stp d14, d15, [sp, #0x30]
+ .cfi_rel_offset d14, 0x30
+ .cfi_rel_offset d15, 0x38
+ mov w5, #0xd01 // =3329
+ mov v7.h[0], w5
+ mov w5, #0x4ebf // =20159
+ mov v7.h[1], w5
+ mov w5, #0x200 // =512
+ dup v29.8h, w5
+ mov w5, #0x13b0 // =5040
+ dup v30.8h, w5
+ mov x3, x0
+ mov x4, #0x8 // =8
+ ldr q13, [x3, #0x20]
+ ldr q8, [x3, #0x30]
+ ldr q6, [x3]
+ ldr q16, [x3, #0x10]
+ ldr q4, [x3, #0x50]
+ ldr q11, [x3, #0x40]
+ ldr q3, [x3, #0x70]
+ trn1 v23.4s, v13.4s, v8.4s
+ ldr q0, [x3, #0x60]
+ trn2 v19.4s, v6.4s, v16.4s
+ trn2 v21.4s, v13.4s, v8.4s
+ trn1 v6.4s, v6.4s, v16.4s
+ ldr q24, [x2, #0x20]
+ trn1 v10.2d, v19.2d, v21.2d
+ ldr q16, [x2], #0x60
+ trn1 v5.2d, v6.2d, v23.2d
+ trn1 v28.4s, v0.4s, v3.4s
+ trn2 v18.2d, v6.2d, v23.2d
+ mul v31.8h, v10.8h, v29.8h
+ trn2 v13.4s, v0.4s, v3.4s
+ ldur q14, [x2, #-0x50]
+ sqrdmulh v26.8h, v18.8h, v30.8h
+ ldur q20, [x2, #-0x20]
+ mul v17.8h, v18.8h, v29.8h
+ trn2 v18.2d, v19.2d, v21.2d
+ mul v9.8h, v18.8h, v29.8h
+ trn1 v12.4s, v11.4s, v4.4s
+ sqrdmulh v22.8h, v18.8h, v30.8h
+ sqrdmulh v3.8h, v10.8h, v30.8h
+ sqrdmulh v25.8h, v5.8h, v30.8h
+ mls v9.8h, v22.8h, v7.h[0]
+ mls v17.8h, v26.8h, v7.h[0]
+ trn2 v26.4s, v11.4s, v4.4s
+ mul v8.8h, v5.8h, v29.8h
+ trn1 v10.2d, v26.2d, v13.2d
+ ldur q11, [x2, #-0x10]
+ mls v31.8h, v3.8h, v7.h[0]
+ trn1 v6.2d, v12.2d, v28.2d
+ trn2 v3.2d, v26.2d, v13.2d
+ ldur q4, [x2, #-0x30]
+ mls v8.8h, v25.8h, v7.h[0]
+ sub v19.8h, v17.8h, v9.8h
+ trn2 v13.2d, v12.2d, v28.2d
+ sqrdmulh v1.8h, v3.8h, v30.8h
+ add v9.8h, v17.8h, v9.8h
+ mul v18.8h, v19.8h, v20.8h
+ add v28.8h, v8.8h, v31.8h
+ sqrdmulh v20.8h, v19.8h, v11.8h
+ sub v12.8h, v28.8h, v9.8h
+ sub v23.8h, v8.8h, v31.8h
+ sqrdmulh v11.8h, v13.8h, v30.8h
+ sqrdmulh v5.8h, v23.8h, v4.8h
+ mul v0.8h, v23.8h, v24.8h
+ mul v2.8h, v13.8h, v29.8h
+ mls v0.8h, v5.8h, v7.h[0]
+ add v24.8h, v28.8h, v9.8h
+ mls v18.8h, v20.8h, v7.h[0]
+ sqrdmulh v15.8h, v6.8h, v30.8h
+ sqrdmulh v25.8h, v12.8h, v14.8h
+ mul v21.8h, v12.8h, v16.8h
+ sub v23.8h, v0.8h, v18.8h
+ sqrdmulh v8.8h, v23.8h, v14.8h
+ mul v23.8h, v23.8h, v16.8h
+ mls v21.8h, v25.8h, v7.h[0]
+ mls v23.8h, v8.8h, v7.h[0]
+ mul v14.8h, v3.8h, v29.8h
+ add v3.8h, v0.8h, v18.8h
+ trn2 v4.4s, v24.4s, v3.4s
+ mls v14.8h, v1.8h, v7.h[0]
+ trn1 v9.4s, v24.4s, v3.4s
+ trn2 v12.4s, v21.4s, v23.4s
+ mls v2.8h, v11.8h, v7.h[0]
+ trn1 v28.4s, v21.4s, v23.4s
+ ldr q11, [x1], #0x10
+ mul v31.8h, v10.8h, v29.8h
+ trn1 v25.2d, v4.2d, v12.2d
+ trn1 v20.2d, v9.2d, v28.2d
+ ldr q23, [x2, #0x50]
+ trn2 v13.2d, v4.2d, v12.2d
+ sqrdmulh v21.8h, v10.8h, v30.8h
+ trn2 v4.2d, v9.2d, v28.2d
+ ldr q9, [x2, #0x40]
+ mul v27.8h, v6.8h, v29.8h
+ add v26.8h, v20.8h, v25.8h
+ sub v3.8h, v2.8h, v14.8h
+ sqdmulh v12.8h, v26.8h, v7.h[1]
+ add v5.8h, v4.8h, v13.8h
+ sub v8.8h, v4.8h, v13.8h
+ add v10.8h, v2.8h, v14.8h
+ sqdmulh v6.8h, v5.8h, v7.h[1]
+ ldr q2, [x2, #0x10]
+ mls v27.8h, v15.8h, v7.h[0]
+ ldr q15, [x2, #0x20]
+ srshr v12.8h, v12.8h, #0xb
+ mls v31.8h, v21.8h, v7.h[0]
+ srshr v6.8h, v6.8h, #0xb
+ sqrdmulh v23.8h, v3.8h, v23.8h
+ mls v26.8h, v12.8h, v7.h[0]
+ add v21.8h, v27.8h, v31.8h
+ mls v5.8h, v6.8h, v7.h[0]
+ sub v6.8h, v27.8h, v31.8h
+ sub v14.8h, v21.8h, v10.8h
+ ldr q27, [x2], #0x60
+ mul v3.8h, v3.8h, v9.8h
+ mls v3.8h, v23.8h, v7.h[0]
+ ldur q13, [x2, #-0x30]
+ sub v12.8h, v26.8h, v5.8h
+ add v5.8h, v26.8h, v5.8h
+ sqrdmulh v31.8h, v8.8h, v11.h[5]
+ sqrdmulh v19.8h, v12.8h, v11.h[1]
+ mul v24.8h, v12.8h, v11.h[0]
+ sqrdmulh v13.8h, v6.8h, v13.8h
+ mls v24.8h, v19.8h, v7.h[0]
+ sub x4, x4, #0x2
-intt_scale_start:
- ldr q8, [x3]
- ldr q9, [x3, #0x10]
- ldr q10, [x3, #0x20]
- ldr q11, [x3, #0x30]
- sqrdmulh v27.8h, v8.8h, v30.8h
- mul v8.8h, v8.8h, v29.8h
- mls v8.8h, v27.8h, v7.h[0]
- sqrdmulh v27.8h, v9.8h, v30.8h
- mul v9.8h, v9.8h, v29.8h
- mls v9.8h, v27.8h, v7.h[0]
- sqrdmulh v27.8h, v10.8h, v30.8h
- mul v10.8h, v10.8h, v29.8h
- mls v10.8h, v27.8h, v7.h[0]
- sqrdmulh v27.8h, v11.8h, v30.8h
- mul v11.8h, v11.8h, v29.8h
- mls v11.8h, v27.8h, v7.h[0]
- str q8, [x3], #0x40
- stur q9, [x3, #-0x30]
- stur q10, [x3, #-0x20]
- stur q11, [x3, #-0x10]
- subs x4, x4, #0x1
- cbnz x4, intt_scale_start
- mov x3, x0
- mov x4, #0x8 // =8
- ldr q3, [x3, #0x10]
- ldr q20, [x3]
- ldr q25, [x3, #0x20]
- ldr q24, [x3, #0x30]
- ldr q21, [x2, #0x50]
- trn1 v18.4s, v25.4s, v24.4s
- trn1 v6.4s, v20.4s, v3.4s
- trn2 v12.4s, v25.4s, v24.4s
- trn2 v31.4s, v20.4s, v3.4s
- trn2 v28.2d, v6.2d, v18.2d
- trn1 v25.2d, v6.2d, v18.2d
- trn2 v15.2d, v31.2d, v12.2d
- trn1 v20.2d, v31.2d, v12.2d
- add v4.8h, v28.8h, v15.8h
- add v1.8h, v25.8h, v20.8h
- sub v30.8h, v28.8h, v15.8h
- sub v3.8h, v25.8h, v20.8h
- add v6.8h, v1.8h, v4.8h
- sqrdmulh v9.8h, v30.8h, v21.8h
- ldr q21, [x2, #0x40]
- ldr q25, [x2, #0x30]
- mul v21.8h, v30.8h, v21.8h
- ldr q30, [x2, #0x20]
- sub v28.8h, v1.8h, v4.8h
- ldr q1, [x2, #0x10]
- mls v21.8h, v9.8h, v7.h[0]
- sqrdmulh v9.8h, v3.8h, v25.8h
- mul v20.8h, v3.8h, v30.8h
- ldr q29, [x2], #0x60
- ldr q17, [x3, #0x60]
- mls v20.8h, v9.8h, v7.h[0]
- ldr q3, [x3, #0x70]
- mul v4.8h, v28.8h, v29.8h
- sub v25.8h, v20.8h, v21.8h
- trn1 v15.4s, v17.4s, v3.4s
- sqrdmulh v28.8h, v28.8h, v1.8h
- trn2 v31.4s, v17.4s, v3.4s
- mul v30.8h, v25.8h, v29.8h
- add v20.8h, v20.8h, v21.8h
- mls v4.8h, v28.8h, v7.h[0]
- sqrdmulh v3.8h, v25.8h, v1.8h
- ldr q28, [x3, #0x40]
- trn1 v25.4s, v6.4s, v20.4s
- mls v30.8h, v3.8h, v7.h[0]
- ldr q27, [x3, #0x50]
- trn2 v6.4s, v6.4s, v20.4s
- trn1 v3.4s, v4.4s, v30.4s
- trn2 v10.4s, v28.4s, v27.4s
- trn2 v20.4s, v4.4s, v30.4s
- trn2 v8.2d, v25.2d, v3.2d
- trn1 v9.2d, v25.2d, v3.2d
- trn1 v1.2d, v6.2d, v20.2d
- trn2 v30.2d, v6.2d, v20.2d
- add v4.8h, v9.8h, v1.8h
- add v11.8h, v8.8h, v30.8h
- trn2 v25.2d, v10.2d, v31.2d
- sqdmulh v6.8h, v4.8h, v7.h[1]
- sqdmulh v20.8h, v11.8h, v7.h[1]
- ldr q21, [x2, #0x50]
- srshr v0.8h, v6.8h, #0xb
- srshr v3.8h, v20.8h, #0xb
- trn1 v2.4s, v28.4s, v27.4s
- mls v4.8h, v0.8h, v7.h[0]
- mls v11.8h, v3.8h, v7.h[0]
- ldr q0, [x1], #0x10
- trn2 v20.2d, v2.2d, v15.2d
- sub v6.8h, v4.8h, v11.8h
- sub v5.8h, v20.8h, v25.8h
- sub v22.8h, v9.8h, v1.8h
- sqrdmulh v3.8h, v6.8h, v0.h[1]
- mul v6.8h, v6.8h, v0.h[0]
- sqrdmulh v12.8h, v5.8h, v21.8h
- ldr q19, [x2, #0x40]
- mls v6.8h, v3.8h, v7.h[0]
- ldr q14, [x2], #0x60
- sub x4, x4, #0x2
+Lintt_layer4567_start:
+ add v16.8h, v21.8h, v10.8h
+ mul v18.8h, v6.8h, v15.8h
+ sub v19.8h, v20.8h, v25.8h
+ ldr q21, [x3, #0xa0]
+ str q5, [x3], #0x40
+ mls v18.8h, v13.8h, v7.h[0]
+ sqrdmulh v15.8h, v14.8h, v2.8h
+ ldr q10, [x3, #0x50]
+ ldr q12, [x3, #0x40]
+ stur q24, [x3, #-0x20]
+ mul v5.8h, v8.8h, v11.h[4]
+ sub v0.8h, v18.8h, v3.8h
+ ldr q24, [x3, #0x70]
+ mls v5.8h, v31.8h, v7.h[0]
+ ldr q26, [x2, #0x50]
+ trn2 v1.4s, v12.4s, v10.4s
+ add v6.8h, v18.8h, v3.8h
+ sqrdmulh v20.8h, v0.8h, v2.8h
+ trn1 v13.4s, v12.4s, v10.4s
+ trn1 v18.4s, v16.4s, v6.4s
+ mul v22.8h, v0.8h, v27.8h
+ trn1 v17.4s, v21.4s, v24.4s
+ sqrdmulh v0.8h, v19.8h, v11.h[3]
+ trn1 v25.2d, v13.2d, v17.2d
+ mls v22.8h, v20.8h, v7.h[0]
+ trn2 v21.4s, v21.4s, v24.4s
+ mul v24.8h, v25.8h, v29.8h
+ trn2 v28.2d, v13.2d, v17.2d
+ sqrdmulh v4.8h, v25.8h, v30.8h
+ trn2 v3.2d, v1.2d, v21.2d
+ mul v17.8h, v28.8h, v29.8h
+ sqrdmulh v31.8h, v28.8h, v30.8h
+ ldr q2, [x2, #0x10]
+ mls v24.8h, v4.8h, v7.h[0]
+ mul v4.8h, v19.8h, v11.h[2]
+ ldr q19, [x2, #0x40]
+ mls v4.8h, v0.8h, v7.h[0]
+ mul v0.8h, v14.8h, v27.8h
+ mls v0.8h, v15.8h, v7.h[0]
+ sub v8.8h, v4.8h, v5.8h
+ mul v12.8h, v3.8h, v29.8h
+ mul v23.8h, v8.8h, v11.h[0]
+ trn2 v28.4s, v16.4s, v6.4s
+ sqrdmulh v10.8h, v8.8h, v11.h[1]
+ trn1 v9.4s, v0.4s, v22.4s
+ trn2 v22.4s, v0.4s, v22.4s
+ ldr q11, [x1], #0x10
+ mls v17.8h, v31.8h, v7.h[0]
+ trn1 v20.2d, v18.2d, v9.2d
+ trn2 v14.2d, v18.2d, v9.2d
+ ldr q15, [x2, #0x20]
+ trn1 v6.2d, v1.2d, v21.2d
+ sqrdmulh v9.8h, v3.8h, v30.8h
+ trn1 v25.2d, v28.2d, v22.2d
+ trn2 v16.2d, v28.2d, v22.2d
+ mls v23.8h, v10.8h, v7.h[0]
+ add v1.8h, v20.8h, v25.8h
+ sqrdmulh v21.8h, v6.8h, v30.8h
+ add v8.8h, v14.8h, v16.8h
+ ldr q27, [x2], #0x60
+ sqdmulh v28.8h, v8.8h, v7.h[1]
+ mls v12.8h, v9.8h, v7.h[0]
+ sqdmulh v31.8h, v1.8h, v7.h[1]
+ mul v0.8h, v6.8h, v29.8h
+ sub v10.8h, v17.8h, v12.8h
+ mls v0.8h, v21.8h, v7.h[0]
+ srshr v21.8h, v28.8h, #0xb
+ srshr v13.8h, v31.8h, #0xb
+ sqrdmulh v22.8h, v10.8h, v26.8h
+ mls v8.8h, v21.8h, v7.h[0]
+ mls v1.8h, v13.8h, v7.h[0]
+ add v21.8h, v24.8h, v0.8h
+ stur q23, [x3, #-0x10]
+ sub v6.8h, v24.8h, v0.8h
+ mul v3.8h, v10.8h, v19.8h
+ add v0.8h, v4.8h, v5.8h
+ sqdmulh v13.8h, v0.8h, v7.h[1]
+ ldur q10, [x2, #-0x30]
+ add v5.8h, v1.8h, v8.8h
+ mls v3.8h, v22.8h, v7.h[0]
+ sub v8.8h, v1.8h, v8.8h
+ mul v24.8h, v8.8h, v11.h[0]
+ sqrdmulh v8.8h, v8.8h, v11.h[1]
+ srshr v1.8h, v13.8h, #0xb
+ sqrdmulh v13.8h, v6.8h, v10.8h
+ mls v0.8h, v1.8h, v7.h[0]
+ add v10.8h, v17.8h, v12.8h
+ mls v24.8h, v8.8h, v7.h[0]
+ sub v8.8h, v14.8h, v16.8h
+ sqrdmulh v31.8h, v8.8h, v11.h[5]
+ sub v14.8h, v21.8h, v10.8h
+ stur q0, [x3, #-0x30]
+ subs x4, x4, #0x1
+ cbnz x4, Lintt_layer4567_start
+ mul v15.8h, v6.8h, v15.8h
+ sub v22.8h, v20.8h, v25.8h
+ add v4.8h, v21.8h, v10.8h
+ str q24, [x3, #0x20]
+ mls v15.8h, v13.8h, v7.h[0]
+ str q5, [x3], #0x40
+ ldr q9, [x1], #0x10
+ sqrdmulh v28.8h, v14.8h, v2.8h
+ mul v16.8h, v14.8h, v27.8h
+ sub v18.8h, v15.8h, v3.8h
+ add v15.8h, v15.8h, v3.8h
+ sqrdmulh v0.8h, v18.8h, v2.8h
+ trn2 v24.4s, v4.4s, v15.4s
+ trn1 v2.4s, v4.4s, v15.4s
+ mul v18.8h, v18.8h, v27.8h
+ mls v16.8h, v28.8h, v7.h[0]
+ mls v18.8h, v0.8h, v7.h[0]
+ mul v23.8h, v8.8h, v11.h[4]
+ sqrdmulh v12.8h, v22.8h, v11.h[3]
+ trn1 v17.4s, v16.4s, v18.4s
+ trn2 v4.4s, v16.4s, v18.4s
+ mls v23.8h, v31.8h, v7.h[0]
+ trn2 v3.2d, v2.2d, v17.2d
+ trn2 v6.2d, v24.2d, v4.2d
+ mul v26.8h, v22.8h, v11.h[2]
+ trn1 v28.2d, v2.2d, v17.2d
+ mls v26.8h, v12.8h, v7.h[0]
+ add v25.8h, v3.8h, v6.8h
+ sub v18.8h, v3.8h, v6.8h
+ trn1 v24.2d, v24.2d, v4.2d
+ sqdmulh v1.8h, v25.8h, v7.h[1]
+ sub v27.8h, v28.8h, v24.8h
+ sqrdmulh v2.8h, v18.8h, v9.h[5]
+ add v28.8h, v28.8h, v24.8h
+ mul v24.8h, v27.8h, v9.h[2]
+ sqdmulh v12.8h, v28.8h, v7.h[1]
+ mul v20.8h, v18.8h, v9.h[4]
+ mls v20.8h, v2.8h, v7.h[0]
+ srshr v1.8h, v1.8h, #0xb
+ sqrdmulh v19.8h, v27.8h, v9.h[3]
+ srshr v15.8h, v12.8h, #0xb
+ mls v25.8h, v1.8h, v7.h[0]
+ add v8.8h, v26.8h, v23.8h
+ sub v4.8h, v26.8h, v23.8h
+ mls v28.8h, v15.8h, v7.h[0]
+ mls v24.8h, v19.8h, v7.h[0]
+ mul v2.8h, v4.8h, v11.h[0]
+ sub v19.8h, v28.8h, v25.8h
+ sqrdmulh v15.8h, v4.8h, v11.h[1]
+ add v25.8h, v28.8h, v25.8h
+ sub v10.8h, v24.8h, v20.8h
+ str q25, [x3], #0x40
+ sqrdmulh v22.8h, v19.8h, v9.h[1]
+ add v28.8h, v24.8h, v20.8h
+ sqrdmulh v25.8h, v10.8h, v9.h[1]
+ mul v27.8h, v19.8h, v9.h[0]
+ mul v26.8h, v10.8h, v9.h[0]
+ sqdmulh v20.8h, v28.8h, v7.h[1]
+ sqdmulh v16.8h, v8.8h, v7.h[1]
+ mls v26.8h, v25.8h, v7.h[0]
+ mls v2.8h, v15.8h, v7.h[0]
+ srshr v15.8h, v20.8h, #0xb
+ srshr v1.8h, v16.8h, #0xb
+ mls v27.8h, v22.8h, v7.h[0]
+ mls v28.8h, v15.8h, v7.h[0]
+ mls v8.8h, v1.8h, v7.h[0]
+ stur q27, [x3, #-0x20]
+ stur q2, [x3, #-0x50]
+ stur q28, [x3, #-0x30]
+ stur q26, [x3, #-0x10]
+ stur q8, [x3, #-0x70]
+ mov x4, #0x4 // =4
+ ldr q0, [x1], #0x20
+ ldur q1, [x1, #-0x10]
+ ldr q26, [x0]
+ ldr q13, [x0, #0x40]
+ ldr q28, [x0, #0xc0]
+ ldr q2, [x0, #0x140]
+ ldr q6, [x0, #0x80]
+ ldr q9, [x0, #0x100]
+ ldr q29, [x0, #0x1c0]
+ ldr q23, [x0, #0x180]
+ sub v17.8h, v26.8h, v13.8h
+ add v4.8h, v26.8h, v13.8h
+ ldr q25, [x0, #0xd0]
+ ldr q24, [x0, #0x50]
+ add v5.8h, v6.8h, v28.8h
+ mul v19.8h, v17.8h, v0.h[6]
+ sub v10.8h, v6.8h, v28.8h
+ ldr q30, [x0, #0x150]
+ sqrdmulh v12.8h, v17.8h, v0.h[7]
+ add v17.8h, v9.8h, v2.8h
+ sub v28.8h, v9.8h, v2.8h
+ ldr q2, [x0, #0x90]
+ sub v26.8h, v23.8h, v29.8h
+ sqrdmulh v31.8h, v10.8h, v1.h[1]
+ add v22.8h, v23.8h, v29.8h
+ ldr q3, [x0, #0x110]
+ sqrdmulh v9.8h, v28.8h, v1.h[3]
+ sub v20.8h, v4.8h, v5.8h
+ sub v27.8h, v17.8h, v22.8h
+ ldr q29, [x0, #0x10]
+ add v16.8h, v4.8h, v5.8h
+ sqrdmulh v4.8h, v26.8h, v1.h[5]
+ add v6.8h, v17.8h, v22.8h
+ ldr q22, [x0, #0x1d0]
+ mul v8.8h, v28.8h, v1.h[2]
+ sub v21.8h, v2.8h, v25.8h
+ sub v5.8h, v16.8h, v6.8h
+ mul v17.8h, v26.8h, v1.h[4]
+ mul v26.8h, v10.8h, v1.h[0]
+ mls v26.8h, v31.8h, v7.h[0]
+ mls v17.8h, v4.8h, v7.h[0]
+ mls v19.8h, v12.8h, v7.h[0]
+ mls v8.8h, v9.8h, v7.h[0]
+ sqrdmulh v10.8h, v27.8h, v0.h[5]
+ sub v12.8h, v19.8h, v26.8h
+ add v9.8h, v19.8h, v26.8h
+ sqrdmulh v26.8h, v20.8h, v0.h[3]
+ sub v11.8h, v8.8h, v17.8h
+ add v14.8h, v8.8h, v17.8h
+ sqrdmulh v13.8h, v12.8h, v0.h[3]
+ add v23.8h, v9.8h, v14.8h
+ sqrdmulh v28.8h, v11.8h, v0.h[5]
+ sub v19.8h, v9.8h, v14.8h
+ mul v17.8h, v27.8h, v0.h[4]
+ str q23, [x0, #0x40]
+ mul v14.8h, v20.8h, v0.h[2]
+ mul v8.8h, v11.8h, v0.h[4]
+ mul v4.8h, v12.8h, v0.h[2]
+ mls v14.8h, v26.8h, v7.h[0]
+ mls v17.8h, v10.8h, v7.h[0]
+ mls v8.8h, v28.8h, v7.h[0]
+ mls v4.8h, v13.8h, v7.h[0]
+ sub v10.8h, v14.8h, v17.8h
+ add v20.8h, v14.8h, v17.8h
+ sqrdmulh v28.8h, v5.8h, v0.h[1]
+ mul v18.8h, v5.8h, v0.h[0]
+ str q20, [x0, #0x80]
+ sub v13.8h, v4.8h, v8.8h
+ mul v23.8h, v10.8h, v0.h[0]
+ mul v17.8h, v19.8h, v0.h[0]
+ sqrdmulh v9.8h, v13.8h, v0.h[1]
+ mls v18.8h, v28.8h, v7.h[0]
+ sqrdmulh v10.8h, v10.8h, v0.h[1]
+ sub x4, x4, #0x2
-intt_layer4567_start:
- str q6, [x3, #0x20]
- ldur q18, [x2, #-0x50]
- mul v26.8h, v5.8h, v19.8h
- trn1 v16.2d, v10.2d, v31.2d
- mul v27.8h, v22.8h, v0.h[2]
- trn1 v10.2d, v2.2d, v15.2d
- add v5.8h, v4.8h, v11.8h
- mls v26.8h, v12.8h, v7.h[0]
- add v11.8h, v10.8h, v16.8h
- add v6.8h, v20.8h, v25.8h
- ldur q25, [x2, #-0x40]
- ldur q28, [x2, #-0x30]
- ldr q2, [x3, #0xa0]
- ldr q19, [x2, #0x40]
- sub v17.8h, v8.8h, v30.8h
- ldr q1, [x3, #0x90]
- sqrdmulh v9.8h, v17.8h, v0.h[5]
- str q5, [x3], #0x40
- ldr q30, [x3, #0x70]
- sub v10.8h, v10.8h, v16.8h
- ldr q16, [x3, #0x40]
- sqrdmulh v24.8h, v10.8h, v28.8h
- mul v13.8h, v10.8h, v25.8h
- sub v21.8h, v11.8h, v6.8h
- trn1 v15.4s, v2.4s, v30.4s
- trn2 v31.4s, v2.4s, v30.4s
- mls v13.8h, v24.8h, v7.h[0]
- mul v29.8h, v21.8h, v14.8h
- ldr q12, [x2, #0x50]
- sub v28.8h, v13.8h, v26.8h
- trn2 v10.4s, v16.4s, v1.4s
- add v30.8h, v11.8h, v6.8h
- sqrdmulh v2.8h, v28.8h, v18.8h
- mul v8.8h, v28.8h, v14.8h
- sqrdmulh v18.8h, v21.8h, v18.8h
- ldr q14, [x2], #0x60
- mls v8.8h, v2.8h, v7.h[0]
- add v11.8h, v13.8h, v26.8h
- mls v29.8h, v18.8h, v7.h[0]
- sqrdmulh v20.8h, v22.8h, v0.h[3]
- trn1 v23.4s, v30.4s, v11.4s
- trn2 v28.4s, v30.4s, v11.4s
- trn2 v13.4s, v29.4s, v8.4s
- trn1 v11.4s, v29.4s, v8.4s
- mls v27.8h, v20.8h, v7.h[0]
- trn1 v21.2d, v28.2d, v13.2d
- trn2 v8.2d, v23.2d, v11.2d
- trn1 v24.2d, v23.2d, v11.2d
- mul v26.8h, v17.8h, v0.h[4]
- trn2 v30.2d, v28.2d, v13.2d
- add v4.8h, v24.8h, v21.8h
- add v11.8h, v8.8h, v30.8h
- mls v26.8h, v9.8h, v7.h[0]
- sqdmulh v17.8h, v4.8h, v7.h[1]
- sqdmulh v29.8h, v11.8h, v7.h[1]
- trn2 v25.2d, v10.2d, v31.2d
- add v2.8h, v27.8h, v26.8h
- srshr v28.8h, v17.8h, #0xb
- srshr v13.8h, v29.8h, #0xb
- sqdmulh v20.8h, v2.8h, v7.h[1]
- sub v5.8h, v27.8h, v26.8h
- mls v4.8h, v28.8h, v7.h[0]
- mls v11.8h, v13.8h, v7.h[0]
- srshr v23.8h, v20.8h, #0xb
- sqrdmulh v17.8h, v5.8h, v0.h[1]
- mul v9.8h, v5.8h, v0.h[0]
- mls v2.8h, v23.8h, v7.h[0]
- sub v29.8h, v4.8h, v11.8h
- ldr q0, [x1], #0x10
- stur q2, [x3, #-0x30]
- trn1 v2.4s, v16.4s, v1.4s
- sqrdmulh v3.8h, v29.8h, v0.h[1]
- mul v6.8h, v29.8h, v0.h[0]
- trn2 v20.2d, v2.2d, v15.2d
- mls v9.8h, v17.8h, v7.h[0]
- sub v5.8h, v20.8h, v25.8h
- mls v6.8h, v3.8h, v7.h[0]
- sub v22.8h, v24.8h, v21.8h
- stur q9, [x3, #-0x10]
- sqrdmulh v12.8h, v5.8h, v12.8h
- subs x4, x4, #0x1
- cbnz x4, intt_layer4567_start
- mul v21.8h, v22.8h, v0.h[2]
- mul v28.8h, v5.8h, v19.8h
- trn1 v10.2d, v10.2d, v31.2d
- trn1 v2.2d, v2.2d, v15.2d
- add v11.8h, v4.8h, v11.8h
- sub v30.8h, v8.8h, v30.8h
- add v23.8h, v20.8h, v25.8h
- add v24.8h, v2.8h, v10.8h
- mul v8.8h, v30.8h, v0.h[4]
- sqrdmulh v5.8h, v30.8h, v0.h[5]
- sqrdmulh v22.8h, v22.8h, v0.h[3]
- add v30.8h, v24.8h, v23.8h
- ldur q26, [x2, #-0x30]
- mls v8.8h, v5.8h, v7.h[0]
- sub v5.8h, v2.8h, v10.8h
- ldur q13, [x2, #-0x40]
- mls v21.8h, v22.8h, v7.h[0]
- str q6, [x3, #0x20]
- mul v3.8h, v5.8h, v13.8h
- sqrdmulh v22.8h, v5.8h, v26.8h
- sub v18.8h, v21.8h, v8.8h
- mls v28.8h, v12.8h, v7.h[0]
- str q11, [x3], #0x40
- mls v3.8h, v22.8h, v7.h[0]
- sqrdmulh v16.8h, v18.8h, v0.h[1]
- sub v10.8h, v24.8h, v23.8h
- mul v17.8h, v18.8h, v0.h[0]
- sub v11.8h, v3.8h, v28.8h
- mul v13.8h, v10.8h, v14.8h
- add v22.8h, v3.8h, v28.8h
- mul v14.8h, v11.8h, v14.8h
- ldur q26, [x2, #-0x50]
- trn2 v2.4s, v30.4s, v22.4s
- mls v17.8h, v16.8h, v7.h[0]
- sqrdmulh v10.8h, v10.8h, v26.8h
- sqrdmulh v11.8h, v11.8h, v26.8h
- ldr q9, [x1], #0x10
- mls v13.8h, v10.8h, v7.h[0]
- mls v14.8h, v11.8h, v7.h[0]
- trn1 v6.4s, v30.4s, v22.4s
- add v8.8h, v21.8h, v8.8h
- stur q17, [x3, #-0x10]
- trn2 v0.4s, v13.4s, v14.4s
- trn1 v1.4s, v13.4s, v14.4s
- sqdmulh v13.8h, v8.8h, v7.h[1]
- trn1 v24.2d, v2.2d, v0.2d
- trn2 v2.2d, v2.2d, v0.2d
- trn2 v26.2d, v6.2d, v1.2d
- trn1 v11.2d, v6.2d, v1.2d
- add v22.8h, v26.8h, v2.8h
- sub v28.8h, v11.8h, v24.8h
- sub v27.8h, v26.8h, v2.8h
- add v10.8h, v11.8h, v24.8h
- sqrdmulh v11.8h, v28.8h, v9.h[3]
- mul v24.8h, v28.8h, v9.h[2]
- sqdmulh v1.8h, v22.8h, v7.h[1]
- sqrdmulh v0.8h, v27.8h, v9.h[5]
- srshr v12.8h, v13.8h, #0xb
- mls v24.8h, v11.8h, v7.h[0]
- sqdmulh v14.8h, v10.8h, v7.h[1]
- mul v27.8h, v27.8h, v9.h[4]
- mls v8.8h, v12.8h, v7.h[0]
- srshr v5.8h, v1.8h, #0xb
- srshr v14.8h, v14.8h, #0xb
- mls v27.8h, v0.8h, v7.h[0]
- mls v22.8h, v5.8h, v7.h[0]
- mls v10.8h, v14.8h, v7.h[0]
- stur q8, [x3, #-0x30]
- sub v2.8h, v24.8h, v27.8h
- add v14.8h, v24.8h, v27.8h
- sub v11.8h, v10.8h, v22.8h
- add v20.8h, v10.8h, v22.8h
- sqdmulh v22.8h, v14.8h, v7.h[1]
- sqrdmulh v8.8h, v11.8h, v9.h[1]
- mul v27.8h, v11.8h, v9.h[0]
- sqrdmulh v0.8h, v2.8h, v9.h[1]
- mul v11.8h, v2.8h, v9.h[0]
- srshr v10.8h, v22.8h, #0xb
- mls v27.8h, v8.8h, v7.h[0]
- str q20, [x3], #0x40
- mls v11.8h, v0.8h, v7.h[0]
- mls v14.8h, v10.8h, v7.h[0]
- stur q27, [x3, #-0x20]
- stur q11, [x3, #-0x10]
- stur q14, [x3, #-0x30]
- mov x4, #0x4 // =4
- ldr q0, [x1], #0x20
- ldur q1, [x1, #-0x10]
- ldr q2, [x0]
- ldr q10, [x0, #0x40]
- ldr q11, [x0, #0x80]
- sub v14.8h, v2.8h, v10.8h
- add v2.8h, v2.8h, v10.8h
- ldr q10, [x0, #0xc0]
- sqrdmulh v8.8h, v14.8h, v0.h[7]
- mul v14.8h, v14.8h, v0.h[6]
- sub v22.8h, v11.8h, v10.8h
- add v10.8h, v11.8h, v10.8h
- ldr q11, [x0, #0x1c0]
- add v13.8h, v2.8h, v10.8h
- sub v2.8h, v2.8h, v10.8h
- sqrdmulh v10.8h, v22.8h, v1.h[1]
- mul v22.8h, v22.8h, v1.h[0]
- mls v14.8h, v8.8h, v7.h[0]
- sqrdmulh v8.8h, v2.8h, v0.h[3]
- mul v2.8h, v2.8h, v0.h[2]
- mls v22.8h, v10.8h, v7.h[0]
- ldr q10, [x0, #0x100]
- mls v2.8h, v8.8h, v7.h[0]
- sub v8.8h, v14.8h, v22.8h
- add v14.8h, v14.8h, v22.8h
- ldr q22, [x0, #0x180]
- sqrdmulh v24.8h, v8.8h, v0.h[3]
- mul v8.8h, v8.8h, v0.h[2]
- sub v26.8h, v22.8h, v11.8h
- add v11.8h, v22.8h, v11.8h
- ldr q22, [x0, #0x140]
- sqrdmulh v16.8h, v26.8h, v1.h[5]
- mul v26.8h, v26.8h, v1.h[4]
- add v23.8h, v10.8h, v22.8h
- sub v10.8h, v10.8h, v22.8h
- mls v8.8h, v24.8h, v7.h[0]
- add v22.8h, v23.8h, v11.8h
- mul v24.8h, v10.8h, v1.h[2]
- sqrdmulh v10.8h, v10.8h, v1.h[3]
- sub v19.8h, v13.8h, v22.8h
- add v18.8h, v13.8h, v22.8h
- sub v11.8h, v23.8h, v11.8h
- mls v24.8h, v10.8h, v7.h[0]
- mls v26.8h, v16.8h, v7.h[0]
- sqrdmulh v10.8h, v11.8h, v0.h[5]
- mul v11.8h, v11.8h, v0.h[4]
- sqrdmulh v22.8h, v19.8h, v0.h[1]
- sub v13.8h, v24.8h, v26.8h
- mul v16.8h, v19.8h, v0.h[0]
- mls v11.8h, v10.8h, v7.h[0]
- sqrdmulh v10.8h, v13.8h, v0.h[5]
- mul v13.8h, v13.8h, v0.h[4]
- add v24.8h, v24.8h, v26.8h
- sub v26.8h, v2.8h, v11.8h
- add v9.8h, v2.8h, v11.8h
- add v11.8h, v14.8h, v24.8h
- sub v14.8h, v14.8h, v24.8h
- sqrdmulh v2.8h, v26.8h, v0.h[1]
- mul v24.8h, v26.8h, v0.h[0]
- mls v13.8h, v10.8h, v7.h[0]
- mls v16.8h, v22.8h, v7.h[0]
- sqrdmulh v10.8h, v14.8h, v0.h[1]
- mls v24.8h, v2.8h, v7.h[0]
- add v22.8h, v8.8h, v13.8h
- str q16, [x0, #0x100]
- sub v2.8h, v8.8h, v13.8h
- str q24, [x0, #0x180]
- mul v13.8h, v14.8h, v0.h[0]
- str q22, [x0, #0xc0]
- sqrdmulh v21.8h, v2.8h, v0.h[1]
- ldr q6, [x0, #0x90]
- ldr q14, [x0, #0xd0]
- mls v13.8h, v10.8h, v7.h[0]
- str q11, [x0, #0x40]
- sub v10.8h, v6.8h, v14.8h
- ldr q11, [x0, #0x10]
- sqrdmulh v19.8h, v10.8h, v1.h[1]
- mul v20.8h, v10.8h, v1.h[0]
- ldr q28, [x0, #0x50]
- sub x4, x4, #0x2
-
-intt_layer123_start:
- mls v20.8h, v19.8h, v7.h[0]
- ldr q31, [x0, #0x1d0]
- sub v22.8h, v11.8h, v28.8h
- ldr q30, [x0, #0x110]
- sqrdmulh v8.8h, v22.8h, v0.h[7]
- mul v3.8h, v22.8h, v0.h[6]
- mul v5.8h, v2.8h, v0.h[0]
- str q13, [x0, #0x140]
- add v10.8h, v11.8h, v28.8h
- ldr q22, [x0, #0x150]
- ldr q4, [x0, #0x190]
- sub v23.8h, v30.8h, v22.8h
- add v27.8h, v30.8h, v22.8h
- mls v3.8h, v8.8h, v7.h[0]
- mls v5.8h, v21.8h, v7.h[0]
- ldr q11, [x0, #0x20]
- sub v17.8h, v4.8h, v31.8h
- add v2.8h, v6.8h, v14.8h
- mul v19.8h, v23.8h, v1.h[2]
- sub v22.8h, v3.8h, v20.8h
- add v14.8h, v10.8h, v2.8h
- sub v24.8h, v10.8h, v2.8h
- sqrdmulh v2.8h, v23.8h, v1.h[3]
- sqrdmulh v30.8h, v22.8h, v0.h[3]
- mul v23.8h, v22.8h, v0.h[2]
- sqrdmulh v15.8h, v17.8h, v1.h[5]
- mls v19.8h, v2.8h, v7.h[0]
- add v2.8h, v4.8h, v31.8h
- mul v21.8h, v17.8h, v1.h[4]
- sqrdmulh v22.8h, v24.8h, v0.h[3]
- sub v26.8h, v27.8h, v2.8h
- add v8.8h, v27.8h, v2.8h
- mul v28.8h, v24.8h, v0.h[2]
- sqrdmulh v10.8h, v26.8h, v0.h[5]
- mul v31.8h, v26.8h, v0.h[4]
- mls v21.8h, v15.8h, v7.h[0]
- mls v28.8h, v22.8h, v7.h[0]
- sub v17.8h, v14.8h, v8.8h
- mls v31.8h, v10.8h, v7.h[0]
- sub v27.8h, v19.8h, v21.8h
- sqrdmulh v29.8h, v17.8h, v0.h[1]
- mul v10.8h, v17.8h, v0.h[0]
- sub v15.8h, v28.8h, v31.8h
- sqrdmulh v17.8h, v27.8h, v0.h[5]
- mul v25.8h, v27.8h, v0.h[4]
- sqrdmulh v6.8h, v15.8h, v0.h[1]
- mul v27.8h, v15.8h, v0.h[0]
- add v16.8h, v19.8h, v21.8h
- mls v25.8h, v17.8h, v7.h[0]
- mls v23.8h, v30.8h, v7.h[0]
- mls v27.8h, v6.8h, v7.h[0]
- ldr q6, [x0, #0xa0]
- add v22.8h, v23.8h, v25.8h
- str q27, [x0, #0x190]
- add v4.8h, v3.8h, v20.8h
- str q22, [x0, #0xd0]
- mls v10.8h, v29.8h, v7.h[0]
- str q5, [x0, #0x1c0]
- add v20.8h, v4.8h, v16.8h
- str q18, [x0], #0x10
- sub v18.8h, v4.8h, v16.8h
- str q10, [x0, #0x100]
- sub v2.8h, v23.8h, v25.8h
- sqrdmulh v12.8h, v18.8h, v0.h[1]
- mul v13.8h, v18.8h, v0.h[0]
- add v18.8h, v14.8h, v8.8h
- ldr q14, [x0, #0xd0]
- mls v13.8h, v12.8h, v7.h[0]
- str q9, [x0, #0x70]
- sub v3.8h, v6.8h, v14.8h
- add v9.8h, v28.8h, v31.8h
- str q20, [x0, #0x40]
- sqrdmulh v19.8h, v3.8h, v1.h[1]
- mul v20.8h, v3.8h, v1.h[0]
- sqrdmulh v21.8h, v2.8h, v0.h[1]
- ldr q28, [x0, #0x50]
- subs x4, x4, #0x1
- cbnz x4, intt_layer123_start
- mls v20.8h, v19.8h, v7.h[0]
- sub v10.8h, v11.8h, v28.8h
- add v11.8h, v11.8h, v28.8h
- mul v2.8h, v2.8h, v0.h[0]
- str q13, [x0, #0x140]
- add v25.8h, v6.8h, v14.8h
- str q18, [x0], #0x10
- sqrdmulh v17.8h, v10.8h, v0.h[7]
- str q9, [x0, #0x70]
- ldr q8, [x0, #0x1c0]
- ldr q13, [x0, #0x100]
- ldr q26, [x0, #0x180]
- ldr q24, [x0, #0x140]
- add v15.8h, v26.8h, v8.8h
- sub v8.8h, v26.8h, v8.8h
- sub v12.8h, v13.8h, v24.8h
- add v24.8h, v13.8h, v24.8h
- sqrdmulh v18.8h, v8.8h, v1.h[5]
- mul v26.8h, v12.8h, v1.h[2]
- mul v8.8h, v8.8h, v1.h[4]
- sqrdmulh v16.8h, v12.8h, v1.h[3]
- mul v10.8h, v10.8h, v0.h[6]
- add v22.8h, v11.8h, v25.8h
- mls v8.8h, v18.8h, v7.h[0]
- mls v26.8h, v16.8h, v7.h[0]
- mls v10.8h, v17.8h, v7.h[0]
- add v23.8h, v24.8h, v15.8h
- sub v11.8h, v11.8h, v25.8h
- sub v3.8h, v26.8h, v8.8h
- sub v14.8h, v10.8h, v20.8h
- sub v19.8h, v22.8h, v23.8h
- mul v18.8h, v3.8h, v0.h[4]
- sqrdmulh v17.8h, v14.8h, v0.h[3]
- mul v14.8h, v14.8h, v0.h[2]
- sqrdmulh v3.8h, v3.8h, v0.h[5]
- sub v16.8h, v24.8h, v15.8h
- mls v2.8h, v21.8h, v7.h[0]
- mls v14.8h, v17.8h, v7.h[0]
- mls v18.8h, v3.8h, v7.h[0]
- sqrdmulh v31.8h, v16.8h, v0.h[5]
- str q2, [x0, #0x1b0]
- mul v13.8h, v16.8h, v0.h[4]
- add v24.8h, v14.8h, v18.8h
- sqrdmulh v2.8h, v11.8h, v0.h[3]
- mul v21.8h, v11.8h, v0.h[2]
- mls v13.8h, v31.8h, v7.h[0]
- add v16.8h, v26.8h, v8.8h
- add v28.8h, v10.8h, v20.8h
- mls v21.8h, v2.8h, v7.h[0]
- sub v14.8h, v14.8h, v18.8h
- add v2.8h, v28.8h, v16.8h
- sub v10.8h, v28.8h, v16.8h
- sub v16.8h, v21.8h, v13.8h
- sqrdmulh v27.8h, v19.8h, v0.h[1]
- mul v26.8h, v19.8h, v0.h[0]
- sqrdmulh v19.8h, v16.8h, v0.h[1]
- mul v28.8h, v16.8h, v0.h[0]
- sqrdmulh v8.8h, v14.8h, v0.h[1]
- mls v26.8h, v27.8h, v7.h[0]
- mul v14.8h, v14.8h, v0.h[0]
- mls v28.8h, v19.8h, v7.h[0]
- sqrdmulh v20.8h, v10.8h, v0.h[1]
- str q26, [x0, #0x100]
- mul v10.8h, v10.8h, v0.h[0]
- str q28, [x0, #0x180]
- add v22.8h, v22.8h, v23.8h
- str q24, [x0, #0xc0]
- mls v10.8h, v20.8h, v7.h[0]
- str q2, [x0, #0x40]
- mls v14.8h, v8.8h, v7.h[0]
- str q22, [x0], #0x10
- add v11.8h, v21.8h, v13.8h
- str q10, [x0, #0x130]
- str q11, [x0, #0x70]
- str q14, [x0, #0x1b0]
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
- ldp d12, d13, [sp, #0x20]
- ldp d14, d15, [sp, #0x30]
- add sp, sp, #0x40
+Lintt_layer123_start:
+ sub v12.8h, v3.8h, v30.8h
+ mul v11.8h, v21.8h, v1.h[0]
+ add v28.8h, v4.8h, v8.8h
+ ldr q20, [x0, #0x190]
+ add v27.8h, v16.8h, v6.8h
+ sqrdmulh v8.8h, v12.8h, v1.h[3]
+ add v16.8h, v29.8h, v24.8h
+ str q28, [x0, #0xc0]
+ mls v23.8h, v10.8h, v7.h[0]
+ str q27, [x0], #0x10
+ add v15.8h, v20.8h, v22.8h
+ str q18, [x0, #0xf0]
+ mul v14.8h, v13.8h, v0.h[0]
+ add v2.8h, v2.8h, v25.8h
+ sub v26.8h, v20.8h, v22.8h
+ mul v4.8h, v12.8h, v1.h[2]
+ sub v5.8h, v16.8h, v2.8h
+ str q23, [x0, #0x170]
+ add v20.8h, v3.8h, v30.8h
+ sqrdmulh v27.8h, v26.8h, v1.h[5]
+ add v16.8h, v16.8h, v2.8h
+ mul v18.8h, v26.8h, v1.h[4]
+ sub v31.8h, v20.8h, v15.8h
+ mls v4.8h, v8.8h, v7.h[0]
+ sub v28.8h, v29.8h, v24.8h
+ mls v18.8h, v27.8h, v7.h[0]
+ ldr q22, [x0, #0x1d0]
+ mul v26.8h, v28.8h, v0.h[6]
+ mul v2.8h, v5.8h, v0.h[2]
+ sub v12.8h, v4.8h, v18.8h
+ sqrdmulh v24.8h, v28.8h, v0.h[7]
+ mls v14.8h, v9.8h, v7.h[0]
+ sqrdmulh v10.8h, v12.8h, v0.h[5]
+ mls v26.8h, v24.8h, v7.h[0]
+ ldr q24, [x0, #0x50]
+ mul v8.8h, v12.8h, v0.h[4]
+ str q14, [x0, #0x1b0]
+ add v28.8h, v4.8h, v18.8h
+ sqrdmulh v5.8h, v5.8h, v0.h[3]
+ add v6.8h, v20.8h, v15.8h
+ sqrdmulh v3.8h, v19.8h, v0.h[1]
+ sub v13.8h, v16.8h, v6.8h
+ sqrdmulh v12.8h, v21.8h, v1.h[1]
+ sqrdmulh v21.8h, v13.8h, v0.h[1]
+ sqrdmulh v27.8h, v31.8h, v0.h[5]
+ ldr q25, [x0, #0xd0]
+ mls v11.8h, v12.8h, v7.h[0]
+ mul v23.8h, v31.8h, v0.h[4]
+ mul v18.8h, v13.8h, v0.h[0]
+ add v30.8h, v26.8h, v11.8h
+ sub v13.8h, v26.8h, v11.8h
+ mls v23.8h, v27.8h, v7.h[0]
+ add v12.8h, v30.8h, v28.8h
+ sub v19.8h, v30.8h, v28.8h
+ mls v2.8h, v5.8h, v7.h[0]
+ str q12, [x0, #0x40]
+ sqrdmulh v26.8h, v13.8h, v0.h[3]
+ mls v8.8h, v10.8h, v7.h[0]
+ ldr q30, [x0, #0x150]
+ sub v20.8h, v2.8h, v23.8h
+ mul v4.8h, v13.8h, v0.h[2]
+ add v13.8h, v2.8h, v23.8h
+ mls v4.8h, v26.8h, v7.h[0]
+ ldr q2, [x0, #0x90]
+ mul v23.8h, v20.8h, v0.h[0]
+ ldr q29, [x0, #0x10]
+ sqrdmulh v10.8h, v20.8h, v0.h[1]
+ str q13, [x0, #0x80]
+ sub v13.8h, v4.8h, v8.8h
+ mls v17.8h, v3.8h, v7.h[0]
+ ldr q3, [x0, #0x110]
+ mls v18.8h, v21.8h, v7.h[0]
+ sub v21.8h, v2.8h, v25.8h
+ sqrdmulh v9.8h, v13.8h, v0.h[1]
+ str q17, [x0, #0x130]
+ mul v17.8h, v19.8h, v0.h[0]
+ subs x4, x4, #0x1
+ cbnz x4, Lintt_layer123_start
+ mls v23.8h, v10.8h, v7.h[0]
+ ldr q11, [x0, #0x190]
+ str q18, [x0, #0x100]
+ add v27.8h, v3.8h, v30.8h
+ mul v13.8h, v13.8h, v0.h[0]
+ sub v5.8h, v29.8h, v24.8h
+ add v14.8h, v16.8h, v6.8h
+ mls v13.8h, v9.8h, v7.h[0]
+ add v10.8h, v11.8h, v22.8h
+ str q23, [x0, #0x180]
+ sub v20.8h, v11.8h, v22.8h
+ sub v23.8h, v27.8h, v10.8h
+ sqrdmulh v16.8h, v21.8h, v1.h[1]
+ sqrdmulh v31.8h, v23.8h, v0.h[5]
+ str q13, [x0, #0x1c0]
+ add v13.8h, v4.8h, v8.8h
+ mul v18.8h, v21.8h, v1.h[0]
+ str q13, [x0, #0xc0]
+ sqrdmulh v13.8h, v19.8h, v0.h[1]
+ sqrdmulh v28.8h, v20.8h, v1.h[5]
+ str q14, [x0], #0x10
+ mul v4.8h, v20.8h, v1.h[4]
+ mls v17.8h, v13.8h, v7.h[0]
+ sub v13.8h, v3.8h, v30.8h
+ sqrdmulh v8.8h, v13.8h, v1.h[3]
+ mul v12.8h, v13.8h, v1.h[2]
+ mls v4.8h, v28.8h, v7.h[0]
+ mls v12.8h, v8.8h, v7.h[0]
+ mls v18.8h, v16.8h, v7.h[0]
+ str q17, [x0, #0x130]
+ sqrdmulh v15.8h, v5.8h, v0.h[7]
+ add v11.8h, v27.8h, v10.8h
+ mul v16.8h, v5.8h, v0.h[6]
+ sub v8.8h, v12.8h, v4.8h
+ sqrdmulh v28.8h, v8.8h, v0.h[5]
+ add v13.8h, v2.8h, v25.8h
+ mls v16.8h, v15.8h, v7.h[0]
+ add v26.8h, v12.8h, v4.8h
+ mul v8.8h, v8.8h, v0.h[4]
+ add v4.8h, v29.8h, v24.8h
+ mls v8.8h, v28.8h, v7.h[0]
+ sub v20.8h, v4.8h, v13.8h
+ add v14.8h, v4.8h, v13.8h
+ add v12.8h, v16.8h, v18.8h
+ sqrdmulh v22.8h, v20.8h, v0.h[3]
+ add v27.8h, v14.8h, v11.8h
+ sub v13.8h, v16.8h, v18.8h
+ mul v4.8h, v20.8h, v0.h[2]
+ str q27, [x0], #0x10
+ sub v24.8h, v12.8h, v26.8h
+ sqrdmulh v3.8h, v13.8h, v0.h[3]
+ mul v13.8h, v13.8h, v0.h[2]
+ sqrdmulh v27.8h, v24.8h, v0.h[1]
+ mls v13.8h, v3.8h, v7.h[0]
+ mul v9.8h, v24.8h, v0.h[0]
+ mls v9.8h, v27.8h, v7.h[0]
+ add v30.8h, v13.8h, v8.8h
+ sub v13.8h, v13.8h, v8.8h
+ mls v4.8h, v22.8h, v7.h[0]
+ str q30, [x0, #0xb0]
+ sqrdmulh v16.8h, v13.8h, v0.h[1]
+ str q9, [x0, #0x130]
+ mul v9.8h, v13.8h, v0.h[0]
+ add v13.8h, v12.8h, v26.8h
+ str q13, [x0, #0x30]
+ mul v13.8h, v23.8h, v0.h[4]
+ sub v23.8h, v14.8h, v11.8h
+ mls v13.8h, v31.8h, v7.h[0]
+ mls v9.8h, v16.8h, v7.h[0]
+ mul v30.8h, v23.8h, v0.h[0]
+ sub v24.8h, v4.8h, v13.8h
+ add v13.8h, v4.8h, v13.8h
+ sqrdmulh v23.8h, v23.8h, v0.h[1]
+ str q9, [x0, #0x1b0]
+ str q13, [x0, #0x70]
+ sqrdmulh v13.8h, v24.8h, v0.h[1]
+ mul v21.8h, v24.8h, v0.h[0]
+ mls v30.8h, v23.8h, v7.h[0]
+ mls v21.8h, v13.8h, v7.h[0]
+ str q30, [x0, #0xf0]
+ str q21, [x0, #0x170]
+ ldp d8, d9, [sp]
+ .cfi_restore d8
+ .cfi_restore d9
+ ldp d10, d11, [sp, #0x10]
+ .cfi_restore d10
+ .cfi_restore d11
+ ldp d12, d13, [sp, #0x20]
+ .cfi_restore d12
+ .cfi_restore d13
+ ldp d14, d15, [sp, #0x30]
+ .cfi_restore d14
+ .cfi_restore d15
+ add sp, sp, #0x40
+ .cfi_adjust_cfa_offset -0x40
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(intt_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/ntt.S
index bf5922c144..2ce53dc579 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/ntt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/ntt.S
@@ -19,7 +19,33 @@
* https://eprint.iacr.org/2022/1303
*/
-/* AArch64 ML-KEM forward NTT following @[NeonNTT] and @[SLOTHY_Paper]. */
+/*yaml
+ Name: ntt_asm
+ Description: AArch64 ML-KEM forward NTT following @[NeonNTT] and @[SLOTHY_Paper]
+ Signature: void mlk_ntt_asm(int16_t p[256], const int16_t twiddles12345[80], const int16_t twiddles56[384])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: read/write
+ c_parameter: int16_t p[256]
+ description: Input/output polynomial
+ x1:
+ type: buffer
+ size_bytes: 160
+ permissions: read-only
+ c_parameter: const int16_t twiddles12345[80]
+ description: Twiddle factors for layers 1-5
+ x2:
+ type: buffer
+ size_bytes: 768
+ permissions: read-only
+ c_parameter: const int16_t twiddles56[384]
+ description: Twiddle factors for layers 6-7
+ Stack:
+ bytes: 64
+ description: saving callee-saved Neon registers
+*/
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
@@ -29,336 +55,508 @@
* dev/aarch64_opt/src/ntt.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(ntt_asm)
MLK_ASM_FN_SYMBOL(ntt_asm)
- sub sp, sp, #0x40
- stp d8, d9, [sp]
- stp d10, d11, [sp, #0x10]
- stp d12, d13, [sp, #0x20]
- stp d14, d15, [sp, #0x30]
- mov w5, #0xd01 // =3329
- mov v7.h[0], w5
- mov w5, #0x4ebf // =20159
- mov v7.h[1], w5
- mov x3, x0
- mov x4, #0x4 // =4
- ldr q0, [x1], #0x20
- ldur q1, [x1, #-0x10]
- ldr q5, [x0]
- ldr q13, [x0, #0x40]
- ldr q3, [x0, #0x80]
- ldr q22, [x0, #0xc0]
- ldr q24, [x0, #0x100]
- ldr q11, [x0, #0x1c0]
- mul v23.8h, v24.8h, v0.h[0]
- ldr q2, [x0, #0x140]
- mul v17.8h, v11.8h, v0.h[0]
- ldr q19, [x0, #0x180]
- sub x4, x4, #0x1
+ .cfi_startproc
+ sub sp, sp, #0x40
+ .cfi_adjust_cfa_offset 0x40
+ stp d8, d9, [sp]
+ .cfi_rel_offset d8, 0x0
+ .cfi_rel_offset d9, 0x8
+ stp d10, d11, [sp, #0x10]
+ .cfi_rel_offset d10, 0x10
+ .cfi_rel_offset d11, 0x18
+ stp d12, d13, [sp, #0x20]
+ .cfi_rel_offset d12, 0x20
+ .cfi_rel_offset d13, 0x28
+ stp d14, d15, [sp, #0x30]
+ .cfi_rel_offset d14, 0x30
+ .cfi_rel_offset d15, 0x38
+ mov w5, #0xd01 // =3329
+ mov v7.h[0], w5
+ mov w5, #0x4ebf // =20159
+ mov v7.h[1], w5
+ mov x3, x0
+ mov x4, #0x4 // =4
+ ldr q0, [x1], #0x20
+ ldur q1, [x1, #-0x10]
+ ldr q21, [x0, #0x40]
+ ldr q5, [x0, #0x1c0]
+ ldr q30, [x0, #0x110]
+ ldr q24, [x0, #0x140]
+ ldr q12, [x0, #0x80]
+ sqrdmulh v9.8h, v5.8h, v0.h[1]
+ mul v23.8h, v5.8h, v0.h[0]
+ sqrdmulh v17.8h, v24.8h, v0.h[1]
+ ldr q13, [x0, #0xc0]
+ mls v23.8h, v9.8h, v7.h[0]
+ mul v8.8h, v24.8h, v0.h[0]
+ mls v8.8h, v17.8h, v7.h[0]
+ add v9.8h, v13.8h, v23.8h
+ sub v10.8h, v13.8h, v23.8h
+ mul v11.8h, v30.8h, v0.h[0]
+ ldr q13, [x0, #0x180]
+ sqrdmulh v28.8h, v9.8h, v0.h[3]
+ sub v29.8h, v21.8h, v8.8h
+ mul v26.8h, v9.8h, v0.h[2]
+ add v8.8h, v21.8h, v8.8h
+ mul v2.8h, v13.8h, v0.h[0]
+ mls v26.8h, v28.8h, v7.h[0]
+ mul v28.8h, v10.8h, v0.h[4]
+ sqrdmulh v23.8h, v10.8h, v0.h[5]
+ add v22.8h, v8.8h, v26.8h
+ sqrdmulh v10.8h, v13.8h, v0.h[1]
+ sqrdmulh v21.8h, v22.8h, v0.h[7]
+ ldr q13, [x0, #0x100]
+ mul v16.8h, v22.8h, v0.h[6]
+ mls v28.8h, v23.8h, v7.h[0]
+ mls v2.8h, v10.8h, v7.h[0]
+ sqrdmulh v23.8h, v13.8h, v0.h[1]
+ sub v10.8h, v29.8h, v28.8h
+ add v17.8h, v29.8h, v28.8h
+ mls v16.8h, v21.8h, v7.h[0]
+ sub v18.8h, v12.8h, v2.8h
+ ldr q29, [x0]
+ sqrdmulh v14.8h, v17.8h, v1.h[3]
+ add v22.8h, v12.8h, v2.8h
+ sqrdmulh v9.8h, v18.8h, v0.h[5]
+ mul v21.8h, v13.8h, v0.h[0]
+ ldr q13, [x0, #0x150]
+ mul v5.8h, v18.8h, v0.h[4]
+ mls v5.8h, v9.8h, v7.h[0]
+ mul v18.8h, v13.8h, v0.h[0]
+ mls v21.8h, v23.8h, v7.h[0]
+ sqrdmulh v2.8h, v13.8h, v0.h[1]
+ mul v13.8h, v17.8h, v1.h[2]
+ sub v4.8h, v29.8h, v21.8h
+ mls v13.8h, v14.8h, v7.h[0]
+ add v25.8h, v29.8h, v21.8h
+ add v6.8h, v4.8h, v5.8h
+ sqrdmulh v15.8h, v22.8h, v0.h[3]
+ sub v21.8h, v4.8h, v5.8h
+ sub v5.8h, v8.8h, v26.8h
+ mul v23.8h, v22.8h, v0.h[2]
+ add v28.8h, v6.8h, v13.8h
+ sub v13.8h, v6.8h, v13.8h
+ mul v4.8h, v5.8h, v1.h[0]
+ sub x4, x4, #0x2
-ntt_layer123_start:
- sqrdmulh v8.8h, v24.8h, v0.h[1]
- sqrdmulh v24.8h, v2.8h, v0.h[1]
- mul v2.8h, v2.8h, v0.h[0]
- sqrdmulh v14.8h, v19.8h, v0.h[1]
- mls v23.8h, v8.8h, v7.h[0]
- mul v8.8h, v19.8h, v0.h[0]
- mls v2.8h, v24.8h, v7.h[0]
- sqrdmulh v24.8h, v11.8h, v0.h[1]
- sub v11.8h, v5.8h, v23.8h
- mls v8.8h, v14.8h, v7.h[0]
- sub v14.8h, v13.8h, v2.8h
- add v2.8h, v13.8h, v2.8h
- add v23.8h, v5.8h, v23.8h
- sub v19.8h, v3.8h, v8.8h
- add v8.8h, v3.8h, v8.8h
- mls v17.8h, v24.8h, v7.h[0]
- sqrdmulh v24.8h, v19.8h, v0.h[5]
- mul v19.8h, v19.8h, v0.h[4]
- sqrdmulh v5.8h, v8.8h, v0.h[3]
- sub v13.8h, v22.8h, v17.8h
- add v17.8h, v22.8h, v17.8h
- mls v19.8h, v24.8h, v7.h[0]
- sqrdmulh v24.8h, v13.8h, v0.h[5]
- mul v13.8h, v13.8h, v0.h[4]
- mul v8.8h, v8.8h, v0.h[2]
- sub v3.8h, v11.8h, v19.8h
- add v11.8h, v11.8h, v19.8h
- mls v13.8h, v24.8h, v7.h[0]
- sqrdmulh v24.8h, v17.8h, v0.h[3]
- mul v19.8h, v17.8h, v0.h[2]
- mls v8.8h, v5.8h, v7.h[0]
- sub v17.8h, v14.8h, v13.8h
- add v14.8h, v14.8h, v13.8h
- mls v19.8h, v24.8h, v7.h[0]
- sub v24.8h, v23.8h, v8.8h
- add v8.8h, v23.8h, v8.8h
- sqrdmulh v23.8h, v14.8h, v1.h[3]
- sub v5.8h, v2.8h, v19.8h
- add v2.8h, v2.8h, v19.8h
- mul v14.8h, v14.8h, v1.h[2]
- sqrdmulh v19.8h, v5.8h, v1.h[1]
- sqrdmulh v13.8h, v2.8h, v0.h[7]
- mul v2.8h, v2.8h, v0.h[6]
- mul v5.8h, v5.8h, v1.h[0]
- mls v14.8h, v23.8h, v7.h[0]
- sqrdmulh v23.8h, v17.8h, v1.h[5]
- mls v2.8h, v13.8h, v7.h[0]
- mls v5.8h, v19.8h, v7.h[0]
- sub v19.8h, v11.8h, v14.8h
- add v14.8h, v11.8h, v14.8h
- sub v11.8h, v8.8h, v2.8h
- mul v17.8h, v17.8h, v1.h[4]
- add v8.8h, v8.8h, v2.8h
- sub v2.8h, v24.8h, v5.8h
- add v24.8h, v24.8h, v5.8h
- mls v17.8h, v23.8h, v7.h[0]
- str q8, [x0], #0x10
- ldr q5, [x0]
- sub v8.8h, v3.8h, v17.8h
- add v23.8h, v3.8h, v17.8h
- str q11, [x0, #0x30]
- ldr q13, [x0, #0x40]
- str q24, [x0, #0x70]
- ldr q3, [x0, #0x80]
- str q2, [x0, #0xb0]
- ldr q22, [x0, #0xc0]
- str q14, [x0, #0xf0]
- ldr q24, [x0, #0x100]
- str q19, [x0, #0x130]
- ldr q2, [x0, #0x140]
- str q23, [x0, #0x170]
- mul v23.8h, v24.8h, v0.h[0]
- str q8, [x0, #0x1b0]
- ldr q11, [x0, #0x1c0]
- ldr q19, [x0, #0x180]
- mul v17.8h, v11.8h, v0.h[0]
- subs x4, x4, #0x1
- cbnz x4, ntt_layer123_start
- sqrdmulh v6.8h, v11.8h, v0.h[1]
- mul v25.8h, v19.8h, v0.h[0]
- sqrdmulh v12.8h, v19.8h, v0.h[1]
- mul v11.8h, v2.8h, v0.h[0]
- mls v17.8h, v6.8h, v7.h[0]
- sqrdmulh v14.8h, v2.8h, v0.h[1]
- mls v25.8h, v12.8h, v7.h[0]
- sqrdmulh v27.8h, v24.8h, v0.h[1]
- add v9.8h, v22.8h, v17.8h
- mls v11.8h, v14.8h, v7.h[0]
- sub v26.8h, v3.8h, v25.8h
- sqrdmulh v2.8h, v9.8h, v0.h[3]
- mul v24.8h, v9.8h, v0.h[2]
- mul v19.8h, v26.8h, v0.h[4]
- sqrdmulh v14.8h, v26.8h, v0.h[5]
- mls v23.8h, v27.8h, v7.h[0]
- mls v24.8h, v2.8h, v7.h[0]
- add v6.8h, v13.8h, v11.8h
- mls v19.8h, v14.8h, v7.h[0]
- sub v4.8h, v5.8h, v23.8h
- add v10.8h, v3.8h, v25.8h
- sub v8.8h, v6.8h, v24.8h
- add v3.8h, v4.8h, v19.8h
- sub v31.8h, v4.8h, v19.8h
- mul v14.8h, v8.8h, v1.h[0]
- sqrdmulh v4.8h, v10.8h, v0.h[3]
- mul v12.8h, v10.8h, v0.h[2]
- sqrdmulh v2.8h, v8.8h, v1.h[1]
- sub v8.8h, v22.8h, v17.8h
- add v30.8h, v5.8h, v23.8h
- mls v12.8h, v4.8h, v7.h[0]
- sqrdmulh v4.8h, v8.8h, v0.h[5]
- mul v19.8h, v8.8h, v0.h[4]
- mls v14.8h, v2.8h, v7.h[0]
- sub v27.8h, v30.8h, v12.8h
- sub v23.8h, v13.8h, v11.8h
- mls v19.8h, v4.8h, v7.h[0]
- sub v2.8h, v27.8h, v14.8h
- add v8.8h, v27.8h, v14.8h
- add v14.8h, v6.8h, v24.8h
- str q2, [x0, #0xc0]
- add v2.8h, v23.8h, v19.8h
- str q8, [x0, #0x80]
- sub v19.8h, v23.8h, v19.8h
- sqrdmulh v13.8h, v2.8h, v1.h[3]
- mul v17.8h, v2.8h, v1.h[2]
- add v27.8h, v30.8h, v12.8h
- sqrdmulh v24.8h, v19.8h, v1.h[5]
- mul v19.8h, v19.8h, v1.h[4]
- mls v17.8h, v13.8h, v7.h[0]
- sqrdmulh v8.8h, v14.8h, v0.h[7]
- mul v2.8h, v14.8h, v0.h[6]
- mls v19.8h, v24.8h, v7.h[0]
- add v26.8h, v3.8h, v17.8h
- sub v14.8h, v3.8h, v17.8h
- mls v2.8h, v8.8h, v7.h[0]
- str q26, [x0, #0x100]
- add v8.8h, v31.8h, v19.8h
- str q14, [x0, #0x140]
- sub v24.8h, v31.8h, v19.8h
- str q8, [x0, #0x180]
- add v18.8h, v27.8h, v2.8h
- str q24, [x0, #0x1c0]
- sub v14.8h, v27.8h, v2.8h
- str q18, [x0], #0x10
- str q14, [x0, #0x30]
- mov x0, x3
- mov x4, #0x8 // =8
- ldr q11, [x1], #0x10
- ldr q24, [x0, #0x30]
- ldr q8, [x0, #0x20]
- sqrdmulh v14.8h, v24.8h, v11.h[1]
- mul v2.8h, v24.8h, v11.h[0]
- sqrdmulh v9.8h, v8.8h, v11.h[1]
- ldr q24, [x0, #0x10]
- mls v2.8h, v14.8h, v7.h[0]
- mul v14.8h, v8.8h, v11.h[0]
- ldr q6, [x2, #0x40]
- sub v8.8h, v24.8h, v2.8h
- mls v14.8h, v9.8h, v7.h[0]
- add v2.8h, v24.8h, v2.8h
- mul v27.8h, v8.8h, v11.h[4]
- sqrdmulh v8.8h, v8.8h, v11.h[5]
- mul v24.8h, v2.8h, v11.h[2]
- sqrdmulh v11.8h, v2.8h, v11.h[3]
- mls v27.8h, v8.8h, v7.h[0]
- ldr q5, [x2, #0x50]
- sub x4, x4, #0x1
+Lntt_layer123_start:
+ mls v23.8h, v15.8h, v7.h[0]
+ ldr q6, [x0, #0x190]
+ ldr q15, [x0, #0x90]
+ ldr q19, [x0, #0x10]
+ mul v22.8h, v10.8h, v1.h[4]
+ ldr q24, [x0, #0x50]
+ str q13, [x0, #0x140]
+ sqrdmulh v13.8h, v6.8h, v0.h[1]
+ sub v20.8h, v25.8h, v23.8h
+ sqrdmulh v3.8h, v30.8h, v0.h[1]
+ str q28, [x0, #0x100]
+ ldr q30, [x0, #0x120]
+ mul v8.8h, v6.8h, v0.h[0]
+ sqrdmulh v27.8h, v10.8h, v1.h[5]
+ mls v11.8h, v3.8h, v7.h[0]
+ mls v18.8h, v2.8h, v7.h[0]
+ ldr q31, [x0, #0x160]
+ sqrdmulh v10.8h, v5.8h, v1.h[1]
+ mls v8.8h, v13.8h, v7.h[0]
+ ldr q13, [x0, #0x1d0]
+ sub v14.8h, v24.8h, v18.8h
+ add v9.8h, v24.8h, v18.8h
+ sqrdmulh v2.8h, v31.8h, v0.h[1]
+ mls v4.8h, v10.8h, v7.h[0]
+ add v10.8h, v25.8h, v23.8h
+ sub v24.8h, v19.8h, v11.8h
+ add v25.8h, v19.8h, v11.8h
+ sqrdmulh v28.8h, v13.8h, v0.h[1]
+ mul v11.8h, v30.8h, v0.h[0]
+ mul v17.8h, v13.8h, v0.h[0]
+ sub v13.8h, v10.8h, v16.8h
+ sub v6.8h, v15.8h, v8.8h
+ mls v17.8h, v28.8h, v7.h[0]
+ str q13, [x0, #0x40]
+ mls v22.8h, v27.8h, v7.h[0]
+ ldr q13, [x0, #0xd0]
+ add v26.8h, v20.8h, v4.8h
+ mul v18.8h, v31.8h, v0.h[0]
+ add v27.8h, v10.8h, v16.8h
+ str q26, [x0, #0x80]
+ sqrdmulh v31.8h, v6.8h, v0.h[5]
+ add v3.8h, v21.8h, v22.8h
+ str q27, [x0], #0x10
+ mul v26.8h, v6.8h, v0.h[4]
+ add v6.8h, v13.8h, v17.8h
+ sub v5.8h, v13.8h, v17.8h
+ str q3, [x0, #0x170]
+ sub v17.8h, v21.8h, v22.8h
+ sqrdmulh v10.8h, v6.8h, v0.h[3]
+ sub v13.8h, v20.8h, v4.8h
+ add v20.8h, v15.8h, v8.8h
+ sqrdmulh v12.8h, v5.8h, v0.h[5]
+ str q13, [x0, #0xb0]
+ mul v8.8h, v6.8h, v0.h[2]
+ str q17, [x0, #0x1b0]
+ mls v8.8h, v10.8h, v7.h[0]
+ mul v29.8h, v5.8h, v0.h[4]
+ mls v29.8h, v12.8h, v7.h[0]
+ sub v5.8h, v9.8h, v8.8h
+ add v3.8h, v9.8h, v8.8h
+ sqrdmulh v15.8h, v20.8h, v0.h[3]
+ mul v4.8h, v5.8h, v1.h[0]
+ add v6.8h, v14.8h, v29.8h
+ sqrdmulh v9.8h, v3.8h, v0.h[7]
+ sqrdmulh v12.8h, v6.8h, v1.h[3]
+ sub v10.8h, v14.8h, v29.8h
+ mul v23.8h, v6.8h, v1.h[2]
+ mls v26.8h, v31.8h, v7.h[0]
+ mls v23.8h, v12.8h, v7.h[0]
+ mul v16.8h, v3.8h, v0.h[6]
+ add v13.8h, v24.8h, v26.8h
+ sub v21.8h, v24.8h, v26.8h
+ mls v16.8h, v9.8h, v7.h[0]
+ add v28.8h, v13.8h, v23.8h
+ sub v13.8h, v13.8h, v23.8h
+ mul v23.8h, v20.8h, v0.h[2]
+ subs x4, x4, #0x1
+ cbnz x4, Lntt_layer123_start
+ sqrdmulh v3.8h, v5.8h, v1.h[1]
+ mls v23.8h, v15.8h, v7.h[0]
+ ldr q5, [x0, #0x190]
+ mul v29.8h, v10.8h, v1.h[4]
+ mls v4.8h, v3.8h, v7.h[0]
+ sub v19.8h, v25.8h, v23.8h
+ sqrdmulh v31.8h, v5.8h, v0.h[1]
+ sqrdmulh v6.8h, v30.8h, v0.h[1]
+ sub v3.8h, v19.8h, v4.8h
+ mul v5.8h, v5.8h, v0.h[0]
+ str q3, [x0, #0xc0]
+ sqrdmulh v12.8h, v10.8h, v1.h[5]
+ mls v18.8h, v2.8h, v7.h[0]
+ ldr q3, [x0, #0x1d0]
+ mls v5.8h, v31.8h, v7.h[0]
+ sqrdmulh v10.8h, v3.8h, v0.h[1]
+ mls v11.8h, v6.8h, v7.h[0]
+ ldr q31, [x0, #0x90]
+ mul v30.8h, v3.8h, v0.h[0]
+ mls v30.8h, v10.8h, v7.h[0]
+ sub v10.8h, v31.8h, v5.8h
+ mls v29.8h, v12.8h, v7.h[0]
+ ldr q6, [x0, #0xd0]
+ sqrdmulh v15.8h, v10.8h, v0.h[5]
+ mul v17.8h, v10.8h, v0.h[4]
+ add v10.8h, v6.8h, v30.8h
+ sub v6.8h, v6.8h, v30.8h
+ sqrdmulh v12.8h, v10.8h, v0.h[3]
+ sub v27.8h, v21.8h, v29.8h
+ sqrdmulh v3.8h, v6.8h, v0.h[5]
+ mul v10.8h, v10.8h, v0.h[2]
+ ldr q20, [x0, #0x50]
+ mls v10.8h, v12.8h, v7.h[0]
+ mul v2.8h, v6.8h, v0.h[4]
+ add v6.8h, v20.8h, v18.8h
+ add v5.8h, v31.8h, v5.8h
+ mls v2.8h, v3.8h, v7.h[0]
+ sub v31.8h, v6.8h, v10.8h
+ sqrdmulh v12.8h, v5.8h, v0.h[3]
+ sub v22.8h, v20.8h, v18.8h
+ add v6.8h, v6.8h, v10.8h
+ mul v20.8h, v31.8h, v1.h[0]
+ add v30.8h, v22.8h, v2.8h
+ sqrdmulh v3.8h, v6.8h, v0.h[7]
+ sqrdmulh v10.8h, v30.8h, v1.h[3]
+ mul v9.8h, v30.8h, v1.h[2]
+ ldr q30, [x0, #0x10]
+ mls v17.8h, v15.8h, v7.h[0]
+ mls v9.8h, v10.8h, v7.h[0]
+ mul v15.8h, v6.8h, v0.h[6]
+ add v24.8h, v30.8h, v11.8h
+ sub v10.8h, v22.8h, v2.8h
+ mls v15.8h, v3.8h, v7.h[0]
+ add v6.8h, v19.8h, v4.8h
+ add v22.8h, v25.8h, v23.8h
+ sqrdmulh v3.8h, v10.8h, v1.h[5]
+ str q13, [x0, #0x140]
+ sub v19.8h, v30.8h, v11.8h
+ add v25.8h, v22.8h, v16.8h
+ mul v5.8h, v5.8h, v0.h[2]
+ sub v13.8h, v22.8h, v16.8h
+ str q28, [x0, #0x100]
+ mls v5.8h, v12.8h, v7.h[0]
+ str q13, [x0, #0x40]
+ str q6, [x0, #0x80]
+ add v21.8h, v21.8h, v29.8h
+ sqrdmulh v13.8h, v31.8h, v1.h[1]
+ str q25, [x0], #0x10
+ add v12.8h, v19.8h, v17.8h
+ sub v31.8h, v19.8h, v17.8h
+ mul v30.8h, v10.8h, v1.h[4]
+ str q21, [x0, #0x170]
+ add v21.8h, v24.8h, v5.8h
+ add v6.8h, v12.8h, v9.8h
+ mls v30.8h, v3.8h, v7.h[0]
+ str q27, [x0, #0x1b0]
+ sub v10.8h, v21.8h, v15.8h
+ sub v12.8h, v12.8h, v9.8h
+ mls v20.8h, v13.8h, v7.h[0]
+ str q6, [x0, #0x100]
+ str q10, [x0, #0x40]
+ sub v13.8h, v24.8h, v5.8h
+ add v3.8h, v21.8h, v15.8h
+ str q12, [x0, #0x140]
+ sub v10.8h, v31.8h, v30.8h
+ add v21.8h, v31.8h, v30.8h
+ str q3, [x0], #0x10
+ add v12.8h, v13.8h, v20.8h
+ sub v13.8h, v13.8h, v20.8h
+ str q21, [x0, #0x170]
+ str q10, [x0, #0x1b0]
+ str q12, [x0, #0x70]
+ str q13, [x0, #0xb0]
+ mov x0, x3
+ mov x4, #0x8 // =8
+ ldr q2, [x0, #0x20]
+ ldr q13, [x1], #0x10
+ ldr q30, [x0, #0x30]
+ ldr q25, [x2, #0x40]
+ ldr q5, [x0]
+ ldr q18, [x0, #0x60]
+ ldr q12, [x0, #0x70]
+ sqrdmulh v17.8h, v2.8h, v13.h[1]
+ ldr q4, [x1], #0x10
+ ldr q23, [x0, #0x10]
+ sqrdmulh v21.8h, v30.8h, v13.h[1]
+ ldr q24, [x2, #0x20]
+ ldr q9, [x2], #0x60
+ mul v10.8h, v30.8h, v13.h[0]
+ mul v11.8h, v2.8h, v13.h[0]
+ mls v10.8h, v21.8h, v7.h[0]
+ sqrdmulh v29.8h, v12.8h, v4.h[1]
+ mul v1.8h, v12.8h, v4.h[0]
+ add v21.8h, v23.8h, v10.8h
+ sub v10.8h, v23.8h, v10.8h
+ mul v8.8h, v18.8h, v4.h[0]
+ sqrdmulh v23.8h, v21.8h, v13.h[3]
+ mul v2.8h, v21.8h, v13.h[2]
+ mls v1.8h, v29.8h, v7.h[0]
+ mls v2.8h, v23.8h, v7.h[0]
+ ldur q15, [x2, #-0x50]
+ sqrdmulh v0.8h, v10.8h, v13.h[5]
+ mls v11.8h, v17.8h, v7.h[0]
+ ldr q29, [x0, #0x50]
+ mul v23.8h, v10.8h, v13.h[4]
+ mls v23.8h, v0.8h, v7.h[0]
+ sub v16.8h, v29.8h, v1.8h
+ add v3.8h, v5.8h, v11.8h
+ sub v31.8h, v5.8h, v11.8h
+ sqrdmulh v22.8h, v16.8h, v4.h[5]
+ add v30.8h, v3.8h, v2.8h
+ sub v0.8h, v3.8h, v2.8h
+ sqrdmulh v28.8h, v18.8h, v4.h[1]
+ add v21.8h, v31.8h, v23.8h
+ sub v19.8h, v31.8h, v23.8h
+ mul v26.8h, v16.8h, v4.h[4]
+ trn2 v3.4s, v30.4s, v0.4s
+ ldur q23, [x2, #-0x10]
+ trn2 v18.4s, v21.4s, v19.4s
+ mls v26.8h, v22.8h, v7.h[0]
+ trn1 v13.4s, v30.4s, v0.4s
+ mls v8.8h, v28.8h, v7.h[0]
+ trn2 v31.2d, v3.2d, v18.2d
+ trn1 v11.4s, v21.4s, v19.4s
+ add v27.8h, v29.8h, v1.8h
+ sqrdmulh v6.8h, v31.8h, v15.8h
+ trn1 v2.2d, v13.2d, v11.2d
+ trn2 v13.2d, v13.2d, v11.2d
+ mul v1.8h, v31.8h, v9.8h
+ ldr q11, [x0, #0x40]
+ sqrdmulh v29.8h, v13.8h, v15.8h
+ mls v1.8h, v6.8h, v7.h[0]
+ trn1 v6.2d, v3.2d, v18.2d
+ mul v17.8h, v13.8h, v9.8h
+ sub v13.8h, v11.8h, v8.8h
+ sqrdmulh v10.8h, v27.8h, v4.h[3]
+ sub v12.8h, v13.8h, v26.8h
+ sub v18.8h, v6.8h, v1.8h
+ mls v17.8h, v29.8h, v7.h[0]
+ add v30.8h, v6.8h, v1.8h
+ add v6.8h, v13.8h, v26.8h
+ ldur q13, [x2, #-0x30]
+ sqrdmulh v16.8h, v18.8h, v23.8h
+ trn1 v28.4s, v6.4s, v12.4s
+ mul v23.8h, v18.8h, v25.8h
+ ldr q25, [x2, #0x10]
+ add v20.8h, v2.8h, v17.8h
+ mul v0.8h, v30.8h, v24.8h
+ sqrdmulh v29.8h, v30.8h, v13.8h
+ sub v30.8h, v2.8h, v17.8h
+ mls v23.8h, v16.8h, v7.h[0]
+ sub x4, x4, #0x2
-ntt_layer4567_start:
- ldr q8, [x0]
- ldr q17, [x2, #0x10]
- sub v1.8h, v8.8h, v14.8h
- mls v24.8h, v11.8h, v7.h[0]
- add v8.8h, v8.8h, v14.8h
- sub v0.8h, v1.8h, v27.8h
- add v12.8h, v1.8h, v27.8h
- sub v19.8h, v8.8h, v24.8h
- add v8.8h, v8.8h, v24.8h
- trn1 v24.4s, v12.4s, v0.4s
- trn2 v13.4s, v12.4s, v0.4s
- trn1 v23.4s, v8.4s, v19.4s
- ldr q2, [x2], #0x60
- trn2 v9.2d, v23.2d, v24.2d
- trn2 v8.4s, v8.4s, v19.4s
- sqrdmulh v26.8h, v9.8h, v17.8h
- trn1 v24.2d, v23.2d, v24.2d
- trn2 v11.2d, v8.2d, v13.2d
- trn1 v29.2d, v8.2d, v13.2d
- sqrdmulh v23.8h, v11.8h, v17.8h
- mul v10.8h, v11.8h, v2.8h
- mul v0.8h, v9.8h, v2.8h
- ldur q11, [x2, #-0x40]
- mls v10.8h, v23.8h, v7.h[0]
- mls v0.8h, v26.8h, v7.h[0]
- ldur q19, [x2, #-0x30]
- add v17.8h, v29.8h, v10.8h
- sub v23.8h, v24.8h, v0.8h
- sub v30.8h, v29.8h, v10.8h
- mul v2.8h, v17.8h, v11.8h
- sqrdmulh v11.8h, v17.8h, v19.8h
- mul v8.8h, v30.8h, v6.8h
- ldr q22, [x0, #0x70]
- mls v2.8h, v11.8h, v7.h[0]
- add v24.8h, v24.8h, v0.8h
- ldr q15, [x1], #0x10
- sub v14.8h, v24.8h, v2.8h
- add v24.8h, v24.8h, v2.8h
- sqrdmulh v1.8h, v22.8h, v15.h[1]
- mul v2.8h, v22.8h, v15.h[0]
- trn1 v0.4s, v24.4s, v14.4s
- trn2 v24.4s, v24.4s, v14.4s
- sqrdmulh v19.8h, v30.8h, v5.8h
- mls v2.8h, v1.8h, v7.h[0]
- ldr q16, [x0, #0x60]
- mls v8.8h, v19.8h, v7.h[0]
- ldr q6, [x2, #0x40]
- mul v14.8h, v16.8h, v15.h[0]
- sub v3.8h, v23.8h, v8.8h
- add v8.8h, v23.8h, v8.8h
- ldr q5, [x2, #0x50]
- trn2 v23.4s, v8.4s, v3.4s
- trn1 v31.4s, v8.4s, v3.4s
- sqrdmulh v8.8h, v16.8h, v15.h[1]
- trn2 v25.2d, v24.2d, v23.2d
- trn1 v29.2d, v24.2d, v23.2d
- ldr q24, [x0, #0x50]
- trn1 v16.2d, v0.2d, v31.2d
- mls v14.8h, v8.8h, v7.h[0]
- sub v13.8h, v24.8h, v2.8h
- add v24.8h, v24.8h, v2.8h
- trn2 v2.2d, v0.2d, v31.2d
- sqrdmulh v19.8h, v13.8h, v15.h[5]
- str q2, [x0, #0x20]
- sqrdmulh v11.8h, v24.8h, v15.h[3]
- str q16, [x0], #0x40
- mul v27.8h, v13.8h, v15.h[4]
- stur q29, [x0, #-0x30]
- mul v24.8h, v24.8h, v15.h[2]
- stur q25, [x0, #-0x10]
- mls v27.8h, v19.8h, v7.h[0]
- subs x4, x4, #0x1
- cbnz x4, ntt_layer4567_start
- ldr q23, [x0]
- ldr q17, [x2], #0x60
- sub v19.8h, v23.8h, v14.8h
- mls v24.8h, v11.8h, v7.h[0]
- add v14.8h, v23.8h, v14.8h
- add v8.8h, v19.8h, v27.8h
- sub v13.8h, v19.8h, v27.8h
- add v12.8h, v14.8h, v24.8h
- sub v24.8h, v14.8h, v24.8h
- trn1 v0.4s, v8.4s, v13.4s
- trn2 v23.4s, v8.4s, v13.4s
- trn2 v19.4s, v12.4s, v24.4s
- ldur q27, [x2, #-0x50]
- trn2 v8.2d, v19.2d, v23.2d
- trn1 v22.4s, v12.4s, v24.4s
- mul v14.8h, v8.8h, v17.8h
- sqrdmulh v24.8h, v8.8h, v27.8h
- trn2 v2.2d, v22.2d, v0.2d
- trn1 v8.2d, v19.2d, v23.2d
- mul v11.8h, v2.8h, v17.8h
- mls v14.8h, v24.8h, v7.h[0]
- ldur q26, [x2, #-0x30]
- sqrdmulh v23.8h, v2.8h, v27.8h
- sub v24.8h, v8.8h, v14.8h
- ldur q2, [x2, #-0x40]
- sqrdmulh v19.8h, v24.8h, v5.8h
- add v14.8h, v8.8h, v14.8h
- mul v24.8h, v24.8h, v6.8h
- mls v11.8h, v23.8h, v7.h[0]
- sqrdmulh v8.8h, v14.8h, v26.8h
- mul v2.8h, v14.8h, v2.8h
- trn1 v14.2d, v22.2d, v0.2d
- mls v24.8h, v19.8h, v7.h[0]
- sub v23.8h, v14.8h, v11.8h
- mls v2.8h, v8.8h, v7.h[0]
- add v14.8h, v14.8h, v11.8h
- add v8.8h, v23.8h, v24.8h
- sub v24.8h, v23.8h, v24.8h
- sub v19.8h, v14.8h, v2.8h
- add v11.8h, v14.8h, v2.8h
- trn1 v2.4s, v8.4s, v24.4s
- trn2 v14.4s, v8.4s, v24.4s
- trn2 v23.4s, v11.4s, v19.4s
- trn1 v11.4s, v11.4s, v19.4s
- trn2 v8.2d, v23.2d, v14.2d
- trn1 v24.2d, v11.2d, v2.2d
- str q8, [x0, #0x30]
- trn2 v8.2d, v11.2d, v2.2d
- str q24, [x0], #0x40
- trn1 v24.2d, v23.2d, v14.2d
- stur q8, [x0, #-0x20]
- stur q24, [x0, #-0x30]
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
- ldp d12, d13, [sp, #0x20]
- ldp d14, d15, [sp, #0x30]
- add sp, sp, #0x40
+Lntt_layer4567_start:
+ ldr q19, [x2, #0x50]
+ sub v31.8h, v30.8h, v23.8h
+ mls v0.8h, v29.8h, v7.h[0]
+ add v16.8h, v11.8h, v8.8h
+ ldr q18, [x0, #0xa0]
+ trn2 v14.4s, v6.4s, v12.4s
+ mul v26.8h, v27.8h, v4.h[2]
+ ldr q4, [x1], #0x10
+ ldr q24, [x2, #0x40]
+ ldr q21, [x0, #0xb0]
+ mls v26.8h, v10.8h, v7.h[0]
+ add v23.8h, v30.8h, v23.8h
+ sub v15.8h, v20.8h, v0.8h
+ ldr q9, [x0, #0x90]
+ add v10.8h, v20.8h, v0.8h
+ mul v8.8h, v18.8h, v4.h[0]
+ ldr q1, [x2], #0x60
+ trn1 v27.4s, v23.4s, v31.4s
+ sqrdmulh v12.8h, v18.8h, v4.h[1]
+ trn1 v5.4s, v10.4s, v15.4s
+ sub v30.8h, v16.8h, v26.8h
+ trn2 v13.2d, v5.2d, v27.2d
+ sqrdmulh v2.8h, v21.8h, v4.h[1]
+ add v29.8h, v16.8h, v26.8h
+ mul v0.8h, v21.8h, v4.h[0]
+ str q13, [x0, #0x20]
+ trn1 v11.4s, v29.4s, v30.4s
+ mls v8.8h, v12.8h, v7.h[0]
+ trn2 v26.4s, v29.4s, v30.4s
+ trn2 v6.2d, v11.2d, v28.2d
+ mls v0.8h, v2.8h, v7.h[0]
+ trn2 v16.2d, v26.2d, v14.2d
+ trn1 v26.2d, v26.2d, v14.2d
+ trn1 v20.2d, v5.2d, v27.2d
+ sqrdmulh v29.8h, v6.8h, v25.8h
+ trn2 v15.4s, v10.4s, v15.4s
+ sqrdmulh v13.8h, v16.8h, v25.8h
+ str q20, [x0], #0x40
+ sub v30.8h, v9.8h, v0.8h
+ add v27.8h, v9.8h, v0.8h
+ mul v17.8h, v6.8h, v1.8h
+ sqrdmulh v22.8h, v30.8h, v4.h[5]
+ mul v18.8h, v16.8h, v1.8h
+ mls v18.8h, v13.8h, v7.h[0]
+ mul v2.8h, v30.8h, v4.h[4]
+ mls v2.8h, v22.8h, v7.h[0]
+ trn2 v22.4s, v23.4s, v31.4s
+ sub v3.8h, v26.8h, v18.8h
+ ldur q25, [x2, #-0x30]
+ mls v17.8h, v29.8h, v7.h[0]
+ trn2 v31.2d, v15.2d, v22.2d
+ trn1 v20.2d, v15.2d, v22.2d
+ add v16.8h, v26.8h, v18.8h
+ sqrdmulh v26.8h, v3.8h, v19.8h
+ trn1 v21.2d, v11.2d, v28.2d
+ ldr q11, [x0, #0x40]
+ sqrdmulh v29.8h, v16.8h, v25.8h
+ stur q20, [x0, #-0x30]
+ add v20.8h, v21.8h, v17.8h
+ stur q31, [x0, #-0x10]
+ mul v23.8h, v3.8h, v24.8h
+ ldr q25, [x2, #0x10]
+ sub v13.8h, v11.8h, v8.8h
+ mls v23.8h, v26.8h, v7.h[0]
+ ldur q1, [x2, #-0x40]
+ sub v12.8h, v13.8h, v2.8h
+ add v6.8h, v13.8h, v2.8h
+ sqrdmulh v10.8h, v27.8h, v4.h[3]
+ sub v30.8h, v21.8h, v17.8h
+ mul v0.8h, v16.8h, v1.8h
+ trn1 v28.4s, v6.4s, v12.4s
+ subs x4, x4, #0x1
+ cbnz x4, Lntt_layer4567_start
+ add v22.8h, v11.8h, v8.8h
+ mul v27.8h, v27.8h, v4.h[2]
+ trn2 v17.4s, v6.4s, v12.4s
+ ldr q15, [x2], #0x60
+ mls v27.8h, v10.8h, v7.h[0]
+ add v4.8h, v30.8h, v23.8h
+ sub v18.8h, v30.8h, v23.8h
+ ldur q6, [x2, #-0x30]
+ mls v0.8h, v29.8h, v7.h[0]
+ ldur q12, [x2, #-0x40]
+ ldur q24, [x2, #-0x20]
+ ldur q2, [x2, #-0x10]
+ trn1 v9.4s, v4.4s, v18.4s
+ add v10.8h, v22.8h, v27.8h
+ sub v13.8h, v22.8h, v27.8h
+ sub v1.8h, v20.8h, v0.8h
+ trn2 v21.4s, v10.4s, v13.4s
+ add v27.8h, v20.8h, v0.8h
+ trn2 v3.2d, v21.2d, v17.2d
+ trn1 v13.4s, v10.4s, v13.4s
+ trn1 v31.4s, v27.4s, v1.4s
+ sqrdmulh v10.8h, v3.8h, v25.8h
+ trn2 v5.2d, v13.2d, v28.2d
+ trn1 v13.2d, v13.2d, v28.2d
+ trn1 v21.2d, v21.2d, v17.2d
+ sqrdmulh v17.8h, v5.8h, v25.8h
+ trn2 v30.2d, v31.2d, v9.2d
+ mul v25.8h, v3.8h, v15.8h
+ str q30, [x0, #0x20]
+ trn2 v30.4s, v4.4s, v18.4s
+ mls v25.8h, v10.8h, v7.h[0]
+ trn2 v3.4s, v27.4s, v1.4s
+ mul v20.8h, v5.8h, v15.8h
+ trn2 v10.2d, v3.2d, v30.2d
+ mls v20.8h, v17.8h, v7.h[0]
+ str q10, [x0, #0x30]
+ sub v18.8h, v21.8h, v25.8h
+ add v10.8h, v21.8h, v25.8h
+ trn1 v3.2d, v3.2d, v30.2d
+ sqrdmulh v30.8h, v18.8h, v2.8h
+ mul v12.8h, v10.8h, v12.8h
+ sqrdmulh v6.8h, v10.8h, v6.8h
+ str q3, [x0, #0x10]
+ add v21.8h, v13.8h, v20.8h
+ mul v10.8h, v18.8h, v24.8h
+ sub v13.8h, v13.8h, v20.8h
+ mls v10.8h, v30.8h, v7.h[0]
+ mls v12.8h, v6.8h, v7.h[0]
+ trn1 v30.2d, v31.2d, v9.2d
+ sub v3.8h, v13.8h, v10.8h
+ add v6.8h, v13.8h, v10.8h
+ add v10.8h, v21.8h, v12.8h
+ sub v21.8h, v21.8h, v12.8h
+ trn2 v13.4s, v6.4s, v3.4s
+ trn1 v12.4s, v10.4s, v21.4s
+ trn2 v21.4s, v10.4s, v21.4s
+ trn1 v3.4s, v6.4s, v3.4s
+ str q30, [x0], #0x40
+ trn2 v10.2d, v21.2d, v13.2d
+ trn1 v13.2d, v21.2d, v13.2d
+ trn2 v21.2d, v12.2d, v3.2d
+ trn1 v3.2d, v12.2d, v3.2d
+ str q10, [x0, #0x30]
+ str q13, [x0, #0x10]
+ str q3, [x0], #0x40
+ stur q21, [x0, #-0x20]
+ ldp d8, d9, [sp]
+ .cfi_restore d8
+ .cfi_restore d9
+ ldp d10, d11, [sp, #0x10]
+ .cfi_restore d10
+ .cfi_restore d11
+ ldp d12, d13, [sp, #0x20]
+ .cfi_restore d12
+ .cfi_restore d13
+ ldp d14, d15, [sp, #0x30]
+ .cfi_restore d14
+ .cfi_restore d15
+ add sp, sp, #0x40
+ .cfi_adjust_cfa_offset -0x40
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(ntt_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_mulcache_compute_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_mulcache_compute_asm.S
index ec7ca0c6fa..71ebbeca53 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_mulcache_compute_asm.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_mulcache_compute_asm.S
@@ -3,49 +3,125 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
+/*yaml
+ Name: poly_mulcache_compute_asm
+ Description: Compute multiplication cache for polynomial
+ Signature: void mlk_poly_mulcache_compute_asm(int16_t cache[128], const int16_t mlk_poly[256], const int16_t zetas[128], const int16_t zetas_twisted[128])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 256
+ permissions: write-only
+ c_parameter: int16_t cache[128]
+ description: Output cache
+ x1:
+ type: buffer
+ size_bytes: 512
+ permissions: read-only
+ c_parameter: const int16_t mlk_poly[256]
+ description: Input polynomial
+ x2:
+ type: buffer
+ size_bytes: 256
+ permissions: read-only
+ c_parameter: const int16_t zetas[128]
+ description: Zeta values
+ x3:
+ type: buffer
+ size_bytes: 256
+ permissions: read-only
+ c_parameter: const int16_t zetas_twisted[128]
+ description: Twisted zeta values
+ Stack:
+ bytes: 0
+*/
+
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/poly_mulcache_compute_asm.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(poly_mulcache_compute_asm)
MLK_ASM_FN_SYMBOL(poly_mulcache_compute_asm)
- mov w5, #0xd01 // =3329
- dup v6.8h, w5
- mov w5, #0x4ebf // =20159
- dup v7.8h, w5
- mov x4, #0x10 // =16
- ldr q1, [x1, #0x10]
- ldr q27, [x1], #0x20
- ldr q23, [x2], #0x10
- uzp2 v27.8h, v27.8h, v1.8h
- ldr q1, [x3], #0x10
- mul v2.8h, v27.8h, v23.8h
- sqrdmulh v27.8h, v27.8h, v1.8h
- sub x4, x4, #0x1
-
-poly_mulcache_compute_loop:
- ldr q29, [x1, #0x10]
- ldr q21, [x2], #0x10
- mls v2.8h, v27.8h, v6.h[0]
- ldr q27, [x1], #0x20
- ldr q7, [x3], #0x10
- uzp2 v28.8h, v27.8h, v29.8h
- str q2, [x0], #0x10
- mul v2.8h, v28.8h, v21.8h
- sqrdmulh v27.8h, v28.8h, v7.8h
- sub x4, x4, #0x1
- cbnz x4, poly_mulcache_compute_loop
- mls v2.8h, v27.8h, v6.h[0]
- str q2, [x0], #0x10
+ .cfi_startproc
+ mov w5, #0xd01 // =3329
+ dup v6.8h, w5
+ mov w5, #0x4ebf // =20159
+ dup v7.8h, w5
+ mov x4, #0x10 // =16
+ ldr q0, [x1], #0x20
+ ldur q2, [x1, #-0x10]
+ ldr q19, [x1], #0x20
+ ldr q29, [x3], #0x10
+ ldur q16, [x1, #-0x10]
+ ldr q18, [x2], #0x10
+ ldr q26, [x1], #0x20
+ ldr q25, [x2], #0x10
+ uzp2 v5.8h, v0.8h, v2.8h
+ ldr q28, [x3], #0x10
+ ldur q7, [x1, #-0x10]
+ ldr q2, [x1], #0x20
+ uzp2 v27.8h, v19.8h, v16.8h
+ sqrdmulh v16.8h, v5.8h, v29.8h
+ ldr q17, [x3], #0x10
+ ldr q19, [x3], #0x10
+ mul v5.8h, v5.8h, v18.8h
+ uzp2 v29.8h, v26.8h, v7.8h
+ mul v26.8h, v27.8h, v25.8h
+ sqrdmulh v4.8h, v27.8h, v28.8h
+ mls v5.8h, v16.8h, v6.h[0]
+ lsr x4, x4, #1
+ sub x4, x4, #0x2
+
+Lpoly_mulcache_compute_loop_start:
+ str q5, [x0], #0x10
+ sqrdmulh v22.8h, v29.8h, v17.8h
+ ldr q28, [x2], #0x10
+ ldur q24, [x1, #-0x10]
+ ldr q0, [x1], #0x20
+ mls v26.8h, v4.8h, v6.h[0]
+ ldur q16, [x1, #-0x10]
+ ldr q17, [x3], #0x10
+ mul v5.8h, v29.8h, v28.8h
+ uzp2 v23.8h, v2.8h, v24.8h
+ ldr q18, [x2], #0x10
+ mls v5.8h, v22.8h, v6.h[0]
+ uzp2 v29.8h, v0.8h, v16.8h
+ sqrdmulh v4.8h, v23.8h, v19.8h
+ ldr q2, [x1], #0x20
+ ldr q19, [x3], #0x10
+ str q26, [x0], #0x10
+ mul v26.8h, v23.8h, v18.8h
+ subs x4, x4, #0x1
+ cbnz x4, Lpoly_mulcache_compute_loop_start
+ mls v26.8h, v4.8h, v6.h[0]
+ str q5, [x0], #0x10
+ ldr q5, [x2], #0x10
+ ldur q4, [x1, #-0x10]
+ sqrdmulh v16.8h, v29.8h, v17.8h
+ ldr q0, [x2], #0x10
+ mul v29.8h, v29.8h, v5.8h
+ uzp2 v18.8h, v2.8h, v4.8h
+ str q26, [x0], #0x10
+ sqrdmulh v17.8h, v18.8h, v19.8h
+ mls v29.8h, v16.8h, v6.h[0]
+ mul v26.8h, v18.8h, v0.8h
+ mls v26.8h, v17.8h, v6.h[0]
+ str q29, [x0], #0x10
+ str q26, [x0], #0x10
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_mulcache_compute_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_reduce_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_reduce_asm.S
index b14447f0bc..28666853b4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_reduce_asm.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_reduce_asm.S
@@ -3,95 +3,148 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
+/*yaml
+ Name: poly_reduce_asm
+ Description: Barrett reduction of polynomial coefficients
+ Signature: void mlk_poly_reduce_asm(int16_t p[256])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: read/write
+ c_parameter: int16_t p[256]
+ description: Input/output polynomial
+ Stack:
+ bytes: 0
+*/
+
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/poly_reduce_asm.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(poly_reduce_asm)
MLK_ASM_FN_SYMBOL(poly_reduce_asm)
- mov w2, #0xd01 // =3329
- dup v3.8h, w2
- mov w2, #0x4ebf // =20159
- dup v4.8h, w2
- mov x1, #0x8 // =8
- ldr q21, [x0, #0x20]
- ldr q23, [x0, #0x30]
- sqdmulh v7.8h, v21.8h, v4.h[0]
- sqdmulh v30.8h, v23.8h, v4.h[0]
- srshr v7.8h, v7.8h, #0xb
- srshr v30.8h, v30.8h, #0xb
- mls v21.8h, v7.8h, v3.h[0]
- mls v23.8h, v30.8h, v3.h[0]
- ldr q5, [x0, #0x10]
- sshr v7.8h, v21.8h, #0xf
- sshr v30.8h, v23.8h, #0xf
- and v7.16b, v3.16b, v7.16b
- add v21.8h, v21.8h, v7.8h
- and v7.16b, v3.16b, v30.16b
- add v16.8h, v23.8h, v7.8h
- sub x1, x1, #0x1
+ .cfi_startproc
+ mov w2, #0xd01 // =3329
+ dup v3.8h, w2
+ mov w2, #0x4ebf // =20159
+ dup v4.8h, w2
+ mov x1, #0x8 // =8
+ ldr q21, [x0], #0x40
+ ldur q18, [x0, #-0x20]
+ ldur q0, [x0, #-0x30]
+ ldur q5, [x0, #-0x10]
+ ldr q26, [x0], #0x40
+ sqdmulh v17.8h, v21.8h, v4.h[0]
+ sqdmulh v27.8h, v18.8h, v4.h[0]
+ sqdmulh v22.8h, v0.8h, v4.h[0]
+ srshr v17.8h, v17.8h, #0xb
+ sqdmulh v23.8h, v5.8h, v4.h[0]
+ srshr v29.8h, v27.8h, #0xb
+ mls v21.8h, v17.8h, v3.h[0]
+ srshr v17.8h, v22.8h, #0xb
+ mls v18.8h, v29.8h, v3.h[0]
+ srshr v22.8h, v23.8h, #0xb
+ mls v0.8h, v17.8h, v3.h[0]
+ sshr v2.8h, v21.8h, #0xf
+ mls v5.8h, v22.8h, v3.h[0]
+ sshr v29.8h, v18.8h, #0xf
+ and v19.16b, v3.16b, v2.16b
+ sqdmulh v2.8h, v26.8h, v4.h[0]
+ sshr v31.8h, v0.8h, #0xf
+ add v17.8h, v21.8h, v19.8h
+ and v21.16b, v3.16b, v29.16b
+ and v31.16b, v3.16b, v31.16b
+ sub x1, x1, #0x2
-poly_reduce_loop:
- ldr q6, [x0], #0x40
- ldr q30, [x0, #0x20]
- sqdmulh v31.8h, v6.8h, v4.h[0]
- sqdmulh v29.8h, v5.8h, v4.h[0]
- sqdmulh v22.8h, v30.8h, v4.h[0]
- stur q16, [x0, #-0x10]
- srshr v20.8h, v31.8h, #0xb
- srshr v28.8h, v29.8h, #0xb
- stur q21, [x0, #-0x20]
- mls v6.8h, v20.8h, v3.h[0]
- mls v5.8h, v28.8h, v3.h[0]
- ldr q2, [x0, #0x30]
- sshr v31.8h, v6.8h, #0xf
- srshr v19.8h, v22.8h, #0xb
- and v22.16b, v3.16b, v31.16b
- add v0.8h, v6.8h, v22.8h
- mls v30.8h, v19.8h, v3.h[0]
- sshr v26.8h, v5.8h, #0xf
- sqdmulh v25.8h, v2.8h, v4.h[0]
- and v17.16b, v3.16b, v26.16b
- add v1.8h, v5.8h, v17.8h
- sshr v31.8h, v30.8h, #0xf
- srshr v25.8h, v25.8h, #0xb
- stur q1, [x0, #-0x30]
- and v18.16b, v3.16b, v31.16b
- mls v2.8h, v25.8h, v3.h[0]
- add v21.8h, v30.8h, v18.8h
- ldr q5, [x0, #0x10]
- sshr v18.8h, v2.8h, #0xf
- stur q0, [x0, #-0x40]
- and v27.16b, v3.16b, v18.16b
- add v16.8h, v2.8h, v27.8h
- sub x1, x1, #0x1
- cbnz x1, poly_reduce_loop
- sqdmulh v20.8h, v5.8h, v4.h[0]
- ldr q24, [x0], #0x40
- stur q21, [x0, #-0x20]
- srshr v20.8h, v20.8h, #0xb
- sqdmulh v25.8h, v24.8h, v4.h[0]
- stur q16, [x0, #-0x10]
- mls v5.8h, v20.8h, v3.h[0]
- srshr v20.8h, v25.8h, #0xb
- sshr v2.8h, v5.8h, #0xf
- mls v24.8h, v20.8h, v3.h[0]
- and v20.16b, v3.16b, v2.16b
- add v31.8h, v5.8h, v20.8h
- sshr v20.8h, v24.8h, #0xf
- stur q31, [x0, #-0x30]
- and v31.16b, v3.16b, v20.16b
- add v24.8h, v24.8h, v31.8h
- stur q24, [x0, #-0x40]
+Lpoly_reduce_loop_start:
+ add v21.8h, v18.8h, v21.8h
+ ldur q18, [x0, #-0x20]
+ add v25.8h, v0.8h, v31.8h
+ ldur q0, [x0, #-0x30]
+ stur q21, [x0, #-0x60]
+ sshr v28.8h, v5.8h, #0xf
+ stur q17, [x0, #-0x80]
+ srshr v23.8h, v2.8h, #0xb
+ sqdmulh v30.8h, v18.8h, v4.h[0]
+ stur q25, [x0, #-0x70]
+ and v22.16b, v3.16b, v28.16b
+ sqdmulh v7.8h, v0.8h, v4.h[0]
+ add v16.8h, v5.8h, v22.8h
+ ldur q5, [x0, #-0x10]
+ mls v26.8h, v23.8h, v3.h[0]
+ stur q16, [x0, #-0x50]
+ srshr v6.8h, v30.8h, #0xb
+ srshr v1.8h, v7.8h, #0xb
+ sqdmulh v19.8h, v5.8h, v4.h[0]
+ mls v18.8h, v6.8h, v3.h[0]
+ sshr v24.8h, v26.8h, #0xf
+ mls v0.8h, v1.8h, v3.h[0]
+ and v27.16b, v3.16b, v24.16b
+ srshr v29.8h, v19.8h, #0xb
+ add v17.8h, v26.8h, v27.8h
+ ldr q26, [x0], #0x40
+ sshr v1.8h, v18.8h, #0xf
+ mls v5.8h, v29.8h, v3.h[0]
+ sshr v20.8h, v0.8h, #0xf
+ and v21.16b, v3.16b, v1.16b
+ and v31.16b, v3.16b, v20.16b
+ sqdmulh v2.8h, v26.8h, v4.h[0]
+ subs x1, x1, #0x1
+ cbnz x1, Lpoly_reduce_loop_start
+ add v28.8h, v0.8h, v31.8h
+ ldur q29, [x0, #-0x10]
+ add v21.8h, v18.8h, v21.8h
+ srshr v18.8h, v2.8h, #0xb
+ sshr v2.8h, v5.8h, #0xf
+ ldur q16, [x0, #-0x20]
+ stur q17, [x0, #-0x80]
+ ldur q0, [x0, #-0x30]
+ and v2.16b, v3.16b, v2.16b
+ sqdmulh v24.8h, v29.8h, v4.h[0]
+ stur q28, [x0, #-0x70]
+ stur q21, [x0, #-0x60]
+ add v31.8h, v5.8h, v2.8h
+ sqdmulh v6.8h, v16.8h, v4.h[0]
+ stur q31, [x0, #-0x50]
+ sqdmulh v17.8h, v0.8h, v4.h[0]
+ srshr v22.8h, v24.8h, #0xb
+ mls v26.8h, v18.8h, v3.h[0]
+ srshr v31.8h, v6.8h, #0xb
+ mls v29.8h, v22.8h, v3.h[0]
+ srshr v19.8h, v17.8h, #0xb
+ mls v16.8h, v31.8h, v3.h[0]
+ sshr v7.8h, v26.8h, #0xf
+ mls v0.8h, v19.8h, v3.h[0]
+ and v5.16b, v3.16b, v7.16b
+ sshr v22.8h, v29.8h, #0xf
+ add v27.8h, v26.8h, v5.8h
+ and v26.16b, v3.16b, v22.16b
+ sshr v20.8h, v16.8h, #0xf
+ stur q27, [x0, #-0x40]
+ and v2.16b, v3.16b, v20.16b
+ sshr v23.8h, v0.8h, #0xf
+ add v18.8h, v29.8h, v26.8h
+ add v31.8h, v16.8h, v2.8h
+ and v29.16b, v3.16b, v23.16b
+ stur q18, [x0, #-0x10]
+ add v25.8h, v0.8h, v29.8h
+ stur q31, [x0, #-0x20]
+ stur q25, [x0, #-0x30]
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_reduce_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_tobytes_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_tobytes_asm.S
index 6afb25986a..2d8f01cc10 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_tobytes_asm.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_tobytes_asm.S
@@ -3,6 +3,27 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
+/*yaml
+ Name: poly_tobytes_asm
+ Description: Convert polynomial to byte representation
+ Signature: void mlk_poly_tobytes_asm(uint8_t r[384], const int16_t a[256])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 384
+ permissions: write-only
+ c_parameter: uint8_t r[384]
+ description: Output byte array
+ x1:
+ type: buffer
+ size_bytes: 512
+ permissions: read-only
+ c_parameter: const int16_t a[256]
+ description: Input polynomial
+ Stack:
+ bytes: 0
+*/
+
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
@@ -11,99 +32,86 @@
* dev/aarch64_opt/src/poly_tobytes_asm.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(poly_tobytes_asm)
MLK_ASM_FN_SYMBOL(poly_tobytes_asm)
- mov x2, #0x10 // =16
- ldr q6, [x1], #0x20
- ldur q24, [x1, #-0x10]
- ldr q30, [x1], #0x20
- ldur q22, [x1, #-0x10]
- ldr q5, [x1], #0x20
- ldur q17, [x1, #-0x10]
- ldr q19, [x1], #0x20
- ldur q4, [x1, #-0x10]
- lsr x2, x2, #2
- sub x2, x2, #0x1
+ .cfi_startproc
+ mov x2, #0x10 // =16
+ ldr q5, [x1, #0x10]
+ ldr q3, [x1], #0x20
+ ldr q29, [x1], #0x20
+ ldur q2, [x1, #-0x10]
+ ldr q27, [x1, #0x10]
+ ldr q23, [x1, #0x30]
+ ldr q17, [x1], #0x20
+ ldr q16, [x1], #0x20
+ uzp2 v26.8h, v3.8h, v5.8h
+ uzp1 v19.8h, v3.8h, v5.8h
+ uzp2 v0.8h, v29.8h, v2.8h
+ uzp1 v1.8h, v29.8h, v2.8h
+ xtn v5.8b, v26.8h
+ shrn v3.8b, v19.8h, #0x8
+ shrn v4.8b, v26.8h, #0x4
+ xtn v18.8b, v0.8h
+ shrn v30.8b, v0.8h, #0x4
+ xtn v28.8b, v1.8h
+ shrn v29.8b, v1.8h, #0x8
+ sli v3.8b, v5.8b, #0x4
+ xtn v2.8b, v19.8h
+ sli v29.8b, v18.8b, #0x4
+ lsr x2, x2, #1
+ sub x2, x2, #0x2
-poly_tobytes_loop_start:
- uzp1 v25.8h, v6.8h, v24.8h
- uzp2 v6.8h, v6.8h, v24.8h
- xtn v24.8b, v25.8h
- shrn v25.8b, v25.8h, #0x8
- xtn v18.8b, v6.8h
- shrn v26.8b, v6.8h, #0x4
- sli v25.8b, v18.8b, #0x4
- st3 { v24.8b, v25.8b, v26.8b }, [x0], #24
- uzp1 v25.8h, v30.8h, v22.8h
- uzp2 v6.8h, v30.8h, v22.8h
- xtn v24.8b, v25.8h
- xtn v18.8b, v6.8h
- uzp1 v30.8h, v5.8h, v17.8h
- uzp2 v22.8h, v5.8h, v17.8h
- xtn v5.8b, v30.8h
- xtn v17.8b, v22.8h
- uzp1 v28.8h, v19.8h, v4.8h
- uzp2 v19.8h, v19.8h, v4.8h
- xtn v4.8b, v28.8h
- xtn v20.8b, v19.8h
- shrn v25.8b, v25.8h, #0x8
- sli v25.8b, v18.8b, #0x4
- shrn v26.8b, v6.8h, #0x4
- st3 { v24.8b, v25.8b, v26.8b }, [x0], #24
- shrn v6.8b, v30.8h, #0x8
- sli v6.8b, v17.8b, #0x4
- shrn v7.8b, v22.8h, #0x4
- st3 { v5.8b, v6.8b, v7.8b }, [x0], #24
- shrn v5.8b, v28.8h, #0x8
- shrn v6.8b, v19.8h, #0x4
- sli v5.8b, v20.8b, #0x4
- st3 { v4.8b, v5.8b, v6.8b }, [x0], #24
- ldr q6, [x1], #0x20
- ldur q24, [x1, #-0x10]
- ldr q30, [x1], #0x20
- ldur q22, [x1, #-0x10]
- ldr q5, [x1], #0x20
- ldur q17, [x1, #-0x10]
- ldr q19, [x1], #0x20
- ldur q4, [x1, #-0x10]
- sub x2, x2, #0x1
- cbnz x2, poly_tobytes_loop_start
- uzp1 v25.8h, v30.8h, v22.8h
- uzp2 v18.8h, v30.8h, v22.8h
- uzp1 v30.8h, v6.8h, v24.8h
- uzp2 v6.8h, v6.8h, v24.8h
- uzp1 v24.8h, v5.8h, v17.8h
- uzp2 v22.8h, v5.8h, v17.8h
- uzp1 v5.8h, v19.8h, v4.8h
- uzp2 v17.8h, v19.8h, v4.8h
- xtn v19.8b, v25.8h
- shrn v20.8b, v25.8h, #0x8
- xtn v25.8b, v18.8h
- shrn v21.8b, v18.8h, #0x4
- xtn v28.8b, v30.8h
- shrn v29.8b, v30.8h, #0x8
- xtn v18.8b, v6.8h
- shrn v30.8b, v6.8h, #0x4
- xtn v1.8b, v24.8h
- shrn v2.8b, v24.8h, #0x8
- xtn v6.8b, v22.8h
- shrn v3.8b, v22.8h, #0x4
- xtn v22.8b, v5.8h
- shrn v23.8b, v5.8h, #0x8
- xtn v5.8b, v17.8h
- shrn v24.8b, v17.8h, #0x4
- sli v20.8b, v25.8b, #0x4
- sli v29.8b, v18.8b, #0x4
- st3 { v28.8b, v29.8b, v30.8b }, [x0], #24
- st3 { v19.8b, v20.8b, v21.8b }, [x0], #24
- sli v2.8b, v6.8b, #0x4
- st3 { v1.8b, v2.8b, v3.8b }, [x0], #24
- sli v23.8b, v5.8b, #0x4
- st3 { v22.8b, v23.8b, v24.8b }, [x0], #24
+Lpoly_tobytes_loop_start:
+ uzp1 v25.8h, v17.8h, v27.8h
+ uzp2 v31.8h, v17.8h, v27.8h
+ uzp1 v24.8h, v16.8h, v23.8h
+ uzp2 v6.8h, v16.8h, v23.8h
+ st3 { v2.8b, v3.8b, v4.8b }, [x0], #24
+ shrn v3.8b, v25.8h, #0x8
+ ldr q17, [x1], #0x20
+ shrn v4.8b, v31.8h, #0x4
+ xtn v21.8b, v6.8h
+ ldr q23, [x1, #0x10]
+ st3 { v28.8b, v29.8b, v30.8b }, [x0], #24
+ shrn v29.8b, v24.8h, #0x8
+ ldur q27, [x1, #-0x10]
+ xtn v20.8b, v31.8h
+ ldr q16, [x1], #0x20
+ sli v29.8b, v21.8b, #0x4
+ xtn v2.8b, v25.8h
+ sli v3.8b, v20.8b, #0x4
+ xtn v28.8b, v24.8h
+ shrn v30.8b, v6.8h, #0x4
+ subs x2, x2, #0x1
+ cbnz x2, Lpoly_tobytes_loop_start
+ uzp2 v7.8h, v17.8h, v27.8h
+ uzp1 v25.8h, v17.8h, v27.8h
+ uzp2 v0.8h, v16.8h, v23.8h
+ st3 { v2.8b, v3.8b, v4.8b }, [x0], #24
+ st3 { v28.8b, v29.8b, v30.8b }, [x0], #24
+ shrn v21.8b, v25.8h, #0x8
+ uzp1 v2.8h, v16.8h, v23.8h
+ shrn v22.8b, v7.8h, #0x4
+ shrn v4.8b, v0.8h, #0x4
+ xtn v28.8b, v7.8h
+ xtn v27.8b, v0.8h
+ shrn v3.8b, v2.8h, #0x8
+ sli v21.8b, v28.8b, #0x4
+ xtn v2.8b, v2.8h
+ sli v3.8b, v27.8b, #0x4
+ xtn v20.8b, v25.8h
+ st3 { v20.8b, v21.8b, v22.8b }, [x0], #24
+ st3 { v2.8b, v3.8b, v4.8b }, [x0], #24
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_tobytes_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_tomont_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_tomont_asm.S
index f7a427f4e5..7deb2c812e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_tomont_asm.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/poly_tomont_asm.S
@@ -3,75 +3,96 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
+/*yaml
+ Name: poly_tomont_asm
+ Description: Convert polynomial to Montgomery domain
+ Signature: void mlk_poly_tomont_asm(int16_t p[256])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: read/write
+ c_parameter: int16_t p[256]
+ description: Input/output polynomial
+ Stack:
+ bytes: 0
+*/
+
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/poly_tomont_asm.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(poly_tomont_asm)
MLK_ASM_FN_SYMBOL(poly_tomont_asm)
- mov w2, #0xd01 // =3329
- dup v4.8h, w2
- mov w2, #0x4ebf // =20159
- dup v5.8h, w2
- mov w2, #-0x414 // =-1044
- dup v2.8h, w2
- mov w2, #-0x2824 // =-10276
- dup v3.8h, w2
- mov x1, #0x8 // =8
- ldr q26, [x0, #0x30]
- ldr q23, [x0, #0x10]
- mul v17.8h, v26.8h, v2.8h
- sqrdmulh v7.8h, v26.8h, v3.8h
- ldr q27, [x0, #0x20]
- sub x1, x1, #0x1
+ .cfi_startproc
+ mov w2, #0xd01 // =3329
+ dup v4.8h, w2
+ mov w2, #0x4ebf // =20159
+ dup v5.8h, w2
+ mov w2, #-0x414 // =-1044
+ dup v2.8h, w2
+ mov w2, #-0x2824 // =-10276
+ dup v3.8h, w2
+ mov x1, #0x8 // =8
+ ldr q18, [x0, #0x20]
+ ldr q0, [x0, #0x10]
+ ldr q16, [x0], #0x40
+ sqrdmulh v23.8h, v0.8h, v3.8h
+ mul v26.8h, v0.8h, v2.8h
+ sqrdmulh v19.8h, v16.8h, v3.8h
+ mls v26.8h, v23.8h, v4.h[0]
+ mul v29.8h, v16.8h, v2.8h
+ ldur q16, [x0, #-0x10]
+ mls v29.8h, v19.8h, v4.h[0]
+ stur q26, [x0, #-0x30]
+ sqrdmulh v26.8h, v18.8h, v3.8h
+ mul v18.8h, v18.8h, v2.8h
+ stur q29, [x0, #-0x40]
+ sqrdmulh v29.8h, v16.8h, v3.8h
+ mls v18.8h, v26.8h, v4.h[0]
+ sub x1, x1, #0x1
-poly_tomont_loop:
- mls v17.8h, v7.8h, v4.h[0]
- sqrdmulh v5.8h, v23.8h, v3.8h
- ldr q7, [x0], #0x40
- stur q17, [x0, #-0x10]
- sqrdmulh v29.8h, v27.8h, v3.8h
- sqrdmulh v19.8h, v7.8h, v3.8h
- mul v25.8h, v23.8h, v2.8h
- mul v0.8h, v7.8h, v2.8h
- mul v26.8h, v27.8h, v2.8h
- ldr q7, [x0, #0x30]
- mls v25.8h, v5.8h, v4.h[0]
- ldr q23, [x0, #0x10]
- mls v26.8h, v29.8h, v4.h[0]
- mls v0.8h, v19.8h, v4.h[0]
- stur q25, [x0, #-0x30]
- mul v17.8h, v7.8h, v2.8h
- sqrdmulh v7.8h, v7.8h, v3.8h
- stur q0, [x0, #-0x40]
- ldr q27, [x0, #0x20]
- stur q26, [x0, #-0x20]
- sub x1, x1, #0x1
- cbnz x1, poly_tomont_loop
- mls v17.8h, v7.8h, v4.h[0]
- sqrdmulh v7.8h, v23.8h, v3.8h
- mul v26.8h, v23.8h, v2.8h
- sqrdmulh v25.8h, v27.8h, v3.8h
- ldr q23, [x0], #0x40
- mul v27.8h, v27.8h, v2.8h
- mls v26.8h, v7.8h, v4.h[0]
- sqrdmulh v7.8h, v23.8h, v3.8h
- mul v23.8h, v23.8h, v2.8h
- stur q17, [x0, #-0x10]
- mls v27.8h, v25.8h, v4.h[0]
- stur q26, [x0, #-0x30]
- mls v23.8h, v7.8h, v4.h[0]
- stur q27, [x0, #-0x20]
- stur q23, [x0, #-0x40]
+Lpoly_tomont_loop:
+ ldr q19, [x0, #0x10]
+ mul v26.8h, v16.8h, v2.8h
+ ldr q23, [x0, #0x20]
+ ldr q17, [x0], #0x40
+ mls v26.8h, v29.8h, v4.h[0]
+ ldur q16, [x0, #-0x10]
+ sqrdmulh v28.8h, v19.8h, v3.8h
+ stur q18, [x0, #-0x60]
+ mul v0.8h, v19.8h, v2.8h
+ stur q26, [x0, #-0x50]
+ sqrdmulh v24.8h, v23.8h, v3.8h
+ mul v18.8h, v23.8h, v2.8h
+ sqrdmulh v22.8h, v17.8h, v3.8h
+ mul v26.8h, v17.8h, v2.8h
+ mls v0.8h, v28.8h, v4.h[0]
+ mls v26.8h, v22.8h, v4.h[0]
+ sqrdmulh v29.8h, v16.8h, v3.8h
+ stur q0, [x0, #-0x30]
+ mls v18.8h, v24.8h, v4.h[0]
+ stur q26, [x0, #-0x40]
+ sub x1, x1, #0x1
+ cbnz x1, Lpoly_tomont_loop
+ mul v16.8h, v16.8h, v2.8h
+ stur q18, [x0, #-0x20]
+ mls v16.8h, v29.8h, v4.h[0]
+ stur q16, [x0, #-0x10]
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_tomont_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S
index 25ed53fd6b..b8bca5fdeb 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S
@@ -12,192 +12,250 @@
* https://eprint.iacr.org/2021/986
*/
+/*yaml
+ Name: polyvec_basemul_acc_montgomery_cached_asm_k2
+ Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=2
+ Signature: void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(int16_t r[256], const int16_t a[512], const int16_t b[512], const int16_t b_cache[256])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: write-only
+ c_parameter: int16_t r[256]
+ description: Output polynomial
+ x1:
+ type: buffer
+ size_bytes: 1024
+ permissions: read-only
+ c_parameter: const int16_t a[512]
+ description: Input polynomial vector a
+ x2:
+ type: buffer
+ size_bytes: 1024
+ permissions: read-only
+ c_parameter: const int16_t b[512]
+ description: Input polynomial vector b
+ x3:
+ type: buffer
+ size_bytes: 512
+ permissions: read-only
+ c_parameter: const int16_t b_cache[256]
+ description: Cached values for b
+ Stack:
+ bytes: 64
+ description: saving callee-saved Neon registers
+*/
+
/* Re-implementation of asymmetric base multiplication following @[NeonNTT] */
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
- (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k2)
- sub sp, sp, #0x40
- stp d8, d9, [sp]
- stp d10, d11, [sp, #0x10]
- stp d12, d13, [sp, #0x20]
- stp d14, d15, [sp, #0x30]
- mov w14, #0xd01 // =3329
- dup v0.8h, w14
- mov w14, #0xcff // =3327
- dup v2.8h, w14
- add x4, x1, #0x200
- add x5, x2, #0x200
- add x6, x3, #0x100
- mov x13, #0x10 // =16
- ldr q9, [x4], #0x20
- ldur q5, [x4, #-0x10]
- ldr q11, [x5], #0x20
- uzp1 v23.8h, v9.8h, v5.8h
- uzp2 v9.8h, v9.8h, v5.8h
- ldr q5, [x2], #0x20
- ldur q7, [x5, #-0x10]
- ldur q21, [x2, #-0x10]
- uzp2 v10.8h, v11.8h, v7.8h
- uzp1 v11.8h, v11.8h, v7.8h
- uzp1 v7.8h, v5.8h, v21.8h
- uzp2 v5.8h, v5.8h, v21.8h
- ldr q21, [x1], #0x20
- ldur q25, [x1, #-0x10]
- ld1 { v6.8h }, [x3], #16
- uzp1 v26.8h, v21.8h, v25.8h
- uzp2 v21.8h, v21.8h, v25.8h
- smull v25.4s, v26.4h, v5.4h
- smull2 v5.4s, v26.8h, v5.8h
- smull v19.4s, v26.4h, v7.4h
- smull2 v26.4s, v26.8h, v7.8h
- smlal v25.4s, v21.4h, v7.4h
- smlal2 v5.4s, v21.8h, v7.8h
- smlal v19.4s, v21.4h, v6.4h
- smlal2 v26.4s, v21.8h, v6.8h
- smlal v25.4s, v23.4h, v10.4h
- smlal2 v5.4s, v23.8h, v10.8h
- smlal v19.4s, v23.4h, v11.4h
- smlal2 v26.4s, v23.8h, v11.8h
- ld1 { v23.8h }, [x6], #16
- smlal v25.4s, v9.4h, v11.4h
- smlal2 v5.4s, v9.8h, v11.8h
- smlal2 v26.4s, v9.8h, v23.8h
- smlal v19.4s, v9.4h, v23.4h
- ldr q9, [x4], #0x20
- uzp1 v11.8h, v25.8h, v5.8h
- uzp1 v23.8h, v19.8h, v26.8h
- mul v11.8h, v11.8h, v2.8h
- mul v23.8h, v23.8h, v2.8h
- ldr q7, [x5], #0x20
- smlal2 v5.4s, v11.8h, v0.8h
- smlal v25.4s, v11.4h, v0.4h
- ldr q11, [x2], #0x20
- ldur q21, [x2, #-0x10]
- ldur q6, [x4, #-0x10]
- uzp1 v17.8h, v11.8h, v21.8h
- ldr q10, [x1], #0x20
- ldur q29, [x1, #-0x10]
- uzp2 v11.8h, v11.8h, v21.8h
- uzp1 v13.8h, v9.8h, v6.8h
- uzp1 v3.8h, v10.8h, v29.8h
- uzp2 v10.8h, v10.8h, v29.8h
- smull v12.4s, v3.4h, v11.4h
- smull2 v11.4s, v3.8h, v11.8h
- ldur q21, [x5, #-0x10]
- smlal v12.4s, v10.4h, v17.4h
- smlal2 v11.4s, v10.8h, v17.8h
- uzp2 v29.8h, v7.8h, v21.8h
- uzp1 v15.8h, v7.8h, v21.8h
- smlal v12.4s, v13.4h, v29.4h
- smlal2 v11.4s, v13.8h, v29.8h
- uzp2 v28.8h, v9.8h, v6.8h
- smlal2 v26.4s, v23.8h, v0.8h
- smlal v12.4s, v28.4h, v15.4h
- smlal2 v11.4s, v28.8h, v15.8h
- smlal v19.4s, v23.4h, v0.4h
- uzp2 v27.8h, v25.8h, v5.8h
- smull v23.4s, v3.4h, v17.4h
- uzp1 v9.8h, v12.8h, v11.8h
- uzp2 v19.8h, v19.8h, v26.8h
- mul v14.8h, v9.8h, v2.8h
- ld1 { v22.8h }, [x6], #16
- zip2 v9.8h, v19.8h, v27.8h
- smlal2 v11.4s, v14.8h, v0.8h
- ld1 { v4.8h }, [x3], #16
- sub x13, x13, #0x2
+ .cfi_startproc
+ sub sp, sp, #0x40
+ .cfi_adjust_cfa_offset 0x40
+ stp d8, d9, [sp]
+ .cfi_rel_offset d8, 0x0
+ .cfi_rel_offset d9, 0x8
+ stp d10, d11, [sp, #0x10]
+ .cfi_rel_offset d10, 0x10
+ .cfi_rel_offset d11, 0x18
+ stp d12, d13, [sp, #0x20]
+ .cfi_rel_offset d12, 0x20
+ .cfi_rel_offset d13, 0x28
+ stp d14, d15, [sp, #0x30]
+ .cfi_rel_offset d14, 0x30
+ .cfi_rel_offset d15, 0x38
+ mov w14, #0xd01 // =3329
+ dup v0.8h, w14
+ mov w14, #0xcff // =3327
+ dup v2.8h, w14
+ add x4, x1, #0x200
+ add x5, x2, #0x200
+ add x6, x3, #0x100
+ mov x13, #0x10 // =16
+ ldr q12, [x1], #0x20
+ ldur q9, [x1, #-0x10]
+ ldr q22, [x2], #0x20
+ ldur q30, [x2, #-0x10]
+ ldr q6, [x5], #0x20
+ ldr q7, [x4, #0x10]
+ ldr q8, [x4], #0x20
+ ldur q23, [x5, #-0x10]
+ uzp1 v16.8h, v12.8h, v9.8h
+ uzp2 v14.8h, v12.8h, v9.8h
+ uzp2 v13.8h, v22.8h, v30.8h
+ uzp1 v18.8h, v22.8h, v30.8h
+ ld1 { v27.8h }, [x3], #16
+ ld1 { v17.8h }, [x6], #16
+ smull2 v4.4s, v16.8h, v18.8h
+ ldr q31, [x1, #0x10]
+ smull v19.4s, v16.4h, v13.4h
+ ldr q24, [x1], #0x20
+ smlal v19.4s, v14.4h, v18.4h
+ ldr q22, [x2], #0x20
+ smlal2 v4.4s, v14.8h, v27.8h
+ uzp2 v5.8h, v6.8h, v23.8h
+ smull2 v29.4s, v16.8h, v13.8h
+ uzp2 v26.8h, v8.8h, v7.8h
+ smlal2 v29.4s, v14.8h, v18.8h
+ uzp1 v30.8h, v24.8h, v31.8h
+ uzp1 v8.8h, v8.8h, v7.8h
+ smull v11.4s, v16.4h, v18.4h
+ smlal v11.4s, v14.4h, v27.4h
+ ldur q1, [x2, #-0x10]
+ uzp1 v28.8h, v6.8h, v23.8h
+ smlal2 v29.4s, v8.8h, v5.8h
+ ldr q25, [x5], #0x20
+ smlal v19.4s, v8.4h, v5.4h
+ ldr q3, [x4, #0x10]
+ smlal2 v29.4s, v26.8h, v28.8h
+ uzp1 v27.8h, v22.8h, v1.8h
+ smlal v19.4s, v26.4h, v28.4h
+ ldr q12, [x4], #0x20
+ smlal2 v4.4s, v8.8h, v28.8h
+ ldur q21, [x5, #-0x10]
+ smlal2 v4.4s, v26.8h, v17.8h
+ smlal v11.4s, v8.4h, v28.4h
+ ld1 { v15.8h }, [x6], #16
+ smlal v11.4s, v26.4h, v17.4h
+ ld1 { v20.8h }, [x3], #16
+ uzp1 v28.8h, v19.8h, v29.8h
+ smull2 v23.4s, v30.8h, v27.8h
+ smull v26.4s, v30.4h, v27.4h
+ uzp2 v16.8h, v22.8h, v1.8h
+ mul v28.8h, v28.8h, v2.8h
+ uzp1 v10.8h, v11.8h, v4.8h
+ smull2 v8.4s, v30.8h, v16.8h
+ mul v13.8h, v10.8h, v2.8h
+ smlal v19.4s, v28.4h, v0.4h
+ smlal2 v29.4s, v28.8h, v0.8h
+ smull v18.4s, v30.4h, v16.4h
+ uzp1 v30.8h, v25.8h, v21.8h
+ smlal v11.4s, v13.4h, v0.4h
+ uzp2 v6.8h, v24.8h, v31.8h
+ uzp1 v16.8h, v12.8h, v3.8h
+ smlal2 v4.4s, v13.8h, v0.8h
+ uzp2 v17.8h, v25.8h, v21.8h
+ smlal2 v8.4s, v6.8h, v27.8h
+ uzp2 v12.8h, v12.8h, v3.8h
+ smlal v18.4s, v6.4h, v27.4h
+ uzp2 v9.8h, v19.8h, v29.8h
+ smlal2 v8.4s, v16.8h, v17.8h
+ smlal2 v8.4s, v12.8h, v30.8h
+ uzp2 v19.8h, v11.8h, v4.8h
+ sub x13, x13, #0x2
-polyvec_basemul_acc_montgomery_cached_k2_loop:
- smull2 v20.4s, v3.8h, v17.8h
- ldr q18, [x4], #0x20
- ldr q30, [x5], #0x20
- smlal2 v20.4s, v10.8h, v4.8h
- smlal v12.4s, v14.4h, v0.4h
- smlal v23.4s, v10.4h, v4.4h
- str q9, [x0, #0x10]
- smlal2 v20.4s, v13.8h, v15.8h
- ldr q8, [x2], #0x20
- smlal v23.4s, v13.4h, v15.4h
- smlal2 v20.4s, v28.8h, v22.8h
- zip1 v26.8h, v19.8h, v27.8h
- ldur q9, [x2, #-0x10]
- smlal v23.4s, v28.4h, v22.4h
- uzp2 v27.8h, v12.8h, v11.8h
- uzp1 v17.8h, v8.8h, v9.8h
- uzp2 v4.8h, v8.8h, v9.8h
- uzp1 v5.8h, v23.8h, v20.8h
- str q26, [x0], #0x20
- mul v31.8h, v5.8h, v2.8h
- ldur q19, [x4, #-0x10]
- ldr q29, [x1], #0x20
- ldur q12, [x1, #-0x10]
- smlal2 v20.4s, v31.8h, v0.8h
- uzp1 v13.8h, v18.8h, v19.8h
- uzp1 v3.8h, v29.8h, v12.8h
- uzp2 v10.8h, v29.8h, v12.8h
- smull v12.4s, v3.4h, v4.4h
- smull2 v11.4s, v3.8h, v4.8h
- ldur q5, [x5, #-0x10]
- smlal v12.4s, v10.4h, v17.4h
- smlal2 v11.4s, v10.8h, v17.8h
- uzp2 v14.8h, v30.8h, v5.8h
- uzp1 v15.8h, v30.8h, v5.8h
- smlal v12.4s, v13.4h, v14.4h
- smlal2 v11.4s, v13.8h, v14.8h
- uzp2 v28.8h, v18.8h, v19.8h
- smlal v23.4s, v31.4h, v0.4h
- smlal v12.4s, v28.4h, v15.4h
- smlal2 v11.4s, v28.8h, v15.8h
- ld1 { v22.8h }, [x6], #16
- uzp2 v19.8h, v23.8h, v20.8h
- uzp1 v1.8h, v12.8h, v11.8h
- smull v23.4s, v3.4h, v17.4h
- mul v14.8h, v1.8h, v2.8h
- zip2 v9.8h, v19.8h, v27.8h
- ld1 { v4.8h }, [x3], #16
- smlal2 v11.4s, v14.8h, v0.8h
- sub x13, x13, #0x1
- cbnz x13, polyvec_basemul_acc_montgomery_cached_k2_loop
- smull2 v5.4s, v3.8h, v17.8h
- smlal v12.4s, v14.4h, v0.4h
- smlal v23.4s, v10.4h, v4.4h
- str q9, [x0, #0x10]
- smlal2 v5.4s, v10.8h, v4.8h
- uzp2 v11.8h, v12.8h, v11.8h
- zip1 v9.8h, v19.8h, v27.8h
- smlal v23.4s, v13.4h, v15.4h
- smlal2 v5.4s, v13.8h, v15.8h
- str q9, [x0], #0x20
- smlal v23.4s, v28.4h, v22.4h
- smlal2 v5.4s, v28.8h, v22.8h
- uzp1 v9.8h, v23.8h, v5.8h
- mul v9.8h, v9.8h, v2.8h
- smlal2 v5.4s, v9.8h, v0.8h
- smlal v23.4s, v9.4h, v0.4h
- uzp2 v9.8h, v23.8h, v5.8h
- zip2 v5.8h, v9.8h, v11.8h
- zip1 v9.8h, v9.8h, v11.8h
- str q5, [x0, #0x10]
- str q9, [x0], #0x20
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
- ldp d12, d13, [sp, #0x20]
- ldp d14, d15, [sp, #0x30]
- add sp, sp, #0x40
+Lpolyvec_basemul_acc_montgomery_cached_k2_loop_start:
+ smlal v18.4s, v16.4h, v17.4h
+ ldr q7, [x4], #0x20
+ ldr q10, [x2, #0x10]
+ smlal v18.4s, v12.4h, v30.4h
+ smlal2 v23.4s, v6.8h, v20.8h
+ ldr q14, [x2], #0x20
+ smlal2 v23.4s, v16.8h, v30.8h
+ zip1 v25.8h, v19.8h, v9.8h
+ zip2 v3.8h, v19.8h, v9.8h
+ smlal2 v23.4s, v12.8h, v15.8h
+ smlal v26.4s, v6.4h, v20.4h
+ uzp1 v5.8h, v18.8h, v8.8h
+ uzp2 v21.8h, v14.8h, v10.8h
+ smlal v26.4s, v16.4h, v30.4h
+ str q25, [x0], #0x20
+ mul v29.8h, v5.8h, v2.8h
+ uzp1 v24.8h, v14.8h, v10.8h
+ stur q3, [x0, #-0x10]
+ smlal v26.4s, v12.4h, v15.4h
+ ld1 { v15.8h }, [x6], #16
+ ldr q28, [x1, #0x10]
+ ldr q11, [x1], #0x20
+ ldr q13, [x5], #0x20
+ ldur q27, [x4, #-0x10]
+ smlal2 v8.4s, v29.8h, v0.8h
+ ldur q22, [x5, #-0x10]
+ smlal v18.4s, v29.4h, v0.4h
+ uzp1 v4.8h, v26.8h, v23.8h
+ uzp1 v1.8h, v11.8h, v28.8h
+ uzp2 v6.8h, v11.8h, v28.8h
+ uzp1 v16.8h, v7.8h, v27.8h
+ mul v31.8h, v4.8h, v2.8h
+ uzp2 v17.8h, v13.8h, v22.8h
+ ld1 { v20.8h }, [x3], #16
+ uzp2 v9.8h, v18.8h, v8.8h
+ smull2 v8.4s, v1.8h, v21.8h
+ uzp1 v30.8h, v13.8h, v22.8h
+ smlal2 v8.4s, v6.8h, v24.8h
+ smlal2 v8.4s, v16.8h, v17.8h
+ uzp2 v12.8h, v7.8h, v27.8h
+ smlal v26.4s, v31.4h, v0.4h
+ smlal2 v23.4s, v31.8h, v0.8h
+ smull v18.4s, v1.4h, v21.4h
+ smlal v18.4s, v6.4h, v24.4h
+ smlal2 v8.4s, v12.8h, v30.8h
+ uzp2 v19.8h, v26.8h, v23.8h
+ smull2 v23.4s, v1.8h, v24.8h
+ smull v26.4s, v1.4h, v24.4h
+ subs x13, x13, #0x1
+ cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k2_loop_start
+ smlal v26.4s, v6.4h, v20.4h
+ smlal2 v23.4s, v6.8h, v20.8h
+ smlal v26.4s, v16.4h, v30.4h
+ smlal2 v23.4s, v16.8h, v30.8h
+ smlal v26.4s, v12.4h, v15.4h
+ smlal2 v23.4s, v12.8h, v15.8h
+ smlal v18.4s, v16.4h, v17.4h
+ smlal v18.4s, v12.4h, v30.4h
+ zip1 v12.8h, v19.8h, v9.8h
+ str q12, [x0], #0x20
+ uzp1 v12.8h, v26.8h, v23.8h
+ mul v6.8h, v12.8h, v2.8h
+ uzp1 v12.8h, v18.8h, v8.8h
+ mul v12.8h, v12.8h, v2.8h
+ smlal v26.4s, v6.4h, v0.4h
+ smlal2 v23.4s, v6.8h, v0.8h
+ smlal2 v8.4s, v12.8h, v0.8h
+ smlal v18.4s, v12.4h, v0.4h
+ zip2 v12.8h, v19.8h, v9.8h
+ uzp2 v6.8h, v26.8h, v23.8h
+ stur q12, [x0, #-0x10]
+ uzp2 v12.8h, v18.8h, v8.8h
+ zip2 v1.8h, v6.8h, v12.8h
+ zip1 v12.8h, v6.8h, v12.8h
+ str q1, [x0, #0x10]
+ str q12, [x0], #0x20
+ ldp d8, d9, [sp]
+ .cfi_restore d8
+ .cfi_restore d9
+ ldp d10, d11, [sp, #0x10]
+ .cfi_restore d10
+ .cfi_restore d11
+ ldp d12, d13, [sp, #0x20]
+ .cfi_restore d12
+ .cfi_restore d13
+ ldp d14, d15, [sp, #0x30]
+ .cfi_restore d14
+ .cfi_restore d15
+ add sp, sp, #0x40
+ .cfi_adjust_cfa_offset -0x40
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k2)
+
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED && \
(MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S
index 9a80e1d9be..885b765eea 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S
@@ -12,246 +12,303 @@
* https://eprint.iacr.org/2021/986
*/
+/*yaml
+ Name: polyvec_basemul_acc_montgomery_cached_asm_k3
+ Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=3
+ Signature: void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(int16_t r[256], const int16_t a[768], const int16_t b[768], const int16_t b_cache[384])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: write-only
+ c_parameter: int16_t r[256]
+ description: Output polynomial
+ x1:
+ type: buffer
+ size_bytes: 1536
+ permissions: read-only
+ c_parameter: const int16_t a[768]
+ description: Input polynomial vector a
+ x2:
+ type: buffer
+ size_bytes: 1536
+ permissions: read-only
+ c_parameter: const int16_t b[768]
+ description: Input polynomial vector b
+ x3:
+ type: buffer
+ size_bytes: 768
+ permissions: read-only
+ c_parameter: const int16_t b_cache[384]
+ description: Cached values for b
+ Stack:
+ bytes: 64
+ description: saving callee-saved Neon registers
+*/
+
/* Re-implementation of asymmetric base multiplication following @[NeonNTT] */
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
- (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k3)
- sub sp, sp, #0x40
- stp d8, d9, [sp]
- stp d10, d11, [sp, #0x10]
- stp d12, d13, [sp, #0x20]
- stp d14, d15, [sp, #0x30]
- mov w14, #0xd01 // =3329
- dup v0.8h, w14
- mov w14, #0xcff // =3327
- dup v2.8h, w14
- add x4, x1, #0x200
- add x5, x2, #0x200
- add x6, x3, #0x100
- add x7, x1, #0x400
- add x8, x2, #0x400
- add x9, x3, #0x200
- mov x13, #0x10 // =16
- ldr q7, [x2, #0x10]
- ldr q20, [x2], #0x20
- ldr q15, [x1, #0x10]
- uzp1 v8.8h, v20.8h, v7.8h
- uzp2 v7.8h, v20.8h, v7.8h
- ld1 { v20.8h }, [x3], #16
- ldr q30, [x1], #0x20
- ldr q11, [x4], #0x20
- uzp1 v16.8h, v30.8h, v15.8h
- uzp2 v15.8h, v30.8h, v15.8h
- smull v30.4s, v16.4h, v7.4h
- smull2 v7.4s, v16.8h, v7.8h
- smull v9.4s, v16.4h, v8.4h
- smull2 v16.4s, v16.8h, v8.8h
- smlal v30.4s, v15.4h, v8.4h
- smlal2 v7.4s, v15.8h, v8.8h
- smlal v9.4s, v15.4h, v20.4h
- smlal2 v16.4s, v15.8h, v20.8h
- ldur q20, [x4, #-0x10]
- ldr q15, [x5], #0x20
- uzp1 v8.8h, v11.8h, v20.8h
- uzp2 v20.8h, v11.8h, v20.8h
- ldur q11, [x5, #-0x10]
- ld1 { v27.8h }, [x6], #16
- uzp1 v10.8h, v15.8h, v11.8h
- uzp2 v15.8h, v15.8h, v11.8h
- smlal v9.4s, v8.4h, v10.4h
- smlal2 v16.4s, v8.8h, v10.8h
- smlal v30.4s, v8.4h, v15.4h
- smlal2 v7.4s, v8.8h, v15.8h
- smlal v9.4s, v20.4h, v27.4h
- smlal2 v16.4s, v20.8h, v27.8h
- smlal v30.4s, v20.4h, v10.4h
- smlal2 v7.4s, v20.8h, v10.8h
- ldr q20, [x7], #0x20
- ldur q15, [x7, #-0x10]
- ldr q8, [x8], #0x20
- uzp1 v11.8h, v20.8h, v15.8h
- uzp2 v20.8h, v20.8h, v15.8h
- ldur q15, [x8, #-0x10]
- ld1 { v27.8h }, [x9], #16
- uzp1 v10.8h, v8.8h, v15.8h
- uzp2 v15.8h, v8.8h, v15.8h
- smlal v9.4s, v11.4h, v10.4h
- smlal2 v16.4s, v11.8h, v10.8h
- smlal v30.4s, v11.4h, v15.4h
- smlal2 v7.4s, v11.8h, v15.8h
- smlal v9.4s, v20.4h, v27.4h
- smlal2 v16.4s, v20.8h, v27.8h
- smlal v30.4s, v20.4h, v10.4h
- smlal2 v7.4s, v20.8h, v10.8h
- ldr q15, [x2], #0x20
- uzp1 v20.8h, v9.8h, v16.8h
- uzp1 v8.8h, v30.8h, v7.8h
- mul v20.8h, v20.8h, v2.8h
- mul v8.8h, v8.8h, v2.8h
- ldr q21, [x4], #0x20
- smlal v9.4s, v20.4h, v0.4h
- smlal2 v16.4s, v20.8h, v0.8h
- smlal v30.4s, v8.4h, v0.4h
- smlal2 v7.4s, v8.8h, v0.8h
- ldur q6, [x4, #-0x10]
- uzp2 v27.8h, v9.8h, v16.8h
- uzp2 v10.8h, v30.8h, v7.8h
- ldur q16, [x2, #-0x10]
- ldr q30, [x1, #0x10]
- ld1 { v9.8h }, [x3], #16
- ldr q1, [x5], #0x20
- ldur q12, [x5, #-0x10]
- ld1 { v24.8h }, [x6], #16
- ldr q19, [x7], #0x20
- ldur q31, [x7, #-0x10]
- ldr q17, [x8], #0x20
- ldur q18, [x8, #-0x10]
- ld1 { v25.8h }, [x9], #16
- sub x13, x13, #0x2
+ .cfi_startproc
+ sub sp, sp, #0x40
+ .cfi_adjust_cfa_offset 0x40
+ stp d8, d9, [sp]
+ .cfi_rel_offset d8, 0x0
+ .cfi_rel_offset d9, 0x8
+ stp d10, d11, [sp, #0x10]
+ .cfi_rel_offset d10, 0x10
+ .cfi_rel_offset d11, 0x18
+ stp d12, d13, [sp, #0x20]
+ .cfi_rel_offset d12, 0x20
+ .cfi_rel_offset d13, 0x28
+ stp d14, d15, [sp, #0x30]
+ .cfi_rel_offset d14, 0x30
+ .cfi_rel_offset d15, 0x38
+ mov w14, #0xd01 // =3329
+ dup v0.8h, w14
+ mov w14, #0xcff // =3327
+ dup v2.8h, w14
+ add x4, x1, #0x200
+ add x5, x2, #0x200
+ add x6, x3, #0x100
+ add x7, x1, #0x400
+ add x8, x2, #0x400
+ add x9, x3, #0x200
+ mov x13, #0x10 // =16
+ ldr q6, [x7], #0x20
+ ldr q19, [x2, #0x10]
+ ldr q23, [x1], #0x20
+ ldur q14, [x1, #-0x10]
+ ldr q17, [x2], #0x20
+ ldr q11, [x4, #0x10]
+ ldur q28, [x7, #-0x10]
+ ld1 { v30.8h }, [x3], #16
+ ldr q26, [x4], #0x20
+ ldr q16, [x8, #0x10]
+ uzp1 v8.8h, v23.8h, v14.8h
+ ldr q22, [x5, #0x10]
+ ldr q18, [x5], #0x20
+ uzp1 v20.8h, v17.8h, v19.8h
+ uzp2 v24.8h, v23.8h, v14.8h
+ ldr q31, [x8], #0x20
+ smull2 v4.4s, v8.8h, v20.8h
+ uzp1 v25.8h, v26.8h, v11.8h
+ smull v13.4s, v8.4h, v20.4h
+ ld1 { v23.8h }, [x6], #16
+ uzp1 v1.8h, v18.8h, v22.8h
+ smlal v13.4s, v24.4h, v30.4h
+ smlal2 v4.4s, v24.8h, v30.8h
+ uzp2 v5.8h, v26.8h, v11.8h
+ smlal2 v4.4s, v25.8h, v1.8h
+ uzp1 v29.8h, v6.8h, v28.8h
+ smlal2 v4.4s, v5.8h, v23.8h
+ ld1 { v7.8h }, [x9], #16
+ smlal v13.4s, v25.4h, v1.4h
+ uzp2 v17.8h, v17.8h, v19.8h
+ uzp1 v27.8h, v31.8h, v16.8h
+ smlal v13.4s, v5.4h, v23.4h
+ uzp2 v22.8h, v18.8h, v22.8h
+ smull v18.4s, v8.4h, v17.4h
+ uzp2 v28.8h, v6.8h, v28.8h
+ smlal v13.4s, v29.4h, v27.4h
+ smlal2 v4.4s, v29.8h, v27.8h
+ uzp2 v26.8h, v31.8h, v16.8h
+ smlal2 v4.4s, v28.8h, v7.8h
+ ldr q3, [x7, #0x10]
+ smlal v13.4s, v28.4h, v7.4h
+ ldr q7, [x1], #0x20
+ smlal v18.4s, v24.4h, v20.4h
+ ldr q15, [x2], #0x20
+ smlal v18.4s, v25.4h, v22.4h
+ smull2 v8.4s, v8.8h, v17.8h
+ ldur q17, [x1, #-0x10]
+ uzp1 v23.8h, v13.8h, v4.8h
+ smlal v18.4s, v5.4h, v1.4h
+ smlal2 v8.4s, v24.8h, v20.8h
+ ld1 { v16.8h }, [x3], #16
+ mul v23.8h, v23.8h, v2.8h
+ ldr q19, [x5, #0x10]
+ ldr q14, [x4, #0x10]
+ ldr q11, [x4], #0x20
+ ldur q20, [x2, #-0x10]
+ smlal2 v8.4s, v25.8h, v22.8h
+ smlal2 v8.4s, v5.8h, v1.8h
+ ldr q22, [x5], #0x20
+ uzp1 v1.8h, v7.8h, v17.8h
+ smlal v18.4s, v29.4h, v26.4h
+ smlal v13.4s, v23.4h, v0.4h
+ uzp2 v31.8h, v11.8h, v14.8h
+ uzp1 v21.8h, v15.8h, v20.8h
+ smlal2 v4.4s, v23.8h, v0.8h
+ ld1 { v9.8h }, [x6], #16
+ smlal v18.4s, v28.4h, v27.4h
+ smlal2 v8.4s, v29.8h, v26.8h
+ ldr q25, [x7], #0x20
+ smull v26.4s, v1.4h, v21.4h
+ uzp1 v24.8h, v22.8h, v19.8h
+ smlal2 v8.4s, v28.8h, v27.8h
+ uzp2 v28.8h, v7.8h, v17.8h
+ uzp1 v29.8h, v11.8h, v14.8h
+ smull2 v23.4s, v1.8h, v21.8h
+ ldr q27, [x8], #0x20
+ smlal2 v23.4s, v28.8h, v16.8h
+ ldur q11, [x8, #-0x10]
+ smlal2 v23.4s, v29.8h, v24.8h
+ uzp2 v7.8h, v13.8h, v4.8h
+ uzp2 v19.8h, v22.8h, v19.8h
+ ld1 { v4.8h }, [x9], #16
+ smlal2 v23.4s, v31.8h, v9.8h
+ uzp1 v13.8h, v25.8h, v3.8h
+ uzp1 v14.8h, v18.8h, v8.8h
+ smlal v26.4s, v28.4h, v16.4h
+ uzp2 v17.8h, v27.8h, v11.8h
+ uzp2 v20.8h, v15.8h, v20.8h
+ mul v14.8h, v14.8h, v2.8h
+ sub x13, x13, #0x2
-polyvec_basemul_acc_montgomery_cached_k3_loop:
- ldr q20, [x1], #0x20
- uzp1 v7.8h, v15.8h, v16.8h
- uzp2 v15.8h, v15.8h, v16.8h
- uzp1 v8.8h, v20.8h, v30.8h
- uzp2 v20.8h, v20.8h, v30.8h
- smull v30.4s, v8.4h, v15.4h
- smull2 v15.4s, v8.8h, v15.8h
- smull v11.4s, v8.4h, v7.4h
- smull2 v8.4s, v8.8h, v7.8h
- smlal v30.4s, v20.4h, v7.4h
- smlal2 v15.4s, v20.8h, v7.8h
- smlal v11.4s, v20.4h, v9.4h
- smlal2 v8.4s, v20.8h, v9.8h
- uzp1 v7.8h, v21.8h, v6.8h
- uzp2 v20.8h, v21.8h, v6.8h
- uzp1 v16.8h, v1.8h, v12.8h
- uzp2 v9.8h, v1.8h, v12.8h
- smlal v11.4s, v7.4h, v16.4h
- smlal2 v8.4s, v7.8h, v16.8h
- smlal v30.4s, v7.4h, v9.4h
- smlal2 v15.4s, v7.8h, v9.8h
- smlal v11.4s, v20.4h, v24.4h
- smlal2 v8.4s, v20.8h, v24.8h
- smlal v30.4s, v20.4h, v16.4h
- smlal2 v15.4s, v20.8h, v16.8h
- uzp1 v7.8h, v19.8h, v31.8h
- uzp2 v20.8h, v19.8h, v31.8h
- uzp1 v16.8h, v17.8h, v18.8h
- uzp2 v9.8h, v17.8h, v18.8h
- smlal v11.4s, v7.4h, v16.4h
- smlal2 v8.4s, v7.8h, v16.8h
- smlal v30.4s, v7.4h, v9.4h
- smlal2 v15.4s, v7.8h, v9.8h
- smlal v11.4s, v20.4h, v25.4h
- smlal2 v8.4s, v20.8h, v25.8h
- smlal v30.4s, v20.4h, v16.4h
- smlal2 v15.4s, v20.8h, v16.8h
- ldr q16, [x2, #0x10]
- uzp1 v7.8h, v11.8h, v8.8h
- uzp1 v20.8h, v30.8h, v15.8h
- mul v7.8h, v7.8h, v2.8h
- mul v20.8h, v20.8h, v2.8h
- zip2 v9.8h, v27.8h, v10.8h
- zip1 v27.8h, v27.8h, v10.8h
- smlal v11.4s, v7.4h, v0.4h
- smlal2 v8.4s, v7.8h, v0.8h
- smlal v30.4s, v20.4h, v0.4h
- smlal2 v15.4s, v20.8h, v0.8h
- str q27, [x0], #0x20
- uzp2 v27.8h, v11.8h, v8.8h
- stur q9, [x0, #-0x10]
- uzp2 v10.8h, v30.8h, v15.8h
- ldr q30, [x1, #0x10]
- ldr q15, [x2], #0x20
- ld1 { v9.8h }, [x3], #16
- ldr q21, [x4], #0x20
- ldur q6, [x4, #-0x10]
- ldr q1, [x5], #0x20
- ldur q12, [x5, #-0x10]
- ld1 { v24.8h }, [x6], #16
- ldr q19, [x7], #0x20
- ldur q31, [x7, #-0x10]
- ldr q17, [x8], #0x20
- ldur q18, [x8, #-0x10]
- ld1 { v25.8h }, [x9], #16
- sub x13, x13, #0x1
- cbnz x13, polyvec_basemul_acc_montgomery_cached_k3_loop
- ldr q7, [x1], #0x20
- uzp1 v20.8h, v15.8h, v16.8h
- uzp2 v15.8h, v15.8h, v16.8h
- uzp1 v23.8h, v7.8h, v30.8h
- uzp2 v11.8h, v7.8h, v30.8h
- smull2 v8.4s, v23.8h, v20.8h
- smull v5.4s, v23.4h, v20.4h
- smull2 v30.4s, v23.8h, v15.8h
- uzp1 v28.8h, v1.8h, v12.8h
- smlal2 v8.4s, v11.8h, v9.8h
- smlal v5.4s, v11.4h, v9.4h
- uzp1 v3.8h, v21.8h, v6.8h
- smull v16.4s, v23.4h, v15.4h
- smlal2 v8.4s, v3.8h, v28.8h
- smlal v5.4s, v3.4h, v28.4h
- uzp2 v29.8h, v21.8h, v6.8h
- uzp1 v7.8h, v17.8h, v18.8h
- smlal2 v8.4s, v29.8h, v24.8h
- uzp1 v14.8h, v19.8h, v31.8h
- smlal v16.4s, v11.4h, v20.4h
- smlal2 v30.4s, v11.8h, v20.8h
- smlal2 v8.4s, v14.8h, v7.8h
- uzp2 v20.8h, v1.8h, v12.8h
- uzp2 v21.8h, v19.8h, v31.8h
- smlal2 v30.4s, v3.8h, v20.8h
- smlal v16.4s, v3.4h, v20.4h
- smlal v5.4s, v29.4h, v24.4h
- uzp2 v9.8h, v17.8h, v18.8h
- smlal2 v30.4s, v29.8h, v28.8h
- smlal v16.4s, v29.4h, v28.4h
- smlal v5.4s, v14.4h, v7.4h
- smlal2 v8.4s, v21.8h, v25.8h
- smlal2 v30.4s, v14.8h, v9.8h
- smlal v16.4s, v14.4h, v9.4h
- smlal v5.4s, v21.4h, v25.4h
- zip1 v20.8h, v27.8h, v10.8h
- smlal2 v30.4s, v21.8h, v7.8h
- smlal v16.4s, v21.4h, v7.4h
- uzp1 v7.8h, v5.8h, v8.8h
- str q20, [x0], #0x20
- mul v15.8h, v7.8h, v2.8h
- uzp1 v7.8h, v16.8h, v30.8h
- zip2 v31.8h, v27.8h, v10.8h
- mul v20.8h, v7.8h, v2.8h
- smlal v5.4s, v15.4h, v0.4h
- smlal2 v8.4s, v15.8h, v0.8h
- stur q31, [x0, #-0x10]
- smlal2 v30.4s, v20.8h, v0.8h
- smlal v16.4s, v20.4h, v0.4h
- uzp2 v15.8h, v5.8h, v8.8h
- uzp2 v20.8h, v16.8h, v30.8h
- zip1 v7.8h, v15.8h, v20.8h
- zip2 v20.8h, v15.8h, v20.8h
- str q7, [x0], #0x20
- stur q20, [x0, #-0x10]
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
- ldp d12, d13, [sp, #0x20]
- ldp d14, d15, [sp, #0x30]
- add sp, sp, #0x40
+Lpolyvec_basemul_acc_montgomery_cached_k3_loop_start:
+ uzp1 v6.8h, v27.8h, v11.8h
+ smlal v26.4s, v29.4h, v24.4h
+ uzp2 v16.8h, v25.8h, v3.8h
+ smlal v26.4s, v31.4h, v9.4h
+ ldr q3, [x7, #0x10]
+ smlal v26.4s, v13.4h, v6.4h
+ smlal2 v8.4s, v14.8h, v0.8h
+ ldr q27, [x8], #0x20
+ smlal v18.4s, v14.4h, v0.4h
+ ldr q25, [x7], #0x20
+ smlal2 v23.4s, v13.8h, v6.8h
+ ldr q11, [x1], #0x20
+ smlal2 v23.4s, v16.8h, v4.8h
+ smlal v26.4s, v16.4h, v4.4h
+ ldur q22, [x1, #-0x10]
+ uzp2 v30.8h, v18.8h, v8.8h
+ smull v18.4s, v1.4h, v20.4h
+ smlal v18.4s, v28.4h, v21.4h
+ ldr q14, [x2], #0x20
+ smlal v18.4s, v29.4h, v19.4h
+ zip1 v5.8h, v7.8h, v30.8h
+ uzp1 v4.8h, v26.8h, v23.8h
+ smull2 v8.4s, v1.8h, v20.8h
+ zip2 v10.8h, v7.8h, v30.8h
+ smlal v18.4s, v31.4h, v24.4h
+ mul v12.8h, v4.8h, v2.8h
+ ldr q4, [x5, #0x10]
+ ldr q20, [x4, #0x10]
+ ldr q1, [x4], #0x20
+ ldur q30, [x2, #-0x10]
+ smlal2 v8.4s, v28.8h, v21.8h
+ smlal2 v8.4s, v29.8h, v19.8h
+ ldr q19, [x5], #0x20
+ smlal2 v8.4s, v31.8h, v24.8h
+ ld1 { v15.8h }, [x3], #16
+ uzp2 v31.8h, v1.8h, v20.8h
+ smlal v26.4s, v12.4h, v0.4h
+ smlal2 v23.4s, v12.8h, v0.8h
+ uzp1 v21.8h, v14.8h, v30.8h
+ uzp1 v29.8h, v1.8h, v20.8h
+ uzp1 v1.8h, v11.8h, v22.8h
+ smlal2 v8.4s, v13.8h, v17.8h
+ ld1 { v9.8h }, [x6], #16
+ smlal v18.4s, v13.4h, v17.4h
+ uzp1 v24.8h, v19.8h, v4.8h
+ uzp2 v7.8h, v26.8h, v23.8h
+ smull v26.4s, v1.4h, v21.4h
+ smlal v18.4s, v16.4h, v6.4h
+ uzp2 v19.8h, v19.8h, v4.8h
+ smlal2 v8.4s, v16.8h, v6.8h
+ uzp2 v28.8h, v11.8h, v22.8h
+ smull2 v23.4s, v1.8h, v21.8h
+ uzp1 v13.8h, v25.8h, v3.8h
+ smlal2 v23.4s, v28.8h, v15.8h
+ ldur q11, [x8, #-0x10]
+ smlal2 v23.4s, v29.8h, v24.8h
+ ld1 { v4.8h }, [x9], #16
+ smlal2 v23.4s, v31.8h, v9.8h
+ uzp1 v12.8h, v18.8h, v8.8h
+ uzp2 v20.8h, v14.8h, v30.8h
+ smlal v26.4s, v28.4h, v15.4h
+ str q5, [x0], #0x20
+ mul v14.8h, v12.8h, v2.8h
+ stur q10, [x0, #-0x10]
+ uzp2 v17.8h, v27.8h, v11.8h
+ subs x13, x13, #0x1
+ cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k3_loop_start
+ uzp2 v3.8h, v25.8h, v3.8h
+ smull2 v16.4s, v1.8h, v20.8h
+ smull v25.4s, v1.4h, v20.4h
+ uzp1 v22.8h, v27.8h, v11.8h
+ smlal2 v16.4s, v28.8h, v21.8h
+ smlal v25.4s, v28.4h, v21.4h
+ smlal2 v16.4s, v29.8h, v19.8h
+ smlal v25.4s, v29.4h, v19.4h
+ smlal2 v16.4s, v31.8h, v24.8h
+ smlal v25.4s, v31.4h, v24.4h
+ smlal v25.4s, v13.4h, v17.4h
+ smlal2 v16.4s, v13.8h, v17.8h
+ smlal2 v16.4s, v3.8h, v22.8h
+ smlal v25.4s, v3.4h, v22.4h
+ smlal2 v23.4s, v13.8h, v22.8h
+ smlal v26.4s, v29.4h, v24.4h
+ smlal v26.4s, v31.4h, v9.4h
+ smlal v26.4s, v13.4h, v22.4h
+ uzp1 v10.8h, v25.8h, v16.8h
+ smlal2 v23.4s, v3.8h, v4.8h
+ smlal v26.4s, v3.4h, v4.4h
+ mul v13.8h, v10.8h, v2.8h
+ smlal v18.4s, v14.4h, v0.4h
+ smlal2 v8.4s, v14.8h, v0.8h
+ uzp1 v3.8h, v26.8h, v23.8h
+ mul v24.8h, v3.8h, v2.8h
+ uzp2 v17.8h, v18.8h, v8.8h
+ smlal v25.4s, v13.4h, v0.4h
+ smlal2 v16.4s, v13.8h, v0.8h
+ zip1 v21.8h, v7.8h, v17.8h
+ zip2 v20.8h, v7.8h, v17.8h
+ smlal2 v23.4s, v24.8h, v0.8h
+ str q21, [x0], #0x20
+ smlal v26.4s, v24.4h, v0.4h
+ uzp2 v13.8h, v25.8h, v16.8h
+ stur q20, [x0, #-0x10]
+ uzp2 v23.8h, v26.8h, v23.8h
+ zip1 v18.8h, v23.8h, v13.8h
+ zip2 v13.8h, v23.8h, v13.8h
+ str q18, [x0], #0x20
+ stur q13, [x0, #-0x10]
+ ldp d8, d9, [sp]
+ .cfi_restore d8
+ .cfi_restore d9
+ ldp d10, d11, [sp, #0x10]
+ .cfi_restore d10
+ .cfi_restore d11
+ ldp d12, d13, [sp, #0x20]
+ .cfi_restore d12
+ .cfi_restore d13
+ ldp d14, d15, [sp, #0x30]
+ .cfi_restore d14
+ .cfi_restore d15
+ add sp, sp, #0x40
+ .cfi_adjust_cfa_offset -0x40
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k3)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED && \
(MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S
index 78f8693774..7c09167b17 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S
@@ -12,300 +12,357 @@
* https://eprint.iacr.org/2021/986
*/
+/*yaml
+ Name: polyvec_basemul_acc_montgomery_cached_asm_k4
+ Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=4
+ Signature: void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(int16_t r[256], const int16_t a[1024], const int16_t b[1024], const int16_t b_cache[512])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: write-only
+ c_parameter: int16_t r[256]
+ description: Output polynomial
+ x1:
+ type: buffer
+ size_bytes: 2048
+ permissions: read-only
+ c_parameter: const int16_t a[1024]
+ description: Input polynomial vector a
+ x2:
+ type: buffer
+ size_bytes: 2048
+ permissions: read-only
+ c_parameter: const int16_t b[1024]
+ description: Input polynomial vector b
+ x3:
+ type: buffer
+ size_bytes: 1024
+ permissions: read-only
+ c_parameter: const int16_t b_cache[512]
+ description: Cached values for b
+ Stack:
+ bytes: 64
+ description: saving callee-saved Neon registers
+*/
+
/* Re-implementation of asymmetric base multiplication following @[NeonNTT] */
#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_AARCH64) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
- (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k4)
- sub sp, sp, #0x40
- stp d8, d9, [sp]
- stp d10, d11, [sp, #0x10]
- stp d12, d13, [sp, #0x20]
- stp d14, d15, [sp, #0x30]
- mov w14, #0xd01 // =3329
- dup v0.8h, w14
- mov w14, #0xcff // =3327
- dup v2.8h, w14
- add x4, x1, #0x200
- add x5, x2, #0x200
- add x6, x3, #0x100
- add x7, x1, #0x400
- add x8, x2, #0x400
- add x9, x3, #0x200
- add x10, x1, #0x600
- add x11, x2, #0x600
- add x12, x3, #0x300
- mov x13, #0x10 // =16
- ldr q23, [x2, #0x10]
- ldr q19, [x2], #0x20
- ldr q17, [x5], #0x20
- uzp2 v13.8h, v19.8h, v23.8h
- uzp1 v19.8h, v19.8h, v23.8h
- ldur q23, [x5, #-0x10]
- ldr q30, [x1, #0x10]
- uzp2 v9.8h, v17.8h, v23.8h
- uzp1 v23.8h, v17.8h, v23.8h
- ldr q17, [x1], #0x20
- ldr q10, [x7, #0x10]
- uzp1 v12.8h, v17.8h, v30.8h
- uzp2 v17.8h, v17.8h, v30.8h
- smull2 v30.4s, v12.8h, v13.8h
- smull v13.4s, v12.4h, v13.4h
- smull2 v22.4s, v12.8h, v19.8h
- smull v12.4s, v12.4h, v19.4h
- smlal2 v30.4s, v17.8h, v19.8h
- smlal v13.4s, v17.4h, v19.4h
- ldr q19, [x4], #0x20
- ldur q16, [x4, #-0x10]
- ld1 { v8.8h }, [x3], #16
- uzp1 v26.8h, v19.8h, v16.8h
- uzp2 v19.8h, v19.8h, v16.8h
- smlal2 v30.4s, v26.8h, v9.8h
- smlal v13.4s, v26.4h, v9.4h
- smlal2 v22.4s, v17.8h, v8.8h
- smlal v12.4s, v17.4h, v8.4h
- smlal2 v30.4s, v19.8h, v23.8h
- smlal v13.4s, v19.4h, v23.4h
- smlal2 v22.4s, v26.8h, v23.8h
- smlal v12.4s, v26.4h, v23.4h
- ldr q23, [x7], #0x20
- ldr q17, [x8, #0x10]
- uzp1 v9.8h, v23.8h, v10.8h
- uzp2 v23.8h, v23.8h, v10.8h
- ldr q10, [x10], #0x20
- ldur q16, [x10, #-0x10]
- ld1 { v8.8h }, [x12], #16
- uzp1 v26.8h, v10.8h, v16.8h
- uzp2 v10.8h, v10.8h, v16.8h
- ld1 { v16.8h }, [x6], #16
- ldr q3, [x11, #0x10]
- smlal2 v22.4s, v19.8h, v16.8h
- smlal v12.4s, v19.4h, v16.4h
- ldr q19, [x11], #0x20
- ld1 { v16.8h }, [x9], #16
- uzp1 v4.8h, v19.8h, v3.8h
- uzp2 v19.8h, v19.8h, v3.8h
- ldr q3, [x8], #0x20
- ldr q31, [x2], #0x20
- uzp1 v6.8h, v3.8h, v17.8h
- uzp2 v17.8h, v3.8h, v17.8h
- smlal2 v22.4s, v9.8h, v6.8h
- smlal2 v30.4s, v9.8h, v17.8h
- smlal v13.4s, v9.4h, v17.4h
- smlal v12.4s, v9.4h, v6.4h
- smlal2 v22.4s, v23.8h, v16.8h
- smlal2 v30.4s, v23.8h, v6.8h
- smlal v13.4s, v23.4h, v6.4h
- smlal v12.4s, v23.4h, v16.4h
- smlal2 v22.4s, v26.8h, v4.8h
- smlal2 v30.4s, v26.8h, v19.8h
- smlal v13.4s, v26.4h, v19.4h
- smlal v12.4s, v26.4h, v4.4h
- smlal2 v22.4s, v10.8h, v8.8h
- smlal2 v30.4s, v10.8h, v4.8h
- smlal v13.4s, v10.4h, v4.4h
- smlal v12.4s, v10.4h, v8.4h
- ldur q19, [x2, #-0x10]
- uzp1 v23.8h, v13.8h, v30.8h
- uzp1 v17.8h, v12.8h, v22.8h
- mul v23.8h, v23.8h, v2.8h
- uzp2 v21.8h, v31.8h, v19.8h
- uzp1 v19.8h, v31.8h, v19.8h
- mul v17.8h, v17.8h, v2.8h
- smlal v13.4s, v23.4h, v0.4h
- smlal2 v30.4s, v23.8h, v0.8h
- ldr q23, [x5], #0x20
- smlal2 v22.4s, v17.8h, v0.8h
- uzp2 v15.8h, v13.8h, v30.8h
- smlal v12.4s, v17.4h, v0.4h
- ldur q17, [x5, #-0x10]
- ldr q13, [x1, #0x10]
- uzp2 v27.8h, v23.8h, v17.8h
- uzp1 v28.8h, v23.8h, v17.8h
- uzp2 v7.8h, v12.8h, v22.8h
- ldr q23, [x1], #0x20
- zip1 v5.8h, v7.8h, v15.8h
- ldr q3, [x7, #0x10]
- uzp1 v31.8h, v23.8h, v13.8h
- uzp2 v16.8h, v23.8h, v13.8h
- smull2 v24.4s, v31.8h, v21.8h
- ldr q6, [x8, #0x10]
- ldr q23, [x10], #0x20
- smlal2 v24.4s, v16.8h, v19.8h
- ldur q17, [x10, #-0x10]
- ld1 { v22.8h }, [x12], #16
- uzp1 v30.8h, v23.8h, v17.8h
- uzp2 v11.8h, v23.8h, v17.8h
- ldr q23, [x4], #0x20
- ldur q17, [x4, #-0x10]
- ldr q4, [x7], #0x20
- uzp1 v20.8h, v23.8h, v17.8h
- uzp2 v26.8h, v23.8h, v17.8h
- uzp1 v9.8h, v4.8h, v3.8h
- smlal2 v24.4s, v20.8h, v27.8h
- ld1 { v8.8h }, [x6], #16
- ldr q25, [x11, #0x10]
- ldr q29, [x11], #0x20
- ld1 { v12.8h }, [x9], #16
- uzp1 v10.8h, v29.8h, v25.8h
- ldr q14, [x8], #0x20
- ld1 { v23.8h }, [x3], #16
- sub x13, x13, #0x2
+ .cfi_startproc
+ sub sp, sp, #0x40
+ .cfi_adjust_cfa_offset 0x40
+ stp d8, d9, [sp]
+ .cfi_rel_offset d8, 0x0
+ .cfi_rel_offset d9, 0x8
+ stp d10, d11, [sp, #0x10]
+ .cfi_rel_offset d10, 0x10
+ .cfi_rel_offset d11, 0x18
+ stp d12, d13, [sp, #0x20]
+ .cfi_rel_offset d12, 0x20
+ .cfi_rel_offset d13, 0x28
+ stp d14, d15, [sp, #0x30]
+ .cfi_rel_offset d14, 0x30
+ .cfi_rel_offset d15, 0x38
+ mov w14, #0xd01 // =3329
+ dup v0.8h, w14
+ mov w14, #0xcff // =3327
+ dup v2.8h, w14
+ add x4, x1, #0x200
+ add x5, x2, #0x200
+ add x6, x3, #0x100
+ add x7, x1, #0x400
+ add x8, x2, #0x400
+ add x9, x3, #0x200
+ add x10, x1, #0x600
+ add x11, x2, #0x600
+ add x12, x3, #0x300
+ mov x13, #0x10 // =16
+ ldr q28, [x1], #0x20
+ ldur q5, [x1, #-0x10]
+ ldr q31, [x2], #0x20
+ ldur q27, [x2, #-0x10]
+ ldr q7, [x5], #0x20
+ ldr q10, [x4], #0x20
+ ldur q18, [x5, #-0x10]
+ ldur q9, [x4, #-0x10]
+ uzp1 v11.8h, v28.8h, v5.8h
+ uzp2 v19.8h, v28.8h, v5.8h
+ uzp2 v4.8h, v31.8h, v27.8h
+ uzp1 v1.8h, v31.8h, v27.8h
+ ldr q29, [x7], #0x20
+ ldr q28, [x8, #0x10]
+ uzp1 v24.8h, v10.8h, v9.8h
+ uzp1 v17.8h, v7.8h, v18.8h
+ uzp2 v7.8h, v7.8h, v18.8h
+ ldr q21, [x8], #0x20
+ uzp2 v27.8h, v10.8h, v9.8h
+ ldur q6, [x7, #-0x10]
+ smull v18.4s, v11.4h, v4.4h
+ ld1 { v9.8h }, [x3], #16
+ smull2 v8.4s, v11.8h, v4.8h
+ ldr q16, [x11], #0x20
+ smlal2 v8.4s, v19.8h, v1.8h
+ ldur q14, [x11, #-0x10]
+ smlal v18.4s, v19.4h, v1.4h
+ uzp1 v10.8h, v21.8h, v28.8h
+ smlal v18.4s, v24.4h, v7.4h
+ ldr q4, [x10], #0x20
+ smlal2 v8.4s, v24.8h, v7.8h
+ ld1 { v12.8h }, [x6], #16
+ smull2 v23.4s, v11.8h, v1.8h
+ uzp2 v13.8h, v29.8h, v6.8h
+ smull v26.4s, v11.4h, v1.4h
+ uzp1 v29.8h, v29.8h, v6.8h
+ smlal v26.4s, v19.4h, v9.4h
+ ldur q15, [x10, #-0x10]
+ smlal2 v23.4s, v19.8h, v9.8h
+ uzp2 v9.8h, v21.8h, v28.8h
+ smlal v18.4s, v27.4h, v17.4h
+ uzp2 v6.8h, v16.8h, v14.8h
+ uzp1 v21.8h, v16.8h, v14.8h
+ smlal2 v8.4s, v27.8h, v17.8h
+ smlal2 v8.4s, v29.8h, v9.8h
+ uzp1 v30.8h, v4.8h, v15.8h
+ uzp2 v16.8h, v4.8h, v15.8h
+ smlal v18.4s, v29.4h, v9.4h
+ smlal2 v8.4s, v13.8h, v10.8h
+ ld1 { v15.8h }, [x9], #16
+ smlal v18.4s, v13.4h, v10.4h
+ ldr q11, [x4], #0x20
+ smlal v18.4s, v30.4h, v6.4h
+ ldr q7, [x2], #0x20
+ smlal2 v8.4s, v30.8h, v6.8h
+ ld1 { v9.8h }, [x12], #16
+ smlal2 v23.4s, v24.8h, v17.8h
+ ldur q4, [x2, #-0x10]
+ smlal v26.4s, v24.4h, v17.4h
+ ldur q25, [x4, #-0x10]
+ smlal2 v8.4s, v16.8h, v21.8h
+ ldr q5, [x5], #0x20
+ smlal v18.4s, v16.4h, v21.4h
+ ldur q22, [x5, #-0x10]
+ smlal v26.4s, v27.4h, v12.4h
+ ldr q19, [x1, #0x10]
+ smlal v26.4s, v29.4h, v10.4h
+ ld1 { v20.8h }, [x3], #16
+ smlal v26.4s, v13.4h, v15.4h
+ uzp1 v24.8h, v7.8h, v4.8h
+ smlal2 v23.4s, v27.8h, v12.8h
+ uzp1 v28.8h, v18.8h, v8.8h
+ smlal v26.4s, v30.4h, v21.4h
+ uzp2 v27.8h, v11.8h, v25.8h
+ smlal2 v23.4s, v29.8h, v10.8h
+ uzp2 v31.8h, v7.8h, v4.8h
+ smlal2 v23.4s, v13.8h, v15.8h
+ uzp1 v14.8h, v5.8h, v22.8h
+ uzp1 v17.8h, v11.8h, v25.8h
+ smlal v26.4s, v16.4h, v9.4h
+ mul v29.8h, v28.8h, v2.8h
+ sub x13, x13, #0x2
-polyvec_basemul_acc_montgomery_cached_k4_loop:
- smlal2 v24.4s, v26.8h, v28.8h
- uzp2 v4.8h, v4.8h, v3.8h
- smull2 v13.4s, v31.8h, v19.8h
- ldr q3, [x2], #0x20
- uzp2 v1.8h, v29.8h, v25.8h
- smlal2 v13.4s, v16.8h, v23.8h
- ldur q17, [x2, #-0x10]
- smull v18.4s, v31.4h, v19.4h
- smlal2 v13.4s, v20.8h, v28.8h
- smull v29.4s, v31.4h, v21.4h
- ldr q21, [x5], #0x20
- smlal2 v13.4s, v26.8h, v8.8h
- smlal v29.4s, v16.4h, v19.4h
- ldur q19, [x5, #-0x10]
- smlal v18.4s, v16.4h, v23.4h
- smlal v29.4s, v20.4h, v27.4h
- uzp1 v31.8h, v14.8h, v6.8h
- uzp2 v27.8h, v21.8h, v19.8h
- smlal v18.4s, v20.4h, v28.4h
- ldr q25, [x1, #0x10]
- smlal v29.4s, v26.4h, v28.4h
- smlal v18.4s, v26.4h, v8.4h
- uzp2 v26.8h, v14.8h, v6.8h
- smlal2 v13.4s, v9.8h, v31.8h
- smlal2 v24.4s, v9.8h, v26.8h
- smlal v29.4s, v9.4h, v26.4h
- smlal v18.4s, v9.4h, v31.4h
- smlal2 v13.4s, v4.8h, v12.8h
- smlal2 v24.4s, v4.8h, v31.8h
- smlal v29.4s, v4.4h, v31.4h
- smlal v18.4s, v4.4h, v12.4h
- smlal2 v13.4s, v30.8h, v10.8h
- smlal2 v24.4s, v30.8h, v1.8h
- smlal v29.4s, v30.4h, v1.4h
- smlal v18.4s, v30.4h, v10.4h
- smlal2 v13.4s, v11.8h, v22.8h
- smlal2 v24.4s, v11.8h, v10.8h
- smlal v29.4s, v11.4h, v10.4h
- smlal v18.4s, v11.4h, v22.4h
- ldr q22, [x1], #0x20
- uzp1 v31.8h, v29.8h, v24.8h
- uzp1 v28.8h, v21.8h, v19.8h
- mul v19.8h, v31.8h, v2.8h
- uzp1 v31.8h, v22.8h, v25.8h
- uzp2 v16.8h, v22.8h, v25.8h
- uzp2 v21.8h, v3.8h, v17.8h
- smlal v29.4s, v19.4h, v0.4h
- smlal2 v24.4s, v19.8h, v0.8h
- uzp1 v19.8h, v3.8h, v17.8h
- uzp1 v26.8h, v18.8h, v13.8h
- zip2 v14.8h, v7.8h, v15.8h
- mul v23.8h, v26.8h, v2.8h
- uzp2 v15.8h, v29.8h, v24.8h
- smull2 v24.4s, v31.8h, v21.8h
- str q14, [x0, #0x10]
- ldr q3, [x7, #0x10]
- ldr q6, [x8, #0x10]
- ldr q8, [x10], #0x20
- ldur q26, [x10, #-0x10]
- ld1 { v22.8h }, [x12], #16
- uzp1 v30.8h, v8.8h, v26.8h
- uzp2 v11.8h, v8.8h, v26.8h
- ldr q8, [x4], #0x20
- ldur q26, [x4, #-0x10]
- ldr q4, [x7], #0x20
- uzp1 v20.8h, v8.8h, v26.8h
- uzp2 v26.8h, v8.8h, v26.8h
- ld1 { v8.8h }, [x6], #16
- uzp1 v9.8h, v4.8h, v3.8h
- ldr q25, [x11, #0x10]
- ldr q29, [x11], #0x20
- ld1 { v12.8h }, [x9], #16
- ldr q14, [x8], #0x20
- smlal2 v24.4s, v16.8h, v19.8h
- smlal2 v13.4s, v23.8h, v0.8h
- smlal v18.4s, v23.4h, v0.4h
- ld1 { v23.8h }, [x3], #16
- smlal2 v24.4s, v20.8h, v27.8h
- uzp2 v7.8h, v18.8h, v13.8h
- uzp1 v10.8h, v29.8h, v25.8h
- str q5, [x0], #0x20
- zip1 v5.8h, v7.8h, v15.8h
- sub x13, x13, #0x1
- cbnz x13, polyvec_basemul_acc_montgomery_cached_k4_loop
- smull2 v17.4s, v31.8h, v19.8h
- uzp2 v1.8h, v14.8h, v6.8h
- smull v18.4s, v31.4h, v21.4h
- smlal2 v24.4s, v26.8h, v28.8h
- smlal2 v17.4s, v16.8h, v23.8h
- smull v21.4s, v31.4h, v19.4h
- smlal v18.4s, v16.4h, v19.4h
- uzp2 v31.8h, v4.8h, v3.8h
- uzp1 v3.8h, v14.8h, v6.8h
- smlal v21.4s, v16.4h, v23.4h
- smlal v18.4s, v20.4h, v27.4h
- uzp2 v14.8h, v29.8h, v25.8h
- smlal2 v17.4s, v20.8h, v28.8h
- smlal v21.4s, v20.4h, v28.4h
- smlal v18.4s, v26.4h, v28.4h
- smlal2 v24.4s, v9.8h, v1.8h
- smlal2 v17.4s, v26.8h, v8.8h
- smlal v21.4s, v26.4h, v8.4h
- smlal v18.4s, v9.4h, v1.4h
- smlal2 v24.4s, v31.8h, v3.8h
- smlal2 v17.4s, v9.8h, v3.8h
- smlal v21.4s, v9.4h, v3.4h
- smlal v18.4s, v31.4h, v3.4h
- smlal2 v24.4s, v30.8h, v14.8h
- smlal2 v17.4s, v31.8h, v12.8h
- smlal v21.4s, v31.4h, v12.4h
- smlal v18.4s, v30.4h, v14.4h
- smlal2 v24.4s, v11.8h, v10.8h
- smlal2 v17.4s, v30.8h, v10.8h
- smlal v21.4s, v30.4h, v10.4h
- smlal v18.4s, v11.4h, v10.4h
- zip2 v19.8h, v7.8h, v15.8h
- smlal2 v17.4s, v11.8h, v22.8h
- smlal v21.4s, v11.4h, v22.4h
- uzp1 v23.8h, v18.8h, v24.8h
- str q19, [x0, #0x10]
- mul v19.8h, v23.8h, v2.8h
- uzp1 v23.8h, v21.8h, v17.8h
- str q5, [x0], #0x20
- mul v26.8h, v23.8h, v2.8h
- smlal v18.4s, v19.4h, v0.4h
- smlal2 v24.4s, v19.8h, v0.8h
- smlal v21.4s, v26.4h, v0.4h
- smlal2 v17.4s, v26.8h, v0.8h
- uzp2 v13.8h, v18.8h, v24.8h
- uzp2 v19.8h, v21.8h, v17.8h
- zip1 v23.8h, v19.8h, v13.8h
- zip2 v19.8h, v19.8h, v13.8h
- str q23, [x0], #0x20
- stur q19, [x0, #-0x10]
- ldp d8, d9, [sp]
- ldp d10, d11, [sp, #0x10]
- ldp d12, d13, [sp, #0x20]
- ldp d14, d15, [sp, #0x30]
- add sp, sp, #0x40
+Lpolyvec_basemul_acc_montgomery_cached_k4_loop_start:
+ smlal2 v23.4s, v30.8h, v21.8h
+ ldr q11, [x1], #0x20
+ uzp2 v15.8h, v5.8h, v22.8h
+ smlal v18.4s, v29.4h, v0.4h
+ ldr q12, [x7], #0x20
+ smlal2 v8.4s, v29.8h, v0.8h
+ ldur q3, [x7, #-0x10]
+ ldr q21, [x8], #0x20
+ uzp1 v29.8h, v11.8h, v19.8h
+ ldur q13, [x8, #-0x10]
+ uzp2 v5.8h, v11.8h, v19.8h
+ smlal2 v23.4s, v16.8h, v9.8h
+ uzp2 v28.8h, v18.8h, v8.8h
+ smull2 v8.4s, v29.8h, v31.8h
+ smlal2 v8.4s, v5.8h, v24.8h
+ uzp1 v7.8h, v12.8h, v3.8h
+ smlal2 v8.4s, v17.8h, v15.8h
+ uzp2 v11.8h, v21.8h, v13.8h
+ uzp1 v4.8h, v26.8h, v23.8h
+ smlal2 v8.4s, v27.8h, v14.8h
+ smlal2 v8.4s, v7.8h, v11.8h
+ mul v6.8h, v4.8h, v2.8h
+ ldr q19, [x11], #0x20
+ uzp2 v25.8h, v12.8h, v3.8h
+ ldr q12, [x10], #0x20
+ smull v18.4s, v29.4h, v31.4h
+ ldur q3, [x10, #-0x10]
+ smlal v18.4s, v5.4h, v24.4h
+ uzp1 v4.8h, v21.8h, v13.8h
+ smlal v18.4s, v17.4h, v15.4h
+ ldur q13, [x11, #-0x10]
+ ld1 { v1.8h }, [x6], #16
+ smlal v26.4s, v6.4h, v0.4h
+ smlal2 v23.4s, v6.8h, v0.8h
+ ld1 { v10.8h }, [x9], #16
+ smlal v18.4s, v27.4h, v14.4h
+ uzp1 v30.8h, v12.8h, v3.8h
+ smlal2 v8.4s, v25.8h, v4.8h
+ uzp2 v31.8h, v19.8h, v13.8h
+ smlal v18.4s, v7.4h, v11.4h
+ ld1 { v9.8h }, [x12], #16
+ smlal v18.4s, v25.4h, v4.4h
+ uzp1 v21.8h, v19.8h, v13.8h
+ uzp2 v16.8h, v12.8h, v3.8h
+ smlal v18.4s, v30.4h, v31.4h
+ smlal2 v8.4s, v30.8h, v31.8h
+ uzp2 v31.8h, v26.8h, v23.8h
+ smlal2 v8.4s, v16.8h, v21.8h
+ smlal v18.4s, v16.4h, v21.4h
+ zip1 v15.8h, v31.8h, v28.8h
+ ldr q19, [x1, #0x10]
+ smull2 v23.4s, v29.8h, v24.8h
+ smull v26.4s, v29.4h, v24.4h
+ ldr q3, [x2, #0x10]
+ smlal v26.4s, v5.4h, v20.4h
+ ldr q11, [x2], #0x20
+ uzp1 v6.8h, v18.8h, v8.8h
+ smlal v26.4s, v17.4h, v14.4h
+ smlal v26.4s, v27.4h, v1.4h
+ zip2 v13.8h, v31.8h, v28.8h
+ smlal v26.4s, v7.4h, v4.4h
+ str q15, [x0], #0x20
+ smlal v26.4s, v25.4h, v10.4h
+ stur q13, [x0, #-0x10]
+ mul v29.8h, v6.8h, v2.8h
+ uzp1 v24.8h, v11.8h, v3.8h
+ uzp2 v31.8h, v11.8h, v3.8h
+ ldr q11, [x4], #0x20
+ smlal2 v23.4s, v5.8h, v20.8h
+ ldur q28, [x4, #-0x10]
+ smlal2 v23.4s, v17.8h, v14.8h
+ ldr q5, [x5], #0x20
+ smlal2 v23.4s, v27.8h, v1.8h
+ ldur q22, [x5, #-0x10]
+ smlal v26.4s, v30.4h, v21.4h
+ ld1 { v20.8h }, [x3], #16
+ smlal v26.4s, v16.4h, v9.4h
+ uzp1 v17.8h, v11.8h, v28.8h
+ smlal2 v23.4s, v7.8h, v4.8h
+ uzp2 v27.8h, v11.8h, v28.8h
+ smlal2 v23.4s, v25.8h, v10.8h
+ uzp1 v14.8h, v5.8h, v22.8h
+ subs x13, x13, #0x1
+ cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k4_loop_start
+ smlal v18.4s, v29.4h, v0.4h
+ ldr q11, [x1], #0x20
+ uzp2 v28.8h, v5.8h, v22.8h
+ smlal2 v23.4s, v30.8h, v21.8h
+ smlal2 v8.4s, v29.8h, v0.8h
+ ldr q15, [x8, #0x10]
+ smlal2 v23.4s, v16.8h, v9.8h
+ ldr q21, [x8], #0x20
+ uzp1 v22.8h, v11.8h, v19.8h
+ uzp2 v12.8h, v11.8h, v19.8h
+ ldr q1, [x7, #0x10]
+ ld1 { v6.8h }, [x6], #16
+ uzp2 v3.8h, v18.8h, v8.8h
+ smull v9.4s, v22.4h, v31.4h
+ smull2 v18.4s, v22.8h, v31.8h
+ ldr q16, [x7], #0x20
+ smull v19.4s, v22.4h, v24.4h
+ uzp1 v30.8h, v21.8h, v15.8h
+ uzp2 v25.8h, v21.8h, v15.8h
+ smull2 v8.4s, v22.8h, v24.8h
+ smlal v19.4s, v12.4h, v20.4h
+ ldr q13, [x10, #0x10]
+ smlal2 v8.4s, v12.8h, v20.8h
+ uzp1 v29.8h, v16.8h, v1.8h
+ smlal2 v18.4s, v12.8h, v24.8h
+ ldr q5, [x10], #0x20
+ smlal v9.4s, v12.4h, v24.4h
+ ldr q4, [x11], #0x20
+ smlal v9.4s, v17.4h, v28.4h
+ ldur q22, [x11, #-0x10]
+ smlal2 v18.4s, v17.8h, v28.8h
+ uzp2 v16.8h, v16.8h, v1.8h
+ smlal v19.4s, v17.4h, v14.4h
+ ld1 { v28.8h }, [x9], #16
+ smlal2 v8.4s, v17.8h, v14.8h
+ uzp1 v7.8h, v5.8h, v13.8h
+ smlal v9.4s, v27.4h, v14.4h
+ uzp1 v17.8h, v4.8h, v22.8h
+ smlal2 v18.4s, v27.8h, v14.8h
+ uzp2 v12.8h, v5.8h, v13.8h
+ uzp2 v21.8h, v4.8h, v22.8h
+ smlal v19.4s, v27.4h, v6.4h
+ smlal2 v8.4s, v27.8h, v6.8h
+ ld1 { v15.8h }, [x12], #16
+ smlal v19.4s, v29.4h, v30.4h
+ uzp1 v20.8h, v26.8h, v23.8h
+ smlal v9.4s, v29.4h, v25.4h
+ smlal2 v18.4s, v29.8h, v25.8h
+ smlal2 v8.4s, v29.8h, v30.8h
+ smlal v19.4s, v16.4h, v28.4h
+ smlal2 v8.4s, v16.8h, v28.8h
+ smlal2 v18.4s, v16.8h, v30.8h
+ smlal v9.4s, v16.4h, v30.4h
+ smlal v9.4s, v7.4h, v21.4h
+ smlal2 v18.4s, v7.8h, v21.8h
+ smlal2 v8.4s, v7.8h, v17.8h
+ smlal v19.4s, v7.4h, v17.4h
+ smlal v19.4s, v12.4h, v15.4h
+ smlal2 v8.4s, v12.8h, v15.8h
+ smlal2 v18.4s, v12.8h, v17.8h
+ smlal v9.4s, v12.4h, v17.4h
+ mul v6.8h, v20.8h, v2.8h
+ uzp1 v4.8h, v19.8h, v8.8h
+ mul v17.8h, v4.8h, v2.8h
+ uzp1 v12.8h, v9.8h, v18.8h
+ smlal v26.4s, v6.4h, v0.4h
+ mul v21.8h, v12.8h, v2.8h
+ smlal2 v23.4s, v6.8h, v0.8h
+ smlal2 v8.4s, v17.8h, v0.8h
+ smlal v19.4s, v17.4h, v0.4h
+ smlal2 v18.4s, v21.8h, v0.8h
+ uzp2 v23.8h, v26.8h, v23.8h
+ smlal v9.4s, v21.4h, v0.4h
+ zip2 v12.8h, v23.8h, v3.8h
+ zip1 v22.8h, v23.8h, v3.8h
+ uzp2 v14.8h, v19.8h, v8.8h
+ uzp2 v18.8h, v9.8h, v18.8h
+ str q12, [x0, #0x10]
+ str q22, [x0], #0x20
+ zip2 v24.8h, v14.8h, v18.8h
+ zip1 v21.8h, v14.8h, v18.8h
+ str q24, [x0, #0x10]
+ str q21, [x0], #0x20
+ ldp d8, d9, [sp]
+ .cfi_restore d8
+ .cfi_restore d9
+ ldp d10, d11, [sp, #0x10]
+ .cfi_restore d10
+ .cfi_restore d11
+ ldp d12, d13, [sp, #0x20]
+ .cfi_restore d12
+ .cfi_restore d13
+ ldp d14, d15, [sp, #0x30]
+ .cfi_restore d14
+ .cfi_restore d15
+ add sp, sp, #0x40
+ .cfi_adjust_cfa_offset -0x40
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k4)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED && \
(MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/rej_uniform_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/rej_uniform_asm.S
index 6bf3b0c958..c1ad796c23 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/rej_uniform_asm.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/rej_uniform_asm.S
@@ -3,21 +3,39 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
-/*************************************************
- * Name: mlk_rej_uniform_asm
- *
- * Description: Run rejection sampling on uniform random bytes to generate
- * uniform random integers mod q
- *
- * Arguments: - int16_t *r: pointer to output buffer of MLKEM_N
- * 16-bit coefficients.
- * - const uint8_t *buf: pointer to input buffer
- * (assumed to be uniform random bytes)
- * - unsigned buflen: length of input buffer in bytes.
- * Must be a multiple of 24.
- *
- * Returns number of sampled 16-bit integers (at most MLKEM_N).
- **************************************************/
+/*yaml
+ Name: rej_uniform_asm
+ Description: Run rejection sampling on uniform random bytes to generate uniform random integers mod q
+ Signature: uint64_t mlk_rej_uniform_asm(int16_t r[256], const uint8_t *buf, unsigned buflen, const uint8_t table[2048])
+ ABI:
+ x0:
+ type: buffer
+ size_bytes: 512
+ permissions: write-only
+ c_parameter: int16_t r[256]
+ description: Output buffer
+ x1:
+ type: buffer
+ size_bytes: x2
+ permissions: read-only
+ c_parameter: const uint8_t *buf
+ description: Input buffer
+ x2:
+ type: scalar
+ c_parameter: unsigned buflen
+ description: Length of input buffer (must be multiple of 24)
+ test_with: 504 # MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE
+ x3:
+ type: buffer
+ size_bytes: 2048
+ permissions: read-only
+ c_parameter: const uint8_t table[2048]
+ description: Lookup table
+ Stack:
+ bytes: 576
+ description: register preservation and temporary storage
+*/
+
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_AARCH64) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
@@ -27,173 +45,182 @@
* dev/aarch64_opt/src/rej_uniform_asm.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(rej_uniform_asm)
MLK_ASM_FN_SYMBOL(rej_uniform_asm)
- sub sp, sp, #0x240
- mov x7, #0x1 // =1
- movk x7, #0x2, lsl #16
- movk x7, #0x4, lsl #32
- movk x7, #0x8, lsl #48
- mov v31.d[0], x7
- mov x7, #0x10 // =16
- movk x7, #0x20, lsl #16
- movk x7, #0x40, lsl #32
- movk x7, #0x80, lsl #48
- mov v31.d[1], x7
- mov w11, #0xd01 // =3329
- dup v30.8h, w11
- mov x8, sp
- mov x7, x8
- mov x11, #0x0 // =0
- eor v16.16b, v16.16b, v16.16b
+ .cfi_startproc
+ sub sp, sp, #0x240
+ .cfi_adjust_cfa_offset 0x240
+ mov x7, #0x1 // =1
+ movk x7, #0x2, lsl #16
+ movk x7, #0x4, lsl #32
+ movk x7, #0x8, lsl #48
+ mov v31.d[0], x7
+ mov x7, #0x10 // =16
+ movk x7, #0x20, lsl #16
+ movk x7, #0x40, lsl #32
+ movk x7, #0x80, lsl #48
+ mov v31.d[1], x7
+ mov w11, #0xd01 // =3329
+ dup v30.8h, w11
+ mov x8, sp
+ mov x7, x8
+ mov x11, #0x0 // =0
+ eor v16.16b, v16.16b, v16.16b
-rej_uniform_initial_zero:
- str q16, [x7], #0x40
- stur q16, [x7, #-0x30]
- stur q16, [x7, #-0x20]
- stur q16, [x7, #-0x10]
- add x11, x11, #0x20
- cmp x11, #0x100
- b.lt rej_uniform_initial_zero
- mov x7, x8
- mov x9, #0x0 // =0
- mov x4, #0x100 // =256
- cmp x2, #0x30
- b.lo rej_uniform_loop48_end
+Lrej_uniform_initial_zero:
+ str q16, [x7], #0x40
+ stur q16, [x7, #-0x30]
+ stur q16, [x7, #-0x20]
+ stur q16, [x7, #-0x10]
+ add x11, x11, #0x20
+ cmp x11, #0x100
+ b.lt Lrej_uniform_initial_zero
+ mov x7, x8
+ mov x9, #0x0 // =0
+ mov x4, #0x100 // =256
+ cmp x2, #0x30
+ b.lo Lrej_uniform_loop48_end
-rej_uniform_loop48:
- cmp x9, x4
- b.hs rej_uniform_memory_copy
- sub x2, x2, #0x30
- ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48
- zip1 v4.16b, v0.16b, v1.16b
- zip2 v5.16b, v0.16b, v1.16b
- zip1 v6.16b, v1.16b, v2.16b
- zip2 v7.16b, v1.16b, v2.16b
- bic v4.8h, #0xf0, lsl #8
- bic v5.8h, #0xf0, lsl #8
- ushr v6.8h, v6.8h, #0x4
- ushr v7.8h, v7.8h, #0x4
- zip1 v16.8h, v4.8h, v6.8h
- zip2 v17.8h, v4.8h, v6.8h
- zip1 v18.8h, v5.8h, v7.8h
- zip2 v19.8h, v5.8h, v7.8h
- cmhi v4.8h, v30.8h, v16.8h
- cmhi v5.8h, v30.8h, v17.8h
- cmhi v6.8h, v30.8h, v18.8h
- cmhi v7.8h, v30.8h, v19.8h
- and v4.16b, v4.16b, v31.16b
- and v5.16b, v5.16b, v31.16b
- and v6.16b, v6.16b, v31.16b
- and v7.16b, v7.16b, v31.16b
- uaddlv s20, v4.8h
- uaddlv s21, v5.8h
- uaddlv s22, v6.8h
- uaddlv s23, v7.8h
- fmov w12, s20
- fmov w13, s21
- fmov w14, s22
- fmov w15, s23
- ldr q24, [x3, x12, lsl #4]
- ldr q25, [x3, x13, lsl #4]
- ldr q26, [x3, x14, lsl #4]
- ldr q27, [x3, x15, lsl #4]
- cnt v4.16b, v4.16b
- cnt v5.16b, v5.16b
- cnt v6.16b, v6.16b
- cnt v7.16b, v7.16b
- uaddlv s20, v4.8h
- uaddlv s21, v5.8h
- uaddlv s22, v6.8h
- uaddlv s23, v7.8h
- fmov w12, s20
- fmov w13, s21
- fmov w14, s22
- fmov w15, s23
- tbl v16.16b, { v16.16b }, v24.16b
- tbl v17.16b, { v17.16b }, v25.16b
- tbl v18.16b, { v18.16b }, v26.16b
- tbl v19.16b, { v19.16b }, v27.16b
- str q16, [x7]
- add x7, x7, x12, lsl #1
- str q17, [x7]
- add x7, x7, x13, lsl #1
- str q18, [x7]
- add x7, x7, x14, lsl #1
- str q19, [x7]
- add x7, x7, x15, lsl #1
- add x12, x12, x13
- add x14, x14, x15
- add x9, x9, x12
- add x9, x9, x14
- cmp x2, #0x30
- b.hs rej_uniform_loop48
+Lrej_uniform_loop48:
+ cmp x9, x4
+ b.hs Lrej_uniform_memory_copy
+ sub x2, x2, #0x30
+ ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48
+ zip1 v4.16b, v0.16b, v1.16b
+ zip2 v5.16b, v0.16b, v1.16b
+ zip1 v6.16b, v1.16b, v2.16b
+ zip2 v7.16b, v1.16b, v2.16b
+ bic v4.8h, #0xf0, lsl #8
+ bic v5.8h, #0xf0, lsl #8
+ ushr v6.8h, v6.8h, #0x4
+ ushr v7.8h, v7.8h, #0x4
+ zip1 v16.8h, v4.8h, v6.8h
+ zip2 v17.8h, v4.8h, v6.8h
+ zip1 v18.8h, v5.8h, v7.8h
+ zip2 v19.8h, v5.8h, v7.8h
+ cmhi v4.8h, v30.8h, v16.8h
+ cmhi v5.8h, v30.8h, v17.8h
+ cmhi v6.8h, v30.8h, v18.8h
+ cmhi v7.8h, v30.8h, v19.8h
+ and v4.16b, v4.16b, v31.16b
+ and v5.16b, v5.16b, v31.16b
+ and v6.16b, v6.16b, v31.16b
+ and v7.16b, v7.16b, v31.16b
+ uaddlv s20, v4.8h
+ uaddlv s21, v5.8h
+ uaddlv s22, v6.8h
+ uaddlv s23, v7.8h
+ fmov w12, s20
+ fmov w13, s21
+ fmov w14, s22
+ fmov w15, s23
+ ldr q24, [x3, x12, lsl #4]
+ ldr q25, [x3, x13, lsl #4]
+ ldr q26, [x3, x14, lsl #4]
+ ldr q27, [x3, x15, lsl #4]
+ cnt v4.16b, v4.16b
+ cnt v5.16b, v5.16b
+ cnt v6.16b, v6.16b
+ cnt v7.16b, v7.16b
+ uaddlv s20, v4.8h
+ uaddlv s21, v5.8h
+ uaddlv s22, v6.8h
+ uaddlv s23, v7.8h
+ fmov w12, s20
+ fmov w13, s21
+ fmov w14, s22
+ fmov w15, s23
+ tbl v16.16b, { v16.16b }, v24.16b
+ tbl v17.16b, { v17.16b }, v25.16b
+ tbl v18.16b, { v18.16b }, v26.16b
+ tbl v19.16b, { v19.16b }, v27.16b
+ st1 { v16.8h }, [x7]
+ add x7, x7, x12, lsl #1
+ st1 { v17.8h }, [x7]
+ add x7, x7, x13, lsl #1
+ st1 { v18.8h }, [x7]
+ add x7, x7, x14, lsl #1
+ st1 { v19.8h }, [x7]
+ add x7, x7, x15, lsl #1
+ add x12, x12, x13
+ add x14, x14, x15
+ add x9, x9, x12
+ add x9, x9, x14
+ cmp x2, #0x30
+ b.hs Lrej_uniform_loop48
-rej_uniform_loop48_end:
- cmp x9, x4
- b.hs rej_uniform_memory_copy
- cmp x2, #0x18
- b.lo rej_uniform_memory_copy
- sub x2, x2, #0x18
- ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24
- zip1 v4.16b, v0.16b, v1.16b
- zip1 v5.16b, v1.16b, v2.16b
- bic v4.8h, #0xf0, lsl #8
- ushr v5.8h, v5.8h, #0x4
- zip1 v16.8h, v4.8h, v5.8h
- zip2 v17.8h, v4.8h, v5.8h
- cmhi v4.8h, v30.8h, v16.8h
- cmhi v5.8h, v30.8h, v17.8h
- and v4.16b, v4.16b, v31.16b
- and v5.16b, v5.16b, v31.16b
- uaddlv s20, v4.8h
- uaddlv s21, v5.8h
- fmov w12, s20
- fmov w13, s21
- ldr q24, [x3, x12, lsl #4]
- ldr q25, [x3, x13, lsl #4]
- cnt v4.16b, v4.16b
- cnt v5.16b, v5.16b
- uaddlv s20, v4.8h
- uaddlv s21, v5.8h
- fmov w12, s20
- fmov w13, s21
- tbl v16.16b, { v16.16b }, v24.16b
- tbl v17.16b, { v17.16b }, v25.16b
- str q16, [x7]
- add x7, x7, x12, lsl #1
- str q17, [x7]
- add x7, x7, x13, lsl #1
- add x9, x9, x12
- add x9, x9, x13
+Lrej_uniform_loop48_end:
+ cmp x9, x4
+ b.hs Lrej_uniform_memory_copy
+ cmp x2, #0x18
+ b.lo Lrej_uniform_memory_copy
+ sub x2, x2, #0x18
+ ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24
+ zip1 v4.16b, v0.16b, v1.16b
+ zip1 v5.16b, v1.16b, v2.16b
+ bic v4.8h, #0xf0, lsl #8
+ ushr v5.8h, v5.8h, #0x4
+ zip1 v16.8h, v4.8h, v5.8h
+ zip2 v17.8h, v4.8h, v5.8h
+ cmhi v4.8h, v30.8h, v16.8h
+ cmhi v5.8h, v30.8h, v17.8h
+ and v4.16b, v4.16b, v31.16b
+ and v5.16b, v5.16b, v31.16b
+ uaddlv s20, v4.8h
+ uaddlv s21, v5.8h
+ fmov w12, s20
+ fmov w13, s21
+ ldr q24, [x3, x12, lsl #4]
+ ldr q25, [x3, x13, lsl #4]
+ cnt v4.16b, v4.16b
+ cnt v5.16b, v5.16b
+ uaddlv s20, v4.8h
+ uaddlv s21, v5.8h
+ fmov w12, s20
+ fmov w13, s21
+ tbl v16.16b, { v16.16b }, v24.16b
+ tbl v17.16b, { v17.16b }, v25.16b
+ st1 { v16.8h }, [x7]
+ add x7, x7, x12, lsl #1
+ st1 { v17.8h }, [x7]
+ add x7, x7, x13, lsl #1
+ add x9, x9, x12
+ add x9, x9, x13
-rej_uniform_memory_copy:
- cmp x9, x4
- csel x9, x9, x4, lo
- mov x11, #0x0 // =0
- mov x7, x8
+Lrej_uniform_memory_copy:
+ cmp x9, x4
+ csel x9, x9, x4, lo
+ mov x11, #0x0 // =0
+ mov x7, x8
-rej_uniform_final_copy:
- ldr q16, [x7], #0x40
- ldur q17, [x7, #-0x30]
- ldur q18, [x7, #-0x20]
- ldur q19, [x7, #-0x10]
- str q16, [x0], #0x40
- stur q17, [x0, #-0x30]
- stur q18, [x0, #-0x20]
- stur q19, [x0, #-0x10]
- add x11, x11, #0x20
- cmp x11, #0x100
- b.lt rej_uniform_final_copy
- mov x0, x9
- b rej_uniform_return
+Lrej_uniform_final_copy:
+ ldr q16, [x7], #0x40
+ ldur q17, [x7, #-0x30]
+ ldur q18, [x7, #-0x20]
+ ldur q19, [x7, #-0x10]
+ str q16, [x0], #0x40
+ stur q17, [x0, #-0x30]
+ stur q18, [x0, #-0x20]
+ stur q19, [x0, #-0x10]
+ add x11, x11, #0x20
+ cmp x11, #0x100
+ b.lt Lrej_uniform_final_copy
+ mov x0, x9
+ b Lrej_uniform_return
-rej_uniform_return:
- add sp, sp, #0x240
+Lrej_uniform_return:
+ add sp, sp, #0x240
+ .cfi_adjust_cfa_offset -0x240
ret
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(rej_uniform_asm)
#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/rej_uniform_table.c
index 74a931bc4a..9a7bc210a4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/aarch64/src/rej_uniform_table.c
@@ -5,6 +5,7 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
@@ -13,7 +14,6 @@
#if defined(MLK_ARITH_BACKEND_AARCH64) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include <stdint.h>
#include "arith_native_aarch64.h"
/*
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/api.h
index aea28a3af4..0308f2bd51 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/api.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/api.h
@@ -17,10 +17,18 @@
* and run sanity checks.
*/
-#include <stdint.h>
#include "../cbmc.h"
#include "../common.h"
+/* Backends must return MLK_NATIVE_FUNC_SUCCESS upon success. */
+#define MLK_NATIVE_FUNC_SUCCESS (0)
+/* Backends may return MLK_NATIVE_FUNC_FALLBACK to signal to the frontend that
+ * the target/parameters are unsupported; typically, this would be because of
+ * dependencies on CPU features not detected on the host CPU. In this case,
+ * the frontend falls back to the default C implementation. */
+#define MLK_NATIVE_FUNC_FALLBACK (-1)
+
+
/* Absolute exclusive upper bound for the output of the inverse NTT
*
* NOTE: This is the same bound as in poly.h and has to be kept
@@ -74,12 +82,16 @@
*
* Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_ntt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_ntt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_NTT */
@@ -140,11 +152,14 @@ __contract__(
*
* Arguments: - uint16_t *a: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_intt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_intt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_INTT */
@@ -156,11 +171,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_reduce_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_reduce_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
@@ -173,11 +191,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_tomont_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tomont_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
@@ -203,13 +224,15 @@ __contract__(
* OUTPUT
* - cache: pointer to multiplication cache
**************************************************/
-static MLK_INLINE void mlk_poly_mulcache_compute_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_mulcache_compute_native(
int16_t cache[MLKEM_N / 2], const int16_t mlk_poly[MLKEM_N])
__contract__(
requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
- assigns(object_whole(cache))
- ensures(array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
+ assigns(memory_slice(cache, sizeof(int16_t) * (MLKEM_N / 2)))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
@@ -234,7 +257,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
__contract__(
@@ -244,6 +268,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 2 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 2 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
@@ -267,7 +292,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
__contract__(
@@ -277,6 +303,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 3 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 3 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
@@ -300,7 +327,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
__contract__(
@@ -310,6 +338,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 4 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 4 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
@@ -324,18 +353,20 @@ __contract__(
*
* Arguments: INPUT:
* - a: const pointer to input polynomial,
- * with each coefficient in the range -Q+1 .. Q-1
+ * with each coefficient in the range 0 .. Q-1
* OUTPUT
* - r: pointer to output byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+ const int16_t a[MLKEM_N])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
);
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
@@ -353,13 +384,15 @@ __contract__(
* - a: const pointer to input byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_frombytes_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_frombytes_native(
int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(a, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
);
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
@@ -381,6 +414,7 @@ __contract__(
* Otherwise, returns non-negative number of sampled 16-bit integers (at most
* len).
**************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
const uint8_t *buf,
unsigned buflen)
@@ -389,8 +423,10 @@ __contract__(
requires(memory_no_alias(r, sizeof(int16_t) * len))
requires(memory_no_alias(buf, buflen))
assigns(memory_slice(r, sizeof(int16_t) * len))
- ensures(return_value == -1 || (0 <= return_value && return_value <= len))
- ensures(return_value != -1 ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> (0 <= return_value && return_value <= len))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
@@ -408,8 +444,15 @@ __contract__(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d4_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d4_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
@@ -425,8 +468,15 @@ static MLK_INLINE void mlk_poly_compress_d4_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d10_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d10_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
@@ -444,8 +494,15 @@ static MLK_INLINE void mlk_poly_compress_d10_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d4_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d4_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
@@ -463,8 +520,15 @@ static MLK_INLINE void mlk_poly_decompress_d4_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d10_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d10_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
@@ -482,8 +546,15 @@ static MLK_INLINE void mlk_poly_decompress_d10_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d5_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d5_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
@@ -499,8 +570,15 @@ static MLK_INLINE void mlk_poly_compress_d5_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d11_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d11_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
@@ -518,8 +596,15 @@ static MLK_INLINE void mlk_poly_compress_d11_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d5_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d5_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
@@ -537,8 +622,15 @@ static MLK_INLINE void mlk_poly_decompress_d5_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d11_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d11_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/meta.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/meta.h
index f2b9b848b7..4291d629b1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/meta.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/native/meta.h
@@ -18,4 +18,8 @@
#include "x86_64/meta.h"
#endif
+#if defined(MLK_SYS_RISCV64_RVV)
+#include "riscv64/meta.h"
+#endif
+
#endif /* !MLK_NATIVE_META_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/params.h
index 3f81bb0e2e..04598539c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/params.h
@@ -5,12 +5,6 @@
#ifndef MLK_PARAMS_H
#define MLK_PARAMS_H
-#if defined(MLK_CONFIG_FILE)
-#include MLK_CONFIG_FILE
-#else
-#include "config.h"
-#endif
-
#if !defined(MLK_CONFIG_PARAMETER_SET)
#error MLK_CONFIG_PARAMETER_SET is not defined
#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly.c
index 40d29948c8..564d5d712b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly.c
@@ -20,8 +20,7 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include <stdint.h>
-#include <string.h>
+
#include "cbmc.h"
#include "debug.h"
#include "poly.h"
@@ -29,9 +28,6 @@
#include "symmetric.h"
#include "verify.h"
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT) || \
- !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_fqmul
*
@@ -68,10 +64,7 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_TOMONT || !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE \
- || !MLK_USE_NATIVE_NTT || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_barrett_reduce
*
@@ -107,7 +100,7 @@ __contract__(
* Here, we assume it's sign-preserving "arithmetic" shift right.
* See (C99 6.5.7 (5))
*/
- const int32_t t = (magic * a + (1 << 25)) >> 26;
+ const int32_t t = (magic * a + ((int32_t)1 << 25)) >> 26;
/*
* t is in -10 .. +10, so we need 32-bit math to
@@ -118,12 +111,14 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q_HALF);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_REDUCE || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT)
/* Reference: `poly_tomont()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_tomont(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_tomont_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+)
{
unsigned i;
const int16_t f = 1353; /* check-magic: 1353 == signed_mod(2^32, MLKEM_Q) */
@@ -137,16 +132,23 @@ void mlk_poly_tomont(mlk_poly *r)
mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_TOMONT */
+
MLK_INTERNAL_API
void mlk_poly_tomont(mlk_poly *r)
{
- mlk_poly_tomont_native(r->coeffs);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_TOMONT)
+ int ret;
+ ret = mlk_poly_tomont_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE)
+ mlk_poly_tomont_c(r);
+}
+
/************************************************************
* Name: mlk_scalar_signed_to_unsigned_q
*
@@ -162,7 +164,7 @@ void mlk_poly_tomont(mlk_poly *r)
* - Used here to implement different semantics of `poly_reduce()`;
* see below. in the reference implementation @[REF], this logic is
* part of all compression functions (see `compress.c`). */
-static MLK_INLINE uint16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
+static MLK_INLINE int16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
__contract__(
requires(c > -MLKEM_Q && c < MLKEM_Q)
ensures(return_value >= 0 && return_value < MLKEM_Q)
@@ -170,12 +172,14 @@ __contract__(
{
mlk_assert_abs_bound(&c, 1, MLKEM_Q);
- /* Add Q if c is negative, but in constant time */
- c = mlk_ct_sel_int16(c + MLKEM_Q, c, mlk_ct_cmask_neg_i16(c));
+ /* Add MLKEM_Q if c is negative, but in constant time.
+ *
+ * Note that c + MLKEM_Q does not overflow in int16_t,
+ * so the cast to uint16_t is safe. */
+ c = mlk_ct_sel_int16((int16_t)(c + MLKEM_Q), c, mlk_ct_cmask_neg_i16(c));
- /* and therefore cast to uint16_t is safe. */
mlk_assert_bound(&c, 1, 0, MLKEM_Q);
- return (uint16_t)c;
+ return c;
}
/* Reference: `poly_reduce()` in the reference implementation @[REF]
@@ -185,10 +189,15 @@ __contract__(
* here to go from signed to unsigned representatives.
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
-MLK_INTERNAL_API
-void mlk_poly_reduce(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_reduce_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
+
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
@@ -202,15 +211,23 @@ void mlk_poly_reduce(mlk_poly *r)
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_REDUCE */
+
MLK_INTERNAL_API
void mlk_poly_reduce(mlk_poly *r)
{
- mlk_poly_reduce_native(r->coeffs);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_REDUCE)
+ int ret;
+ ret = mlk_poly_reduce_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
+ mlk_poly_reduce_c(r);
+}
+
/* Reference: `poly_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
@@ -224,7 +241,8 @@ void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] + b->coeffs[i];
+ /* The preconditions imply that the addition stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] + b->coeffs[i]);
}
}
@@ -241,24 +259,24 @@ void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
+ /* The preconditions imply that the subtraction stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] - b->coeffs[i]);
}
}
-/* Include zeta table unless NTT, invNTT and mulcache computation
- * have been replaced by native implementations. */
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
#include "zetas.inc"
-#endif
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
-MLK_INTERNAL_API
-void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_mulcache_compute_c(mlk_poly_mulcache *x,
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 4; i++)
@@ -266,8 +284,11 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
invariant(i <= MLKEM_N / 4)
invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
{
- x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
- x->coeffs[2 * i + 1] = mlk_fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
+ x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], mlk_zetas[64 + i]);
+ /* The values in zeta table are <= MLKEM_Q in absolute value,
+ * so the negation in int16_t is safe. */
+ x->coeffs[2 * i + 1] =
+ mlk_fqmul(a->coeffs[4 * i + 3], (int16_t)(-mlk_zetas[64 + i]));
}
/*
@@ -278,15 +299,22 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
*/
mlk_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
MLK_INTERNAL_API
void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
{
- mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
-}
+#if defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+ int ret;
+ ret = mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-#if !defined(MLK_USE_NATIVE_NTT)
+ mlk_poly_mulcache_compute_c(x, a);
+}
+
/*
* Computes a block CT butterflies with a fixed twiddle factor,
* using Montgomery multiplication.
@@ -316,7 +344,8 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
static void mlk_ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
- unsigned start, unsigned len, int bound)
+ unsigned start, unsigned len,
+ unsigned bound)
__contract__(
requires(start < MLKEM_N)
requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
@@ -346,8 +375,9 @@ __contract__(
{
int16_t t;
t = mlk_fqmul(r[j + len], zeta);
- r[j + len] = r[j] - t;
- r[j] = r[j] + t;
+ /* The precondition implies that the arithmetic does not overflow. */
+ r[j + len] = (int16_t)(r[j] - t);
+ r[j] = (int16_t)(r[j] + t);
}
}
@@ -370,7 +400,7 @@ __contract__(
unsigned start, k, len;
/* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */
k = 1u << (layer - 1);
- len = MLKEM_N >> layer;
+ len = (unsigned)MLKEM_N >> layer;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
invariant(start < MLKEM_N + 2 * len)
@@ -378,7 +408,7 @@ __contract__(
invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
{
- int16_t zeta = zetas[k++];
+ int16_t zeta = mlk_zetas[k++];
mlk_ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
}
}
@@ -395,12 +425,19 @@ __contract__(
/* Reference: `ntt()` in the reference implementation @[REF].
* - Iterate over `layer` instead of `len` in the outer loop
* to simplify computation of zeta index. */
-MLK_INTERNAL_API
-void mlk_poly_ntt(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_ntt_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ requires(array_abs_bound(p->coeffs, 0, MLKEM_N, MLKEM_Q))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_NTT_BOUND))
+)
{
unsigned layer;
int16_t *r;
+
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+
r = p->coeffs;
for (layer = 1; layer <= 7; layer++)
@@ -414,18 +451,24 @@ void mlk_poly_ntt(mlk_poly *p)
/* Check the stronger bound */
mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_NTT */
MLK_INTERNAL_API
void mlk_poly_ntt(mlk_poly *p)
{
+#if defined(MLK_USE_NATIVE_NTT)
+ int ret;
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
- mlk_ntt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
-}
+ ret = mlk_ntt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_NTT */
-#if !defined(MLK_USE_NATIVE_INTT)
+ mlk_poly_ntt_c(p);
+}
+
/* Compute one layer of inverse NTT */
@@ -439,7 +482,7 @@ __contract__(
ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
unsigned start, k, len;
- len = (MLKEM_N >> layer);
+ len = (unsigned)MLKEM_N >> layer;
k = (1u << layer) - 1;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
@@ -449,7 +492,7 @@ __contract__(
invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
{
unsigned j;
- int16_t zeta = zetas[k--];
+ int16_t zeta = mlk_zetas[k--];
for (j = start; j < start + len; j++)
__loop__(
invariant(start <= j && j <= start + len)
@@ -457,8 +500,9 @@ __contract__(
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
int16_t t = r[j];
- r[j] = mlk_barrett_reduce(t + r[j + len]);
- r[j + len] = r[j + len] - t;
+ /* The preconditions imply that the arithmetic does not overflow. */
+ r[j] = mlk_barrett_reduce((int16_t)(t + r[j + len]));
+ r[j + len] = (int16_t)(r[j + len] - t);
r[j + len] = mlk_fqmul(r[j + len], zeta);
}
}
@@ -469,18 +513,22 @@ __contract__(
* while the reference implementation normalizes at
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
-MLK_INTERNAL_API
-void mlk_poly_invntt_tomont(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_invntt_tomont_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
+)
{
+ unsigned j, layer;
+ const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
+ int16_t *r = p->coeffs;
+
/*
* Scale input polynomial to account for Montgomery factor
* and NTT twist. This also brings coefficients down to
* absolute value < MLKEM_Q.
*/
- unsigned j, layer;
- const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
- int16_t *r = p->coeffs;
-
for (j = 0; j < MLKEM_N; j++)
__loop__(
invariant(j <= MLKEM_N)
@@ -500,16 +548,23 @@ void mlk_poly_invntt_tomont(mlk_poly *p)
mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_INTT */
MLK_INTERNAL_API
void mlk_poly_invntt_tomont(mlk_poly *p)
{
- mlk_intt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
-}
+#if defined(MLK_USE_NATIVE_INTT)
+ int ret;
+ ret = mlk_intt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_INTT */
+ mlk_poly_invntt_tomont_c(p);
+}
+
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(mlk_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly.h
index 20fb65e720..587062cce5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly.h
@@ -15,8 +15,7 @@
#ifndef MLK_POLY_H
#define MLK_POLY_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -46,34 +45,6 @@ typedef struct
int16_t coeffs[MLKEM_N >> 1];
} MLK_ALIGN mlk_poly_mulcache;
-/*************************************************
- * Name: mlk_cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- * input x in 0 .. 32767: returns value unchanged
- * input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
-{
- /*
- * PORTABILITY: This relies on uint16_t -> int16_t
- * being implemented as the inverse of int16_t -> uint16_t,
- * which is implementation-defined (C99 6.3.1.3 (3))
- * CBMC (correctly) fails to prove this conversion is OK,
- * so we have to suppress that check here
- */
- return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_montgomery_reduce
*
@@ -90,7 +61,7 @@ static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
static MLK_ALWAYS_INLINE int16_t mlk_montgomery_reduce(int32_t a)
__contract__(
requires(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
- a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
+ a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
/* We don't attempt to express an input-dependent output bound
* as the post-condition here. There are two call-sites for this
* function:
@@ -102,8 +73,8 @@ __contract__(
/* check-magic: 62209 == unsigned_mod(pow(MLKEM_Q, -1, 2^16), 2^16) */
const uint32_t QINV = 62209;
- /* Compute a*q^{-1} mod 2^16 in unsigned representatives */
- const uint16_t a_reduced = a & UINT16_MAX;
+ /* Compute a*q^{-1} mod 2^16 in unsigned representatives. */
+ const uint16_t a_reduced = mlk_cast_int32_to_uint16(a);
const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
/* Lift to signed canonical representative mod 2^16. */
@@ -187,7 +158,7 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_poly)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
);
#define mlk_poly_reduce MLK_NAMESPACE(poly_reduce)
@@ -280,7 +251,7 @@ __contract__(
requires(forall(k0, 0, MLKEM_N, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
requires(forall(k1, 0, MLKEM_N, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
ensures(forall(k, 0, MLKEM_N, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_poly_ntt MLK_NAMESPACE(poly_ntt)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly_k.c
index f15ab96ce7..32b214ee04 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly_k.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly_k.c
@@ -22,12 +22,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
+#include "poly_k.h"
-#include "compress.h"
#include "debug.h"
-#include "poly_k.h"
#include "sampling.h"
#include "symmetric.h"
@@ -37,6 +34,8 @@
* within a single compilation unit. */
#define mlk_poly_cbd_eta1 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta1)
#define mlk_poly_cbd_eta2 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta2)
+#define mlk_polyvec_basemul_acc_montgomery_cached_c \
+ MLK_ADD_PARAM_SET(mlk_polyvec_basemul_acc_montgomery_cached_c)
/* End of parameter set namespacing */
/* Reference: `polyvec_compress()` in the reference implementation @[REF]
@@ -46,29 +45,29 @@
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a[i]);
+ mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
}
}
/* Reference: `polyvec_decompress()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_decompress_du(&r[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+ mlk_poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_tobytes()` in the reference implementation @[REF].
@@ -77,41 +76,45 @@ void mlk_polyvec_decompress_du(mlk_polyvec r,
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, MLKEM_POLYVECBYTES))
+ invariant(i <= MLKEM_K)
+ )
{
- mlk_poly_tobytes(r + i * MLKEM_POLYBYTES, &a[i]);
+ mlk_poly_tobytes(&r[i * MLKEM_POLYBYTES], &a->vec[i]);
}
}
/* Reference: `polyvec_frombytes()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_frombytes(&r[i], a + i * MLKEM_POLYBYTES);
+ mlk_poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
/* Reference: `polyvec_ntt()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_ntt(&r[i]);
+ mlk_poly_ntt(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
}
/* Reference: `polyvec_invntt_tomont()` in the reference implementation @[REF].
@@ -120,18 +123,17 @@ void mlk_polyvec_ntt(mlk_polyvec r)
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_invntt_tomont(&r[i]);
+ mlk_poly_invntt_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
}
-#if !defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
/* Reference: `polyvec_basemul_acc_montgomery()` in the
* reference implementation @[REF].
* - We use a multiplication cache ('mulcache') here
@@ -143,13 +145,22 @@ void mlk_polyvec_invntt_tomont(mlk_polyvec r)
* at the end. The reference implementation uses 2 * MLKEM_K
* more modular reductions since it reduces after every modular
* multiplication. */
-MLK_INTERNAL_API
-void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+MLK_STATIC_TESTABLE void mlk_polyvec_basemul_acc_montgomery_cached_c(
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
+ requires(forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
@@ -163,53 +174,59 @@ void mlk_polyvec_basemul_acc_montgomery_cached(
t[1] <= ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
t[1] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768)))
{
- t[0] += (int32_t)a[k].coeffs[2 * i + 1] * b_cache[k].coeffs[i];
- t[0] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i];
- t[1] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i + 1];
- t[1] += (int32_t)a[k].coeffs[2 * i + 1] * b[k].coeffs[2 * i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b_cache->vec[k].coeffs[i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i + 1];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b->vec[k].coeffs[2 * i];
}
r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t[0]);
r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t[1]);
}
}
-#else /* !MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
{
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
- /* Omitting bounds assertion for cache since native implementations may
- * decide not to use a mulcache. Note that the C backend implementation
- * of poly_basemul_montgomery_cached() does still include the check. */
+#if defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+ {
+ int ret;
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
#if MLKEM_K == 2
- mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 3
- mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 4
- mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#endif
-}
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
+ }
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+ mlk_polyvec_basemul_acc_montgomery_cached_c(r, a, b, b_cache);
+}
+
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_mulcache_compute(&x[i], &a[i]);
+ mlk_poly_mulcache_compute(&x->vec[i], &a->vec[i]);
}
}
@@ -221,41 +238,53 @@ void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_reduce(&r[i]);
+ mlk_poly_reduce(&r->vec[i]);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(j0, i, MLKEM_K,
+ forall(k0, 0, MLKEM_N,
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX) &&
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] >= INT16_MIN))))
+ invariant(forall(j2, 0, i,
+ forall(k2, 0, MLKEM_N,
+ (r->vec[j2].coeffs[k2] <= INT16_MAX) &&
+ (r->vec[j2].coeffs[k2] >= INT16_MIN))))
+ )
{
- mlk_poly_add(&r[i], &b[i]);
+ mlk_poly_add(&r->vec[i], &b->vec[i]);
}
}
/* Reference: `polyvec_tomont()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_tomont(&r[i]);
+ mlk_poly_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLKEM_Q);
}
@@ -306,24 +335,41 @@ void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
{
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
+#else
+ mlk_prf_eta1(buf[0], extkey[0]);
+ mlk_prf_eta1(buf[1], extkey[1]);
+ mlk_prf_eta1(buf[2], extkey[2]);
+ if (r3 != NULL)
+ {
+ mlk_prf_eta1(buf[3], extkey[3]);
+ }
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
+
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
mlk_poly_cbd_eta1(r2, buf[2]);
- mlk_poly_cbd_eta1(r3, buf[3]);
+ if (r3 != NULL)
+ {
+ mlk_poly_cbd_eta1(r3, buf[3]);
+ mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+ }
mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
- mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -364,7 +410,7 @@ __contract__(
#endif
}
-/* Reference: `poly_getnoise_eta1()` in the reference implementation @[REF].
+/* Reference: `poly_getnoise_eta2()` in the reference implementation @[REF].
* - We include buffer zeroization. */
MLK_INTERNAL_API
void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
@@ -373,13 +419,13 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
MLK_ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
MLK_ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
- memcpy(extkey, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey, seed, MLKEM_SYMBYTES);
extkey[MLKEM_SYMBYTES] = nonce;
mlk_prf_eta2(buf, extkey);
mlk_poly_cbd_eta2(r, buf);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -391,7 +437,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
#if MLKEM_K == 2
/* Reference: Does not exist in the reference implementation @[REF].
* - This implements a x4-batched version of `poly_getnoise_eta1()`
- * and `poly_getnoise_eta1()` from the reference implementation,
+ * and `poly_getnoise_eta2()` from the reference implementation,
* leveraging batched Keccak-f1600.
* - If a x4-batched Keccak-f1600 is available, we squeeze
* more random data than needed for the eta2 calls, to be
@@ -409,10 +455,10 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
@@ -421,14 +467,16 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
/* On systems with fast batched Keccak, we use 4-fold batched PRF,
* even though that means generating more random data in buf[2] and buf[3]
* than necessary. */
-#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION)
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
#else
mlk_prf_eta1(buf[0], extkey[0]);
mlk_prf_eta1(buf[1], extkey[1]);
mlk_prf_eta2(buf[2], extkey[2]);
mlk_prf_eta2(buf[3], extkey[3]);
-#endif /* FIPS202_X4_DEFAULT_IMPLEMENTATION */
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
@@ -451,3 +499,4 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef mlk_poly_cbd_eta1
#undef mlk_poly_cbd_eta2
+#undef mlk_polyvec_basemul_acc_montgomery_cached_c
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly_k.h
index f7a40ff5f9..9089a8e431 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly_k.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/poly_k.h
@@ -15,7 +15,6 @@
#ifndef MLK_POLY_K_H
#define MLK_POLY_K_H
-#include
#include "common.h"
#include "compress.h"
#include "poly.h"
@@ -29,9 +28,20 @@
#define mlk_polyvec_mulcache MLK_ADD_PARAM_SET(mlk_polyvec_mulcache)
/* End of parameter set namespacing */
-typedef mlk_poly mlk_polyvec[MLKEM_K];
-typedef mlk_poly mlk_polymat[MLKEM_K * MLKEM_K];
-typedef mlk_poly_mulcache mlk_polyvec_mulcache[MLKEM_K];
+typedef struct
+{
+ mlk_poly vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec;
+
+typedef struct
+{
+ mlk_polyvec vec[MLKEM_K];
+} MLK_ALIGN mlk_polymat;
+
+typedef struct
+{
+ mlk_poly_mulcache vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec_mulcache;
#define mlk_poly_compress_du MLK_NAMESPACE_K(poly_compress_du)
/*************************************************
@@ -131,7 +141,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r)))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DV)))
{
#if MLKEM_DV == 4
mlk_poly_compress_d4(r, a);
@@ -168,7 +178,7 @@ static MLK_INLINE void mlk_poly_decompress_dv(
__contract__(
requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
{
#if MLKEM_DV == 4
@@ -200,13 +210,13 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
);
#define mlk_polyvec_decompress_du MLK_NAMESPACE_K(polyvec_decompress_du)
@@ -228,14 +238,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
__contract__(
requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_tobytes MLK_NAMESPACE_K(polyvec_tobytes)
@@ -256,13 +266,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECBYTES))
);
#define mlk_polyvec_frombytes MLK_NAMESPACE_K(polyvec_frombytes)
@@ -284,13 +294,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
);
#define mlk_polyvec_ntt MLK_NAMESPACE_K(polyvec_ntt)
@@ -313,14 +323,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
- assigns(object_whole(r))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
);
#define mlk_polyvec_invntt_tomont MLK_NAMESPACE_K(polyvec_invntt_tomont)
@@ -344,12 +354,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
);
#define mlk_polyvec_basemul_acc_montgomery_cached \
@@ -380,16 +390,16 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
requires(forall(k1, 0, MLKEM_K,
- array_bound(a[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(r))
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_polyvec_mulcache_compute MLK_NAMESPACE_K(polyvec_mulcache_compute)
@@ -423,11 +433,11 @@ __contract__(
* higher level safety proofs, and thus not part of the spec.
*/
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_polyvec_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_polyvec_mulcache)))
);
#define mlk_polyvec_reduce MLK_NAMESPACE_K(polyvec_reduce)
@@ -436,7 +446,7 @@ __contract__(
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials;
- * for details of the Barrett reduction see comments in reduce.c
+ * for details of the Barrett reduction see comments in poly.c
*
* Arguments: - mlk_polyvec r: pointer to input/output polynomial
*
@@ -453,12 +463,12 @@ __contract__(
* use of mlk_poly_reduce() in the context of (de)serialization.
*/
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_add MLK_NAMESPACE_K(polyvec_add)
@@ -485,17 +495,17 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(forall(j0, 0, MLKEM_K,
forall(k0, 0, MLKEM_N,
- (int32_t)r[j0].coeffs[k0] + b[j0].coeffs[k0] <= INT16_MAX)))
+ (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
requires(forall(j1, 0, MLKEM_K,
forall(k1, 0, MLKEM_N,
- (int32_t)r[j1].coeffs[k1] + b[j1].coeffs[k1] >= INT16_MIN)))
- assigns(object_whole(r))
+ (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
);
#define mlk_polyvec_tomont MLK_NAMESPACE_K(polyvec_tomont)
@@ -514,13 +524,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
assigns(memory_slice(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
);
#define mlk_poly_getnoise_eta1_4x MLK_NAMESPACE_K(poly_getnoise_eta1_4x)
@@ -531,7 +540,8 @@ __contract__(
* and nonces, with output polynomials close to centered binomial
* distribution with parameter MLKEM_ETA1.
*
- * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial
+ * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial. The last
+ * polynomial pointer may be NULL.
* - const uint8_t *seed: pointer to input seed
* (of length MLKEM_SYMBYTES bytes)
* - uint8_t nonce{0,1,2,3}: one-byte input nonce
@@ -555,16 +565,15 @@ __contract__(
requires(memory_no_alias(r0, sizeof(mlk_poly)))
requires(memory_no_alias(r1, sizeof(mlk_poly)))
requires(memory_no_alias(r2, sizeof(mlk_poly)))
- requires(memory_no_alias(r3, sizeof(mlk_poly)))
+ requires(r3 == NULL || memory_no_alias(r3, sizeof(mlk_poly)))
assigns(memory_slice(r0, sizeof(mlk_poly)))
assigns(memory_slice(r1, sizeof(mlk_poly)))
assigns(memory_slice(r2, sizeof(mlk_poly)))
- assigns(memory_slice(r3, sizeof(mlk_poly)))
- ensures(
- array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+ assigns(r3 != NULL: memory_slice(r3, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(r3 != NULL ==> array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
);
#if MLKEM_ETA1 == MLKEM_ETA2
@@ -604,7 +613,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
@@ -640,15 +649,19 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
uint8_t nonce0, uint8_t nonce1,
uint8_t nonce2, uint8_t nonce3)
__contract__(
- requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(mlk_poly)) && memory_no_alias(r2, 2 * sizeof(mlk_poly)) &&
- r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+ requires(memory_no_alias(r0, sizeof(mlk_poly)))
+ requires(memory_no_alias(r1, sizeof(mlk_poly)))
+ requires(memory_no_alias(r2, sizeof(mlk_poly)))
+ requires(memory_no_alias(r3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+ assigns(memory_slice(r0, sizeof(mlk_poly)))
+ assigns(memory_slice(r1, sizeof(mlk_poly)))
+ assigns(memory_slice(r2, sizeof(mlk_poly)))
+ assigns(memory_slice(r3, sizeof(mlk_poly)))
ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+ && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+ && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+ && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/randombytes.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/randombytes.h
index 132d920afb..3e841d26ca 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/randombytes.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/randombytes.h
@@ -5,18 +5,56 @@
#ifndef MLK_RANDOMBYTES_H
#define MLK_RANDOMBYTES_H
-#include
-#include
#include "cbmc.h"
#include "common.h"
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
#if !defined(MLK_CONFIG_CUSTOM_RANDOMBYTES)
-void randombytes(uint8_t *out, size_t outlen);
-static MLK_INLINE void mlk_randombytes(uint8_t *out, size_t outlen)
+/*************************************************
+ * Name: randombytes
+ *
+ * Description: Fill a buffer with cryptographically secure random bytes.
+ *
+ * mlkem-native does not provide an implementation of this
+ * function. It must be provided by the consumer.
+ *
+ * To use a custom random byte source with a different name
+ * or signature, set MLK_CONFIG_CUSTOM_RANDOMBYTES and define
+ * mlk_randombytes directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+int randombytes(uint8_t *out, size_t outlen);
+
+/*************************************************
+ * Name: mlk_randombytes
+ *
+ * Description: Internal wrapper around randombytes().
+ *
+ * Fill a buffer with cryptographically secure random bytes.
+ *
+ * This function can be replaced by setting
+ * MLK_CONFIG_CUSTOM_RANDOMBYTES and defining mlk_randombytes
+ * directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_randombytes(uint8_t *out, size_t outlen)
__contract__(
requires(memory_no_alias(out, outlen))
- assigns(memory_slice(out, outlen))) { randombytes(out, outlen); }
+ assigns(memory_slice(out, outlen))) { return randombytes(out, outlen); }
#endif /* !MLK_CONFIG_CUSTOM_RANDOMBYTES */
-
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
#endif /* !MLK_RANDOMBYTES_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sampling.c
index be5d931a79..945d12ed3d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sampling.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sampling.c
@@ -29,9 +29,10 @@
* in that it adds the offset and always expects the base of the
* target buffer. This avoids shifting the buffer base in the
* caller, which appears tricky to reason about. */
-static unsigned mlk_rej_uniform_scalar(int16_t *r, unsigned target,
- unsigned offset, const uint8_t *buf,
- unsigned buflen)
+MLK_STATIC_TESTABLE unsigned mlk_rej_uniform_c(int16_t *r, unsigned target,
+ unsigned offset,
+ const uint8_t *buf,
+ unsigned buflen)
__contract__(
requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
requires(memory_no_alias(r, sizeof(int16_t) * target))
@@ -39,11 +40,10 @@ __contract__(
requires(array_bound(r, 0, offset, 0, MLKEM_Q))
assigns(memory_slice(r, sizeof(int16_t) * target))
ensures(offset <= return_value && return_value <= target)
- ensures(array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
+ ensures(array_bound(r, 0, return_value, 0, MLKEM_Q)))
{
unsigned ctr, pos;
- uint16_t val0, val1;
+ int16_t val0, val1;
mlk_assert_bound(r, offset, 0, MLKEM_Q);
@@ -55,8 +55,8 @@ __contract__(
invariant(offset <= ctr && ctr <= target && pos <= buflen)
invariant(array_bound(r, 0, ctr, 0, MLKEM_Q)))
{
- val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
- val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+ val0 = ((buf[pos + 0] >> 0) | (buf[pos + 1] << 8)) & 0xFFF;
+ val1 = ((buf[pos + 1] >> 4) | (buf[pos + 2] << 4)) & 0xFFF;
pos += 3;
if (val0 < MLKEM_Q)
@@ -93,7 +93,7 @@ __contract__(
* Must be a multiple of 3.
*
* Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 128 is somewhat arbitrary but sufficient for all
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
* uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
*
* Returns the new offset of sampled 16-bit integers, at most target,
@@ -124,8 +124,9 @@ __contract__(
#if defined(MLK_USE_NATIVE_REJ_UNIFORM)
if (offset == 0)
{
- int ret = mlk_rej_uniform_native(r, target, buf, buflen);
- if (ret != -1)
+ int ret;
+ ret = mlk_rej_uniform_native(r, target, buf, buflen);
+ if (ret != MLK_NATIVE_FUNC_FALLBACK)
{
unsigned res = (unsigned)ret;
mlk_assert_bound(r, res, 0, MLKEM_Q);
@@ -134,19 +135,22 @@ __contract__(
}
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
- return mlk_rej_uniform_scalar(r, target, offset, buf, buflen);
+ return mlk_rej_uniform_c(r, target, offset, buf, buflen);
}
#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
- ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + MLK_XOF_RATE) / MLK_XOF_RATE)
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+ ((12 * MLKEM_N / 8 * ((uint32_t)1 << 12) / MLKEM_Q + MLK_XOF_RATE) / \
+ MLK_XOF_RATE)
#endif
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Reference: Does not exist in the reference implementation @[REF].
* - x4-batched version of `rej_uniform()` from the
* reference implementation, leveraging x4-batched Keccak-f1600. */
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
{
/* Temporary buffers for XOF output before rejection sampling */
@@ -167,10 +171,10 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
*/
mlk_xof_x4_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &statex);
buflen = MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE;
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, 0, buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, 0, buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, 0, buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, 0, buf[3], buflen);
/*
* So long as not all matrix entries have been generated, squeeze
@@ -180,20 +184,24 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
ctr[3] < MLKEM_N)
__loop__(
- assigns(ctr, statex, memory_slice(vec, sizeof(mlk_poly) * 4), object_whole(buf[0]),
- object_whole(buf[1]), object_whole(buf[2]), object_whole(buf[3]))
+ assigns(ctr, statex,
+ memory_slice(vec0, sizeof(mlk_poly)),
+ memory_slice(vec1, sizeof(mlk_poly)),
+ memory_slice(vec2, sizeof(mlk_poly)),
+ memory_slice(vec3, sizeof(mlk_poly)),
+ object_whole(buf))
invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
- invariant(array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
- invariant(array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
- invariant(array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
- invariant(array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+ invariant(array_bound(vec0->coeffs, 0, ctr[0], 0, MLKEM_Q))
+ invariant(array_bound(vec1->coeffs, 0, ctr[1], 0, MLKEM_Q))
+ invariant(array_bound(vec2->coeffs, 0, ctr[2], 0, MLKEM_Q))
+ invariant(array_bound(vec3->coeffs, 0, ctr[3], 0, MLKEM_Q)))
{
mlk_xof_x4_squeezeblocks(buf, 1, &statex);
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, ctr[0], buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, ctr[1], buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, ctr[2], buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, ctr[3], buf[3], buflen);
}
mlk_xof_x4_release(&statex);
@@ -202,6 +210,7 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
}
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
MLK_INTERNAL_API
void mlk_poly_rej_uniform(mlk_poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
@@ -284,7 +293,7 @@ void mlk_poly_cbd2(mlk_poly *r, const uint8_t buf[2 * MLKEM_N / 4])
{
const int16_t a = (d >> (4 * j + 0)) & 0x3;
const int16_t b = (d >> (4 * j + 2)) & 0x3;
- r->coeffs[8 * i + j] = a - b;
+ r->coeffs[8 * i + j] = (int16_t)(a - b);
}
}
}
@@ -336,7 +345,7 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4])
{
const int16_t a = (d >> (6 * j + 0)) & 0x7;
const int16_t b = (d >> (6 * j + 3)) & 0x7;
- r->coeffs[4 * i + j] = a - b;
+ r->coeffs[4 * i + j] = (int16_t)(a - b);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sampling.h
index 2cf43c889b..24c26b34a5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sampling.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sampling.h
@@ -15,8 +15,6 @@
#ifndef MLK_SAMPLING_H
#define MLK_SAMPLING_H
-#include
-#include
#include "cbmc.h"
#include "common.h"
#include "poly.h"
@@ -58,6 +56,7 @@ MLK_INTERNAL_API
void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_ETA1 == 3 */
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#define mlk_poly_rej_uniform_x4 MLK_NAMESPACE(poly_rej_uniform_x4)
/*************************************************
* Name: mlk_poly_rej_uniform_x4
@@ -65,8 +64,8 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
* Description: Generate four polynomials using rejection sampling
* on (pseudo-)uniformly random bytes sampled from a seed.
*
- * Arguments: - mlk_poly *vec:
- * Pointer to an array of 4 polynomials to be sampled.
+ * Arguments: - mlk_poly *vec0, *vec1, *vec2, *vec3:
+ * Pointers to 4 polynomials to be sampled.
* - uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)]:
* Pointer consecutive array of seed buffers of size
* MLKEM_SYMBYTES + 2 each, plus padding for alignment.
@@ -75,16 +74,24 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
*
**************************************************/
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
__contract__(
- requires(memory_no_alias(vec, sizeof(mlk_poly) * 4))
+ requires(memory_no_alias(vec0, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec1, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec2, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, 4 * MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)))
- assigns(memory_slice(vec, sizeof(mlk_poly) * 4))
- ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+ assigns(memory_slice(vec0, sizeof(mlk_poly)))
+ assigns(memory_slice(vec1, sizeof(mlk_poly)))
+ assigns(memory_slice(vec2, sizeof(mlk_poly)))
+ assigns(memory_slice(vec3, sizeof(mlk_poly)))
+ ensures(array_bound(vec0->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec1->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec2->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec3->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
#define mlk_poly_rej_uniform MLK_NAMESPACE(poly_rej_uniform)
/*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/symmetric.h
index 985bfeab37..68d7e1a0cd 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/symmetric.h
@@ -15,12 +15,13 @@
#ifndef MLK_SYMMETRIC_H
#define MLK_SYMMETRIC_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include MLK_FIPS202_HEADER_FILE
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#include MLK_FIPS202X4_HEADER_FILE
+#endif
/* Macros denoting FIPS 203 specific Hash functions */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sys.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sys.h
index 8f690cc553..0ab8947318 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sys.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/sys.h
@@ -20,6 +20,15 @@
#error "__BYTE_ORDER__ defined, but don't recognize value."
#endif
#endif /* __BYTE_ORDER__ */
+
+/* MSVC does not define __BYTE_ORDER__. However, MSVC only supports
+ * little endian x86, x86_64, and AArch64. It is, hence, safe to assume
+ * little endian. */
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || \
+ defined(_M_IX86) || defined(_M_ARM64))
+#define MLK_SYS_LITTLE_ENDIAN
+#endif
+
#endif /* !MLK_SYS_LITTLE_ENDIAN && !MLK_SYS_BIG_ENDIAN */
/* Check if we're running on an AArch64 little endian system. _M_ARM64 is set by
@@ -33,6 +42,11 @@
#define MLK_SYS_AARCH64_EB
#endif
+/* Check if we're running on an Armv8.1-M system with MVE */
+#if defined(__ARM_ARCH_8_1M_MAIN__) || defined(__ARM_FEATURE_MVE)
+#define MLK_SYS_ARMV81M_MVE
+#endif
+
#if defined(__x86_64__)
#define MLK_SYS_X86_64
#if defined(__AVX2__)
@@ -48,6 +62,11 @@
#define MLK_SYS_RISCV64
#endif
+#if defined(MLK_SYS_RISCV64) && defined(__riscv_vector) && \
+ defined(__riscv_v_intrinsic)
+#define MLK_SYS_RISCV64_RVV
+#endif
+
#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 32
#define MLK_SYS_RISCV32
#endif
@@ -56,6 +75,14 @@
#define MLK_SYS_WINDOWS
#endif
+#if defined(__linux__)
+#define MLK_SYS_LINUX
+#endif
+
+#if defined(__APPLE__)
+#define MLK_SYS_APPLE
+#endif
+
#if defined(MLK_FORCE_AARCH64) && !defined(MLK_SYS_AARCH64)
#error "MLK_FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
#endif
@@ -82,34 +109,46 @@
#endif
/*
- * C90 does not have the inline compiler directive yet.
- * We don't use it in C90 builds.
- * However, in that case the compiler warns about some inline functions in
- * header files not being used in every compilation unit that includes that
- * header. To work around it we silence that warning in that case using
- * __attribute__((unused)).
+ * MLK_INLINE: Hint for inlining.
+ * - MSVC: __inline
+ * - C99+: inline
+ * - GCC/Clang C90: __attribute__((unused)) to silence warnings
+ * - Other C90: empty
*/
-
-/* Do not use inline for C90 builds*/
#if !defined(MLK_INLINE)
-#if !defined(inline)
#if defined(_MSC_VER)
#define MLK_INLINE __inline
-/* Don't combine __inline and __forceinline */
-#define MLK_ALWAYS_INLINE __forceinline
-#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#elif defined(inline) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
#define MLK_INLINE inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define MLK_INLINE __attribute__((unused))
+#else
+#define MLK_INLINE
+#endif
+#endif /* !MLK_INLINE */
+
+/*
+ * MLK_ALWAYS_INLINE: Force inlining.
+ * - MSVC: __forceinline
+ * - GCC/Clang C99+: MLK_INLINE __attribute__((always_inline))
+ * - Other: MLK_INLINE (no forced inlining)
+ */
+#if !defined(MLK_ALWAYS_INLINE)
+#if defined(_MSC_VER)
+#define MLK_ALWAYS_INLINE __forceinline
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+ (defined(inline) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L))
#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
#else
-#define MLK_INLINE __attribute__((unused))
#define MLK_ALWAYS_INLINE MLK_INLINE
#endif
+#endif /* !MLK_ALWAYS_INLINE */
-#else /* !inline */
-#define MLK_INLINE inline
-#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
-#endif /* inline */
-#endif /* !MLK_INLINE */
+#ifndef MLK_STATIC_TESTABLE
+#define MLK_STATIC_TESTABLE static
+#endif
/*
* C90 does not have the restrict compiler directive yet.
@@ -181,10 +220,41 @@
} while (0)
#endif /* !(MLK_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__) */
-#if defined(__GNUC__) || defined(clang)
+#if defined(__GNUC__) || defined(__clang__)
#define MLK_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
#else
#define MLK_MUST_CHECK_RETURN_VALUE
#endif
+#if !defined(__ASSEMBLER__)
+/* System capability enumeration */
+typedef enum
+{
+ /* x86_64 */
+ MLK_SYS_CAP_AVX2,
+ /* AArch64 */
+ MLK_SYS_CAP_SHA3
+} mlk_sys_cap;
+
+#if !defined(MLK_CONFIG_CUSTOM_CAPABILITY_FUNC)
+#include "cbmc.h"
+
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_sys_check_capability(mlk_sys_cap cap)
+__contract__(
+ ensures(return_value == 0 || return_value == 1)
+)
+{
+ /* By default, we rely on compile-time feature detection/specification:
+ * If a feature is enabled at compile-time, we assume it is supported by
+ * the host that the resulting library/binary will be built on.
+ * If this assumption is not true, you MUST overwrite this function.
+ * See the documentation of MLK_CONFIG_CUSTOM_CAPABILITY_FUNC in
+ * mlkem_native_config.h for more information. */
+ (void)cap;
+ return 1;
+}
+#endif /* !MLK_CONFIG_CUSTOM_CAPABILITY_FUNC */
+#endif /* !__ASSEMBLER__ */
+
#endif /* !MLK_SYS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/verify.h
index 85626c15ea..a9bdeaab30 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/verify.h
@@ -30,9 +30,7 @@
#ifndef MLK_VERIFY_H
#define MLK_VERIFY_H
-#include
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
@@ -115,92 +113,83 @@ __contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_u8(
static MLK_INLINE uint32_t mlk_value_barrier_u32(uint32_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
static MLK_INLINE int32_t mlk_value_barrier_i32(int32_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
static MLK_INLINE uint8_t mlk_value_barrier_u8(uint8_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
#endif /* MLK_USE_ASM_VALUE_BARRIER */
-/*
- * The ct_cmask_nonzero_xxx functions below make deliberate use of unsigned
- * overflow, which is fully defined behaviour in C. It is thus safe to disable
- * this warning.
- */
#ifdef CBMC
#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
+#pragma CPROVER check disable "conversion"
#endif
-
/*************************************************
- * Name: mlk_ct_cmask_nonzero_u16
+ * Name: mlk_cast_uint16_to_int16
*
- * Description: Return 0 if input is zero, and -1 otherwise.
+ * Description: Cast uint16 value to int16
*
- * Arguments: uint16_t x: Value to be converted into a mask
+ * Returns: For uint16_t x, the unique y in int16_t
+ * so that x == y mod 2^16.
+ *
+ * Concretely:
+ * - x < 32768: returns x
+ * - x >= 32768: returns x - 65536
*
**************************************************/
-
-/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
- * - Use value barrier and shift instead of `b = -b` to
- * convert condition into mask. */
-static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
-__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
{
- uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
- tmp >>= 16;
- return tmp;
+ /*
+ * PORTABILITY: This relies on uint16_t -> int16_t
+ * being implemented as the inverse of int16_t -> uint16_t,
+ * which is implementation-defined (C99 6.3.1.3 (3))
+ * CBMC (correctly) fails to prove this conversion is OK,
+ * so we have to suppress that check here
+ */
+ return (int16_t)x;
}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
/*************************************************
- * Name: mlk_ct_cmask_nonzero_u8
+ * Name: mlk_cast_int32_to_uint16
*
- * Description: Return 0 if input is zero, and -1 otherwise.
- *
- * Arguments: uint8_t x: Value to be converted into a mask
+ * Description: Cast int32 value to uint16 as per C standard.
*
+ * Returns: For int32_t x, the unique y in uint16_t
+ * so that x == y mod 2^16.
**************************************************/
-
-/* Reference: Embedded in `verify()` and `cmov()` in the
- * reference implementation @[REF].
- * - We include a value barrier not present in the
- * reference implementation, to prevent the compiler
- * from realizing that this function returns a mask. */
-static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
-__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int32_to_uint16(int32_t x)
{
- uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
- tmp >>= 24;
- return tmp;
+ return (uint16_t)(x & (int32_t)UINT16_MAX);
}
-/* Put unsigned overflow warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*
- * The mlk_ct_cmask_neg_i16 function below makes deliberate use of
- * signed to unsigned integer conversion, which is fully defined
- * behaviour in C. It is thus safe to disable this warning.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
+/*************************************************
+ * Name: mlk_cast_int16_to_uint16
+ *
+ * Description: Cast int16 value to uint16 as per C standard.
+ *
+ * Returns: For int16_t x, the unique y in uint16_t
+ * so that x == y mod 2^16.
+ **************************************************/
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int16_to_uint16(int16_t x)
+{
+ return mlk_cast_int32_to_uint16((int32_t)x);
+}
/*************************************************
* Name: mlk_ct_cmask_neg_i16
@@ -225,24 +214,49 @@ __contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
{
int32_t tmp = mlk_value_barrier_i32((int32_t)x);
tmp >>= 16;
- return (int16_t)tmp;
+ return mlk_cast_int32_to_uint16(tmp);
}
-/* Put unsigned-to-signed warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
+/*************************************************
+ * Name: mlk_ct_cmask_nonzero_u16
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments: uint16_t x: Value to be converted into a mask
+ *
+ **************************************************/
-/*
- * The ct_csel_xxx functions below make deliberate use of unsigned
- * to signed integer conversion, which is implementation-defined
- * behaviour. Here, we assume that uint16_t -> int16_t is inverse
- * to int16_t -> uint16_t.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
+/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
+ * - Use value barrier and shift instead of `b = -b` to
+ * convert condition into mask. */
+static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+{
+ int32_t tmp = mlk_value_barrier_i32(-((int32_t)x));
+ tmp >>= 16;
+ return mlk_cast_int32_to_uint16(tmp);
+}
+
+/*************************************************
+ * Name: mlk_ct_cmask_nonzero_u8
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments: uint8_t x: Value to be converted into a mask
+ *
+ **************************************************/
+
+/* Reference: Embedded in `verify()` and `cmov()` in the
+ * reference implementation @[REF].
+ * - We include a value barrier not present in the
+ * reference implementation, to prevent the compiler
+ * from realizing that this function returns a mask. */
+static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+{
+ uint16_t mask = mlk_ct_cmask_nonzero_u16((uint16_t)x);
+ return (uint8_t)(mask & 0xFF);
+}
/*************************************************
* Name: mlk_ct_sel_int16
@@ -280,16 +294,12 @@ __contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
static MLK_INLINE int16_t mlk_ct_sel_int16(int16_t a, int16_t b, uint16_t cond)
__contract__(ensures(return_value == (cond ? a : b)))
{
- uint16_t au = a, bu = b;
+ uint16_t au = mlk_cast_int16_to_uint16(a);
+ uint16_t bu = mlk_cast_int16_to_uint16(b);
uint16_t res = bu ^ (mlk_ct_cmask_nonzero_u16(cond) & (au ^ bu));
- return (int16_t)res;
+ return mlk_cast_uint16_to_int16(res);
}
-/* Put unsigned-to-signed warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_ct_sel_uint8
*
@@ -318,9 +328,11 @@ __contract__(ensures(return_value == (cond ? a : b)))
*
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
- * size_t len: length of the byte arrays
+ * size_t len: length of the byte arrays, upper-bounded
+ * to UINT16_MAX to control proof complexity
+ * only.
*
- * Returns 0 if the byte arrays are equal, a non-zero value otherwise
+ * Returns 0 if the byte arrays are equal, 0xFF otherwise.
*
* Specification:
* - Used to securely compute conditional move in
@@ -338,9 +350,10 @@ __contract__(ensures(return_value == (cond ? a : b)))
static MLK_INLINE uint8_t mlk_ct_memcmp(const uint8_t *a, const uint8_t *b,
const size_t len)
__contract__(
+ requires(len <= UINT16_MAX)
requires(memory_no_alias(a, len))
requires(memory_no_alias(b, len))
- requires(len <= INT_MAX)
+ ensures((return_value == 0) || (return_value == 0xFF))
ensures((return_value == 0) == forall(i, 0, len, (a[i] == b[i]))))
{
uint8_t r = 0, s = 0;
@@ -391,13 +404,17 @@ __contract__(
static MLK_INLINE void mlk_ct_cmov_zero(uint8_t *r, const uint8_t *x,
size_t len, uint8_t b)
__contract__(
+ requires(len <= MLK_MAX_BUFFER_SIZE)
requires(memory_no_alias(r, len))
requires(memory_no_alias(x, len))
- assigns(memory_slice(r, len)))
+ assigns(memory_slice(r, len))
+ ensures(forall(i, 0, len, (r[i] == (b == 0 ? x[i] : old(r)[i])))))
{
size_t i;
for (i = 0; i < len; i++)
- __loop__(invariant(i <= len))
+ __loop__(
+ invariant(i <= len)
+ invariant(forall(k, 0, i, r[k] == (b == 0 ? x[k] : loop_entry(r)[k]))))
{
r[i] = mlk_ct_sel_uint8(r[i], x[i], b);
}
@@ -431,13 +448,13 @@ __contract__(
requires(memory_no_alias(ptr, len))
assigns(memory_slice(ptr, len)))
{
- memset(ptr, 0, len);
+ mlk_memset(ptr, 0, len);
/* This follows OpenSSL and seems sufficient to prevent the compiler
* from optimizing away the memset.
*
* If there was a reliable way to detect availability of memset_s(),
* that would be preferred. */
- __asm__ __volatile__("" : : "r"(ptr) : "memory");
+ __asm__ volatile("" : : "r"(ptr) : "memory");
}
#else /* !MLK_SYS_WINDOWS && MLK_HAVE_INLINE_ASM */
#error No plausibly-secure implementation of mlk_zeroize available. Please provide your own using MLK_CONFIG_CUSTOM_ZEROIZE.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/zetas.inc b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/zetas.inc
index 0c00b5b905..00316daf67 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/zetas.inc
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/mlkem/src/zetas.inc
@@ -5,16 +5,16 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
-#include <stdint.h>
/*
* Table of zeta values used in the reference NTT and inverse NTT.
* See autogen for details.
*/
-static MLK_ALIGN const int16_t zetas[128] = {
+static MLK_ALIGN const int16_t mlk_zetas[128] = {
-1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577,
182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458,
-1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/integration/liboqs/config_c.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/integration/liboqs/config_c.h
index b546e2686d..9b1eef321a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/integration/liboqs/config_c.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/integration/liboqs/config_c.h
@@ -8,13 +8,24 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*/
#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_C_H
#define MLK_INTEGRATION_LIBOQS_CONFIG_C_H
+/* Enable valgrind-based assertions in mlkem-native through macro
+ * from libOQS. */
+#if !defined(__ASSEMBLER__)
+#include <oqs/common.h>
+#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
+#define MLK_CONFIG_CT_TESTING_ENABLED
+#endif
+#endif /* !__ASSEMBLER__ */
+
+
/******************************************************************************
* Name: MLK_CONFIG_PARAMETER_SET
*
@@ -134,7 +145,7 @@
* consumer.
*
* If this option is not set, mlkem-native expects a function
- * void randombytes(uint8_t *out, size_t outlen).
+ * int randombytes(uint8_t *out, size_t outlen).
*
* Set this option and define `mlk_randombytes` if you want to
* use a custom method to sample randombytes with a different name
@@ -146,9 +157,10 @@
#include <oqs/rand.h>
#include <stdint.h>
#include "../../mlkem/src/sys.h"
-static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
+static MLK_INLINE int mlk_randombytes(uint8_t *ptr, size_t len)
{
OQS_randombytes(ptr, len);
+ return 0;
}
#endif /* !__ASSEMBLER__ */
@@ -212,13 +224,4 @@ static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
#endif
*/
-/* Enable valgrind-based assertions in mlkem-native through macro
- * from libOQS. */
-#if !defined(__ASSEMBLER__)
-#include <oqs/common.h>
-#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
-#define MLK_CONFIG_CT_TESTING_ENABLED
-#endif
-#endif /* !__ASSEMBLER__ */
-
#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_C_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/cbmc.h
index 650d32b95b..80e1a36fc7 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/cbmc.h
@@ -8,7 +8,6 @@
/***************************************************
* Basic replacements for __CPROVER_XXX contracts
***************************************************/
-
#ifndef CBMC
#define __contract__(x)
@@ -16,6 +15,7 @@
#else /* !CBMC */
+
#define __contract__(x) x
#define __loop__(x) x
@@ -49,7 +49,6 @@
*/
#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
-#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
/*
* Pointer-related predicates
@@ -59,6 +58,17 @@
#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+/* Maximum supported buffer size
+ *
+ * Larger buffers may be supported, but due to internal modeling constraints
+ * in CBMC, the proofs of memory- and type-safety won't be able to run.
+ *
+ * If you find yourself in need for a buffer size larger than this,
+ * please contact the maintainers, so we can prioritize work to relax
+ * this somewhat artificial bound.
+ */
+#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12)
+
/*
* History variables
* https://diffblue.github.io/cbmc/contracts-history-variables.html
@@ -83,7 +93,7 @@
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate) \
}
-#define EXISTS(qvar, qvar_lb, qvar_ub, predicate) \
+#define exists(qvar, qvar_lb, qvar_ub, predicate) \
__CPROVER_exists \
{ \
unsigned qvar; \
@@ -118,13 +128,35 @@
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
- (((int)(value_lb) <= ((array_var)[(qvar)])) && \
- (((array_var)[(qvar)]) < (int)(value_ub))) \
+ (((int)(value_lb) <= ((array_var)[(qvar)])) && \
+ (((array_var)[(qvar)]) < (int)(value_ub))) \
}
-#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
- array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb), \
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
+ array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb), \
(qvar_ub), (array_var), (value_lb), (value_ub))
+
+#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (int16_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged(array_var, N) \
+ array_unchanged_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
+
+#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (uint64_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged_u64(array_var, N) \
+ array_unchanged_u64_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
/* clang-format on */
/* Wrapper around array_bound operating on absolute values.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/common.h
index 9de9875556..bc4e9ed72c 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/common.h
@@ -5,10 +5,16 @@
#ifndef MLK_COMMON_H
#define MLK_COMMON_H
+#ifndef __ASSEMBLER__
+#include <stdint.h>
+#endif
+
+#define MLK_BUILD_INTERNAL
+
#if defined(MLK_CONFIG_FILE)
#include MLK_CONFIG_FILE
#else
-#include "config.h"
+#include "mlkem_native_config.h"
#endif
#include "params.h"
@@ -28,15 +34,11 @@
#define MLK_EXTERNAL_API MLK_CONFIG_EXTERNAL_API_QUALIFIER
#endif
-#if defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) || \
- defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED)
-#define MLK_MULTILEVEL_BUILD
-#endif
-
#define MLK_CONCAT_(x1, x2) x1##x2
#define MLK_CONCAT(x1, x2) MLK_CONCAT_(x1, x2)
-#if defined(MLK_MULTILEVEL_BUILD)
+#if (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || \
+ defined(MLK_CONFIG_MULTILEVEL_NO_SHARED))
#define MLK_ADD_PARAM_SET(s) MLK_CONCAT(s, MLK_CONFIG_PARAMETER_SET)
#else
#define MLK_ADD_PARAM_SET(s) s
@@ -49,7 +51,7 @@
/* Functions are prefixed by MLK_CONFIG_NAMESPACE_PREFIX.
*
* If multiple parameter sets are used, functions depending on the parameter
- * set are additionally prefixed with 512/768/1024. See config.h.
+ * set are additionally prefixed with 512/768/1024. See mlkem_native_config.h.
*
* Example: If MLK_CONFIG_NAMESPACE_PREFIX is mlkem, then
* MLK_NAMESPACE_K(enc) becomes mlkem512_enc/mlkem768_enc/mlkem1024_enc.
@@ -73,8 +75,24 @@
*/
#if defined(MLK_SYS_X86_64)
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) : MLK_CET_ENDBR
-#else
+#elif defined(MLK_SYS_ARMV81M_MVE)
+/* clang-format off */
+#define MLK_ASM_FN_SYMBOL(sym) \
+ .type MLK_ASM_NAMESPACE(sym), %function; \
+ MLK_ASM_NAMESPACE(sym) :
+/* clang-format on */
+#else /* !MLK_SYS_X86_64 && !MLK_SYS_ARMV81M_MVE */
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) :
+#endif /* !MLK_SYS_X86_64 && !MLK_SYS_ARMV81M_MVE */
+
+/*
+ * Output the size of an assembly function.
+ */
+#if defined(__ELF__)
+#define MLK_ASM_FN_SIZE(sym) \
+ .size MLK_ASM_NAMESPACE(sym), .- MLK_ASM_NAMESPACE(sym)
+#else
+#define MLK_ASM_FN_SIZE(sym)
#endif
/* We aim to simplify the user's life by supporting builds where
@@ -99,6 +117,10 @@
#error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLK_CONFIG_FIPS202_BACKEND_FILE is not.
#endif
+#if defined(MLK_CONFIG_NO_RANDOMIZED_API) && defined(MLK_CONFIG_KEYGEN_PCT)
+#error Bad configuration: MLK_CONFIG_NO_RANDOMIZED_API is incompatible with MLK_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_kem_enc()
+#endif
+
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
#include MLK_CONFIG_ARITH_BACKEND_FILE
/* Include to enforce consistency of API and implementation,
@@ -135,20 +157,118 @@
#define MLK_FIPS202X4_HEADER_FILE MLK_CONFIG_FIPS202X4_CUSTOM_HEADER
#endif
-/* Just in case we want to include mlkem_native.h, set the configuration
- * for that header in accordance with the configuration used here. */
+/* Standard library function replacements */
+#if !defined(__ASSEMBLER__)
+#if !defined(MLK_CONFIG_CUSTOM_MEMCPY)
+#include <string.h>
+#define mlk_memcpy memcpy
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_MEMSET)
+#include
+#define mlk_memset memset
+#endif
+
+
+/* Allocation macros for large local structures
+ *
+ * MLK_ALLOC(v, T, N) declares T *v and attempts to point it to an T[N]
+ * MLK_FREE(v, T, N) zeroizes and frees the allocation
+ *
+ * Default implementation uses stack allocation.
+ * Can be overridden by setting the config option MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * and defining MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE.
+ */
+#if defined(MLK_CONFIG_CUSTOM_ALLOC_FREE) != \
+ (defined(MLK_CUSTOM_ALLOC) && defined(MLK_CUSTOM_FREE))
+#error Bad configuration: MLK_CONFIG_CUSTOM_ALLOC_FREE must be set together with MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE
+#endif
+
+/*
+ * If the integration wants to provide a context parameter for use in
+ * platform-specific hooks, then it should define this parameter.
+ *
+ * The MLK_CONTEXT_PARAMETERS_n macros are intended to be used with macros
+ * defining the function names and expand to either pass or discard the context
+ * argument as required by the current build. If there is no context parameter
+ * requested then these are removed from the prototypes and from all calls.
+ */
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+#define MLK_CONTEXT_PARAMETERS_0(context) (context)
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0, context)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1, context)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) \
+ (arg0, arg1, arg2, context)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3, context)
+#else /* MLK_CONFIG_CONTEXT_PARAMETER */
+#define MLK_CONTEXT_PARAMETERS_0(context) ()
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) (arg0, arg1, arg2)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3)
+#endif /* !MLK_CONFIG_CONTEXT_PARAMETER */
+
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER_TYPE) != \
+ defined(MLK_CONFIG_CONTEXT_PARAMETER)
+#error MLK_CONFIG_CONTEXT_PARAMETER_TYPE must be defined if and only if MLK_CONFIG_CONTEXT_PARAMETER is defined
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_ALLOC_FREE)
+/* Default: stack allocation */
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_ALIGN T mlk_alloc_##v[N]; \
+ T *v = mlk_alloc_##v
+
+/* TODO: This leads to a circular dependency between common and verify.h
+ * It just works out before we're at the end of the file, but it's still
+ * prone to issues in the future. */
+#include "verify.h"
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ mlk_zeroize(mlk_alloc_##v, sizeof(mlk_alloc_##v)); \
+ (v) = NULL; \
+ } while (0)
+
+#else /* !MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/* Custom allocation */
+
+/*
+ * The indirection here is necessary to use MLK_CONTEXT_PARAMETERS_3 here.
+ */
+#define MLK_APPLY(f, args) f args
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_APPLY(MLK_CUSTOM_ALLOC, MLK_CONTEXT_PARAMETERS_3(v, T, N, context))
+
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ if (v != NULL) \
+ { \
+ mlk_zeroize(v, sizeof(T) * (N)); \
+ MLK_APPLY(MLK_CUSTOM_FREE, MLK_CONTEXT_PARAMETERS_3(v, T, N, context)); \
+ v = NULL; \
+ } \
+ } while (0)
+
+#endif /* MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/****************************** Error codes ***********************************/
-/* Double-check that this is not conflicting with pre-existing definitions. */
-#if defined(MLK_CONFIG_API_PARAMETER_SET) || \
- defined(MLK_CONFIG_API_NAMESPACE_PREFIX) || \
- defined(MLK_CONFIG_API_NO_SUPERCOP) || \
- defined(MLK_CONFIG_API_CONSTANTS_ONLY)
-#error Pre-existing MLK_CONFIG_API_XXX configuration is neither useful nor allowed during an mlkem-native build
-#endif /* MLK_CONFIG_API_PARAMETER_SET || MLK_CONFIG_API_NAMESPACE_PREFIX || \
- MLK_CONFIG_API_NO_SUPERCOP || MLK_CONFIG_API_CONSTANTS_ONLY */
+/* Generic failure condition */
+#define MLK_ERR_FAIL -1
+/* An allocation failed. This can only happen if MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * is defined and the provided MLK_CUSTOM_ALLOC can fail. */
+#define MLK_ERR_OUT_OF_MEMORY -2
+/* An RNG failure occurred. Might be due to insufficient entropy or
+ * system misconfiguration. */
+#define MLK_ERR_RNG_FAIL -3
-#define MLK_CONFIG_API_PARAMETER_SET MLK_CONFIG_PARAMETER_SET
-#define MLK_CONFIG_API_NAMESPACE_PREFIX \
- MLK_ADD_PARAM_SET(MLK_CONFIG_NAMESPACE_PREFIX)
+#endif /* !__ASSEMBLER__ */
#endif /* !MLK_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/compress.c
index d7ff2bbe7a..50da36d0e4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/compress.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/compress.c
@@ -20,24 +20,27 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include <stdint.h>
-#include <string.h>
+
#include "cbmc.h"
#include "compress.h"
#include "debug.h"
#include "verify.h"
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d4_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -55,32 +58,51 @@ void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
t[j] = mlk_scalar_compress_d4(a->coeffs[8 * i + j]);
}
- r[i * 4] = t[0] | (t[1] << 4);
- r[i * 4 + 1] = t[2] | (t[3] << 4);
- r[i * 4 + 2] = t[4] | (t[5] << 4);
- r[i * 4 + 3] = t[6] | (t[7] << 4);
+ /* All t[i] are 4-bit wide, so the truncations don't alter the value. */
+ r[i * 4] = (uint8_t)(t[0] | (t[1] << 4));
+ r[i * 4 + 1] = (uint8_t)(t[2] | (t[3] << 4));
+ r[i * 4 + 2] = (uint8_t)(t[4] | (t[5] << 4));
+ r[i * 4 + 3] = (uint8_t)(t[6] | (t[7] << 4));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d4_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d4_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ mlk_poly_compress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d10_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -101,29 +123,47 @@ void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 10-bit in size.
*/
- r[5 * j + 0] = (t[0] >> 0) & 0xFF;
- r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
- r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
- r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
- r[5 * j + 4] = (t[3] >> 2);
+ r[5 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 2) & 0xFF));
+ r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] << 4) & 0xFF));
+ r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] << 6) & 0xFF));
+ r[5 * j + 4] = (uint8_t)(t[3] >> 2);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d10_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d10_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ mlk_poly_compress_d10_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d4(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d4_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -137,22 +177,40 @@ void mlk_poly_decompress_d4(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d4(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d4_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ int ret;
+ ret = mlk_poly_decompress_d4_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ mlk_poly_decompress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d10(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d10_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 4; j++)
@@ -180,28 +238,46 @@ void mlk_poly_decompress_d10(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d10(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d10_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ int ret;
+ ret = mlk_poly_decompress_d10_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
+ mlk_poly_decompress_d10_c(r, a);
+}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d5_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -219,38 +295,51 @@ void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
t[j] = mlk_scalar_compress_d5(a->coeffs[8 * i + j]);
}
- /*
- * Explicitly truncate to avoid warning about
- * implicit truncation in CBMC, and use array indexing into
- * r rather than pointer-arithmetic to simplify verification
- */
- r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
- r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
- r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
- r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+ r[i * 5] = (uint8_t)(0xFF & ((t[0] >> 0) | (t[1] << 5)));
+ r[i * 5 + 1] = (uint8_t)(0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)));
+ r[i * 5 + 2] = (uint8_t)(0xFF & ((t[3] >> 1) | (t[4] << 4)));
+ r[i * 5 + 3] = (uint8_t)(0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)));
+ r[i * 5 + 4] = (uint8_t)(0xFF & ((t[6] >> 2) | (t[7] << 3)));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d5_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d5_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ mlk_poly_compress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d11_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -272,35 +361,53 @@ void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 11-bit in size.
*/
- r[11 * j + 0] = (t[0] >> 0) & 0xFF;
- r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
- r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
- r[11 * j + 3] = (t[2] >> 2) & 0xFF;
- r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
- r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
- r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
- r[11 * j + 7] = (t[5] >> 1) & 0xFF;
- r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
- r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
- r[11 * j + 10] = (t[7] >> 3);
+ r[11 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 3) & 0xFF));
+ r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] << 6) & 0xFF));
+ r[11 * j + 3] = (uint8_t)((t[2] >> 2) & 0xFF);
+ r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] << 1) & 0xFF));
+ r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] << 4) & 0xFF));
+ r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] << 7) & 0xFF));
+ r[11 * j + 7] = (uint8_t)((t[5] >> 1) & 0xFF);
+ r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] << 2) & 0xFF));
+ r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] << 5) & 0xFF));
+ r[11 * j + 10] = (uint8_t)(t[7] >> 3);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d11_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d11_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ mlk_poly_compress_d11_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d5(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d5_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 8; i++)
@@ -342,22 +449,40 @@ void mlk_poly_decompress_d5(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d5(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d5_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ int ret;
+ ret = mlk_poly_decompress_d5_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ mlk_poly_decompress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d11(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d11_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 8; j++)
@@ -390,26 +515,45 @@ void mlk_poly_decompress_d11(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d11(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d11_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ int ret;
+ ret = mlk_poly_decompress_d11_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+ mlk_poly_decompress_d11_c(r, a);
+}
+
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-#if !defined(MLK_USE_NATIVE_POLY_TOBYTES)
/* Reference: `poly_tobytes()` in the reference implementation @[REF].
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_tobytes_c(uint8_t r[MLKEM_POLYBYTES],
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYBYTES))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -417,8 +561,10 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
- const uint16_t t0 = a->coeffs[2 * i];
- const uint16_t t1 = a->coeffs[2 * i + 1];
+ /* The conversion to uint16_t is safe since we assume that
+ * the coefficients of `a` are non-negative. */
+ const uint16_t t0 = (uint16_t)a->coeffs[2 * i];
+ const uint16_t t1 = (uint16_t)a->coeffs[2 * i + 1];
/*
* t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
* significant data, so these can be packed into 24 bits or exactly
@@ -426,32 +572,48 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
*/
/* Least significant bits 0 - 7 of t0. */
- r[3 * i + 0] = t0 & 0xFF;
+ r[3 * i + 0] = (uint8_t)(t0 & 0xFF);
/*
* Most significant bits 8 - 11 of t0 become the least significant
* nibble of the second byte. The least significant 4 bits
* of t1 become the upper nibble of the second byte.
+ *
+ * The conversion to uint8_t does not alter the value.
*/
- r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+ r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 << 4) & 0xF0));
- /* Bits 4 - 11 of t1 become the third byte. */
- r[3 * i + 2] = t1 >> 4;
+ /* Bits 4 - 11 of t1 become the third byte. The conversion to uint8_t
+ * does not alter the value because t1 is 12-bit wide. */
+ r[3 * i + 2] = (uint8_t)(t1 >> 4);
}
}
-#else /* !MLK_USE_NATIVE_POLY_TOBYTES */
+
MLK_INTERNAL_API
void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
{
+#if defined(MLK_USE_NATIVE_POLY_TOBYTES)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_tobytes_native(r, a->coeffs);
-}
+ ret = mlk_poly_tobytes_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
-#if !defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ mlk_poly_tobytes_c(r, a);
+}
+
/* Reference: `poly_frombytes()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
+MLK_STATIC_TESTABLE void mlk_poly_frombytes_c(mlk_poly *r,
+ const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+ requires(memory_no_alias(a, MLKEM_POLYBYTES))
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -462,21 +624,29 @@ void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
const uint8_t t0 = a[3 * i + 0];
const uint8_t t1 = a[3 * i + 1];
const uint8_t t2 = a[3 * i + 2];
- r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
- r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+ r->coeffs[2 * i + 0] = (int16_t)(t0 | ((t1 << 8) & 0xFFF));
+ r->coeffs[2 * i + 1] = (int16_t)((t1 >> 4) | (t2 << 4));
}
/* Note that the coefficients are not canonical */
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
-#else /* !MLK_USE_NATIVE_POLY_FROMBYTES */
+
MLK_INTERNAL_API
void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
{
- mlk_poly_frombytes_native(r->coeffs, a);
-}
+#if defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ int ret;
+ ret = mlk_poly_frombytes_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
+ mlk_poly_frombytes_c(r, a);
+}
+
/* Reference: `poly_frommsg()` in the reference implementation @[REF].
* - We use a value barrier around the bit-selection mask to
* reduce the risk of compiler-introduced branches.
@@ -506,7 +676,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
* as per @[FIPS203, Eq (4.8)]. */
/* Prevent the compiler from recognizing this as a bit selection */
- uint8_t mask = mlk_value_barrier_u8(1u << j);
+ uint8_t mask = mlk_value_barrier_u8((uint8_t)(1u << j));
r->coeffs[8 * i + j] = mlk_ct_sel_int16(MLKEM_Q_HALF, 0, msg[i] & mask);
}
}
@@ -535,7 +705,7 @@ void mlk_poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const mlk_poly *a)
invariant(i <= MLKEM_N / 8 && j <= 8))
{
uint32_t t = mlk_scalar_compress_d1(a->coeffs[8 * i + j]);
- msg[i] |= t << j;
+ msg[i] |= (uint8_t)(t << j);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/compress.h
index f0789d42d6..b16b0889b5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/compress.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/compress.h
@@ -20,8 +20,7 @@
#ifndef MLK_COMPRESS_H
#define MLK_COMPRESS_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -50,9 +49,9 @@
#endif
/* Reference: Part of poly_tomsg() in the reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d1(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d1(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 2)
ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) )
{
@@ -65,7 +64,8 @@ __contract__(
*/
/* check-magic: 1290168 == 2*round(2^31 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290168;
- return (d0 + (1u << 30)) >> 31;
+ /* Unsigned shifting by 31 positions leaves only the top bit. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 30)) >> 31);
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -93,9 +93,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d4(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d4(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 16)
ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
{
@@ -108,7 +108,8 @@ __contract__(
*/
/* check-magic: 1290160 == 16 * round(2^28 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290160;
- return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */
+ /* The return value is < 16, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 27)) >> 28); /* round(d0/2^28) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -128,11 +129,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d4(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d4(uint8_t u)
__contract__(
requires(0 <= u && u < 16)
ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) >> 4; }
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 8) >> 4);
+}
/************************************************************
* Name: mlk_scalar_compress_d5
@@ -156,9 +162,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d5(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d5(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 32)
ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) )
{
@@ -171,7 +177,8 @@ __contract__(
*/
/* check-magic: 1290176 == 2^5 * round(2^27 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290176;
- return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */
+ /* The return value is < 32, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 26)) >> 27); /* round(d0/2^27) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -191,11 +198,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d5(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d5(uint8_t u)
__contract__(
requires(0 <= u && u < 32)
- ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) >> 5; }
+ ensures(0 <= return_value && return_value <= MLKEM_Q - 1)
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 16) >> 5);
+}
/************************************************************
* Name: mlk_scalar_compress_d10
@@ -219,9 +231,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d10(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d10(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 10))
ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
{
@@ -255,11 +267,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d10(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d10(uint16_t u)
__contract__(
requires(0 <= u && u < 1024)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) >> 10; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 512) >> 10);
+}
/************************************************************
* Name: mlk_scalar_compress_d11
@@ -283,9 +300,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d11(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d11(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 11))
ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
{
@@ -319,11 +336,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d11(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d11(uint16_t u)
__contract__(
requires(0 <= u && u < 2048)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) >> 11; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 1024) >> 11);
+}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
#define mlk_poly_compress_d4 MLK_NAMESPACE(poly_compress_d4)
@@ -575,7 +597,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
);
@@ -631,7 +653,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
__contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
);
@@ -660,7 +682,7 @@ __contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(msg))
+ assigns(memory_slice(msg, MLKEM_INDCPA_MSGBYTES))
);
#endif /* !MLK_COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/debug.h
index 01f7c88ccf..47c864bd36 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/debug.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/debug.h
@@ -7,7 +7,6 @@
#include "common.h"
#if defined(MLKEM_DEBUG)
-#include
/*************************************************
* Name: mlk_assert
@@ -89,14 +88,14 @@ void mlk_debug_check_bounds(const char *file, int line, const int16_t *ptr,
/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
* just use a single flattened array_bound(...) here. */
-#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
- cassert(forall(kN, 0, (M), \
- array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
+ cassert(forall(kN, 0, (M), \
+ array_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_lb), (value_ub))))
-#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
- cassert(forall(kN, 0, (M), \
- array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
+ cassert(forall(kN, 0, (M), \
+ array_abs_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_abs_bd))))
#else /* !MLKEM_DEBUG && CBMC */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/indcpa.c
index 85d4f595a9..e03b16c38b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/indcpa.c
@@ -17,15 +17,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
-#include
+#include "indcpa.h"
-#include "cbmc.h"
#include "debug.h"
-#include "indcpa.h"
-#include "poly.h"
-#include "poly_k.h"
#include "randombytes.h"
#include "sampling.h"
#include "symmetric.h"
@@ -41,6 +35,10 @@
#define mlk_pack_ciphertext MLK_ADD_PARAM_SET(mlk_pack_ciphertext)
#define mlk_unpack_ciphertext MLK_ADD_PARAM_SET(mlk_unpack_ciphertext)
#define mlk_matvec_mul MLK_ADD_PARAM_SET(mlk_matvec_mul)
+#define mlk_polyvec_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polyvec_permute_bitrev_to_custom)
+#define mlk_polymat_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polymat_permute_bitrev_to_custom)
/* End of parameter set namespacing */
/*************************************************
@@ -59,12 +57,13 @@
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L19]
*
**************************************************/
-static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
+static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const mlk_polyvec *pk,
const uint8_t seed[MLKEM_SYMBYTES])
{
- mlk_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(pk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, pk);
- memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
}
/*************************************************
@@ -83,11 +82,11 @@ static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L2-3]
*
**************************************************/
-static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
+static void mlk_unpack_pk(mlk_polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
{
mlk_polyvec_frombytes(pk, packedpk);
- memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
/* NOTE: If a modulus check was conducted on the PK, we know at this
* point that the coefficients of `pk` are unsigned canonical. The
@@ -108,9 +107,10 @@ static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L20]
*
**************************************************/
-static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
+static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES],
+ const mlk_polyvec *sk)
{
- mlk_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(sk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, sk);
}
@@ -128,7 +128,7 @@ static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L5]
*
**************************************************/
-static void mlk_unpack_sk(mlk_polyvec sk,
+static void mlk_unpack_sk(mlk_polyvec *sk,
const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
{
mlk_polyvec_frombytes(sk, packedsk);
@@ -149,8 +149,8 @@ static void mlk_unpack_sk(mlk_polyvec sk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22-23]
*
**************************************************/
-static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
- mlk_poly *v)
+static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES],
+ const mlk_polyvec *b, mlk_poly *v)
{
mlk_polyvec_compress_du(r, b);
mlk_poly_compress_dv(r + MLKEM_POLYVECCOMPRESSEDBYTES_DU, v);
@@ -170,28 +170,69 @@ static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L1-4]
*
**************************************************/
-static void mlk_unpack_ciphertext(mlk_polyvec b, mlk_poly *v,
+static void mlk_unpack_ciphertext(mlk_polyvec *b, mlk_poly *v,
const uint8_t c[MLKEM_INDCPA_BYTES])
{
mlk_polyvec_decompress_du(b, c);
mlk_poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
}
-#if !defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
-/* This namespacing is not done at the top to avoid a naming conflict
- * with native backends, which are currently not yet namespaced. */
-#define mlk_poly_permute_bitrev_to_custom \
- MLK_ADD_PARAM_SET(mlk_poly_permute_bitrev_to_custom)
-
-static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
+/* Helper function to ensure that the polynomial entries in the output
+ * of gen_matrix use the standard (bitreversed) ordering of coefficients.
+ * No-op unless a native backend with a custom ordering is used.
+ *
+ * We don't inline this into gen_matrix to avoid having to split the CBMC
+ * proof for gen_matrix based on MLK_USE_NATIVE_NTT_CUSTOM_ORDER. */
+static void mlk_polyvec_permute_bitrev_to_custom(mlk_polyvec *v)
__contract__(
/* We don't specify that this should be a permutation, but only
* that it does not change the bound established at the end of mlk_gen_matrix. */
- requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
- requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(memory_slice(data, sizeof(mlk_poly)))
- ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+ requires(memory_no_alias(v, sizeof(mlk_polyvec)))
+ requires(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(v, sizeof(mlk_polyvec)))
+ ensures(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+{
+#if defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(v, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ {
+ mlk_poly_permute_bitrev_to_custom(v->vec[i].coeffs);
+ }
+#else /* MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+ /* Nothing to do */
+ (void)v;
#endif /* !MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+static void mlk_polymat_permute_bitrev_to_custom(mlk_polymat *a)
+__contract__(
+ /* We don't specify that this should be a permutation, but only
+ * that it does not change the bound established at the end of mlk_gen_matrix. */
+ requires(memory_no_alias(a, sizeof(mlk_polymat)))
+ requires(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+{
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(a, sizeof(mlk_polymat)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+ {
+ mlk_polyvec_permute_bitrev_to_custom(&a->vec[i]);
+ }
+}
/* Reference: `gen_matrix()` in the reference implementation @[REF].
* - We use a special subroutine to generate 4 polynomials
@@ -201,32 +242,27 @@ __contract__(
*
* Not static for benchmarking */
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
{
unsigned i, j;
- /*
- * We generate four separate seed arrays rather than a single one to work
- * around limitations in CBMC function contracts dealing with disjoint slices
- * of the same parent object.
- */
-
MLK_ALIGN uint8_t seed_ext[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)];
for (j = 0; j < 4; j++)
{
- memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
}
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Sample 4 matrix entries a time. */
for (i = 0; i < (MLKEM_K * MLKEM_K / 4) * 4; i += 4)
{
- uint8_t x, y;
-
for (j = 0; j < 4; j++)
{
- x = (i + j) / MLKEM_K;
- y = (i + j) % MLKEM_K;
+ uint8_t x, y;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)((i + j) / MLKEM_K);
+ y = (uint8_t)((i + j) % MLKEM_K);
if (transposed)
{
seed_ext[j][MLKEM_SYMBYTES + 0] = x;
@@ -239,19 +275,26 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
}
}
- /*
- * This call writes across mlk_polyvec boundaries for K=2 and K=3.
- * This is intentional and safe.
- */
- mlk_poly_rej_uniform_x4(&a[i], seed_ext);
+ mlk_poly_rej_uniform_x4(&a->vec[i / MLKEM_K].vec[i % MLKEM_K],
+ &a->vec[(i + 1) / MLKEM_K].vec[(i + 1) % MLKEM_K],
+ &a->vec[(i + 2) / MLKEM_K].vec[(i + 2) % MLKEM_K],
+ &a->vec[(i + 3) / MLKEM_K].vec[(i + 3) % MLKEM_K],
+ seed_ext);
}
-
- /* For MLKEM_K == 3, sample the last entry individually. */
- if (i < MLKEM_K * MLKEM_K)
+#else /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
+ /* When using serial FIPS202, sample all entries individually. */
+ i = 0;
+#endif /* MLK_CONFIG_SERIAL_FIPS202_ONLY */
+
+ /* For MLKEM_K == 3, sample the last entry individually.
+ * When MLK_CONFIG_SERIAL_FIPS202_ONLY is set, sample all entries
+ * individually. */
+ for (; i < MLKEM_K * MLKEM_K; i++)
{
uint8_t x, y;
- x = i / MLKEM_K;
- y = i % MLKEM_K;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)(i / MLKEM_K);
+ y = (uint8_t)(i % MLKEM_K);
if (transposed)
{
@@ -264,8 +307,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
seed_ext[0][MLKEM_SYMBYTES + 1] = x;
}
- mlk_poly_rej_uniform(&a[i], seed_ext[0]);
- i++;
+ mlk_poly_rej_uniform(&a->vec[i / MLKEM_K].vec[i % MLKEM_K], seed_ext[0]);
}
mlk_assert(i == MLKEM_K * MLKEM_K);
@@ -274,10 +316,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* The public matrix is generated in NTT domain. If the native backend
* uses a custom order in NTT domain, permute A accordingly.
*/
- for (i = 0; i < MLKEM_K * MLKEM_K; i++)
- {
- mlk_poly_permute_bitrev_to_custom(a[i].coeffs);
- }
+ mlk_polymat_permute_bitrev_to_custom(a);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -301,24 +340,25 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* Specification: Implements @[FIPS203, Section 2.4.7, Eq (2.12), (2.13)]
*
**************************************************/
-static void mlk_matvec_mul(mlk_polyvec out, const mlk_polymat a,
- const mlk_polyvec v, const mlk_polyvec_mulcache vc)
+static void mlk_matvec_mul(mlk_polyvec *out, const mlk_polymat *a,
+ const mlk_polyvec *v, const mlk_polyvec_mulcache *vc)
__contract__(
requires(memory_no_alias(out, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(v, sizeof(mlk_polyvec)))
requires(memory_no_alias(vc, sizeof(mlk_polyvec_mulcache)))
- requires(forall(k0, 0, MLKEM_K * MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(out)))
+ requires(forall(k0, 0, MLKEM_K,
+ forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k0].vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))))
+ assigns(memory_slice(out, sizeof(mlk_polyvec))))
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
__loop__(
- assigns(i, object_whole(out))
+ assigns(i, memory_slice(out, sizeof(mlk_polyvec)))
invariant(i <= MLKEM_K))
{
- mlk_polyvec_basemul_acc_montgomery_cached(&out[i], &a[MLKEM_K * i], v, vc);
+ mlk_polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a->vec[i], v, vc);
}
}
@@ -331,20 +371,34 @@ __contract__(
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- const uint8_t *publicseed = buf;
- const uint8_t *noiseseed = buf + MLKEM_SYMBYTES;
- mlk_polymat a;
- mlk_polyvec e, pkpv, skpv;
- mlk_polyvec_mulcache skpv_cache;
-
- MLK_ALIGN uint8_t coins_with_domain_separator[MLKEM_SYMBYTES + 1];
+ int ret = 0;
+ const uint8_t *publicseed;
+ const uint8_t *noiseseed;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_ALLOC(a, mlk_polymat, 1, context);
+ MLK_ALLOC(e, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (buf == NULL || coins_with_domain_separator == NULL || a == NULL ||
+ e == NULL || pkpv == NULL || skpv == NULL || skpv_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ publicseed = buf;
+ noiseseed = buf + MLKEM_SYMBYTES;
+
/* Concatenate coins with MLKEM_K for domain separation of security levels */
- memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
mlk_hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
@@ -360,24 +414,24 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_gen_matrix(a, publicseed, 0 /* no transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &e[0], &e[1], noiseseed, 0, 1,
- 2, 3);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &e->vec[0],
+ &e->vec[1], noiseseed, 0, 1, 2, 3);
#elif MLKEM_K == 3
/*
* Only the first three output buffers are needed.
* The laster parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2],
- &pkpv[0] /* irrelevant */, noiseseed, 0, 1, 2,
- 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2], NULL,
+ noiseseed, 0, 1, 2, 0xFF /* irrelevant */);
/* Same here */
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &pkpv[0] /* irrelevant */,
- noiseseed, 3, 4, 5, 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], NULL, noiseseed,
+ 3, 4, 5, 0xFF /* irrelevant */);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2], &skpv[3], noiseseed,
- 0, 1, 2, 3);
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &e[3], noiseseed, 4, 5, 6, 7);
-#endif
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2],
+ &skpv->vec[3], noiseseed, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], &e->vec[3],
+ noiseseed, 4, 5, 6, 7);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(skpv);
mlk_polyvec_ntt(e);
@@ -393,14 +447,17 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_pack_sk(sk, skpv);
mlk_pack_pk(pk, pkpv, publicseed);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(coins_with_domain_separator, sizeof(coins_with_domain_separator));
- mlk_zeroize(a, sizeof(a));
- mlk_zeroize(&e, sizeof(e));
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&skpv_cache, sizeof(skpv_cache));
+ MLK_FREE(skpv_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(e, mlk_polyvec, 1, context);
+ MLK_FREE(a, mlk_polymat, 1, context);
+ MLK_FREE(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_enc()` in the reference implementation @[REF].
@@ -412,19 +469,33 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t seed[MLKEM_SYMBYTES];
- mlk_polymat at;
- mlk_polyvec sp, pkpv, ep, b;
- mlk_poly v, k, epp;
- mlk_polyvec_mulcache sp_cache;
+ int ret = 0;
+ MLK_ALLOC(seed, uint8_t, MLKEM_SYMBYTES, context);
+ MLK_ALLOC(at, mlk_polymat, 1, context);
+ MLK_ALLOC(sp, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(ep, mlk_polyvec, 1, context);
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(k, mlk_poly, 1, context);
+ MLK_ALLOC(epp, mlk_poly, 1, context);
+ MLK_ALLOC(sp_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (seed == NULL || at == NULL || sp == NULL || pkpv == NULL || ep == NULL ||
+ b == NULL || v == NULL || k == NULL || epp == NULL || sp_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_unpack_pk(pkpv, seed, pk);
- mlk_poly_frommsg(&k, m);
+ mlk_poly_frommsg(k, m);
/*
* Declassify the public seed.
@@ -437,87 +508,105 @@ void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
mlk_gen_matrix(at, seed, 1 /* transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1122_4x(&sp[0], &sp[1], &ep[0], &ep[1], coins, 0, 1, 2,
- 3);
- mlk_poly_getnoise_eta2(&epp, coins, 4);
+ mlk_poly_getnoise_eta1122_4x(&sp->vec[0], &sp->vec[1], &ep->vec[0],
+ &ep->vec[1], coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2(epp, coins, 4);
#elif MLKEM_K == 3
/*
* In this call, only the first three output buffers are needed.
* The last parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &b[0], coins, 0, 1, 2,
- 0xFF);
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], NULL, coins,
+ 0, 1, 2, 0xFF /* irrelevant */);
/* The fourth output buffer in this call _is_ used. */
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &epp, coins, 3, 4, 5, 6);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], epp, coins,
+ 3, 4, 5, 6);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &sp[3], coins, 0, 1, 2, 3);
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &ep[3], coins, 4, 5, 6, 7);
- mlk_poly_getnoise_eta2(&epp, coins, 8);
-#endif
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], &sp->vec[3],
+ coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], &ep->vec[3],
+ coins, 4, 5, 6, 7);
+ mlk_poly_getnoise_eta2(epp, coins, 8);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(sp);
mlk_polyvec_mulcache_compute(sp_cache, sp);
mlk_matvec_mul(b, at, sp, sp_cache);
- mlk_polyvec_basemul_acc_montgomery_cached(&v, pkpv, sp, sp_cache);
+ mlk_polyvec_basemul_acc_montgomery_cached(v, pkpv, sp, sp_cache);
mlk_polyvec_invntt_tomont(b);
- mlk_poly_invntt_tomont(&v);
+ mlk_poly_invntt_tomont(v);
mlk_polyvec_add(b, ep);
- mlk_poly_add(&v, &epp);
- mlk_poly_add(&v, &k);
+ mlk_poly_add(v, epp);
+ mlk_poly_add(v, k);
mlk_polyvec_reduce(b);
- mlk_poly_reduce(&v);
+ mlk_poly_reduce(v);
- mlk_pack_ciphertext(c, b, &v);
+ mlk_pack_ciphertext(c, b, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(seed, sizeof(seed));
- mlk_zeroize(&sp, sizeof(sp));
- mlk_zeroize(&sp_cache, sizeof(sp_cache));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(at, sizeof(at));
- mlk_zeroize(&k, sizeof(k));
- mlk_zeroize(&ep, sizeof(ep));
- mlk_zeroize(&epp, sizeof(epp));
+ MLK_FREE(sp_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(epp, mlk_poly, 1, context);
+ MLK_FREE(k, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ MLK_FREE(ep, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(sp, mlk_polyvec, 1, context);
+ MLK_FREE(at, mlk_polymat, 1, context);
+ MLK_FREE(seed, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_dec()` in the reference implementation @[REF].
* - We use a mulcache for the scalar product.
* - We include buffer zeroization. */
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_polyvec b, skpv;
- mlk_poly v, sb;
- mlk_polyvec_mulcache b_cache;
+ int ret = 0;
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(sb, mlk_poly, 1, context);
+ MLK_ALLOC(b_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (b == NULL || skpv == NULL || v == NULL || sb == NULL || b_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- mlk_unpack_ciphertext(b, &v, c);
+ mlk_unpack_ciphertext(b, v, c);
mlk_unpack_sk(skpv, sk);
mlk_polyvec_ntt(b);
mlk_polyvec_mulcache_compute(b_cache, b);
- mlk_polyvec_basemul_acc_montgomery_cached(&sb, skpv, b, b_cache);
- mlk_poly_invntt_tomont(&sb);
+ mlk_polyvec_basemul_acc_montgomery_cached(sb, skpv, b, b_cache);
+ mlk_poly_invntt_tomont(sb);
- mlk_poly_sub(&v, &sb);
- mlk_poly_reduce(&v);
+ mlk_poly_sub(v, sb);
+ mlk_poly_reduce(v);
- mlk_poly_tomsg(m, &v);
+ mlk_poly_tomsg(m, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&b_cache, sizeof(b_cache));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(&sb, sizeof(sb));
+ MLK_FREE(b_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(sb, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
@@ -529,4 +618,5 @@ void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
#undef mlk_pack_ciphertext
#undef mlk_unpack_ciphertext
#undef mlk_matvec_mul
-#undef mlk_poly_permute_bitrev_to_custom
+#undef mlk_polyvec_permute_bitrev_to_custom
+#undef mlk_polymat_permute_bitrev_to_custom
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/indcpa.h
index 4c44d0d411..b31756dcb6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/indcpa.h
@@ -15,7 +15,6 @@
#ifndef MLK_INDCPA_H
#define MLK_INDCPA_H
-#include
#include "cbmc.h"
#include "common.h"
#include "poly_k.h"
@@ -39,18 +38,19 @@
*
**************************************************/
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
requires(transposed == 0 || transposed == 1)
- assigns(object_whole(a))
- ensures(forall(x, 0, MLKEM_K * MLKEM_K,
- array_bound(a[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
);
-#define mlk_indcpa_keypair_derand MLK_NAMESPACE_K(indcpa_keypair_derand)
+#define mlk_indcpa_keypair_derand \
+ MLK_NAMESPACE_K(indcpa_keypair_derand) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_keypair_derand
*
@@ -68,18 +68,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
-#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc)
+#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc) MLK_CONTEXT_PARAMETERS_4
/*************************************************
* Name: mlk_indcpa_enc
*
@@ -100,19 +105,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(c))
+ assigns(memory_slice(c, MLKEM_INDCPA_BYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
-#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec)
+#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_dec
*
@@ -130,14 +139,18 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
- assigns(object_whole(m))
+ assigns(memory_slice(m, MLKEM_INDCPA_MSGBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_INDCPA_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/kem.c
index d6f4e83628..3c82d6df70 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/kem.c
@@ -8,7 +8,8 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*
* - [FIPS203]
@@ -22,12 +23,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
-#include
+#include "kem.h"
#include "indcpa.h"
-#include "kem.h"
#include "randombytes.h"
#include "symmetric.h"
#include "verify.h"
@@ -36,44 +34,24 @@
* This is to facilitate building multiple instances
* of mlkem-native (e.g. with varying security levels)
* within a single compilation unit. */
-#define mlk_check_pk MLK_ADD_PARAM_SET(mlk_check_pk)
-#define mlk_check_sk MLK_ADD_PARAM_SET(mlk_check_sk)
-#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct)
+#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct) MLK_CONTEXT_PARAMETERS_2
/* End of parameter set namespacing */
-#if defined(CBMC)
-/* Redeclaration with contract needed for CBMC only */
-int memcmp(const void *str1, const void *str2, size_t n)
-__contract__(
- requires(memory_no_alias(str1, n))
- requires(memory_no_alias(str2, n))
-);
-#endif /* CBMC */
-
-/*************************************************
- * Name: mlk_check_pk
- *
- * Description: Implements modulus check mandated by FIPS 203,
- * i.e., ensures that coefficients are in [0,q-1].
- *
- * Arguments: - const uint8_t *pk: pointer to input public key
- * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
- *
- **************************************************/
-
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- mlk_polyvec p;
- uint8_t p_reencoded[MLKEM_POLYVECBYTES];
+ int ret = 0;
+ MLK_ALLOC(p, mlk_polyvec, 1, context);
+ MLK_ALLOC(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+
+ if (p == NULL || p_reencoded == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_polyvec_frombytes(p, pk);
mlk_polyvec_reduce(p);
@@ -81,39 +59,32 @@ static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
/* We use a constant-time memcmp here to avoid having to
* declassify the PK before the PCT has succeeded. */
- res = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? -1 : 0;
+ ret = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? MLK_ERR_FAIL : 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(p_reencoded, sizeof(p_reencoded));
- mlk_zeroize(&p, sizeof(p));
- return res;
+ MLK_FREE(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+ MLK_FREE(p, mlk_polyvec, 1, context);
+ return ret;
}
-/*************************************************
- * Name: mlk_check_sk
- *
- * Description: Implements public key hash check mandated by FIPS 203,
- * i.e., ensures that
- * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
- *
- * Arguments: - const uint8_t *sk: pointer to input private key
- * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
- *
- **************************************************/
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t test[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(test, uint8_t, MLKEM_SYMBYTES, context);
+
+ if (test == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
/*
* The parts of `sk` being hashed and compared here are public, so
* no public information is leaked through the runtime or the return value
@@ -128,23 +99,32 @@ static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
mlk_hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES,
MLKEM_INDCCA_PUBLICKEYBYTES);
- res = memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, test,
- MLKEM_SYMBYTES)
- ? -1
+ /* This doesn't have to be a constant-time memcmp, but it's the only place
+ * in the library where a normal memcmp would be used otherwise, so for sake
+ * of minimizing stdlib dependency, we use our constant-time one anyway. */
+ ret = mlk_ct_memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ test, MLKEM_SYMBYTES)
+ ? MLK_ERR_FAIL
: 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(test, sizeof(test));
- return res;
+ MLK_FREE(test, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES)));
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
+);
#if defined(MLK_CONFIG_KEYGEN_PCT)
/* Specification:
@@ -152,21 +132,30 @@ __contract__(
* @[FIPS203, Section 7.1, Pairwise Consistency]. */
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES];
- uint8_t ss_enc[MLKEM_SSBYTES], ss_dec[MLKEM_SSBYTES];
+ int ret = 0;
+ MLK_ALLOC(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ MLK_ALLOC(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_ALLOC(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+
+ if (ct == NULL || ss_enc == NULL || ss_dec == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- res = crypto_kem_enc(ct, ss_enc, pk);
- if (res != 0)
+ ret = mlk_kem_enc(ct, ss_enc, pk, context);
+ if (ret != 0)
{
goto cleanup;
}
- res = crypto_kem_dec(ss_dec, ct, sk);
- if (res != 0)
+ ret = mlk_kem_dec(ss_dec, ct, sk, context);
+ if (ret != 0)
{
goto cleanup;
}
@@ -179,26 +168,36 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
}
#endif /* MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST */
- res = mlk_ct_memcmp(ss_enc, ss_dec, sizeof(ss_dec));
+ ret = mlk_ct_memcmp(ss_enc, ss_dec, MLKEM_SSBYTES);
+ /* The result of the PCT is public. */
+ MLK_CT_TESTING_DECLASSIFY(&ret, sizeof(ret));
+
+ if (ret != 0)
+ {
+ ret = MLK_ERR_FAIL;
+ }
cleanup:
- /* The result of the PCT is public. */
- MLK_CT_TESTING_DECLASSIFY(&res, sizeof(res));
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(ct, sizeof(ct));
- mlk_zeroize(ss_enc, sizeof(ss_enc));
- mlk_zeroize(ss_dec, sizeof(ss_dec));
- return res;
+ MLK_FREE(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ return ret;
}
-#else /* MLK_CONFIG_KEYGEN_PCT */
+#else /* MLK_CONFIG_KEYGEN_PCT */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
/* Skip PCT */
((void)pk);
((void)sk);
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER)
+ ((void)context);
+#endif
return 0;
}
#endif /* !MLK_CONFIG_KEYGEN_PCT */
@@ -208,164 +207,240 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
* - We optionally include PCT which is not present in
* the reference code. */
MLK_EXTERNAL_API
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_indcpa_keypair_derand(pk, sk, coins);
- memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ int ret;
+
+ ret = mlk_indcpa_keypair_derand(pk, sk, coins, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
+ mlk_memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_h(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, pk,
MLKEM_INDCCA_PUBLICKEYBYTES);
/* Value z for pseudo-random output on reject */
- memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
/* Declassify public key */
MLK_CT_TESTING_DECLASSIFY(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
/* Pairwise Consistency Test (PCT) @[FIPS140_3_IG, p.87] */
- if (mlk_check_pct(pk, sk))
+ ret = mlk_check_pct(pk, sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- return 0;
+cleanup:
+ if (ret != 0)
+ {
+ mlk_zeroize(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ mlk_zeroize(sk, MLKEM_INDCCA_SECRETKEYBYTES);
+ }
+
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_keypair()` in the reference implementation @[REF]
* - We zeroize the stack buffer */
MLK_EXTERNAL_API
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Acquire necessary randomness, and mark it as secret. */
- mlk_randombytes(coins, 2 * MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (mlk_randombytes(coins, 2 * MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, 2 * MLKEM_SYMBYTES);
- res = crypto_kem_keypair_derand(pk, sk, coins);
+ ret = mlk_kem_keypair_derand(pk, sk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_enc_derand()` in the reference implementation @[REF]
* - We include public key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (buf == NULL || kr == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.2, Modulus check] */
- if (mlk_check_pk(pk))
+ ret = mlk_kem_check_pk(pk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- memcpy(buf, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(buf, coins, MLKEM_SYMBYTES);
/* Multitarget countermeasure for coins + contributory KEM */
mlk_hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
- memcpy(ss, kr, MLKEM_SYMBYTES);
+ mlk_memcpy(ss, kr, MLKEM_SYMBYTES);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
-
- return 0;
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_enc()` in the reference implementation @[REF]
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, MLKEM_SYMBYTES, context);
- mlk_randombytes(coins, MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ if (mlk_randombytes(coins, MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, MLKEM_SYMBYTES);
- res = crypto_kem_enc_derand(ct, ss, pk, coins);
+ ret = mlk_kem_enc_derand(ct, ss, pk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_dec()` in the reference implementation @[REF]
* - We include secret key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
+ int ret = 0;
uint8_t fail;
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
- MLK_ALIGN uint8_t tmp[MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES];
-
const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+
+ if (buf == NULL || kr == NULL || tmp == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.3, Hash check] */
- if (mlk_check_sk(sk))
+ ret = mlk_kem_check_sk(sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- mlk_indcpa_dec(buf, ct, sk);
+ ret = mlk_indcpa_dec(buf, ct, sk, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
/* Multitarget countermeasure for coins + contributory KEM */
- memcpy(buf + MLKEM_SYMBYTES,
- sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(buf + MLKEM_SYMBYTES,
+ sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* Recompute and compare ciphertext */
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
fail = mlk_ct_memcmp(ct, tmp, MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Compute rejection key */
- memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- MLKEM_SYMBYTES);
- memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
- mlk_hash_j(ss, tmp, sizeof(tmp));
+ mlk_memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
+ mlk_memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
+ mlk_hash_j(ss, tmp, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Copy true key to return buffer if fail is 0 */
mlk_ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
- mlk_zeroize(tmp, sizeof(tmp));
+ MLK_FREE(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
- return 0;
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef mlk_check_pk
-#undef mlk_check_sk
#undef mlk_check_pct
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/kem.h
index d3e5f50ce6..0502715c39 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/kem.h
@@ -10,12 +10,16 @@
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ * CRYSTALS-Kyber C reference implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/ref
*/
#ifndef MLK_KEM_H
#define MLK_KEM_H
-#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include "sys.h"
@@ -23,9 +27,7 @@
#if defined(MLK_CHECK_APIS)
/* Include to ensure consistency between internal kem.h
* and external mlkem_native.h. */
-#define MLK_CONFIG_API_NO_SUPERCOP
#include "mlkem_native.h"
-#undef MLK_CONFIG_API_NO_SUPERCOP
#if MLKEM_INDCCA_SECRETKEYBYTES != \
MLKEM_SECRETKEYBYTES(MLK_CONFIG_PARAMETER_SET)
@@ -44,14 +46,79 @@
#endif /* MLK_CHECK_APIS */
-#define crypto_kem_keypair_derand MLK_NAMESPACE_K(keypair_derand)
-#define crypto_kem_keypair MLK_NAMESPACE_K(keypair)
-#define crypto_kem_enc_derand MLK_NAMESPACE_K(enc_derand)
-#define crypto_kem_enc MLK_NAMESPACE_K(enc)
-#define crypto_kem_dec MLK_NAMESPACE_K(dec)
+#define mlk_kem_keypair_derand \
+ MLK_NAMESPACE_K(keypair_derand) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_keypair MLK_NAMESPACE_K(keypair) MLK_CONTEXT_PARAMETERS_2
+#define mlk_kem_enc_derand MLK_NAMESPACE_K(enc_derand) MLK_CONTEXT_PARAMETERS_4
+#define mlk_kem_enc MLK_NAMESPACE_K(enc) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_dec MLK_NAMESPACE_K(dec) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_check_pk MLK_NAMESPACE_K(check_pk) MLK_CONTEXT_PARAMETERS_1
+#define mlk_kem_check_sk MLK_NAMESPACE_K(check_sk) MLK_CONTEXT_PARAMETERS_1
+
+/*************************************************
+ * Name: mlk_kem_check_pk
+ *
+ * Description: Implements modulus check mandated by FIPS 203,
+ * i.e., ensures that coefficients are in [0,q-1].
+ *
+ * Arguments: - const uint8_t *pk: pointer to input public key
+ * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the modulus check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+
+/*************************************************
+ * Name: mlk_kem_check_sk
+ *
+ * Description: Implements public key hash check mandated by FIPS 203,
+ * i.e., ensures that
+ * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
+ *
+ * Arguments: - const uint8_t *sk: pointer to input private key
+ * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the public key hash check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
/*************************************************
- * Name: crypto_kem_keypair_derand
+ * Name: mlk_kem_keypair_derand
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -67,26 +134,33 @@
* random bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 16, ML-KEM.KeyGen_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_keypair
+ * Name: mlk_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -99,24 +173,32 @@ __contract__(
* bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
*
* Specification: Implements @[FIPS203, Algorithm 19, ML-KEM.KeyGen]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_enc_derand
+ * Name: mlk_kem_enc_derand
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -134,29 +216,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 17, ML-KEM.Encaps_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
/*************************************************
- * Name: crypto_kem_enc
+ * Name: mlk_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -171,27 +258,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
*
* Specification: Implements @[FIPS203, Algorithm 20, ML-KEM.Encaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_dec
+ * Name: mlk_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
@@ -206,22 +300,27 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'hash check' @[FIPS203, Section 7.3]
- * for the secret key fails.
+ * - MLK_ERR_FAIL: If the 'hash check' @[FIPS203, Section 7.3]
+ * for the secret key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 21, ML-KEM.Decaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(ss))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_KEM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/native/api.h
index aea28a3af4..0308f2bd51 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/native/api.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/native/api.h
@@ -17,10 +17,18 @@
* and run sanity checks.
*/
-#include <stdint.h>
#include "../cbmc.h"
#include "../common.h"
+/* Backends must return MLK_NATIVE_FUNC_SUCCESS upon success. */
+#define MLK_NATIVE_FUNC_SUCCESS (0)
+/* Backends may return MLK_NATIVE_FUNC_FALLBACK to signal to the frontend that
+ * the target/parameters are unsupported; typically, this would be because of
+ * dependencies on CPU features not detected on the host CPU. In this case,
+ * the frontend falls back to the default C implementation. */
+#define MLK_NATIVE_FUNC_FALLBACK (-1)
+
+
/* Absolute exclusive upper bound for the output of the inverse NTT
*
* NOTE: This is the same bound as in poly.h and has to be kept
@@ -74,12 +82,16 @@
*
* Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_ntt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_ntt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_NTT */
@@ -140,11 +152,14 @@ __contract__(
*
* Arguments: - uint16_t *a: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_intt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_intt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_INTT */
@@ -156,11 +171,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_reduce_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_reduce_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
@@ -173,11 +191,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_tomont_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tomont_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
@@ -203,13 +224,15 @@ __contract__(
* OUTPUT
* - cache: pointer to multiplication cache
**************************************************/
-static MLK_INLINE void mlk_poly_mulcache_compute_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_mulcache_compute_native(
int16_t cache[MLKEM_N / 2], const int16_t mlk_poly[MLKEM_N])
__contract__(
requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
- assigns(object_whole(cache))
- ensures(array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
+ assigns(memory_slice(cache, sizeof(int16_t) * (MLKEM_N / 2)))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
@@ -234,7 +257,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
__contract__(
@@ -244,6 +268,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 2 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 2 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
@@ -267,7 +292,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
__contract__(
@@ -277,6 +303,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 3 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 3 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
@@ -300,7 +327,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
__contract__(
@@ -310,6 +338,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 4 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 4 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
@@ -324,18 +353,20 @@ __contract__(
*
* Arguments: INPUT:
* - a: const pointer to input polynomial,
- * with each coefficient in the range -Q+1 .. Q-1
+ * with each coefficient in the range 0 .. Q-1
* OUTPUT
* - r: pointer to output byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+ const int16_t a[MLKEM_N])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
);
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
@@ -353,13 +384,15 @@ __contract__(
* - a: const pointer to input byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_frombytes_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_frombytes_native(
int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(a, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
);
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
@@ -381,6 +414,7 @@ __contract__(
* Otherwise, returns non-negative number of sampled 16-bit integers (at most
* len).
**************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
const uint8_t *buf,
unsigned buflen)
@@ -389,8 +423,10 @@ __contract__(
requires(memory_no_alias(r, sizeof(int16_t) * len))
requires(memory_no_alias(buf, buflen))
assigns(memory_slice(r, sizeof(int16_t) * len))
- ensures(return_value == -1 || (0 <= return_value && return_value <= len))
- ensures(return_value != -1 ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> (0 <= return_value && return_value <= len))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
@@ -408,8 +444,15 @@ __contract__(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d4_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d4_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
@@ -425,8 +468,15 @@ static MLK_INLINE void mlk_poly_compress_d4_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d10_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d10_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
@@ -444,8 +494,15 @@ static MLK_INLINE void mlk_poly_compress_d10_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d4_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d4_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
@@ -463,8 +520,15 @@ static MLK_INLINE void mlk_poly_decompress_d4_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d10_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d10_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
@@ -482,8 +546,15 @@ static MLK_INLINE void mlk_poly_decompress_d10_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d5_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d5_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
@@ -499,8 +570,15 @@ static MLK_INLINE void mlk_poly_compress_d5_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d11_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d11_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
@@ -518,8 +596,15 @@ static MLK_INLINE void mlk_poly_compress_d11_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d5_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d5_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
@@ -537,8 +622,15 @@ static MLK_INLINE void mlk_poly_decompress_d5_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d11_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d11_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/native/meta.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/native/meta.h
index f2b9b848b7..4291d629b1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/native/meta.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/native/meta.h
@@ -18,4 +18,8 @@
#include "x86_64/meta.h"
#endif
+#if defined(MLK_SYS_RISCV64_RVV)
+#include "riscv64/meta.h"
+#endif
+
#endif /* !MLK_NATIVE_META_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/params.h
index 3f81bb0e2e..04598539c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/params.h
@@ -5,12 +5,6 @@
#ifndef MLK_PARAMS_H
#define MLK_PARAMS_H
-#if defined(MLK_CONFIG_FILE)
-#include MLK_CONFIG_FILE
-#else
-#include "config.h"
-#endif
-
#if !defined(MLK_CONFIG_PARAMETER_SET)
#error MLK_CONFIG_PARAMETER_SET is not defined
#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly.c
index 40d29948c8..564d5d712b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly.c
@@ -20,8 +20,7 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
-#include
+
#include "cbmc.h"
#include "debug.h"
#include "poly.h"
@@ -29,9 +28,6 @@
#include "symmetric.h"
#include "verify.h"
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT) || \
- !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_fqmul
*
@@ -68,10 +64,7 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_TOMONT || !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE \
- || !MLK_USE_NATIVE_NTT || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_barrett_reduce
*
@@ -107,7 +100,7 @@ __contract__(
* Here, we assume it's sign-preserving "arithmetic" shift right.
* See (C99 6.5.7 (5))
*/
- const int32_t t = (magic * a + (1 << 25)) >> 26;
+ const int32_t t = (magic * a + ((int32_t)1 << 25)) >> 26;
/*
* t is in -10 .. +10, so we need 32-bit math to
@@ -118,12 +111,14 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q_HALF);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_REDUCE || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT)
/* Reference: `poly_tomont()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_tomont(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_tomont_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+)
{
unsigned i;
const int16_t f = 1353; /* check-magic: 1353 == signed_mod(2^32, MLKEM_Q) */
@@ -137,16 +132,23 @@ void mlk_poly_tomont(mlk_poly *r)
mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_TOMONT */
+
MLK_INTERNAL_API
void mlk_poly_tomont(mlk_poly *r)
{
- mlk_poly_tomont_native(r->coeffs);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_TOMONT)
+ int ret;
+ ret = mlk_poly_tomont_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE)
+ mlk_poly_tomont_c(r);
+}
+
/************************************************************
* Name: mlk_scalar_signed_to_unsigned_q
*
@@ -162,7 +164,7 @@ void mlk_poly_tomont(mlk_poly *r)
* - Used here to implement different semantics of `poly_reduce()`;
* see below. in the reference implementation @[REF], this logic is
* part of all compression functions (see `compress.c`). */
-static MLK_INLINE uint16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
+static MLK_INLINE int16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
__contract__(
requires(c > -MLKEM_Q && c < MLKEM_Q)
ensures(return_value >= 0 && return_value < MLKEM_Q)
@@ -170,12 +172,14 @@ __contract__(
{
mlk_assert_abs_bound(&c, 1, MLKEM_Q);
- /* Add Q if c is negative, but in constant time */
- c = mlk_ct_sel_int16(c + MLKEM_Q, c, mlk_ct_cmask_neg_i16(c));
+ /* Add MLKEM_Q if c is negative, but in constant time.
+ *
+ * Note that c + MLKEM_Q does not overflow in int16_t,
+ * so the cast to uint16_t is safe. */
+ c = mlk_ct_sel_int16((int16_t)(c + MLKEM_Q), c, mlk_ct_cmask_neg_i16(c));
- /* and therefore cast to uint16_t is safe. */
mlk_assert_bound(&c, 1, 0, MLKEM_Q);
- return (uint16_t)c;
+ return c;
}
/* Reference: `poly_reduce()` in the reference implementation @[REF]
@@ -185,10 +189,15 @@ __contract__(
* here to go from signed to unsigned representatives.
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
-MLK_INTERNAL_API
-void mlk_poly_reduce(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_reduce_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
+
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
@@ -202,15 +211,23 @@ void mlk_poly_reduce(mlk_poly *r)
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_REDUCE */
+
MLK_INTERNAL_API
void mlk_poly_reduce(mlk_poly *r)
{
- mlk_poly_reduce_native(r->coeffs);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_REDUCE)
+ int ret;
+ ret = mlk_poly_reduce_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
+ mlk_poly_reduce_c(r);
+}
+
/* Reference: `poly_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
@@ -224,7 +241,8 @@ void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] + b->coeffs[i];
+ /* The preconditions imply that the addition stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] + b->coeffs[i]);
}
}
@@ -241,24 +259,24 @@ void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
+ /* The preconditions imply that the subtraction stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] - b->coeffs[i]);
}
}
-/* Include zeta table unless NTT, invNTT and mulcache computation
- * have been replaced by native implementations. */
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
#include "zetas.inc"
-#endif
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
-MLK_INTERNAL_API
-void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_mulcache_compute_c(mlk_poly_mulcache *x,
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 4; i++)
@@ -266,8 +284,11 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
invariant(i <= MLKEM_N / 4)
invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
{
- x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
- x->coeffs[2 * i + 1] = mlk_fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
+ x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], mlk_zetas[64 + i]);
+ /* The values in zeta table are <= MLKEM_Q in absolute value,
+ * so the negation in int16_t is safe. */
+ x->coeffs[2 * i + 1] =
+ mlk_fqmul(a->coeffs[4 * i + 3], (int16_t)(-mlk_zetas[64 + i]));
}
/*
@@ -278,15 +299,22 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
*/
mlk_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
MLK_INTERNAL_API
void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
{
- mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
-}
+#if defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+ int ret;
+ ret = mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-#if !defined(MLK_USE_NATIVE_NTT)
+ mlk_poly_mulcache_compute_c(x, a);
+}
+
/*
* Computes a block CT butterflies with a fixed twiddle factor,
* using Montgomery multiplication.
@@ -316,7 +344,8 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
static void mlk_ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
- unsigned start, unsigned len, int bound)
+ unsigned start, unsigned len,
+ unsigned bound)
__contract__(
requires(start < MLKEM_N)
requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
@@ -346,8 +375,9 @@ __contract__(
{
int16_t t;
t = mlk_fqmul(r[j + len], zeta);
- r[j + len] = r[j] - t;
- r[j] = r[j] + t;
+ /* The precondition implies that the arithmetic does not overflow. */
+ r[j + len] = (int16_t)(r[j] - t);
+ r[j] = (int16_t)(r[j] + t);
}
}
@@ -370,7 +400,7 @@ __contract__(
unsigned start, k, len;
/* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */
k = 1u << (layer - 1);
- len = MLKEM_N >> layer;
+ len = (unsigned)MLKEM_N >> layer;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
invariant(start < MLKEM_N + 2 * len)
@@ -378,7 +408,7 @@ __contract__(
invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
{
- int16_t zeta = zetas[k++];
+ int16_t zeta = mlk_zetas[k++];
mlk_ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
}
}
@@ -395,12 +425,19 @@ __contract__(
/* Reference: `ntt()` in the reference implementation @[REF].
* - Iterate over `layer` instead of `len` in the outer loop
* to simplify computation of zeta index. */
-MLK_INTERNAL_API
-void mlk_poly_ntt(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_ntt_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ requires(array_abs_bound(p->coeffs, 0, MLKEM_N, MLKEM_Q))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_NTT_BOUND))
+)
{
unsigned layer;
int16_t *r;
+
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+
r = p->coeffs;
for (layer = 1; layer <= 7; layer++)
@@ -414,18 +451,24 @@ void mlk_poly_ntt(mlk_poly *p)
/* Check the stronger bound */
mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_NTT */
MLK_INTERNAL_API
void mlk_poly_ntt(mlk_poly *p)
{
+#if defined(MLK_USE_NATIVE_NTT)
+ int ret;
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
- mlk_ntt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
-}
+ ret = mlk_ntt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_NTT */
-#if !defined(MLK_USE_NATIVE_INTT)
+ mlk_poly_ntt_c(p);
+}
+
/* Compute one layer of inverse NTT */
@@ -439,7 +482,7 @@ __contract__(
ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
unsigned start, k, len;
- len = (MLKEM_N >> layer);
+ len = (unsigned)MLKEM_N >> layer;
k = (1u << layer) - 1;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
@@ -449,7 +492,7 @@ __contract__(
invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
{
unsigned j;
- int16_t zeta = zetas[k--];
+ int16_t zeta = mlk_zetas[k--];
for (j = start; j < start + len; j++)
__loop__(
invariant(start <= j && j <= start + len)
@@ -457,8 +500,9 @@ __contract__(
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
int16_t t = r[j];
- r[j] = mlk_barrett_reduce(t + r[j + len]);
- r[j + len] = r[j + len] - t;
+ /* The preconditions imply that the arithmetic does not overflow. */
+ r[j] = mlk_barrett_reduce((int16_t)(t + r[j + len]));
+ r[j + len] = (int16_t)(r[j + len] - t);
r[j + len] = mlk_fqmul(r[j + len], zeta);
}
}
@@ -469,18 +513,22 @@ __contract__(
* while the reference implementation normalizes at
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
-MLK_INTERNAL_API
-void mlk_poly_invntt_tomont(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_invntt_tomont_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
+)
{
+ unsigned j, layer;
+ const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
+ int16_t *r = p->coeffs;
+
/*
* Scale input polynomial to account for Montgomery factor
* and NTT twist. This also brings coefficients down to
* absolute value < MLKEM_Q.
*/
- unsigned j, layer;
- const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
- int16_t *r = p->coeffs;
-
for (j = 0; j < MLKEM_N; j++)
__loop__(
invariant(j <= MLKEM_N)
@@ -500,16 +548,23 @@ void mlk_poly_invntt_tomont(mlk_poly *p)
mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_INTT */
MLK_INTERNAL_API
void mlk_poly_invntt_tomont(mlk_poly *p)
{
- mlk_intt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
-}
+#if defined(MLK_USE_NATIVE_INTT)
+ int ret;
+ ret = mlk_intt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_INTT */
+ mlk_poly_invntt_tomont_c(p);
+}
+
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(mlk_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly.h
index 20fb65e720..587062cce5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly.h
@@ -15,8 +15,7 @@
#ifndef MLK_POLY_H
#define MLK_POLY_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -46,34 +45,6 @@ typedef struct
int16_t coeffs[MLKEM_N >> 1];
} MLK_ALIGN mlk_poly_mulcache;
-/*************************************************
- * Name: mlk_cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- * input x in 0 .. 32767: returns value unchanged
- * input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
-{
- /*
- * PORTABILITY: This relies on uint16_t -> int16_t
- * being implemented as the inverse of int16_t -> uint16_t,
- * which is implementation-defined (C99 6.3.1.3 (3))
- * CBMC (correctly) fails to prove this conversion is OK,
- * so we have to suppress that check here
- */
- return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_montgomery_reduce
*
@@ -90,7 +61,7 @@ static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
static MLK_ALWAYS_INLINE int16_t mlk_montgomery_reduce(int32_t a)
__contract__(
requires(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
- a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
+ a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
/* We don't attempt to express an input-dependent output bound
* as the post-condition here. There are two call-sites for this
* function:
@@ -102,8 +73,8 @@ __contract__(
/* check-magic: 62209 == unsigned_mod(pow(MLKEM_Q, -1, 2^16), 2^16) */
const uint32_t QINV = 62209;
- /* Compute a*q^{-1} mod 2^16 in unsigned representatives */
- const uint16_t a_reduced = a & UINT16_MAX;
+ /* Compute a*q^{-1} mod 2^16 in unsigned representatives. */
+ const uint16_t a_reduced = mlk_cast_int32_to_uint16(a);
const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
/* Lift to signed canonical representative mod 2^16. */
@@ -187,7 +158,7 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_poly)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
);
#define mlk_poly_reduce MLK_NAMESPACE(poly_reduce)
@@ -280,7 +251,7 @@ __contract__(
requires(forall(k0, 0, MLKEM_N, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
requires(forall(k1, 0, MLKEM_N, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
ensures(forall(k, 0, MLKEM_N, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_poly_ntt MLK_NAMESPACE(poly_ntt)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly_k.c
index f15ab96ce7..32b214ee04 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly_k.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly_k.c
@@ -22,12 +22,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
+#include "poly_k.h"
-#include "compress.h"
#include "debug.h"
-#include "poly_k.h"
#include "sampling.h"
#include "symmetric.h"
@@ -37,6 +34,8 @@
* within a single compilation unit. */
#define mlk_poly_cbd_eta1 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta1)
#define mlk_poly_cbd_eta2 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta2)
+#define mlk_polyvec_basemul_acc_montgomery_cached_c \
+ MLK_ADD_PARAM_SET(mlk_polyvec_basemul_acc_montgomery_cached_c)
/* End of parameter set namespacing */
/* Reference: `polyvec_compress()` in the reference implementation @[REF]
@@ -46,29 +45,29 @@
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a[i]);
+ mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
}
}
/* Reference: `polyvec_decompress()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_decompress_du(&r[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+ mlk_poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_tobytes()` in the reference implementation @[REF].
@@ -77,41 +76,45 @@ void mlk_polyvec_decompress_du(mlk_polyvec r,
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, MLKEM_POLYVECBYTES))
+ invariant(i <= MLKEM_K)
+ )
{
- mlk_poly_tobytes(r + i * MLKEM_POLYBYTES, &a[i]);
+ mlk_poly_tobytes(&r[i * MLKEM_POLYBYTES], &a->vec[i]);
}
}
/* Reference: `polyvec_frombytes()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_frombytes(&r[i], a + i * MLKEM_POLYBYTES);
+ mlk_poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
/* Reference: `polyvec_ntt()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_ntt(&r[i]);
+ mlk_poly_ntt(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
}
/* Reference: `polyvec_invntt_tomont()` in the reference implementation @[REF].
@@ -120,18 +123,17 @@ void mlk_polyvec_ntt(mlk_polyvec r)
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_invntt_tomont(&r[i]);
+ mlk_poly_invntt_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
}
-#if !defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
/* Reference: `polyvec_basemul_acc_montgomery()` in the
* reference implementation @[REF].
* - We use a multiplication cache ('mulcache') here
@@ -143,13 +145,22 @@ void mlk_polyvec_invntt_tomont(mlk_polyvec r)
* at the end. The reference implementation uses 2 * MLKEM_K
* more modular reductions since it reduces after every modular
* multiplication. */
-MLK_INTERNAL_API
-void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+MLK_STATIC_TESTABLE void mlk_polyvec_basemul_acc_montgomery_cached_c(
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
+ requires(forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
@@ -163,53 +174,59 @@ void mlk_polyvec_basemul_acc_montgomery_cached(
t[1] <= ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
t[1] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768)))
{
- t[0] += (int32_t)a[k].coeffs[2 * i + 1] * b_cache[k].coeffs[i];
- t[0] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i];
- t[1] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i + 1];
- t[1] += (int32_t)a[k].coeffs[2 * i + 1] * b[k].coeffs[2 * i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b_cache->vec[k].coeffs[i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i + 1];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b->vec[k].coeffs[2 * i];
}
r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t[0]);
r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t[1]);
}
}
-#else /* !MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
{
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
- /* Omitting bounds assertion for cache since native implementations may
- * decide not to use a mulcache. Note that the C backend implementation
- * of poly_basemul_montgomery_cached() does still include the check. */
+#if defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+ {
+ int ret;
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
#if MLKEM_K == 2
- mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 3
- mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 4
- mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#endif
-}
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
+ }
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+ mlk_polyvec_basemul_acc_montgomery_cached_c(r, a, b, b_cache);
+}
+
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_mulcache_compute(&x[i], &a[i]);
+ mlk_poly_mulcache_compute(&x->vec[i], &a->vec[i]);
}
}
@@ -221,41 +238,53 @@ void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_reduce(&r[i]);
+ mlk_poly_reduce(&r->vec[i]);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(j0, i, MLKEM_K,
+ forall(k0, 0, MLKEM_N,
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX) &&
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] >= INT16_MIN))))
+ invariant(forall(j2, 0, i,
+ forall(k2, 0, MLKEM_N,
+ (r->vec[j2].coeffs[k2] <= INT16_MAX) &&
+ (r->vec[j2].coeffs[k2] >= INT16_MIN))))
+ )
{
- mlk_poly_add(&r[i], &b[i]);
+ mlk_poly_add(&r->vec[i], &b->vec[i]);
}
}
/* Reference: `polyvec_tomont()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_tomont(&r[i]);
+ mlk_poly_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLKEM_Q);
}
@@ -306,24 +335,41 @@ void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
{
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
+#else
+ mlk_prf_eta1(buf[0], extkey[0]);
+ mlk_prf_eta1(buf[1], extkey[1]);
+ mlk_prf_eta1(buf[2], extkey[2]);
+ if (r3 != NULL)
+ {
+ mlk_prf_eta1(buf[3], extkey[3]);
+ }
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
+
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
mlk_poly_cbd_eta1(r2, buf[2]);
- mlk_poly_cbd_eta1(r3, buf[3]);
+ if (r3 != NULL)
+ {
+ mlk_poly_cbd_eta1(r3, buf[3]);
+ mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+ }
mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
- mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -364,7 +410,7 @@ __contract__(
#endif
}
-/* Reference: `poly_getnoise_eta1()` in the reference implementation @[REF].
+/* Reference: `poly_getnoise_eta2()` in the reference implementation @[REF].
* - We include buffer zeroization. */
MLK_INTERNAL_API
void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
@@ -373,13 +419,13 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
MLK_ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
MLK_ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
- memcpy(extkey, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey, seed, MLKEM_SYMBYTES);
extkey[MLKEM_SYMBYTES] = nonce;
mlk_prf_eta2(buf, extkey);
mlk_poly_cbd_eta2(r, buf);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -391,7 +437,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
#if MLKEM_K == 2
/* Reference: Does not exist in the reference implementation @[REF].
* - This implements a x4-batched version of `poly_getnoise_eta1()`
- * and `poly_getnoise_eta1()` from the reference implementation,
+ * and `poly_getnoise_eta2()` from the reference implementation,
* leveraging batched Keccak-f1600.
* - If a x4-batched Keccak-f1600 is available, we squeeze
* more random data than needed for the eta2 calls, to be
@@ -409,10 +455,10 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
@@ -421,14 +467,16 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
/* On systems with fast batched Keccak, we use 4-fold batched PRF,
* even though that means generating more random data in buf[2] and buf[3]
* than necessary. */
-#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION)
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
#else
mlk_prf_eta1(buf[0], extkey[0]);
mlk_prf_eta1(buf[1], extkey[1]);
mlk_prf_eta2(buf[2], extkey[2]);
mlk_prf_eta2(buf[3], extkey[3]);
-#endif /* FIPS202_X4_DEFAULT_IMPLEMENTATION */
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
@@ -451,3 +499,4 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef mlk_poly_cbd_eta1
#undef mlk_poly_cbd_eta2
+#undef mlk_polyvec_basemul_acc_montgomery_cached_c
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly_k.h
index f7a40ff5f9..9089a8e431 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly_k.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/poly_k.h
@@ -15,7 +15,6 @@
#ifndef MLK_POLY_K_H
#define MLK_POLY_K_H
-#include
#include "common.h"
#include "compress.h"
#include "poly.h"
@@ -29,9 +28,20 @@
#define mlk_polyvec_mulcache MLK_ADD_PARAM_SET(mlk_polyvec_mulcache)
/* End of parameter set namespacing */
-typedef mlk_poly mlk_polyvec[MLKEM_K];
-typedef mlk_poly mlk_polymat[MLKEM_K * MLKEM_K];
-typedef mlk_poly_mulcache mlk_polyvec_mulcache[MLKEM_K];
+typedef struct
+{
+ mlk_poly vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec;
+
+typedef struct
+{
+ mlk_polyvec vec[MLKEM_K];
+} MLK_ALIGN mlk_polymat;
+
+typedef struct
+{
+ mlk_poly_mulcache vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec_mulcache;
#define mlk_poly_compress_du MLK_NAMESPACE_K(poly_compress_du)
/*************************************************
@@ -131,7 +141,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r)))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DV)))
{
#if MLKEM_DV == 4
mlk_poly_compress_d4(r, a);
@@ -168,7 +178,7 @@ static MLK_INLINE void mlk_poly_decompress_dv(
__contract__(
requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
{
#if MLKEM_DV == 4
@@ -200,13 +210,13 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
);
#define mlk_polyvec_decompress_du MLK_NAMESPACE_K(polyvec_decompress_du)
@@ -228,14 +238,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
__contract__(
requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_tobytes MLK_NAMESPACE_K(polyvec_tobytes)
@@ -256,13 +266,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECBYTES))
);
#define mlk_polyvec_frombytes MLK_NAMESPACE_K(polyvec_frombytes)
@@ -284,13 +294,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
);
#define mlk_polyvec_ntt MLK_NAMESPACE_K(polyvec_ntt)
@@ -313,14 +323,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
- assigns(object_whole(r))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
);
#define mlk_polyvec_invntt_tomont MLK_NAMESPACE_K(polyvec_invntt_tomont)
@@ -344,12 +354,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
);
#define mlk_polyvec_basemul_acc_montgomery_cached \
@@ -380,16 +390,16 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
requires(forall(k1, 0, MLKEM_K,
- array_bound(a[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(r))
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_polyvec_mulcache_compute MLK_NAMESPACE_K(polyvec_mulcache_compute)
@@ -423,11 +433,11 @@ __contract__(
* higher level safety proofs, and thus not part of the spec.
*/
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_polyvec_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_polyvec_mulcache)))
);
#define mlk_polyvec_reduce MLK_NAMESPACE_K(polyvec_reduce)
@@ -436,7 +446,7 @@ __contract__(
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials;
- * for details of the Barrett reduction see comments in reduce.c
+ * for details of the Barrett reduction see comments in poly.c
*
* Arguments: - mlk_polyvec r: pointer to input/output polynomial
*
@@ -453,12 +463,12 @@ __contract__(
* use of mlk_poly_reduce() in the context of (de)serialization.
*/
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_add MLK_NAMESPACE_K(polyvec_add)
@@ -485,17 +495,17 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(forall(j0, 0, MLKEM_K,
forall(k0, 0, MLKEM_N,
- (int32_t)r[j0].coeffs[k0] + b[j0].coeffs[k0] <= INT16_MAX)))
+ (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
requires(forall(j1, 0, MLKEM_K,
forall(k1, 0, MLKEM_N,
- (int32_t)r[j1].coeffs[k1] + b[j1].coeffs[k1] >= INT16_MIN)))
- assigns(object_whole(r))
+ (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
);
#define mlk_polyvec_tomont MLK_NAMESPACE_K(polyvec_tomont)
@@ -514,13 +524,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
assigns(memory_slice(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
);
#define mlk_poly_getnoise_eta1_4x MLK_NAMESPACE_K(poly_getnoise_eta1_4x)
@@ -531,7 +540,8 @@ __contract__(
* and nonces, with output polynomials close to centered binomial
* distribution with parameter MLKEM_ETA1.
*
- * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial
+ * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial. The last
+ * polynomial pointer may be NULL.
* - const uint8_t *seed: pointer to input seed
* (of length MLKEM_SYMBYTES bytes)
* - uint8_t nonce{0,1,2,3}: one-byte input nonce
@@ -555,16 +565,15 @@ __contract__(
requires(memory_no_alias(r0, sizeof(mlk_poly)))
requires(memory_no_alias(r1, sizeof(mlk_poly)))
requires(memory_no_alias(r2, sizeof(mlk_poly)))
- requires(memory_no_alias(r3, sizeof(mlk_poly)))
+ requires(r3 == NULL || memory_no_alias(r3, sizeof(mlk_poly)))
assigns(memory_slice(r0, sizeof(mlk_poly)))
assigns(memory_slice(r1, sizeof(mlk_poly)))
assigns(memory_slice(r2, sizeof(mlk_poly)))
- assigns(memory_slice(r3, sizeof(mlk_poly)))
- ensures(
- array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+ assigns(r3 != NULL: memory_slice(r3, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(r3 != NULL ==> array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
);
#if MLKEM_ETA1 == MLKEM_ETA2
@@ -604,7 +613,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
@@ -640,15 +649,19 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
uint8_t nonce0, uint8_t nonce1,
uint8_t nonce2, uint8_t nonce3)
__contract__(
- requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(mlk_poly)) && memory_no_alias(r2, 2 * sizeof(mlk_poly)) &&
- r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+ requires(memory_no_alias(r0, sizeof(mlk_poly)))
+ requires(memory_no_alias(r1, sizeof(mlk_poly)))
+ requires(memory_no_alias(r2, sizeof(mlk_poly)))
+ requires(memory_no_alias(r3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+ assigns(memory_slice(r0, sizeof(mlk_poly)))
+ assigns(memory_slice(r1, sizeof(mlk_poly)))
+ assigns(memory_slice(r2, sizeof(mlk_poly)))
+ assigns(memory_slice(r3, sizeof(mlk_poly)))
ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+ && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+ && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+ && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/randombytes.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/randombytes.h
index 132d920afb..3e841d26ca 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/randombytes.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/randombytes.h
@@ -5,18 +5,56 @@
#ifndef MLK_RANDOMBYTES_H
#define MLK_RANDOMBYTES_H
-#include
-#include
#include "cbmc.h"
#include "common.h"
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
#if !defined(MLK_CONFIG_CUSTOM_RANDOMBYTES)
-void randombytes(uint8_t *out, size_t outlen);
-static MLK_INLINE void mlk_randombytes(uint8_t *out, size_t outlen)
+/*************************************************
+ * Name: randombytes
+ *
+ * Description: Fill a buffer with cryptographically secure random bytes.
+ *
+ * mlkem-native does not provide an implementation of this
+ * function. It must be provided by the consumer.
+ *
+ * To use a custom random byte source with a different name
+ * or signature, set MLK_CONFIG_CUSTOM_RANDOMBYTES and define
+ * mlk_randombytes directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+int randombytes(uint8_t *out, size_t outlen);
+
+/*************************************************
+ * Name: mlk_randombytes
+ *
+ * Description: Internal wrapper around randombytes().
+ *
+ * Fill a buffer with cryptographically secure random bytes.
+ *
+ * This function can be replaced by setting
+ * MLK_CONFIG_CUSTOM_RANDOMBYTES and defining mlk_randombytes
+ * directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_randombytes(uint8_t *out, size_t outlen)
__contract__(
requires(memory_no_alias(out, outlen))
- assigns(memory_slice(out, outlen))) { randombytes(out, outlen); }
+ assigns(memory_slice(out, outlen))) { return randombytes(out, outlen); }
#endif /* !MLK_CONFIG_CUSTOM_RANDOMBYTES */
-
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
#endif /* !MLK_RANDOMBYTES_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sampling.c
index be5d931a79..945d12ed3d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sampling.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sampling.c
@@ -29,9 +29,10 @@
* in that it adds the offset and always expects the base of the
* target buffer. This avoids shifting the buffer base in the
* caller, which appears tricky to reason about. */
-static unsigned mlk_rej_uniform_scalar(int16_t *r, unsigned target,
- unsigned offset, const uint8_t *buf,
- unsigned buflen)
+MLK_STATIC_TESTABLE unsigned mlk_rej_uniform_c(int16_t *r, unsigned target,
+ unsigned offset,
+ const uint8_t *buf,
+ unsigned buflen)
__contract__(
requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
requires(memory_no_alias(r, sizeof(int16_t) * target))
@@ -39,11 +40,10 @@ __contract__(
requires(array_bound(r, 0, offset, 0, MLKEM_Q))
assigns(memory_slice(r, sizeof(int16_t) * target))
ensures(offset <= return_value && return_value <= target)
- ensures(array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
+ ensures(array_bound(r, 0, return_value, 0, MLKEM_Q)))
{
unsigned ctr, pos;
- uint16_t val0, val1;
+ int16_t val0, val1;
mlk_assert_bound(r, offset, 0, MLKEM_Q);
@@ -55,8 +55,8 @@ __contract__(
invariant(offset <= ctr && ctr <= target && pos <= buflen)
invariant(array_bound(r, 0, ctr, 0, MLKEM_Q)))
{
- val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
- val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+ val0 = ((buf[pos + 0] >> 0) | (buf[pos + 1] << 8)) & 0xFFF;
+ val1 = ((buf[pos + 1] >> 4) | (buf[pos + 2] << 4)) & 0xFFF;
pos += 3;
if (val0 < MLKEM_Q)
@@ -93,7 +93,7 @@ __contract__(
* Must be a multiple of 3.
*
* Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 128 is somewhat arbitrary but sufficient for all
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
* uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
*
* Returns the new offset of sampled 16-bit integers, at most target,
@@ -124,8 +124,9 @@ __contract__(
#if defined(MLK_USE_NATIVE_REJ_UNIFORM)
if (offset == 0)
{
- int ret = mlk_rej_uniform_native(r, target, buf, buflen);
- if (ret != -1)
+ int ret;
+ ret = mlk_rej_uniform_native(r, target, buf, buflen);
+ if (ret != MLK_NATIVE_FUNC_FALLBACK)
{
unsigned res = (unsigned)ret;
mlk_assert_bound(r, res, 0, MLKEM_Q);
@@ -134,19 +135,22 @@ __contract__(
}
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
- return mlk_rej_uniform_scalar(r, target, offset, buf, buflen);
+ return mlk_rej_uniform_c(r, target, offset, buf, buflen);
}
#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
- ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + MLK_XOF_RATE) / MLK_XOF_RATE)
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+ ((12 * MLKEM_N / 8 * ((uint32_t)1 << 12) / MLKEM_Q + MLK_XOF_RATE) / \
+ MLK_XOF_RATE)
#endif
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Reference: Does not exist in the reference implementation @[REF].
* - x4-batched version of `rej_uniform()` from the
* reference implementation, leveraging x4-batched Keccak-f1600. */
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
{
/* Temporary buffers for XOF output before rejection sampling */
@@ -167,10 +171,10 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
*/
mlk_xof_x4_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &statex);
buflen = MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE;
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, 0, buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, 0, buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, 0, buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, 0, buf[3], buflen);
/*
* So long as not all matrix entries have been generated, squeeze
@@ -180,20 +184,24 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
ctr[3] < MLKEM_N)
__loop__(
- assigns(ctr, statex, memory_slice(vec, sizeof(mlk_poly) * 4), object_whole(buf[0]),
- object_whole(buf[1]), object_whole(buf[2]), object_whole(buf[3]))
+ assigns(ctr, statex,
+ memory_slice(vec0, sizeof(mlk_poly)),
+ memory_slice(vec1, sizeof(mlk_poly)),
+ memory_slice(vec2, sizeof(mlk_poly)),
+ memory_slice(vec3, sizeof(mlk_poly)),
+ object_whole(buf))
invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
- invariant(array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
- invariant(array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
- invariant(array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
- invariant(array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+ invariant(array_bound(vec0->coeffs, 0, ctr[0], 0, MLKEM_Q))
+ invariant(array_bound(vec1->coeffs, 0, ctr[1], 0, MLKEM_Q))
+ invariant(array_bound(vec2->coeffs, 0, ctr[2], 0, MLKEM_Q))
+ invariant(array_bound(vec3->coeffs, 0, ctr[3], 0, MLKEM_Q)))
{
mlk_xof_x4_squeezeblocks(buf, 1, &statex);
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, ctr[0], buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, ctr[1], buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, ctr[2], buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, ctr[3], buf[3], buflen);
}
mlk_xof_x4_release(&statex);
@@ -202,6 +210,7 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
}
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
MLK_INTERNAL_API
void mlk_poly_rej_uniform(mlk_poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
@@ -284,7 +293,7 @@ void mlk_poly_cbd2(mlk_poly *r, const uint8_t buf[2 * MLKEM_N / 4])
{
const int16_t a = (d >> (4 * j + 0)) & 0x3;
const int16_t b = (d >> (4 * j + 2)) & 0x3;
- r->coeffs[8 * i + j] = a - b;
+ r->coeffs[8 * i + j] = (int16_t)(a - b);
}
}
}
@@ -336,7 +345,7 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4])
{
const int16_t a = (d >> (6 * j + 0)) & 0x7;
const int16_t b = (d >> (6 * j + 3)) & 0x7;
- r->coeffs[4 * i + j] = a - b;
+ r->coeffs[4 * i + j] = (int16_t)(a - b);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sampling.h
index 2cf43c889b..24c26b34a5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sampling.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sampling.h
@@ -15,8 +15,6 @@
#ifndef MLK_SAMPLING_H
#define MLK_SAMPLING_H
-#include
-#include
#include "cbmc.h"
#include "common.h"
#include "poly.h"
@@ -58,6 +56,7 @@ MLK_INTERNAL_API
void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_ETA1 == 3 */
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#define mlk_poly_rej_uniform_x4 MLK_NAMESPACE(poly_rej_uniform_x4)
/*************************************************
* Name: mlk_poly_rej_uniform_x4
@@ -65,8 +64,8 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
* Description: Generate four polynomials using rejection sampling
* on (pseudo-)uniformly random bytes sampled from a seed.
*
- * Arguments: - mlk_poly *vec:
- * Pointer to an array of 4 polynomials to be sampled.
+ * Arguments: - mlk_poly *vec0, *vec1, *vec2, *vec3:
+ * Pointers to 4 polynomials to be sampled.
* - uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)]:
* Pointer consecutive array of seed buffers of size
* MLKEM_SYMBYTES + 2 each, plus padding for alignment.
@@ -75,16 +74,24 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
*
**************************************************/
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
__contract__(
- requires(memory_no_alias(vec, sizeof(mlk_poly) * 4))
+ requires(memory_no_alias(vec0, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec1, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec2, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, 4 * MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)))
- assigns(memory_slice(vec, sizeof(mlk_poly) * 4))
- ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+ assigns(memory_slice(vec0, sizeof(mlk_poly)))
+ assigns(memory_slice(vec1, sizeof(mlk_poly)))
+ assigns(memory_slice(vec2, sizeof(mlk_poly)))
+ assigns(memory_slice(vec3, sizeof(mlk_poly)))
+ ensures(array_bound(vec0->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec1->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec2->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec3->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
#define mlk_poly_rej_uniform MLK_NAMESPACE(poly_rej_uniform)
/*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/symmetric.h
index 985bfeab37..68d7e1a0cd 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/symmetric.h
@@ -15,12 +15,13 @@
#ifndef MLK_SYMMETRIC_H
#define MLK_SYMMETRIC_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include MLK_FIPS202_HEADER_FILE
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#include MLK_FIPS202X4_HEADER_FILE
+#endif
/* Macros denoting FIPS 203 specific Hash functions */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sys.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sys.h
index 8f690cc553..0ab8947318 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sys.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/sys.h
@@ -20,6 +20,15 @@
#error "__BYTE_ORDER__ defined, but don't recognize value."
#endif
#endif /* __BYTE_ORDER__ */
+
+/* MSVC does not define __BYTE_ORDER__. However, MSVC only supports
+ * little endian x86, x86_64, and AArch64. It is, hence, safe to assume
+ * little endian. */
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || \
+ defined(_M_IX86) || defined(_M_ARM64))
+#define MLK_SYS_LITTLE_ENDIAN
+#endif
+
#endif /* !MLK_SYS_LITTLE_ENDIAN && !MLK_SYS_BIG_ENDIAN */
/* Check if we're running on an AArch64 little endian system. _M_ARM64 is set by
@@ -33,6 +42,11 @@
#define MLK_SYS_AARCH64_EB
#endif
+/* Check if we're running on an Armv8.1-M system with MVE */
+#if defined(__ARM_ARCH_8_1M_MAIN__) || defined(__ARM_FEATURE_MVE)
+#define MLK_SYS_ARMV81M_MVE
+#endif
+
#if defined(__x86_64__)
#define MLK_SYS_X86_64
#if defined(__AVX2__)
@@ -48,6 +62,11 @@
#define MLK_SYS_RISCV64
#endif
+#if defined(MLK_SYS_RISCV64) && defined(__riscv_vector) && \
+ defined(__riscv_v_intrinsic)
+#define MLK_SYS_RISCV64_RVV
+#endif
+
#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 32
#define MLK_SYS_RISCV32
#endif
@@ -56,6 +75,14 @@
#define MLK_SYS_WINDOWS
#endif
+#if defined(__linux__)
+#define MLK_SYS_LINUX
+#endif
+
+#if defined(__APPLE__)
+#define MLK_SYS_APPLE
+#endif
+
#if defined(MLK_FORCE_AARCH64) && !defined(MLK_SYS_AARCH64)
#error "MLK_FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
#endif
@@ -82,34 +109,46 @@
#endif
/*
- * C90 does not have the inline compiler directive yet.
- * We don't use it in C90 builds.
- * However, in that case the compiler warns about some inline functions in
- * header files not being used in every compilation unit that includes that
- * header. To work around it we silence that warning in that case using
- * __attribute__((unused)).
+ * MLK_INLINE: Hint for inlining.
+ * - MSVC: __inline
+ * - C99+: inline
+ * - GCC/Clang C90: __attribute__((unused)) to silence warnings
+ * - Other C90: empty
*/
-
-/* Do not use inline for C90 builds*/
#if !defined(MLK_INLINE)
-#if !defined(inline)
#if defined(_MSC_VER)
#define MLK_INLINE __inline
-/* Don't combine __inline and __forceinline */
-#define MLK_ALWAYS_INLINE __forceinline
-#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#elif defined(inline) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
#define MLK_INLINE inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define MLK_INLINE __attribute__((unused))
+#else
+#define MLK_INLINE
+#endif
+#endif /* !MLK_INLINE */
+
+/*
+ * MLK_ALWAYS_INLINE: Force inlining.
+ * - MSVC: __forceinline
+ * - GCC/Clang C99+: MLK_INLINE __attribute__((always_inline))
+ * - Other: MLK_INLINE (no forced inlining)
+ */
+#if !defined(MLK_ALWAYS_INLINE)
+#if defined(_MSC_VER)
+#define MLK_ALWAYS_INLINE __forceinline
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+ (defined(inline) || \
+ (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L))
#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
#else
-#define MLK_INLINE __attribute__((unused))
#define MLK_ALWAYS_INLINE MLK_INLINE
#endif
+#endif /* !MLK_ALWAYS_INLINE */
-#else /* !inline */
-#define MLK_INLINE inline
-#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
-#endif /* inline */
-#endif /* !MLK_INLINE */
+#ifndef MLK_STATIC_TESTABLE
+#define MLK_STATIC_TESTABLE static
+#endif
/*
* C90 does not have the restrict compiler directive yet.
@@ -181,10 +220,41 @@
} while (0)
#endif /* !(MLK_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__) */
-#if defined(__GNUC__) || defined(clang)
+#if defined(__GNUC__) || defined(__clang__)
#define MLK_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
#else
#define MLK_MUST_CHECK_RETURN_VALUE
#endif
+#if !defined(__ASSEMBLER__)
+/* System capability enumeration */
+typedef enum
+{
+ /* x86_64 */
+ MLK_SYS_CAP_AVX2,
+ /* AArch64 */
+ MLK_SYS_CAP_SHA3
+} mlk_sys_cap;
+
+#if !defined(MLK_CONFIG_CUSTOM_CAPABILITY_FUNC)
+#include "cbmc.h"
+
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_sys_check_capability(mlk_sys_cap cap)
+__contract__(
+ ensures(return_value == 0 || return_value == 1)
+)
+{
+ /* By default, we rely on compile-time feature detection/specification:
+ * If a feature is enabled at compile-time, we assume it is supported by
+ * the host that the resulting library/binary will be built on.
+ * If this assumption is not true, you MUST overwrite this function.
+ * See the documentation of MLK_CONFIG_CUSTOM_CAPABILITY_FUNC in
+ * mlkem_native_config.h for more information. */
+ (void)cap;
+ return 1;
+}
+#endif /* !MLK_CONFIG_CUSTOM_CAPABILITY_FUNC */
+#endif /* !__ASSEMBLER__ */
+
#endif /* !MLK_SYS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/verify.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/verify.h
index 85626c15ea..a9bdeaab30 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/verify.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/verify.h
@@ -30,9 +30,7 @@
#ifndef MLK_VERIFY_H
#define MLK_VERIFY_H
-#include <limits.h>
-#include <stdint.h>
-#include <string.h>
+
#include "cbmc.h"
#include "common.h"
@@ -115,92 +113,83 @@ __contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_u8(
static MLK_INLINE uint32_t mlk_value_barrier_u32(uint32_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
static MLK_INLINE int32_t mlk_value_barrier_i32(int32_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
static MLK_INLINE uint8_t mlk_value_barrier_u8(uint8_t b)
__contract__(ensures(return_value == b))
{
- __asm__("" : "+r"(b));
+ __asm__ volatile("" : "+r"(b));
return b;
}
#endif /* MLK_USE_ASM_VALUE_BARRIER */
-/*
- * The ct_cmask_nonzero_xxx functions below make deliberate use of unsigned
- * overflow, which is fully defined behaviour in C. It is thus safe to disable
- * this warning.
- */
#ifdef CBMC
#pragma CPROVER check push
-#pragma CPROVER check disable "unsigned-overflow"
+#pragma CPROVER check disable "conversion"
#endif
-
/*************************************************
- * Name: mlk_ct_cmask_nonzero_u16
+ * Name: mlk_cast_uint16_to_int16
*
- * Description: Return 0 if input is zero, and -1 otherwise.
+ * Description: Cast uint16 value to int16
*
- * Arguments: uint16_t x: Value to be converted into a mask
+ * Returns: For uint16_t x, the unique y in int16_t
+ * so that x == y mod 2^16.
+ *
+ * Concretely:
+ * - x < 32768: returns x
+ * - x >= 32768: returns x - 65536
*
**************************************************/
-
-/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
- * - Use value barrier and shift instead of `b = -b` to
- * convert condition into mask. */
-static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
-__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
{
- uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
- tmp >>= 16;
- return tmp;
+ /*
+ * PORTABILITY: This relies on uint16_t -> int16_t
+ * being implemented as the inverse of int16_t -> uint16_t,
+ * which is implementation-defined (C99 6.3.1.3 (3))
+ * CBMC (correctly) fails to prove this conversion is OK,
+ * so we have to suppress that check here
+ */
+ return (int16_t)x;
}
+#ifdef CBMC
+#pragma CPROVER check pop
+#endif
/*************************************************
- * Name: mlk_ct_cmask_nonzero_u8
+ * Name: mlk_cast_int32_to_uint16
*
- * Description: Return 0 if input is zero, and -1 otherwise.
- *
- * Arguments: uint8_t x: Value to be converted into a mask
+ * Description: Cast int32 value to uint16 as per C standard.
*
+ * Returns: For int32_t x, the unique y in uint16_t
+ * so that x == y mod 2^16.
**************************************************/
-
-/* Reference: Embedded in `verify()` and `cmov()` in the
- * reference implementation @[REF].
- * - We include a value barrier not present in the
- * reference implementation, to prevent the compiler
- * from realizing that this function returns a mask. */
-static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
-__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int32_to_uint16(int32_t x)
{
- uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
- tmp >>= 24;
- return tmp;
+ return (uint16_t)(x & (int32_t)UINT16_MAX);
}
-/* Put unsigned overflow warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
-/*
- * The mlk_ct_cmask_neg_i16 function below makes deliberate use of
- * signed to unsigned integer conversion, which is fully defined
- * behaviour in C. It is thus safe to disable this warning.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
+/*************************************************
+ * Name: mlk_cast_int16_to_uint16
+ *
+ * Description: Cast int16 value to uint16 as per C standard.
+ *
+ * Returns: For int16_t x, the unique y in uint16_t
+ * so that x == y mod 2^16.
+ **************************************************/
+static MLK_ALWAYS_INLINE uint16_t mlk_cast_int16_to_uint16(int16_t x)
+{
+ return mlk_cast_int32_to_uint16((int32_t)x);
+}
/*************************************************
* Name: mlk_ct_cmask_neg_i16
@@ -225,24 +214,49 @@ __contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
{
int32_t tmp = mlk_value_barrier_i32((int32_t)x);
tmp >>= 16;
- return (int16_t)tmp;
+ return mlk_cast_int32_to_uint16(tmp);
}
-/* Put unsigned-to-signed warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
+/*************************************************
+ * Name: mlk_ct_cmask_nonzero_u16
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments: uint16_t x: Value to be converted into a mask
+ *
+ **************************************************/
-/*
- * The ct_csel_xxx functions below make deliberate use of unsigned
- * to signed integer conversion, which is implementation-defined
- * behaviour. Here, we assume that uint16_t -> int16_t is inverse
- * to int16_t -> uint16_t.
- */
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
+/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
+ * - Use value barrier and shift instead of `b = -b` to
+ * convert condition into mask. */
+static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
+{
+ int32_t tmp = mlk_value_barrier_i32(-((int32_t)x));
+ tmp >>= 16;
+ return mlk_cast_int32_to_uint16(tmp);
+}
+
+/*************************************************
+ * Name: mlk_ct_cmask_nonzero_u8
+ *
+ * Description: Return 0 if input is zero, and -1 otherwise.
+ *
+ * Arguments: uint8_t x: Value to be converted into a mask
+ *
+ **************************************************/
+
+/* Reference: Embedded in `verify()` and `cmov()` in the
+ * reference implementation @[REF].
+ * - We include a value barrier not present in the
+ * reference implementation, to prevent the compiler
+ * from realizing that this function returns a mask. */
+static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
+__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
+{
+ uint16_t mask = mlk_ct_cmask_nonzero_u16((uint16_t)x);
+ return (uint8_t)(mask & 0xFF);
+}
/*************************************************
* Name: mlk_ct_sel_int16
@@ -280,16 +294,12 @@ __contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
static MLK_INLINE int16_t mlk_ct_sel_int16(int16_t a, int16_t b, uint16_t cond)
__contract__(ensures(return_value == (cond ? a : b)))
{
- uint16_t au = a, bu = b;
+ uint16_t au = mlk_cast_int16_to_uint16(a);
+ uint16_t bu = mlk_cast_int16_to_uint16(b);
uint16_t res = bu ^ (mlk_ct_cmask_nonzero_u16(cond) & (au ^ bu));
- return (int16_t)res;
+ return mlk_cast_uint16_to_int16(res);
}
-/* Put unsigned-to-signed warnings in CBMC back into scope */
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_ct_sel_uint8
*
@@ -318,9 +328,11 @@ __contract__(ensures(return_value == (cond ? a : b)))
*
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
- * size_t len: length of the byte arrays
+ * size_t len: length of the byte arrays, upper-bounded
+ * to UINT16_MAX to control proof complexity
+ * only.
*
- * Returns 0 if the byte arrays are equal, a non-zero value otherwise
+ * Returns 0 if the byte arrays are equal, 0xFF otherwise.
*
* Specification:
* - Used to securely compute conditional move in
@@ -338,9 +350,10 @@ __contract__(ensures(return_value == (cond ? a : b)))
static MLK_INLINE uint8_t mlk_ct_memcmp(const uint8_t *a, const uint8_t *b,
const size_t len)
__contract__(
+ requires(len <= UINT16_MAX)
requires(memory_no_alias(a, len))
requires(memory_no_alias(b, len))
- requires(len <= INT_MAX)
+ ensures((return_value == 0) || (return_value == 0xFF))
ensures((return_value == 0) == forall(i, 0, len, (a[i] == b[i]))))
{
uint8_t r = 0, s = 0;
@@ -391,13 +404,17 @@ __contract__(
static MLK_INLINE void mlk_ct_cmov_zero(uint8_t *r, const uint8_t *x,
size_t len, uint8_t b)
__contract__(
+ requires(len <= MLK_MAX_BUFFER_SIZE)
requires(memory_no_alias(r, len))
requires(memory_no_alias(x, len))
- assigns(memory_slice(r, len)))
+ assigns(memory_slice(r, len))
+ ensures(forall(i, 0, len, (r[i] == (b == 0 ? x[i] : old(r)[i])))))
{
size_t i;
for (i = 0; i < len; i++)
- __loop__(invariant(i <= len))
+ __loop__(
+ invariant(i <= len)
+ invariant(forall(k, 0, i, r[k] == (b == 0 ? x[k] : loop_entry(r)[k]))))
{
r[i] = mlk_ct_sel_uint8(r[i], x[i], b);
}
@@ -431,13 +448,13 @@ __contract__(
requires(memory_no_alias(ptr, len))
assigns(memory_slice(ptr, len)))
{
- memset(ptr, 0, len);
+ mlk_memset(ptr, 0, len);
/* This follows OpenSSL and seems sufficient to prevent the compiler
* from optimizing away the memset.
*
* If there was a reliable way to detect availability of memset_s(),
* that would be preferred. */
- __asm__ __volatile__("" : : "r"(ptr) : "memory");
+ __asm__ volatile("" : : "r"(ptr) : "memory");
}
#else /* !MLK_SYS_WINDOWS && MLK_HAVE_INLINE_ASM */
#error No plausibly-secure implementation of mlk_zeroize available. Please provide your own using MLK_CONFIG_CUSTOM_ZEROIZE.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/zetas.inc b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/zetas.inc
index 0c00b5b905..00316daf67 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/zetas.inc
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/mlkem/src/zetas.inc
@@ -5,16 +5,16 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
-#include <stdint.h>
/*
* Table of zeta values used in the reference NTT and inverse NTT.
* See autogen for details.
*/
-static MLK_ALIGN const int16_t zetas[128] = {
+static MLK_ALIGN const int16_t mlk_zetas[128] = {
-1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577,
182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458,
-1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/integration/liboqs/config_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/integration/liboqs/config_x86_64.h
index c818bcc980..b82f3dd434 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/integration/liboqs/config_x86_64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/integration/liboqs/config_x86_64.h
@@ -8,13 +8,23 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*/
#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_X86_64_H
#define MLK_INTEGRATION_LIBOQS_CONFIG_X86_64_H
+/* Enable valgrind-based assertions in mlkem-native through macro
+ * from libOQS. */
+#if !defined(__ASSEMBLER__)
+#include <oqs/oqsconfig.h>
+#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
+#define MLK_CONFIG_CT_TESTING_ENABLED
+#endif
+#endif /* !__ASSEMBLER__ */
+
/******************************************************************************
* Name: MLK_CONFIG_PARAMETER_SET
*
@@ -172,7 +182,7 @@
* consumer.
*
* If this option is not set, mlkem-native expects a function
- * void randombytes(uint8_t *out, size_t outlen).
+ * int randombytes(uint8_t *out, size_t outlen).
*
* Set this option and define `mlk_randombytes` if you want to
* use a custom method to sample randombytes with a different name
@@ -184,9 +194,10 @@
#include <stddef.h>
#include <stdint.h>
#include "../../mlkem/src/sys.h"
-static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
+static MLK_INLINE int mlk_randombytes(uint8_t *ptr, size_t len)
{
OQS_randombytes(ptr, len);
+ return 0;
}
#endif /* !__ASSEMBLER__ */
@@ -251,13 +262,4 @@ static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len)
#endif
*/
-/* Enable valgrind-based assertions in mlkem-native through macro
- * from libOQS. */
-#if !defined(__ASSEMBLER__)
-#include <oqs/oqsconfig.h>
-#if defined(OQS_ENABLE_TEST_CONSTANT_TIME)
-#define MLK_CONFIG_CT_TESTING_ENABLED
-#endif
-#endif /* !__ASSEMBLER__ */
-
#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_X86_64_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/cbmc.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/cbmc.h
index 650d32b95b..80e1a36fc7 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/cbmc.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/cbmc.h
@@ -8,7 +8,6 @@
/***************************************************
* Basic replacements for __CPROVER_XXX contracts
***************************************************/
-
#ifndef CBMC
#define __contract__(x)
@@ -16,6 +15,7 @@
#else /* !CBMC */
+
#define __contract__(x) x
#define __loop__(x) x
@@ -49,7 +49,6 @@
*/
#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
-#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
/*
* Pointer-related predicates
@@ -59,6 +58,17 @@
#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
+/* Maximum supported buffer size
+ *
+ * Larger buffers may be supported, but due to internal modeling constraints
+ * in CBMC, the proofs of memory- and type-safety won't be able to run.
+ *
+ * If you find yourself in need for a buffer size larger than this,
+ * please contact the maintainers, so we can prioritize work to relax
+ * this somewhat artificial bound.
+ */
+#define MLK_MAX_BUFFER_SIZE (SIZE_MAX >> 12)
+
/*
* History variables
* https://diffblue.github.io/cbmc/contracts-history-variables.html
@@ -83,7 +93,7 @@
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate) \
}
-#define EXISTS(qvar, qvar_lb, qvar_ub, predicate) \
+#define exists(qvar, qvar_lb, qvar_ub, predicate) \
__CPROVER_exists \
{ \
unsigned qvar; \
@@ -118,13 +128,35 @@
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
- (((int)(value_lb) <= ((array_var)[(qvar)])) && \
- (((array_var)[(qvar)]) < (int)(value_ub))) \
+ (((int)(value_lb) <= ((array_var)[(qvar)])) && \
+ (((array_var)[(qvar)]) < (int)(value_ub))) \
}
-#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
- array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb), \
+#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
+ array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb), \
(qvar_ub), (array_var), (value_lb), (value_ub))
+
+#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (int16_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged(array_var, N) \
+ array_unchanged_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
+
+#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \
+ __CPROVER_forall \
+ { \
+ unsigned qvar; \
+ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
+ ((array_var)[(qvar)]) == (old(* (uint64_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
+ }
+
+#define array_unchanged_u64(array_var, N) \
+ array_unchanged_u64_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var))
/* clang-format on */
/* Wrapper around array_bound operating on absolute values.
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/common.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/common.h
index 9de9875556..bc4e9ed72c 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/common.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/common.h
@@ -5,10 +5,16 @@
#ifndef MLK_COMMON_H
#define MLK_COMMON_H
+#ifndef __ASSEMBLER__
+#include <stdint.h>
+#endif
+
+#define MLK_BUILD_INTERNAL
+
#if defined(MLK_CONFIG_FILE)
#include MLK_CONFIG_FILE
#else
-#include "config.h"
+#include "mlkem_native_config.h"
#endif
#include "params.h"
@@ -28,15 +34,11 @@
#define MLK_EXTERNAL_API MLK_CONFIG_EXTERNAL_API_QUALIFIER
#endif
-#if defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) || \
- defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED)
-#define MLK_MULTILEVEL_BUILD
-#endif
-
#define MLK_CONCAT_(x1, x2) x1##x2
#define MLK_CONCAT(x1, x2) MLK_CONCAT_(x1, x2)
-#if defined(MLK_MULTILEVEL_BUILD)
+#if (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || \
+ defined(MLK_CONFIG_MULTILEVEL_NO_SHARED))
#define MLK_ADD_PARAM_SET(s) MLK_CONCAT(s, MLK_CONFIG_PARAMETER_SET)
#else
#define MLK_ADD_PARAM_SET(s) s
@@ -49,7 +51,7 @@
/* Functions are prefixed by MLK_CONFIG_NAMESPACE_PREFIX.
*
* If multiple parameter sets are used, functions depending on the parameter
- * set are additionally prefixed with 512/768/1024. See config.h.
+ * set are additionally prefixed with 512/768/1024. See mlkem_native_config.h.
*
* Example: If MLK_CONFIG_NAMESPACE_PREFIX is mlkem, then
* MLK_NAMESPACE_K(enc) becomes mlkem512_enc/mlkem768_enc/mlkem1024_enc.
@@ -73,8 +75,24 @@
*/
#if defined(MLK_SYS_X86_64)
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) : MLK_CET_ENDBR
-#else
+#elif defined(MLK_SYS_ARMV81M_MVE)
+/* clang-format off */
+#define MLK_ASM_FN_SYMBOL(sym) \
+ .type MLK_ASM_NAMESPACE(sym), %function; \
+ MLK_ASM_NAMESPACE(sym) :
+/* clang-format on */
+#else /* !MLK_SYS_X86_64 && !MLK_SYS_ARMV81M_MVE */
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) :
+#endif /* !MLK_SYS_X86_64 && !MLK_SYS_ARMV81M_MVE */
+
+/*
+ * Output the size of an assembly function.
+ */
+#if defined(__ELF__)
+#define MLK_ASM_FN_SIZE(sym) \
+ .size MLK_ASM_NAMESPACE(sym), .- MLK_ASM_NAMESPACE(sym)
+#else
+#define MLK_ASM_FN_SIZE(sym)
#endif
/* We aim to simplify the user's life by supporting builds where
@@ -99,6 +117,10 @@
#error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLK_CONFIG_FIPS202_BACKEND_FILE is not.
#endif
+#if defined(MLK_CONFIG_NO_RANDOMIZED_API) && defined(MLK_CONFIG_KEYGEN_PCT)
+#error Bad configuration: MLK_CONFIG_NO_RANDOMIZED_API is incompatible with MLK_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_kem_enc()
+#endif
+
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
#include MLK_CONFIG_ARITH_BACKEND_FILE
/* Include to enforce consistency of API and implementation,
@@ -135,20 +157,118 @@
#define MLK_FIPS202X4_HEADER_FILE MLK_CONFIG_FIPS202X4_CUSTOM_HEADER
#endif
-/* Just in case we want to include mlkem_native.h, set the configuration
- * for that header in accordance with the configuration used here. */
+/* Standard library function replacements */
+#if !defined(__ASSEMBLER__)
+#if !defined(MLK_CONFIG_CUSTOM_MEMCPY)
+#include <string.h>
+#define mlk_memcpy memcpy
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_MEMSET)
+#include <string.h>
+#define mlk_memset memset
+#endif
+
+
+/* Allocation macros for large local structures
+ *
+ * MLK_ALLOC(v, T, N) declares T *v and attempts to point it to an T[N]
+ * MLK_FREE(v, T, N) zeroizes and frees the allocation
+ *
+ * Default implementation uses stack allocation.
+ * Can be overridden by setting the config option MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * and defining MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE.
+ */
+#if defined(MLK_CONFIG_CUSTOM_ALLOC_FREE) != \
+ (defined(MLK_CUSTOM_ALLOC) && defined(MLK_CUSTOM_FREE))
+#error Bad configuration: MLK_CONFIG_CUSTOM_ALLOC_FREE must be set together with MLK_CUSTOM_ALLOC and MLK_CUSTOM_FREE
+#endif
+
+/*
+ * If the integration wants to provide a context parameter for use in
+ * platform-specific hooks, then it should define this parameter.
+ *
+ * The MLK_CONTEXT_PARAMETERS_n macros are intended to be used with macros
+ * defining the function names and expand to either pass or discard the context
+ * argument as required by the current build. If there is no context parameter
+ * requested then these are removed from the prototypes and from all calls.
+ */
+#ifdef MLK_CONFIG_CONTEXT_PARAMETER
+#define MLK_CONTEXT_PARAMETERS_0(context) (context)
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0, context)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1, context)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) \
+ (arg0, arg1, arg2, context)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3, context)
+#else /* MLK_CONFIG_CONTEXT_PARAMETER */
+#define MLK_CONTEXT_PARAMETERS_0(context) ()
+#define MLK_CONTEXT_PARAMETERS_1(arg0, context) (arg0)
+#define MLK_CONTEXT_PARAMETERS_2(arg0, arg1, context) (arg0, arg1)
+#define MLK_CONTEXT_PARAMETERS_3(arg0, arg1, arg2, context) (arg0, arg1, arg2)
+#define MLK_CONTEXT_PARAMETERS_4(arg0, arg1, arg2, arg3, context) \
+ (arg0, arg1, arg2, arg3)
+#endif /* !MLK_CONFIG_CONTEXT_PARAMETER */
+
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER_TYPE) != \
+ defined(MLK_CONFIG_CONTEXT_PARAMETER)
+#error MLK_CONFIG_CONTEXT_PARAMETER_TYPE must be defined if and only if MLK_CONFIG_CONTEXT_PARAMETER is defined
+#endif
+
+#if !defined(MLK_CONFIG_CUSTOM_ALLOC_FREE)
+/* Default: stack allocation */
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_ALIGN T mlk_alloc_##v[N]; \
+ T *v = mlk_alloc_##v
+
+/* TODO: This leads to a circular dependency between common and verify.h
+ * It just works out before we're at the end of the file, but it's still
+ * prone to issues in the future. */
+#include "verify.h"
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ mlk_zeroize(mlk_alloc_##v, sizeof(mlk_alloc_##v)); \
+ (v) = NULL; \
+ } while (0)
+
+#else /* !MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/* Custom allocation */
+
+/*
+ * The indirection here is necessary to use MLK_CONTEXT_PARAMETERS_3 here.
+ */
+#define MLK_APPLY(f, args) f args
+
+#define MLK_ALLOC(v, T, N, context) \
+ MLK_APPLY(MLK_CUSTOM_ALLOC, MLK_CONTEXT_PARAMETERS_3(v, T, N, context))
+
+#define MLK_FREE(v, T, N, context) \
+ do \
+ { \
+ if (v != NULL) \
+ { \
+ mlk_zeroize(v, sizeof(T) * (N)); \
+ MLK_APPLY(MLK_CUSTOM_FREE, MLK_CONTEXT_PARAMETERS_3(v, T, N, context)); \
+ v = NULL; \
+ } \
+ } while (0)
+
+#endif /* MLK_CONFIG_CUSTOM_ALLOC_FREE */
+
+/****************************** Error codes ***********************************/
-/* Double-check that this is not conflicting with pre-existing definitions. */
-#if defined(MLK_CONFIG_API_PARAMETER_SET) || \
- defined(MLK_CONFIG_API_NAMESPACE_PREFIX) || \
- defined(MLK_CONFIG_API_NO_SUPERCOP) || \
- defined(MLK_CONFIG_API_CONSTANTS_ONLY)
-#error Pre-existing MLK_CONFIG_API_XXX configuration is neither useful nor allowed during an mlkem-native build
-#endif /* MLK_CONFIG_API_PARAMETER_SET || MLK_CONFIG_API_NAMESPACE_PREFIX || \
- MLK_CONFIG_API_NO_SUPERCOP || MLK_CONFIG_API_CONSTANTS_ONLY */
+/* Generic failure condition */
+#define MLK_ERR_FAIL -1
+/* An allocation failed. This can only happen if MLK_CONFIG_CUSTOM_ALLOC_FREE
+ * is defined and the provided MLK_CUSTOM_ALLOC can fail. */
+#define MLK_ERR_OUT_OF_MEMORY -2
+/* An RNG failure occurred. Might be due to insufficient entropy or
+ * system misconfiguration. */
+#define MLK_ERR_RNG_FAIL -3
-#define MLK_CONFIG_API_PARAMETER_SET MLK_CONFIG_PARAMETER_SET
-#define MLK_CONFIG_API_NAMESPACE_PREFIX \
- MLK_ADD_PARAM_SET(MLK_CONFIG_NAMESPACE_PREFIX)
+#endif /* !__ASSEMBLER__ */
#endif /* !MLK_COMMON_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/compress.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/compress.c
index d7ff2bbe7a..50da36d0e4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/compress.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/compress.c
@@ -20,24 +20,27 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include <stdint.h>
-#include <string.h>
+
#include "cbmc.h"
#include "compress.h"
#include "debug.h"
#include "verify.h"
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d4_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -55,32 +58,51 @@ void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
t[j] = mlk_scalar_compress_d4(a->coeffs[8 * i + j]);
}
- r[i * 4] = t[0] | (t[1] << 4);
- r[i * 4 + 1] = t[2] | (t[3] << 4);
- r[i * 4 + 2] = t[4] | (t[5] << 4);
- r[i * 4 + 3] = t[6] | (t[7] << 4);
+ /* All t[i] are 4-bit wide, so the truncations don't alter the value. */
+ r[i * 4] = (uint8_t)(t[0] | (t[1] << 4));
+ r[i * 4 + 1] = (uint8_t)(t[2] | (t[3] << 4));
+ r[i * 4 + 2] = (uint8_t)(t[4] | (t[5] << 4));
+ r[i * 4 + 3] = (uint8_t)(t[6] | (t[7] << 4));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d4_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d4_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ mlk_poly_compress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-{512,768}.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d10_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -101,29 +123,47 @@ void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 10-bit in size.
*/
- r[5 * j + 0] = (t[0] >> 0) & 0xFF;
- r[5 * j + 1] = (t[0] >> 8) | ((t[1] << 2) & 0xFF);
- r[5 * j + 2] = (t[1] >> 6) | ((t[2] << 4) & 0xFF);
- r[5 * j + 3] = (t[2] >> 4) | ((t[3] << 6) & 0xFF);
- r[5 * j + 4] = (t[3] >> 2);
+ r[5 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 2) & 0xFF));
+ r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] << 4) & 0xFF));
+ r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] << 6) & 0xFF));
+ r[5 * j + 4] = (uint8_t)(t[3] >> 2);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d10_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d10_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ mlk_poly_compress_d10_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d4(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d4_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -137,22 +177,40 @@ void mlk_poly_decompress_d4(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d4(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d4_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
+ int ret;
+ ret = mlk_poly_decompress_d4_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ mlk_poly_decompress_d4_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-{512,768}. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d10(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d10_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 4; j++)
@@ -180,28 +238,46 @@ void mlk_poly_decompress_d10(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d10(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d10_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
+ int ret;
+ ret = mlk_poly_decompress_d10_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
+
+ mlk_poly_decompress_d10_c(r, a);
+}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
/* Reference: `poly_compress()` in the reference implementation @[REF],
* for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d5_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -219,38 +295,51 @@ void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
t[j] = mlk_scalar_compress_d5(a->coeffs[8 * i + j]);
}
- /*
- * Explicitly truncate to avoid warning about
- * implicit truncation in CBMC, and use array indexing into
- * r rather than pointer-arithmetic to simplify verification
- */
- r[i * 5] = 0xFF & ((t[0] >> 0) | (t[1] << 5));
- r[i * 5 + 1] = 0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7));
- r[i * 5 + 2] = 0xFF & ((t[3] >> 1) | (t[4] << 4));
- r[i * 5 + 3] = 0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6));
- r[i * 5 + 4] = 0xFF & ((t[6] >> 2) | (t[7] << 3));
+ r[i * 5] = (uint8_t)(0xFF & ((t[0] >> 0) | (t[1] << 5)));
+ r[i * 5 + 1] = (uint8_t)(0xFF & ((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)));
+ r[i * 5 + 2] = (uint8_t)(0xFF & ((t[3] >> 1) | (t[4] << 4)));
+ r[i * 5 + 3] = (uint8_t)(0xFF & ((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)));
+ r[i * 5 + 4] = (uint8_t)(0xFF & ((t[6] >> 2) | (t[7] << 3)));
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d5_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d5_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ mlk_poly_compress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation, for ML-KEM-1024.
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
- const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_compress_d11_c(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
unsigned j;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -272,35 +361,53 @@ void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
* Make all implicit truncation explicit. No data is being
* truncated for the LHS's since each t[i] is 11-bit in size.
*/
- r[11 * j + 0] = (t[0] >> 0) & 0xFF;
- r[11 * j + 1] = (t[0] >> 8) | ((t[1] << 3) & 0xFF);
- r[11 * j + 2] = (t[1] >> 5) | ((t[2] << 6) & 0xFF);
- r[11 * j + 3] = (t[2] >> 2) & 0xFF;
- r[11 * j + 4] = (t[2] >> 10) | ((t[3] << 1) & 0xFF);
- r[11 * j + 5] = (t[3] >> 7) | ((t[4] << 4) & 0xFF);
- r[11 * j + 6] = (t[4] >> 4) | ((t[5] << 7) & 0xFF);
- r[11 * j + 7] = (t[5] >> 1) & 0xFF;
- r[11 * j + 8] = (t[5] >> 9) | ((t[6] << 2) & 0xFF);
- r[11 * j + 9] = (t[6] >> 6) | ((t[7] << 5) & 0xFF);
- r[11 * j + 10] = (t[7] >> 3);
+ r[11 * j + 0] = (uint8_t)((t[0] >> 0) & 0xFF);
+ r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] << 3) & 0xFF));
+ r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] << 6) & 0xFF));
+ r[11 * j + 3] = (uint8_t)((t[2] >> 2) & 0xFF);
+ r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] << 1) & 0xFF));
+ r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] << 4) & 0xFF));
+ r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] << 7) & 0xFF));
+ r[11 * j + 7] = (uint8_t)((t[5] >> 1) & 0xFF);
+ r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] << 2) & 0xFF));
+ r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] << 5) & 0xFF));
+ r[11 * j + 10] = (uint8_t)(t[7] >> 3);
}
}
-#else /* !MLK_USE_NATIVE_POLY_COMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+)
{
+#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_compress_d11_native(r, a->coeffs);
-}
+ ret = mlk_poly_compress_d11_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ mlk_poly_compress_d11_c(r, a);
+}
+
/* Reference: `poly_decompress()` in the reference implementation @[REF],
* for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d5(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d5_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 8; i++)
@@ -342,22 +449,40 @@ void mlk_poly_decompress_d5(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d5(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d5_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
+ int ret;
+ ret = mlk_poly_decompress_d5_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
-#if !defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ mlk_poly_decompress_d5_c(r, a);
+}
+
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation, for ML-KEM-1024. */
-MLK_INTERNAL_API
-void mlk_poly_decompress_d11(mlk_poly *r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+MLK_STATIC_TESTABLE void mlk_poly_decompress_d11_c(
+ mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned j;
for (j = 0; j < MLKEM_N / 8; j++)
@@ -390,26 +515,45 @@ void mlk_poly_decompress_d11(mlk_poly *r,
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+
MLK_INTERNAL_API
void mlk_poly_decompress_d11(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
- mlk_poly_decompress_d11_native(r->coeffs, a);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
+ int ret;
+ ret = mlk_poly_decompress_d11_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
+ mlk_poly_decompress_d11_c(r, a);
+}
+
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-#if !defined(MLK_USE_NATIVE_POLY_TOBYTES)
/* Reference: `poly_tobytes()` in the reference implementation @[REF].
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
-MLK_INTERNAL_API
-void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_tobytes_c(uint8_t r[MLKEM_POLYBYTES],
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYBYTES))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+)
{
unsigned i;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
@@ -417,8 +561,10 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
- const uint16_t t0 = a->coeffs[2 * i];
- const uint16_t t1 = a->coeffs[2 * i + 1];
+ /* The conversion to uint16_t is safe since we assume that
+ * the coefficients of `a` are non-negative. */
+ const uint16_t t0 = (uint16_t)a->coeffs[2 * i];
+ const uint16_t t1 = (uint16_t)a->coeffs[2 * i + 1];
/*
* t0 and t1 are both < MLKEM_Q, so contain at most 12 bits each of
* significant data, so these can be packed into 24 bits or exactly
@@ -426,32 +572,48 @@ void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
*/
/* Least significant bits 0 - 7 of t0. */
- r[3 * i + 0] = t0 & 0xFF;
+ r[3 * i + 0] = (uint8_t)(t0 & 0xFF);
/*
* Most significant bits 8 - 11 of t0 become the least significant
* nibble of the second byte. The least significant 4 bits
* of t1 become the upper nibble of the second byte.
+ *
+ * The conversion to uint8_t does not alter the value.
*/
- r[3 * i + 1] = (t0 >> 8) | ((t1 << 4) & 0xF0);
+ r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 << 4) & 0xF0));
- /* Bits 4 - 11 of t1 become the third byte. */
- r[3 * i + 2] = t1 >> 4;
+ /* Bits 4 - 11 of t1 become the third byte. The conversion to uint8_t
+ * does not alter the value because t1 is 12-bit wide. */
+ r[3 * i + 2] = (uint8_t)(t1 >> 4);
}
}
-#else /* !MLK_USE_NATIVE_POLY_TOBYTES */
+
MLK_INTERNAL_API
void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
{
+#if defined(MLK_USE_NATIVE_POLY_TOBYTES)
+ int ret;
mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
- mlk_poly_tobytes_native(r, a->coeffs);
-}
+ ret = mlk_poly_tobytes_native(r, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
-#if !defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ mlk_poly_tobytes_c(r, a);
+}
+
/* Reference: `poly_frombytes()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
+MLK_STATIC_TESTABLE void mlk_poly_frombytes_c(mlk_poly *r,
+ const uint8_t a[MLKEM_POLYBYTES])
+__contract__(
+ requires(memory_no_alias(a, MLKEM_POLYBYTES))
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 2; i++)
@@ -462,21 +624,29 @@ void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
const uint8_t t0 = a[3 * i + 0];
const uint8_t t1 = a[3 * i + 1];
const uint8_t t2 = a[3 * i + 2];
- r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
- r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
+ r->coeffs[2 * i + 0] = (int16_t)(t0 | ((t1 << 8) & 0xFFF));
+ r->coeffs[2 * i + 1] = (int16_t)((t1 >> 4) | (t2 << 4));
}
/* Note that the coefficients are not canonical */
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
-#else /* !MLK_USE_NATIVE_POLY_FROMBYTES */
+
MLK_INTERNAL_API
void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
{
- mlk_poly_frombytes_native(r->coeffs, a);
-}
+#if defined(MLK_USE_NATIVE_POLY_FROMBYTES)
+ int ret;
+ ret = mlk_poly_frombytes_native(r->coeffs, a);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
+ mlk_poly_frombytes_c(r, a);
+}
+
/* Reference: `poly_frommsg()` in the reference implementation @[REF].
* - We use a value barrier around the bit-selection mask to
* reduce the risk of compiler-introduced branches.
@@ -506,7 +676,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
* as per @[FIPS203, Eq (4.8)]. */
/* Prevent the compiler from recognizing this as a bit selection */
- uint8_t mask = mlk_value_barrier_u8(1u << j);
+ uint8_t mask = mlk_value_barrier_u8((uint8_t)(1u << j));
r->coeffs[8 * i + j] = mlk_ct_sel_int16(MLKEM_Q_HALF, 0, msg[i] & mask);
}
}
@@ -535,7 +705,7 @@ void mlk_poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const mlk_poly *a)
invariant(i <= MLKEM_N / 8 && j <= 8))
{
uint32_t t = mlk_scalar_compress_d1(a->coeffs[8 * i + j]);
- msg[i] |= t << j;
+ msg[i] |= (uint8_t)(t << j);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/compress.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/compress.h
index f0789d42d6..b16b0889b5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/compress.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/compress.h
@@ -20,8 +20,7 @@
#ifndef MLK_COMPRESS_H
#define MLK_COMPRESS_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -50,9 +49,9 @@
#endif
/* Reference: Part of poly_tomsg() in the reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d1(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d1(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 2)
ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2) )
{
@@ -65,7 +64,8 @@ __contract__(
*/
/* check-magic: 1290168 == 2*round(2^31 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290168;
- return (d0 + (1u << 30)) >> 31;
+ /* Unsigned shifting by 31 positions leaves only the top bit. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 30)) >> 31);
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -93,9 +93,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d4(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d4(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 16)
ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
{
@@ -108,7 +108,8 @@ __contract__(
*/
/* check-magic: 1290160 == 16 * round(2^28 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290160;
- return (d0 + (1u << 27)) >> 28; /* round(d0/2^28) */
+ /* The return value is < 16, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 27)) >> 28); /* round(d0/2^28) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -128,11 +129,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d4(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d4(uint8_t u)
__contract__(
requires(0 <= u && u < 16)
ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 8) >> 4; }
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 8) >> 4);
+}
/************************************************************
* Name: mlk_scalar_compress_d5
@@ -156,9 +162,9 @@ __contract__(
/* Reference: Embedded into `poly_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d5(uint16_t u)
+static MLK_INLINE uint8_t mlk_scalar_compress_d5(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < 32)
ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32) )
{
@@ -171,7 +177,8 @@ __contract__(
*/
/* check-magic: 1290176 == 2^5 * round(2^27 / MLKEM_Q) */
uint32_t d0 = (uint32_t)u * 1290176;
- return (d0 + (1u << 26)) >> 27; /* round(d0/2^27) */
+ /* The return value is < 32, so not altered by the conversion to uint8_t. */
+ return (uint8_t)((d0 + ((uint32_t)1u << 26)) >> 27); /* round(d0/2^27) */
}
#ifdef CBMC
#pragma CPROVER check pop
@@ -191,11 +198,16 @@ __contract__(
/* Reference: Embedded into `poly_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d5(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d5(uint8_t u)
__contract__(
requires(0 <= u && u < 32)
- ensures(return_value <= MLKEM_Q - 1)
-) { return ((u * MLKEM_Q) + 16) >> 5; }
+ ensures(0 <= return_value && return_value <= MLKEM_Q - 1)
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 16) >> 5);
+}
/************************************************************
* Name: mlk_scalar_compress_d10
@@ -219,9 +231,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d10(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d10(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 10))
ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
{
@@ -255,11 +267,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d10(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d10(uint16_t u)
__contract__(
requires(0 <= u && u < 1024)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 512) >> 10; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 512) >> 10);
+}
/************************************************************
* Name: mlk_scalar_compress_d11
@@ -283,9 +300,9 @@ __contract__(
/* Reference: Embedded into `polyvec_compress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint32_t mlk_scalar_compress_d11(uint16_t u)
+static MLK_INLINE uint16_t mlk_scalar_compress_d11(int16_t u)
__contract__(
- requires(u <= MLKEM_Q - 1)
+ requires(0 <= u && u <= MLKEM_Q - 1)
ensures(return_value < (1u << 11))
ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
{
@@ -319,11 +336,16 @@ __contract__(
/* Reference: Embedded into `polyvec_decompress()` in the
* reference implementation @[REF]. */
-static MLK_INLINE uint16_t mlk_scalar_decompress_d11(uint32_t u)
+static MLK_INLINE int16_t mlk_scalar_decompress_d11(uint16_t u)
__contract__(
requires(0 <= u && u < 2048)
- ensures(return_value <= (MLKEM_Q - 1))
-) { return ((u * MLKEM_Q) + 1024) >> 11; }
+ ensures(0 <= return_value && return_value <= (MLKEM_Q - 1))
+)
+{
+ /* The return value is in 0..MLKEM_Q-1, hence not altered by the
+ * conversion to int16_t. */
+ return (int16_t)((((uint32_t)u * MLKEM_Q) + 1024) >> 11);
+}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
#define mlk_poly_compress_d4 MLK_NAMESPACE(poly_compress_d4)
@@ -575,7 +597,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
);
@@ -631,7 +653,7 @@ void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
__contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
);
@@ -660,7 +682,7 @@ __contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(msg))
+ assigns(memory_slice(msg, MLKEM_INDCPA_MSGBYTES))
);
#endif /* !MLK_COMPRESS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/debug.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/debug.h
index 01f7c88ccf..47c864bd36 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/debug.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/debug.h
@@ -7,7 +7,6 @@
#include "common.h"
#if defined(MLKEM_DEBUG)
-#include
/*************************************************
* Name: mlk_assert
@@ -89,14 +88,14 @@ void mlk_debug_check_bounds(const char *file, int line, const int16_t *ptr,
/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
* just use a single flattened array_bound(...) here. */
-#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
- cassert(forall(kN, 0, (M), \
- array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
+ cassert(forall(kN, 0, (M), \
+ array_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_lb), (value_ub))))
-#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
- cassert(forall(kN, 0, (M), \
- array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
+#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
+ cassert(forall(kN, 0, (M), \
+ array_abs_bound(&((int16_t (*)[(N)])(ptr))[kN][0], 0, (N), \
(value_abs_bd))))
#else /* !MLKEM_DEBUG && CBMC */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/indcpa.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/indcpa.c
index 85d4f595a9..e03b16c38b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/indcpa.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/indcpa.c
@@ -17,15 +17,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
-#include
+#include "indcpa.h"
-#include "cbmc.h"
#include "debug.h"
-#include "indcpa.h"
-#include "poly.h"
-#include "poly_k.h"
#include "randombytes.h"
#include "sampling.h"
#include "symmetric.h"
@@ -41,6 +35,10 @@
#define mlk_pack_ciphertext MLK_ADD_PARAM_SET(mlk_pack_ciphertext)
#define mlk_unpack_ciphertext MLK_ADD_PARAM_SET(mlk_unpack_ciphertext)
#define mlk_matvec_mul MLK_ADD_PARAM_SET(mlk_matvec_mul)
+#define mlk_polyvec_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polyvec_permute_bitrev_to_custom)
+#define mlk_polymat_permute_bitrev_to_custom \
+ MLK_ADD_PARAM_SET(mlk_polymat_permute_bitrev_to_custom)
/* End of parameter set namespacing */
/*************************************************
@@ -59,12 +57,13 @@
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L19]
*
**************************************************/
-static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
+static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const mlk_polyvec *pk,
const uint8_t seed[MLKEM_SYMBYTES])
{
- mlk_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(pk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, pk);
- memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(r + MLKEM_POLYVECBYTES, seed, MLKEM_SYMBYTES);
}
/*************************************************
@@ -83,11 +82,11 @@ static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L2-3]
*
**************************************************/
-static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
+static void mlk_unpack_pk(mlk_polyvec *pk, uint8_t seed[MLKEM_SYMBYTES],
const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
{
mlk_polyvec_frombytes(pk, packedpk);
- memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES);
/* NOTE: If a modulus check was conducted on the PK, we know at this
* point that the coefficients of `pk` are unsigned canonical. The
@@ -108,9 +107,10 @@ static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L20]
*
**************************************************/
-static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
+static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES],
+ const mlk_polyvec *sk)
{
- mlk_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(sk->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, sk);
}
@@ -128,7 +128,7 @@ static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L5]
*
**************************************************/
-static void mlk_unpack_sk(mlk_polyvec sk,
+static void mlk_unpack_sk(mlk_polyvec *sk,
const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
{
mlk_polyvec_frombytes(sk, packedsk);
@@ -149,8 +149,8 @@ static void mlk_unpack_sk(mlk_polyvec sk,
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22-23]
*
**************************************************/
-static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
- mlk_poly *v)
+static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES],
+ const mlk_polyvec *b, mlk_poly *v)
{
mlk_polyvec_compress_du(r, b);
mlk_poly_compress_dv(r + MLKEM_POLYVECCOMPRESSEDBYTES_DU, v);
@@ -170,28 +170,69 @@ static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L1-4]
*
**************************************************/
-static void mlk_unpack_ciphertext(mlk_polyvec b, mlk_poly *v,
+static void mlk_unpack_ciphertext(mlk_polyvec *b, mlk_poly *v,
const uint8_t c[MLKEM_INDCPA_BYTES])
{
mlk_polyvec_decompress_du(b, c);
mlk_poly_decompress_dv(v, c + MLKEM_POLYVECCOMPRESSEDBYTES_DU);
}
-#if !defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
-/* This namespacing is not done at the top to avoid a naming conflict
- * with native backends, which are currently not yet namespaced. */
-#define mlk_poly_permute_bitrev_to_custom \
- MLK_ADD_PARAM_SET(mlk_poly_permute_bitrev_to_custom)
-
-static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
+/* Helper function to ensure that the polynomial entries in the output
+ * of gen_matrix use the standard (bitreversed) ordering of coefficients.
+ * No-op unless a native backend with a custom ordering is used.
+ *
+ * We don't inline this into gen_matrix to avoid having to split the CBMC
+ * proof for gen_matrix based on MLK_USE_NATIVE_NTT_CUSTOM_ORDER. */
+static void mlk_polyvec_permute_bitrev_to_custom(mlk_polyvec *v)
__contract__(
/* We don't specify that this should be a permutation, but only
* that it does not change the bound established at the end of mlk_gen_matrix. */
- requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
- requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(memory_slice(data, sizeof(mlk_poly)))
- ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
+ requires(memory_no_alias(v, sizeof(mlk_polyvec)))
+ requires(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(v, sizeof(mlk_polyvec)))
+ ensures(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+{
+#if defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(v, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K,
+ array_bound(v->vec[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ {
+ mlk_poly_permute_bitrev_to_custom(v->vec[i].coeffs);
+ }
+#else /* MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+ /* Nothing to do */
+ (void)v;
#endif /* !MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
+}
+
+static void mlk_polymat_permute_bitrev_to_custom(mlk_polymat *a)
+__contract__(
+ /* We don't specify that this should be a permutation, but only
+ * that it does not change the bound established at the end of mlk_gen_matrix. */
+ requires(memory_no_alias(a, sizeof(mlk_polymat)))
+ requires(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+{
+ unsigned i;
+ for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(a, sizeof(mlk_polymat)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))))
+ {
+ mlk_polyvec_permute_bitrev_to_custom(&a->vec[i]);
+ }
+}
/* Reference: `gen_matrix()` in the reference implementation @[REF].
* - We use a special subroutine to generate 4 polynomials
@@ -201,32 +242,27 @@ __contract__(
*
* Not static for benchmarking */
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
{
unsigned i, j;
- /*
- * We generate four separate seed arrays rather than a single one to work
- * around limitations in CBMC function contracts dealing with disjoint slices
- * of the same parent object.
- */
-
MLK_ALIGN uint8_t seed_ext[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)];
for (j = 0; j < 4; j++)
{
- memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
}
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Sample 4 matrix entries a time. */
for (i = 0; i < (MLKEM_K * MLKEM_K / 4) * 4; i += 4)
{
- uint8_t x, y;
-
for (j = 0; j < 4; j++)
{
- x = (i + j) / MLKEM_K;
- y = (i + j) % MLKEM_K;
+ uint8_t x, y;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)((i + j) / MLKEM_K);
+ y = (uint8_t)((i + j) % MLKEM_K);
if (transposed)
{
seed_ext[j][MLKEM_SYMBYTES + 0] = x;
@@ -239,19 +275,26 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
}
}
- /*
- * This call writes across mlk_polyvec boundaries for K=2 and K=3.
- * This is intentional and safe.
- */
- mlk_poly_rej_uniform_x4(&a[i], seed_ext);
+ mlk_poly_rej_uniform_x4(&a->vec[i / MLKEM_K].vec[i % MLKEM_K],
+ &a->vec[(i + 1) / MLKEM_K].vec[(i + 1) % MLKEM_K],
+ &a->vec[(i + 2) / MLKEM_K].vec[(i + 2) % MLKEM_K],
+ &a->vec[(i + 3) / MLKEM_K].vec[(i + 3) % MLKEM_K],
+ seed_ext);
}
-
- /* For MLKEM_K == 3, sample the last entry individually. */
- if (i < MLKEM_K * MLKEM_K)
+#else /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
+ /* When using serial FIPS202, sample all entries individually. */
+ i = 0;
+#endif /* MLK_CONFIG_SERIAL_FIPS202_ONLY */
+
+ /* For MLKEM_K == 3, sample the last entry individually.
+ * When MLK_CONFIG_SERIAL_FIPS202_ONLY is set, sample all entries
+ * individually. */
+ for (; i < MLKEM_K * MLKEM_K; i++)
{
uint8_t x, y;
- x = i / MLKEM_K;
- y = i % MLKEM_K;
+ /* MLKEM_K <= 4, so the values fit in uint8_t. */
+ x = (uint8_t)(i / MLKEM_K);
+ y = (uint8_t)(i % MLKEM_K);
if (transposed)
{
@@ -264,8 +307,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
seed_ext[0][MLKEM_SYMBYTES + 1] = x;
}
- mlk_poly_rej_uniform(&a[i], seed_ext[0]);
- i++;
+ mlk_poly_rej_uniform(&a->vec[i / MLKEM_K].vec[i % MLKEM_K], seed_ext[0]);
}
mlk_assert(i == MLKEM_K * MLKEM_K);
@@ -274,10 +316,7 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* The public matrix is generated in NTT domain. If the native backend
* uses a custom order in NTT domain, permute A accordingly.
*/
- for (i = 0; i < MLKEM_K * MLKEM_K; i++)
- {
- mlk_poly_permute_bitrev_to_custom(a[i].coeffs);
- }
+ mlk_polymat_permute_bitrev_to_custom(a);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -301,24 +340,25 @@ void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
* Specification: Implements @[FIPS203, Section 2.4.7, Eq (2.12), (2.13)]
*
**************************************************/
-static void mlk_matvec_mul(mlk_polyvec out, const mlk_polymat a,
- const mlk_polyvec v, const mlk_polyvec_mulcache vc)
+static void mlk_matvec_mul(mlk_polyvec *out, const mlk_polymat *a,
+ const mlk_polyvec *v, const mlk_polyvec_mulcache *vc)
__contract__(
requires(memory_no_alias(out, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(v, sizeof(mlk_polyvec)))
requires(memory_no_alias(vc, sizeof(mlk_polyvec_mulcache)))
- requires(forall(k0, 0, MLKEM_K * MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(out)))
+ requires(forall(k0, 0, MLKEM_K,
+ forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k0].vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))))
+ assigns(memory_slice(out, sizeof(mlk_polyvec))))
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
__loop__(
- assigns(i, object_whole(out))
+ assigns(i, memory_slice(out, sizeof(mlk_polyvec)))
invariant(i <= MLKEM_K))
{
- mlk_polyvec_basemul_acc_montgomery_cached(&out[i], &a[MLKEM_K * i], v, vc);
+ mlk_polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a->vec[i], v, vc);
}
}
@@ -331,20 +371,34 @@ __contract__(
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- const uint8_t *publicseed = buf;
- const uint8_t *noiseseed = buf + MLKEM_SYMBYTES;
- mlk_polymat a;
- mlk_polyvec e, pkpv, skpv;
- mlk_polyvec_mulcache skpv_cache;
-
- MLK_ALIGN uint8_t coins_with_domain_separator[MLKEM_SYMBYTES + 1];
+ int ret = 0;
+ const uint8_t *publicseed;
+ const uint8_t *noiseseed;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_ALLOC(a, mlk_polymat, 1, context);
+ MLK_ALLOC(e, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (buf == NULL || coins_with_domain_separator == NULL || a == NULL ||
+ e == NULL || pkpv == NULL || skpv == NULL || skpv_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ publicseed = buf;
+ noiseseed = buf + MLKEM_SYMBYTES;
+
/* Concatenate coins with MLKEM_K for domain separation of security levels */
- memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
mlk_hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
@@ -360,24 +414,24 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_gen_matrix(a, publicseed, 0 /* no transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &e[0], &e[1], noiseseed, 0, 1,
- 2, 3);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &e->vec[0],
+ &e->vec[1], noiseseed, 0, 1, 2, 3);
#elif MLKEM_K == 3
/*
* Only the first three output buffers are needed.
* The laster parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2],
- &pkpv[0] /* irrelevant */, noiseseed, 0, 1, 2,
- 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2], NULL,
+ noiseseed, 0, 1, 2, 0xFF /* irrelevant */);
/* Same here */
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &pkpv[0] /* irrelevant */,
- noiseseed, 3, 4, 5, 0xFF /* irrelevant */);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], NULL, noiseseed,
+ 3, 4, 5, 0xFF /* irrelevant */);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2], &skpv[3], noiseseed,
- 0, 1, 2, 3);
- mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &e[3], noiseseed, 4, 5, 6, 7);
-#endif
+ mlk_poly_getnoise_eta1_4x(&skpv->vec[0], &skpv->vec[1], &skpv->vec[2],
+ &skpv->vec[3], noiseseed, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta1_4x(&e->vec[0], &e->vec[1], &e->vec[2], &e->vec[3],
+ noiseseed, 4, 5, 6, 7);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(skpv);
mlk_polyvec_ntt(e);
@@ -393,14 +447,17 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
mlk_pack_sk(sk, skpv);
mlk_pack_pk(pk, pkpv, publicseed);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(coins_with_domain_separator, sizeof(coins_with_domain_separator));
- mlk_zeroize(a, sizeof(a));
- mlk_zeroize(&e, sizeof(e));
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&skpv_cache, sizeof(skpv_cache));
+ MLK_FREE(skpv_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(e, mlk_polyvec, 1, context);
+ MLK_FREE(a, mlk_polymat, 1, context);
+ MLK_FREE(coins_with_domain_separator, uint8_t, MLKEM_SYMBYTES + 1, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_enc()` in the reference implementation @[REF].
@@ -412,19 +469,33 @@ void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t seed[MLKEM_SYMBYTES];
- mlk_polymat at;
- mlk_polyvec sp, pkpv, ep, b;
- mlk_poly v, k, epp;
- mlk_polyvec_mulcache sp_cache;
+ int ret = 0;
+ MLK_ALLOC(seed, uint8_t, MLKEM_SYMBYTES, context);
+ MLK_ALLOC(at, mlk_polymat, 1, context);
+ MLK_ALLOC(sp, mlk_polyvec, 1, context);
+ MLK_ALLOC(pkpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(ep, mlk_polyvec, 1, context);
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(k, mlk_poly, 1, context);
+ MLK_ALLOC(epp, mlk_poly, 1, context);
+ MLK_ALLOC(sp_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (seed == NULL || at == NULL || sp == NULL || pkpv == NULL || ep == NULL ||
+ b == NULL || v == NULL || k == NULL || epp == NULL || sp_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_unpack_pk(pkpv, seed, pk);
- mlk_poly_frommsg(&k, m);
+ mlk_poly_frommsg(k, m);
/*
* Declassify the public seed.
@@ -437,87 +508,105 @@ void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
mlk_gen_matrix(at, seed, 1 /* transpose */);
#if MLKEM_K == 2
- mlk_poly_getnoise_eta1122_4x(&sp[0], &sp[1], &ep[0], &ep[1], coins, 0, 1, 2,
- 3);
- mlk_poly_getnoise_eta2(&epp, coins, 4);
+ mlk_poly_getnoise_eta1122_4x(&sp->vec[0], &sp->vec[1], &ep->vec[0],
+ &ep->vec[1], coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2(epp, coins, 4);
#elif MLKEM_K == 3
/*
* In this call, only the first three output buffers are needed.
* The last parameter is a dummy that's overwritten later.
*/
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &b[0], coins, 0, 1, 2,
- 0xFF);
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], NULL, coins,
+ 0, 1, 2, 0xFF /* irrelevant */);
/* The fourth output buffer in this call _is_ used. */
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &epp, coins, 3, 4, 5, 6);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], epp, coins,
+ 3, 4, 5, 6);
#elif MLKEM_K == 4
- mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &sp[3], coins, 0, 1, 2, 3);
- mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &ep[3], coins, 4, 5, 6, 7);
- mlk_poly_getnoise_eta2(&epp, coins, 8);
-#endif
+ mlk_poly_getnoise_eta1_4x(&sp->vec[0], &sp->vec[1], &sp->vec[2], &sp->vec[3],
+ coins, 0, 1, 2, 3);
+ mlk_poly_getnoise_eta2_4x(&ep->vec[0], &ep->vec[1], &ep->vec[2], &ep->vec[3],
+ coins, 4, 5, 6, 7);
+ mlk_poly_getnoise_eta2(epp, coins, 8);
+#endif /* MLKEM_K == 4 */
mlk_polyvec_ntt(sp);
mlk_polyvec_mulcache_compute(sp_cache, sp);
mlk_matvec_mul(b, at, sp, sp_cache);
- mlk_polyvec_basemul_acc_montgomery_cached(&v, pkpv, sp, sp_cache);
+ mlk_polyvec_basemul_acc_montgomery_cached(v, pkpv, sp, sp_cache);
mlk_polyvec_invntt_tomont(b);
- mlk_poly_invntt_tomont(&v);
+ mlk_poly_invntt_tomont(v);
mlk_polyvec_add(b, ep);
- mlk_poly_add(&v, &epp);
- mlk_poly_add(&v, &k);
+ mlk_poly_add(v, epp);
+ mlk_poly_add(v, k);
mlk_polyvec_reduce(b);
- mlk_poly_reduce(&v);
+ mlk_poly_reduce(v);
- mlk_pack_ciphertext(c, b, &v);
+ mlk_pack_ciphertext(c, b, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(seed, sizeof(seed));
- mlk_zeroize(&sp, sizeof(sp));
- mlk_zeroize(&sp_cache, sizeof(sp_cache));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(at, sizeof(at));
- mlk_zeroize(&k, sizeof(k));
- mlk_zeroize(&ep, sizeof(ep));
- mlk_zeroize(&epp, sizeof(epp));
+ MLK_FREE(sp_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(epp, mlk_poly, 1, context);
+ MLK_FREE(k, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ MLK_FREE(ep, mlk_polyvec, 1, context);
+ MLK_FREE(pkpv, mlk_polyvec, 1, context);
+ MLK_FREE(sp, mlk_polyvec, 1, context);
+ MLK_FREE(at, mlk_polymat, 1, context);
+ MLK_FREE(seed, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
/* Reference: `indcpa_dec()` in the reference implementation @[REF].
* - We use a mulcache for the scalar product.
* - We include buffer zeroization. */
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_polyvec b, skpv;
- mlk_poly v, sb;
- mlk_polyvec_mulcache b_cache;
+ int ret = 0;
+ MLK_ALLOC(b, mlk_polyvec, 1, context);
+ MLK_ALLOC(skpv, mlk_polyvec, 1, context);
+ MLK_ALLOC(v, mlk_poly, 1, context);
+ MLK_ALLOC(sb, mlk_poly, 1, context);
+ MLK_ALLOC(b_cache, mlk_polyvec_mulcache, 1, context);
+
+ if (b == NULL || skpv == NULL || v == NULL || sb == NULL || b_cache == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- mlk_unpack_ciphertext(b, &v, c);
+ mlk_unpack_ciphertext(b, v, c);
mlk_unpack_sk(skpv, sk);
mlk_polyvec_ntt(b);
mlk_polyvec_mulcache_compute(b_cache, b);
- mlk_polyvec_basemul_acc_montgomery_cached(&sb, skpv, b, b_cache);
- mlk_poly_invntt_tomont(&sb);
+ mlk_polyvec_basemul_acc_montgomery_cached(sb, skpv, b, b_cache);
+ mlk_poly_invntt_tomont(sb);
- mlk_poly_sub(&v, &sb);
- mlk_poly_reduce(&v);
+ mlk_poly_sub(v, sb);
+ mlk_poly_reduce(v);
- mlk_poly_tomsg(m, &v);
+ mlk_poly_tomsg(m, v);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(&skpv, sizeof(skpv));
- mlk_zeroize(&b, sizeof(b));
- mlk_zeroize(&b_cache, sizeof(b_cache));
- mlk_zeroize(&v, sizeof(v));
- mlk_zeroize(&sb, sizeof(sb));
+ MLK_FREE(b_cache, mlk_polyvec_mulcache, 1, context);
+ MLK_FREE(sb, mlk_poly, 1, context);
+ MLK_FREE(v, mlk_poly, 1, context);
+ MLK_FREE(skpv, mlk_polyvec, 1, context);
+ MLK_FREE(b, mlk_polyvec, 1, context);
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
@@ -529,4 +618,5 @@ void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
#undef mlk_pack_ciphertext
#undef mlk_unpack_ciphertext
#undef mlk_matvec_mul
-#undef mlk_poly_permute_bitrev_to_custom
+#undef mlk_polyvec_permute_bitrev_to_custom
+#undef mlk_polymat_permute_bitrev_to_custom
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/indcpa.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/indcpa.h
index 4c44d0d411..b31756dcb6 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/indcpa.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/indcpa.h
@@ -15,7 +15,6 @@
#ifndef MLK_INDCPA_H
#define MLK_INDCPA_H
-#include
#include "cbmc.h"
#include "common.h"
#include "poly_k.h"
@@ -39,18 +38,19 @@
*
**************************************************/
MLK_INTERNAL_API
-void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
+void mlk_gen_matrix(mlk_polymat *a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
requires(transposed == 0 || transposed == 1)
- assigns(object_whole(a))
- ensures(forall(x, 0, MLKEM_K * MLKEM_K,
- array_bound(a[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(a, sizeof(mlk_polymat)))
+ ensures(forall(x, 0, MLKEM_K, forall(y, 0, MLKEM_K,
+ array_bound(a->vec[x].vec[y].coeffs, 0, MLKEM_N, 0, MLKEM_Q))))
);
-#define mlk_indcpa_keypair_derand MLK_NAMESPACE_K(indcpa_keypair_derand)
+#define mlk_indcpa_keypair_derand \
+ MLK_NAMESPACE_K(indcpa_keypair_derand) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_keypair_derand
*
@@ -68,18 +68,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCPA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
-#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc)
+#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc) MLK_CONTEXT_PARAMETERS_4
/*************************************************
* Name: mlk_indcpa_enc
*
@@ -100,19 +105,23 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(c))
+ assigns(memory_slice(c, MLKEM_INDCPA_BYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
-#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec)
+#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec) MLK_CONTEXT_PARAMETERS_3
/*************************************************
* Name: mlk_indcpa_dec
*
@@ -130,14 +139,18 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
- const uint8_t c[MLKEM_INDCPA_BYTES],
- const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
+ const uint8_t c[MLKEM_INDCPA_BYTES],
+ const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
- assigns(object_whole(m))
+ assigns(memory_slice(m, MLKEM_INDCPA_MSGBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_INDCPA_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/kem.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/kem.c
index d6f4e83628..3c82d6df70 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/kem.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/kem.c
@@ -8,7 +8,8 @@
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
- * Validation Program National Institute of Standards and Technology
+ * Validation Program
+ * National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*
* - [FIPS203]
@@ -22,12 +23,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
-#include
+#include "kem.h"
#include "indcpa.h"
-#include "kem.h"
#include "randombytes.h"
#include "symmetric.h"
#include "verify.h"
@@ -36,44 +34,24 @@
* This is to facilitate building multiple instances
* of mlkem-native (e.g. with varying security levels)
* within a single compilation unit. */
-#define mlk_check_pk MLK_ADD_PARAM_SET(mlk_check_pk)
-#define mlk_check_sk MLK_ADD_PARAM_SET(mlk_check_sk)
-#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct)
+#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct) MLK_CONTEXT_PARAMETERS_2
/* End of parameter set namespacing */
-#if defined(CBMC)
-/* Redeclaration with contract needed for CBMC only */
-int memcmp(const void *str1, const void *str2, size_t n)
-__contract__(
- requires(memory_no_alias(str1, n))
- requires(memory_no_alias(str2, n))
-);
-#endif /* CBMC */
-
-/*************************************************
- * Name: mlk_check_pk
- *
- * Description: Implements modulus check mandated by FIPS 203,
- * i.e., ensures that coefficients are in [0,q-1].
- *
- * Arguments: - const uint8_t *pk: pointer to input public key
- * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
- *
- **************************************************/
-
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- mlk_polyvec p;
- uint8_t p_reencoded[MLKEM_POLYVECBYTES];
+ int ret = 0;
+ MLK_ALLOC(p, mlk_polyvec, 1, context);
+ MLK_ALLOC(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+
+ if (p == NULL || p_reencoded == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
mlk_polyvec_frombytes(p, pk);
mlk_polyvec_reduce(p);
@@ -81,39 +59,32 @@ static int mlk_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
/* We use a constant-time memcmp here to avoid having to
* declassify the PK before the PCT has succeeded. */
- res = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? -1 : 0;
+ ret = mlk_ct_memcmp(pk, p_reencoded, MLKEM_POLYVECBYTES) ? MLK_ERR_FAIL : 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(p_reencoded, sizeof(p_reencoded));
- mlk_zeroize(&p, sizeof(p));
- return res;
+ MLK_FREE(p_reencoded, uint8_t, MLKEM_POLYVECBYTES, context);
+ MLK_FREE(p, mlk_polyvec, 1, context);
+ return ret;
}
-/*************************************************
- * Name: mlk_check_sk
- *
- * Description: Implements public key hash check mandated by FIPS 203,
- * i.e., ensures that
- * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
- *
- * Arguments: - const uint8_t *sk: pointer to input private key
- * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
- * bytes)
- *
- * Returns: - 0 on success
- * - -1 on failure
- *
- * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
- *
- **************************************************/
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t test[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(test, uint8_t, MLKEM_SYMBYTES, context);
+
+ if (test == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
/*
* The parts of `sk` being hashed and compared here are public, so
* no public information is leaked through the runtime or the return value
@@ -128,23 +99,32 @@ static int mlk_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
mlk_hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES,
MLKEM_INDCCA_PUBLICKEYBYTES);
- res = memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, test,
- MLKEM_SYMBYTES)
- ? -1
+ /* This doesn't have to be a constant-time memcmp, but it's the only place
+ * in the library where a normal memcmp would be used otherwise, so for sake
+ * of minimizing stdlib dependency, we use our constant-time one anyway. */
+ ret = mlk_ct_memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ test, MLKEM_SYMBYTES)
+ ? MLK_ERR_FAIL
: 0;
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(test, sizeof(test));
- return res;
+ MLK_FREE(test, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES)));
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
+);
#if defined(MLK_CONFIG_KEYGEN_PCT)
/* Specification:
@@ -152,21 +132,30 @@ __contract__(
* @[FIPS203, Section 7.1, Pairwise Consistency]. */
/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES];
- uint8_t ss_enc[MLKEM_SSBYTES], ss_dec[MLKEM_SSBYTES];
+ int ret = 0;
+ MLK_ALLOC(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ MLK_ALLOC(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_ALLOC(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+
+ if (ct == NULL || ss_enc == NULL || ss_dec == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
- res = crypto_kem_enc(ct, ss_enc, pk);
- if (res != 0)
+ ret = mlk_kem_enc(ct, ss_enc, pk, context);
+ if (ret != 0)
{
goto cleanup;
}
- res = crypto_kem_dec(ss_dec, ct, sk);
- if (res != 0)
+ ret = mlk_kem_dec(ss_dec, ct, sk, context);
+ if (ret != 0)
{
goto cleanup;
}
@@ -179,26 +168,36 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
}
#endif /* MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST */
- res = mlk_ct_memcmp(ss_enc, ss_dec, sizeof(ss_dec));
+ ret = mlk_ct_memcmp(ss_enc, ss_dec, MLKEM_SSBYTES);
+ /* The result of the PCT is public. */
+ MLK_CT_TESTING_DECLASSIFY(&ret, sizeof(ret));
+
+ if (ret != 0)
+ {
+ ret = MLK_ERR_FAIL;
+ }
cleanup:
- /* The result of the PCT is public. */
- MLK_CT_TESTING_DECLASSIFY(&res, sizeof(res));
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(ct, sizeof(ct));
- mlk_zeroize(ss_enc, sizeof(ss_enc));
- mlk_zeroize(ss_dec, sizeof(ss_dec));
- return res;
+ MLK_FREE(ss_dec, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ss_enc, uint8_t, MLKEM_SSBYTES, context);
+ MLK_FREE(ct, uint8_t, MLKEM_INDCCA_CIPHERTEXTBYTES, context);
+ return ret;
}
-#else /* MLK_CONFIG_KEYGEN_PCT */
+#else /* MLK_CONFIG_KEYGEN_PCT */
+MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
+ uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
/* Skip PCT */
((void)pk);
((void)sk);
+#if defined(MLK_CONFIG_CONTEXT_PARAMETER)
+ ((void)context);
+#endif
return 0;
}
#endif /* !MLK_CONFIG_KEYGEN_PCT */
@@ -208,164 +207,240 @@ static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
* - We optionally include PCT which is not present in
* the reference code. */
MLK_EXTERNAL_API
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- mlk_indcpa_keypair_derand(pk, sk, coins);
- memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ int ret;
+
+ ret = mlk_indcpa_keypair_derand(pk, sk, coins, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
+ mlk_memcpy(sk + MLKEM_INDCPA_SECRETKEYBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_h(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, pk,
MLKEM_INDCCA_PUBLICKEYBYTES);
/* Value z for pseudo-random output on reject */
- memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ coins + MLKEM_SYMBYTES, MLKEM_SYMBYTES);
/* Declassify public key */
MLK_CT_TESTING_DECLASSIFY(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
/* Pairwise Consistency Test (PCT) @[FIPS140_3_IG, p.87] */
- if (mlk_check_pct(pk, sk))
+ ret = mlk_check_pct(pk, sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- return 0;
+cleanup:
+ if (ret != 0)
+ {
+ mlk_zeroize(pk, MLKEM_INDCCA_PUBLICKEYBYTES);
+ mlk_zeroize(sk, MLKEM_INDCCA_SECRETKEYBYTES);
+ }
+
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_keypair()` in the reference implementation @[REF]
* - We zeroize the stack buffer */
MLK_EXTERNAL_API
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Acquire necessary randomness, and mark it as secret. */
- mlk_randombytes(coins, 2 * MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (mlk_randombytes(coins, 2 * MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, 2 * MLKEM_SYMBYTES);
- res = crypto_kem_keypair_derand(pk, sk, coins);
+ ret = mlk_kem_keypair_derand(pk, sk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_enc_derand()` in the reference implementation @[REF]
* - We include public key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+
+ if (buf == NULL || kr == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.2, Modulus check] */
- if (mlk_check_pk(pk))
+ ret = mlk_kem_check_pk(pk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- memcpy(buf, coins, MLKEM_SYMBYTES);
+ mlk_memcpy(buf, coins, MLKEM_SYMBYTES);
/* Multitarget countermeasure for coins + contributory KEM */
mlk_hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
- memcpy(ss, kr, MLKEM_SYMBYTES);
+ mlk_memcpy(ss, kr, MLKEM_SYMBYTES);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
-
- return 0;
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ return ret;
}
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
/* Reference: `crypto_kem_enc()` in the reference implementation @[REF]
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
- int res;
- MLK_ALIGN uint8_t coins[MLKEM_SYMBYTES];
+ int ret = 0;
+ MLK_ALLOC(coins, uint8_t, MLKEM_SYMBYTES, context);
- mlk_randombytes(coins, MLKEM_SYMBYTES);
- MLK_CT_TESTING_SECRET(coins, sizeof(coins));
+ if (coins == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
+
+ if (mlk_randombytes(coins, MLKEM_SYMBYTES) != 0)
+ {
+ ret = MLK_ERR_RNG_FAIL;
+ goto cleanup;
+ }
+
+ MLK_CT_TESTING_SECRET(coins, MLKEM_SYMBYTES);
- res = crypto_kem_enc_derand(ct, ss, pk, coins);
+ ret = mlk_kem_enc_derand(ct, ss, pk, coins, context);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(coins, sizeof(coins));
- return res;
+ MLK_FREE(coins, uint8_t, MLKEM_SYMBYTES, context);
+ return ret;
}
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
/* Reference: `crypto_kem_dec()` in the reference implementation @[REF]
* - We include secret key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
{
+ int ret = 0;
uint8_t fail;
- MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
- /* Will contain key, coins */
- MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
- MLK_ALIGN uint8_t tmp[MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES];
-
const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
+ MLK_ALLOC(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_ALLOC(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+
+ if (buf == NULL || kr == NULL || tmp == NULL)
+ {
+ ret = MLK_ERR_OUT_OF_MEMORY;
+ goto cleanup;
+ }
/* Specification: Implements @[FIPS203, Section 7.3, Hash check] */
- if (mlk_check_sk(sk))
+ ret = mlk_kem_check_sk(sk, context);
+ if (ret != 0)
{
- return -1;
+ goto cleanup;
}
- mlk_indcpa_dec(buf, ct, sk);
+ ret = mlk_indcpa_dec(buf, ct, sk, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
/* Multitarget countermeasure for coins + contributory KEM */
- memcpy(buf + MLKEM_SYMBYTES,
- sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, MLKEM_SYMBYTES);
+ mlk_memcpy(buf + MLKEM_SYMBYTES,
+ sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* Recompute and compare ciphertext */
/* coins are in kr+MLKEM_SYMBYTES */
- mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES);
+ ret = mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES, context);
+ if (ret != 0)
+ {
+ goto cleanup;
+ }
+
fail = mlk_ct_memcmp(ct, tmp, MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Compute rejection key */
- memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
- MLKEM_SYMBYTES);
- memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
- mlk_hash_j(ss, tmp, sizeof(tmp));
+ mlk_memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
+ MLKEM_SYMBYTES);
+ mlk_memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
+ mlk_hash_j(ss, tmp, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Copy true key to return buffer if fail is 0 */
mlk_ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
+cleanup:
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(buf, sizeof(buf));
- mlk_zeroize(kr, sizeof(kr));
- mlk_zeroize(tmp, sizeof(tmp));
+ MLK_FREE(tmp, uint8_t, MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES,
+ context);
+ MLK_FREE(kr, uint8_t, 2 * MLKEM_SYMBYTES, context);
+ MLK_FREE(buf, uint8_t, 2 * MLKEM_SYMBYTES, context);
- return 0;
+ return ret;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef mlk_check_pk
-#undef mlk_check_sk
#undef mlk_check_pct
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/kem.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/kem.h
index d3e5f50ce6..0502715c39 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/kem.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/kem.h
@@ -10,12 +10,16 @@
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
+ *
+ * - [REF]
+ * CRYSTALS-Kyber C reference implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/ref
*/
#ifndef MLK_KEM_H
#define MLK_KEM_H
-#include
#include "cbmc.h"
#include "common.h"
#include "sys.h"
@@ -23,9 +27,7 @@
#if defined(MLK_CHECK_APIS)
/* Include to ensure consistency between internal kem.h
* and external mlkem_native.h. */
-#define MLK_CONFIG_API_NO_SUPERCOP
#include "mlkem_native.h"
-#undef MLK_CONFIG_API_NO_SUPERCOP
#if MLKEM_INDCCA_SECRETKEYBYTES != \
MLKEM_SECRETKEYBYTES(MLK_CONFIG_PARAMETER_SET)
@@ -44,14 +46,79 @@
#endif /* MLK_CHECK_APIS */
-#define crypto_kem_keypair_derand MLK_NAMESPACE_K(keypair_derand)
-#define crypto_kem_keypair MLK_NAMESPACE_K(keypair)
-#define crypto_kem_enc_derand MLK_NAMESPACE_K(enc_derand)
-#define crypto_kem_enc MLK_NAMESPACE_K(enc)
-#define crypto_kem_dec MLK_NAMESPACE_K(dec)
+#define mlk_kem_keypair_derand \
+ MLK_NAMESPACE_K(keypair_derand) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_keypair MLK_NAMESPACE_K(keypair) MLK_CONTEXT_PARAMETERS_2
+#define mlk_kem_enc_derand MLK_NAMESPACE_K(enc_derand) MLK_CONTEXT_PARAMETERS_4
+#define mlk_kem_enc MLK_NAMESPACE_K(enc) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_dec MLK_NAMESPACE_K(dec) MLK_CONTEXT_PARAMETERS_3
+#define mlk_kem_check_pk MLK_NAMESPACE_K(check_pk) MLK_CONTEXT_PARAMETERS_1
+#define mlk_kem_check_sk MLK_NAMESPACE_K(check_sk) MLK_CONTEXT_PARAMETERS_1
+
+/*************************************************
+ * Name: mlk_kem_check_pk
+ *
+ * Description: Implements modulus check mandated by FIPS 203,
+ * i.e., ensures that coefficients are in [0,q-1].
+ *
+ * Arguments: - const uint8_t *pk: pointer to input public key
+ * (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the modulus check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
+
+
+/*************************************************
+ * Name: mlk_kem_check_sk
+ *
+ * Description: Implements public key hash check mandated by FIPS 203,
+ * i.e., ensures that
+ * sk[768𝑘+32 ∶ 768𝑘+64] = H(pk)= H(sk[384𝑘 : 768𝑘+32])
+ *
+ * Arguments: - const uint8_t *sk: pointer to input private key
+ * (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
+ * bytes)
+ *
+ * Returns: - 0 on success
+ * - MLK_ERR_FAIL: If the public key hash check failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ *
+ * Specification: Implements @[FIPS203, Section 7.3, 'hash check']
+ *
+ **************************************************/
+
+/* Reference: Not implemented in the reference implementation @[REF]. */
+MLK_EXTERNAL_API
+MLK_MUST_CHECK_RETURN_VALUE
+int mlk_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
+__contract__(
+ requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
+);
/*************************************************
- * Name: crypto_kem_keypair_derand
+ * Name: mlk_kem_keypair_derand
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -67,26 +134,33 @@
* random bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 16, ML-KEM.KeyGen_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
- const uint8_t coins[2 * MLKEM_SYMBYTES])
+int mlk_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ const uint8_t coins[2 * MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_keypair
+ * Name: mlk_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
@@ -99,24 +173,32 @@ __contract__(
* bytes)
*
* Returns: - 0: On success
- * - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT) is enabled.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If MLK_CONFIG_KEYGEN_PCT is enabled and the
+ * PCT failed.
*
* Specification: Implements @[FIPS203, Algorithm 19, ML-KEM.KeyGen]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(pk))
- assigns(object_whole(sk))
+ assigns(memory_slice(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
+ assigns(memory_slice(sk, MLKEM_INDCCA_SECRETKEYBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_enc_derand
+ * Name: mlk_kem_enc_derand
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -134,29 +216,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 17, ML-KEM.Encaps_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
- const uint8_t coins[MLKEM_SYMBYTES])
+int mlk_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ const uint8_t coins[MLKEM_SYMBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
/*************************************************
- * Name: crypto_kem_enc
+ * Name: mlk_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
@@ -171,27 +258,34 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'modulus check' @[FIPS203, Section 7.2]
- * for the public key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
+ * - MLK_ERR_RNG_FAIL: Random number generation failed.
+ * - MLK_ERR_FAIL: If the 'modulus check' @[FIPS203, Section 7.2]
+ * for the public key fails.
*
* Specification: Implements @[FIPS203, Algorithm 20, ML-KEM.Encaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- uint8_t ss[MLKEM_SSBYTES],
- const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
+int mlk_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
- assigns(object_whole(ct))
- assigns(object_whole(ss))
+ assigns(memory_slice(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY ||
+ return_value == MLK_ERR_RNG_FAIL)
);
/*************************************************
- * Name: crypto_kem_dec
+ * Name: mlk_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
@@ -206,22 +300,27 @@ __contract__(
* bytes)
*
* Returns: - 0 on success
- * - -1 if the 'hash check' @[FIPS203, Section 7.3]
- * for the secret key fails.
+ * - MLK_ERR_FAIL: If the 'hash check' @[FIPS203, Section 7.3]
+ * for the secret key fails.
+ * - MLK_ERR_OUT_OF_MEMORY: If MLK_CONFIG_CUSTOM_ALLOC_FREE is
+ * used and an allocation via MLK_CUSTOM_ALLOC returned NULL.
*
* Specification: Implements @[FIPS203, Algorithm 21, ML-KEM.Decaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
-int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
- const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
- const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
+int mlk_kem_dec(uint8_t ss[MLKEM_SSBYTES],
+ const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
+ const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
+ MLK_CONFIG_CONTEXT_PARAMETER_TYPE context)
__contract__(
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
- assigns(object_whole(ss))
+ assigns(memory_slice(ss, MLKEM_SSBYTES))
+ ensures(return_value == 0 || return_value == MLK_ERR_FAIL ||
+ return_value == MLK_ERR_OUT_OF_MEMORY)
);
#endif /* !MLK_KEM_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/api.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/api.h
index aea28a3af4..0308f2bd51 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/api.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/api.h
@@ -17,10 +17,18 @@
* and run sanity checks.
*/
-#include
#include "../cbmc.h"
#include "../common.h"
+/* Backends must return MLK_NATIVE_FUNC_SUCCESS upon success. */
+#define MLK_NATIVE_FUNC_SUCCESS (0)
+/* Backends may return MLK_NATIVE_FUNC_FALLBACK to signal to the frontend that
+ * the target/parameters are unsupported; typically, this would be because of
+ * dependencies on CPU features not detected on the host CPU. In this case,
+ * the frontend falls back to the default C implementation. */
+#define MLK_NATIVE_FUNC_FALLBACK (-1)
+
+
/* Absolute exclusive upper bound for the output of the inverse NTT
*
* NOTE: This is the same bound as in poly.h and has to be kept
@@ -74,12 +82,16 @@
*
* Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_ntt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_ntt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_NTT */
@@ -140,11 +152,14 @@ __contract__(
*
* Arguments: - uint16_t *a: pointer to in/output polynomial
**************************************************/
-static MLK_INLINE void mlk_intt_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_intt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_INTT */
@@ -156,11 +171,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_reduce_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_reduce_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
@@ -173,11 +191,14 @@ __contract__(
*
* Arguments: - int16_t r[MLKEM_N]: pointer to input/output polynomial
**************************************************/
-static MLK_INLINE void mlk_poly_tomont_native(int16_t p[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tomont_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
- ensures(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
+ ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
@@ -203,13 +224,15 @@ __contract__(
* OUTPUT
* - cache: pointer to multiplication cache
**************************************************/
-static MLK_INLINE void mlk_poly_mulcache_compute_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_mulcache_compute_native(
int16_t cache[MLKEM_N / 2], const int16_t mlk_poly[MLKEM_N])
__contract__(
requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
- assigns(object_whole(cache))
- ensures(array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
+ assigns(memory_slice(cache, sizeof(int16_t) * (MLKEM_N / 2)))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
@@ -234,7 +257,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
__contract__(
@@ -244,6 +268,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 2 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 2 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
@@ -267,7 +292,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
__contract__(
@@ -277,6 +303,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 3 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 3 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
@@ -300,7 +327,8 @@ __contract__(
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
**************************************************/
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
__contract__(
@@ -310,6 +338,7 @@ __contract__(
requires(memory_no_alias(b_cache, sizeof(int16_t) * 4 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 4 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
@@ -324,18 +353,20 @@ __contract__(
*
* Arguments: INPUT:
* - a: const pointer to input polynomial,
- * with each coefficient in the range -Q+1 .. Q-1
+ * with each coefficient in the range 0 .. Q-1
* OUTPUT
* - r: pointer to output byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+ const int16_t a[MLKEM_N])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
);
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
@@ -353,13 +384,15 @@ __contract__(
* - a: const pointer to input byte array
* (of MLKEM_POLYBYTES bytes)
**************************************************/
-static MLK_INLINE void mlk_poly_frombytes_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_frombytes_native(
int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(a, sizeof(int16_t) * MLKEM_N))
- ensures(array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
);
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
@@ -381,6 +414,7 @@ __contract__(
* Otherwise, returns non-negative number of sampled 16-bit integers (at most
* len).
**************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
const uint8_t *buf,
unsigned buflen)
@@ -389,8 +423,10 @@ __contract__(
requires(memory_no_alias(r, sizeof(int16_t) * len))
requires(memory_no_alias(buf, buflen))
assigns(memory_slice(r, sizeof(int16_t) * len))
- ensures(return_value == -1 || (0 <= return_value && return_value <= len))
- ensures(return_value != -1 ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> (0 <= return_value && return_value <= len))
+ ensures(return_value != MLK_NATIVE_FUNC_FALLBACK
+ ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
@@ -408,8 +444,15 @@ __contract__(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d4_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d4_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
@@ -425,8 +468,15 @@ static MLK_INLINE void mlk_poly_compress_d4_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d10_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d10_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
@@ -444,8 +494,15 @@ static MLK_INLINE void mlk_poly_compress_d10_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d4_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d4_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
@@ -463,8 +520,15 @@ static MLK_INLINE void mlk_poly_decompress_d4_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d10_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d10_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
@@ -482,8 +546,15 @@ static MLK_INLINE void mlk_poly_decompress_d10_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d5_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d5_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
@@ -499,8 +570,15 @@ static MLK_INLINE void mlk_poly_compress_d5_native(
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
**************************************************/
-static MLK_INLINE void mlk_poly_compress_d11_native(
- uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d11_native(
+ uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N])
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK));
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
@@ -518,8 +596,15 @@ static MLK_INLINE void mlk_poly_compress_d11_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d5_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d5_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
@@ -537,8 +622,15 @@ static MLK_INLINE void mlk_poly_decompress_d5_native(
* (non-negative and smaller than MLKEM_Q).
*
**************************************************/
-static MLK_INLINE void mlk_poly_decompress_d11_native(
- int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d11_native(
+ int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
+ ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/meta.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/meta.h
index f2b9b848b7..4291d629b1 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/meta.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/meta.h
@@ -18,4 +18,8 @@
#include "x86_64/meta.h"
#endif
+#if defined(MLK_SYS_RISCV64_RVV)
+#include "riscv64/meta.h"
+#endif
+
#endif /* !MLK_NATIVE_META_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/meta.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/meta.h
index d8459ec6fc..39fa04c2b3 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/meta.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/meta.h
@@ -30,143 +30,272 @@
#define MLK_USE_NATIVE_POLY_DECOMPRESS_D11
#if !defined(__ASSEMBLER__)
-#include <immintrin.h>
#include "../../common.h"
+#include "../api.h"
#include "src/arith_native_x86_64.h"
+#include "src/compress_consts.h"
static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
{
- mlk_nttunpack_avx2((__m256i *)(data));
+ if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ mlk_nttunpack_avx2(data);
+ }
}
+MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
const uint8_t *buf,
unsigned buflen)
{
- /* AVX2 implementation assumes specific buffer lengths */
- if (len != MLKEM_N || buflen != MLK_AVX2_REJ_UNIFORM_BUFLEN)
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2) || len != MLKEM_N ||
+ buflen % 12 != 0)
{
- return -1;
+ return MLK_NATIVE_FUNC_FALLBACK;
}
-
- return (int)mlk_rej_uniform_avx2(r, buf);
+ return (int)mlk_rej_uniform_asm(r, buf, buflen, mlk_rej_uniform_table);
}
-static MLK_INLINE void mlk_ntt_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N])
{
- mlk_ntt_avx2((__m256i *)data, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_ntt_avx2(data, mlk_qdata);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_intt_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N])
{
- mlk_invntt_avx2((__m256i *)data, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_invntt_avx2(data, mlk_qdata);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_reduce_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N])
{
- mlk_reduce_avx2((__m256i *)data, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_reduce_avx2(data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_tomont_native(int16_t data[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N])
{
- mlk_tomont_avx2((__m256i *)data, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_tomont_avx2(data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_mulcache_compute_native(
- int16_t x[MLKEM_N / 2], const int16_t y[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
+ const int16_t y[MLKEM_N])
{
- mlk_poly_mulcache_compute_avx2((__m256i *)x, (const __m256i *)y,
- mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_mulcache_compute_avx2(x, y, mlk_qdata);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
{
- mlk_polyvec_basemul_acc_montgomery_cached_avx2(2, r, a, b, b_cache);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
{
- mlk_polyvec_basemul_acc_montgomery_cached_avx2(3, r, a, b, b_cache);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-static MLK_INLINE void mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
{
- mlk_polyvec_basemul_acc_montgomery_cached_avx2(4, r, a, b, b_cache);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(r, a, b, b_cache);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-static MLK_INLINE void mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
- const int16_t a[MLKEM_N])
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
+ const int16_t a[MLKEM_N])
{
- mlk_ntttobytes_avx2(r, (const __m256i *)a, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_ntttobytes_avx2(r, a);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_frombytes_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_frombytes_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYBYTES])
{
- mlk_nttfrombytes_avx2((__m256i *)r, a, mlk_qdata.vec);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_nttfrombytes_avx2(r, a);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-static MLK_INLINE void mlk_poly_compress_d4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d4_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N])
{
- mlk_poly_compress_d4_avx2(r, (const __m256i *)a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_compress_d4_avx2(r, a, mlk_compress_d4_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_compress_d10_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d10_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N])
{
- mlk_poly_compress_d10_avx2(r, (const __m256i *)a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_compress_d10_avx2(r, a, mlk_compress_d10_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_decompress_d4_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d4_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
{
- mlk_poly_decompress_d4_avx2((__m256i *)r, a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_decompress_d4_avx2(r, a, mlk_decompress_d4_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_decompress_d10_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d10_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
{
- mlk_poly_decompress_d10_avx2((__m256i *)r, a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_decompress_d10_avx2(r, a, mlk_decompress_d10_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-static MLK_INLINE void mlk_poly_compress_d5_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d5_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N])
{
- mlk_poly_compress_d5_avx2(r, (const __m256i *)a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_compress_d5_avx2(r, a, mlk_compress_d5_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_compress_d11_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_compress_d11_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N])
{
- mlk_poly_compress_d11_avx2(r, (const __m256i *)a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_compress_d11_avx2(r, a, mlk_compress_d11_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_decompress_d5_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d5_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
{
- mlk_poly_decompress_d5_avx2((__m256i *)r, a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_decompress_d5_avx2(r, a, mlk_decompress_d5_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
-static MLK_INLINE void mlk_poly_decompress_d11_native(
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_poly_decompress_d11_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
{
- mlk_poly_decompress_d11_avx2((__m256i *)r, a);
+ if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
+ {
+ return MLK_NATIVE_FUNC_FALLBACK;
+ }
+
+ mlk_poly_decompress_d11_avx2(r, a, mlk_decompress_d11_data);
+ return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/align.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/align.h
deleted file mode 100644
index 5086f69864..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/align.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
-#ifndef MLK_NATIVE_X86_64_SRC_ALIGN_H
-#define MLK_NATIVE_X86_64_SRC_ALIGN_H
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-#include <immintrin.h>
-#include <stdint.h>
-
-#define MLK_ALIGNED_INT16(N) \
- union \
- { \
- int16_t coeffs[N]; \
- __m256i vec[(N + 15) / 16]; \
- }
-
-#endif /* !MLK_NATIVE_X86_64_SRC_ALIGN_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/arith_native_x86_64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/arith_native_x86_64.h
index 2e8d6849a3..d73ba4346e 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/arith_native_x86_64.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/arith_native_x86_64.h
@@ -7,81 +7,303 @@
#include "../../../common.h"
-#include <immintrin.h>
#include <stdint.h>
+#include "compress_consts.h"
#include "consts.h"
#define MLK_AVX2_REJ_UNIFORM_BUFLEN \
(3 * 168) /* REJ_UNIFORM_NBLOCKS * SHAKE128_RATE */
-#define mlk_rej_uniform_avx2 MLK_NAMESPACE(rej_uniform_avx2)
-unsigned mlk_rej_uniform_avx2(int16_t *r, const uint8_t *buf);
-
#define mlk_rej_uniform_table MLK_NAMESPACE(rej_uniform_table)
-extern const uint8_t mlk_rej_uniform_table[256][8];
+extern const uint8_t mlk_rej_uniform_table[];
+
+#define mlk_rej_uniform_asm MLK_NAMESPACE(rej_uniform_asm)
+MLK_MUST_CHECK_RETURN_VALUE
+uint64_t mlk_rej_uniform_asm(int16_t *r, const uint8_t *buf, unsigned buflen,
+ const uint8_t *table)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_rej_uniform.ml. */
+__contract__(
+ requires(buflen % 12 == 0)
+ requires(memory_no_alias(buf, buflen))
+ requires(table == mlk_rej_uniform_table)
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(return_value <= MLKEM_N)
+ ensures(array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
+);
#define mlk_ntt_avx2 MLK_NAMESPACE(ntt_avx2)
-void mlk_ntt_avx2(__m256i *r, const __m256i *mlk_qdata);
+void mlk_ntt_avx2(int16_t *r, const int16_t *qdata)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_ntt.ml */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(array_abs_bound(r, 0, MLKEM_N, 8192))
+ requires(qdata == mlk_qdata)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ /* check-magic: off */
+ ensures(array_abs_bound(r, 0, MLKEM_N, 23595))
+ /* check-magic: on */
+);
#define mlk_invntt_avx2 MLK_NAMESPACE(invntt_avx2)
-void mlk_invntt_avx2(__m256i *r, const __m256i *mlk_qdata);
+void mlk_invntt_avx2(int16_t *r, const int16_t *qdata)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_intt.ml */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(qdata == mlk_qdata)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ /* check-magic: off */
+ ensures(array_abs_bound(r, 0, MLKEM_N, 26632))
+ /* check-magic: on */
+);
#define mlk_nttunpack_avx2 MLK_NAMESPACE(nttunpack_avx2)
-void mlk_nttunpack_avx2(__m256i *r);
+void mlk_nttunpack_avx2(int16_t *r)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_unpack.ml */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ /* Output is a permutation of input: every output coefficient
+ * is some input coefficient */
+ ensures(forall(i, 0, MLKEM_N, exists(j, 0, MLKEM_N,
+ r[i] == old(*(int16_t (*)[MLKEM_N])r)[j])))
+);
#define mlk_reduce_avx2 MLK_NAMESPACE(reduce_avx2)
-void mlk_reduce_avx2(__m256i *r, const __m256i *mlk_qdata);
-
-#define mlk_basemul_avx2 MLK_NAMESPACE(basemul_avx2)
-void mlk_basemul_avx2(__m256i *r, const __m256i *a, const __m256i *b,
- const __m256i *b_cache, const __m256i *mlk_qdata);
+void mlk_reduce_avx2(int16_t *r)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_reduce.ml */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+);
#define mlk_poly_mulcache_compute_avx2 MLK_NAMESPACE(poly_mulcache_compute_avx2)
-void mlk_poly_mulcache_compute_avx2(__m256i *out, const __m256i *in,
- const __m256i *mlk_qdata);
+void mlk_poly_mulcache_compute_avx2(int16_t *out, const int16_t *in,
+ const int16_t *qdata)
+/* This must be kept in sync with the HOL-Light specification
+ * in proofs/hol_light/x86_64/proofs/mlkem_mulcache_compute.ml */
+__contract__(
+ requires(memory_no_alias(out, sizeof(int16_t) * (MLKEM_N / 2)))
+ requires(memory_no_alias(in, sizeof(int16_t) * MLKEM_N))
+ requires(qdata == mlk_qdata)
+ assigns(memory_slice(out, sizeof(int16_t) * (MLKEM_N / 2)))
+ ensures(array_abs_bound(out, 0, MLKEM_N/2, MLKEM_Q))
+);
-#define mlk_polyvec_basemul_acc_montgomery_cached_avx2 \
- MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_avx2)
-void mlk_polyvec_basemul_acc_montgomery_cached_avx2(unsigned k,
- int16_t r[MLKEM_N],
- const int16_t *a,
- const int16_t *b,
- const int16_t *kb_cache);
+#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k2 \
+ MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, sizeof(int16_t) * 2 * MLKEM_N))
+ requires(memory_no_alias(b, sizeof(int16_t) * 2 * MLKEM_N))
+ requires(memory_no_alias(b_cache, sizeof(int16_t) * 2 * (MLKEM_N / 2)))
+ requires(array_abs_bound(a, 0, 2 * MLKEM_N, MLKEM_UINT12_LIMIT + 1))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+);
+
+#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k3 \
+ MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, sizeof(int16_t) * 3 * MLKEM_N))
+ requires(memory_no_alias(b, sizeof(int16_t) * 3 * MLKEM_N))
+ requires(memory_no_alias(b_cache, sizeof(int16_t) * 3 * (MLKEM_N / 2)))
+ requires(array_abs_bound(a, 0, 3 * MLKEM_N, MLKEM_UINT12_LIMIT + 1))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+);
+
+#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k4 \
+ MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
+void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(int16_t *r,
+ const int16_t *a,
+ const int16_t *b,
+ const int16_t *b_cache)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, sizeof(int16_t) * 4 * MLKEM_N))
+ requires(memory_no_alias(b, sizeof(int16_t) * 4 * MLKEM_N))
+ requires(memory_no_alias(b_cache, sizeof(int16_t) * 4 * (MLKEM_N / 2)))
+ requires(array_abs_bound(a, 0, 4 * MLKEM_N, MLKEM_UINT12_LIMIT + 1))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+);
#define mlk_ntttobytes_avx2 MLK_NAMESPACE(ntttobytes_avx2)
-void mlk_ntttobytes_avx2(uint8_t *r, const __m256i *a,
- const __m256i *mlk_qdata);
+void mlk_ntttobytes_avx2(uint8_t *r, const int16_t *a)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_tobytes.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYBYTES))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ assigns(memory_slice(r, MLKEM_POLYBYTES))
+);
#define mlk_nttfrombytes_avx2 MLK_NAMESPACE(nttfrombytes_avx2)
-void mlk_nttfrombytes_avx2(__m256i *r, const uint8_t *a,
- const __m256i *mlk_qdata);
+void mlk_nttfrombytes_avx2(int16_t *r, const uint8_t *a)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_frombytes.ml.
+ */
+__contract__(
+ requires(memory_no_alias(a, MLKEM_POLYBYTES))
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
+);
#define mlk_tomont_avx2 MLK_NAMESPACE(tomont_avx2)
-void mlk_tomont_avx2(__m256i *r, const __m256i *mlk_qdata);
+void mlk_tomont_avx2(int16_t *r)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_tomont.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+);
#define mlk_poly_compress_d4_avx2 MLK_NAMESPACE(poly_compress_d4_avx2)
void mlk_poly_compress_d4_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
- const __m256i *MLK_RESTRICT a);
+ const int16_t *MLK_RESTRICT a,
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_compress_d4.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ requires(data == mlk_compress_d4_data)
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D4))
+);
+
#define mlk_poly_decompress_d4_avx2 MLK_NAMESPACE(poly_decompress_d4_avx2)
-void mlk_poly_decompress_d4_avx2(__m256i *MLK_RESTRICT r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
-#define mlk_poly_compress_d10_avx2 MLK_NAMESPACE(poly_compress10_avx2)
+void mlk_poly_decompress_d4_avx2(int16_t *MLK_RESTRICT r,
+ const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4],
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_decompress_d4.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D4))
+ requires(data == mlk_decompress_d4_data)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define mlk_poly_compress_d10_avx2 MLK_NAMESPACE(poly_compress_d10_avx2)
void mlk_poly_compress_d10_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
- const __m256i *MLK_RESTRICT a);
-#define mlk_poly_decompress_d10_avx2 MLK_NAMESPACE(poly_decompress10_avx2)
+ const int16_t *MLK_RESTRICT a,
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_compress_d10.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ requires(data == mlk_compress_d10_data)
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D10))
+);
+
+#define mlk_poly_decompress_d10_avx2 MLK_NAMESPACE(poly_decompress_d10_avx2)
void mlk_poly_decompress_d10_avx2(
- __m256i *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
+ int16_t *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10],
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_decompress_d10.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D10))
+ requires(data == mlk_decompress_d10_data)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
#define mlk_poly_compress_d5_avx2 MLK_NAMESPACE(poly_compress_d5_avx2)
void mlk_poly_compress_d5_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
- const __m256i *MLK_RESTRICT a);
+ const int16_t *MLK_RESTRICT a,
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_compress_d5.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ requires(data == mlk_compress_d5_data)
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D5))
+);
+
#define mlk_poly_decompress_d5_avx2 MLK_NAMESPACE(poly_decompress_d5_avx2)
-void mlk_poly_decompress_d5_avx2(__m256i *MLK_RESTRICT r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
-#define mlk_poly_compress_d11_avx2 MLK_NAMESPACE(poly_compress11_avx2)
+void mlk_poly_decompress_d5_avx2(int16_t *MLK_RESTRICT r,
+ const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5],
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_decompress_d5.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D5))
+ requires(data == mlk_decompress_d5_data)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+);
+
+#define mlk_poly_compress_d11_avx2 MLK_NAMESPACE(poly_compress_d11_avx2)
void mlk_poly_compress_d11_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
- const __m256i *MLK_RESTRICT a);
-#define mlk_poly_decompress_d11_avx2 MLK_NAMESPACE(poly_decompress11_avx2)
+ const int16_t *MLK_RESTRICT a,
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_compress_d11.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
+ requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
+ requires(data == mlk_compress_d11_data)
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_D11))
+);
+
+#define mlk_poly_decompress_d11_avx2 MLK_NAMESPACE(poly_decompress_d11_avx2)
void mlk_poly_decompress_d11_avx2(
- __m256i *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
+ int16_t *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11],
+ const uint8_t *data)
+/* This must be kept in sync with the HOL-Light specification in
+ * proofs/hol_light/x86_64/proofs/mlkem_poly_decompress_d11.ml.
+ */
+__contract__(
+ requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+ requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_D11))
+ requires(data == mlk_decompress_d11_data)
+ assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+ ensures(array_bound(r, 0, MLKEM_N, 0, MLKEM_Q))
+);
#endif /* !MLK_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/basemul.S
deleted file mode 100644
index fbe5a8e91f..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/basemul.S
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- *
- * The main difference is the use of a mulcache.
- */
-
-#include "../../../common.h"
-#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-
-/*
- * WARNING: This file is auto-derived from the mlkem-native source file
- * dev/x86_64/src/basemul.S using scripts/simpasm. Do not modify it directly.
- */
-
-
-.text
-.balign 4
-.global MLK_ASM_NAMESPACE(basemul_avx2)
-MLK_ASM_FN_SYMBOL(basemul_avx2)
-
- movq %rsp, %r11
- andq $-0x20, %rsp
- subq $0x20, %rsp
- vmovdqa 0x20(%r8), %ymm0
- vmovdqa (%rsi), %ymm1
- vmovdqa 0x20(%rsi), %ymm2
- vmovdqa 0x40(%rsi), %ymm3
- vmovdqa 0x60(%rsi), %ymm4
- vpmullw %ymm0, %ymm1, %ymm9
- vpmullw %ymm0, %ymm2, %ymm10
- vpmullw %ymm0, %ymm3, %ymm11
- vpmullw %ymm0, %ymm4, %ymm12
- vmovdqa (%rdx), %ymm5
- vmovdqa 0x20(%rdx), %ymm6
- vpmulhw %ymm5, %ymm1, %ymm13
- vpmulhw %ymm6, %ymm1, %ymm1
- vpmulhw %ymm5, %ymm2, %ymm14
- vmovdqa 0x40(%rdx), %ymm7
- vmovdqa 0x60(%rdx), %ymm8
- vpmulhw %ymm7, %ymm3, %ymm15
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm7, %ymm4, %ymm0
- vmovdqa %ymm13, (%rsp)
- vpmullw %ymm5, %ymm9, %ymm13
- vpmullw %ymm6, %ymm9, %ymm9
- vpmullw %ymm5, %ymm10, %ymm5
- vpmullw %ymm7, %ymm11, %ymm6
- vpmullw %ymm8, %ymm11, %ymm11
- vpmullw %ymm7, %ymm12, %ymm7
- vmovdqa (%rcx), %ymm8
- vpmulhw %ymm8, %ymm2, %ymm2
- vpmullw %ymm8, %ymm10, %ymm10
- vmovdqa 0x20(%rcx), %ymm8
- vpmulhw %ymm8, %ymm4, %ymm4
- vpmullw %ymm8, %ymm12, %ymm12
- vmovdqa (%r8), %ymm8
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm9, %ymm9
- vpmulhw %ymm8, %ymm5, %ymm5
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm6, %ymm6
- vpmulhw %ymm8, %ymm11, %ymm11
- vpmulhw %ymm8, %ymm7, %ymm7
- vpmulhw %ymm8, %ymm12, %ymm12
- vpsubw (%rsp), %ymm13, %ymm13
- vpsubw %ymm9, %ymm1, %ymm9
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm10, %ymm2, %ymm10
- vpsubw %ymm6, %ymm15, %ymm6
- vpsubw %ymm11, %ymm3, %ymm11
- vpsubw %ymm7, %ymm0, %ymm7
- vpsubw %ymm12, %ymm4, %ymm12
- vpaddw %ymm5, %ymm9, %ymm9
- vpaddw %ymm7, %ymm11, %ymm11
- vpsubw %ymm13, %ymm10, %ymm13
- vpsubw %ymm12, %ymm6, %ymm6
- vmovdqa %ymm13, (%rdi)
- vmovdqa %ymm9, 0x20(%rdi)
- vmovdqa %ymm6, 0x40(%rdi)
- vmovdqa %ymm11, 0x60(%rdi)
- vmovdqa 0x20(%r8), %ymm0
- vmovdqa 0x80(%rsi), %ymm1
- vmovdqa 0xa0(%rsi), %ymm2
- vmovdqa 0xc0(%rsi), %ymm3
- vmovdqa 0xe0(%rsi), %ymm4
- vpmullw %ymm0, %ymm1, %ymm9
- vpmullw %ymm0, %ymm2, %ymm10
- vpmullw %ymm0, %ymm3, %ymm11
- vpmullw %ymm0, %ymm4, %ymm12
- vmovdqa 0x80(%rdx), %ymm5
- vmovdqa 0xa0(%rdx), %ymm6
- vpmulhw %ymm5, %ymm1, %ymm13
- vpmulhw %ymm6, %ymm1, %ymm1
- vpmulhw %ymm5, %ymm2, %ymm14
- vmovdqa 0xc0(%rdx), %ymm7
- vmovdqa 0xe0(%rdx), %ymm8
- vpmulhw %ymm7, %ymm3, %ymm15
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm7, %ymm4, %ymm0
- vmovdqa %ymm13, (%rsp)
- vpmullw %ymm5, %ymm9, %ymm13
- vpmullw %ymm6, %ymm9, %ymm9
- vpmullw %ymm5, %ymm10, %ymm5
- vpmullw %ymm7, %ymm11, %ymm6
- vpmullw %ymm8, %ymm11, %ymm11
- vpmullw %ymm7, %ymm12, %ymm7
- vmovdqa 0x40(%rcx), %ymm8
- vpmulhw %ymm8, %ymm2, %ymm2
- vpmullw %ymm8, %ymm10, %ymm10
- vmovdqa 0x60(%rcx), %ymm8
- vpmulhw %ymm8, %ymm4, %ymm4
- vpmullw %ymm8, %ymm12, %ymm12
- vmovdqa (%r8), %ymm8
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm9, %ymm9
- vpmulhw %ymm8, %ymm5, %ymm5
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm6, %ymm6
- vpmulhw %ymm8, %ymm11, %ymm11
- vpmulhw %ymm8, %ymm7, %ymm7
- vpmulhw %ymm8, %ymm12, %ymm12
- vpsubw (%rsp), %ymm13, %ymm13
- vpsubw %ymm9, %ymm1, %ymm9
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm10, %ymm2, %ymm10
- vpsubw %ymm6, %ymm15, %ymm6
- vpsubw %ymm11, %ymm3, %ymm11
- vpsubw %ymm7, %ymm0, %ymm7
- vpsubw %ymm12, %ymm4, %ymm12
- vpaddw %ymm5, %ymm9, %ymm9
- vpaddw %ymm7, %ymm11, %ymm11
- vpsubw %ymm13, %ymm10, %ymm13
- vpsubw %ymm12, %ymm6, %ymm6
- vmovdqa %ymm13, 0x80(%rdi)
- vmovdqa %ymm9, 0xa0(%rdi)
- vmovdqa %ymm6, 0xc0(%rdi)
- vmovdqa %ymm11, 0xe0(%rdi)
- vmovdqa 0x20(%r8), %ymm0
- vmovdqa 0x100(%rsi), %ymm1
- vmovdqa 0x120(%rsi), %ymm2
- vmovdqa 0x140(%rsi), %ymm3
- vmovdqa 0x160(%rsi), %ymm4
- vpmullw %ymm0, %ymm1, %ymm9
- vpmullw %ymm0, %ymm2, %ymm10
- vpmullw %ymm0, %ymm3, %ymm11
- vpmullw %ymm0, %ymm4, %ymm12
- vmovdqa 0x100(%rdx), %ymm5
- vmovdqa 0x120(%rdx), %ymm6
- vpmulhw %ymm5, %ymm1, %ymm13
- vpmulhw %ymm6, %ymm1, %ymm1
- vpmulhw %ymm5, %ymm2, %ymm14
- vmovdqa 0x140(%rdx), %ymm7
- vmovdqa 0x160(%rdx), %ymm8
- vpmulhw %ymm7, %ymm3, %ymm15
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm7, %ymm4, %ymm0
- vmovdqa %ymm13, (%rsp)
- vpmullw %ymm5, %ymm9, %ymm13
- vpmullw %ymm6, %ymm9, %ymm9
- vpmullw %ymm5, %ymm10, %ymm5
- vpmullw %ymm7, %ymm11, %ymm6
- vpmullw %ymm8, %ymm11, %ymm11
- vpmullw %ymm7, %ymm12, %ymm7
- vmovdqa 0x80(%rcx), %ymm8
- vpmulhw %ymm8, %ymm2, %ymm2
- vpmullw %ymm8, %ymm10, %ymm10
- vmovdqa 0xa0(%rcx), %ymm8
- vpmulhw %ymm8, %ymm4, %ymm4
- vpmullw %ymm8, %ymm12, %ymm12
- vmovdqa (%r8), %ymm8
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm9, %ymm9
- vpmulhw %ymm8, %ymm5, %ymm5
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm6, %ymm6
- vpmulhw %ymm8, %ymm11, %ymm11
- vpmulhw %ymm8, %ymm7, %ymm7
- vpmulhw %ymm8, %ymm12, %ymm12
- vpsubw (%rsp), %ymm13, %ymm13
- vpsubw %ymm9, %ymm1, %ymm9
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm10, %ymm2, %ymm10
- vpsubw %ymm6, %ymm15, %ymm6
- vpsubw %ymm11, %ymm3, %ymm11
- vpsubw %ymm7, %ymm0, %ymm7
- vpsubw %ymm12, %ymm4, %ymm12
- vpaddw %ymm5, %ymm9, %ymm9
- vpaddw %ymm7, %ymm11, %ymm11
- vpsubw %ymm13, %ymm10, %ymm13
- vpsubw %ymm12, %ymm6, %ymm6
- vmovdqa %ymm13, 0x100(%rdi)
- vmovdqa %ymm9, 0x120(%rdi)
- vmovdqa %ymm6, 0x140(%rdi)
- vmovdqa %ymm11, 0x160(%rdi)
- vmovdqa 0x20(%r8), %ymm0
- vmovdqa 0x180(%rsi), %ymm1
- vmovdqa 0x1a0(%rsi), %ymm2
- vmovdqa 0x1c0(%rsi), %ymm3
- vmovdqa 0x1e0(%rsi), %ymm4
- vpmullw %ymm0, %ymm1, %ymm9
- vpmullw %ymm0, %ymm2, %ymm10
- vpmullw %ymm0, %ymm3, %ymm11
- vpmullw %ymm0, %ymm4, %ymm12
- vmovdqa 0x180(%rdx), %ymm5
- vmovdqa 0x1a0(%rdx), %ymm6
- vpmulhw %ymm5, %ymm1, %ymm13
- vpmulhw %ymm6, %ymm1, %ymm1
- vpmulhw %ymm5, %ymm2, %ymm14
- vmovdqa 0x1c0(%rdx), %ymm7
- vmovdqa 0x1e0(%rdx), %ymm8
- vpmulhw %ymm7, %ymm3, %ymm15
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm7, %ymm4, %ymm0
- vmovdqa %ymm13, (%rsp)
- vpmullw %ymm5, %ymm9, %ymm13
- vpmullw %ymm6, %ymm9, %ymm9
- vpmullw %ymm5, %ymm10, %ymm5
- vpmullw %ymm7, %ymm11, %ymm6
- vpmullw %ymm8, %ymm11, %ymm11
- vpmullw %ymm7, %ymm12, %ymm7
- vmovdqa 0xc0(%rcx), %ymm8
- vpmulhw %ymm8, %ymm2, %ymm2
- vpmullw %ymm8, %ymm10, %ymm10
- vmovdqa 0xe0(%rcx), %ymm8
- vpmulhw %ymm8, %ymm4, %ymm4
- vpmullw %ymm8, %ymm12, %ymm12
- vmovdqa (%r8), %ymm8
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm9, %ymm9
- vpmulhw %ymm8, %ymm5, %ymm5
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm6, %ymm6
- vpmulhw %ymm8, %ymm11, %ymm11
- vpmulhw %ymm8, %ymm7, %ymm7
- vpmulhw %ymm8, %ymm12, %ymm12
- vpsubw (%rsp), %ymm13, %ymm13
- vpsubw %ymm9, %ymm1, %ymm9
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm10, %ymm2, %ymm10
- vpsubw %ymm6, %ymm15, %ymm6
- vpsubw %ymm11, %ymm3, %ymm11
- vpsubw %ymm7, %ymm0, %ymm7
- vpsubw %ymm12, %ymm4, %ymm12
- vpaddw %ymm5, %ymm9, %ymm9
- vpaddw %ymm7, %ymm11, %ymm11
- vpsubw %ymm13, %ymm10, %ymm13
- vpsubw %ymm12, %ymm6, %ymm6
- vmovdqa %ymm13, 0x180(%rdi)
- vmovdqa %ymm9, 0x1a0(%rdi)
- vmovdqa %ymm6, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
- movq %r11, %rsp
- retq
-
-#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
- */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/basemul.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/basemul.c
deleted file mode 100644
index 24f6231101..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/basemul.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [FIPS203]
- * FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
- * National Institute of Standards and Technology
- * https://csrc.nist.gov/pubs/fips/203/final
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-#include "../../../common.h"
-
-#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-
-#include "../../../verify.h"
-#include "arith_native_x86_64.h"
-#include "consts.h"
-
-static void poly_basemul_montgomery_avx2(int16_t r[MLKEM_N],
- const int16_t a[MLKEM_N],
- const int16_t b[MLKEM_N],
- const int16_t b_cache[MLKEM_N / 2])
-{
- mlk_basemul_avx2((__m256i *)r, (const __m256i *)a, (const __m256i *)b,
- (const __m256i *)b_cache, mlk_qdata.vec);
-}
-
-static void poly_add_avx2(int16_t r[MLKEM_N], const int16_t a[MLKEM_N],
- const int16_t b[MLKEM_N])
-{
- unsigned i;
- __m256i f0, f1;
-
- for (i = 0; i < MLKEM_N; i += 16)
- {
- f0 = _mm256_load_si256((const __m256i *)&a[i]);
- f1 = _mm256_load_si256((const __m256i *)&b[i]);
- f0 = _mm256_add_epi16(f0, f1);
- _mm256_store_si256((__m256i *)&r[i], f0);
- }
-}
-
-void mlk_polyvec_basemul_acc_montgomery_cached_avx2(unsigned k,
- int16_t r[MLKEM_N],
- const int16_t *a,
- const int16_t *b,
- const int16_t *b_cache)
-{
- unsigned i;
- int16_t t[MLKEM_N] MLK_ALIGN;
-
- /* Coefficient-wise bound of each basemul is 2q.
- * Since we are accumulating at most 4 times, the
- * overall bound is 8q < INT16_MAX. */
- poly_basemul_montgomery_avx2(r, &a[0], &b[0], &b_cache[0]);
- for (i = 1; i < k; i++)
- {
- poly_basemul_montgomery_avx2(t, &a[i * MLKEM_N], &b[i * MLKEM_N],
- &b_cache[i * (MLKEM_N / 2)]);
- poly_add_avx2(r, r, t);
- }
-
- /* Specification: Partially implements
- * @[FIPS203, Section 3.3, Destruction of intermediate values] */
- mlk_zeroize(t, sizeof(t));
-}
-
-#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
- */
-
-MLK_EMPTY_CU(avx2_basemul)
-
-#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
- !MLK_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_avx2.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_avx2.c
deleted file mode 100644
index c9827099d0..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_avx2.c
+++ /dev/null
@@ -1,387 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-#include "../../../common.h"
-
-#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-
-#include
-#include
-#include
-#include "arith_native_x86_64.h"
-#include "consts.h"
-
-#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
-void mlk_poly_compress_d4_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
- const __m256i *MLK_RESTRICT a)
-{
- unsigned int i;
- __m256i f0, f1, f2, f3;
- const __m256i v =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XV / 16]);
- const __m256i shift1 = _mm256_set1_epi16(1 << 9);
- const __m256i mask = _mm256_set1_epi16(15);
- const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1);
- const __m256i permdidx = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-
- for (i = 0; i < MLKEM_N / 64; i++)
- {
- f0 = _mm256_load_si256(&a[4 * i + 0]);
- f1 = _mm256_load_si256(&a[4 * i + 1]);
- f2 = _mm256_load_si256(&a[4 * i + 2]);
- f3 = _mm256_load_si256(&a[4 * i + 3]);
- f0 = _mm256_mulhi_epi16(f0, v);
- f1 = _mm256_mulhi_epi16(f1, v);
- f2 = _mm256_mulhi_epi16(f2, v);
- f3 = _mm256_mulhi_epi16(f3, v);
- f0 = _mm256_mulhrs_epi16(f0, shift1);
- f1 = _mm256_mulhrs_epi16(f1, shift1);
- f2 = _mm256_mulhrs_epi16(f2, shift1);
- f3 = _mm256_mulhrs_epi16(f3, shift1);
- f0 = _mm256_and_si256(f0, mask);
- f1 = _mm256_and_si256(f1, mask);
- f2 = _mm256_and_si256(f2, mask);
- f3 = _mm256_and_si256(f3, mask);
- f0 = _mm256_packus_epi16(f0, f1);
- f2 = _mm256_packus_epi16(f2, f3);
- f0 = _mm256_maddubs_epi16(f0, shift2);
- f2 = _mm256_maddubs_epi16(f2, shift2);
- f0 = _mm256_packus_epi16(f0, f2);
- f0 = _mm256_permutevar8x32_epi32(f0, permdidx);
- _mm256_storeu_si256((__m256i *)&r[32 * i], f0);
- }
-}
-
-void mlk_poly_decompress_d4_avx2(__m256i *MLK_RESTRICT r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
-{
- unsigned int i;
- __m128i t;
- __m256i f;
- const __m256i q =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ / 16]);
- const __m256i shufbidx =
- _mm256_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3,
- 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
- const __m256i mask = _mm256_set1_epi32(0x00F0000F);
- const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048);
-
- for (i = 0; i < MLKEM_N / 16; i++)
- {
- t = _mm_loadl_epi64((__m128i *)&a[8 * i]);
- f = _mm256_broadcastsi128_si256(t);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mullo_epi16(f, shift);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
- }
-}
-
-void mlk_poly_compress_d10_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
- const __m256i *MLK_RESTRICT a)
-{
- unsigned int i;
- __m256i f0, f1, f2;
- __m128i t0, t1;
- const __m256i v =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XV / 16]);
- const __m256i v8 = _mm256_slli_epi16(v, 3);
- const __m256i off = _mm256_set1_epi16(15);
- const __m256i shift1 = _mm256_set1_epi16(1 << 12);
- const __m256i mask = _mm256_set1_epi16(1023);
- const __m256i shift2 =
- _mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1);
- const __m256i sllvdidx = _mm256_set1_epi64x(12);
- const __m256i shufbidx =
- _mm256_set_epi8(8, 4, 3, 2, 1, 0, -1, -1, -1, -1, -1, -1, 12, 11, 10, 9,
- -1, -1, -1, -1, -1, -1, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0);
-
- for (i = 0; i < MLKEM_N / 16; i++)
- {
- f0 = _mm256_load_si256(&a[i]);
- f1 = _mm256_mullo_epi16(f0, v8);
- f2 = _mm256_add_epi16(f0, off);
- f0 = _mm256_slli_epi16(f0, 3);
- f0 = _mm256_mulhi_epi16(f0, v);
- f2 = _mm256_sub_epi16(f1, f2);
- f1 = _mm256_andnot_si256(f1, f2);
- f1 = _mm256_srli_epi16(f1, 15);
- f0 = _mm256_sub_epi16(f0, f1);
- f0 = _mm256_mulhrs_epi16(f0, shift1);
- f0 = _mm256_and_si256(f0, mask);
- f0 = _mm256_madd_epi16(f0, shift2);
- f0 = _mm256_sllv_epi32(f0, sllvdidx);
- f0 = _mm256_srli_epi64(f0, 12);
- f0 = _mm256_shuffle_epi8(f0, shufbidx);
- t0 = _mm256_castsi256_si128(f0);
- t1 = _mm256_extracti128_si256(f0, 1);
- t0 = _mm_blend_epi16(t0, t1, 0xE0);
- _mm_storeu_si128((__m128i *)&r[20 * i + 0], t0);
- memcpy(&r[20 * i + 16], &t1, 4);
- }
-}
-
-void mlk_poly_decompress_d10_avx2(
- __m256i *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
-{
- unsigned int i;
- __m256i f;
- const __m256i q = _mm256_set1_epi32((MLKEM_Q << 16) + 4 * MLKEM_Q);
- const __m256i shufbidx =
- _mm256_set_epi8(11, 10, 10, 9, 9, 8, 8, 7, 6, 5, 5, 4, 4, 3, 3, 2, 9, 8,
- 8, 7, 7, 6, 6, 5, 4, 3, 3, 2, 2, 1, 1, 0);
- const __m256i sllvdidx = _mm256_set1_epi64x(4);
- /* TODO: Explain magic values */
- /* check-magic: off */
- const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184);
- /* check-magic: on */
-
- for (i = 0; i < (MLKEM_N / 16) - 1; i++)
- {
- f = _mm256_loadu_si256((__m256i *)&a[20 * i]);
- f = _mm256_permute4x64_epi64(f, 0x94);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_sllv_epi32(f, sllvdidx);
- f = _mm256_srli_epi16(f, 1);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
- }
-
- /* Handle load in last iteration especially to avoid buffer overflow */
- memcpy(&f, &a[20 * i], 20);
- /* The rest is the same */
- f = _mm256_permute4x64_epi64(f, 0x94);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_sllv_epi32(f, sllvdidx);
- f = _mm256_srli_epi16(f, 1);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
-}
-
-#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
-
-#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
-void mlk_poly_compress_d5_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
- const __m256i *MLK_RESTRICT a)
-{
- unsigned int i;
- __m256i f0, f1;
- __m128i t0, t1;
- const __m256i v =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XV / 16]);
- const __m256i shift1 = _mm256_set1_epi16(1 << 10);
- const __m256i mask = _mm256_set1_epi16(31);
- const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);
- const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1);
- const __m256i sllvdidx = _mm256_set1_epi64x(12);
- const __m256i shufbidx =
- _mm256_set_epi8(8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0, -1, 12, 11, 10, 9,
- -1, 12, 11, 10, 9, 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0);
-
- for (i = 0; i < MLKEM_N / 32; i++)
- {
- f0 = _mm256_load_si256(&a[2 * i + 0]);
- f1 = _mm256_load_si256(&a[2 * i + 1]);
- f0 = _mm256_mulhi_epi16(f0, v);
- f1 = _mm256_mulhi_epi16(f1, v);
- f0 = _mm256_mulhrs_epi16(f0, shift1);
- f1 = _mm256_mulhrs_epi16(f1, shift1);
- f0 = _mm256_and_si256(f0, mask);
- f1 = _mm256_and_si256(f1, mask);
- f0 = _mm256_packus_epi16(f0, f1);
- f0 = _mm256_maddubs_epi16(
- f0, shift2); /* a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 */
- f0 = _mm256_madd_epi16(f0, shift3); /* a0 a1 b0 b1 a2 a3 b2 b3 */
- f0 = _mm256_sllv_epi32(f0, sllvdidx);
- f0 = _mm256_srlv_epi64(f0, sllvdidx);
- f0 = _mm256_shuffle_epi8(f0, shufbidx);
- t0 = _mm256_castsi256_si128(f0);
- t1 = _mm256_extracti128_si256(f0, 1);
- t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
- _mm_storeu_si128((__m128i *)&r[20 * i + 0], t0);
- memcpy(&r[20 * i + 16], &t1, 4);
- }
-}
-
-void mlk_poly_decompress_d5_avx2(__m256i *MLK_RESTRICT r,
- const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
-{
- unsigned int i;
- __m128i t;
- __m256i f;
- int16_t ti;
- const __m256i q =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ / 16]);
- const __m256i shufbidx =
- _mm256_set_epi8(9, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5, 4, 4, 4,
- 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0);
- /* TODO: Document those magic values */
- /* check-magic: off */
- const __m256i mask = _mm256_set_epi16(248, 1984, 62, 496, 3968, 124, 992, 31,
- 248, 1984, 62, 496, 3968, 124, 992, 31);
- const __m256i shift = _mm256_set_epi16(128, 16, 512, 64, 8, 256, 32, 1024,
- 128, 16, 512, 64, 8, 256, 32, 1024);
- /* check-magic: on */
- for (i = 0; i < MLKEM_N / 16; i++)
- {
- t = _mm_loadl_epi64((__m128i *)&a[10 * i + 0]);
- memcpy(&ti, &a[10 * i + 8], 2);
- t = _mm_insert_epi16(t, ti, 4);
- f = _mm256_broadcastsi128_si256(t);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mullo_epi16(f, shift);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
- }
-}
-
-void mlk_poly_compress_d11_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
- const __m256i *MLK_RESTRICT a)
-{
- unsigned int i;
- __m256i f0, f1, f2;
- __m128i t0, t1;
- const __m256i v =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XV / 16]);
- const __m256i v8 = _mm256_slli_epi16(v, 3);
- const __m256i off = _mm256_set1_epi16(36);
- const __m256i shift1 = _mm256_set1_epi16(1 << 13);
- const __m256i mask = _mm256_set1_epi16(2047);
- const __m256i shift2 =
- _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1);
- const __m256i sllvdidx = _mm256_set1_epi64x(10);
- const __m256i srlvqidx = _mm256_set_epi64x(30, 10, 30, 10);
- const __m256i shufbidx =
- _mm256_set_epi8(4, 3, 2, 1, 0, 0, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, -1,
- -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-
- for (i = 0; i < (MLKEM_N / 16) - 1; i++)
- {
- f0 = _mm256_load_si256(&a[i]);
- f1 = _mm256_mullo_epi16(f0, v8);
- f2 = _mm256_add_epi16(f0, off);
- f0 = _mm256_slli_epi16(f0, 3);
- f0 = _mm256_mulhi_epi16(f0, v);
- f2 = _mm256_sub_epi16(f1, f2);
- f1 = _mm256_andnot_si256(f1, f2);
- f1 = _mm256_srli_epi16(f1, 15);
- f0 = _mm256_sub_epi16(f0, f1);
- f0 = _mm256_mulhrs_epi16(f0, shift1);
- f0 = _mm256_and_si256(f0, mask);
- f0 = _mm256_madd_epi16(f0, shift2);
- f0 = _mm256_sllv_epi32(f0, sllvdidx);
- f1 = _mm256_bsrli_epi128(f0, 8);
- f0 = _mm256_srlv_epi64(f0, srlvqidx);
- f1 = _mm256_slli_epi64(f1, 34);
- f0 = _mm256_add_epi64(f0, f1);
- f0 = _mm256_shuffle_epi8(f0, shufbidx);
- t0 = _mm256_castsi256_si128(f0);
- t1 = _mm256_extracti128_si256(f0, 1);
- t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
- _mm_storeu_si128((__m128i *)&r[22 * i + 0], t0);
- _mm_storel_epi64((__m128i *)&r[22 * i + 16], t1);
- }
-
- f0 = _mm256_load_si256(&a[i]);
- f1 = _mm256_mullo_epi16(f0, v8);
- f2 = _mm256_add_epi16(f0, off);
- f0 = _mm256_slli_epi16(f0, 3);
- f0 = _mm256_mulhi_epi16(f0, v);
- f2 = _mm256_sub_epi16(f1, f2);
- f1 = _mm256_andnot_si256(f1, f2);
- f1 = _mm256_srli_epi16(f1, 15);
- f0 = _mm256_sub_epi16(f0, f1);
- f0 = _mm256_mulhrs_epi16(f0, shift1);
- f0 = _mm256_and_si256(f0, mask);
- f0 = _mm256_madd_epi16(f0, shift2);
- f0 = _mm256_sllv_epi32(f0, sllvdidx);
- f1 = _mm256_bsrli_epi128(f0, 8);
- f0 = _mm256_srlv_epi64(f0, srlvqidx);
- f1 = _mm256_slli_epi64(f1, 34);
- f0 = _mm256_add_epi64(f0, f1);
- f0 = _mm256_shuffle_epi8(f0, shufbidx);
- t0 = _mm256_castsi256_si128(f0);
- t1 = _mm256_extracti128_si256(f0, 1);
- t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
- _mm_storeu_si128((__m128i *)&r[22 * i + 0], t0);
- /* Handle store in last iteration especially to avoid overflow */
- memcpy(&r[22 * i + 16], &t1, 6);
-}
-
-void mlk_poly_decompress_d11_avx2(
- __m256i *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
-{
- unsigned int i;
- __m256i f;
- const __m256i q =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ / 16]);
- const __m256i shufbidx =
- _mm256_set_epi8(13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 5, 5, 4, 4, 3, 10,
- 9, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 2, 1, 1, 0);
- const __m256i srlvdidx = _mm256_set_epi32(0, 0, 1, 0, 0, 0, 1, 0);
- const __m256i srlvqidx = _mm256_set_epi64x(2, 0, 2, 0);
- const __m256i shift =
- _mm256_set_epi16(4, 32, 1, 8, 32, 1, 4, 32, 4, 32, 1, 8, 32, 1, 4, 32);
- /* TODO: Explain magic constant */
- /* check-magic: off */
- const __m256i mask = _mm256_set1_epi16(32752);
- /* check-magic: on */
-
- for (i = 0; i < (MLKEM_N / 16) - 1; i++)
- {
- f = _mm256_loadu_si256((__m256i *)&a[22 * i]);
- f = _mm256_permute4x64_epi64(f, 0x94);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_srlv_epi32(f, srlvdidx);
- f = _mm256_srlv_epi64(f, srlvqidx);
- f = _mm256_mullo_epi16(f, shift);
- f = _mm256_srli_epi16(f, 1);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
- }
-
- /* Handle load of last iteration especially */
- memcpy(&f, &a[22 * i], 22);
- /* The rest of the iteration is the same */
- f = _mm256_permute4x64_epi64(f, 0x94);
- f = _mm256_shuffle_epi8(f, shufbidx);
- f = _mm256_srlv_epi32(f, srlvdidx);
- f = _mm256_srlv_epi64(f, srlvqidx);
- f = _mm256_mullo_epi16(f, shift);
- f = _mm256_srli_epi16(f, 1);
- f = _mm256_and_si256(f, mask);
- f = _mm256_mulhrs_epi16(f, q);
- _mm256_store_si256(&r[i], f);
-}
-
-#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
-
-#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
- */
-
-MLK_EMPTY_CU(avx2_poly_compress)
-
-#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
- !MLK_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_consts.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_consts.c
new file mode 100644
index 0000000000..dcfa127582
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_consts.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
+ * Do not modify it directly.
+ */
+
+#include "../../../common.h"
+
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT)
+
+#include "compress_consts.h"
+
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2 || \
+ MLKEM_K == 3)
+
+MLK_ALIGN const uint8_t mlk_compress_d4_data[32] = {
+ 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0,
+ 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, /* permdidx */
+};
+
+MLK_ALIGN const uint8_t mlk_decompress_d4_data[32] = {
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
+ 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, /* shufbidx */
+};
+
+MLK_ALIGN const uint8_t mlk_compress_d10_data[32] = {
+ 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 255,
+ 255, 255, 255, 255, 255, 9, 10, 11, 12, 255, 255,
+ 255, 255, 255, 255, 0, 1, 2, 3, 4, 8, /* shufbidx */
+};
+
+MLK_ALIGN const uint8_t mlk_decompress_d10_data[32] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 6, 7, 7, 8, 8, 9,
+ 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, /* shufbidx */
+};
+
+#endif /* !MLK_CONFIG_MULTILEVEL_NO_SHARED && \
+ (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3) \
+ */
+
+#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+MLK_ALIGN const uint8_t
+ mlk_compress_d5_data[32] = {
+ 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 8,
+ 9, 10, 11, 12, 255, 9, 10, 11, 12, 255, 0,
+ 1, 2, 3, 4, 255, 255, 255, 255, 255, 8, /* shufbidx */
+};
+
+/* shufbidx[0:32], mask[32:64], shift[64:96] */
+MLK_ALIGN const uint8_t mlk_decompress_d5_data[96] = {
+ 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5,
+ 5, 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 9, 9, 9, /* shufbidx */
+ 31, 0, 224, 3, 124, 0, 128, 15, 240, 1, 62, 0, 192, 7, 248, 0, 31, 0,
+ 224, 3, 124, 0, 128, 15, 240, 1, 62, 0, 192, 7, 248, 0, /* mask */
+ 0, 4, 32, 0, 0, 1, 8, 0, 64, 0, 0, 2, 16, 0, 128, 0, 0, 4,
+ 32, 0, 0, 1, 8, 0, 64, 0, 0, 2, 16, 0, 128, 0, /* shift */
+};
+
+/* srlvqidx[0:32], shufbidx[32:64] */
+MLK_ALIGN const uint8_t mlk_compress_d11_data[64] = {
+ 10, 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 10,
+ 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, /* srlvqidx */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 5,
+ 6, 7, 8, 9, 10, 255, 255, 255, 255, 0, 0, 1, 2, 3, 4, /* shufbidx */
+};
+
+/* shufbidx[0:32], srlvdidx[32:64], srlvqidx[64:96], shift[96:128] */
+MLK_ALIGN const uint8_t mlk_decompress_d11_data[128] = {
+ 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10,
+ 3, 4, 4, 5, 5, 6, 7, 8, 8, 9, 9, 10, 11, 12, 12, 13, /* shufbidx */
+ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* srlvdidx */
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, /* srlvqidx */
+ 32, 0, 4, 0, 1, 0, 32, 0, 8, 0, 1, 0, 32, 0, 4, 0,
+ 32, 0, 4, 0, 1, 0, 32, 0, 8, 0, 1, 0, 32, 0, 4, 0, /* shift */
+};
+
+#endif /* !MLK_CONFIG_MULTILEVEL_NO_SHARED && \
+ (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
+
+#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT */
+
+MLK_EMPTY_CU(avx2_compress_consts)
+
+#endif /* !MLK_ARITH_BACKEND_X86_64_DEFAULT */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_consts.h
new file mode 100644
index 0000000000..6e13d05805
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/compress_consts.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
+ * Do not modify it directly.
+ */
+
+#ifndef MLK_NATIVE_X86_64_SRC_COMPRESS_CONSTS_H
+#define MLK_NATIVE_X86_64_SRC_COMPRESS_CONSTS_H
+
+#include "../../../common.h"
+
+#ifndef __ASSEMBLER__
+
+#define mlk_compress_d4_data MLK_NAMESPACE(compress_d4_data)
+extern const uint8_t mlk_compress_d4_data[32];
+
+#define mlk_decompress_d4_data MLK_NAMESPACE(decompress_d4_data)
+extern const uint8_t mlk_decompress_d4_data[32];
+
+#define mlk_compress_d10_data MLK_NAMESPACE(compress_d10_data)
+extern const uint8_t mlk_compress_d10_data[32];
+
+#define mlk_decompress_d10_data MLK_NAMESPACE(decompress_d10_data)
+extern const uint8_t mlk_decompress_d10_data[32];
+
+#define mlk_compress_d5_data MLK_NAMESPACE(compress_d5_data)
+extern const uint8_t mlk_compress_d5_data[32];
+
+#define mlk_decompress_d5_data MLK_NAMESPACE(decompress_d5_data)
+extern const uint8_t mlk_decompress_d5_data[96];
+
+#define mlk_compress_d11_data MLK_NAMESPACE(compress_d11_data)
+extern const uint8_t mlk_compress_d11_data[64];
+
+#define mlk_decompress_d11_data MLK_NAMESPACE(decompress_d11_data)
+extern const uint8_t mlk_decompress_d11_data[128];
+
+#endif /* !__ASSEMBLER__ */
+
+#endif /* !MLK_NATIVE_X86_64_SRC_COMPRESS_CONSTS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/consts.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/consts.c
index 204e98d459..17877423e5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/consts.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/consts.c
@@ -3,18 +3,10 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
+ * WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
+ * Do not modify it directly.
*/
#include "../../../common.h"
@@ -22,234 +14,84 @@
#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include "align.h"
#include "consts.h"
-#define MLK_AVX2_Q MLKEM_Q
-/* check-magic: -1044 == pow(2,16,MLKEM_Q) */
-#define MLK_AVX2_MONT -1044
-/* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */
-#define MLK_AVX2_QINV -3327
-/* check-magic: 20159 == round(2^26/MLKEM_Q) */
-#define MLK_AVX2_V 20159
-/* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */
-#define MLK_AVX2_FHI 1441
-/* check-magic: -10079 == signed_mod(MLK_AVX2_QINV*MLK_AVX2_FHI,2^16) */
-#define MLK_AVX2_FLO -10079
-/* check-magic: 1353 == pow(2, 32, MLKEM_Q) */
-#define MLK_AVX2_MONTSQHI 1353
-/* check-magic: 20553 == signed_mod(MLK_AVX2_QINV*MLK_AVX2_MONTSQHI,2^16) */
-#define MLK_AVX2_MONTSQLO 20553
-#define MLK_AVX2_MASK 4095
-#define MLK_AVX2_SHIFT 32
-
-const qdata_t mlk_qdata = {{
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQ 0
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
- MLK_AVX2_Q,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV 16
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
- MLK_AVX2_QINV,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XV 32
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
- MLK_AVX2_V,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFLO 48
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
- MLK_AVX2_FLO,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFHI 64
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
- MLK_AVX2_FHI,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO 80
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
- MLK_AVX2_MONTSQLO,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI 96
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
- MLK_AVX2_MONTSQHI,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK 112
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
- MLK_AVX2_MASK,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB 128
- /* TODO: Explain these numbers */
- /* check-magic: off */
- 3854,
- 3340,
- 2826,
- 2312,
- 1798,
- 1284,
- 770,
- 256,
- 3854,
- 3340,
- 2826,
- 2312,
- 1798,
- 1284,
- 770,
- 256,
-/* check-magic: on */
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD 144
- 7,
- 0,
- 6,
- 0,
- 5,
- 0,
- 4,
- 0,
- 3,
- 0,
- 2,
- 0,
- 1,
- 0,
- 0,
- 0,
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP 160
-#include "x86_64_zetas.i"
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XSHIFT 624
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
- MLK_AVX2_SHIFT,
-#define MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES 640
-#include "x86_64_mulcache_twiddles.i"
-}};
+/*
+ * Table of zeta values used in the AVX2 NTTs
+ * See autogen for details.
+ */
+MLK_ALIGN const int16_t mlk_qdata[624] = {
+ 3854, 3340, 2826, 2312, 1798, 1284, 770, 256, 3854,
+ 3340, 2826, 2312, 1798, 1284, 770, 256, 7, 0,
+ 6, 0, 5, 0, 4, 0, 3, 0, 2,
+ 0, 1, 0, 0, 0, 31498, 31498, 31498, 31498,
+ -758, -758, -758, -758, 0, 0, 0, 0, 0,
+ 0, 0, 0, 14745, 14745, 14745, 14745, 14745, 14745,
+ 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
+ 14745, -359, -359, -359, -359, -359, -359, -359, -359,
+ -359, -359, -359, -359, -359, -359, -359, -359, 13525,
+ 13525, 13525, 13525, 13525, 13525, 13525, 13525, -12402, -12402,
+ -12402, -12402, -12402, -12402, -12402, -12402, 1493, 1493, 1493,
+ 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422,
+ 1422, 1422, 1422, 1422, -20907, -20907, -20907, -20907, 27758,
+ 27758, 27758, 27758, -3799, -3799, -3799, -3799, -15690, -15690,
+ -15690, -15690, -171, -171, -171, -171, 622, 622, 622,
+ 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182,
+ -5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057, 5571,
+ 5571, -1102, -1102, 21438, 21438, -26242, -26242, 573, 573,
+ -1325, -1325, 264, 264, 383, 383, -829, -829, 1458,
+ 1458, -1602, -1602, -130, -130, -5689, -6516, 1496, 30967,
+ -23565, 20179, 20710, 25080, -12796, 26616, 16064, -12442, 9134,
+ -650, -25986, 27837, 1223, 652, -552, 1015, -1293, 1491,
+ -282, -1544, 516, -8, -320, -666, -1618, -1162, 126,
+ 1469, -335, -11477, -32227, 20494, -27738, 945, -14883, 6182,
+ 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276, -1103,
+ 555, -1251, 1550, 422, 177, -291, 1574, -246, 1159,
+ -777, -602, -1590, -872, 418, -156, 11182, 13387, -14233,
+ -21655, 13131, -4587, 23092, 5493, -32502, 30317, -18741, 12639,
+ 20100, 18525, 19529, -12619, 430, 843, 871, 105, 587,
+ -235, -460, 1653, 778, -147, 1483, 1119, 644, 349,
+ 329, -75, 787, 787, 787, 787, 787, 787, 787,
+ 787, 787, 787, 787, 787, 787, 787, 787, 787,
+ -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
+ -1517, -1517, -1517, -1517, -1517, -1517, -1517, 28191, 28191,
+ 28191, 28191, 28191, 28191, 28191, 28191, -16694, -16694, -16694,
+ -16694, -16694, -16694, -16694, -16694, 287, 287, 287, 287,
+ 287, 287, 287, 287, 202, 202, 202, 202, 202,
+ 202, 202, 202, 10690, 10690, 10690, 10690, 1358, 1358,
+ 1358, 1358, -11202, -11202, -11202, -11202, 31164, 31164, 31164,
+ 31164, 962, 962, 962, 962, -1202, -1202, -1202, -1202,
+ -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, -28073,
+ -28073, 24313, 24313, -10532, -10532, 8800, 8800, 18426, 18426,
+ 8859, 8859, 26675, 26675, -16163, -16163, -681, -681, 1017,
+ 1017, 732, 732, 608, 608, -1542, -1542, 411, 411,
+ -205, -205, -1571, -1571, 19883, -28250, -15887, -8898, -28309,
+ 9075, -30199, 18249, 13426, 14017, -29156, -12757, 16832, 4311,
+ -24155, -17915, -853, -90, -271, 830, 107, -1421, -247,
+ -951, -398, 961, -1508, -725, 448, -1065, 677, -1275,
+ -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989, 10335,
+ -7934, -22502, 10906, 31636, 28644, 23998, -17422, 817, 603,
+ 1322, -1465, -1215, 1218, -874, -1187, -1185, -1278, -1510,
+ -870, -108, 996, 958, 1522, 20297, 2146, 15355, -32384,
+ -6280, -14903, -11044, 14469, -21498, -20198, 23210, -17442, -23860,
+ -20257, 7756, 23132, 1097, 610, -1285, 384, -136, -1335,
+ 220, -1659, -1530, 794, -854, 478, -308, 991, -1460,
+ 1628, -1103, 555, -1251, 1550, 422, 177, -291, 1574,
+ -246, 1159, -777, -602, -1590, -872, 418, -156, 430,
+ 843, 871, 105, 587, -235, -460, 1653, 778, -147,
+ 1483, 1119, 644, 349, 329, -75, 817, 603, 1322,
+ -1465, -1215, 1218, -874, -1187, -1185, -1278, -1510, -870,
+ -108, 996, 958, 1522, 1097, 610, -1285, 384, -136,
+ -1335, 220, -1659, -1530, 794, -854, 478, -308, 991,
+ -1460, 1628, -335, -11477, -32227, 20494, -27738, 945, -14883,
+ 6182, 32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276,
+ 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, -32502,
+ 30317, -18741, 12639, 20100, 18525, 19529, -12619, -31183, 25435,
+ -7382, 24391, -20927, 10946, 24214, 16989, 10335, -7934, -22502,
+ 10906, 31636, 28644, 23998, -17422, 20297, 2146, 15355, -32384,
+ -6280, -14903, -11044, 14469, -21498, -20198, 23210, -17442, -23860,
+ -20257, 7756, 23132,
+};
#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
@@ -258,30 +100,3 @@ MLK_EMPTY_CU(avx2_consts)
#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
!MLK_CONFIG_MULTILEVEL_NO_SHARED) */
-
-/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
- * Don't modify by hand -- this is auto-generated by scripts/autogen. */
-#undef MLK_AVX2_Q
-#undef MLK_AVX2_MONT
-#undef MLK_AVX2_QINV
-#undef MLK_AVX2_V
-#undef MLK_AVX2_FHI
-#undef MLK_AVX2_FLO
-#undef MLK_AVX2_MONTSQHI
-#undef MLK_AVX2_MONTSQLO
-#undef MLK_AVX2_MASK
-#undef MLK_AVX2_SHIFT
-/* Some macros are kept because they are also defined in a header. */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XQ (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XV (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XFLO (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XFHI (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XSHIFT (consts.h) */
-/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES (consts.h) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/consts.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/consts.h
index 9dedfc4999..0d0c7a9993 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/consts.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/consts.h
@@ -3,43 +3,23 @@
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
+ * WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
+ * Do not modify it directly.
*/
#ifndef MLK_NATIVE_X86_64_SRC_CONSTS_H
#define MLK_NATIVE_X86_64_SRC_CONSTS_H
#include "../../../common.h"
-
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQ 0
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV 16
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XV 32
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFLO 48
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFHI 64
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO 80
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI 96
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK 112
-#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB 128
-#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD 144
-#define MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP 160
-#define MLK_AVX2_BACKEND_DATA_OFFSET_16XSHIFT 624
-#define MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES 640
+#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB 0
+#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD 16
+#define MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP 32
+#define MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES 496
#ifndef __ASSEMBLER__
-#include "align.h"
-typedef MLK_ALIGNED_INT16(768) qdata_t;
#define mlk_qdata MLK_NAMESPACE(qdata)
-extern const qdata_t mlk_qdata;
-#endif /* !__ASSEMBLER__ */
+extern const int16_t mlk_qdata[624];
+#endif
#endif /* !MLK_NATIVE_X86_64_SRC_CONSTS_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/fq.inc b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/fq.inc
deleted file mode 100644
index 647011e208..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/fq.inc
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-.macro red16 r,rs=0,x=12
-vpmulhw %ymm1,%ymm\r,%ymm\x
-.if \rs
-vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
-.else
-vpsraw $10,%ymm\x,%ymm\x
-.endif
-vpmullw %ymm0,%ymm\x,%ymm\x
-vpsubw %ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro csubq r,x=12
-vpsubw %ymm0,%ymm\r,%ymm\r
-vpsraw $15,%ymm\r,%ymm\x
-vpand %ymm0,%ymm\x,%ymm\x
-vpaddw %ymm\x,%ymm\r,%ymm\r
-.endm
-
-.macro caddq r,x=12
-vpsraw $15,%ymm\r,%ymm\x
-vpand %ymm0,%ymm\x,%ymm\x
-vpaddw %ymm\x,%ymm\r,%ymm\r
-.endm
-
-/* Montgomery multiplication between b and ah,
- * with Montgomery twist of ah in al. */
-.macro fqmulprecomp al,ah,b,x=12
-vpmullw %ymm\al,%ymm\b,%ymm\x
-vpmulhw %ymm\ah,%ymm\b,%ymm\b
-vpmulhw %ymm0,%ymm\x,%ymm\x
-vpsubw %ymm\x,%ymm\b,%ymm\b
-.endm
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/intt.S
index 088adbc766..08d0bd7eb0 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/intt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/intt.S
@@ -37,662 +37,683 @@
* dev/x86_64/src/intt.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(invntt_avx2)
MLK_ASM_FN_SYMBOL(invntt_avx2)
- vmovdqa (%rsi), %ymm0
- vmovdqa 0x60(%rsi), %ymm2
- vmovdqa 0x80(%rsi), %ymm3
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x40(%rdi), %ymm6
- vmovdqa 0x20(%rdi), %ymm5
- vmovdqa 0x60(%rdi), %ymm7
- vpmullw %ymm2, %ymm4, %ymm12
- vpmulhw %ymm3, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm4, %ymm4
- vpmullw %ymm2, %ymm6, %ymm12
- vpmulhw %ymm3, %ymm6, %ymm6
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm6, %ymm6
- vpmullw %ymm2, %ymm5, %ymm12
- vpmulhw %ymm3, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm5, %ymm5
- vpmullw %ymm2, %ymm7, %ymm12
- vpmulhw %ymm3, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm7, %ymm7
- vmovdqa 0x80(%rdi), %ymm8
- vmovdqa 0xc0(%rdi), %ymm10
- vmovdqa 0xa0(%rdi), %ymm9
- vmovdqa 0xe0(%rdi), %ymm11
- vpmullw %ymm2, %ymm8, %ymm12
- vpmulhw %ymm3, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm8, %ymm8
- vpmullw %ymm2, %ymm10, %ymm12
- vpmulhw %ymm3, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm10, %ymm10
- vpmullw %ymm2, %ymm9, %ymm12
- vpmulhw %ymm3, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vpmullw %ymm2, %ymm11, %ymm12
- vpmulhw %ymm3, %ymm11, %ymm11
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm11, %ymm11
- vpermq $0x4e, 0x4a0(%rsi), %ymm15 # ymm15 = mem[2,3,0,1]
- vpermq $0x4e, 0x460(%rsi), %ymm1 # ymm1 = mem[2,3,0,1]
- vpermq $0x4e, 0x4c0(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x480(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
- vmovdqa 0x100(%rsi), %ymm12
- vpshufb %ymm12, %ymm15, %ymm15
- vpshufb %ymm12, %ymm1, %ymm1
- vpshufb %ymm12, %ymm2, %ymm2
- vpshufb %ymm12, %ymm3, %ymm3
- vpsubw %ymm4, %ymm6, %ymm12
- vpaddw %ymm6, %ymm4, %ymm4
- vpsubw %ymm5, %ymm7, %ymm13
- vpmullw %ymm15, %ymm12, %ymm6
- vpaddw %ymm7, %ymm5, %ymm5
- vpsubw %ymm8, %ymm10, %ymm14
- vpmullw %ymm15, %ymm13, %ymm7
- vpaddw %ymm10, %ymm8, %ymm8
- vpsubw %ymm9, %ymm11, %ymm15
- vpmullw %ymm1, %ymm14, %ymm10
- vpaddw %ymm11, %ymm9, %ymm9
- vpmullw %ymm1, %ymm15, %ymm11
- vpmulhw %ymm2, %ymm12, %ymm12
- vpmulhw %ymm2, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm6, %ymm6
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm6, %ymm12, %ymm6
- vpsubw %ymm7, %ymm13, %ymm7
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vpermq $0x4e, 0x420(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x440(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
- vmovdqa 0x100(%rsi), %ymm1
- vpshufb %ymm1, %ymm2, %ymm2
- vpshufb %ymm1, %ymm3, %ymm3
- vpsubw %ymm4, %ymm8, %ymm12
- vpaddw %ymm8, %ymm4, %ymm4
- vpsubw %ymm5, %ymm9, %ymm13
- vpmullw %ymm2, %ymm12, %ymm8
- vpaddw %ymm9, %ymm5, %ymm5
- vpsubw %ymm6, %ymm10, %ymm14
- vpmullw %ymm2, %ymm13, %ymm9
- vpaddw %ymm10, %ymm6, %ymm6
- vpsubw %ymm7, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm10
- vpaddw %ymm11, %ymm7, %ymm7
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm3, %ymm12, %ymm12
- vpmulhw %ymm3, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm8, %ymm12, %ymm8
- vpsubw %ymm9, %ymm13, %ymm9
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vpslld $0x10, %ymm5, %ymm3
- vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
- vpslld $0x10, %ymm7, %ymm4
- vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
- vpslld $0x10, %ymm9, %ymm6
- vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
- vpsrld $0x10, %ymm8, %ymm8
- vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
- vpslld $0x10, %ymm11, %ymm8
- vpblendw $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7],ymm10[8],ymm8[9],ymm10[10],ymm8[11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
- vpsrld $0x10, %ymm10, %ymm10
- vpblendw $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7],ymm10[8],ymm11[9],ymm10[10],ymm11[11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
- vmovdqa 0x120(%rsi), %ymm12
- vpermd 0x3e0(%rsi), %ymm12, %ymm2
- vpermd 0x400(%rsi), %ymm12, %ymm10
- vpsubw %ymm3, %ymm5, %ymm12
- vpaddw %ymm5, %ymm3, %ymm3
- vpsubw %ymm4, %ymm7, %ymm13
- vpmullw %ymm2, %ymm12, %ymm5
- vpaddw %ymm7, %ymm4, %ymm4
- vpsubw %ymm6, %ymm9, %ymm14
- vpmullw %ymm2, %ymm13, %ymm7
- vpaddw %ymm9, %ymm6, %ymm6
- vpsubw %ymm8, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm9
- vpaddw %ymm11, %ymm8, %ymm8
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm10, %ymm12, %ymm12
- vpmulhw %ymm10, %ymm13, %ymm13
- vpmulhw %ymm10, %ymm14, %ymm14
- vpmulhw %ymm10, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm5, %ymm12, %ymm5
- vpsubw %ymm7, %ymm13, %ymm7
- vpsubw %ymm9, %ymm14, %ymm9
- vpsubw %ymm11, %ymm15, %ymm11
- vmovdqa 0x40(%rsi), %ymm1
- vpmulhw %ymm1, %ymm3, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm3, %ymm3
- vmovsldup %ymm4, %ymm10 # ymm10 = ymm4[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
- vmovsldup %ymm8, %ymm3 # ymm3 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
- vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
- vpsrlq $0x20, %ymm5, %ymm5
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
- vmovsldup %ymm11, %ymm5 # ymm5 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
- vpsrlq $0x20, %ymm9, %ymm9
- vpblendd $0xaa, %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4],ymm11[5],ymm9[6],ymm11[7]
- vpermq $0x1b, 0x3a0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
- vpermq $0x1b, 0x3c0(%rsi), %ymm9 # ymm9 = mem[3,2,1,0]
- vpsubw %ymm10, %ymm4, %ymm12
- vpaddw %ymm4, %ymm10, %ymm10
- vpsubw %ymm3, %ymm8, %ymm13
- vpmullw %ymm2, %ymm12, %ymm4
- vpaddw %ymm8, %ymm3, %ymm3
- vpsubw %ymm6, %ymm7, %ymm14
- vpmullw %ymm2, %ymm13, %ymm8
- vpaddw %ymm7, %ymm6, %ymm6
- vpsubw %ymm5, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm7
- vpaddw %ymm11, %ymm5, %ymm5
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm9, %ymm12, %ymm12
- vpmulhw %ymm9, %ymm13, %ymm13
- vpmulhw %ymm9, %ymm14, %ymm14
- vpmulhw %ymm9, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm4, %ymm12, %ymm4
- vpsubw %ymm8, %ymm13, %ymm8
- vpsubw %ymm7, %ymm14, %ymm7
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm10, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm10, %ymm10
- vpunpcklqdq %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
- vpunpckhqdq %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
- vpunpcklqdq %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
- vpunpcklqdq %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
- vpunpckhqdq %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
- vpunpcklqdq %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
- vpermq $0x4e, 0x360(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x380(%rsi), %ymm7 # ymm7 = mem[2,3,0,1]
- vpsubw %ymm9, %ymm3, %ymm12
- vpaddw %ymm3, %ymm9, %ymm9
- vpsubw %ymm10, %ymm5, %ymm13
- vpmullw %ymm2, %ymm12, %ymm3
- vpaddw %ymm5, %ymm10, %ymm10
- vpsubw %ymm6, %ymm8, %ymm14
- vpmullw %ymm2, %ymm13, %ymm5
- vpaddw %ymm8, %ymm6, %ymm6
- vpsubw %ymm4, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm8
- vpaddw %ymm11, %ymm4, %ymm4
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm7, %ymm12, %ymm12
- vpmulhw %ymm7, %ymm13, %ymm13
- vpmulhw %ymm7, %ymm14, %ymm14
- vpmulhw %ymm7, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm3, %ymm3
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm3, %ymm12, %ymm3
- vpsubw %ymm5, %ymm13, %ymm5
- vpsubw %ymm8, %ymm14, %ymm8
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm9, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vperm2i128 $0x20, %ymm10, %ymm9, %ymm7 # ymm7 = ymm9[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm4, %ymm6, %ymm9 # ymm9 = ymm6[0,1],ymm4[0,1]
- vperm2i128 $0x31, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[2,3],ymm4[2,3]
- vperm2i128 $0x20, %ymm5, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm5[0,1]
- vperm2i128 $0x31, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[2,3],ymm5[2,3]
- vperm2i128 $0x20, %ymm11, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm8, %ymm11 # ymm11 = ymm8[2,3],ymm11[2,3]
- vmovdqa 0x320(%rsi), %ymm2
- vmovdqa 0x340(%rsi), %ymm8
- vpsubw %ymm7, %ymm10, %ymm12
- vpaddw %ymm10, %ymm7, %ymm7
- vpsubw %ymm9, %ymm4, %ymm13
- vpmullw %ymm2, %ymm12, %ymm10
- vpaddw %ymm4, %ymm9, %ymm9
- vpsubw %ymm6, %ymm5, %ymm14
- vpmullw %ymm2, %ymm13, %ymm4
- vpaddw %ymm5, %ymm6, %ymm6
- vpsubw %ymm3, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm5
- vpaddw %ymm11, %ymm3, %ymm3
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm8, %ymm12, %ymm12
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm14, %ymm14
- vpmulhw %ymm8, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm10, %ymm12, %ymm10
- vpsubw %ymm4, %ymm13, %ymm4
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm7, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm7, %ymm7
- vmovdqa %ymm7, (%rdi)
- vmovdqa %ymm9, 0x20(%rdi)
- vmovdqa %ymm6, 0x40(%rdi)
- vmovdqa %ymm3, 0x60(%rdi)
- vmovdqa %ymm10, 0x80(%rdi)
- vmovdqa %ymm4, 0xa0(%rdi)
- vmovdqa %ymm5, 0xc0(%rdi)
- vmovdqa %ymm11, 0xe0(%rdi)
- vmovdqa 0x60(%rsi), %ymm2
- vmovdqa 0x80(%rsi), %ymm3
- vmovdqa 0x100(%rdi), %ymm4
- vmovdqa 0x140(%rdi), %ymm6
- vmovdqa 0x120(%rdi), %ymm5
- vmovdqa 0x160(%rdi), %ymm7
- vpmullw %ymm2, %ymm4, %ymm12
- vpmulhw %ymm3, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm4, %ymm4
- vpmullw %ymm2, %ymm6, %ymm12
- vpmulhw %ymm3, %ymm6, %ymm6
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm6, %ymm6
- vpmullw %ymm2, %ymm5, %ymm12
- vpmulhw %ymm3, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm5, %ymm5
- vpmullw %ymm2, %ymm7, %ymm12
- vpmulhw %ymm3, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm7, %ymm7
- vmovdqa 0x180(%rdi), %ymm8
- vmovdqa 0x1c0(%rdi), %ymm10
- vmovdqa 0x1a0(%rdi), %ymm9
- vmovdqa 0x1e0(%rdi), %ymm11
- vpmullw %ymm2, %ymm8, %ymm12
- vpmulhw %ymm3, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm8, %ymm8
- vpmullw %ymm2, %ymm10, %ymm12
- vpmulhw %ymm3, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm10, %ymm10
- vpmullw %ymm2, %ymm9, %ymm12
- vpmulhw %ymm3, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vpmullw %ymm2, %ymm11, %ymm12
- vpmulhw %ymm3, %ymm11, %ymm11
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm11, %ymm11
- vpermq $0x4e, 0x2e0(%rsi), %ymm15 # ymm15 = mem[2,3,0,1]
- vpermq $0x4e, 0x2a0(%rsi), %ymm1 # ymm1 = mem[2,3,0,1]
- vpermq $0x4e, 0x300(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x2c0(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
- vmovdqa 0x100(%rsi), %ymm12
- vpshufb %ymm12, %ymm15, %ymm15
- vpshufb %ymm12, %ymm1, %ymm1
- vpshufb %ymm12, %ymm2, %ymm2
- vpshufb %ymm12, %ymm3, %ymm3
- vpsubw %ymm4, %ymm6, %ymm12
- vpaddw %ymm6, %ymm4, %ymm4
- vpsubw %ymm5, %ymm7, %ymm13
- vpmullw %ymm15, %ymm12, %ymm6
- vpaddw %ymm7, %ymm5, %ymm5
- vpsubw %ymm8, %ymm10, %ymm14
- vpmullw %ymm15, %ymm13, %ymm7
- vpaddw %ymm10, %ymm8, %ymm8
- vpsubw %ymm9, %ymm11, %ymm15
- vpmullw %ymm1, %ymm14, %ymm10
- vpaddw %ymm11, %ymm9, %ymm9
- vpmullw %ymm1, %ymm15, %ymm11
- vpmulhw %ymm2, %ymm12, %ymm12
- vpmulhw %ymm2, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm6, %ymm6
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm6, %ymm12, %ymm6
- vpsubw %ymm7, %ymm13, %ymm7
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vpermq $0x4e, 0x260(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x280(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
- vmovdqa 0x100(%rsi), %ymm1
- vpshufb %ymm1, %ymm2, %ymm2
- vpshufb %ymm1, %ymm3, %ymm3
- vpsubw %ymm4, %ymm8, %ymm12
- vpaddw %ymm8, %ymm4, %ymm4
- vpsubw %ymm5, %ymm9, %ymm13
- vpmullw %ymm2, %ymm12, %ymm8
- vpaddw %ymm9, %ymm5, %ymm5
- vpsubw %ymm6, %ymm10, %ymm14
- vpmullw %ymm2, %ymm13, %ymm9
- vpaddw %ymm10, %ymm6, %ymm6
- vpsubw %ymm7, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm10
- vpaddw %ymm11, %ymm7, %ymm7
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm3, %ymm12, %ymm12
- vpmulhw %ymm3, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm8, %ymm12, %ymm8
- vpsubw %ymm9, %ymm13, %ymm9
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vpslld $0x10, %ymm5, %ymm3
- vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
- vpslld $0x10, %ymm7, %ymm4
- vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
- vpslld $0x10, %ymm9, %ymm6
- vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
- vpsrld $0x10, %ymm8, %ymm8
- vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
- vpslld $0x10, %ymm11, %ymm8
- vpblendw $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7],ymm10[8],ymm8[9],ymm10[10],ymm8[11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
- vpsrld $0x10, %ymm10, %ymm10
- vpblendw $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7],ymm10[8],ymm11[9],ymm10[10],ymm11[11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
- vmovdqa 0x120(%rsi), %ymm12
- vpermd 0x220(%rsi), %ymm12, %ymm2
- vpermd 0x240(%rsi), %ymm12, %ymm10
- vpsubw %ymm3, %ymm5, %ymm12
- vpaddw %ymm5, %ymm3, %ymm3
- vpsubw %ymm4, %ymm7, %ymm13
- vpmullw %ymm2, %ymm12, %ymm5
- vpaddw %ymm7, %ymm4, %ymm4
- vpsubw %ymm6, %ymm9, %ymm14
- vpmullw %ymm2, %ymm13, %ymm7
- vpaddw %ymm9, %ymm6, %ymm6
- vpsubw %ymm8, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm9
- vpaddw %ymm11, %ymm8, %ymm8
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm10, %ymm12, %ymm12
- vpmulhw %ymm10, %ymm13, %ymm13
- vpmulhw %ymm10, %ymm14, %ymm14
- vpmulhw %ymm10, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm5, %ymm12, %ymm5
- vpsubw %ymm7, %ymm13, %ymm7
- vpsubw %ymm9, %ymm14, %ymm9
- vpsubw %ymm11, %ymm15, %ymm11
- vmovdqa 0x40(%rsi), %ymm1
- vpmulhw %ymm1, %ymm3, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm3, %ymm3
- vmovsldup %ymm4, %ymm10 # ymm10 = ymm4[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
- vmovsldup %ymm8, %ymm3 # ymm3 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
- vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
- vpsrlq $0x20, %ymm5, %ymm5
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
- vmovsldup %ymm11, %ymm5 # ymm5 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
- vpsrlq $0x20, %ymm9, %ymm9
- vpblendd $0xaa, %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4],ymm11[5],ymm9[6],ymm11[7]
- vpermq $0x1b, 0x1e0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
- vpermq $0x1b, 0x200(%rsi), %ymm9 # ymm9 = mem[3,2,1,0]
- vpsubw %ymm10, %ymm4, %ymm12
- vpaddw %ymm4, %ymm10, %ymm10
- vpsubw %ymm3, %ymm8, %ymm13
- vpmullw %ymm2, %ymm12, %ymm4
- vpaddw %ymm8, %ymm3, %ymm3
- vpsubw %ymm6, %ymm7, %ymm14
- vpmullw %ymm2, %ymm13, %ymm8
- vpaddw %ymm7, %ymm6, %ymm6
- vpsubw %ymm5, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm7
- vpaddw %ymm11, %ymm5, %ymm5
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm9, %ymm12, %ymm12
- vpmulhw %ymm9, %ymm13, %ymm13
- vpmulhw %ymm9, %ymm14, %ymm14
- vpmulhw %ymm9, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm4, %ymm12, %ymm4
- vpsubw %ymm8, %ymm13, %ymm8
- vpsubw %ymm7, %ymm14, %ymm7
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm10, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm10, %ymm10
- vpunpcklqdq %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
- vpunpckhqdq %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
- vpunpcklqdq %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
- vpunpcklqdq %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
- vpunpckhqdq %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
- vpunpcklqdq %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
- vpermq $0x4e, 0x1a0(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
- vpermq $0x4e, 0x1c0(%rsi), %ymm7 # ymm7 = mem[2,3,0,1]
- vpsubw %ymm9, %ymm3, %ymm12
- vpaddw %ymm3, %ymm9, %ymm9
- vpsubw %ymm10, %ymm5, %ymm13
- vpmullw %ymm2, %ymm12, %ymm3
- vpaddw %ymm5, %ymm10, %ymm10
- vpsubw %ymm6, %ymm8, %ymm14
- vpmullw %ymm2, %ymm13, %ymm5
- vpaddw %ymm8, %ymm6, %ymm6
- vpsubw %ymm4, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm8
- vpaddw %ymm11, %ymm4, %ymm4
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm7, %ymm12, %ymm12
- vpmulhw %ymm7, %ymm13, %ymm13
- vpmulhw %ymm7, %ymm14, %ymm14
- vpmulhw %ymm7, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm3, %ymm3
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm3, %ymm12, %ymm3
- vpsubw %ymm5, %ymm13, %ymm5
- vpsubw %ymm8, %ymm14, %ymm8
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm9, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vperm2i128 $0x20, %ymm10, %ymm9, %ymm7 # ymm7 = ymm9[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm4, %ymm6, %ymm9 # ymm9 = ymm6[0,1],ymm4[0,1]
- vperm2i128 $0x31, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[2,3],ymm4[2,3]
- vperm2i128 $0x20, %ymm5, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm5[0,1]
- vperm2i128 $0x31, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[2,3],ymm5[2,3]
- vperm2i128 $0x20, %ymm11, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm8, %ymm11 # ymm11 = ymm8[2,3],ymm11[2,3]
- vmovdqa 0x160(%rsi), %ymm2
- vmovdqa 0x180(%rsi), %ymm8
- vpsubw %ymm7, %ymm10, %ymm12
- vpaddw %ymm10, %ymm7, %ymm7
- vpsubw %ymm9, %ymm4, %ymm13
- vpmullw %ymm2, %ymm12, %ymm10
- vpaddw %ymm4, %ymm9, %ymm9
- vpsubw %ymm6, %ymm5, %ymm14
- vpmullw %ymm2, %ymm13, %ymm4
- vpaddw %ymm5, %ymm6, %ymm6
- vpsubw %ymm3, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm5
- vpaddw %ymm11, %ymm3, %ymm3
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm8, %ymm12, %ymm12
- vpmulhw %ymm8, %ymm13, %ymm13
- vpmulhw %ymm8, %ymm14, %ymm14
- vpmulhw %ymm8, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm10, %ymm12, %ymm10
- vpsubw %ymm4, %ymm13, %ymm4
- vpsubw %ymm5, %ymm14, %ymm5
- vpsubw %ymm11, %ymm15, %ymm11
- vpmulhw %ymm1, %ymm7, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm7, %ymm7
- vmovdqa %ymm7, 0x100(%rdi)
- vmovdqa %ymm9, 0x120(%rdi)
- vmovdqa %ymm6, 0x140(%rdi)
- vmovdqa %ymm3, 0x160(%rdi)
- vmovdqa %ymm10, 0x180(%rdi)
- vmovdqa %ymm4, 0x1a0(%rdi)
- vmovdqa %ymm5, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x100(%rdi), %ymm8
- vmovdqa 0x20(%rdi), %ymm5
- vmovdqa 0x120(%rdi), %ymm9
- vpbroadcastq 0x140(%rsi), %ymm2
- vmovdqa 0x40(%rdi), %ymm6
- vmovdqa 0x140(%rdi), %ymm10
- vmovdqa 0x60(%rdi), %ymm7
- vmovdqa 0x160(%rdi), %ymm11
- vpbroadcastq 0x148(%rsi), %ymm3
- vpsubw %ymm4, %ymm8, %ymm12
- vpaddw %ymm8, %ymm4, %ymm4
- vpsubw %ymm5, %ymm9, %ymm13
- vpmullw %ymm2, %ymm12, %ymm8
- vpaddw %ymm9, %ymm5, %ymm5
- vpsubw %ymm6, %ymm10, %ymm14
- vpmullw %ymm2, %ymm13, %ymm9
- vpaddw %ymm10, %ymm6, %ymm6
- vpsubw %ymm7, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm10
- vpaddw %ymm11, %ymm7, %ymm7
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm3, %ymm12, %ymm12
- vpmulhw %ymm3, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm8, %ymm12, %ymm8
- vpsubw %ymm9, %ymm13, %ymm9
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vmovdqa %ymm4, (%rdi)
- vmovdqa %ymm5, 0x20(%rdi)
- vmovdqa %ymm6, 0x40(%rdi)
- vmovdqa %ymm7, 0x60(%rdi)
- vmovdqa %ymm8, 0x100(%rdi)
- vmovdqa %ymm9, 0x120(%rdi)
- vmovdqa %ymm10, 0x140(%rdi)
- vmovdqa %ymm11, 0x160(%rdi)
- vmovdqa 0x80(%rdi), %ymm4
- vmovdqa 0x180(%rdi), %ymm8
- vmovdqa 0xa0(%rdi), %ymm5
- vmovdqa 0x1a0(%rdi), %ymm9
- vpbroadcastq 0x140(%rsi), %ymm2
- vmovdqa 0xc0(%rdi), %ymm6
- vmovdqa 0x1c0(%rdi), %ymm10
- vmovdqa 0xe0(%rdi), %ymm7
- vmovdqa 0x1e0(%rdi), %ymm11
- vpbroadcastq 0x148(%rsi), %ymm3
- vpsubw %ymm4, %ymm8, %ymm12
- vpaddw %ymm8, %ymm4, %ymm4
- vpsubw %ymm5, %ymm9, %ymm13
- vpmullw %ymm2, %ymm12, %ymm8
- vpaddw %ymm9, %ymm5, %ymm5
- vpsubw %ymm6, %ymm10, %ymm14
- vpmullw %ymm2, %ymm13, %ymm9
- vpaddw %ymm10, %ymm6, %ymm6
- vpsubw %ymm7, %ymm11, %ymm15
- vpmullw %ymm2, %ymm14, %ymm10
- vpaddw %ymm11, %ymm7, %ymm7
- vpmullw %ymm2, %ymm15, %ymm11
- vpmulhw %ymm3, %ymm12, %ymm12
- vpmulhw %ymm3, %ymm13, %ymm13
- vpmulhw %ymm3, %ymm14, %ymm14
- vpmulhw %ymm3, %ymm15, %ymm15
- vpmulhw %ymm0, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm8, %ymm12, %ymm8
- vpsubw %ymm9, %ymm13, %ymm9
- vpsubw %ymm10, %ymm14, %ymm10
- vpsubw %ymm11, %ymm15, %ymm11
- vmovdqa %ymm4, 0x80(%rdi)
- vmovdqa %ymm5, 0xa0(%rdi)
- vmovdqa %ymm6, 0xc0(%rdi)
- vmovdqa %ymm7, 0xe0(%rdi)
- vmovdqa %ymm8, 0x180(%rdi)
- vmovdqa %ymm9, 0x1a0(%rdi)
- vmovdqa %ymm10, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0xd8a1d8a1, %eax # imm = 0xD8A1D8A1
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x5a105a1, %eax # imm = 0x5A105A1
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x60(%rdi), %ymm7
+ vpmullw %ymm2, %ymm4, %ymm12
+ vpmulhw %ymm3, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm6, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpmullw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm5, %ymm5
+ vpmullw %ymm2, %ymm7, %ymm12
+ vpmulhw %ymm3, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xc0(%rdi), %ymm10
+ vmovdqa 0xa0(%rdi), %ymm9
+ vmovdqa 0xe0(%rdi), %ymm11
+ vpmullw %ymm2, %ymm8, %ymm12
+ vpmulhw %ymm3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpmullw %ymm2, %ymm10, %ymm12
+ vpmulhw %ymm3, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm10, %ymm10
+ vpmullw %ymm2, %ymm9, %ymm12
+ vpmulhw %ymm3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpmullw %ymm2, %ymm11, %ymm12
+ vpmulhw %ymm3, %ymm11, %ymm11
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm11, %ymm11
+ vpermq $0x4e, 0x3a0(%rsi), %ymm15 # ymm15 = mem[2,3,0,1]
+ vpermq $0x4e, 0x360(%rsi), %ymm1 # ymm1 = mem[2,3,0,1]
+ vpermq $0x4e, 0x3c0(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0x380(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
+ vmovdqa (%rsi), %ymm12
+ vpshufb %ymm12, %ymm15, %ymm15
+ vpshufb %ymm12, %ymm1, %ymm1
+ vpshufb %ymm12, %ymm2, %ymm2
+ vpshufb %ymm12, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm6, %ymm12
+ vpaddw %ymm6, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm7, %ymm13
+ vpmullw %ymm15, %ymm12, %ymm6
+ vpaddw %ymm7, %ymm5, %ymm5
+ vpsubw %ymm8, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm13, %ymm7
+ vpaddw %ymm10, %ymm8, %ymm8
+ vpsubw %ymm9, %ymm11, %ymm15
+ vpmullw %ymm1, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm9, %ymm9
+ vpmullw %ymm1, %ymm15, %ymm11
+ vpmulhw %ymm2, %ymm12, %ymm12
+ vpmulhw %ymm2, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm6, %ymm12, %ymm6
+ vpsubw %ymm7, %ymm13, %ymm7
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpermq $0x4e, 0x320(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0x340(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
+ vmovdqa (%rsi), %ymm1
+ vpshufb %ymm1, %ymm2, %ymm2
+ vpshufb %ymm1, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm8, %ymm12
+ vpaddw %ymm8, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm9, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm5
+ vpsubw %ymm6, %ymm10, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm6
+ vpsubw %ymm7, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm7
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm3, %ymm12, %ymm12
+ vpmulhw %ymm3, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm8, %ymm12, %ymm8
+ vpsubw %ymm9, %ymm13, %ymm9
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpslld $0x10, %ymm5, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+ vpslld $0x10, %ymm7, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpslld $0x10, %ymm9, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vpslld $0x10, %ymm11, %ymm8
+ vpblendw $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7],ymm10[8],ymm8[9],ymm10[10],ymm8[11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
+ vpsrld $0x10, %ymm10, %ymm10
+ vpblendw $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7],ymm10[8],ymm11[9],ymm10[10],ymm11[11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
+ vmovdqa 0x20(%rsi), %ymm12
+ vpermd 0x2e0(%rsi), %ymm12, %ymm2
+ vpermd 0x300(%rsi), %ymm12, %ymm10
+ vpsubw %ymm3, %ymm5, %ymm12
+ vpaddw %ymm5, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm7, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm5
+ vpaddw %ymm7, %ymm4, %ymm4
+ vpsubw %ymm6, %ymm9, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm7
+ vpaddw %ymm9, %ymm6, %ymm6
+ vpsubw %ymm8, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm9
+ vpaddw %ymm11, %ymm8, %ymm8
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm10, %ymm12, %ymm12
+ vpmulhw %ymm10, %ymm13, %ymm13
+ vpmulhw %ymm10, %ymm14, %ymm14
+ vpmulhw %ymm10, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm5, %ymm12, %ymm5
+ vpsubw %ymm7, %ymm13, %ymm7
+ vpsubw %ymm9, %ymm14, %ymm9
+ vpsubw %ymm11, %ymm15, %ymm11
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vpmulhw %ymm1, %ymm3, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm3, %ymm3
+ vmovsldup %ymm4, %ymm10 # ymm10 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm8, %ymm3 # ymm3 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
+ vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+ vpsrlq $0x20, %ymm5, %ymm5
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
+ vmovsldup %ymm11, %ymm5 # ymm5 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
+ vpsrlq $0x20, %ymm9, %ymm9
+ vpblendd $0xaa, %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4],ymm11[5],ymm9[6],ymm11[7]
+ vpermq $0x1b, 0x2a0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
+ vpermq $0x1b, 0x2c0(%rsi), %ymm9 # ymm9 = mem[3,2,1,0]
+ vpsubw %ymm10, %ymm4, %ymm12
+ vpaddw %ymm4, %ymm10, %ymm10
+ vpsubw %ymm3, %ymm8, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm4
+ vpaddw %ymm8, %ymm3, %ymm3
+ vpsubw %ymm6, %ymm7, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm8
+ vpaddw %ymm7, %ymm6, %ymm6
+ vpsubw %ymm5, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm7
+ vpaddw %ymm11, %ymm5, %ymm5
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm9, %ymm12, %ymm12
+ vpmulhw %ymm9, %ymm13, %ymm13
+ vpmulhw %ymm9, %ymm14, %ymm14
+ vpmulhw %ymm9, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm4, %ymm12, %ymm4
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm7, %ymm14, %ymm7
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm10, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm10, %ymm10
+ vpunpcklqdq %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
+ vpunpckhqdq %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
+ vpunpcklqdq %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+ vpunpcklqdq %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
+ vpunpckhqdq %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
+ vpunpcklqdq %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
+ vpermq $0x4e, 0x260(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0x280(%rsi), %ymm7 # ymm7 = mem[2,3,0,1]
+ vpsubw %ymm9, %ymm3, %ymm12
+ vpaddw %ymm3, %ymm9, %ymm9
+ vpsubw %ymm10, %ymm5, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm3
+ vpaddw %ymm5, %ymm10, %ymm10
+ vpsubw %ymm6, %ymm8, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm5
+ vpaddw %ymm8, %ymm6, %ymm6
+ vpsubw %ymm4, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm8
+ vpaddw %ymm11, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm7, %ymm12, %ymm12
+ vpmulhw %ymm7, %ymm13, %ymm13
+ vpmulhw %ymm7, %ymm14, %ymm14
+ vpmulhw %ymm7, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm3, %ymm3
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm3, %ymm12, %ymm3
+ vpsubw %ymm5, %ymm13, %ymm5
+ vpsubw %ymm8, %ymm14, %ymm8
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm9, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vperm2i128 $0x20, %ymm10, %ymm9, %ymm7 # ymm7 = ymm9[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm4, %ymm6, %ymm9 # ymm9 = ymm6[0,1],ymm4[0,1]
+ vperm2i128 $0x31, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[2,3],ymm4[2,3]
+ vperm2i128 $0x20, %ymm5, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm5[0,1]
+ vperm2i128 $0x31, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[2,3],ymm5[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm8, %ymm11 # ymm11 = ymm8[2,3],ymm11[2,3]
+ vmovdqa 0x220(%rsi), %ymm2
+ vmovdqa 0x240(%rsi), %ymm8
+ vpsubw %ymm7, %ymm10, %ymm12
+ vpaddw %ymm10, %ymm7, %ymm7
+ vpsubw %ymm9, %ymm4, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm10
+ vpaddw %ymm4, %ymm9, %ymm9
+ vpsubw %ymm6, %ymm5, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm4
+ vpaddw %ymm5, %ymm6, %ymm6
+ vpsubw %ymm3, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm5
+ vpaddw %ymm11, %ymm3, %ymm3
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm8, %ymm12, %ymm12
+ vpmulhw %ymm8, %ymm13, %ymm13
+ vpmulhw %ymm8, %ymm14, %ymm14
+ vpmulhw %ymm8, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm10, %ymm12, %ymm10
+ vpsubw %ymm4, %ymm13, %ymm4
+ vpsubw %ymm5, %ymm14, %ymm5
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm7, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa %ymm6, 0x40(%rdi)
+ vmovdqa %ymm3, 0x60(%rdi)
+ vmovdqa %ymm10, 0x80(%rdi)
+ vmovdqa %ymm4, 0xa0(%rdi)
+ vmovdqa %ymm5, 0xc0(%rdi)
+ vmovdqa %ymm11, 0xe0(%rdi)
+ movl $0xd8a1d8a1, %eax # imm = 0xD8A1D8A1
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x5a105a1, %eax # imm = 0x5A105A1
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ vmovdqa 0x100(%rdi), %ymm4
+ vmovdqa 0x140(%rdi), %ymm6
+ vmovdqa 0x120(%rdi), %ymm5
+ vmovdqa 0x160(%rdi), %ymm7
+ vpmullw %ymm2, %ymm4, %ymm12
+ vpmulhw %ymm3, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm6, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpmullw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm5, %ymm5
+ vpmullw %ymm2, %ymm7, %ymm12
+ vpmulhw %ymm3, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vpmullw %ymm2, %ymm8, %ymm12
+ vpmulhw %ymm3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpmullw %ymm2, %ymm10, %ymm12
+ vpmulhw %ymm3, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm10, %ymm10
+ vpmullw %ymm2, %ymm9, %ymm12
+ vpmulhw %ymm3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpmullw %ymm2, %ymm11, %ymm12
+ vpmulhw %ymm3, %ymm11, %ymm11
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm11, %ymm11
+ vpermq $0x4e, 0x1e0(%rsi), %ymm15 # ymm15 = mem[2,3,0,1]
+ vpermq $0x4e, 0x1a0(%rsi), %ymm1 # ymm1 = mem[2,3,0,1]
+ vpermq $0x4e, 0x200(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0x1c0(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
+ vmovdqa (%rsi), %ymm12
+ vpshufb %ymm12, %ymm15, %ymm15
+ vpshufb %ymm12, %ymm1, %ymm1
+ vpshufb %ymm12, %ymm2, %ymm2
+ vpshufb %ymm12, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm6, %ymm12
+ vpaddw %ymm6, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm7, %ymm13
+ vpmullw %ymm15, %ymm12, %ymm6
+ vpaddw %ymm7, %ymm5, %ymm5
+ vpsubw %ymm8, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm13, %ymm7
+ vpaddw %ymm10, %ymm8, %ymm8
+ vpsubw %ymm9, %ymm11, %ymm15
+ vpmullw %ymm1, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm9, %ymm9
+ vpmullw %ymm1, %ymm15, %ymm11
+ vpmulhw %ymm2, %ymm12, %ymm12
+ vpmulhw %ymm2, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm6, %ymm12, %ymm6
+ vpsubw %ymm7, %ymm13, %ymm7
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpermq $0x4e, 0x160(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0x180(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
+ vmovdqa (%rsi), %ymm1
+ vpshufb %ymm1, %ymm2, %ymm2
+ vpshufb %ymm1, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm8, %ymm12
+ vpaddw %ymm8, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm9, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm5
+ vpsubw %ymm6, %ymm10, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm6
+ vpsubw %ymm7, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm7
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm3, %ymm12, %ymm12
+ vpmulhw %ymm3, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm8, %ymm12, %ymm8
+ vpsubw %ymm9, %ymm13, %ymm9
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpslld $0x10, %ymm5, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+ vpslld $0x10, %ymm7, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpslld $0x10, %ymm9, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vpslld $0x10, %ymm11, %ymm8
+ vpblendw $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7],ymm10[8],ymm8[9],ymm10[10],ymm8[11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
+ vpsrld $0x10, %ymm10, %ymm10
+ vpblendw $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7],ymm10[8],ymm11[9],ymm10[10],ymm11[11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
+ vmovdqa 0x20(%rsi), %ymm12
+ vpermd 0x120(%rsi), %ymm12, %ymm2
+ vpermd 0x140(%rsi), %ymm12, %ymm10
+ vpsubw %ymm3, %ymm5, %ymm12
+ vpaddw %ymm5, %ymm3, %ymm3
+ vpsubw %ymm4, %ymm7, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm5
+ vpaddw %ymm7, %ymm4, %ymm4
+ vpsubw %ymm6, %ymm9, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm7
+ vpaddw %ymm9, %ymm6, %ymm6
+ vpsubw %ymm8, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm9
+ vpaddw %ymm11, %ymm8, %ymm8
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm10, %ymm12, %ymm12
+ vpmulhw %ymm10, %ymm13, %ymm13
+ vpmulhw %ymm10, %ymm14, %ymm14
+ vpmulhw %ymm10, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm5, %ymm12, %ymm5
+ vpsubw %ymm7, %ymm13, %ymm7
+ vpsubw %ymm9, %ymm14, %ymm9
+ vpsubw %ymm11, %ymm15, %ymm11
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vpmulhw %ymm1, %ymm3, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm3, %ymm3
+ vmovsldup %ymm4, %ymm10 # ymm10 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm8, %ymm3 # ymm3 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
+ vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+ vpsrlq $0x20, %ymm5, %ymm5
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
+ vmovsldup %ymm11, %ymm5 # ymm5 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
+ vpsrlq $0x20, %ymm9, %ymm9
+ vpblendd $0xaa, %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4],ymm11[5],ymm9[6],ymm11[7]
+ vpermq $0x1b, 0xe0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
+ vpermq $0x1b, 0x100(%rsi), %ymm9 # ymm9 = mem[3,2,1,0]
+ vpsubw %ymm10, %ymm4, %ymm12
+ vpaddw %ymm4, %ymm10, %ymm10
+ vpsubw %ymm3, %ymm8, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm4
+ vpaddw %ymm8, %ymm3, %ymm3
+ vpsubw %ymm6, %ymm7, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm8
+ vpaddw %ymm7, %ymm6, %ymm6
+ vpsubw %ymm5, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm7
+ vpaddw %ymm11, %ymm5, %ymm5
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm9, %ymm12, %ymm12
+ vpmulhw %ymm9, %ymm13, %ymm13
+ vpmulhw %ymm9, %ymm14, %ymm14
+ vpmulhw %ymm9, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm4, %ymm12, %ymm4
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm7, %ymm14, %ymm7
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm10, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm10, %ymm10
+ vpunpcklqdq %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
+ vpunpckhqdq %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
+ vpunpcklqdq %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
+ vpunpcklqdq %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
+ vpunpckhqdq %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
+ vpunpcklqdq %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
+ vpermq $0x4e, 0xa0(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
+ vpermq $0x4e, 0xc0(%rsi), %ymm7 # ymm7 = mem[2,3,0,1]
+ vpsubw %ymm9, %ymm3, %ymm12
+ vpaddw %ymm3, %ymm9, %ymm9
+ vpsubw %ymm10, %ymm5, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm3
+ vpaddw %ymm5, %ymm10, %ymm10
+ vpsubw %ymm6, %ymm8, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm5
+ vpaddw %ymm8, %ymm6, %ymm6
+ vpsubw %ymm4, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm8
+ vpaddw %ymm11, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm7, %ymm12, %ymm12
+ vpmulhw %ymm7, %ymm13, %ymm13
+ vpmulhw %ymm7, %ymm14, %ymm14
+ vpmulhw %ymm7, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm3, %ymm3
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm3, %ymm12, %ymm3
+ vpsubw %ymm5, %ymm13, %ymm5
+ vpsubw %ymm8, %ymm14, %ymm8
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm9, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vperm2i128 $0x20, %ymm10, %ymm9, %ymm7 # ymm7 = ymm9[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm4, %ymm6, %ymm9 # ymm9 = ymm6[0,1],ymm4[0,1]
+ vperm2i128 $0x31, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[2,3],ymm4[2,3]
+ vperm2i128 $0x20, %ymm5, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm5[0,1]
+ vperm2i128 $0x31, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[2,3],ymm5[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm8, %ymm11 # ymm11 = ymm8[2,3],ymm11[2,3]
+ vmovdqa 0x60(%rsi), %ymm2
+ vmovdqa 0x80(%rsi), %ymm8
+ vpsubw %ymm7, %ymm10, %ymm12
+ vpaddw %ymm10, %ymm7, %ymm7
+ vpsubw %ymm9, %ymm4, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm10
+ vpaddw %ymm4, %ymm9, %ymm9
+ vpsubw %ymm6, %ymm5, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm4
+ vpaddw %ymm5, %ymm6, %ymm6
+ vpsubw %ymm3, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm5
+ vpaddw %ymm11, %ymm3, %ymm3
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm8, %ymm12, %ymm12
+ vpmulhw %ymm8, %ymm13, %ymm13
+ vpmulhw %ymm8, %ymm14, %ymm14
+ vpmulhw %ymm8, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm10, %ymm12, %ymm10
+ vpsubw %ymm4, %ymm13, %ymm4
+ vpsubw %ymm5, %ymm14, %ymm5
+ vpsubw %ymm11, %ymm15, %ymm11
+ vpmulhw %ymm1, %ymm7, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa %ymm6, 0x140(%rdi)
+ vmovdqa %ymm3, 0x160(%rdi)
+ vmovdqa %ymm10, 0x180(%rdi)
+ vmovdqa %ymm4, 0x1a0(%rdi)
+ vmovdqa %ymm5, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x120(%rdi), %ymm9
+ vpbroadcastq 0x40(%rsi), %ymm2
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x140(%rdi), %ymm10
+ vmovdqa 0x60(%rdi), %ymm7
+ vmovdqa 0x160(%rdi), %ymm11
+ vpbroadcastq 0x48(%rsi), %ymm3
+ vpsubw %ymm4, %ymm8, %ymm12
+ vpaddw %ymm8, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm9, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm5
+ vpsubw %ymm6, %ymm10, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm6
+ vpsubw %ymm7, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm7
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm3, %ymm12, %ymm12
+ vpmulhw %ymm3, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm8, %ymm12, %ymm8
+ vpsubw %ymm9, %ymm13, %ymm9
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vmovdqa %ymm4, (%rdi)
+ vmovdqa %ymm5, 0x20(%rdi)
+ vmovdqa %ymm6, 0x40(%rdi)
+ vmovdqa %ymm7, 0x60(%rdi)
+ vmovdqa %ymm8, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa %ymm10, 0x140(%rdi)
+ vmovdqa %ymm11, 0x160(%rdi)
+ vmovdqa 0x80(%rdi), %ymm4
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm5
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vpbroadcastq 0x40(%rsi), %ymm2
+ vmovdqa 0xc0(%rdi), %ymm6
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0xe0(%rdi), %ymm7
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vpbroadcastq 0x48(%rsi), %ymm3
+ vpsubw %ymm4, %ymm8, %ymm12
+ vpaddw %ymm8, %ymm4, %ymm4
+ vpsubw %ymm5, %ymm9, %ymm13
+ vpmullw %ymm2, %ymm12, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm5
+ vpsubw %ymm6, %ymm10, %ymm14
+ vpmullw %ymm2, %ymm13, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm6
+ vpsubw %ymm7, %ymm11, %ymm15
+ vpmullw %ymm2, %ymm14, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm7
+ vpmullw %ymm2, %ymm15, %ymm11
+ vpmulhw %ymm3, %ymm12, %ymm12
+ vpmulhw %ymm3, %ymm13, %ymm13
+ vpmulhw %ymm3, %ymm14, %ymm14
+ vpmulhw %ymm3, %ymm15, %ymm15
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm8, %ymm12, %ymm8
+ vpsubw %ymm9, %ymm13, %ymm9
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpsubw %ymm11, %ymm15, %ymm11
+ vmovdqa %ymm4, 0x80(%rdi)
+ vmovdqa %ymm5, 0xa0(%rdi)
+ vmovdqa %ymm6, 0xc0(%rdi)
+ vmovdqa %ymm7, 0xe0(%rdi)
+ vmovdqa %ymm8, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa %ymm10, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(invntt_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S
index c8bde382ec..ee7a12c6fe 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/mulcache_compute.S
@@ -12,70 +12,79 @@
* dev/x86_64/src/mulcache_compute.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(poly_mulcache_compute_avx2)
MLK_ASM_FN_SYMBOL(poly_mulcache_compute_avx2)
- vmovdqa (%rdx), %ymm0
- vmovdqa 0x20(%rsi), %ymm2
- vmovdqa 0x60(%rsi), %ymm3
- vmovdqa 0x500(%rdx), %ymm4
- vmovdqa 0x580(%rdx), %ymm1
- vpmullw %ymm2, %ymm1, %ymm5
- vpmullw %ymm3, %ymm1, %ymm6
- vpmulhw %ymm2, %ymm4, %ymm7
- vpmulhw %ymm3, %ymm4, %ymm8
- vpmulhw %ymm5, %ymm0, %ymm9
- vpmulhw %ymm6, %ymm0, %ymm10
- vpsubw %ymm9, %ymm7, %ymm7
- vpsubw %ymm10, %ymm8, %ymm8
- vmovdqa %ymm7, (%rdi)
- vmovdqa %ymm8, 0x20(%rdi)
- vmovdqa 0xa0(%rsi), %ymm2
- vmovdqa 0xe0(%rsi), %ymm3
- vmovdqa 0x520(%rdx), %ymm4
- vmovdqa 0x5a0(%rdx), %ymm1
- vpmullw %ymm2, %ymm1, %ymm5
- vpmullw %ymm3, %ymm1, %ymm6
- vpmulhw %ymm2, %ymm4, %ymm7
- vpmulhw %ymm3, %ymm4, %ymm8
- vpmulhw %ymm5, %ymm0, %ymm9
- vpmulhw %ymm6, %ymm0, %ymm10
- vpsubw %ymm9, %ymm7, %ymm7
- vpsubw %ymm10, %ymm8, %ymm8
- vmovdqa %ymm7, 0x40(%rdi)
- vmovdqa %ymm8, 0x60(%rdi)
- vmovdqa 0x120(%rsi), %ymm2
- vmovdqa 0x160(%rsi), %ymm3
- vmovdqa 0x540(%rdx), %ymm4
- vmovdqa 0x5c0(%rdx), %ymm1
- vpmullw %ymm2, %ymm1, %ymm5
- vpmullw %ymm3, %ymm1, %ymm6
- vpmulhw %ymm2, %ymm4, %ymm7
- vpmulhw %ymm3, %ymm4, %ymm8
- vpmulhw %ymm5, %ymm0, %ymm9
- vpmulhw %ymm6, %ymm0, %ymm10
- vpsubw %ymm9, %ymm7, %ymm7
- vpsubw %ymm10, %ymm8, %ymm8
- vmovdqa %ymm7, 0x80(%rdi)
- vmovdqa %ymm8, 0xa0(%rdi)
- vmovdqa 0x1a0(%rsi), %ymm2
- vmovdqa 0x1e0(%rsi), %ymm3
- vmovdqa 0x560(%rdx), %ymm4
- vmovdqa 0x5e0(%rdx), %ymm1
- vpmullw %ymm2, %ymm1, %ymm5
- vpmullw %ymm3, %ymm1, %ymm6
- vpmulhw %ymm2, %ymm4, %ymm7
- vpmulhw %ymm3, %ymm4, %ymm8
- vpmulhw %ymm5, %ymm0, %ymm9
- vpmulhw %ymm6, %ymm0, %ymm10
- vpsubw %ymm9, %ymm7, %ymm7
- vpsubw %ymm10, %ymm8, %ymm8
- vmovdqa %ymm7, 0xc0(%rdi)
- vmovdqa %ymm8, 0xe0(%rdi)
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vmovdqa 0x20(%rsi), %ymm2
+ vmovdqa 0x60(%rsi), %ymm3
+ vmovdqa 0x3e0(%rdx), %ymm4
+ vmovdqa 0x460(%rdx), %ymm1
+ vpmullw %ymm2, %ymm1, %ymm5
+ vpmullw %ymm3, %ymm1, %ymm6
+ vpmulhw %ymm2, %ymm4, %ymm7
+ vpmulhw %ymm3, %ymm4, %ymm8
+ vpmulhw %ymm5, %ymm0, %ymm9
+ vpmulhw %ymm6, %ymm0, %ymm10
+ vpsubw %ymm9, %ymm7, %ymm7
+ vpsubw %ymm10, %ymm8, %ymm8
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm8, 0x20(%rdi)
+ vmovdqa 0xa0(%rsi), %ymm2
+ vmovdqa 0xe0(%rsi), %ymm3
+ vmovdqa 0x400(%rdx), %ymm4
+ vmovdqa 0x480(%rdx), %ymm1
+ vpmullw %ymm2, %ymm1, %ymm5
+ vpmullw %ymm3, %ymm1, %ymm6
+ vpmulhw %ymm2, %ymm4, %ymm7
+ vpmulhw %ymm3, %ymm4, %ymm8
+ vpmulhw %ymm5, %ymm0, %ymm9
+ vpmulhw %ymm6, %ymm0, %ymm10
+ vpsubw %ymm9, %ymm7, %ymm7
+ vpsubw %ymm10, %ymm8, %ymm8
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm8, 0x60(%rdi)
+ vmovdqa 0x120(%rsi), %ymm2
+ vmovdqa 0x160(%rsi), %ymm3
+ vmovdqa 0x420(%rdx), %ymm4
+ vmovdqa 0x4a0(%rdx), %ymm1
+ vpmullw %ymm2, %ymm1, %ymm5
+ vpmullw %ymm3, %ymm1, %ymm6
+ vpmulhw %ymm2, %ymm4, %ymm7
+ vpmulhw %ymm3, %ymm4, %ymm8
+ vpmulhw %ymm5, %ymm0, %ymm9
+ vpmulhw %ymm6, %ymm0, %ymm10
+ vpsubw %ymm9, %ymm7, %ymm7
+ vpsubw %ymm10, %ymm8, %ymm8
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm8, 0xa0(%rdi)
+ vmovdqa 0x1a0(%rsi), %ymm2
+ vmovdqa 0x1e0(%rsi), %ymm3
+ vmovdqa 0x440(%rdx), %ymm4
+ vmovdqa 0x4c0(%rdx), %ymm1
+ vpmullw %ymm2, %ymm1, %ymm5
+ vpmullw %ymm3, %ymm1, %ymm6
+ vpmulhw %ymm2, %ymm4, %ymm7
+ vpmulhw %ymm3, %ymm4, %ymm8
+ vpmulhw %ymm5, %ymm0, %ymm9
+ vpmulhw %ymm6, %ymm0, %ymm10
+ vpsubw %ymm9, %ymm7, %ymm7
+ vpsubw %ymm10, %ymm8, %ymm8
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm8, 0xe0(%rdi)
retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_mulcache_compute_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntt.S
index 948f963c8a..24f075e494 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntt.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntt.S
@@ -33,598 +33,607 @@
* dev/x86_64/src/ntt.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(ntt_avx2)
MLK_ASM_FN_SYMBOL(ntt_avx2)
- vmovdqa (%rsi), %ymm0
- vpbroadcastq 0x140(%rsi), %ymm15
- vmovdqa 0x100(%rdi), %ymm8
- vmovdqa 0x120(%rdi), %ymm9
- vmovdqa 0x140(%rdi), %ymm10
- vmovdqa 0x160(%rdi), %ymm11
- vpbroadcastq 0x148(%rsi), %ymm2
- vpmullw %ymm15, %ymm8, %ymm12
- vpmullw %ymm15, %ymm9, %ymm13
- vpmullw %ymm15, %ymm10, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x20(%rdi), %ymm5
- vmovdqa 0x40(%rdi), %ymm6
- vmovdqa 0x60(%rdi), %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm8, %ymm4, %ymm3
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm9, %ymm5, %ymm4
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm10, %ymm6, %ymm5
- vpsubw %ymm10, %ymm6, %ymm10
- vpaddw %ymm11, %ymm7, %ymm6
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm8, %ymm8
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm9, %ymm9
- vpsubw %ymm14, %ymm5, %ymm5
- vpaddw %ymm14, %ymm10, %ymm10
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa %ymm3, (%rdi)
- vmovdqa %ymm4, 0x20(%rdi)
- vmovdqa %ymm5, 0x40(%rdi)
- vmovdqa %ymm6, 0x60(%rdi)
- vmovdqa %ymm8, 0x100(%rdi)
- vmovdqa %ymm9, 0x120(%rdi)
- vmovdqa %ymm10, 0x140(%rdi)
- vmovdqa %ymm11, 0x160(%rdi)
- vpbroadcastq 0x140(%rsi), %ymm15
- vmovdqa 0x180(%rdi), %ymm8
- vmovdqa 0x1a0(%rdi), %ymm9
- vmovdqa 0x1c0(%rdi), %ymm10
- vmovdqa 0x1e0(%rdi), %ymm11
- vpbroadcastq 0x148(%rsi), %ymm2
- vpmullw %ymm15, %ymm8, %ymm12
- vpmullw %ymm15, %ymm9, %ymm13
- vpmullw %ymm15, %ymm10, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovdqa 0x80(%rdi), %ymm4
- vmovdqa 0xa0(%rdi), %ymm5
- vmovdqa 0xc0(%rdi), %ymm6
- vmovdqa 0xe0(%rdi), %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm8, %ymm4, %ymm3
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm9, %ymm5, %ymm4
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm10, %ymm6, %ymm5
- vpsubw %ymm10, %ymm6, %ymm10
- vpaddw %ymm11, %ymm7, %ymm6
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm8, %ymm8
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm9, %ymm9
- vpsubw %ymm14, %ymm5, %ymm5
- vpaddw %ymm14, %ymm10, %ymm10
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa %ymm3, 0x80(%rdi)
- vmovdqa %ymm4, 0xa0(%rdi)
- vmovdqa %ymm5, 0xc0(%rdi)
- vmovdqa %ymm6, 0xe0(%rdi)
- vmovdqa %ymm8, 0x180(%rdi)
- vmovdqa %ymm9, 0x1a0(%rdi)
- vmovdqa %ymm10, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
- vmovdqa 0x160(%rsi), %ymm15
- vmovdqa 0x80(%rdi), %ymm8
- vmovdqa 0xa0(%rdi), %ymm9
- vmovdqa 0xc0(%rdi), %ymm10
- vmovdqa 0xe0(%rdi), %ymm11
- vmovdqa 0x180(%rsi), %ymm2
- vpmullw %ymm15, %ymm8, %ymm12
- vpmullw %ymm15, %ymm9, %ymm13
- vpmullw %ymm15, %ymm10, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x20(%rdi), %ymm5
- vmovdqa 0x40(%rdi), %ymm6
- vmovdqa 0x60(%rdi), %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm8, %ymm4, %ymm3
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm9, %ymm5, %ymm4
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm10, %ymm6, %ymm5
- vpsubw %ymm10, %ymm6, %ymm10
- vpaddw %ymm11, %ymm7, %ymm6
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm8, %ymm8
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm9, %ymm9
- vpsubw %ymm14, %ymm5, %ymm5
- vpaddw %ymm14, %ymm10, %ymm10
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vperm2i128 $0x20, %ymm10, %ymm5, %ymm7 # ymm7 = ymm5[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm5, %ymm10 # ymm10 = ymm5[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm11, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[2,3],ymm11[2,3]
- vmovdqa 0x1a0(%rsi), %ymm15
- vmovdqa 0x1c0(%rsi), %ymm2
- vpmullw %ymm15, %ymm7, %ymm12
- vpmullw %ymm15, %ymm10, %ymm13
- vpmullw %ymm15, %ymm5, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm11, %ymm11
- vperm2i128 $0x20, %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm9, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm7, %ymm6, %ymm4
- vpsubw %ymm7, %ymm6, %ymm7
- vpaddw %ymm10, %ymm8, %ymm6
- vpsubw %ymm10, %ymm8, %ymm10
- vpaddw %ymm5, %ymm3, %ymm8
- vpsubw %ymm5, %ymm3, %ymm5
- vpaddw %ymm11, %ymm9, %ymm3
- vpsubw %ymm11, %ymm9, %ymm11
- vpsubw %ymm12, %ymm4, %ymm4
- vpaddw %ymm12, %ymm7, %ymm7
- vpsubw %ymm13, %ymm6, %ymm6
- vpaddw %ymm13, %ymm10, %ymm10
- vpsubw %ymm14, %ymm8, %ymm8
- vpaddw %ymm14, %ymm5, %ymm5
- vpsubw %ymm15, %ymm3, %ymm3
- vpaddw %ymm15, %ymm11, %ymm11
- vpunpcklqdq %ymm5, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm5[0],ymm8[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm8, %ymm5 # ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3]
- vpunpcklqdq %ymm11, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm3, %ymm11 # ymm11 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
- vmovdqa 0x1e0(%rsi), %ymm15
- vmovdqa 0x200(%rsi), %ymm2
- vpmullw %ymm15, %ymm9, %ymm12
- vpmullw %ymm15, %ymm5, %ymm13
- vpmullw %ymm15, %ymm8, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm11, %ymm11
- vpunpcklqdq %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
- vpunpckhqdq %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
- vpunpcklqdq %ymm10, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
- vpunpckhqdq %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm9, %ymm3, %ymm6
- vpsubw %ymm9, %ymm3, %ymm9
- vpaddw %ymm5, %ymm7, %ymm3
- vpsubw %ymm5, %ymm7, %ymm5
- vpaddw %ymm8, %ymm4, %ymm7
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm11, %ymm10, %ymm4
- vpsubw %ymm11, %ymm10, %ymm11
- vpsubw %ymm12, %ymm6, %ymm6
- vpaddw %ymm12, %ymm9, %ymm9
- vpsubw %ymm13, %ymm3, %ymm3
- vpaddw %ymm13, %ymm5, %ymm5
- vpsubw %ymm14, %ymm7, %ymm7
- vpaddw %ymm14, %ymm8, %ymm8
- vpsubw %ymm15, %ymm4, %ymm4
- vpaddw %ymm15, %ymm11, %ymm11
- vmovsldup %ymm8, %ymm10 # ymm10 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm10, %ymm7, %ymm10 # ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4],ymm10[5],ymm7[6],ymm10[7]
- vpsrlq $0x20, %ymm7, %ymm7
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
- vmovsldup %ymm11, %ymm7 # ymm7 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7]
- vpsrlq $0x20, %ymm4, %ymm4
- vpblendd $0xaa, %ymm11, %ymm4, %ymm11 # ymm11 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4],ymm11[5],ymm4[6],ymm11[7]
- vmovdqa 0x220(%rsi), %ymm15
- vmovdqa 0x240(%rsi), %ymm2
- vpmullw %ymm15, %ymm10, %ymm12
- vpmullw %ymm15, %ymm8, %ymm13
- vpmullw %ymm15, %ymm7, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovsldup %ymm9, %ymm4 # ymm4 = ymm9[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7]
- vmovsldup %ymm5, %ymm6 # ymm6 = ymm5[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm10, %ymm4, %ymm3
- vpsubw %ymm10, %ymm4, %ymm10
- vpaddw %ymm8, %ymm9, %ymm4
- vpsubw %ymm8, %ymm9, %ymm8
- vpaddw %ymm7, %ymm6, %ymm9
- vpsubw %ymm7, %ymm6, %ymm7
- vpaddw %ymm11, %ymm5, %ymm6
- vpsubw %ymm11, %ymm5, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm10, %ymm10
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm8, %ymm8
- vpsubw %ymm14, %ymm9, %ymm9
- vpaddw %ymm14, %ymm7, %ymm7
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vpslld $0x10, %ymm7, %ymm5
- vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
- vpsrld $0x10, %ymm9, %ymm9
- vpblendw $0xaa, %ymm7, %ymm9, %ymm7 # ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4],ymm7[5],ymm9[6],ymm7[7],ymm9[8],ymm7[9],ymm9[10],ymm7[11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
- vpslld $0x10, %ymm11, %ymm9
- vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
- vmovdqa 0x260(%rsi), %ymm15
- vmovdqa 0x280(%rsi), %ymm2
- vpmullw %ymm15, %ymm5, %ymm12
- vpmullw %ymm15, %ymm7, %ymm13
- vpmullw %ymm15, %ymm9, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm11, %ymm11
- vpslld $0x10, %ymm10, %ymm6
- vpblendw $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7],ymm3[8],ymm6[9],ymm3[10],ymm6[11],ymm3[12],ymm6[13],ymm3[14],ymm6[15]
- vpsrld $0x10, %ymm3, %ymm3
- vpblendw $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7],ymm3[8],ymm10[9],ymm3[10],ymm10[11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
- vpslld $0x10, %ymm8, %ymm3
- vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7],ymm4[8],ymm8[9],ymm4[10],ymm8[11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm5, %ymm6, %ymm4
- vpsubw %ymm5, %ymm6, %ymm5
- vpaddw %ymm7, %ymm10, %ymm6
- vpsubw %ymm7, %ymm10, %ymm7
- vpaddw %ymm9, %ymm3, %ymm10
- vpsubw %ymm9, %ymm3, %ymm9
- vpaddw %ymm11, %ymm8, %ymm3
- vpsubw %ymm11, %ymm8, %ymm11
- vpsubw %ymm12, %ymm4, %ymm4
- vpaddw %ymm12, %ymm5, %ymm5
- vpsubw %ymm13, %ymm6, %ymm6
- vpaddw %ymm13, %ymm7, %ymm7
- vpsubw %ymm14, %ymm10, %ymm10
- vpaddw %ymm14, %ymm9, %ymm9
- vpsubw %ymm15, %ymm3, %ymm3
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa 0x2a0(%rsi), %ymm14
- vmovdqa 0x2e0(%rsi), %ymm15
- vmovdqa 0x2c0(%rsi), %ymm8
- vmovdqa 0x300(%rsi), %ymm2
- vpmullw %ymm14, %ymm10, %ymm12
- vpmullw %ymm14, %ymm3, %ymm13
- vpmullw %ymm15, %ymm9, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm11, %ymm11
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm10, %ymm4, %ymm8
- vpsubw %ymm10, %ymm4, %ymm10
- vpaddw %ymm3, %ymm6, %ymm4
- vpsubw %ymm3, %ymm6, %ymm3
- vpaddw %ymm9, %ymm5, %ymm6
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm11, %ymm7, %ymm5
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm8, %ymm8
- vpaddw %ymm12, %ymm10, %ymm10
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm3, %ymm3
- vpsubw %ymm14, %ymm6, %ymm6
- vpaddw %ymm14, %ymm9, %ymm9
- vpsubw %ymm15, %ymm5, %ymm5
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa %ymm8, (%rdi)
- vmovdqa %ymm4, 0x20(%rdi)
- vmovdqa %ymm10, 0x40(%rdi)
- vmovdqa %ymm3, 0x60(%rdi)
- vmovdqa %ymm6, 0x80(%rdi)
- vmovdqa %ymm5, 0xa0(%rdi)
- vmovdqa %ymm9, 0xc0(%rdi)
- vmovdqa %ymm11, 0xe0(%rdi)
- vmovdqa 0x320(%rsi), %ymm15
- vmovdqa 0x180(%rdi), %ymm8
- vmovdqa 0x1a0(%rdi), %ymm9
- vmovdqa 0x1c0(%rdi), %ymm10
- vmovdqa 0x1e0(%rdi), %ymm11
- vmovdqa 0x340(%rsi), %ymm2
- vpmullw %ymm15, %ymm8, %ymm12
- vpmullw %ymm15, %ymm9, %ymm13
- vpmullw %ymm15, %ymm10, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovdqa 0x100(%rdi), %ymm4
- vmovdqa 0x120(%rdi), %ymm5
- vmovdqa 0x140(%rdi), %ymm6
- vmovdqa 0x160(%rdi), %ymm7
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm8, %ymm4, %ymm3
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm9, %ymm5, %ymm4
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm10, %ymm6, %ymm5
- vpsubw %ymm10, %ymm6, %ymm10
- vpaddw %ymm11, %ymm7, %ymm6
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm8, %ymm8
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm9, %ymm9
- vpsubw %ymm14, %ymm5, %ymm5
- vpaddw %ymm14, %ymm10, %ymm10
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vperm2i128 $0x20, %ymm10, %ymm5, %ymm7 # ymm7 = ymm5[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm5, %ymm10 # ymm10 = ymm5[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm11, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[2,3],ymm11[2,3]
- vmovdqa 0x360(%rsi), %ymm15
- vmovdqa 0x380(%rsi), %ymm2
- vpmullw %ymm15, %ymm7, %ymm12
- vpmullw %ymm15, %ymm10, %ymm13
- vpmullw %ymm15, %ymm5, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm11, %ymm11
- vperm2i128 $0x20, %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm9, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm7, %ymm6, %ymm4
- vpsubw %ymm7, %ymm6, %ymm7
- vpaddw %ymm10, %ymm8, %ymm6
- vpsubw %ymm10, %ymm8, %ymm10
- vpaddw %ymm5, %ymm3, %ymm8
- vpsubw %ymm5, %ymm3, %ymm5
- vpaddw %ymm11, %ymm9, %ymm3
- vpsubw %ymm11, %ymm9, %ymm11
- vpsubw %ymm12, %ymm4, %ymm4
- vpaddw %ymm12, %ymm7, %ymm7
- vpsubw %ymm13, %ymm6, %ymm6
- vpaddw %ymm13, %ymm10, %ymm10
- vpsubw %ymm14, %ymm8, %ymm8
- vpaddw %ymm14, %ymm5, %ymm5
- vpsubw %ymm15, %ymm3, %ymm3
- vpaddw %ymm15, %ymm11, %ymm11
- vpunpcklqdq %ymm5, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm5[0],ymm8[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm8, %ymm5 # ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3]
- vpunpcklqdq %ymm11, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm3, %ymm11 # ymm11 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
- vmovdqa 0x3a0(%rsi), %ymm15
- vmovdqa 0x3c0(%rsi), %ymm2
- vpmullw %ymm15, %ymm9, %ymm12
- vpmullw %ymm15, %ymm5, %ymm13
- vpmullw %ymm15, %ymm8, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm11, %ymm11
- vpunpcklqdq %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
- vpunpckhqdq %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
- vpunpcklqdq %ymm10, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
- vpunpckhqdq %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm9, %ymm3, %ymm6
- vpsubw %ymm9, %ymm3, %ymm9
- vpaddw %ymm5, %ymm7, %ymm3
- vpsubw %ymm5, %ymm7, %ymm5
- vpaddw %ymm8, %ymm4, %ymm7
- vpsubw %ymm8, %ymm4, %ymm8
- vpaddw %ymm11, %ymm10, %ymm4
- vpsubw %ymm11, %ymm10, %ymm11
- vpsubw %ymm12, %ymm6, %ymm6
- vpaddw %ymm12, %ymm9, %ymm9
- vpsubw %ymm13, %ymm3, %ymm3
- vpaddw %ymm13, %ymm5, %ymm5
- vpsubw %ymm14, %ymm7, %ymm7
- vpaddw %ymm14, %ymm8, %ymm8
- vpsubw %ymm15, %ymm4, %ymm4
- vpaddw %ymm15, %ymm11, %ymm11
- vmovsldup %ymm8, %ymm10 # ymm10 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm10, %ymm7, %ymm10 # ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4],ymm10[5],ymm7[6],ymm10[7]
- vpsrlq $0x20, %ymm7, %ymm7
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
- vmovsldup %ymm11, %ymm7 # ymm7 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7]
- vpsrlq $0x20, %ymm4, %ymm4
- vpblendd $0xaa, %ymm11, %ymm4, %ymm11 # ymm11 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4],ymm11[5],ymm4[6],ymm11[7]
- vmovdqa 0x3e0(%rsi), %ymm15
- vmovdqa 0x400(%rsi), %ymm2
- vpmullw %ymm15, %ymm10, %ymm12
- vpmullw %ymm15, %ymm8, %ymm13
- vpmullw %ymm15, %ymm7, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm11, %ymm11
- vmovsldup %ymm9, %ymm4 # ymm4 = ymm9[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7]
- vmovsldup %ymm5, %ymm6 # ymm6 = ymm5[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm10, %ymm4, %ymm3
- vpsubw %ymm10, %ymm4, %ymm10
- vpaddw %ymm8, %ymm9, %ymm4
- vpsubw %ymm8, %ymm9, %ymm8
- vpaddw %ymm7, %ymm6, %ymm9
- vpsubw %ymm7, %ymm6, %ymm7
- vpaddw %ymm11, %ymm5, %ymm6
- vpsubw %ymm11, %ymm5, %ymm11
- vpsubw %ymm12, %ymm3, %ymm3
- vpaddw %ymm12, %ymm10, %ymm10
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm8, %ymm8
- vpsubw %ymm14, %ymm9, %ymm9
- vpaddw %ymm14, %ymm7, %ymm7
- vpsubw %ymm15, %ymm6, %ymm6
- vpaddw %ymm15, %ymm11, %ymm11
- vpslld $0x10, %ymm7, %ymm5
- vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
- vpsrld $0x10, %ymm9, %ymm9
- vpblendw $0xaa, %ymm7, %ymm9, %ymm7 # ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4],ymm7[5],ymm9[6],ymm7[7],ymm9[8],ymm7[9],ymm9[10],ymm7[11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
- vpslld $0x10, %ymm11, %ymm9
- vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
- vmovdqa 0x420(%rsi), %ymm15
- vmovdqa 0x440(%rsi), %ymm2
- vpmullw %ymm15, %ymm5, %ymm12
- vpmullw %ymm15, %ymm7, %ymm13
- vpmullw %ymm15, %ymm9, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm11, %ymm11
- vpslld $0x10, %ymm10, %ymm6
- vpblendw $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7],ymm3[8],ymm6[9],ymm3[10],ymm6[11],ymm3[12],ymm6[13],ymm3[14],ymm6[15]
- vpsrld $0x10, %ymm3, %ymm3
- vpblendw $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7],ymm3[8],ymm10[9],ymm3[10],ymm10[11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
- vpslld $0x10, %ymm8, %ymm3
- vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7],ymm4[8],ymm8[9],ymm4[10],ymm8[11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm5, %ymm6, %ymm4
- vpsubw %ymm5, %ymm6, %ymm5
- vpaddw %ymm7, %ymm10, %ymm6
- vpsubw %ymm7, %ymm10, %ymm7
- vpaddw %ymm9, %ymm3, %ymm10
- vpsubw %ymm9, %ymm3, %ymm9
- vpaddw %ymm11, %ymm8, %ymm3
- vpsubw %ymm11, %ymm8, %ymm11
- vpsubw %ymm12, %ymm4, %ymm4
- vpaddw %ymm12, %ymm5, %ymm5
- vpsubw %ymm13, %ymm6, %ymm6
- vpaddw %ymm13, %ymm7, %ymm7
- vpsubw %ymm14, %ymm10, %ymm10
- vpaddw %ymm14, %ymm9, %ymm9
- vpsubw %ymm15, %ymm3, %ymm3
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa 0x460(%rsi), %ymm14
- vmovdqa 0x4a0(%rsi), %ymm15
- vmovdqa 0x480(%rsi), %ymm8
- vmovdqa 0x4c0(%rsi), %ymm2
- vpmullw %ymm14, %ymm10, %ymm12
- vpmullw %ymm14, %ymm3, %ymm13
- vpmullw %ymm15, %ymm9, %ymm14
- vpmullw %ymm15, %ymm11, %ymm15
- vpmulhw %ymm8, %ymm10, %ymm10
- vpmulhw %ymm8, %ymm3, %ymm3
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm2, %ymm11, %ymm11
- vpmulhw %ymm0, %ymm12, %ymm12
- vpmulhw %ymm0, %ymm13, %ymm13
- vpmulhw %ymm0, %ymm14, %ymm14
- vpmulhw %ymm0, %ymm15, %ymm15
- vpaddw %ymm10, %ymm4, %ymm8
- vpsubw %ymm10, %ymm4, %ymm10
- vpaddw %ymm3, %ymm6, %ymm4
- vpsubw %ymm3, %ymm6, %ymm3
- vpaddw %ymm9, %ymm5, %ymm6
- vpsubw %ymm9, %ymm5, %ymm9
- vpaddw %ymm11, %ymm7, %ymm5
- vpsubw %ymm11, %ymm7, %ymm11
- vpsubw %ymm12, %ymm8, %ymm8
- vpaddw %ymm12, %ymm10, %ymm10
- vpsubw %ymm13, %ymm4, %ymm4
- vpaddw %ymm13, %ymm3, %ymm3
- vpsubw %ymm14, %ymm6, %ymm6
- vpaddw %ymm14, %ymm9, %ymm9
- vpsubw %ymm15, %ymm5, %ymm5
- vpaddw %ymm15, %ymm11, %ymm11
- vmovdqa %ymm8, 0x100(%rdi)
- vmovdqa %ymm4, 0x120(%rdi)
- vmovdqa %ymm10, 0x140(%rdi)
- vmovdqa %ymm3, 0x160(%rdi)
- vmovdqa %ymm6, 0x180(%rdi)
- vmovdqa %ymm5, 0x1a0(%rdi)
- vmovdqa %ymm9, 0x1c0(%rdi)
- vmovdqa %ymm11, 0x1e0(%rdi)
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vpbroadcastq 0x40(%rsi), %ymm15
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm9
+ vmovdqa 0x140(%rdi), %ymm10
+ vmovdqa 0x160(%rdi), %ymm11
+ vpbroadcastq 0x48(%rsi), %ymm2
+ vpmullw %ymm15, %ymm8, %ymm12
+ vpmullw %ymm15, %ymm9, %ymm13
+ vpmullw %ymm15, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x60(%rdi), %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm8, %ymm4, %ymm3
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm4
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm5
+ vpsubw %ymm10, %ymm6, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm6
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm9, %ymm9
+ vpsubw %ymm14, %ymm5, %ymm5
+ vpaddw %ymm14, %ymm10, %ymm10
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa %ymm3, (%rdi)
+ vmovdqa %ymm4, 0x20(%rdi)
+ vmovdqa %ymm5, 0x40(%rdi)
+ vmovdqa %ymm6, 0x60(%rdi)
+ vmovdqa %ymm8, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa %ymm10, 0x140(%rdi)
+ vmovdqa %ymm11, 0x160(%rdi)
+ vpbroadcastq 0x40(%rsi), %ymm15
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vpbroadcastq 0x48(%rsi), %ymm2
+ vpmullw %ymm15, %ymm8, %ymm12
+ vpmullw %ymm15, %ymm9, %ymm13
+ vpmullw %ymm15, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovdqa 0x80(%rdi), %ymm4
+ vmovdqa 0xa0(%rdi), %ymm5
+ vmovdqa 0xc0(%rdi), %ymm6
+ vmovdqa 0xe0(%rdi), %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm8, %ymm4, %ymm3
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm4
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm5
+ vpsubw %ymm10, %ymm6, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm6
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm9, %ymm9
+ vpsubw %ymm14, %ymm5, %ymm5
+ vpaddw %ymm14, %ymm10, %ymm10
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa %ymm3, 0x80(%rdi)
+ vmovdqa %ymm4, 0xa0(%rdi)
+ vmovdqa %ymm5, 0xc0(%rdi)
+ vmovdqa %ymm6, 0xe0(%rdi)
+ vmovdqa %ymm8, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa %ymm10, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
+ vmovdqa 0x60(%rsi), %ymm15
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm9
+ vmovdqa 0xc0(%rdi), %ymm10
+ vmovdqa 0xe0(%rdi), %ymm11
+ vmovdqa 0x80(%rsi), %ymm2
+ vpmullw %ymm15, %ymm8, %ymm12
+ vpmullw %ymm15, %ymm9, %ymm13
+ vpmullw %ymm15, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x60(%rdi), %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm8, %ymm4, %ymm3
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm4
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm5
+ vpsubw %ymm10, %ymm6, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm6
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm9, %ymm9
+ vpsubw %ymm14, %ymm5, %ymm5
+ vpaddw %ymm14, %ymm10, %ymm10
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vperm2i128 $0x20, %ymm10, %ymm5, %ymm7 # ymm7 = ymm5[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm5, %ymm10 # ymm10 = ymm5[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[2,3],ymm11[2,3]
+ vmovdqa 0xa0(%rsi), %ymm15
+ vmovdqa 0xc0(%rsi), %ymm2
+ vpmullw %ymm15, %ymm7, %ymm12
+ vpmullw %ymm15, %ymm10, %ymm13
+ vpmullw %ymm15, %ymm5, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vperm2i128 $0x20, %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm7, %ymm6, %ymm4
+ vpsubw %ymm7, %ymm6, %ymm7
+ vpaddw %ymm10, %ymm8, %ymm6
+ vpsubw %ymm10, %ymm8, %ymm10
+ vpaddw %ymm5, %ymm3, %ymm8
+ vpsubw %ymm5, %ymm3, %ymm5
+ vpaddw %ymm11, %ymm9, %ymm3
+ vpsubw %ymm11, %ymm9, %ymm11
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpaddw %ymm12, %ymm7, %ymm7
+ vpsubw %ymm13, %ymm6, %ymm6
+ vpaddw %ymm13, %ymm10, %ymm10
+ vpsubw %ymm14, %ymm8, %ymm8
+ vpaddw %ymm14, %ymm5, %ymm5
+ vpsubw %ymm15, %ymm3, %ymm3
+ vpaddw %ymm15, %ymm11, %ymm11
+ vpunpcklqdq %ymm5, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm5[0],ymm8[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm8, %ymm5 # ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3]
+ vpunpcklqdq %ymm11, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm3, %ymm11 # ymm11 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
+ vmovdqa 0xe0(%rsi), %ymm15
+ vmovdqa 0x100(%rsi), %ymm2
+ vpmullw %ymm15, %ymm9, %ymm12
+ vpmullw %ymm15, %ymm5, %ymm13
+ vpmullw %ymm15, %ymm8, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpunpcklqdq %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
+ vpunpckhqdq %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
+ vpunpcklqdq %ymm10, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm9, %ymm3, %ymm6
+ vpsubw %ymm9, %ymm3, %ymm9
+ vpaddw %ymm5, %ymm7, %ymm3
+ vpsubw %ymm5, %ymm7, %ymm5
+ vpaddw %ymm8, %ymm4, %ymm7
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm11, %ymm10, %ymm4
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpaddw %ymm12, %ymm9, %ymm9
+ vpsubw %ymm13, %ymm3, %ymm3
+ vpaddw %ymm13, %ymm5, %ymm5
+ vpsubw %ymm14, %ymm7, %ymm7
+ vpaddw %ymm14, %ymm8, %ymm8
+ vpsubw %ymm15, %ymm4, %ymm4
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovsldup %ymm8, %ymm10 # ymm10 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm10, %ymm7, %ymm10 # ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4],ymm10[5],ymm7[6],ymm10[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
+ vmovsldup %ymm11, %ymm7 # ymm7 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7]
+ vpsrlq $0x20, %ymm4, %ymm4
+ vpblendd $0xaa, %ymm11, %ymm4, %ymm11 # ymm11 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4],ymm11[5],ymm4[6],ymm11[7]
+ vmovdqa 0x120(%rsi), %ymm15
+ vmovdqa 0x140(%rsi), %ymm2
+ vpmullw %ymm15, %ymm10, %ymm12
+ vpmullw %ymm15, %ymm8, %ymm13
+ vpmullw %ymm15, %ymm7, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovsldup %ymm9, %ymm4 # ymm4 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7]
+ vmovsldup %ymm5, %ymm6 # ymm6 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm10, %ymm4, %ymm3
+ vpsubw %ymm10, %ymm4, %ymm10
+ vpaddw %ymm8, %ymm9, %ymm4
+ vpsubw %ymm8, %ymm9, %ymm8
+ vpaddw %ymm7, %ymm6, %ymm9
+ vpsubw %ymm7, %ymm6, %ymm7
+ vpaddw %ymm11, %ymm5, %ymm6
+ vpsubw %ymm11, %ymm5, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm10, %ymm10
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm14, %ymm9, %ymm9
+ vpaddw %ymm14, %ymm7, %ymm7
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vpslld $0x10, %ymm7, %ymm5
+ vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
+ vpsrld $0x10, %ymm9, %ymm9
+ vpblendw $0xaa, %ymm7, %ymm9, %ymm7 # ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4],ymm7[5],ymm9[6],ymm7[7],ymm9[8],ymm7[9],ymm9[10],ymm7[11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
+ vpslld $0x10, %ymm11, %ymm9
+ vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+ vmovdqa 0x160(%rsi), %ymm15
+ vmovdqa 0x180(%rsi), %ymm2
+ vpmullw %ymm15, %ymm5, %ymm12
+ vpmullw %ymm15, %ymm7, %ymm13
+ vpmullw %ymm15, %ymm9, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpslld $0x10, %ymm10, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7],ymm3[8],ymm6[9],ymm3[10],ymm6[11],ymm3[12],ymm6[13],ymm3[14],ymm6[15]
+ vpsrld $0x10, %ymm3, %ymm3
+ vpblendw $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7],ymm3[8],ymm10[9],ymm3[10],ymm10[11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
+ vpslld $0x10, %ymm8, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7],ymm4[8],ymm8[9],ymm4[10],ymm8[11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm5, %ymm6, %ymm4
+ vpsubw %ymm5, %ymm6, %ymm5
+ vpaddw %ymm7, %ymm10, %ymm6
+ vpsubw %ymm7, %ymm10, %ymm7
+ vpaddw %ymm9, %ymm3, %ymm10
+ vpsubw %ymm9, %ymm3, %ymm9
+ vpaddw %ymm11, %ymm8, %ymm3
+ vpsubw %ymm11, %ymm8, %ymm11
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpaddw %ymm12, %ymm5, %ymm5
+ vpsubw %ymm13, %ymm6, %ymm6
+ vpaddw %ymm13, %ymm7, %ymm7
+ vpsubw %ymm14, %ymm10, %ymm10
+ vpaddw %ymm14, %ymm9, %ymm9
+ vpsubw %ymm15, %ymm3, %ymm3
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa 0x1a0(%rsi), %ymm14
+ vmovdqa 0x1e0(%rsi), %ymm15
+ vmovdqa 0x1c0(%rsi), %ymm8
+ vmovdqa 0x200(%rsi), %ymm2
+ vpmullw %ymm14, %ymm10, %ymm12
+ vpmullw %ymm14, %ymm3, %ymm13
+ vpmullw %ymm15, %ymm9, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm8, %ymm10, %ymm10
+ vpmulhw %ymm8, %ymm3, %ymm3
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm10, %ymm4, %ymm8
+ vpsubw %ymm10, %ymm4, %ymm10
+ vpaddw %ymm3, %ymm6, %ymm4
+ vpsubw %ymm3, %ymm6, %ymm3
+ vpaddw %ymm9, %ymm5, %ymm6
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm11, %ymm7, %ymm5
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpaddw %ymm12, %ymm10, %ymm10
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm3, %ymm3
+ vpsubw %ymm14, %ymm6, %ymm6
+ vpaddw %ymm14, %ymm9, %ymm9
+ vpsubw %ymm15, %ymm5, %ymm5
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa %ymm8, (%rdi)
+ vmovdqa %ymm4, 0x20(%rdi)
+ vmovdqa %ymm10, 0x40(%rdi)
+ vmovdqa %ymm3, 0x60(%rdi)
+ vmovdqa %ymm6, 0x80(%rdi)
+ vmovdqa %ymm5, 0xa0(%rdi)
+ vmovdqa %ymm9, 0xc0(%rdi)
+ vmovdqa %ymm11, 0xe0(%rdi)
+ vmovdqa 0x220(%rsi), %ymm15
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vmovdqa 0x240(%rsi), %ymm2
+ vpmullw %ymm15, %ymm8, %ymm12
+ vpmullw %ymm15, %ymm9, %ymm13
+ vpmullw %ymm15, %ymm10, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovdqa 0x100(%rdi), %ymm4
+ vmovdqa 0x120(%rdi), %ymm5
+ vmovdqa 0x140(%rdi), %ymm6
+ vmovdqa 0x160(%rdi), %ymm7
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm8, %ymm4, %ymm3
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm9, %ymm5, %ymm4
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm10, %ymm6, %ymm5
+ vpsubw %ymm10, %ymm6, %ymm10
+ vpaddw %ymm11, %ymm7, %ymm6
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm9, %ymm9
+ vpsubw %ymm14, %ymm5, %ymm5
+ vpaddw %ymm14, %ymm10, %ymm10
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vperm2i128 $0x20, %ymm10, %ymm5, %ymm7 # ymm7 = ymm5[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm5, %ymm10 # ymm10 = ymm5[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[2,3],ymm11[2,3]
+ vmovdqa 0x260(%rsi), %ymm15
+ vmovdqa 0x280(%rsi), %ymm2
+ vpmullw %ymm15, %ymm7, %ymm12
+ vpmullw %ymm15, %ymm10, %ymm13
+ vpmullw %ymm15, %ymm5, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vperm2i128 $0x20, %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm7, %ymm6, %ymm4
+ vpsubw %ymm7, %ymm6, %ymm7
+ vpaddw %ymm10, %ymm8, %ymm6
+ vpsubw %ymm10, %ymm8, %ymm10
+ vpaddw %ymm5, %ymm3, %ymm8
+ vpsubw %ymm5, %ymm3, %ymm5
+ vpaddw %ymm11, %ymm9, %ymm3
+ vpsubw %ymm11, %ymm9, %ymm11
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpaddw %ymm12, %ymm7, %ymm7
+ vpsubw %ymm13, %ymm6, %ymm6
+ vpaddw %ymm13, %ymm10, %ymm10
+ vpsubw %ymm14, %ymm8, %ymm8
+ vpaddw %ymm14, %ymm5, %ymm5
+ vpsubw %ymm15, %ymm3, %ymm3
+ vpaddw %ymm15, %ymm11, %ymm11
+ vpunpcklqdq %ymm5, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm5[0],ymm8[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm8, %ymm5 # ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3]
+ vpunpcklqdq %ymm11, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm3, %ymm11 # ymm11 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
+ vmovdqa 0x2a0(%rsi), %ymm15
+ vmovdqa 0x2c0(%rsi), %ymm2
+ vpmullw %ymm15, %ymm9, %ymm12
+ vpmullw %ymm15, %ymm5, %ymm13
+ vpmullw %ymm15, %ymm8, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpunpcklqdq %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
+ vpunpckhqdq %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
+ vpunpcklqdq %ymm10, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm9, %ymm3, %ymm6
+ vpsubw %ymm9, %ymm3, %ymm9
+ vpaddw %ymm5, %ymm7, %ymm3
+ vpsubw %ymm5, %ymm7, %ymm5
+ vpaddw %ymm8, %ymm4, %ymm7
+ vpsubw %ymm8, %ymm4, %ymm8
+ vpaddw %ymm11, %ymm10, %ymm4
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpaddw %ymm12, %ymm9, %ymm9
+ vpsubw %ymm13, %ymm3, %ymm3
+ vpaddw %ymm13, %ymm5, %ymm5
+ vpsubw %ymm14, %ymm7, %ymm7
+ vpaddw %ymm14, %ymm8, %ymm8
+ vpsubw %ymm15, %ymm4, %ymm4
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovsldup %ymm8, %ymm10 # ymm10 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm10, %ymm7, %ymm10 # ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4],ymm10[5],ymm7[6],ymm10[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
+ vmovsldup %ymm11, %ymm7 # ymm7 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7]
+ vpsrlq $0x20, %ymm4, %ymm4
+ vpblendd $0xaa, %ymm11, %ymm4, %ymm11 # ymm11 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4],ymm11[5],ymm4[6],ymm11[7]
+ vmovdqa 0x2e0(%rsi), %ymm15
+ vmovdqa 0x300(%rsi), %ymm2
+ vpmullw %ymm15, %ymm10, %ymm12
+ vpmullw %ymm15, %ymm8, %ymm13
+ vpmullw %ymm15, %ymm7, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vmovsldup %ymm9, %ymm4 # ymm4 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7]
+ vmovsldup %ymm5, %ymm6 # ymm6 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm10, %ymm4, %ymm3
+ vpsubw %ymm10, %ymm4, %ymm10
+ vpaddw %ymm8, %ymm9, %ymm4
+ vpsubw %ymm8, %ymm9, %ymm8
+ vpaddw %ymm7, %ymm6, %ymm9
+ vpsubw %ymm7, %ymm6, %ymm7
+ vpaddw %ymm11, %ymm5, %ymm6
+ vpsubw %ymm11, %ymm5, %ymm11
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpaddw %ymm12, %ymm10, %ymm10
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm14, %ymm9, %ymm9
+ vpaddw %ymm14, %ymm7, %ymm7
+ vpsubw %ymm15, %ymm6, %ymm6
+ vpaddw %ymm15, %ymm11, %ymm11
+ vpslld $0x10, %ymm7, %ymm5
+ vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
+ vpsrld $0x10, %ymm9, %ymm9
+ vpblendw $0xaa, %ymm7, %ymm9, %ymm7 # ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4],ymm7[5],ymm9[6],ymm7[7],ymm9[8],ymm7[9],ymm9[10],ymm7[11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
+ vpslld $0x10, %ymm11, %ymm9
+ vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+ vmovdqa 0x320(%rsi), %ymm15
+ vmovdqa 0x340(%rsi), %ymm2
+ vpmullw %ymm15, %ymm5, %ymm12
+ vpmullw %ymm15, %ymm7, %ymm13
+ vpmullw %ymm15, %ymm9, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpslld $0x10, %ymm10, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7],ymm3[8],ymm6[9],ymm3[10],ymm6[11],ymm3[12],ymm6[13],ymm3[14],ymm6[15]
+ vpsrld $0x10, %ymm3, %ymm3
+ vpblendw $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7],ymm3[8],ymm10[9],ymm3[10],ymm10[11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
+ vpslld $0x10, %ymm8, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7],ymm4[8],ymm8[9],ymm4[10],ymm8[11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm5, %ymm6, %ymm4
+ vpsubw %ymm5, %ymm6, %ymm5
+ vpaddw %ymm7, %ymm10, %ymm6
+ vpsubw %ymm7, %ymm10, %ymm7
+ vpaddw %ymm9, %ymm3, %ymm10
+ vpsubw %ymm9, %ymm3, %ymm9
+ vpaddw %ymm11, %ymm8, %ymm3
+ vpsubw %ymm11, %ymm8, %ymm11
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpaddw %ymm12, %ymm5, %ymm5
+ vpsubw %ymm13, %ymm6, %ymm6
+ vpaddw %ymm13, %ymm7, %ymm7
+ vpsubw %ymm14, %ymm10, %ymm10
+ vpaddw %ymm14, %ymm9, %ymm9
+ vpsubw %ymm15, %ymm3, %ymm3
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa 0x360(%rsi), %ymm14
+ vmovdqa 0x3a0(%rsi), %ymm15
+ vmovdqa 0x380(%rsi), %ymm8
+ vmovdqa 0x3c0(%rsi), %ymm2
+ vpmullw %ymm14, %ymm10, %ymm12
+ vpmullw %ymm14, %ymm3, %ymm13
+ vpmullw %ymm15, %ymm9, %ymm14
+ vpmullw %ymm15, %ymm11, %ymm15
+ vpmulhw %ymm8, %ymm10, %ymm10
+ vpmulhw %ymm8, %ymm3, %ymm3
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm2, %ymm11, %ymm11
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpaddw %ymm10, %ymm4, %ymm8
+ vpsubw %ymm10, %ymm4, %ymm10
+ vpaddw %ymm3, %ymm6, %ymm4
+ vpsubw %ymm3, %ymm6, %ymm3
+ vpaddw %ymm9, %ymm5, %ymm6
+ vpsubw %ymm9, %ymm5, %ymm9
+ vpaddw %ymm11, %ymm7, %ymm5
+ vpsubw %ymm11, %ymm7, %ymm11
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpaddw %ymm12, %ymm10, %ymm10
+ vpsubw %ymm13, %ymm4, %ymm4
+ vpaddw %ymm13, %ymm3, %ymm3
+ vpsubw %ymm14, %ymm6, %ymm6
+ vpaddw %ymm14, %ymm9, %ymm9
+ vpsubw %ymm15, %ymm5, %ymm5
+ vpaddw %ymm15, %ymm11, %ymm11
+ vmovdqa %ymm8, 0x100(%rdi)
+ vmovdqa %ymm4, 0x120(%rdi)
+ vmovdqa %ymm10, 0x140(%rdi)
+ vmovdqa %ymm3, 0x160(%rdi)
+ vmovdqa %ymm6, 0x180(%rdi)
+ vmovdqa %ymm5, 0x1a0(%rdi)
+ vmovdqa %ymm9, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(ntt_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S
index c4a174fa64..5ef95954ea 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttfrombytes.S
@@ -27,93 +27,167 @@
* dev/x86_64/src/nttfrombytes.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(nttfrombytes_avx2)
MLK_ASM_FN_SYMBOL(nttfrombytes_avx2)
- vmovdqa 0xe0(%rdx), %ymm0
- callq nttfrombytes_avx2_core
- addq $0x100, %rdi # imm = 0x100
- addq $0xc0, %rsi
- callq nttfrombytes_avx2_core
+ .cfi_startproc
+ movl $0xfff0fff, %eax # imm = 0xFFF0FFF
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vmovdqu (%rsi), %ymm4
+ vmovdqu 0x20(%rsi), %ymm5
+ vmovdqu 0x40(%rsi), %ymm6
+ vmovdqu 0x60(%rsi), %ymm7
+ vmovdqu 0x80(%rsi), %ymm8
+ vmovdqu 0xa0(%rsi), %ymm9
+ vperm2i128 $0x20, %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm7[0,1]
+ vperm2i128 $0x31, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[2,3],ymm7[2,3]
+ vperm2i128 $0x20, %ymm8, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[2,3],ymm9[2,3]
+ vpunpcklqdq %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm8[0],ymm3[2],ymm8[2]
+ vpunpckhqdq %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[1],ymm8[1],ymm3[3],ymm8[3]
+ vpunpcklqdq %ymm5, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm7, %ymm5 # ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3]
+ vpunpcklqdq %ymm9, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm9[0],ymm4[2],ymm9[2]
+ vpunpckhqdq %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[1],ymm9[1],ymm4[3],ymm9[3]
+ vmovsldup %ymm5, %ymm4 # ymm4 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+ vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
+ vpsrlq $0x20, %ymm8, %ymm8
+ vpblendd $0xaa, %ymm7, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
+ vmovsldup %ymm9, %ymm8 # ymm8 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm9, %ymm3, %ymm9 # ymm9 = ymm3[0],ymm9[1],ymm3[2],ymm9[3],ymm3[4],ymm9[5],ymm3[6],ymm9[7]
+ vpslld $0x10, %ymm7, %ymm10
+ vpblendw $0xaa, %ymm10, %ymm4, %ymm10 # ymm10 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4],ymm10[5],ymm4[6],ymm10[7],ymm4[8],ymm10[9],ymm4[10],ymm10[11],ymm4[12],ymm10[13],ymm4[14],ymm10[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7],ymm4[8],ymm7[9],ymm4[10],ymm7[11],ymm4[12],ymm7[13],ymm4[14],ymm7[15]
+ vpslld $0x10, %ymm8, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+ vpsrld $0x10, %ymm5, %ymm5
+ vpblendw $0xaa, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7],ymm5[8],ymm8[9],ymm5[10],ymm8[11],ymm5[12],ymm8[13],ymm5[14],ymm8[15]
+ vpslld $0x10, %ymm9, %ymm5
+ vpblendw $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
+ vpsrlw $0xc, %ymm10, %ymm11
+ vpsllw $0x4, %ymm7, %ymm12
+ vpor %ymm11, %ymm12, %ymm11
+ vpand %ymm0, %ymm10, %ymm10
+ vpand %ymm0, %ymm11, %ymm11
+ vpsrlw $0x8, %ymm7, %ymm12
+ vpsllw $0x8, %ymm4, %ymm13
+ vpor %ymm12, %ymm13, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpsrlw $0x4, %ymm4, %ymm13
+ vpand %ymm0, %ymm13, %ymm13
+ vpsrlw $0xc, %ymm8, %ymm14
+ vpsllw $0x4, %ymm5, %ymm15
+ vpor %ymm14, %ymm15, %ymm14
+ vpand %ymm0, %ymm8, %ymm8
+ vpand %ymm0, %ymm14, %ymm14
+ vpsrlw $0x8, %ymm5, %ymm15
+ vpsllw $0x8, %ymm9, %ymm1
+ vpor %ymm15, %ymm1, %ymm15
+ vpand %ymm0, %ymm15, %ymm15
+ vpsrlw $0x4, %ymm9, %ymm1
+ vpand %ymm0, %ymm1, %ymm1
+ vmovdqa %ymm10, (%rdi)
+ vmovdqa %ymm11, 0x20(%rdi)
+ vmovdqa %ymm12, 0x40(%rdi)
+ vmovdqa %ymm13, 0x60(%rdi)
+ vmovdqa %ymm8, 0x80(%rdi)
+ vmovdqa %ymm14, 0xa0(%rdi)
+ vmovdqa %ymm15, 0xc0(%rdi)
+ vmovdqa %ymm1, 0xe0(%rdi)
+ vmovdqu 0xc0(%rsi), %ymm4
+ vmovdqu 0xe0(%rsi), %ymm5
+ vmovdqu 0x100(%rsi), %ymm6
+ vmovdqu 0x120(%rsi), %ymm7
+ vmovdqu 0x140(%rsi), %ymm8
+ vmovdqu 0x160(%rsi), %ymm9
+ vperm2i128 $0x20, %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm7[0,1]
+ vperm2i128 $0x31, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[2,3],ymm7[2,3]
+ vperm2i128 $0x20, %ymm8, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[2,3],ymm9[2,3]
+ vpunpcklqdq %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm8[0],ymm3[2],ymm8[2]
+ vpunpckhqdq %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[1],ymm8[1],ymm3[3],ymm8[3]
+ vpunpcklqdq %ymm5, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm7, %ymm5 # ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3]
+ vpunpcklqdq %ymm9, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm9[0],ymm4[2],ymm9[2]
+ vpunpckhqdq %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[1],ymm9[1],ymm4[3],ymm9[3]
+ vmovsldup %ymm5, %ymm4 # ymm4 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+ vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
+ vpsrlq $0x20, %ymm8, %ymm8
+ vpblendd $0xaa, %ymm7, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
+ vmovsldup %ymm9, %ymm8 # ymm8 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm9, %ymm3, %ymm9 # ymm9 = ymm3[0],ymm9[1],ymm3[2],ymm9[3],ymm3[4],ymm9[5],ymm3[6],ymm9[7]
+ vpslld $0x10, %ymm7, %ymm10
+ vpblendw $0xaa, %ymm10, %ymm4, %ymm10 # ymm10 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4],ymm10[5],ymm4[6],ymm10[7],ymm4[8],ymm10[9],ymm4[10],ymm10[11],ymm4[12],ymm10[13],ymm4[14],ymm10[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7],ymm4[8],ymm7[9],ymm4[10],ymm7[11],ymm4[12],ymm7[13],ymm4[14],ymm7[15]
+ vpslld $0x10, %ymm8, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+ vpsrld $0x10, %ymm5, %ymm5
+ vpblendw $0xaa, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7],ymm5[8],ymm8[9],ymm5[10],ymm8[11],ymm5[12],ymm8[13],ymm5[14],ymm8[15]
+ vpslld $0x10, %ymm9, %ymm5
+ vpblendw $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
+ vpsrlw $0xc, %ymm10, %ymm11
+ vpsllw $0x4, %ymm7, %ymm12
+ vpor %ymm11, %ymm12, %ymm11
+ vpand %ymm0, %ymm10, %ymm10
+ vpand %ymm0, %ymm11, %ymm11
+ vpsrlw $0x8, %ymm7, %ymm12
+ vpsllw $0x8, %ymm4, %ymm13
+ vpor %ymm12, %ymm13, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpsrlw $0x4, %ymm4, %ymm13
+ vpand %ymm0, %ymm13, %ymm13
+ vpsrlw $0xc, %ymm8, %ymm14
+ vpsllw $0x4, %ymm5, %ymm15
+ vpor %ymm14, %ymm15, %ymm14
+ vpand %ymm0, %ymm8, %ymm8
+ vpand %ymm0, %ymm14, %ymm14
+ vpsrlw $0x8, %ymm5, %ymm15
+ vpsllw $0x8, %ymm9, %ymm1
+ vpor %ymm15, %ymm1, %ymm15
+ vpand %ymm0, %ymm15, %ymm15
+ vpsrlw $0x4, %ymm9, %ymm1
+ vpand %ymm0, %ymm1, %ymm1
+ vmovdqa %ymm10, 0x100(%rdi)
+ vmovdqa %ymm11, 0x120(%rdi)
+ vmovdqa %ymm12, 0x140(%rdi)
+ vmovdqa %ymm13, 0x160(%rdi)
+ vmovdqa %ymm8, 0x180(%rdi)
+ vmovdqa %ymm14, 0x1a0(%rdi)
+ vmovdqa %ymm15, 0x1c0(%rdi)
+ vmovdqa %ymm1, 0x1e0(%rdi)
retq
+ .cfi_endproc
-nttfrombytes_avx2_core:
- vmovdqu (%rsi), %ymm4
- vmovdqu 0x20(%rsi), %ymm5
- vmovdqu 0x40(%rsi), %ymm6
- vmovdqu 0x60(%rsi), %ymm7
- vmovdqu 0x80(%rsi), %ymm8
- vmovdqu 0xa0(%rsi), %ymm9
- vperm2i128 $0x20, %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm7[0,1]
- vperm2i128 $0x31, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[2,3],ymm7[2,3]
- vperm2i128 $0x20, %ymm8, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm9, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[2,3],ymm9[2,3]
- vpunpcklqdq %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm8[0],ymm3[2],ymm8[2]
- vpunpckhqdq %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[1],ymm8[1],ymm3[3],ymm8[3]
- vpunpcklqdq %ymm5, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm7, %ymm5 # ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3]
- vpunpcklqdq %ymm9, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm9[0],ymm4[2],ymm9[2]
- vpunpckhqdq %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[1],ymm9[1],ymm4[3],ymm9[3]
- vmovsldup %ymm5, %ymm4 # ymm4 = ymm5[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
- vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
- vpsrlq $0x20, %ymm8, %ymm8
- vpblendd $0xaa, %ymm7, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
- vmovsldup %ymm9, %ymm8 # ymm8 = ymm9[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm9, %ymm3, %ymm9 # ymm9 = ymm3[0],ymm9[1],ymm3[2],ymm9[3],ymm3[4],ymm9[5],ymm3[6],ymm9[7]
- vpslld $0x10, %ymm7, %ymm10
- vpblendw $0xaa, %ymm10, %ymm4, %ymm10 # ymm10 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4],ymm10[5],ymm4[6],ymm10[7],ymm4[8],ymm10[9],ymm4[10],ymm10[11],ymm4[12],ymm10[13],ymm4[14],ymm10[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7],ymm4[8],ymm7[9],ymm4[10],ymm7[11],ymm4[12],ymm7[13],ymm4[14],ymm7[15]
- vpslld $0x10, %ymm8, %ymm4
- vpblendw $0xaa, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
- vpsrld $0x10, %ymm5, %ymm5
- vpblendw $0xaa, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7],ymm5[8],ymm8[9],ymm5[10],ymm8[11],ymm5[12],ymm8[13],ymm5[14],ymm8[15]
- vpslld $0x10, %ymm9, %ymm5
- vpblendw $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
- vpsrlw $0xc, %ymm10, %ymm11
- vpsllw $0x4, %ymm7, %ymm12
- vpor %ymm11, %ymm12, %ymm11
- vpand %ymm0, %ymm10, %ymm10
- vpand %ymm0, %ymm11, %ymm11
- vpsrlw $0x8, %ymm7, %ymm12
- vpsllw $0x8, %ymm4, %ymm13
- vpor %ymm12, %ymm13, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpsrlw $0x4, %ymm4, %ymm13
- vpand %ymm0, %ymm13, %ymm13
- vpsrlw $0xc, %ymm8, %ymm14
- vpsllw $0x4, %ymm5, %ymm15
- vpor %ymm14, %ymm15, %ymm14
- vpand %ymm0, %ymm8, %ymm8
- vpand %ymm0, %ymm14, %ymm14
- vpsrlw $0x8, %ymm5, %ymm15
- vpsllw $0x8, %ymm9, %ymm1
- vpor %ymm15, %ymm1, %ymm15
- vpand %ymm0, %ymm15, %ymm15
- vpsrlw $0x4, %ymm9, %ymm1
- vpand %ymm0, %ymm1, %ymm1
- vmovdqa %ymm10, (%rdi)
- vmovdqa %ymm11, 0x20(%rdi)
- vmovdqa %ymm12, 0x40(%rdi)
- vmovdqa %ymm13, 0x60(%rdi)
- vmovdqa %ymm8, 0x80(%rdi)
- vmovdqa %ymm14, 0xa0(%rdi)
- vmovdqa %ymm15, 0xc0(%rdi)
- vmovdqa %ymm1, 0xe0(%rdi)
- retq
+MLK_ASM_FN_SIZE(nttfrombytes_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S
index 9bbd39f00a..b4e043bff2 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/ntttobytes.S
@@ -27,87 +27,155 @@
* dev/x86_64/src/ntttobytes.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(ntttobytes_avx2)
MLK_ASM_FN_SYMBOL(ntttobytes_avx2)
- vmovdqa (%rdx), %ymm0
- callq ntttobytes_avx2_core
- addq $0x100, %rsi # imm = 0x100
- addq $0xc0, %rdi
- callq ntttobytes_avx2_core
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vmovdqa (%rsi), %ymm5
+ vmovdqa 0x20(%rsi), %ymm6
+ vmovdqa 0x40(%rsi), %ymm7
+ vmovdqa 0x60(%rsi), %ymm8
+ vmovdqa 0x80(%rsi), %ymm9
+ vmovdqa 0xa0(%rsi), %ymm10
+ vmovdqa 0xc0(%rsi), %ymm11
+ vmovdqa 0xe0(%rsi), %ymm12
+ vpsllw $0xc, %ymm6, %ymm4
+ vpor %ymm4, %ymm5, %ymm4
+ vpsrlw $0x4, %ymm6, %ymm5
+ vpsllw $0x8, %ymm7, %ymm6
+ vpor %ymm5, %ymm6, %ymm5
+ vpsrlw $0x8, %ymm7, %ymm6
+ vpsllw $0x4, %ymm8, %ymm7
+ vpor %ymm6, %ymm7, %ymm6
+ vpsllw $0xc, %ymm10, %ymm7
+ vpor %ymm7, %ymm9, %ymm7
+ vpsrlw $0x4, %ymm10, %ymm8
+ vpsllw $0x8, %ymm11, %ymm9
+ vpor %ymm8, %ymm9, %ymm8
+ vpsrlw $0x8, %ymm11, %ymm9
+ vpsllw $0x4, %ymm12, %ymm10
+ vpor %ymm9, %ymm10, %ymm9
+ vpslld $0x10, %ymm5, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+ vpslld $0x10, %ymm7, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpslld $0x10, %ymm9, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vmovsldup %ymm4, %ymm8 # ymm8 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+ vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vpunpcklqdq %ymm3, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+ vpunpckhqdq %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
+ vpunpcklqdq %ymm4, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+ vpunpckhqdq %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
+ vpunpcklqdq %ymm9, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
+ vpunpckhqdq %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
+ vperm2i128 $0x20, %ymm8, %ymm7, %ymm5 # ymm5 = ymm7[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm3, %ymm6, %ymm7 # ymm7 = ymm6[0,1],ymm3[0,1]
+ vperm2i128 $0x31, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[2,3],ymm3[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
+ vmovdqu %ymm5, (%rdi)
+ vmovdqu %ymm7, 0x20(%rdi)
+ vmovdqu %ymm6, 0x40(%rdi)
+ vmovdqu %ymm8, 0x60(%rdi)
+ vmovdqu %ymm3, 0x80(%rdi)
+ vmovdqu %ymm9, 0xa0(%rdi)
+ vmovdqa 0x100(%rsi), %ymm5
+ vmovdqa 0x120(%rsi), %ymm6
+ vmovdqa 0x140(%rsi), %ymm7
+ vmovdqa 0x160(%rsi), %ymm8
+ vmovdqa 0x180(%rsi), %ymm9
+ vmovdqa 0x1a0(%rsi), %ymm10
+ vmovdqa 0x1c0(%rsi), %ymm11
+ vmovdqa 0x1e0(%rsi), %ymm12
+ vpsllw $0xc, %ymm6, %ymm4
+ vpor %ymm4, %ymm5, %ymm4
+ vpsrlw $0x4, %ymm6, %ymm5
+ vpsllw $0x8, %ymm7, %ymm6
+ vpor %ymm5, %ymm6, %ymm5
+ vpsrlw $0x8, %ymm7, %ymm6
+ vpsllw $0x4, %ymm8, %ymm7
+ vpor %ymm6, %ymm7, %ymm6
+ vpsllw $0xc, %ymm10, %ymm7
+ vpor %ymm7, %ymm9, %ymm7
+ vpsrlw $0x4, %ymm10, %ymm8
+ vpsllw $0x8, %ymm11, %ymm9
+ vpor %ymm8, %ymm9, %ymm8
+ vpsrlw $0x8, %ymm11, %ymm9
+ vpsllw $0x4, %ymm12, %ymm10
+ vpor %ymm9, %ymm10, %ymm9
+ vpslld $0x10, %ymm5, %ymm3
+ vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+ vpsrld $0x10, %ymm4, %ymm4
+ vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+ vpslld $0x10, %ymm7, %ymm4
+ vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpslld $0x10, %ymm9, %ymm6
+ vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vmovsldup %ymm4, %ymm8 # ymm8 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
+ vpsrlq $0x20, %ymm6, %ymm6
+ vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
+ vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vpunpcklqdq %ymm3, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
+ vpunpckhqdq %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
+ vpunpcklqdq %ymm4, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
+ vpunpckhqdq %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
+ vpunpcklqdq %ymm9, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
+ vpunpckhqdq %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
+ vperm2i128 $0x20, %ymm8, %ymm7, %ymm5 # ymm5 = ymm7[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm3, %ymm6, %ymm7 # ymm7 = ymm6[0,1],ymm3[0,1]
+ vperm2i128 $0x31, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[2,3],ymm3[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
+ vmovdqu %ymm5, 0xc0(%rdi)
+ vmovdqu %ymm7, 0xe0(%rdi)
+ vmovdqu %ymm6, 0x100(%rdi)
+ vmovdqu %ymm8, 0x120(%rdi)
+ vmovdqu %ymm3, 0x140(%rdi)
+ vmovdqu %ymm9, 0x160(%rdi)
retq
+ .cfi_endproc
-ntttobytes_avx2_core:
- vmovdqa (%rsi), %ymm5
- vmovdqa 0x20(%rsi), %ymm6
- vmovdqa 0x40(%rsi), %ymm7
- vmovdqa 0x60(%rsi), %ymm8
- vmovdqa 0x80(%rsi), %ymm9
- vmovdqa 0xa0(%rsi), %ymm10
- vmovdqa 0xc0(%rsi), %ymm11
- vmovdqa 0xe0(%rsi), %ymm12
- vpsllw $0xc, %ymm6, %ymm4
- vpor %ymm4, %ymm5, %ymm4
- vpsrlw $0x4, %ymm6, %ymm5
- vpsllw $0x8, %ymm7, %ymm6
- vpor %ymm5, %ymm6, %ymm5
- vpsrlw $0x8, %ymm7, %ymm6
- vpsllw $0x4, %ymm8, %ymm7
- vpor %ymm6, %ymm7, %ymm6
- vpsllw $0xc, %ymm10, %ymm7
- vpor %ymm7, %ymm9, %ymm7
- vpsrlw $0x4, %ymm10, %ymm8
- vpsllw $0x8, %ymm11, %ymm9
- vpor %ymm8, %ymm9, %ymm8
- vpsrlw $0x8, %ymm11, %ymm9
- vpsllw $0x4, %ymm12, %ymm10
- vpor %ymm9, %ymm10, %ymm9
- vpslld $0x10, %ymm5, %ymm3
- vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
- vpsrld $0x10, %ymm4, %ymm4
- vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
- vpslld $0x10, %ymm7, %ymm4
- vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
- vpslld $0x10, %ymm9, %ymm6
- vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
- vpsrld $0x10, %ymm8, %ymm8
- vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
- vmovsldup %ymm4, %ymm8 # ymm8 = ymm4[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
- vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
- vpsrlq $0x20, %ymm6, %ymm6
- vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
- vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
- vpsrlq $0x20, %ymm7, %ymm7
- vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
- vpunpcklqdq %ymm3, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
- vpunpckhqdq %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
- vpunpcklqdq %ymm4, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
- vpunpckhqdq %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
- vpunpcklqdq %ymm9, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
- vpunpckhqdq %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
- vperm2i128 $0x20, %ymm8, %ymm7, %ymm5 # ymm5 = ymm7[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm3, %ymm6, %ymm7 # ymm7 = ymm6[0,1],ymm3[0,1]
- vperm2i128 $0x31, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[2,3],ymm3[2,3]
- vperm2i128 $0x20, %ymm9, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
- vmovdqu %ymm5, (%rdi)
- vmovdqu %ymm7, 0x20(%rdi)
- vmovdqu %ymm6, 0x40(%rdi)
- vmovdqu %ymm8, 0x60(%rdi)
- vmovdqu %ymm3, 0x80(%rdi)
- vmovdqu %ymm9, 0xa0(%rdi)
- retq
+MLK_ASM_FN_SIZE(ntttobytes_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttunpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttunpack.S
index 6233b1b950..6e9dc76aa5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttunpack.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/nttunpack.S
@@ -27,83 +27,148 @@
* dev/x86_64/src/nttunpack.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(nttunpack_avx2)
MLK_ASM_FN_SYMBOL(nttunpack_avx2)
- callq nttunpack_avx2_core
- addq $0x100, %rdi # imm = 0x100
- callq nttunpack_avx2_core
+ .cfi_startproc
+ vmovdqa (%rdi), %ymm4
+ vmovdqa 0x20(%rdi), %ymm5
+ vmovdqa 0x40(%rdi), %ymm6
+ vmovdqa 0x60(%rdi), %ymm7
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm9
+ vmovdqa 0xc0(%rdi), %ymm10
+ vmovdqa 0xe0(%rdi), %ymm11
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
+ vpsrlq $0x20, %ymm5, %ymm5
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
+ vpsrlq $0x20, %ymm10, %ymm10
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
+ vpslld $0x10, %ymm5, %ymm10
+ vpblendw $0xaa, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4],ymm10[5],ymm9[6],ymm10[7],ymm9[8],ymm10[9],ymm9[10],ymm10[11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
+ vpsrld $0x10, %ymm9, %ymm9
+ vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
+ vpslld $0x10, %ymm4, %ymm9
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm4, %ymm8, %ymm4 # ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7],ymm8[8],ymm4[9],ymm8[10],ymm4[11],ymm8[12],ymm4[13],ymm8[14],ymm4[15]
+ vpslld $0x10, %ymm3, %ymm8
+ vpblendw $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7],ymm7[8],ymm8[9],ymm7[10],ymm8[11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
+ vpsrld $0x10, %ymm7, %ymm7
+ vpblendw $0xaa, %ymm3, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7],ymm7[8],ymm3[9],ymm7[10],ymm3[11],ymm7[12],ymm3[13],ymm7[14],ymm3[15]
+ vpslld $0x10, %ymm11, %ymm7
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+ vmovdqa %ymm10, (%rdi)
+ vmovdqa %ymm5, 0x20(%rdi)
+ vmovdqa %ymm9, 0x40(%rdi)
+ vmovdqa %ymm4, 0x60(%rdi)
+ vmovdqa %ymm8, 0x80(%rdi)
+ vmovdqa %ymm3, 0xa0(%rdi)
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm11, 0xe0(%rdi)
+ vmovdqa 0x100(%rdi), %ymm4
+ vmovdqa 0x120(%rdi), %ymm5
+ vmovdqa 0x140(%rdi), %ymm6
+ vmovdqa 0x160(%rdi), %ymm7
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm10
+ vmovdqa 0x1e0(%rdi), %ymm11
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
+ vpsrlq $0x20, %ymm7, %ymm7
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
+ vpsrlq $0x20, %ymm5, %ymm5
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
+ vpsrlq $0x20, %ymm3, %ymm3
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
+ vpsrlq $0x20, %ymm10, %ymm10
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
+ vpslld $0x10, %ymm5, %ymm10
+ vpblendw $0xaa, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4],ymm10[5],ymm9[6],ymm10[7],ymm9[8],ymm10[9],ymm9[10],ymm10[11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
+ vpsrld $0x10, %ymm9, %ymm9
+ vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
+ vpslld $0x10, %ymm4, %ymm9
+ vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+ vpsrld $0x10, %ymm8, %ymm8
+ vpblendw $0xaa, %ymm4, %ymm8, %ymm4 # ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7],ymm8[8],ymm4[9],ymm8[10],ymm4[11],ymm8[12],ymm4[13],ymm8[14],ymm4[15]
+ vpslld $0x10, %ymm3, %ymm8
+ vpblendw $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7],ymm7[8],ymm8[9],ymm7[10],ymm8[11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
+ vpsrld $0x10, %ymm7, %ymm7
+ vpblendw $0xaa, %ymm3, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7],ymm7[8],ymm3[9],ymm7[10],ymm3[11],ymm7[12],ymm3[13],ymm7[14],ymm3[15]
+ vpslld $0x10, %ymm11, %ymm7
+ vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+ vpsrld $0x10, %ymm6, %ymm6
+ vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+ vmovdqa %ymm10, 0x100(%rdi)
+ vmovdqa %ymm5, 0x120(%rdi)
+ vmovdqa %ymm9, 0x140(%rdi)
+ vmovdqa %ymm4, 0x160(%rdi)
+ vmovdqa %ymm8, 0x180(%rdi)
+ vmovdqa %ymm3, 0x1a0(%rdi)
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm11, 0x1e0(%rdi)
retq
+ .cfi_endproc
-nttunpack_avx2_core:
- vmovdqa (%rdi), %ymm4
- vmovdqa 0x20(%rdi), %ymm5
- vmovdqa 0x40(%rdi), %ymm6
- vmovdqa 0x60(%rdi), %ymm7
- vmovdqa 0x80(%rdi), %ymm8
- vmovdqa 0xa0(%rdi), %ymm9
- vmovdqa 0xc0(%rdi), %ymm10
- vmovdqa 0xe0(%rdi), %ymm11
- vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
- vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
- vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
- vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
- vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
- vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
- vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
- vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
- vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
- vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
- vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
- vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
- vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
- vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
- vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
- vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
- vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
- vpsrlq $0x20, %ymm7, %ymm7
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
- vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
- vpsrlq $0x20, %ymm5, %ymm5
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
- vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
- vpsrlq $0x20, %ymm3, %ymm3
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
- vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
- vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
- vpsrlq $0x20, %ymm10, %ymm10
- vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
- vpslld $0x10, %ymm5, %ymm10
- vpblendw $0xaa, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4],ymm10[5],ymm9[6],ymm10[7],ymm9[8],ymm10[9],ymm9[10],ymm10[11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
- vpsrld $0x10, %ymm9, %ymm9
- vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
- vpslld $0x10, %ymm4, %ymm9
- vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
- vpsrld $0x10, %ymm8, %ymm8
- vpblendw $0xaa, %ymm4, %ymm8, %ymm4 # ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7],ymm8[8],ymm4[9],ymm8[10],ymm4[11],ymm8[12],ymm4[13],ymm8[14],ymm4[15]
- vpslld $0x10, %ymm3, %ymm8
- vpblendw $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7],ymm7[8],ymm8[9],ymm7[10],ymm8[11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
- vpsrld $0x10, %ymm7, %ymm7
- vpblendw $0xaa, %ymm3, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7],ymm7[8],ymm3[9],ymm7[10],ymm3[11],ymm7[12],ymm3[13],ymm7[14],ymm3[15]
- vpslld $0x10, %ymm11, %ymm7
- vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
- vpsrld $0x10, %ymm6, %ymm6
- vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
- vmovdqa %ymm10, (%rdi)
- vmovdqa %ymm5, 0x20(%rdi)
- vmovdqa %ymm9, 0x40(%rdi)
- vmovdqa %ymm4, 0x60(%rdi)
- vmovdqa %ymm8, 0x80(%rdi)
- vmovdqa %ymm3, 0xa0(%rdi)
- vmovdqa %ymm7, 0xc0(%rdi)
- vmovdqa %ymm11, 0xe0(%rdi)
- retq
+MLK_ASM_FN_SIZE(nttunpack_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d10.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d10.S
new file mode 100644
index 0000000000..90b4cf8bf0
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d10.S
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_compress_d10_avx2
+ *
+ * Description: Compression of a polynomial to 10 bits per coefficient.
+ *
+ * Arguments: - uint8_t *r: pointer to output byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D10)
+ * - const int16_t *a: pointer to input polynomial
+ * - const uint8_t *data: pointer to shufbidx constant
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2 || MLKEM_K == 3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_compress_d10.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_compress_d10_avx2)
+MLK_ASM_FN_SYMBOL(poly_compress_d10_avx2)
+
+ .cfi_startproc
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vpsllw $0x3, %ymm0, %ymm1
+ movl $0xf000f, %eax # imm = 0xF000F
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x10001000, %eax # imm = 0x10001000
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ movl $0x3ff03ff, %eax # imm = 0x3FF03FF
+ vmovd %eax, %xmm4
+ vpbroadcastd %xmm4, %ymm4
+ movabsq $0x400000104000001, %rax # imm = 0x400000104000001
+ vmovq %rax, %xmm5
+ vpbroadcastq %xmm5, %ymm5
+ movl $0xc, %eax
+ vmovq %rax, %xmm6
+ vpbroadcastq %xmm6, %ymm6
+ vmovdqa (%rdx), %ymm7
+ vmovdqa (%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, (%rdi)
+ vmovd %xmm9, 0x10(%rdi)
+ vmovdqa 0x20(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x14(%rdi)
+ vmovd %xmm9, 0x24(%rdi)
+ vmovdqa 0x40(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x28(%rdi)
+ vmovd %xmm9, 0x38(%rdi)
+ vmovdqa 0x60(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x3c(%rdi)
+ vmovd %xmm9, 0x4c(%rdi)
+ vmovdqa 0x80(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x50(%rdi)
+ vmovd %xmm9, 0x60(%rdi)
+ vmovdqa 0xa0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x64(%rdi)
+ vmovd %xmm9, 0x74(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x78(%rdi)
+ vmovd %xmm9, 0x88(%rdi)
+ vmovdqa 0xe0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x8c(%rdi)
+ vmovd %xmm9, 0x9c(%rdi)
+ vmovdqa 0x100(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0xa0(%rdi)
+ vmovd %xmm9, 0xb0(%rdi)
+ vmovdqa 0x120(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0xb4(%rdi)
+ vmovd %xmm9, 0xc4(%rdi)
+ vmovdqa 0x140(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0xc8(%rdi)
+ vmovd %xmm9, 0xd8(%rdi)
+ vmovdqa 0x160(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0xdc(%rdi)
+ vmovd %xmm9, 0xec(%rdi)
+ vmovdqa 0x180(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0xf0(%rdi)
+ vmovd %xmm9, 0x100(%rdi)
+ vmovdqa 0x1a0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x104(%rdi)
+ vmovd %xmm9, 0x114(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x118(%rdi)
+ vmovd %xmm9, 0x128(%rdi)
+ vmovdqa 0x1e0(%rsi), %ymm8
+ vpmullw %ymm1, %ymm8, %ymm9
+ vpaddw %ymm2, %ymm8, %ymm10
+ vpsllw $0x3, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm9, %ymm10
+ vpandn %ymm10, %ymm9, %ymm9
+ vpsrlw $0xf, %ymm9, %ymm9
+ vpsubw %ymm9, %ymm8, %ymm8
+ vpmulhrsw %ymm3, %ymm8, %ymm8
+ vpand %ymm4, %ymm8, %ymm8
+ vpmaddwd %ymm5, %ymm8, %ymm8
+ vpsllvd %ymm6, %ymm8, %ymm8
+ vpsrlq $0xc, %ymm8, %ymm8
+ vpshufb %ymm7, %ymm8, %ymm8
+ vextracti128 $0x1, %ymm8, %xmm9
+ vpblendw $0xe0, %xmm9, %xmm8, %xmm8 # xmm8 = xmm8[0,1,2,3,4],xmm9[5,6,7]
+ vmovdqu %xmm8, 0x12c(%rdi)
+ vmovd %xmm9, 0x13c(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_compress_d10_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == \
+ 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d11.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d11.S
new file mode 100644
index 0000000000..f26a420ec0
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d11.S
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_compress_d11_avx2
+ *
+ * Description: Compression of a polynomial to 11 bits per coefficient.
+ *
+ * Arguments: - uint8_t *r: pointer to output byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D11)
+ * - const int16_t *a: pointer to input polynomial
+ * - const uint8_t *data: pointer to constants
+ * (srlvqidx[0:32], shufbidx[32:64])
+ **************************************************/
+
+#include "../../../common.h"
+
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_compress_d11.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_compress_d11_avx2)
+MLK_ASM_FN_SYMBOL(poly_compress_d11_avx2)
+
+ .cfi_startproc
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vpsllw $0x3, %ymm0, %ymm1
+ movl $0x240024, %eax # imm = 0x240024
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x20002000, %eax # imm = 0x20002000
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ movl $0x7ff07ff, %eax # imm = 0x7FF07FF
+ vmovd %eax, %xmm4
+ vpbroadcastd %xmm4, %ymm4
+ movabsq $0x800000108000001, %rax # imm = 0x800000108000001
+ vmovq %rax, %xmm5
+ vpbroadcastq %xmm5, %ymm5
+ movl $0xa, %eax
+ vmovq %rax, %xmm6
+ vpbroadcastq %xmm6, %ymm6
+ vmovdqa (%rdx), %ymm7
+ vmovdqa 0x20(%rdx), %ymm8
+ vmovdqa (%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, (%rdi)
+ vmovd %xmm10, 0x10(%rdi)
+ vpextrw $0x2, %xmm10, 0x14(%rdi)
+ vmovdqa 0x20(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x16(%rdi)
+ vmovd %xmm10, 0x26(%rdi)
+ vpextrw $0x2, %xmm10, 0x2a(%rdi)
+ vmovdqa 0x40(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x2c(%rdi)
+ vmovd %xmm10, 0x3c(%rdi)
+ vpextrw $0x2, %xmm10, 0x40(%rdi)
+ vmovdqa 0x60(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x42(%rdi)
+ vmovd %xmm10, 0x52(%rdi)
+ vpextrw $0x2, %xmm10, 0x56(%rdi)
+ vmovdqa 0x80(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x58(%rdi)
+ vmovd %xmm10, 0x68(%rdi)
+ vpextrw $0x2, %xmm10, 0x6c(%rdi)
+ vmovdqa 0xa0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x6e(%rdi)
+ vmovd %xmm10, 0x7e(%rdi)
+ vpextrw $0x2, %xmm10, 0x82(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x84(%rdi)
+ vmovd %xmm10, 0x94(%rdi)
+ vpextrw $0x2, %xmm10, 0x98(%rdi)
+ vmovdqa 0xe0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x9a(%rdi)
+ vmovd %xmm10, 0xaa(%rdi)
+ vpextrw $0x2, %xmm10, 0xae(%rdi)
+ vmovdqa 0x100(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0xb0(%rdi)
+ vmovd %xmm10, 0xc0(%rdi)
+ vpextrw $0x2, %xmm10, 0xc4(%rdi)
+ vmovdqa 0x120(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0xc6(%rdi)
+ vmovd %xmm10, 0xd6(%rdi)
+ vpextrw $0x2, %xmm10, 0xda(%rdi)
+ vmovdqa 0x140(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0xdc(%rdi)
+ vmovd %xmm10, 0xec(%rdi)
+ vpextrw $0x2, %xmm10, 0xf0(%rdi)
+ vmovdqa 0x160(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0xf2(%rdi)
+ vmovd %xmm10, 0x102(%rdi)
+ vpextrw $0x2, %xmm10, 0x106(%rdi)
+ vmovdqa 0x180(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x108(%rdi)
+ vmovd %xmm10, 0x118(%rdi)
+ vpextrw $0x2, %xmm10, 0x11c(%rdi)
+ vmovdqa 0x1a0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x11e(%rdi)
+ vmovd %xmm10, 0x12e(%rdi)
+ vpextrw $0x2, %xmm10, 0x132(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x134(%rdi)
+ vmovd %xmm10, 0x144(%rdi)
+ vpextrw $0x2, %xmm10, 0x148(%rdi)
+ vmovdqa 0x1e0(%rsi), %ymm9
+ vpmullw %ymm1, %ymm9, %ymm10
+ vpaddw %ymm2, %ymm9, %ymm11
+ vpsllw $0x3, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm9, %ymm9
+ vpsubw %ymm11, %ymm10, %ymm11
+ vpandn %ymm11, %ymm10, %ymm10
+ vpsrlw $0xf, %ymm10, %ymm10
+ vpsubw %ymm10, %ymm9, %ymm9
+ vpmulhrsw %ymm3, %ymm9, %ymm9
+ vpand %ymm4, %ymm9, %ymm9
+ vpmaddwd %ymm5, %ymm9, %ymm9
+ vpsllvd %ymm6, %ymm9, %ymm9
+ vpsrldq $0x8, %ymm9, %ymm10 # ymm10 = ymm9[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+ vpsrlvq %ymm7, %ymm9, %ymm9
+ vpsllq $0x22, %ymm10, %ymm10
+ vpaddq %ymm10, %ymm9, %ymm9
+ vpshufb %ymm8, %ymm9, %ymm9
+ vextracti128 $0x1, %ymm9, %xmm10
+ vpblendvb %xmm8, %xmm10, %xmm9, %xmm9
+ vmovdqu %xmm9, 0x14a(%rdi)
+ vmovd %xmm10, 0x15a(%rdi)
+ vpextrw $0x2, %xmm10, 0x15e(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_compress_d11_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d4.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d4.S
new file mode 100644
index 0000000000..b4ca46e56b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d4.S
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_compress_d4_avx2
+ *
+ * Description: Compression of a polynomial to 4 bits per coefficient.
+ *
+ * Arguments: - uint8_t *r: pointer to output byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D4)
+ * - const int16_t *a: pointer to input polynomial
+ * - const uint8_t *data: pointer to permdidx constant
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2 || MLKEM_K == 3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_compress_d4.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_compress_d4_avx2)
+MLK_ASM_FN_SYMBOL(poly_compress_d4_avx2)
+
+ .cfi_startproc
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x2000200, %eax # imm = 0x2000200
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ movl $0xf000f, %eax # imm = 0xF000F
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x10011001, %eax # imm = 0x10011001
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ vmovdqa (%rdx), %ymm4
+ vmovdqa (%rsi), %ymm5
+ vmovdqa 0x20(%rsi), %ymm6
+ vmovdqa 0x40(%rsi), %ymm7
+ vmovdqa 0x60(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm5, %ymm5
+ vpmulhrsw %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm5, %ymm5
+ vpand %ymm2, %ymm6, %ymm6
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm6, %ymm5, %ymm5
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm5, %ymm5
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpackuswb %ymm7, %ymm5, %ymm5
+ vpermd %ymm5, %ymm4, %ymm5
+ vmovdqu %ymm5, (%rdi)
+ vmovdqa 0x80(%rsi), %ymm5
+ vmovdqa 0xa0(%rsi), %ymm6
+ vmovdqa 0xc0(%rsi), %ymm7
+ vmovdqa 0xe0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm5, %ymm5
+ vpmulhrsw %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm5, %ymm5
+ vpand %ymm2, %ymm6, %ymm6
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm6, %ymm5, %ymm5
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm5, %ymm5
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpackuswb %ymm7, %ymm5, %ymm5
+ vpermd %ymm5, %ymm4, %ymm5
+ vmovdqu %ymm5, 0x20(%rdi)
+ vmovdqa 0x100(%rsi), %ymm5
+ vmovdqa 0x120(%rsi), %ymm6
+ vmovdqa 0x140(%rsi), %ymm7
+ vmovdqa 0x160(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm5, %ymm5
+ vpmulhrsw %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm5, %ymm5
+ vpand %ymm2, %ymm6, %ymm6
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm6, %ymm5, %ymm5
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm5, %ymm5
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpackuswb %ymm7, %ymm5, %ymm5
+ vpermd %ymm5, %ymm4, %ymm5
+ vmovdqu %ymm5, 0x40(%rdi)
+ vmovdqa 0x180(%rsi), %ymm5
+ vmovdqa 0x1a0(%rsi), %ymm6
+ vmovdqa 0x1c0(%rsi), %ymm7
+ vmovdqa 0x1e0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm5, %ymm5
+ vpmulhrsw %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm5, %ymm5
+ vpand %ymm2, %ymm6, %ymm6
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm6, %ymm5, %ymm5
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm5, %ymm5
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpackuswb %ymm7, %ymm5, %ymm5
+ vpermd %ymm5, %ymm4, %ymm5
+ vmovdqu %ymm5, 0x60(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_compress_d4_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == \
+ 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d5.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d5.S
new file mode 100644
index 0000000000..c1cb3a6032
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_compress_d5.S
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_compress_d5_avx2
+ *
+ * Description: Compression of a polynomial to 5 bits per coefficient.
+ *
+ * Arguments: - uint8_t *r: pointer to output byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D5)
+ * - const int16_t *a: pointer to input polynomial
+ * - const uint8_t *data: pointer to shufbidx constant
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_compress_d5.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_compress_d5_avx2)
+MLK_ASM_FN_SYMBOL(poly_compress_d5_avx2)
+
+ .cfi_startproc
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x4000400, %eax # imm = 0x4000400
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ movl $0x1f001f, %eax # imm = 0x1F001F
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ movl $0x20012001, %eax # imm = 0x20012001
+ vmovd %eax, %xmm3
+ vpbroadcastd %xmm3, %ymm3
+ movl $0x4000001, %eax # imm = 0x4000001
+ vmovd %eax, %xmm4
+ vpbroadcastd %xmm4, %ymm4
+ movl $0xc, %eax
+ vmovq %rax, %xmm5
+ vpbroadcastq %xmm5, %ymm5
+ vmovdqa (%rdx), %ymm6
+ vmovdqa (%rsi), %ymm7
+ vmovdqa 0x20(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, (%rdi)
+ vmovd %xmm8, 0x10(%rdi)
+ vmovdqa 0x40(%rsi), %ymm7
+ vmovdqa 0x60(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x14(%rdi)
+ vmovd %xmm8, 0x24(%rdi)
+ vmovdqa 0x80(%rsi), %ymm7
+ vmovdqa 0xa0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x28(%rdi)
+ vmovd %xmm8, 0x38(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm7
+ vmovdqa 0xe0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x3c(%rdi)
+ vmovd %xmm8, 0x4c(%rdi)
+ vmovdqa 0x100(%rsi), %ymm7
+ vmovdqa 0x120(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x50(%rdi)
+ vmovd %xmm8, 0x60(%rdi)
+ vmovdqa 0x140(%rsi), %ymm7
+ vmovdqa 0x160(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x64(%rdi)
+ vmovd %xmm8, 0x74(%rdi)
+ vmovdqa 0x180(%rsi), %ymm7
+ vmovdqa 0x1a0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x78(%rdi)
+ vmovd %xmm8, 0x88(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm7
+ vmovdqa 0x1e0(%rsi), %ymm8
+ vpmulhw %ymm0, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm8, %ymm8
+ vpmulhrsw %ymm1, %ymm7, %ymm7
+ vpmulhrsw %ymm1, %ymm8, %ymm8
+ vpand %ymm2, %ymm7, %ymm7
+ vpand %ymm2, %ymm8, %ymm8
+ vpackuswb %ymm8, %ymm7, %ymm7
+ vpmaddubsw %ymm3, %ymm7, %ymm7
+ vpmaddwd %ymm4, %ymm7, %ymm7
+ vpsllvd %ymm5, %ymm7, %ymm7
+ vpsrlvq %ymm5, %ymm7, %ymm7
+ vpshufb %ymm6, %ymm7, %ymm7
+ vextracti128 $0x1, %ymm7, %xmm8
+ vpblendvb %xmm6, %xmm8, %xmm7, %xmm7
+ vmovdqu %xmm7, 0x8c(%rdi)
+ vmovd %xmm8, 0x9c(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_compress_d5_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d10.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d10.S
new file mode 100644
index 0000000000..27412b18c4
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d10.S
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_decompress_d10_avx2
+ *
+ * Description: Decompression of a polynomial from 10 bits per coefficient.
+ *
+ * Arguments: - int16_t *r: pointer to output polynomial
+ * - const uint8_t *a: pointer to input byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D10)
+ * - const uint8_t *data: pointer to shufbidx constant
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2 || MLKEM_K == 3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_decompress_d10.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_decompress_d10_avx2)
+MLK_ASM_FN_SYMBOL(poly_decompress_d10_avx2)
+
+ .cfi_startproc
+ movl $0xd013404, %eax # imm = 0xD013404
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x4, %eax
+ vmovq %rax, %xmm1
+ vpbroadcastq %xmm1, %ymm1
+ movl $0x7fe01ff8, %eax # imm = 0x7FE01FF8
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ vmovdqa (%rdx), %ymm3
+ vmovdqu (%rsi), %xmm4
+ vmovd 0x10(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, (%rdi)
+ vmovdqu 0x14(%rsi), %xmm4
+ vmovd 0x24(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x20(%rdi)
+ vmovdqu 0x28(%rsi), %xmm4
+ vmovd 0x38(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x40(%rdi)
+ vmovdqu 0x3c(%rsi), %xmm4
+ vmovd 0x4c(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x60(%rdi)
+ vmovdqu 0x50(%rsi), %xmm4
+ vmovd 0x60(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x80(%rdi)
+ vmovdqu 0x64(%rsi), %xmm4
+ vmovd 0x74(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xa0(%rdi)
+ vmovdqu 0x78(%rsi), %xmm4
+ vmovd 0x88(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xc0(%rdi)
+ vmovdqu 0x8c(%rsi), %xmm4
+ vmovd 0x9c(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xe0(%rdi)
+ vmovdqu 0xa0(%rsi), %xmm4
+ vmovd 0xb0(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x100(%rdi)
+ vmovdqu 0xb4(%rsi), %xmm4
+ vmovd 0xc4(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x120(%rdi)
+ vmovdqu 0xc8(%rsi), %xmm4
+ vmovd 0xd8(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x140(%rdi)
+ vmovdqu 0xdc(%rsi), %xmm4
+ vmovd 0xec(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x160(%rdi)
+ vmovdqu 0xf0(%rsi), %xmm4
+ vmovd 0x100(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x180(%rdi)
+ vmovdqu 0x104(%rsi), %xmm4
+ vmovd 0x114(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1a0(%rdi)
+ vmovdqu 0x118(%rsi), %xmm4
+ vmovd 0x128(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1c0(%rdi)
+ vmovdqu 0x12c(%rsi), %xmm4
+ vmovd 0x13c(%rsi), %xmm5
+ vinserti128 $0x1, %xmm5, %ymm4, %ymm4
+ vpermq $0x94, %ymm4, %ymm4 # ymm4 = ymm4[0,1,1,2]
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpsllvd %ymm1, %ymm4, %ymm4
+ vpsrlw $0x1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_decompress_d10_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == \
+ 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d11.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d11.S
new file mode 100644
index 0000000000..67ef58e225
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d11.S
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_decompress_d11_avx2
+ *
+ * Description: Decompression of a polynomial from 11 bits per coefficient.
+ *
+ * Arguments: - int16_t *r: pointer to output polynomial
+ * - const uint8_t *a: pointer to input byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D11)
+ * - const uint8_t *data: pointer to constants
+ * (shufbidx[0:32], srlvdidx[32:64],
+ * srlvqidx[64:96], shift[96:128])
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_decompress_d11.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_decompress_d11_avx2)
+MLK_ASM_FN_SYMBOL(poly_decompress_d11_avx2)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x7ff07ff0, %eax # imm = 0x7FF07FF0
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vmovdqa (%rdx), %ymm2
+ vmovdqa 0x20(%rdx), %ymm3
+ vmovdqa 0x40(%rdx), %ymm4
+ vmovdqa 0x60(%rdx), %ymm5
+ vmovdqu (%rsi), %xmm6
+ vmovd 0x10(%rsi), %xmm7
+ vpinsrw $0x2, 0x14(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, (%rdi)
+ vmovdqu 0x16(%rsi), %xmm6
+ vmovd 0x26(%rsi), %xmm7
+ vpinsrw $0x2, 0x2a(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x20(%rdi)
+ vmovdqu 0x2c(%rsi), %xmm6
+ vmovd 0x3c(%rsi), %xmm7
+ vpinsrw $0x2, 0x40(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x40(%rdi)
+ vmovdqu 0x42(%rsi), %xmm6
+ vmovd 0x52(%rsi), %xmm7
+ vpinsrw $0x2, 0x56(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x60(%rdi)
+ vmovdqu 0x58(%rsi), %xmm6
+ vmovd 0x68(%rsi), %xmm7
+ vpinsrw $0x2, 0x6c(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x80(%rdi)
+ vmovdqu 0x6e(%rsi), %xmm6
+ vmovd 0x7e(%rsi), %xmm7
+ vpinsrw $0x2, 0x82(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0xa0(%rdi)
+ vmovdqu 0x84(%rsi), %xmm6
+ vmovd 0x94(%rsi), %xmm7
+ vpinsrw $0x2, 0x98(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0xc0(%rdi)
+ vmovdqu 0x9a(%rsi), %xmm6
+ vmovd 0xaa(%rsi), %xmm7
+ vpinsrw $0x2, 0xae(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0xe0(%rdi)
+ vmovdqu 0xb0(%rsi), %xmm6
+ vmovd 0xc0(%rsi), %xmm7
+ vpinsrw $0x2, 0xc4(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x100(%rdi)
+ vmovdqu 0xc6(%rsi), %xmm6
+ vmovd 0xd6(%rsi), %xmm7
+ vpinsrw $0x2, 0xda(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x120(%rdi)
+ vmovdqu 0xdc(%rsi), %xmm6
+ vmovd 0xec(%rsi), %xmm7
+ vpinsrw $0x2, 0xf0(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x140(%rdi)
+ vmovdqu 0xf2(%rsi), %xmm6
+ vmovd 0x102(%rsi), %xmm7
+ vpinsrw $0x2, 0x106(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x160(%rdi)
+ vmovdqu 0x108(%rsi), %xmm6
+ vmovd 0x118(%rsi), %xmm7
+ vpinsrw $0x2, 0x11c(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x180(%rdi)
+ vmovdqu 0x11e(%rsi), %xmm6
+ vmovd 0x12e(%rsi), %xmm7
+ vpinsrw $0x2, 0x132(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x1a0(%rdi)
+ vmovdqu 0x134(%rsi), %xmm6
+ vmovd 0x144(%rsi), %xmm7
+ vpinsrw $0x2, 0x148(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x1c0(%rdi)
+ vmovdqu 0x14a(%rsi), %xmm6
+ vmovd 0x15a(%rsi), %xmm7
+ vpinsrw $0x2, 0x15e(%rsi), %xmm7, %xmm7
+ vinserti128 $0x1, %xmm7, %ymm6, %ymm6
+ vpermq $0x94, %ymm6, %ymm6 # ymm6 = ymm6[0,1,1,2]
+ vpshufb %ymm2, %ymm6, %ymm6
+ vpsrlvd %ymm3, %ymm6, %ymm6
+ vpsrlvq %ymm4, %ymm6, %ymm6
+ vpmullw %ymm5, %ymm6, %ymm6
+ vpsrlw $0x1, %ymm6, %ymm6
+ vpand %ymm1, %ymm6, %ymm6
+ vpmulhrsw %ymm0, %ymm6, %ymm6
+ vmovdqu %ymm6, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_decompress_d11_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d4.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d4.S
new file mode 100644
index 0000000000..765a850c22
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d4.S
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_decompress_d4_avx2
+ *
+ * Description: Decompression of a polynomial from 4 bits per coefficient.
+ *
+ * Arguments: - int16_t *r: pointer to output polynomial
+ * - const uint8_t *a: pointer to input byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D4)
+ * - const int8_t *data: pointer to shufbidx constant
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2 || MLKEM_K == 3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_decompress_d4.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_decompress_d4_avx2)
+MLK_ASM_FN_SYMBOL(poly_decompress_d4_avx2)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0xf0000f, %eax # imm = 0xF0000F
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ movl $0x800800, %eax # imm = 0x800800
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ vmovdqa (%rdx), %ymm3
+ vmovq (%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, (%rdi)
+ vmovq 0x8(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x20(%rdi)
+ vmovq 0x10(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x40(%rdi)
+ vmovq 0x18(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x60(%rdi)
+ vmovq 0x20(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x80(%rdi)
+ vmovq 0x28(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xa0(%rdi)
+ vmovq 0x30(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xc0(%rdi)
+ vmovq 0x38(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xe0(%rdi)
+ vmovq 0x40(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x100(%rdi)
+ vmovq 0x48(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x120(%rdi)
+ vmovq 0x50(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x140(%rdi)
+ vmovq 0x58(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x160(%rdi)
+ vmovq 0x60(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x180(%rdi)
+ vmovq 0x68(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1a0(%rdi)
+ vmovq 0x70(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1c0(%rdi)
+ vmovq 0x78(%rsi), %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm3, %ymm4, %ymm4
+ vpand %ymm1, %ymm4, %ymm4
+ vpmullw %ymm2, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_decompress_d4_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == \
+ 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d5.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d5.S
new file mode 100644
index 0000000000..3108d6b17e
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/poly_decompress_d5.S
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ * CRYSTALS-Kyber optimized AVX2 implementation
+ * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
+ * https://github.com/pq-crystals/kyber/tree/main/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Kyber implementation @[REF_AVX2].
+ */
+
+/*************************************************
+ * Name: mlk_poly_decompress_d5_avx2
+ *
+ * Description: Decompression of a polynomial from 5 bits per coefficient.
+ *
+ * Arguments: - int16_t *r: pointer to output polynomial
+ * - const uint8_t *a: pointer to input byte array
+ * (of length MLKEM_POLYCOMPRESSEDBYTES_D5)
+ * - const uint8_t *data: pointer to constants
+ * (shufbidx[0:32], mask[32:64], shift[64:96])
+ **************************************************/
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/poly_decompress_d5.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(poly_decompress_d5_avx2)
+MLK_ASM_FN_SYMBOL(poly_decompress_d5_avx2)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ vmovdqa (%rdx), %ymm1
+ vmovdqa 0x20(%rdx), %ymm2
+ vmovdqa 0x40(%rdx), %ymm3
+ vmovq (%rsi), %xmm4
+ vpinsrw $0x4, 0x8(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, (%rdi)
+ vmovq 0xa(%rsi), %xmm4
+ vpinsrw $0x4, 0x12(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x20(%rdi)
+ vmovq 0x14(%rsi), %xmm4
+ vpinsrw $0x4, 0x1c(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x40(%rdi)
+ vmovq 0x1e(%rsi), %xmm4
+ vpinsrw $0x4, 0x26(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x60(%rdi)
+ vmovq 0x28(%rsi), %xmm4
+ vpinsrw $0x4, 0x30(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x80(%rdi)
+ vmovq 0x32(%rsi), %xmm4
+ vpinsrw $0x4, 0x3a(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xa0(%rdi)
+ vmovq 0x3c(%rsi), %xmm4
+ vpinsrw $0x4, 0x44(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xc0(%rdi)
+ vmovq 0x46(%rsi), %xmm4
+ vpinsrw $0x4, 0x4e(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0xe0(%rdi)
+ vmovq 0x50(%rsi), %xmm4
+ vpinsrw $0x4, 0x58(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x100(%rdi)
+ vmovq 0x5a(%rsi), %xmm4
+ vpinsrw $0x4, 0x62(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x120(%rdi)
+ vmovq 0x64(%rsi), %xmm4
+ vpinsrw $0x4, 0x6c(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x140(%rdi)
+ vmovq 0x6e(%rsi), %xmm4
+ vpinsrw $0x4, 0x76(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x160(%rdi)
+ vmovq 0x78(%rsi), %xmm4
+ vpinsrw $0x4, 0x80(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x180(%rdi)
+ vmovq 0x82(%rsi), %xmm4
+ vpinsrw $0x4, 0x8a(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1a0(%rdi)
+ vmovq 0x8c(%rsi), %xmm4
+ vpinsrw $0x4, 0x94(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1c0(%rdi)
+ vmovq 0x96(%rsi), %xmm4
+ vpinsrw $0x4, 0x9e(%rsi), %xmm4, %xmm4
+ vinserti128 $0x1, %xmm4, %ymm4, %ymm4
+ vpshufb %ymm1, %ymm4, %ymm4
+ vpand %ymm2, %ymm4, %ymm4
+ vpmullw %ymm3, %ymm4, %ymm4
+ vpmulhrsw %ymm0, %ymm4, %ymm4
+ vmovdqu %ymm4, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(poly_decompress_d5_avx2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S
new file mode 100644
index 0000000000..af75ec5d3b
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
+MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k2)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0xf301f301, %eax # imm = 0xF301F301
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vmovdqa (%rsi), %ymm2
+ vmovdqa 0x20(%rsi), %ymm3
+ vmovdqa (%rdx), %ymm4
+ vmovdqa 0x20(%rdx), %ymm5
+ vmovdqa (%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x40(%rsi), %ymm2
+ vmovdqa 0x60(%rsi), %ymm3
+ vmovdqa 0x40(%rdx), %ymm4
+ vmovdqa 0x60(%rdx), %ymm5
+ vmovdqa 0x20(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x80(%rsi), %ymm2
+ vmovdqa 0xa0(%rsi), %ymm3
+ vmovdqa 0x80(%rdx), %ymm4
+ vmovdqa 0xa0(%rdx), %ymm5
+ vmovdqa 0x40(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm2
+ vmovdqa 0xe0(%rsi), %ymm3
+ vmovdqa 0xc0(%rdx), %ymm4
+ vmovdqa 0xe0(%rdx), %ymm5
+ vmovdqa 0x60(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x100(%rsi), %ymm2
+ vmovdqa 0x120(%rsi), %ymm3
+ vmovdqa 0x100(%rdx), %ymm4
+ vmovdqa 0x120(%rdx), %ymm5
+ vmovdqa 0x80(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x140(%rsi), %ymm2
+ vmovdqa 0x160(%rsi), %ymm3
+ vmovdqa 0x140(%rdx), %ymm4
+ vmovdqa 0x160(%rdx), %ymm5
+ vmovdqa 0xa0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x180(%rsi), %ymm2
+ vmovdqa 0x1a0(%rsi), %ymm3
+ vmovdqa 0x180(%rdx), %ymm4
+ vmovdqa 0x1a0(%rdx), %ymm5
+ vmovdqa 0xc0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm2
+ vmovdqa 0x1e0(%rsi), %ymm3
+ vmovdqa 0x1c0(%rdx), %ymm4
+ vmovdqa 0x1e0(%rdx), %ymm5
+ vmovdqa 0xe0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x200(%rsi), %ymm2
+ vmovdqa 0x220(%rsi), %ymm3
+ vmovdqa 0x200(%rdx), %ymm4
+ vmovdqa 0x220(%rdx), %ymm5
+ vmovdqa 0x100(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x240(%rsi), %ymm2
+ vmovdqa 0x260(%rsi), %ymm3
+ vmovdqa 0x240(%rdx), %ymm4
+ vmovdqa 0x260(%rdx), %ymm5
+ vmovdqa 0x120(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x280(%rsi), %ymm2
+ vmovdqa 0x2a0(%rsi), %ymm3
+ vmovdqa 0x280(%rdx), %ymm4
+ vmovdqa 0x2a0(%rdx), %ymm5
+ vmovdqa 0x140(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x2c0(%rsi), %ymm2
+ vmovdqa 0x2e0(%rsi), %ymm3
+ vmovdqa 0x2c0(%rdx), %ymm4
+ vmovdqa 0x2e0(%rdx), %ymm5
+ vmovdqa 0x160(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x300(%rsi), %ymm2
+ vmovdqa 0x320(%rsi), %ymm3
+ vmovdqa 0x300(%rdx), %ymm4
+ vmovdqa 0x320(%rdx), %ymm5
+ vmovdqa 0x180(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x340(%rsi), %ymm2
+ vmovdqa 0x360(%rsi), %ymm3
+ vmovdqa 0x340(%rdx), %ymm4
+ vmovdqa 0x360(%rdx), %ymm5
+ vmovdqa 0x1a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x380(%rsi), %ymm2
+ vmovdqa 0x3a0(%rsi), %ymm3
+ vmovdqa 0x380(%rdx), %ymm4
+ vmovdqa 0x3a0(%rdx), %ymm5
+ vmovdqa 0x1c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x3c0(%rsi), %ymm2
+ vmovdqa 0x3e0(%rsi), %ymm3
+ vmovdqa 0x3c0(%rdx), %ymm4
+ vmovdqa 0x3e0(%rdx), %ymm5
+ vmovdqa 0x1e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k2)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S
new file mode 100644
index 0000000000..931bfd63ea
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
+MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k3)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0xf301f301, %eax # imm = 0xF301F301
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vmovdqa (%rsi), %ymm2
+ vmovdqa 0x20(%rsi), %ymm3
+ vmovdqa (%rdx), %ymm4
+ vmovdqa 0x20(%rdx), %ymm5
+ vmovdqa (%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x40(%rsi), %ymm2
+ vmovdqa 0x60(%rsi), %ymm3
+ vmovdqa 0x40(%rdx), %ymm4
+ vmovdqa 0x60(%rdx), %ymm5
+ vmovdqa 0x20(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x80(%rsi), %ymm2
+ vmovdqa 0xa0(%rsi), %ymm3
+ vmovdqa 0x80(%rdx), %ymm4
+ vmovdqa 0xa0(%rdx), %ymm5
+ vmovdqa 0x40(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm2
+ vmovdqa 0xe0(%rsi), %ymm3
+ vmovdqa 0xc0(%rdx), %ymm4
+ vmovdqa 0xe0(%rdx), %ymm5
+ vmovdqa 0x60(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x100(%rsi), %ymm2
+ vmovdqa 0x120(%rsi), %ymm3
+ vmovdqa 0x100(%rdx), %ymm4
+ vmovdqa 0x120(%rdx), %ymm5
+ vmovdqa 0x80(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x140(%rsi), %ymm2
+ vmovdqa 0x160(%rsi), %ymm3
+ vmovdqa 0x140(%rdx), %ymm4
+ vmovdqa 0x160(%rdx), %ymm5
+ vmovdqa 0xa0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x180(%rsi), %ymm2
+ vmovdqa 0x1a0(%rsi), %ymm3
+ vmovdqa 0x180(%rdx), %ymm4
+ vmovdqa 0x1a0(%rdx), %ymm5
+ vmovdqa 0xc0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm2
+ vmovdqa 0x1e0(%rsi), %ymm3
+ vmovdqa 0x1c0(%rdx), %ymm4
+ vmovdqa 0x1e0(%rdx), %ymm5
+ vmovdqa 0xe0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x200(%rsi), %ymm2
+ vmovdqa 0x220(%rsi), %ymm3
+ vmovdqa 0x200(%rdx), %ymm4
+ vmovdqa 0x220(%rdx), %ymm5
+ vmovdqa 0x100(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x240(%rsi), %ymm2
+ vmovdqa 0x260(%rsi), %ymm3
+ vmovdqa 0x240(%rdx), %ymm4
+ vmovdqa 0x260(%rdx), %ymm5
+ vmovdqa 0x120(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x280(%rsi), %ymm2
+ vmovdqa 0x2a0(%rsi), %ymm3
+ vmovdqa 0x280(%rdx), %ymm4
+ vmovdqa 0x2a0(%rdx), %ymm5
+ vmovdqa 0x140(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x2c0(%rsi), %ymm2
+ vmovdqa 0x2e0(%rsi), %ymm3
+ vmovdqa 0x2c0(%rdx), %ymm4
+ vmovdqa 0x2e0(%rdx), %ymm5
+ vmovdqa 0x160(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x300(%rsi), %ymm2
+ vmovdqa 0x320(%rsi), %ymm3
+ vmovdqa 0x300(%rdx), %ymm4
+ vmovdqa 0x320(%rdx), %ymm5
+ vmovdqa 0x180(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x340(%rsi), %ymm2
+ vmovdqa 0x360(%rsi), %ymm3
+ vmovdqa 0x340(%rdx), %ymm4
+ vmovdqa 0x360(%rdx), %ymm5
+ vmovdqa 0x1a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x380(%rsi), %ymm2
+ vmovdqa 0x3a0(%rsi), %ymm3
+ vmovdqa 0x380(%rdx), %ymm4
+ vmovdqa 0x3a0(%rdx), %ymm5
+ vmovdqa 0x1c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x3c0(%rsi), %ymm2
+ vmovdqa 0x3e0(%rsi), %ymm3
+ vmovdqa 0x3c0(%rdx), %ymm4
+ vmovdqa 0x3e0(%rdx), %ymm5
+ vmovdqa 0x1e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x400(%rsi), %ymm2
+ vmovdqa 0x420(%rsi), %ymm3
+ vmovdqa 0x400(%rdx), %ymm4
+ vmovdqa 0x420(%rdx), %ymm5
+ vmovdqa 0x200(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x440(%rsi), %ymm2
+ vmovdqa 0x460(%rsi), %ymm3
+ vmovdqa 0x440(%rdx), %ymm4
+ vmovdqa 0x460(%rdx), %ymm5
+ vmovdqa 0x220(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x480(%rsi), %ymm2
+ vmovdqa 0x4a0(%rsi), %ymm3
+ vmovdqa 0x480(%rdx), %ymm4
+ vmovdqa 0x4a0(%rdx), %ymm5
+ vmovdqa 0x240(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x4c0(%rsi), %ymm2
+ vmovdqa 0x4e0(%rsi), %ymm3
+ vmovdqa 0x4c0(%rdx), %ymm4
+ vmovdqa 0x4e0(%rdx), %ymm5
+ vmovdqa 0x260(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x500(%rsi), %ymm2
+ vmovdqa 0x520(%rsi), %ymm3
+ vmovdqa 0x500(%rdx), %ymm4
+ vmovdqa 0x520(%rdx), %ymm5
+ vmovdqa 0x280(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x540(%rsi), %ymm2
+ vmovdqa 0x560(%rsi), %ymm3
+ vmovdqa 0x540(%rdx), %ymm4
+ vmovdqa 0x560(%rdx), %ymm5
+ vmovdqa 0x2a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x580(%rsi), %ymm2
+ vmovdqa 0x5a0(%rsi), %ymm3
+ vmovdqa 0x580(%rdx), %ymm4
+ vmovdqa 0x5a0(%rdx), %ymm5
+ vmovdqa 0x2c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x5c0(%rsi), %ymm2
+ vmovdqa 0x5e0(%rsi), %ymm3
+ vmovdqa 0x5c0(%rdx), %ymm4
+ vmovdqa 0x5e0(%rdx), %ymm5
+ vmovdqa 0x2e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k3)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S
new file mode 100644
index 0000000000..4f58578b9a
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S
@@ -0,0 +1,998 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && \
+ (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
+MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_asm_k4)
+
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0xf301f301, %eax # imm = 0xF301F301
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vmovdqa (%rsi), %ymm2
+ vmovdqa 0x20(%rsi), %ymm3
+ vmovdqa (%rdx), %ymm4
+ vmovdqa 0x20(%rdx), %ymm5
+ vmovdqa (%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x40(%rsi), %ymm2
+ vmovdqa 0x60(%rsi), %ymm3
+ vmovdqa 0x40(%rdx), %ymm4
+ vmovdqa 0x60(%rdx), %ymm5
+ vmovdqa 0x20(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x80(%rsi), %ymm2
+ vmovdqa 0xa0(%rsi), %ymm3
+ vmovdqa 0x80(%rdx), %ymm4
+ vmovdqa 0xa0(%rdx), %ymm5
+ vmovdqa 0x40(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0xc0(%rsi), %ymm2
+ vmovdqa 0xe0(%rsi), %ymm3
+ vmovdqa 0xc0(%rdx), %ymm4
+ vmovdqa 0xe0(%rdx), %ymm5
+ vmovdqa 0x60(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x100(%rsi), %ymm2
+ vmovdqa 0x120(%rsi), %ymm3
+ vmovdqa 0x100(%rdx), %ymm4
+ vmovdqa 0x120(%rdx), %ymm5
+ vmovdqa 0x80(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x140(%rsi), %ymm2
+ vmovdqa 0x160(%rsi), %ymm3
+ vmovdqa 0x140(%rdx), %ymm4
+ vmovdqa 0x160(%rdx), %ymm5
+ vmovdqa 0xa0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x180(%rsi), %ymm2
+ vmovdqa 0x1a0(%rsi), %ymm3
+ vmovdqa 0x180(%rdx), %ymm4
+ vmovdqa 0x1a0(%rdx), %ymm5
+ vmovdqa 0xc0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x1c0(%rsi), %ymm2
+ vmovdqa 0x1e0(%rsi), %ymm3
+ vmovdqa 0x1c0(%rdx), %ymm4
+ vmovdqa 0x1e0(%rdx), %ymm5
+ vmovdqa 0xe0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x200(%rsi), %ymm2
+ vmovdqa 0x220(%rsi), %ymm3
+ vmovdqa 0x200(%rdx), %ymm4
+ vmovdqa 0x220(%rdx), %ymm5
+ vmovdqa 0x100(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x240(%rsi), %ymm2
+ vmovdqa 0x260(%rsi), %ymm3
+ vmovdqa 0x240(%rdx), %ymm4
+ vmovdqa 0x260(%rdx), %ymm5
+ vmovdqa 0x120(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x280(%rsi), %ymm2
+ vmovdqa 0x2a0(%rsi), %ymm3
+ vmovdqa 0x280(%rdx), %ymm4
+ vmovdqa 0x2a0(%rdx), %ymm5
+ vmovdqa 0x140(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x2c0(%rsi), %ymm2
+ vmovdqa 0x2e0(%rsi), %ymm3
+ vmovdqa 0x2c0(%rdx), %ymm4
+ vmovdqa 0x2e0(%rdx), %ymm5
+ vmovdqa 0x160(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x300(%rsi), %ymm2
+ vmovdqa 0x320(%rsi), %ymm3
+ vmovdqa 0x300(%rdx), %ymm4
+ vmovdqa 0x320(%rdx), %ymm5
+ vmovdqa 0x180(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x340(%rsi), %ymm2
+ vmovdqa 0x360(%rsi), %ymm3
+ vmovdqa 0x340(%rdx), %ymm4
+ vmovdqa 0x360(%rdx), %ymm5
+ vmovdqa 0x1a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x380(%rsi), %ymm2
+ vmovdqa 0x3a0(%rsi), %ymm3
+ vmovdqa 0x380(%rdx), %ymm4
+ vmovdqa 0x3a0(%rdx), %ymm5
+ vmovdqa 0x1c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x3c0(%rsi), %ymm2
+ vmovdqa 0x3e0(%rsi), %ymm3
+ vmovdqa 0x3c0(%rdx), %ymm4
+ vmovdqa 0x3e0(%rdx), %ymm5
+ vmovdqa 0x1e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x400(%rsi), %ymm2
+ vmovdqa 0x420(%rsi), %ymm3
+ vmovdqa 0x400(%rdx), %ymm4
+ vmovdqa 0x420(%rdx), %ymm5
+ vmovdqa 0x200(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x440(%rsi), %ymm2
+ vmovdqa 0x460(%rsi), %ymm3
+ vmovdqa 0x440(%rdx), %ymm4
+ vmovdqa 0x460(%rdx), %ymm5
+ vmovdqa 0x220(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x480(%rsi), %ymm2
+ vmovdqa 0x4a0(%rsi), %ymm3
+ vmovdqa 0x480(%rdx), %ymm4
+ vmovdqa 0x4a0(%rdx), %ymm5
+ vmovdqa 0x240(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x4c0(%rsi), %ymm2
+ vmovdqa 0x4e0(%rsi), %ymm3
+ vmovdqa 0x4c0(%rdx), %ymm4
+ vmovdqa 0x4e0(%rdx), %ymm5
+ vmovdqa 0x260(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x500(%rsi), %ymm2
+ vmovdqa 0x520(%rsi), %ymm3
+ vmovdqa 0x500(%rdx), %ymm4
+ vmovdqa 0x520(%rdx), %ymm5
+ vmovdqa 0x280(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x540(%rsi), %ymm2
+ vmovdqa 0x560(%rsi), %ymm3
+ vmovdqa 0x540(%rdx), %ymm4
+ vmovdqa 0x560(%rdx), %ymm5
+ vmovdqa 0x2a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x580(%rsi), %ymm2
+ vmovdqa 0x5a0(%rsi), %ymm3
+ vmovdqa 0x580(%rdx), %ymm4
+ vmovdqa 0x5a0(%rdx), %ymm5
+ vmovdqa 0x2c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x5c0(%rsi), %ymm2
+ vmovdqa 0x5e0(%rsi), %ymm3
+ vmovdqa 0x5c0(%rdx), %ymm4
+ vmovdqa 0x5e0(%rdx), %ymm5
+ vmovdqa 0x2e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ vmovdqa 0x600(%rsi), %ymm2
+ vmovdqa 0x620(%rsi), %ymm3
+ vmovdqa 0x600(%rdx), %ymm4
+ vmovdqa 0x620(%rdx), %ymm5
+ vmovdqa 0x300(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa (%rdi), %ymm8
+ vmovdqa 0x20(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, (%rdi)
+ vmovdqa %ymm9, 0x20(%rdi)
+ vmovdqa 0x640(%rsi), %ymm2
+ vmovdqa 0x660(%rsi), %ymm3
+ vmovdqa 0x640(%rdx), %ymm4
+ vmovdqa 0x660(%rdx), %ymm5
+ vmovdqa 0x320(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x40(%rdi), %ymm8
+ vmovdqa 0x60(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x40(%rdi)
+ vmovdqa %ymm9, 0x60(%rdi)
+ vmovdqa 0x680(%rsi), %ymm2
+ vmovdqa 0x6a0(%rsi), %ymm3
+ vmovdqa 0x680(%rdx), %ymm4
+ vmovdqa 0x6a0(%rdx), %ymm5
+ vmovdqa 0x340(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x80(%rdi), %ymm8
+ vmovdqa 0xa0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm9, 0xa0(%rdi)
+ vmovdqa 0x6c0(%rsi), %ymm2
+ vmovdqa 0x6e0(%rsi), %ymm3
+ vmovdqa 0x6c0(%rdx), %ymm4
+ vmovdqa 0x6e0(%rdx), %ymm5
+ vmovdqa 0x360(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x700(%rsi), %ymm2
+ vmovdqa 0x720(%rsi), %ymm3
+ vmovdqa 0x700(%rdx), %ymm4
+ vmovdqa 0x720(%rdx), %ymm5
+ vmovdqa 0x380(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x100(%rdi), %ymm8
+ vmovdqa 0x120(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x100(%rdi)
+ vmovdqa %ymm9, 0x120(%rdi)
+ vmovdqa 0x740(%rsi), %ymm2
+ vmovdqa 0x760(%rsi), %ymm3
+ vmovdqa 0x740(%rdx), %ymm4
+ vmovdqa 0x760(%rdx), %ymm5
+ vmovdqa 0x3a0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x140(%rdi), %ymm8
+ vmovdqa 0x160(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x140(%rdi)
+ vmovdqa %ymm9, 0x160(%rdi)
+ vmovdqa 0x780(%rsi), %ymm2
+ vmovdqa 0x7a0(%rsi), %ymm3
+ vmovdqa 0x780(%rdx), %ymm4
+ vmovdqa 0x7a0(%rdx), %ymm5
+ vmovdqa 0x3c0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm8, %ymm13, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x180(%rdi), %ymm8
+ vmovdqa 0x1a0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm9, 0x1a0(%rdi)
+ vmovdqa 0x7c0(%rsi), %ymm2
+ vmovdqa 0x7e0(%rsi), %ymm3
+ vmovdqa 0x7c0(%rdx), %ymm4
+ vmovdqa 0x7e0(%rdx), %ymm5
+ vmovdqa 0x3e0(%rcx), %ymm6
+ vpmullw %ymm2, %ymm1, %ymm13
+ vpmullw %ymm3, %ymm1, %ymm14
+ vpmullw %ymm13, %ymm4, %ymm7
+ vpmullw %ymm13, %ymm5, %ymm9
+ vpmullw %ymm14, %ymm6, %ymm8
+ vpmullw %ymm14, %ymm4, %ymm10
+ vpmulhw %ymm7, %ymm0, %ymm7
+ vpmulhw %ymm9, %ymm0, %ymm9
+ vpmulhw %ymm8, %ymm0, %ymm8
+ vpmulhw %ymm10, %ymm0, %ymm10
+ vpmulhw %ymm2, %ymm4, %ymm11
+ vpmulhw %ymm2, %ymm5, %ymm12
+ vpmulhw %ymm3, %ymm6, %ymm13
+ vpmulhw %ymm3, %ymm4, %ymm14
+ vpsubw %ymm7, %ymm11, %ymm7
+ vpsubw %ymm9, %ymm12, %ymm9
+ vpsubw %ymm13, %ymm8, %ymm8
+ vpsubw %ymm10, %ymm14, %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpaddw %ymm7, %ymm8, %ymm7
+ vpaddw %ymm9, %ymm10, %ymm9
+ vmovdqa %ymm7, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_asm_k4)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ && (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/reduce.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/reduce.S
index e550738705..76a249298f 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/reduce.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/reduce.S
@@ -19,7 +19,8 @@
* Changes:
* - Add call to csub in reduce128_avx to produce outputs
* in [0,1,...,q-1] rather than [0,1,...,q], matching the
- * semantics of mlk_poly_reduce().
+ * semantics of mlk_poly_reduce(),
+ * - Use a macro instead of a local function call.
*/
#include "../../../common.h"
@@ -32,101 +33,186 @@
* dev/x86_64/src/reduce.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(reduce_avx2)
MLK_ASM_FN_SYMBOL(reduce_avx2)
- vmovdqa (%rsi), %ymm0
- vmovdqa 0x40(%rsi), %ymm1
- callq reduce_avx2_core
- addq $0x100, %rdi # imm = 0x100
- callq reduce_avx2_core
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x4ebf4ebf, %eax # imm = 0x4EBF4EBF
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ vmovdqa (%rdi), %ymm2
+ vmovdqa 0x20(%rdi), %ymm3
+ vmovdqa 0x40(%rdi), %ymm4
+ vmovdqa 0x60(%rdi), %ymm5
+ vmovdqa 0x80(%rdi), %ymm6
+ vmovdqa 0xa0(%rdi), %ymm7
+ vmovdqa 0xc0(%rdi), %ymm8
+ vmovdqa 0xe0(%rdi), %ymm9
+ vpmulhw %ymm1, %ymm2, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm2, %ymm2
+ vpmulhw %ymm1, %ymm3, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpmulhw %ymm1, %ymm4, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmulhw %ymm1, %ymm5, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm5, %ymm5
+ vpmulhw %ymm1, %ymm6, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpmulhw %ymm1, %ymm7, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vpmulhw %ymm1, %ymm8, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpmulhw %ymm1, %ymm9, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpsubw %ymm0, %ymm2, %ymm2
+ vpsraw $0xf, %ymm2, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm2, %ymm2
+ vpsubw %ymm0, %ymm3, %ymm3
+ vpsraw $0xf, %ymm3, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm3, %ymm3
+ vpsubw %ymm0, %ymm4, %ymm4
+ vpsraw $0xf, %ymm4, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm4, %ymm4
+ vpsubw %ymm0, %ymm5, %ymm5
+ vpsraw $0xf, %ymm5, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm5, %ymm5
+ vpsubw %ymm0, %ymm6, %ymm6
+ vpsraw $0xf, %ymm6, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm6, %ymm6
+ vpsubw %ymm0, %ymm7, %ymm7
+ vpsraw $0xf, %ymm7, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm7, %ymm7
+ vpsubw %ymm0, %ymm8, %ymm8
+ vpsraw $0xf, %ymm8, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm0, %ymm9, %ymm9
+ vpsraw $0xf, %ymm9, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm9, %ymm9
+ vmovdqa %ymm2, (%rdi)
+ vmovdqa %ymm3, 0x20(%rdi)
+ vmovdqa %ymm4, 0x40(%rdi)
+ vmovdqa %ymm5, 0x60(%rdi)
+ vmovdqa %ymm6, 0x80(%rdi)
+ vmovdqa %ymm7, 0xa0(%rdi)
+ vmovdqa %ymm8, 0xc0(%rdi)
+ vmovdqa %ymm9, 0xe0(%rdi)
+ vmovdqa 0x100(%rdi), %ymm2
+ vmovdqa 0x120(%rdi), %ymm3
+ vmovdqa 0x140(%rdi), %ymm4
+ vmovdqa 0x160(%rdi), %ymm5
+ vmovdqa 0x180(%rdi), %ymm6
+ vmovdqa 0x1a0(%rdi), %ymm7
+ vmovdqa 0x1c0(%rdi), %ymm8
+ vmovdqa 0x1e0(%rdi), %ymm9
+ vpmulhw %ymm1, %ymm2, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm2, %ymm2
+ vpmulhw %ymm1, %ymm3, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm3, %ymm3
+ vpmulhw %ymm1, %ymm4, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmulhw %ymm1, %ymm5, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm5, %ymm5
+ vpmulhw %ymm1, %ymm6, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm6, %ymm6
+ vpmulhw %ymm1, %ymm7, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm7, %ymm7
+ vpmulhw %ymm1, %ymm8, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm8, %ymm8
+ vpmulhw %ymm1, %ymm9, %ymm12
+ vpsraw $0xa, %ymm12, %ymm12
+ vpmullw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpsubw %ymm0, %ymm2, %ymm2
+ vpsraw $0xf, %ymm2, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm2, %ymm2
+ vpsubw %ymm0, %ymm3, %ymm3
+ vpsraw $0xf, %ymm3, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm3, %ymm3
+ vpsubw %ymm0, %ymm4, %ymm4
+ vpsraw $0xf, %ymm4, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm4, %ymm4
+ vpsubw %ymm0, %ymm5, %ymm5
+ vpsraw $0xf, %ymm5, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm5, %ymm5
+ vpsubw %ymm0, %ymm6, %ymm6
+ vpsraw $0xf, %ymm6, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm6, %ymm6
+ vpsubw %ymm0, %ymm7, %ymm7
+ vpsraw $0xf, %ymm7, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm7, %ymm7
+ vpsubw %ymm0, %ymm8, %ymm8
+ vpsraw $0xf, %ymm8, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm8, %ymm8
+ vpsubw %ymm0, %ymm9, %ymm9
+ vpsraw $0xf, %ymm9, %ymm12
+ vpand %ymm0, %ymm12, %ymm12
+ vpaddw %ymm12, %ymm9, %ymm9
+ vmovdqa %ymm2, 0x100(%rdi)
+ vmovdqa %ymm3, 0x120(%rdi)
+ vmovdqa %ymm4, 0x140(%rdi)
+ vmovdqa %ymm5, 0x160(%rdi)
+ vmovdqa %ymm6, 0x180(%rdi)
+ vmovdqa %ymm7, 0x1a0(%rdi)
+ vmovdqa %ymm8, 0x1c0(%rdi)
+ vmovdqa %ymm9, 0x1e0(%rdi)
retq
+ .cfi_endproc
-reduce_avx2_core:
- vmovdqa (%rdi), %ymm2
- vmovdqa 0x20(%rdi), %ymm3
- vmovdqa 0x40(%rdi), %ymm4
- vmovdqa 0x60(%rdi), %ymm5
- vmovdqa 0x80(%rdi), %ymm6
- vmovdqa 0xa0(%rdi), %ymm7
- vmovdqa 0xc0(%rdi), %ymm8
- vmovdqa 0xe0(%rdi), %ymm9
- vpmulhw %ymm1, %ymm2, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm2, %ymm2
- vpmulhw %ymm1, %ymm3, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm3, %ymm3
- vpmulhw %ymm1, %ymm4, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm4, %ymm4
- vpmulhw %ymm1, %ymm5, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm5, %ymm5
- vpmulhw %ymm1, %ymm6, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm6, %ymm6
- vpmulhw %ymm1, %ymm7, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm7, %ymm7
- vpmulhw %ymm1, %ymm8, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm8, %ymm8
- vpmulhw %ymm1, %ymm9, %ymm12
- vpsraw $0xa, %ymm12, %ymm12
- vpmullw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vpsubw %ymm0, %ymm2, %ymm2
- vpsraw $0xf, %ymm2, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm2, %ymm2
- vpsubw %ymm0, %ymm3, %ymm3
- vpsraw $0xf, %ymm3, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm3, %ymm3
- vpsubw %ymm0, %ymm4, %ymm4
- vpsraw $0xf, %ymm4, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm4, %ymm4
- vpsubw %ymm0, %ymm5, %ymm5
- vpsraw $0xf, %ymm5, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm5, %ymm5
- vpsubw %ymm0, %ymm6, %ymm6
- vpsraw $0xf, %ymm6, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm6, %ymm6
- vpsubw %ymm0, %ymm7, %ymm7
- vpsraw $0xf, %ymm7, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm7, %ymm7
- vpsubw %ymm0, %ymm8, %ymm8
- vpsraw $0xf, %ymm8, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm8, %ymm8
- vpsubw %ymm0, %ymm9, %ymm9
- vpsraw $0xf, %ymm9, %ymm12
- vpand %ymm0, %ymm12, %ymm12
- vpaddw %ymm12, %ymm9, %ymm9
- vmovdqa %ymm2, (%rdi)
- vmovdqa %ymm3, 0x20(%rdi)
- vmovdqa %ymm4, 0x40(%rdi)
- vmovdqa %ymm5, 0x60(%rdi)
- vmovdqa %ymm6, 0x80(%rdi)
- vmovdqa %ymm7, 0xa0(%rdi)
- vmovdqa %ymm8, 0xc0(%rdi)
- vmovdqa %ymm9, 0xe0(%rdi)
- retq
+MLK_ASM_FN_SIZE(reduce_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_asm.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_asm.S
new file mode 100644
index 0000000000..8cbc9ee1b6
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_asm.S
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*************************************************
+ * Name: mlk_rej_uniform_asm
+ *
+ * Description: Run rejection sampling on uniform random bytes to generate
+ * uniform random integers mod q
+ *
+ * Arguments: - int16_t *r: pointer to output buffer of MLKEM_N
+ * 16-bit coefficients.
+ * - const uint8_t *buf: pointer to input buffer
+ * (assumed to be uniform random bytes)
+ * - unsigned buflen: length of input buffer in bytes.
+ * Must be a multiple of 12.
+ *
+ * Returns number of sampled 16-bit integers (at most MLKEM_N).
+ **************************************************/
+#include "../../../common.h"
+
+#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
+ !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/rej_uniform_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.balign 4
+.global MLK_ASM_NAMESPACE(rej_uniform_asm)
+MLK_ASM_FN_SYMBOL(rej_uniform_asm)
+
+ .cfi_startproc
+ subq $0x210, %rsp # imm = 0x210
+ .cfi_adjust_cfa_offset 0x210
+ xorl %eax, %eax
+ testq %rdx, %rdx
+ je Lrej_uniform_asm_end
+ movabsq $0xd010d010d010d01, %rax # imm = 0xD010D010D010D01
+ movq %rax, %xmm0
+ pinsrq $0x1, %rax, %xmm0
+ movabsq $0xfff0fff0fff0fff, %rax # imm = 0xFFF0FFF0FFF0FFF
+ movq %rax, %xmm5
+ pinsrq $0x1, %rax, %xmm5
+ movabsq $0x504040302010100, %rax # imm = 0x504040302010100
+ movq %rax, %xmm4
+ movabsq $0xb0a0a0908070706, %rax # imm = 0xB0A0A0908070706
+ pinsrq $0x1, %rax, %xmm4
+ movq $0x0, %rax
+ movq $0x0, %r8
+ movq $0x5555, %r9 # imm = 0x5555
+
+Lrej_uniform_asm_loop_start:
+ movq (%rsi,%r8), %xmm2
+ pinsrd $0x2, 0x8(%rsi,%r8), %xmm2
+ pshufb %xmm4, %xmm2
+ movdqa %xmm2, %xmm3
+ psrlw $0x4, %xmm3
+ pblendw $0xaa, %xmm3, %xmm2 # xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+ pand %xmm5, %xmm2
+ movdqa %xmm0, %xmm1
+ pcmpgtw %xmm2, %xmm1
+ pmovmskb %xmm1, %r11d
+ pextq %r9, %r11, %r11
+ movq %r11, %r10
+ shlq $0x4, %r10
+ movdqu (%rcx,%r10), %xmm3
+ pshufb %xmm3, %xmm2
+ movdqu %xmm2, (%rsp,%rax,2)
+ popcntq %r11, %r11
+ addq %r11, %rax
+ cmpq $0x100, %rax # imm = 0x100
+ jae Lrej_uniform_asm_final_copy
+ addq $0xc, %r8
+ cmpq %r8, %rdx
+ ja Lrej_uniform_asm_loop_start
+
+Lrej_uniform_asm_final_copy:
+ movq $0x100, %rcx # imm = 0x100
+ cmpq $0x100, %rax # imm = 0x100
+ cmovaq %rcx, %rax
+ movq %rsp, %rsi
+ movq %rax, %rcx
+ shlq %rcx
+ rep movsb (%rsi), %es:(%rdi)
+
+Lrej_uniform_asm_end:
+ addq $0x210, %rsp # imm = 0x210
+ .cfi_adjust_cfa_offset -0x210
+ retq
+ .cfi_endproc
+
+MLK_ASM_FN_SIZE(rej_uniform_asm)
+
+#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+ */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_avx2.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_avx2.c
deleted file mode 100644
index 9c22e5403d..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_avx2.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/* References
- * ==========
- *
- * - [REF_AVX2]
- * CRYSTALS-Kyber optimized AVX2 implementation
- * Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
- * https://github.com/pq-crystals/kyber/tree/main/avx2
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-#include "../../../common.h"
-
-#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
- !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-
-#include
-#include
-#include
-#include "arith_native_x86_64.h"
-#include "consts.h"
-
-unsigned mlk_rej_uniform_avx2(int16_t *MLK_RESTRICT r, const uint8_t *buf)
-{
- unsigned ctr, pos;
- uint16_t val0, val1;
- uint32_t good;
- const __m256i bound =
- _mm256_load_si256(&mlk_qdata.vec[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ / 16]);
- const __m256i ones = _mm256_set1_epi8(1);
- const __m256i mask = _mm256_set1_epi16(0xFFF);
- const __m256i idx8 =
- _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 6, 5, 5, 4,
- 11, 10, 10, 9, 8, 7, 7, 6, 5, 4, 4, 3, 2, 1, 1, 0);
- __m256i f0, f1, g0, g1, g2, g3;
- __m128i f, t, pilo, pihi;
-
- ctr = pos = 0;
- while (ctr <= MLKEM_N - 32 && pos <= MLK_AVX2_REJ_UNIFORM_BUFLEN - 48)
- {
- f0 = _mm256_loadu_si256((__m256i *)&buf[pos]);
- /* Don't load from offset 24, as this would over-read the buffer */
- f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 16]);
- f0 = _mm256_permute4x64_epi64(f0, 0x94 /* 0b10010100 ~= (2,1,1,0) */);
- f1 = _mm256_permute4x64_epi64(f1, 0xe9 /* 0x11101001 ~= (3,2,2,1) */);
- f0 = _mm256_shuffle_epi8(f0, idx8);
- f1 = _mm256_shuffle_epi8(f1, idx8);
- g0 = _mm256_srli_epi16(f0, 4);
- g1 = _mm256_srli_epi16(f1, 4);
- f0 = _mm256_blend_epi16(f0, g0, 0xAA);
- f1 = _mm256_blend_epi16(f1, g1, 0xAA);
- f0 = _mm256_and_si256(f0, mask);
- f1 = _mm256_and_si256(f1, mask);
- pos += 48;
-
- g0 = _mm256_cmpgt_epi16(bound, f0);
- g1 = _mm256_cmpgt_epi16(bound, f1);
-
- g0 = _mm256_packs_epi16(g0, g1);
- good = _mm256_movemask_epi8(g0);
-
- g0 = _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)&mlk_rej_uniform_table[(good >> 0) & 0xFF]));
- g1 = _mm256_castsi128_si256(
- _mm_loadl_epi64((__m128i *)&mlk_rej_uniform_table[(good >> 8) & 0xFF]));
- g0 = _mm256_inserti128_si256(
- g0,
- _mm_loadl_epi64((__m128i *)&mlk_rej_uniform_table[(good >> 16) & 0xFF]),
- 1);
- g1 = _mm256_inserti128_si256(
- g1,
- _mm_loadl_epi64((__m128i *)&mlk_rej_uniform_table[(good >> 24) & 0xFF]),
- 1);
-
- g2 = _mm256_add_epi8(g0, ones);
- g3 = _mm256_add_epi8(g1, ones);
- g0 = _mm256_unpacklo_epi8(g0, g2);
- g1 = _mm256_unpacklo_epi8(g1, g3);
-
- f0 = _mm256_shuffle_epi8(f0, g0);
- f1 = _mm256_shuffle_epi8(f1, g1);
-
- _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0));
- ctr += _mm_popcnt_u32((good >> 0) & 0xFF);
- _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1));
- ctr += _mm_popcnt_u32((good >> 16) & 0xFF);
- _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1));
- ctr += _mm_popcnt_u32((good >> 8) & 0xFF);
- _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1));
- ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
- }
-
- while (ctr <= MLKEM_N - 8 && pos <= MLK_AVX2_REJ_UNIFORM_BUFLEN - 24)
- {
- f = _mm_loadu_si128((__m128i *)&buf[pos]);
- f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8));
- t = _mm_srli_epi16(f, 4);
- f = _mm_blend_epi16(f, t, 0xAA);
- f = _mm_and_si128(f, _mm256_castsi256_si128(mask));
- pos += 12;
-
- t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f);
- good = _mm_movemask_epi8(t);
-
- good = _pext_u32(good, 0x5555);
- pilo = _mm_loadl_epi64((__m128i *)&mlk_rej_uniform_table[good]);
-
- pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
- pilo = _mm_unpacklo_epi8(pilo, pihi);
- f = _mm_shuffle_epi8(f, pilo);
- _mm_storeu_si128((__m128i *)&r[ctr], f);
- ctr += _mm_popcnt_u32(good);
- }
-
- while (ctr < MLKEM_N && pos <= MLK_AVX2_REJ_UNIFORM_BUFLEN - 3)
- {
- val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
- val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4));
- pos += 3;
-
- if (val0 < MLKEM_Q)
- {
- r[ctr++] = val0;
- }
- if (val1 < MLKEM_Q && ctr < MLKEM_N)
- {
- r[ctr++] = val1;
- }
- }
-
- return ctr;
-}
-
-#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
- */
-
-MLK_EMPTY_CU(avx2_rej_uniform)
-
-#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
- !MLK_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c
index 5ab9a83179..5d5b25b866 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/rej_uniform_table.c
@@ -5,6 +5,7 @@
/*
* WARNING: This file is auto-generated from scripts/autogen
+ * in the mlkem-native repository.
* Do not modify it directly.
*/
@@ -13,142 +14,525 @@
#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
#include "arith_native_x86_64.h"
/*
* Lookup table used by rejection sampling of the public matrix.
* See autogen for details.
*/
-MLK_ALIGN const uint8_t mlk_rej_uniform_table[256][8] = {
- {-1, -1, -1, -1, -1, -1, -1, -1}, {0, -1, -1, -1, -1, -1, -1, -1},
- {2, -1, -1, -1, -1, -1, -1, -1}, {0, 2, -1, -1, -1, -1, -1, -1},
- {4, -1, -1, -1, -1, -1, -1, -1}, {0, 4, -1, -1, -1, -1, -1, -1},
- {2, 4, -1, -1, -1, -1, -1, -1}, {0, 2, 4, -1, -1, -1, -1, -1},
- {6, -1, -1, -1, -1, -1, -1, -1}, {0, 6, -1, -1, -1, -1, -1, -1},
- {2, 6, -1, -1, -1, -1, -1, -1}, {0, 2, 6, -1, -1, -1, -1, -1},
- {4, 6, -1, -1, -1, -1, -1, -1}, {0, 4, 6, -1, -1, -1, -1, -1},
- {2, 4, 6, -1, -1, -1, -1, -1}, {0, 2, 4, 6, -1, -1, -1, -1},
- {8, -1, -1, -1, -1, -1, -1, -1}, {0, 8, -1, -1, -1, -1, -1, -1},
- {2, 8, -1, -1, -1, -1, -1, -1}, {0, 2, 8, -1, -1, -1, -1, -1},
- {4, 8, -1, -1, -1, -1, -1, -1}, {0, 4, 8, -1, -1, -1, -1, -1},
- {2, 4, 8, -1, -1, -1, -1, -1}, {0, 2, 4, 8, -1, -1, -1, -1},
- {6, 8, -1, -1, -1, -1, -1, -1}, {0, 6, 8, -1, -1, -1, -1, -1},
- {2, 6, 8, -1, -1, -1, -1, -1}, {0, 2, 6, 8, -1, -1, -1, -1},
- {4, 6, 8, -1, -1, -1, -1, -1}, {0, 4, 6, 8, -1, -1, -1, -1},
- {2, 4, 6, 8, -1, -1, -1, -1}, {0, 2, 4, 6, 8, -1, -1, -1},
- {10, -1, -1, -1, -1, -1, -1, -1}, {0, 10, -1, -1, -1, -1, -1, -1},
- {2, 10, -1, -1, -1, -1, -1, -1}, {0, 2, 10, -1, -1, -1, -1, -1},
- {4, 10, -1, -1, -1, -1, -1, -1}, {0, 4, 10, -1, -1, -1, -1, -1},
- {2, 4, 10, -1, -1, -1, -1, -1}, {0, 2, 4, 10, -1, -1, -1, -1},
- {6, 10, -1, -1, -1, -1, -1, -1}, {0, 6, 10, -1, -1, -1, -1, -1},
- {2, 6, 10, -1, -1, -1, -1, -1}, {0, 2, 6, 10, -1, -1, -1, -1},
- {4, 6, 10, -1, -1, -1, -1, -1}, {0, 4, 6, 10, -1, -1, -1, -1},
- {2, 4, 6, 10, -1, -1, -1, -1}, {0, 2, 4, 6, 10, -1, -1, -1},
- {8, 10, -1, -1, -1, -1, -1, -1}, {0, 8, 10, -1, -1, -1, -1, -1},
- {2, 8, 10, -1, -1, -1, -1, -1}, {0, 2, 8, 10, -1, -1, -1, -1},
- {4, 8, 10, -1, -1, -1, -1, -1}, {0, 4, 8, 10, -1, -1, -1, -1},
- {2, 4, 8, 10, -1, -1, -1, -1}, {0, 2, 4, 8, 10, -1, -1, -1},
- {6, 8, 10, -1, -1, -1, -1, -1}, {0, 6, 8, 10, -1, -1, -1, -1},
- {2, 6, 8, 10, -1, -1, -1, -1}, {0, 2, 6, 8, 10, -1, -1, -1},
- {4, 6, 8, 10, -1, -1, -1, -1}, {0, 4, 6, 8, 10, -1, -1, -1},
- {2, 4, 6, 8, 10, -1, -1, -1}, {0, 2, 4, 6, 8, 10, -1, -1},
- {12, -1, -1, -1, -1, -1, -1, -1}, {0, 12, -1, -1, -1, -1, -1, -1},
- {2, 12, -1, -1, -1, -1, -1, -1}, {0, 2, 12, -1, -1, -1, -1, -1},
- {4, 12, -1, -1, -1, -1, -1, -1}, {0, 4, 12, -1, -1, -1, -1, -1},
- {2, 4, 12, -1, -1, -1, -1, -1}, {0, 2, 4, 12, -1, -1, -1, -1},
- {6, 12, -1, -1, -1, -1, -1, -1}, {0, 6, 12, -1, -1, -1, -1, -1},
- {2, 6, 12, -1, -1, -1, -1, -1}, {0, 2, 6, 12, -1, -1, -1, -1},
- {4, 6, 12, -1, -1, -1, -1, -1}, {0, 4, 6, 12, -1, -1, -1, -1},
- {2, 4, 6, 12, -1, -1, -1, -1}, {0, 2, 4, 6, 12, -1, -1, -1},
- {8, 12, -1, -1, -1, -1, -1, -1}, {0, 8, 12, -1, -1, -1, -1, -1},
- {2, 8, 12, -1, -1, -1, -1, -1}, {0, 2, 8, 12, -1, -1, -1, -1},
- {4, 8, 12, -1, -1, -1, -1, -1}, {0, 4, 8, 12, -1, -1, -1, -1},
- {2, 4, 8, 12, -1, -1, -1, -1}, {0, 2, 4, 8, 12, -1, -1, -1},
- {6, 8, 12, -1, -1, -1, -1, -1}, {0, 6, 8, 12, -1, -1, -1, -1},
- {2, 6, 8, 12, -1, -1, -1, -1}, {0, 2, 6, 8, 12, -1, -1, -1},
- {4, 6, 8, 12, -1, -1, -1, -1}, {0, 4, 6, 8, 12, -1, -1, -1},
- {2, 4, 6, 8, 12, -1, -1, -1}, {0, 2, 4, 6, 8, 12, -1, -1},
- {10, 12, -1, -1, -1, -1, -1, -1}, {0, 10, 12, -1, -1, -1, -1, -1},
- {2, 10, 12, -1, -1, -1, -1, -1}, {0, 2, 10, 12, -1, -1, -1, -1},
- {4, 10, 12, -1, -1, -1, -1, -1}, {0, 4, 10, 12, -1, -1, -1, -1},
- {2, 4, 10, 12, -1, -1, -1, -1}, {0, 2, 4, 10, 12, -1, -1, -1},
- {6, 10, 12, -1, -1, -1, -1, -1}, {0, 6, 10, 12, -1, -1, -1, -1},
- {2, 6, 10, 12, -1, -1, -1, -1}, {0, 2, 6, 10, 12, -1, -1, -1},
- {4, 6, 10, 12, -1, -1, -1, -1}, {0, 4, 6, 10, 12, -1, -1, -1},
- {2, 4, 6, 10, 12, -1, -1, -1}, {0, 2, 4, 6, 10, 12, -1, -1},
- {8, 10, 12, -1, -1, -1, -1, -1}, {0, 8, 10, 12, -1, -1, -1, -1},
- {2, 8, 10, 12, -1, -1, -1, -1}, {0, 2, 8, 10, 12, -1, -1, -1},
- {4, 8, 10, 12, -1, -1, -1, -1}, {0, 4, 8, 10, 12, -1, -1, -1},
- {2, 4, 8, 10, 12, -1, -1, -1}, {0, 2, 4, 8, 10, 12, -1, -1},
- {6, 8, 10, 12, -1, -1, -1, -1}, {0, 6, 8, 10, 12, -1, -1, -1},
- {2, 6, 8, 10, 12, -1, -1, -1}, {0, 2, 6, 8, 10, 12, -1, -1},
- {4, 6, 8, 10, 12, -1, -1, -1}, {0, 4, 6, 8, 10, 12, -1, -1},
- {2, 4, 6, 8, 10, 12, -1, -1}, {0, 2, 4, 6, 8, 10, 12, -1},
- {14, -1, -1, -1, -1, -1, -1, -1}, {0, 14, -1, -1, -1, -1, -1, -1},
- {2, 14, -1, -1, -1, -1, -1, -1}, {0, 2, 14, -1, -1, -1, -1, -1},
- {4, 14, -1, -1, -1, -1, -1, -1}, {0, 4, 14, -1, -1, -1, -1, -1},
- {2, 4, 14, -1, -1, -1, -1, -1}, {0, 2, 4, 14, -1, -1, -1, -1},
- {6, 14, -1, -1, -1, -1, -1, -1}, {0, 6, 14, -1, -1, -1, -1, -1},
- {2, 6, 14, -1, -1, -1, -1, -1}, {0, 2, 6, 14, -1, -1, -1, -1},
- {4, 6, 14, -1, -1, -1, -1, -1}, {0, 4, 6, 14, -1, -1, -1, -1},
- {2, 4, 6, 14, -1, -1, -1, -1}, {0, 2, 4, 6, 14, -1, -1, -1},
- {8, 14, -1, -1, -1, -1, -1, -1}, {0, 8, 14, -1, -1, -1, -1, -1},
- {2, 8, 14, -1, -1, -1, -1, -1}, {0, 2, 8, 14, -1, -1, -1, -1},
- {4, 8, 14, -1, -1, -1, -1, -1}, {0, 4, 8, 14, -1, -1, -1, -1},
- {2, 4, 8, 14, -1, -1, -1, -1}, {0, 2, 4, 8, 14, -1, -1, -1},
- {6, 8, 14, -1, -1, -1, -1, -1}, {0, 6, 8, 14, -1, -1, -1, -1},
- {2, 6, 8, 14, -1, -1, -1, -1}, {0, 2, 6, 8, 14, -1, -1, -1},
- {4, 6, 8, 14, -1, -1, -1, -1}, {0, 4, 6, 8, 14, -1, -1, -1},
- {2, 4, 6, 8, 14, -1, -1, -1}, {0, 2, 4, 6, 8, 14, -1, -1},
- {10, 14, -1, -1, -1, -1, -1, -1}, {0, 10, 14, -1, -1, -1, -1, -1},
- {2, 10, 14, -1, -1, -1, -1, -1}, {0, 2, 10, 14, -1, -1, -1, -1},
- {4, 10, 14, -1, -1, -1, -1, -1}, {0, 4, 10, 14, -1, -1, -1, -1},
- {2, 4, 10, 14, -1, -1, -1, -1}, {0, 2, 4, 10, 14, -1, -1, -1},
- {6, 10, 14, -1, -1, -1, -1, -1}, {0, 6, 10, 14, -1, -1, -1, -1},
- {2, 6, 10, 14, -1, -1, -1, -1}, {0, 2, 6, 10, 14, -1, -1, -1},
- {4, 6, 10, 14, -1, -1, -1, -1}, {0, 4, 6, 10, 14, -1, -1, -1},
- {2, 4, 6, 10, 14, -1, -1, -1}, {0, 2, 4, 6, 10, 14, -1, -1},
- {8, 10, 14, -1, -1, -1, -1, -1}, {0, 8, 10, 14, -1, -1, -1, -1},
- {2, 8, 10, 14, -1, -1, -1, -1}, {0, 2, 8, 10, 14, -1, -1, -1},
- {4, 8, 10, 14, -1, -1, -1, -1}, {0, 4, 8, 10, 14, -1, -1, -1},
- {2, 4, 8, 10, 14, -1, -1, -1}, {0, 2, 4, 8, 10, 14, -1, -1},
- {6, 8, 10, 14, -1, -1, -1, -1}, {0, 6, 8, 10, 14, -1, -1, -1},
- {2, 6, 8, 10, 14, -1, -1, -1}, {0, 2, 6, 8, 10, 14, -1, -1},
- {4, 6, 8, 10, 14, -1, -1, -1}, {0, 4, 6, 8, 10, 14, -1, -1},
- {2, 4, 6, 8, 10, 14, -1, -1}, {0, 2, 4, 6, 8, 10, 14, -1},
- {12, 14, -1, -1, -1, -1, -1, -1}, {0, 12, 14, -1, -1, -1, -1, -1},
- {2, 12, 14, -1, -1, -1, -1, -1}, {0, 2, 12, 14, -1, -1, -1, -1},
- {4, 12, 14, -1, -1, -1, -1, -1}, {0, 4, 12, 14, -1, -1, -1, -1},
- {2, 4, 12, 14, -1, -1, -1, -1}, {0, 2, 4, 12, 14, -1, -1, -1},
- {6, 12, 14, -1, -1, -1, -1, -1}, {0, 6, 12, 14, -1, -1, -1, -1},
- {2, 6, 12, 14, -1, -1, -1, -1}, {0, 2, 6, 12, 14, -1, -1, -1},
- {4, 6, 12, 14, -1, -1, -1, -1}, {0, 4, 6, 12, 14, -1, -1, -1},
- {2, 4, 6, 12, 14, -1, -1, -1}, {0, 2, 4, 6, 12, 14, -1, -1},
- {8, 12, 14, -1, -1, -1, -1, -1}, {0, 8, 12, 14, -1, -1, -1, -1},
- {2, 8, 12, 14, -1, -1, -1, -1}, {0, 2, 8, 12, 14, -1, -1, -1},
- {4, 8, 12, 14, -1, -1, -1, -1}, {0, 4, 8, 12, 14, -1, -1, -1},
- {2, 4, 8, 12, 14, -1, -1, -1}, {0, 2, 4, 8, 12, 14, -1, -1},
- {6, 8, 12, 14, -1, -1, -1, -1}, {0, 6, 8, 12, 14, -1, -1, -1},
- {2, 6, 8, 12, 14, -1, -1, -1}, {0, 2, 6, 8, 12, 14, -1, -1},
- {4, 6, 8, 12, 14, -1, -1, -1}, {0, 4, 6, 8, 12, 14, -1, -1},
- {2, 4, 6, 8, 12, 14, -1, -1}, {0, 2, 4, 6, 8, 12, 14, -1},
- {10, 12, 14, -1, -1, -1, -1, -1}, {0, 10, 12, 14, -1, -1, -1, -1},
- {2, 10, 12, 14, -1, -1, -1, -1}, {0, 2, 10, 12, 14, -1, -1, -1},
- {4, 10, 12, 14, -1, -1, -1, -1}, {0, 4, 10, 12, 14, -1, -1, -1},
- {2, 4, 10, 12, 14, -1, -1, -1}, {0, 2, 4, 10, 12, 14, -1, -1},
- {6, 10, 12, 14, -1, -1, -1, -1}, {0, 6, 10, 12, 14, -1, -1, -1},
- {2, 6, 10, 12, 14, -1, -1, -1}, {0, 2, 6, 10, 12, 14, -1, -1},
- {4, 6, 10, 12, 14, -1, -1, -1}, {0, 4, 6, 10, 12, 14, -1, -1},
- {2, 4, 6, 10, 12, 14, -1, -1}, {0, 2, 4, 6, 10, 12, 14, -1},
- {8, 10, 12, 14, -1, -1, -1, -1}, {0, 8, 10, 12, 14, -1, -1, -1},
- {2, 8, 10, 12, 14, -1, -1, -1}, {0, 2, 8, 10, 12, 14, -1, -1},
- {4, 8, 10, 12, 14, -1, -1, -1}, {0, 4, 8, 10, 12, 14, -1, -1},
- {2, 4, 8, 10, 12, 14, -1, -1}, {0, 2, 4, 8, 10, 12, 14, -1},
- {6, 8, 10, 12, 14, -1, -1, -1}, {0, 6, 8, 10, 12, 14, -1, -1},
- {2, 6, 8, 10, 12, 14, -1, -1}, {0, 2, 6, 8, 10, 12, 14, -1},
- {4, 6, 8, 10, 12, 14, -1, -1}, {0, 4, 6, 8, 10, 12, 14, -1},
- {2, 4, 6, 8, 10, 12, 14, -1}, {0, 2, 4, 6, 8, 10, 12, 14},
+MLK_ALIGN const uint8_t mlk_rej_uniform_table[] = {
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 0 */,
+ 0, 1, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 1 */,
+ 2, 3, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 2 */,
+ 0, 1, 2, 3, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 3 */,
+ 4, 5, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 4 */,
+ 0, 1, 4, 5, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 5 */,
+ 2, 3, 4, 5, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 6 */,
+ 0, 1, 2, 3, 4, 5, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 7 */,
+ 6, 7, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 8 */,
+ 0, 1, 6, 7, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 9 */,
+ 2, 3, 6, 7, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 10 */,
+ 0, 1, 2, 3, 6, 7, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 11 */,
+ 4, 5, 6, 7, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 12 */,
+ 0, 1, 4, 5, 6, 7, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 13 */,
+ 2, 3, 4, 5, 6, 7, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 14 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 15 */,
+ 8, 9, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 16 */,
+ 0, 1, 8, 9, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 17 */,
+ 2, 3, 8, 9, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 18 */,
+ 0, 1, 2, 3, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 19 */,
+ 4, 5, 8, 9, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 20 */,
+ 0, 1, 4, 5, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 21 */,
+ 2, 3, 4, 5, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 22 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 23 */,
+ 6, 7, 8, 9, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 24 */,
+ 0, 1, 6, 7, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 25 */,
+ 2, 3, 6, 7, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 26 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 27 */,
+ 4, 5, 6, 7, 8, 9, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 28 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 29 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 30 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 255, 255, 255, 255, 255, 255 /* 31 */,
+ 10, 11, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 32 */,
+ 0, 1, 10, 11, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 33 */,
+ 2, 3, 10, 11, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 34 */,
+ 0, 1, 2, 3, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 35 */,
+ 4, 5, 10, 11, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 36 */,
+ 0, 1, 4, 5, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 37 */,
+ 2, 3, 4, 5, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 38 */,
+ 0, 1, 2, 3, 4, 5, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 39 */,
+ 6, 7, 10, 11, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 40 */,
+ 0, 1, 6, 7, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 41 */,
+ 2, 3, 6, 7, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 42 */,
+ 0, 1, 2, 3, 6, 7, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 43 */,
+ 4, 5, 6, 7, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 44 */,
+ 0, 1, 4, 5, 6, 7, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 45 */,
+ 2, 3, 4, 5, 6, 7, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 46 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 10, 11, 255, 255, 255, 255, 255, 255 /* 47 */,
+ 8, 9, 10, 11, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 48 */,
+ 0, 1, 8, 9, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 49 */,
+ 2, 3, 8, 9, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 50 */,
+ 0, 1, 2, 3, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 51 */,
+ 4, 5, 8, 9, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 52 */,
+ 0, 1, 4, 5, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 53 */,
+ 2, 3, 4, 5, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 54 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 10, 11, 255, 255, 255, 255, 255, 255 /* 55 */,
+ 6, 7, 8, 9, 10, 11, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 56 */,
+ 0, 1, 6, 7, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 57 */,
+ 2, 3, 6, 7, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 58 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 10, 11, 255, 255, 255, 255, 255, 255 /* 59 */,
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 60 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 10, 11, 255, 255, 255, 255, 255, 255 /* 61 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 255, 255, 255, 255, 255, 255 /* 62 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 255, 255, 255, 255 /* 63 */,
+ 12, 13, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 64 */,
+ 0, 1, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 65 */,
+ 2, 3, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 66 */,
+ 0, 1, 2, 3, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 67 */,
+ 4, 5, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 68 */,
+ 0, 1, 4, 5, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 69 */,
+ 2, 3, 4, 5, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 70 */,
+ 0, 1, 2, 3, 4, 5, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 71 */,
+ 6, 7, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 72 */,
+ 0, 1, 6, 7, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 73 */,
+ 2, 3, 6, 7, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 74 */,
+ 0, 1, 2, 3, 6, 7, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 75 */,
+ 4, 5, 6, 7, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 76 */,
+ 0, 1, 4, 5, 6, 7, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 77 */,
+ 2, 3, 4, 5, 6, 7, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 78 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 79 */,
+ 8, 9, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 80 */,
+ 0, 1, 8, 9, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 81 */,
+ 2, 3, 8, 9, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 82 */,
+ 0, 1, 2, 3, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 83 */,
+ 4, 5, 8, 9, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 84 */,
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 85 */,
+ 2, 3, 4, 5, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 86 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 87 */,
+ 6, 7, 8, 9, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 88 */,
+ 0, 1, 6, 7, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 89 */,
+ 2, 3, 6, 7, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 90 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 91 */,
+ 4, 5, 6, 7, 8, 9, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 92 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 93 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 94 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 12, 13, 255, 255, 255, 255 /* 95 */,
+ 10, 11, 12, 13, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 96 */,
+ 0, 1, 10, 11, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 97 */,
+ 2, 3, 10, 11, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 98 */,
+ 0, 1, 2, 3, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 99 */,
+ 4, 5, 10, 11, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 100 */,
+ 0, 1, 4, 5, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 101 */,
+ 2, 3, 4, 5, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 102 */,
+ 0, 1, 2, 3, 4, 5, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 103 */,
+ 6, 7, 10, 11, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 104 */,
+ 0, 1, 6, 7, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 105 */,
+ 2, 3, 6, 7, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 106 */,
+ 0, 1, 2, 3, 6, 7, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 107 */,
+ 4, 5, 6, 7, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 108 */,
+ 0, 1, 4, 5, 6, 7, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 109 */,
+ 2, 3, 4, 5, 6, 7, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 110 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 10, 11, 12, 13, 255, 255, 255, 255 /* 111 */,
+ 8, 9, 10, 11, 12, 13, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 112 */,
+ 0, 1, 8, 9, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 113 */,
+ 2, 3, 8, 9, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 114 */,
+ 0, 1, 2, 3, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 115 */,
+ 4, 5, 8, 9, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 116 */,
+ 0, 1, 4, 5, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 117 */,
+ 2, 3, 4, 5, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 118 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 10, 11, 12, 13, 255, 255, 255, 255 /* 119 */,
+ 6, 7, 8, 9, 10, 11, 12, 13,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 120 */,
+ 0, 1, 6, 7, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 121 */,
+ 2, 3, 6, 7, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 122 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 10, 11, 12, 13, 255, 255, 255, 255 /* 123 */,
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 255, 255, 255, 255, 255, 255 /* 124 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 255, 255, 255, 255 /* 125 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 255, 255, 255, 255 /* 126 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 255, 255 /* 127 */,
+ 14, 15, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 128 */,
+ 0, 1, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 129 */,
+ 2, 3, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 130 */,
+ 0, 1, 2, 3, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 131 */,
+ 4, 5, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 132 */,
+ 0, 1, 4, 5, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 133 */,
+ 2, 3, 4, 5, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 134 */,
+ 0, 1, 2, 3, 4, 5, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 135 */,
+ 6, 7, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 136 */,
+ 0, 1, 6, 7, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 137 */,
+ 2, 3, 6, 7, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 138 */,
+ 0, 1, 2, 3, 6, 7, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 139 */,
+ 4, 5, 6, 7, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 140 */,
+ 0, 1, 4, 5, 6, 7, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 141 */,
+ 2, 3, 4, 5, 6, 7, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 142 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 143 */,
+ 8, 9, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 144 */,
+ 0, 1, 8, 9, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 145 */,
+ 2, 3, 8, 9, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 146 */,
+ 0, 1, 2, 3, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 147 */,
+ 4, 5, 8, 9, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 148 */,
+ 0, 1, 4, 5, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 149 */,
+ 2, 3, 4, 5, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 150 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 151 */,
+ 6, 7, 8, 9, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 152 */,
+ 0, 1, 6, 7, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 153 */,
+ 2, 3, 6, 7, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 154 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 155 */,
+ 4, 5, 6, 7, 8, 9, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 156 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 157 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 158 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 14, 15, 255, 255, 255, 255 /* 159 */,
+ 10, 11, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 160 */,
+ 0, 1, 10, 11, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 161 */,
+ 2, 3, 10, 11, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 162 */,
+ 0, 1, 2, 3, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 163 */,
+ 4, 5, 10, 11, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 164 */,
+ 0, 1, 4, 5, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 165 */,
+ 2, 3, 4, 5, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 166 */,
+ 0, 1, 2, 3, 4, 5, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 167 */,
+ 6, 7, 10, 11, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 168 */,
+ 0, 1, 6, 7, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 169 */,
+ 2, 3, 6, 7, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 170 */,
+ 0, 1, 2, 3, 6, 7, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 171 */,
+ 4, 5, 6, 7, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 172 */,
+ 0, 1, 4, 5, 6, 7, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 173 */,
+ 2, 3, 4, 5, 6, 7, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 174 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 10, 11, 14, 15, 255, 255, 255, 255 /* 175 */,
+ 8, 9, 10, 11, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 176 */,
+ 0, 1, 8, 9, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 177 */,
+ 2, 3, 8, 9, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 178 */,
+ 0, 1, 2, 3, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 179 */,
+ 4, 5, 8, 9, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 180 */,
+ 0, 1, 4, 5, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 181 */,
+ 2, 3, 4, 5, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 182 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 10, 11, 14, 15, 255, 255, 255, 255 /* 183 */,
+ 6, 7, 8, 9, 10, 11, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 184 */,
+ 0, 1, 6, 7, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 185 */,
+ 2, 3, 6, 7, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 186 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 10, 11, 14, 15, 255, 255, 255, 255 /* 187 */,
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 188 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 10, 11, 14, 15, 255, 255, 255, 255 /* 189 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 14, 15, 255, 255, 255, 255 /* 190 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 14, 15, 255, 255 /* 191 */,
+ 12, 13, 14, 15, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 192 */,
+ 0, 1, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 193 */,
+ 2, 3, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 194 */,
+ 0, 1, 2, 3, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 195 */,
+ 4, 5, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 196 */,
+ 0, 1, 4, 5, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 197 */,
+ 2, 3, 4, 5, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 198 */,
+ 0, 1, 2, 3, 4, 5, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 199 */,
+ 6, 7, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 200 */,
+ 0, 1, 6, 7, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 201 */,
+ 2, 3, 6, 7, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 202 */,
+ 0, 1, 2, 3, 6, 7, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 203 */,
+ 4, 5, 6, 7, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 204 */,
+ 0, 1, 4, 5, 6, 7, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 205 */,
+ 2, 3, 4, 5, 6, 7, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 206 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 207 */,
+ 8, 9, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 208 */,
+ 0, 1, 8, 9, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 209 */,
+ 2, 3, 8, 9, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 210 */,
+ 0, 1, 2, 3, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 211 */,
+ 4, 5, 8, 9, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 212 */,
+ 0, 1, 4, 5, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 213 */,
+ 2, 3, 4, 5, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 214 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 215 */,
+ 6, 7, 8, 9, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 216 */,
+ 0, 1, 6, 7, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 217 */,
+ 2, 3, 6, 7, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 218 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 219 */,
+ 4, 5, 6, 7, 8, 9, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 220 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 221 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 222 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 12, 13, 14, 15, 255, 255 /* 223 */,
+ 10, 11, 12, 13, 14, 15, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 224 */,
+ 0, 1, 10, 11, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 225 */,
+ 2, 3, 10, 11, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 226 */,
+ 0, 1, 2, 3, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 227 */,
+ 4, 5, 10, 11, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 228 */,
+ 0, 1, 4, 5, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 229 */,
+ 2, 3, 4, 5, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 230 */,
+ 0, 1, 2, 3, 4, 5, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 231 */,
+ 6, 7, 10, 11, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 232 */,
+ 0, 1, 6, 7, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 233 */,
+ 2, 3, 6, 7, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 234 */,
+ 0, 1, 2, 3, 6, 7, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 235 */,
+ 4, 5, 6, 7, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 236 */,
+ 0, 1, 4, 5, 6, 7, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 237 */,
+ 2, 3, 4, 5, 6, 7, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 238 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 10, 11, 12, 13, 14, 15, 255, 255 /* 239 */,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 255, 255, 255, 255, 255, 255, 255, 255 /* 240 */,
+ 0, 1, 8, 9, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 241 */,
+ 2, 3, 8, 9, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 242 */,
+ 0, 1, 2, 3, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 243 */,
+ 4, 5, 8, 9, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 244 */,
+ 0, 1, 4, 5, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 245 */,
+ 2, 3, 4, 5, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 246 */,
+ 0, 1, 2, 3, 4, 5, 8, 9,
+ 10, 11, 12, 13, 14, 15, 255, 255 /* 247 */,
+ 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 255, 255, 255, 255, 255, 255 /* 248 */,
+ 0, 1, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 249 */,
+ 2, 3, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 250 */,
+ 0, 1, 2, 3, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 255, 255 /* 251 */,
+ 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 255, 255, 255, 255 /* 252 */,
+ 0, 1, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 255, 255 /* 253 */,
+ 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 255, 255 /* 254 */,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15 /* 255 */,
};
#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/shuffle.inc b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/shuffle.inc
deleted file mode 100644
index c03a82bdb5..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/shuffle.inc
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/*
- * This file is derived from the public domain
- * AVX2 Kyber implementation @[REF_AVX2].
- */
-
-.macro shuffle8 r0,r1,r2,r3
-vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
-vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle4 r0,r1,r2,r3
-vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
-vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-/* Shuffle r0=(a0,b0,c0,d0,...), r1=(a1,b1,c1,d1,...) into */
-/* r2 = (a0,b0,a1,b1,e0,f0,e1,f1,...) */
-/* r3 = (c0,d0,c1,d1,g0,h0,g1,h1,...) */
-.macro shuffle2 r0,r1,r2,r3
-/* r2=(a1,b1,a1,b1,e1,f1,e1,f1,...) */
-vmovsldup %ymm\r1,%ymm\r2
-/* Conditional move */
-/* 0xAA = 0b10101010 */
-/* r2=(a0,b0,a1,b1,e0,f0,e1,f1,...) */
-vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-/* r0=(c0,d0,0,0,g0,h0,0,0,...) */
-vpsrlq $32,%ymm\r0,%ymm\r0
-/* r3=(c0,d0,c1,d1,g0,h0,g1,h1,...) */
-vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
-
-.macro shuffle1 r0,r1,r2,r3
-vpslld $16,%ymm\r1,%ymm\r2
-vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-vpsrld $16,%ymm\r0,%ymm\r0
-vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
-.endm
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/tomont.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/tomont.S
index 13e45bdc63..8d7403c22a 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/tomont.S
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/tomont.S
@@ -18,7 +18,8 @@
* Changes:
* - Add call to csub in reduce128_avx to produce outputs
* in [0,1,...,q-1] rather than [0,1,...,q], matching the
- * semantics of mlk_poly_reduce().
+ * semantics of mlk_poly_reduce(),
+ * - Use a macro instead of a local function call.
*/
#include "../../../common.h"
@@ -30,70 +31,125 @@
* dev/x86_64/src/tomont.S using scripts/simpasm. Do not modify it directly.
*/
+#if defined(__ELF__)
+.section .note.GNU-stack,"",@progbits
+#endif
.text
.balign 4
.global MLK_ASM_NAMESPACE(tomont_avx2)
MLK_ASM_FN_SYMBOL(tomont_avx2)
- vmovdqa (%rsi), %ymm0
- vmovdqa 0xa0(%rsi), %ymm1
- vmovdqa 0xc0(%rsi), %ymm2
- callq tomont_avx2_core
- addq $0x100, %rdi # imm = 0x100
- callq tomont_avx2_core
+ .cfi_startproc
+ movl $0xd010d01, %eax # imm = 0xD010D01
+ vmovd %eax, %xmm0
+ vpbroadcastd %xmm0, %ymm0
+ movl $0x50495049, %eax # imm = 0x50495049
+ vmovd %eax, %xmm1
+ vpbroadcastd %xmm1, %ymm1
+ movl $0x5490549, %eax # imm = 0x5490549
+ vmovd %eax, %xmm2
+ vpbroadcastd %xmm2, %ymm2
+ vmovdqa (%rdi), %ymm3
+ vmovdqa 0x20(%rdi), %ymm4
+ vmovdqa 0x40(%rdi), %ymm5
+ vmovdqa 0x60(%rdi), %ymm6
+ vmovdqa 0x80(%rdi), %ymm7
+ vmovdqa 0xa0(%rdi), %ymm8
+ vmovdqa 0xc0(%rdi), %ymm9
+ vmovdqa 0xe0(%rdi), %ymm10
+ vpmullw %ymm1, %ymm3, %ymm11
+ vpmulhw %ymm2, %ymm3, %ymm3
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm11, %ymm3, %ymm3
+ vpmullw %ymm1, %ymm4, %ymm12
+ vpmulhw %ymm2, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmullw %ymm1, %ymm5, %ymm13
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpsubw %ymm13, %ymm5, %ymm5
+ vpmullw %ymm1, %ymm6, %ymm14
+ vpmulhw %ymm2, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpsubw %ymm14, %ymm6, %ymm6
+ vpmullw %ymm1, %ymm7, %ymm15
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpsubw %ymm15, %ymm7, %ymm7
+ vpmullw %ymm1, %ymm8, %ymm11
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm11, %ymm8, %ymm8
+ vpmullw %ymm1, %ymm9, %ymm12
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpmullw %ymm1, %ymm10, %ymm13
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpsubw %ymm13, %ymm10, %ymm10
+ vmovdqa %ymm3, (%rdi)
+ vmovdqa %ymm4, 0x20(%rdi)
+ vmovdqa %ymm5, 0x40(%rdi)
+ vmovdqa %ymm6, 0x60(%rdi)
+ vmovdqa %ymm7, 0x80(%rdi)
+ vmovdqa %ymm8, 0xa0(%rdi)
+ vmovdqa %ymm9, 0xc0(%rdi)
+ vmovdqa %ymm10, 0xe0(%rdi)
+ vmovdqa 0x100(%rdi), %ymm3
+ vmovdqa 0x120(%rdi), %ymm4
+ vmovdqa 0x140(%rdi), %ymm5
+ vmovdqa 0x160(%rdi), %ymm6
+ vmovdqa 0x180(%rdi), %ymm7
+ vmovdqa 0x1a0(%rdi), %ymm8
+ vmovdqa 0x1c0(%rdi), %ymm9
+ vmovdqa 0x1e0(%rdi), %ymm10
+ vpmullw %ymm1, %ymm3, %ymm11
+ vpmulhw %ymm2, %ymm3, %ymm3
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm11, %ymm3, %ymm3
+ vpmullw %ymm1, %ymm4, %ymm12
+ vpmulhw %ymm2, %ymm4, %ymm4
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm4, %ymm4
+ vpmullw %ymm1, %ymm5, %ymm13
+ vpmulhw %ymm2, %ymm5, %ymm5
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpsubw %ymm13, %ymm5, %ymm5
+ vpmullw %ymm1, %ymm6, %ymm14
+ vpmulhw %ymm2, %ymm6, %ymm6
+ vpmulhw %ymm0, %ymm14, %ymm14
+ vpsubw %ymm14, %ymm6, %ymm6
+ vpmullw %ymm1, %ymm7, %ymm15
+ vpmulhw %ymm2, %ymm7, %ymm7
+ vpmulhw %ymm0, %ymm15, %ymm15
+ vpsubw %ymm15, %ymm7, %ymm7
+ vpmullw %ymm1, %ymm8, %ymm11
+ vpmulhw %ymm2, %ymm8, %ymm8
+ vpmulhw %ymm0, %ymm11, %ymm11
+ vpsubw %ymm11, %ymm8, %ymm8
+ vpmullw %ymm1, %ymm9, %ymm12
+ vpmulhw %ymm2, %ymm9, %ymm9
+ vpmulhw %ymm0, %ymm12, %ymm12
+ vpsubw %ymm12, %ymm9, %ymm9
+ vpmullw %ymm1, %ymm10, %ymm13
+ vpmulhw %ymm2, %ymm10, %ymm10
+ vpmulhw %ymm0, %ymm13, %ymm13
+ vpsubw %ymm13, %ymm10, %ymm10
+ vmovdqa %ymm3, 0x100(%rdi)
+ vmovdqa %ymm4, 0x120(%rdi)
+ vmovdqa %ymm5, 0x140(%rdi)
+ vmovdqa %ymm6, 0x160(%rdi)
+ vmovdqa %ymm7, 0x180(%rdi)
+ vmovdqa %ymm8, 0x1a0(%rdi)
+ vmovdqa %ymm9, 0x1c0(%rdi)
+ vmovdqa %ymm10, 0x1e0(%rdi)
retq
+ .cfi_endproc
-tomont_avx2_core:
- vmovdqa (%rdi), %ymm3
- vmovdqa 0x20(%rdi), %ymm4
- vmovdqa 0x40(%rdi), %ymm5
- vmovdqa 0x60(%rdi), %ymm6
- vmovdqa 0x80(%rdi), %ymm7
- vmovdqa 0xa0(%rdi), %ymm8
- vmovdqa 0xc0(%rdi), %ymm9
- vmovdqa 0xe0(%rdi), %ymm10
- vpmullw %ymm1, %ymm3, %ymm11
- vpmulhw %ymm2, %ymm3, %ymm3
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm11, %ymm3, %ymm3
- vpmullw %ymm1, %ymm4, %ymm12
- vpmulhw %ymm2, %ymm4, %ymm4
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm4, %ymm4
- vpmullw %ymm1, %ymm5, %ymm13
- vpmulhw %ymm2, %ymm5, %ymm5
- vpmulhw %ymm0, %ymm13, %ymm13
- vpsubw %ymm13, %ymm5, %ymm5
- vpmullw %ymm1, %ymm6, %ymm14
- vpmulhw %ymm2, %ymm6, %ymm6
- vpmulhw %ymm0, %ymm14, %ymm14
- vpsubw %ymm14, %ymm6, %ymm6
- vpmullw %ymm1, %ymm7, %ymm15
- vpmulhw %ymm2, %ymm7, %ymm7
- vpmulhw %ymm0, %ymm15, %ymm15
- vpsubw %ymm15, %ymm7, %ymm7
- vpmullw %ymm1, %ymm8, %ymm11
- vpmulhw %ymm2, %ymm8, %ymm8
- vpmulhw %ymm0, %ymm11, %ymm11
- vpsubw %ymm11, %ymm8, %ymm8
- vpmullw %ymm1, %ymm9, %ymm12
- vpmulhw %ymm2, %ymm9, %ymm9
- vpmulhw %ymm0, %ymm12, %ymm12
- vpsubw %ymm12, %ymm9, %ymm9
- vpmullw %ymm1, %ymm10, %ymm13
- vpmulhw %ymm2, %ymm10, %ymm10
- vpmulhw %ymm0, %ymm13, %ymm13
- vpsubw %ymm13, %ymm10, %ymm10
- vmovdqa %ymm3, (%rdi)
- vmovdqa %ymm4, 0x20(%rdi)
- vmovdqa %ymm5, 0x40(%rdi)
- vmovdqa %ymm6, 0x60(%rdi)
- vmovdqa %ymm7, 0x80(%rdi)
- vmovdqa %ymm8, 0xa0(%rdi)
- vmovdqa %ymm9, 0xc0(%rdi)
- vmovdqa %ymm10, 0xe0(%rdi)
- retq
+MLK_ASM_FN_SIZE(tomont_avx2)
#endif /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/x86_64_mulcache_twiddles.i b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/x86_64_mulcache_twiddles.i
deleted file mode 100644
index 51aeb01122..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/x86_64_mulcache_twiddles.i
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/*
- * WARNING: This file is auto-generated from scripts/autogen
- * Do not modify it directly.
- */
-
-/*
- * Table of twiddle values used in the AVX2 mulcache
- * See autogen for details.
- */
-
-- 1103,
- 555, -1251, 1550, 422, 177, -291, 1574, -246, 1159, -777, -602, -1590, -872, 418, -156, 430,
- 843, 871, 105, 587, -235, -460, 1653, 778, -147, 1483, 1119, 644, 349, 329, -75, 817, 603, 1322,
- -1465, -1215, 1218, -874, -1187, -1185, -1278, -1510, -870, -108, 996, 958, 1522, 1097, 610,
- -1285, 384, -136, -1335, 220, -1659, -1530, 794, -854, 478, -308, 991, -1460, 1628, -335,
- -11477, -32227, 20494, -27738, 945, -14883, 6182, 32010, 10631, 29175, -28762, -18486, 17560,
- -14430, -5276, 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, -32502, 30317, -18741,
- 12639, 20100, 18525, 19529, -12619, -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989,
- 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, 20297, 2146, 15355, -32384, -6280,
- -14903, -11044, 14469, -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/x86_64_zetas.i b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/x86_64_zetas.i
deleted file mode 100644
index c93ae01433..0000000000
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/native/x86_64/src/x86_64_zetas.i
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) The mlkem-native project authors
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
- */
-
-/*
- * WARNING: This file is auto-generated from scripts/autogen
- * Do not modify it directly.
- */
-
-/*
- * Table of zeta values used in the AVX2 NTTs
- * See autogen for details.
- */
-
-31498, 31498, 31498, 31498, -758, -758, -758, -758, 0, 0, 0, 0, 0, 0, 0, 0,
- 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
- 14745, 14745, 14745, 14745, 14745, -359, -359, -359, -359, -359, -359, -359,
- -359, -359, -359, -359, -359, -359, -359, -359, -359, 13525, 13525, 13525,
- 13525, 13525, 13525, 13525, 13525, -12402, -12402, -12402, -12402, -12402,
- -12402, -12402, -12402, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493,
- 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, -20907, -20907, -20907,
- -20907, 27758, 27758, 27758, 27758, -3799, -3799, -3799, -3799, -15690,
- -15690, -15690, -15690, -171, -171, -171, -171, 622, 622, 622, 622, 1577,
- 1577, 1577, 1577, 182, 182, 182, 182, -5827, -5827, 17363, 17363, -26360,
- -26360, -29057, -29057, 5571, 5571, -1102, -1102, 21438, 21438, -26242,
- -26242, 573, 573, -1325, -1325, 264, 264, 383, 383, -829, -829, 1458, 1458,
- -1602, -1602, -130, -130, -5689, -6516, 1496, 30967, -23565, 20179, 20710,
- 25080, -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, 1223, 652,
- -552, 1015, -1293, 1491, -282, -1544, 516, -8, -320, -666, -1618, -1162,
- 126, 1469, -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, 32010,
- 10631, 29175, -28762, -18486, 17560, -14430, -5276, -1103, 555, -1251, 1550,
- 422, 177, -291, 1574, -246, 1159, -777, -602, -1590, -872, 418, -156, 11182,
- 13387, -14233, -21655, 13131, -4587, 23092, 5493, -32502, 30317, -18741,
- 12639, 20100, 18525, 19529, -12619, 430, 843, 871, 105, 587, -235, -460,
- 1653, 778, -147, 1483, 1119, 644, 349, 329, -75, 787, 787, 787, 787, 787,
- 787, 787, 787, 787, 787, 787, 787, 787, 787, 787, 787, -1517, -1517, -1517,
- -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
- -1517, -1517, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191,
- -16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, 287, 287,
- 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690,
- 10690, 10690, 10690, 1358, 1358, 1358, 1358, -11202, -11202, -11202, -11202,
- 31164, 31164, 31164, 31164, 962, 962, 962, 962, -1202, -1202, -1202, -1202,
- -1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, -28073, -28073, 24313,
- 24313, -10532, -10532, 8800, 8800, 18426, 18426, 8859, 8859, 26675, 26675,
- -16163, -16163, -681, -681, 1017, 1017, 732, 732, 608, 608, -1542, -1542,
- 411, 411, -205, -205, -1571, -1571, 19883, -28250, -15887, -8898, -28309,
- 9075, -30199, 18249, 13426, 14017, -29156, -12757, 16832, 4311, -24155,
- -17915, -853, -90, -271, 830, 107, -1421, -247, -951, -398, 961, -1508,
- -725, 448, -1065, 677, -1275, -31183, 25435, -7382, 24391, -20927, 10946,
- 24214, 16989, 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, 817,
- 603, 1322, -1465, -1215, 1218, -874, -1187, -1185, -1278, -1510, -870, -108,
- 996, 958, 1522, 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469,
- -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, 1097, 610,
- -1285, 384, -136, -1335, 220, -1659, -1530, 794, -854, 478, -308, 991,
- -1460, 1628,
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/params.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/params.h
index 3f81bb0e2e..04598539c4 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/params.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/params.h
@@ -5,12 +5,6 @@
#ifndef MLK_PARAMS_H
#define MLK_PARAMS_H
-#if defined(MLK_CONFIG_FILE)
-#include MLK_CONFIG_FILE
-#else
-#include "config.h"
-#endif
-
#if !defined(MLK_CONFIG_PARAMETER_SET)
#error MLK_CONFIG_PARAMETER_SET is not defined
#endif
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly.c
index 40d29948c8..564d5d712b 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly.c
@@ -20,8 +20,7 @@
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
-#include
-#include
+
#include "cbmc.h"
#include "debug.h"
#include "poly.h"
@@ -29,9 +28,6 @@
#include "symmetric.h"
#include "verify.h"
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT) || \
- !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_fqmul
*
@@ -68,10 +64,7 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_TOMONT || !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE \
- || !MLK_USE_NATIVE_NTT || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE) || !defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_barrett_reduce
*
@@ -107,7 +100,7 @@ __contract__(
* Here, we assume it's sign-preserving "arithmetic" shift right.
* See (C99 6.5.7 (5))
*/
- const int32_t t = (magic * a + (1 << 25)) >> 26;
+ const int32_t t = (magic * a + ((int32_t)1 << 25)) >> 26;
/*
* t is in -10 .. +10, so we need 32-bit math to
@@ -118,12 +111,14 @@ __contract__(
mlk_assert_abs_bound(&res, 1, MLKEM_Q_HALF);
return res;
}
-#endif /* !MLK_USE_NATIVE_POLY_REDUCE || !MLK_USE_NATIVE_INTT */
-#if !defined(MLK_USE_NATIVE_POLY_TOMONT)
/* Reference: `poly_tomont()` in the reference implementation @[REF]. */
-MLK_INTERNAL_API
-void mlk_poly_tomont(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_tomont_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
+)
{
unsigned i;
const int16_t f = 1353; /* check-magic: 1353 == signed_mod(2^32, MLKEM_Q) */
@@ -137,16 +132,23 @@ void mlk_poly_tomont(mlk_poly *r)
mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_TOMONT */
+
MLK_INTERNAL_API
void mlk_poly_tomont(mlk_poly *r)
{
- mlk_poly_tomont_native(r->coeffs);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_TOMONT)
+ int ret;
+ ret = mlk_poly_tomont_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
-#if !defined(MLK_USE_NATIVE_POLY_REDUCE)
+ mlk_poly_tomont_c(r);
+}
+
/************************************************************
* Name: mlk_scalar_signed_to_unsigned_q
*
@@ -162,7 +164,7 @@ void mlk_poly_tomont(mlk_poly *r)
* - Used here to implement different semantics of `poly_reduce()`;
* see below. in the reference implementation @[REF], this logic is
* part of all compression functions (see `compress.c`). */
-static MLK_INLINE uint16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
+static MLK_INLINE int16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
__contract__(
requires(c > -MLKEM_Q && c < MLKEM_Q)
ensures(return_value >= 0 && return_value < MLKEM_Q)
@@ -170,12 +172,14 @@ __contract__(
{
mlk_assert_abs_bound(&c, 1, MLKEM_Q);
- /* Add Q if c is negative, but in constant time */
- c = mlk_ct_sel_int16(c + MLKEM_Q, c, mlk_ct_cmask_neg_i16(c));
+ /* Add MLKEM_Q if c is negative, but in constant time.
+ *
+ * Note that c + MLKEM_Q does not overflow in int16_t,
+ * so the cast to uint16_t is safe. */
+ c = mlk_ct_sel_int16((int16_t)(c + MLKEM_Q), c, mlk_ct_cmask_neg_i16(c));
- /* and therefore cast to uint16_t is safe. */
mlk_assert_bound(&c, 1, 0, MLKEM_Q);
- return (uint16_t)c;
+ return c;
}
/* Reference: `poly_reduce()` in the reference implementation @[REF]
@@ -185,10 +189,15 @@ __contract__(
* here to go from signed to unsigned representatives.
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
-MLK_INTERNAL_API
-void mlk_poly_reduce(mlk_poly *r)
+MLK_STATIC_TESTABLE void mlk_poly_reduce_c(mlk_poly *r)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+ ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+)
{
unsigned i;
+
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
@@ -202,15 +211,23 @@ void mlk_poly_reduce(mlk_poly *r)
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_REDUCE */
+
MLK_INTERNAL_API
void mlk_poly_reduce(mlk_poly *r)
{
- mlk_poly_reduce_native(r->coeffs);
- mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
-}
+#if defined(MLK_USE_NATIVE_POLY_REDUCE)
+ int ret;
+ ret = mlk_poly_reduce_native(r->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
+ mlk_poly_reduce_c(r);
+}
+
/* Reference: `poly_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
@@ -224,7 +241,8 @@ void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] + b->coeffs[i];
+ /* The preconditions imply that the addition stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] + b->coeffs[i]);
}
}
@@ -241,24 +259,24 @@ void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
invariant(forall(k0, i, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
{
- r->coeffs[i] = r->coeffs[i] - b->coeffs[i];
+ /* The preconditions imply that the subtraction stays within int16_t. */
+ r->coeffs[i] = (int16_t)(r->coeffs[i] - b->coeffs[i]);
}
}
-/* Include zeta table unless NTT, invNTT and mulcache computation
- * have been replaced by native implementations. */
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
- !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT)
#include "zetas.inc"
-#endif
-#if !defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
-MLK_INTERNAL_API
-void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
+MLK_STATIC_TESTABLE void mlk_poly_mulcache_compute_c(mlk_poly_mulcache *x,
+ const mlk_poly *a)
+__contract__(
+ requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
+ requires(memory_no_alias(a, sizeof(mlk_poly)))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
+)
{
unsigned i;
for (i = 0; i < MLKEM_N / 4; i++)
@@ -266,8 +284,11 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
invariant(i <= MLKEM_N / 4)
invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
{
- x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
- x->coeffs[2 * i + 1] = mlk_fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
+ x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], mlk_zetas[64 + i]);
+ /* The values in zeta table are <= MLKEM_Q in absolute value,
+ * so the negation in int16_t is safe. */
+ x->coeffs[2 * i + 1] =
+ mlk_fqmul(a->coeffs[4 * i + 3], (int16_t)(-mlk_zetas[64 + i]));
}
/*
@@ -278,15 +299,22 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
*/
mlk_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
}
-#else /* !MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
+
MLK_INTERNAL_API
void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
{
- mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
-}
+#if defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
+ int ret;
+ ret = mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
-#if !defined(MLK_USE_NATIVE_NTT)
+ mlk_poly_mulcache_compute_c(x, a);
+}
+
/*
* Computes a block CT butterflies with a fixed twiddle factor,
* using Montgomery multiplication.
@@ -316,7 +344,8 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
static void mlk_ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
- unsigned start, unsigned len, int bound)
+ unsigned start, unsigned len,
+ unsigned bound)
__contract__(
requires(start < MLKEM_N)
requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
@@ -346,8 +375,9 @@ __contract__(
{
int16_t t;
t = mlk_fqmul(r[j + len], zeta);
- r[j + len] = r[j] - t;
- r[j] = r[j] + t;
+ /* The precondition implies that the arithmetic does not overflow. */
+ r[j + len] = (int16_t)(r[j] - t);
+ r[j] = (int16_t)(r[j] + t);
}
}
@@ -370,7 +400,7 @@ __contract__(
unsigned start, k, len;
/* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */
k = 1u << (layer - 1);
- len = MLKEM_N >> layer;
+ len = (unsigned)MLKEM_N >> layer;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
invariant(start < MLKEM_N + 2 * len)
@@ -378,7 +408,7 @@ __contract__(
invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
{
- int16_t zeta = zetas[k++];
+ int16_t zeta = mlk_zetas[k++];
mlk_ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
}
}
@@ -395,12 +425,19 @@ __contract__(
/* Reference: `ntt()` in the reference implementation @[REF].
* - Iterate over `layer` instead of `len` in the outer loop
* to simplify computation of zeta index. */
-MLK_INTERNAL_API
-void mlk_poly_ntt(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_ntt_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ requires(array_abs_bound(p->coeffs, 0, MLKEM_N, MLKEM_Q))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_NTT_BOUND))
+)
{
unsigned layer;
int16_t *r;
+
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
+
r = p->coeffs;
for (layer = 1; layer <= 7; layer++)
@@ -414,18 +451,24 @@ void mlk_poly_ntt(mlk_poly *p)
/* Check the stronger bound */
mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_NTT */
MLK_INTERNAL_API
void mlk_poly_ntt(mlk_poly *p)
{
+#if defined(MLK_USE_NATIVE_NTT)
+ int ret;
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
- mlk_ntt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
-}
+ ret = mlk_ntt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_NTT */
-#if !defined(MLK_USE_NATIVE_INTT)
+ mlk_poly_ntt_c(p);
+}
+
/* Compute one layer of inverse NTT */
@@ -439,7 +482,7 @@ __contract__(
ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
unsigned start, k, len;
- len = (MLKEM_N >> layer);
+ len = (unsigned)MLKEM_N >> layer;
k = (1u << layer) - 1;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
@@ -449,7 +492,7 @@ __contract__(
invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
{
unsigned j;
- int16_t zeta = zetas[k--];
+ int16_t zeta = mlk_zetas[k--];
for (j = start; j < start + len; j++)
__loop__(
invariant(start <= j && j <= start + len)
@@ -457,8 +500,9 @@ __contract__(
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
int16_t t = r[j];
- r[j] = mlk_barrett_reduce(t + r[j + len]);
- r[j + len] = r[j + len] - t;
+ /* The preconditions imply that the arithmetic does not overflow. */
+ r[j] = mlk_barrett_reduce((int16_t)(t + r[j + len]));
+ r[j + len] = (int16_t)(r[j + len] - t);
r[j + len] = mlk_fqmul(r[j + len], zeta);
}
}
@@ -469,18 +513,22 @@ __contract__(
* while the reference implementation normalizes at
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
-MLK_INTERNAL_API
-void mlk_poly_invntt_tomont(mlk_poly *p)
+MLK_STATIC_TESTABLE void mlk_poly_invntt_tomont_c(mlk_poly *p)
+__contract__(
+ requires(memory_no_alias(p, sizeof(mlk_poly)))
+ assigns(memory_slice(p, sizeof(mlk_poly)))
+ ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
+)
{
+ unsigned j, layer;
+ const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
+ int16_t *r = p->coeffs;
+
/*
* Scale input polynomial to account for Montgomery factor
* and NTT twist. This also brings coefficients down to
* absolute value < MLKEM_Q.
*/
- unsigned j, layer;
- const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
- int16_t *r = p->coeffs;
-
for (j = 0; j < MLKEM_N; j++)
__loop__(
invariant(j <= MLKEM_N)
@@ -500,16 +548,23 @@ void mlk_poly_invntt_tomont(mlk_poly *p)
mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
}
-#else /* !MLK_USE_NATIVE_INTT */
MLK_INTERNAL_API
void mlk_poly_invntt_tomont(mlk_poly *p)
{
- mlk_intt_native(p->coeffs);
- mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
-}
+#if defined(MLK_USE_NATIVE_INTT)
+ int ret;
+ ret = mlk_intt_native(p->coeffs);
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
+ return;
+ }
#endif /* MLK_USE_NATIVE_INTT */
+ mlk_poly_invntt_tomont_c(p);
+}
+
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(mlk_poly)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly.h
index 20fb65e720..587062cce5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly.h
@@ -15,8 +15,7 @@
#ifndef MLK_POLY_H
#define MLK_POLY_H
-#include
-#include
+
#include "cbmc.h"
#include "common.h"
#include "debug.h"
@@ -46,34 +45,6 @@ typedef struct
int16_t coeffs[MLKEM_N >> 1];
} MLK_ALIGN mlk_poly_mulcache;
-/*************************************************
- * Name: mlk_cast_uint16_to_int16
- *
- * Description: Cast uint16 value to int16
- *
- * Returns:
- * input x in 0 .. 32767: returns value unchanged
- * input x in 32768 .. 65535: returns (x - 65536)
- **************************************************/
-#ifdef CBMC
-#pragma CPROVER check push
-#pragma CPROVER check disable "conversion"
-#endif
-static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
-{
- /*
- * PORTABILITY: This relies on uint16_t -> int16_t
- * being implemented as the inverse of int16_t -> uint16_t,
- * which is implementation-defined (C99 6.3.1.3 (3))
- * CBMC (correctly) fails to prove this conversion is OK,
- * so we have to suppress that check here
- */
- return (int16_t)x;
-}
-#ifdef CBMC
-#pragma CPROVER check pop
-#endif
-
/*************************************************
* Name: mlk_montgomery_reduce
*
@@ -90,7 +61,7 @@ static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
static MLK_ALWAYS_INLINE int16_t mlk_montgomery_reduce(int32_t a)
__contract__(
requires(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
- a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
+ a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
/* We don't attempt to express an input-dependent output bound
* as the post-condition here. There are two call-sites for this
* function:
@@ -102,8 +73,8 @@ __contract__(
/* check-magic: 62209 == unsigned_mod(pow(MLKEM_Q, -1, 2^16), 2^16) */
const uint32_t QINV = 62209;
- /* Compute a*q^{-1} mod 2^16 in unsigned representatives */
- const uint16_t a_reduced = a & UINT16_MAX;
+ /* Compute a*q^{-1} mod 2^16 in unsigned representatives. */
+ const uint16_t a_reduced = mlk_cast_int32_to_uint16(a);
const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
/* Lift to signed canonical representative mod 2^16. */
@@ -187,7 +158,7 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_poly)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
);
#define mlk_poly_reduce MLK_NAMESPACE(poly_reduce)
@@ -280,7 +251,7 @@ __contract__(
requires(forall(k0, 0, MLKEM_N, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
requires(forall(k1, 0, MLKEM_N, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
ensures(forall(k, 0, MLKEM_N, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_poly_ntt MLK_NAMESPACE(poly_ntt)
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly_k.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly_k.c
index f15ab96ce7..32b214ee04 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly_k.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly_k.c
@@ -22,12 +22,9 @@
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
-#include
-#include
+#include "poly_k.h"
-#include "compress.h"
#include "debug.h"
-#include "poly_k.h"
#include "sampling.h"
#include "symmetric.h"
@@ -37,6 +34,8 @@
* within a single compilation unit. */
#define mlk_poly_cbd_eta1 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta1)
#define mlk_poly_cbd_eta2 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta2)
+#define mlk_polyvec_basemul_acc_montgomery_cached_c \
+ MLK_ADD_PARAM_SET(mlk_polyvec_basemul_acc_montgomery_cached_c)
/* End of parameter set namespacing */
/* Reference: `polyvec_compress()` in the reference implementation @[REF]
@@ -46,29 +45,29 @@
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a[i]);
+ mlk_poly_compress_du(r + i * MLKEM_POLYCOMPRESSEDBYTES_DU, &a->vec[i]);
}
}
/* Reference: `polyvec_decompress()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_decompress_du(&r[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
+ mlk_poly_decompress_du(&r->vec[i], a + i * MLKEM_POLYCOMPRESSEDBYTES_DU);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_tobytes()` in the reference implementation @[REF].
@@ -77,41 +76,45 @@ void mlk_polyvec_decompress_du(mlk_polyvec r,
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, MLKEM_POLYVECBYTES))
+ invariant(i <= MLKEM_K)
+ )
{
- mlk_poly_tobytes(r + i * MLKEM_POLYBYTES, &a[i]);
+ mlk_poly_tobytes(&r[i * MLKEM_POLYBYTES], &a->vec[i]);
}
}
/* Reference: `polyvec_frombytes()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_frombytes(&r[i], a + i * MLKEM_POLYBYTES);
+ mlk_poly_frombytes(&r->vec[i], a + i * MLKEM_POLYBYTES);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
/* Reference: `polyvec_ntt()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_ntt(&r[i]);
+ mlk_poly_ntt(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
}
/* Reference: `polyvec_invntt_tomont()` in the reference implementation @[REF].
@@ -120,18 +123,17 @@ void mlk_polyvec_ntt(mlk_polyvec r)
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_invntt_tomont(&r[i]);
+ mlk_poly_invntt_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
}
-#if !defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
/* Reference: `polyvec_basemul_acc_montgomery()` in the
* reference implementation @[REF].
* - We use a multiplication cache ('mulcache') here
@@ -143,13 +145,22 @@ void mlk_polyvec_invntt_tomont(mlk_polyvec r)
* at the end. The reference implementation uses 2 * MLKEM_K
* more modular reductions since it reduces after every modular
* multiplication. */
-MLK_INTERNAL_API
-void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+MLK_STATIC_TESTABLE void mlk_polyvec_basemul_acc_montgomery_cached_c(
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
+__contract__(
+ requires(memory_no_alias(r, sizeof(mlk_poly)))
+ requires(memory_no_alias(a, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b, sizeof(mlk_polyvec)))
+ requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
+ requires(forall(k1, 0, MLKEM_K,
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
+)
{
unsigned i;
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+
for (i = 0; i < MLKEM_N / 2; i++)
__loop__(invariant(i <= MLKEM_N / 2))
{
@@ -163,53 +174,59 @@ void mlk_polyvec_basemul_acc_montgomery_cached(
t[1] <= ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
t[1] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768)))
{
- t[0] += (int32_t)a[k].coeffs[2 * i + 1] * b_cache[k].coeffs[i];
- t[0] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i];
- t[1] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i + 1];
- t[1] += (int32_t)a[k].coeffs[2 * i + 1] * b[k].coeffs[2 * i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b_cache->vec[k].coeffs[i];
+ t[0] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i + 1];
+ t[1] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b->vec[k].coeffs[2 * i];
}
r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t[0]);
r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t[1]);
}
}
-#else /* !MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
{
- mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
- /* Omitting bounds assertion for cache since native implementations may
- * decide not to use a mulcache. Note that the C backend implementation
- * of poly_basemul_montgomery_cached() does still include the check. */
+#if defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
+ {
+ int ret;
+ mlk_assert_bound_2d(a->vec, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
#if MLKEM_K == 2
- mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 3
- mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#elif MLKEM_K == 4
- mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
- r->coeffs, (const int16_t *)a, (const int16_t *)b,
- (const int16_t *)b_cache);
+ ret = mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
+ r->coeffs, (const int16_t *)a, (const int16_t *)b,
+ (const int16_t *)b_cache);
#endif
-}
+ if (ret == MLK_NATIVE_FUNC_SUCCESS)
+ {
+ return;
+ }
+ }
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
+ mlk_polyvec_basemul_acc_montgomery_cached_c(r, a, b, b_cache);
+}
+
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_mulcache_compute(&x[i], &a[i]);
+ mlk_poly_mulcache_compute(&x->vec[i], &a->vec[i]);
}
}
@@ -221,41 +238,53 @@ void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_reduce(&r[i]);
+ mlk_poly_reduce(&r->vec[i]);
}
- mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
+ mlk_assert_bound_2d(r->vec, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
+ __loop__(
+ assigns(i, memory_slice(r, sizeof(mlk_polyvec)))
+ invariant(i <= MLKEM_K)
+ invariant(forall(j0, i, MLKEM_K,
+ forall(k0, 0, MLKEM_N,
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX) &&
+ ((int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] >= INT16_MIN))))
+ invariant(forall(j2, 0, i,
+ forall(k2, 0, MLKEM_N,
+ (r->vec[j2].coeffs[k2] <= INT16_MAX) &&
+ (r->vec[j2].coeffs[k2] >= INT16_MIN))))
+ )
{
- mlk_poly_add(&r[i], &b[i]);
+ mlk_poly_add(&r->vec[i], &b->vec[i]);
}
}
/* Reference: `polyvec_tomont()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
{
unsigned i;
for (i = 0; i < MLKEM_K; i++)
{
- mlk_poly_tomont(&r[i]);
+ mlk_poly_tomont(&r->vec[i]);
}
- mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
+ mlk_assert_abs_bound_2d(r->vec, MLKEM_K, MLKEM_N, MLKEM_Q);
}
@@ -306,24 +335,41 @@ void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
{
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
extkey[3][MLKEM_SYMBYTES] = nonce3;
+
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
+#else
+ mlk_prf_eta1(buf[0], extkey[0]);
+ mlk_prf_eta1(buf[1], extkey[1]);
+ mlk_prf_eta1(buf[2], extkey[2]);
+ if (r3 != NULL)
+ {
+ mlk_prf_eta1(buf[3], extkey[3]);
+ }
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
+
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
mlk_poly_cbd_eta1(r2, buf[2]);
- mlk_poly_cbd_eta1(r3, buf[3]);
+ if (r3 != NULL)
+ {
+ mlk_poly_cbd_eta1(r3, buf[3]);
+ mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
+ }
mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
- mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -364,7 +410,7 @@ __contract__(
#endif
}
-/* Reference: `poly_getnoise_eta1()` in the reference implementation @[REF].
+/* Reference: `poly_getnoise_eta2()` in the reference implementation @[REF].
* - We include buffer zeroization. */
MLK_INTERNAL_API
void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
@@ -373,13 +419,13 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
MLK_ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
MLK_ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];
- memcpy(extkey, seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey, seed, MLKEM_SYMBYTES);
extkey[MLKEM_SYMBYTES] = nonce;
mlk_prf_eta2(buf, extkey);
mlk_poly_cbd_eta2(r, buf);
- mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA1 + 1);
+ mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
@@ -391,7 +437,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
#if MLKEM_K == 2
/* Reference: Does not exist in the reference implementation @[REF].
* - This implements a x4-batched version of `poly_getnoise_eta1()`
- * and `poly_getnoise_eta1()` from the reference implementation,
+ * and `poly_getnoise_eta2()` from the reference implementation,
* leveraging batched Keccak-f1600.
* - If a x4-batched Keccak-f1600 is available, we squeeze
* more random data than needed for the eta2 calls, to be
@@ -409,10 +455,10 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
- memcpy(extkey[0], seed, MLKEM_SYMBYTES);
- memcpy(extkey[1], seed, MLKEM_SYMBYTES);
- memcpy(extkey[2], seed, MLKEM_SYMBYTES);
- memcpy(extkey[3], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
+ mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
@@ -421,14 +467,16 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
/* On systems with fast batched Keccak, we use 4-fold batched PRF,
* even though that means generating more random data in buf[2] and buf[3]
* than necessary. */
-#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION)
+#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION) && \
+ !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
mlk_prf_eta1_x4(buf, extkey);
#else
mlk_prf_eta1(buf[0], extkey[0]);
mlk_prf_eta1(buf[1], extkey[1]);
mlk_prf_eta2(buf[2], extkey[2]);
mlk_prf_eta2(buf[3], extkey[3]);
-#endif /* FIPS202_X4_DEFAULT_IMPLEMENTATION */
+#endif /* !(!FIPS202_X4_DEFAULT_IMPLEMENTATION && \
+ !MLK_CONFIG_SERIAL_FIPS202_ONLY) */
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
@@ -451,3 +499,4 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef mlk_poly_cbd_eta1
#undef mlk_poly_cbd_eta2
+#undef mlk_polyvec_basemul_acc_montgomery_cached_c
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly_k.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly_k.h
index f7a40ff5f9..9089a8e431 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly_k.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/poly_k.h
@@ -15,7 +15,6 @@
#ifndef MLK_POLY_K_H
#define MLK_POLY_K_H
-#include
#include "common.h"
#include "compress.h"
#include "poly.h"
@@ -29,9 +28,20 @@
#define mlk_polyvec_mulcache MLK_ADD_PARAM_SET(mlk_polyvec_mulcache)
/* End of parameter set namespacing */
-typedef mlk_poly mlk_polyvec[MLKEM_K];
-typedef mlk_poly mlk_polymat[MLKEM_K * MLKEM_K];
-typedef mlk_poly_mulcache mlk_polyvec_mulcache[MLKEM_K];
+typedef struct
+{
+ mlk_poly vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec;
+
+typedef struct
+{
+ mlk_polyvec vec[MLKEM_K];
+} MLK_ALIGN mlk_polymat;
+
+typedef struct
+{
+ mlk_poly_mulcache vec[MLKEM_K];
+} MLK_ALIGN mlk_polyvec_mulcache;
#define mlk_poly_compress_du MLK_NAMESPACE_K(poly_compress_du)
/*************************************************
@@ -131,7 +141,7 @@ __contract__(
requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- assigns(object_whole(r)))
+ assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DV)))
{
#if MLKEM_DV == 4
mlk_poly_compress_d4(r, a);
@@ -168,7 +178,7 @@ static MLK_INLINE void mlk_poly_decompress_dv(
__contract__(
requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
requires(memory_no_alias(r, sizeof(mlk_poly)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
{
#if MLKEM_DV == 4
@@ -200,13 +210,13 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
- const mlk_polyvec a)
+ const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
);
#define mlk_polyvec_decompress_du MLK_NAMESPACE_K(polyvec_decompress_du)
@@ -228,14 +238,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_decompress_du(mlk_polyvec r,
+void mlk_polyvec_decompress_du(mlk_polyvec *r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
__contract__(
requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_tobytes MLK_NAMESPACE_K(polyvec_tobytes)
@@ -256,13 +266,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
+void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
requires(forall(k0, 0, MLKEM_K,
- array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
- assigns(object_whole(r))
+ array_bound(a->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ assigns(memory_slice(r, MLKEM_POLYVECBYTES))
);
#define mlk_polyvec_frombytes MLK_NAMESPACE_K(polyvec_frombytes)
@@ -284,13 +294,13 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
+void mlk_polyvec_frombytes(mlk_polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES])
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
);
#define mlk_polyvec_ntt MLK_NAMESPACE_K(polyvec_ntt)
@@ -313,14 +323,14 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_ntt(mlk_polyvec r)
+void mlk_polyvec_ntt(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
- assigns(object_whole(r))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
);
#define mlk_polyvec_invntt_tomont MLK_NAMESPACE_K(polyvec_invntt_tomont)
@@ -344,12 +354,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_invntt_tomont(mlk_polyvec r)
+void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
);
#define mlk_polyvec_basemul_acc_montgomery_cached \
@@ -380,16 +390,16 @@ __contract__(
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
- mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
- const mlk_polyvec_mulcache b_cache)
+ mlk_poly *r, const mlk_polyvec *a, const mlk_polyvec *b,
+ const mlk_polyvec_mulcache *b_cache)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
requires(forall(k1, 0, MLKEM_K,
- array_bound(a[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
- assigns(object_whole(r))
+ array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_polyvec_mulcache_compute MLK_NAMESPACE_K(polyvec_mulcache_compute)
@@ -423,11 +433,11 @@ __contract__(
* higher level safety proofs, and thus not part of the spec.
*/
MLK_INTERNAL_API
-void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
+void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_polyvec_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
- assigns(object_whole(x))
+ assigns(memory_slice(x, sizeof(mlk_polyvec_mulcache)))
);
#define mlk_polyvec_reduce MLK_NAMESPACE_K(polyvec_reduce)
@@ -436,7 +446,7 @@ __contract__(
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials;
- * for details of the Barrett reduction see comments in reduce.c
+ * for details of the Barrett reduction see comments in poly.c
*
* Arguments: - mlk_polyvec r: pointer to input/output polynomial
*
@@ -453,12 +463,12 @@ __contract__(
* use of mlk_poly_reduce() in the context of (de)serialization.
*/
MLK_INTERNAL_API
-void mlk_polyvec_reduce(mlk_polyvec r)
+void mlk_polyvec_reduce(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
ensures(forall(k0, 0, MLKEM_K,
- array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
+ array_bound(r->vec[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_add MLK_NAMESPACE_K(polyvec_add)
@@ -485,17 +495,17 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
+void mlk_polyvec_add(mlk_polyvec *r, const mlk_polyvec *b)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(forall(j0, 0, MLKEM_K,
forall(k0, 0, MLKEM_N,
- (int32_t)r[j0].coeffs[k0] + b[j0].coeffs[k0] <= INT16_MAX)))
+ (int32_t)r->vec[j0].coeffs[k0] + b->vec[j0].coeffs[k0] <= INT16_MAX)))
requires(forall(j1, 0, MLKEM_K,
forall(k1, 0, MLKEM_N,
- (int32_t)r[j1].coeffs[k1] + b[j1].coeffs[k1] >= INT16_MIN)))
- assigns(object_whole(r))
+ (int32_t)r->vec[j1].coeffs[k1] + b->vec[j1].coeffs[k1] >= INT16_MIN)))
+ assigns(memory_slice(r, sizeof(mlk_polyvec)))
);
#define mlk_polyvec_tomont MLK_NAMESPACE_K(polyvec_tomont)
@@ -514,13 +524,12 @@ __contract__(
*
**************************************************/
MLK_INTERNAL_API
-void mlk_polyvec_tomont(mlk_polyvec r)
+void mlk_polyvec_tomont(mlk_polyvec *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
assigns(memory_slice(r, sizeof(mlk_polyvec)))
- assigns(object_whole(r))
ensures(forall(j, 0, MLKEM_K,
- array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
+ array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
);
#define mlk_poly_getnoise_eta1_4x MLK_NAMESPACE_K(poly_getnoise_eta1_4x)
@@ -531,7 +540,8 @@ __contract__(
* and nonces, with output polynomials close to centered binomial
* distribution with parameter MLKEM_ETA1.
*
- * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial
+ * Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial. The last
+ * polynomial pointer may be NULL.
* - const uint8_t *seed: pointer to input seed
* (of length MLKEM_SYMBYTES bytes)
* - uint8_t nonce{0,1,2,3}: one-byte input nonce
@@ -555,16 +565,15 @@ __contract__(
requires(memory_no_alias(r0, sizeof(mlk_poly)))
requires(memory_no_alias(r1, sizeof(mlk_poly)))
requires(memory_no_alias(r2, sizeof(mlk_poly)))
- requires(memory_no_alias(r3, sizeof(mlk_poly)))
+ requires(r3 == NULL || memory_no_alias(r3, sizeof(mlk_poly)))
assigns(memory_slice(r0, sizeof(mlk_poly)))
assigns(memory_slice(r1, sizeof(mlk_poly)))
assigns(memory_slice(r2, sizeof(mlk_poly)))
- assigns(memory_slice(r3, sizeof(mlk_poly)))
- ensures(
- array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
+ assigns(r3 != NULL: memory_slice(r3, sizeof(mlk_poly)))
+ ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
+ ensures(r3 != NULL ==> array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1))
);
#if MLKEM_ETA1 == MLKEM_ETA2
@@ -604,7 +613,7 @@ void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r))
+ assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
@@ -640,15 +649,19 @@ void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
uint8_t nonce0, uint8_t nonce1,
uint8_t nonce2, uint8_t nonce3)
__contract__(
- requires( /* r0, r1 consecutive, r2, r3 consecutive */
- (memory_no_alias(r0, 2 * sizeof(mlk_poly)) && memory_no_alias(r2, 2 * sizeof(mlk_poly)) &&
- r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
+ requires(memory_no_alias(r0, sizeof(mlk_poly)))
+ requires(memory_no_alias(r1, sizeof(mlk_poly)))
+ requires(memory_no_alias(r2, sizeof(mlk_poly)))
+ requires(memory_no_alias(r3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
- assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
+ assigns(memory_slice(r0, sizeof(mlk_poly)))
+ assigns(memory_slice(r1, sizeof(mlk_poly)))
+ assigns(memory_slice(r2, sizeof(mlk_poly)))
+ assigns(memory_slice(r3, sizeof(mlk_poly)))
ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
- && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
- && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
+ && array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
+ && array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
+ && array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/randombytes.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/randombytes.h
index 132d920afb..3e841d26ca 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/randombytes.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/randombytes.h
@@ -5,18 +5,56 @@
#ifndef MLK_RANDOMBYTES_H
#define MLK_RANDOMBYTES_H
-#include
-#include
#include "cbmc.h"
#include "common.h"
+#if !defined(MLK_CONFIG_NO_RANDOMIZED_API)
#if !defined(MLK_CONFIG_CUSTOM_RANDOMBYTES)
-void randombytes(uint8_t *out, size_t outlen);
-static MLK_INLINE void mlk_randombytes(uint8_t *out, size_t outlen)
+/*************************************************
+ * Name: randombytes
+ *
+ * Description: Fill a buffer with cryptographically secure random bytes.
+ *
+ * mlkem-native does not provide an implementation of this
+ * function. It must be provided by the consumer.
+ *
+ * To use a custom random byte source with a different name
+ * or signature, set MLK_CONFIG_CUSTOM_RANDOMBYTES and define
+ * mlk_randombytes directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+int randombytes(uint8_t *out, size_t outlen);
+
+/*************************************************
+ * Name: mlk_randombytes
+ *
+ * Description: Internal wrapper around randombytes().
+ *
+ * Fill a buffer with cryptographically secure random bytes.
+ *
+ * This function can be replaced by setting
+ * MLK_CONFIG_CUSTOM_RANDOMBYTES and defining mlk_randombytes
+ * directly.
+ *
+ * Arguments: - uint8_t *out: pointer to output buffer
+ * - size_t outlen: number of random bytes to write
+ *
+ * Returns: 0 on success, non-zero on failure.
+ * On failure, top-level APIs return MLK_ERR_RNG_FAIL.
+ *
+ **************************************************/
+MLK_MUST_CHECK_RETURN_VALUE
+static MLK_INLINE int mlk_randombytes(uint8_t *out, size_t outlen)
__contract__(
requires(memory_no_alias(out, outlen))
- assigns(memory_slice(out, outlen))) { randombytes(out, outlen); }
+ assigns(memory_slice(out, outlen))) { return randombytes(out, outlen); }
#endif /* !MLK_CONFIG_CUSTOM_RANDOMBYTES */
-
+#endif /* !MLK_CONFIG_NO_RANDOMIZED_API */
#endif /* !MLK_RANDOMBYTES_H */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/sampling.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/sampling.c
index be5d931a79..945d12ed3d 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/sampling.c
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/sampling.c
@@ -29,9 +29,10 @@
* in that it adds the offset and always expects the base of the
* target buffer. This avoids shifting the buffer base in the
* caller, which appears tricky to reason about. */
-static unsigned mlk_rej_uniform_scalar(int16_t *r, unsigned target,
- unsigned offset, const uint8_t *buf,
- unsigned buflen)
+MLK_STATIC_TESTABLE unsigned mlk_rej_uniform_c(int16_t *r, unsigned target,
+ unsigned offset,
+ const uint8_t *buf,
+ unsigned buflen)
__contract__(
requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
requires(memory_no_alias(r, sizeof(int16_t) * target))
@@ -39,11 +40,10 @@ __contract__(
requires(array_bound(r, 0, offset, 0, MLKEM_Q))
assigns(memory_slice(r, sizeof(int16_t) * target))
ensures(offset <= return_value && return_value <= target)
- ensures(array_bound(r, 0, return_value, 0, MLKEM_Q))
-)
+ ensures(array_bound(r, 0, return_value, 0, MLKEM_Q)))
{
unsigned ctr, pos;
- uint16_t val0, val1;
+ int16_t val0, val1;
mlk_assert_bound(r, offset, 0, MLKEM_Q);
@@ -55,8 +55,8 @@ __contract__(
invariant(offset <= ctr && ctr <= target && pos <= buflen)
invariant(array_bound(r, 0, ctr, 0, MLKEM_Q)))
{
- val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
- val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
+ val0 = ((buf[pos + 0] >> 0) | (buf[pos + 1] << 8)) & 0xFFF;
+ val1 = ((buf[pos + 1] >> 4) | (buf[pos + 2] << 4)) & 0xFFF;
pos += 3;
if (val0 < MLKEM_Q)
@@ -93,7 +93,7 @@ __contract__(
* Must be a multiple of 3.
*
* Note: Strictly speaking, only a few values of buflen near UINT_MAX need
- * excluding. The limit of 128 is somewhat arbitrary but sufficient for all
+ * excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
* uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
*
* Returns the new offset of sampled 16-bit integers, at most target,
@@ -124,8 +124,9 @@ __contract__(
#if defined(MLK_USE_NATIVE_REJ_UNIFORM)
if (offset == 0)
{
- int ret = mlk_rej_uniform_native(r, target, buf, buflen);
- if (ret != -1)
+ int ret;
+ ret = mlk_rej_uniform_native(r, target, buf, buflen);
+ if (ret != MLK_NATIVE_FUNC_FALLBACK)
{
unsigned res = (unsigned)ret;
mlk_assert_bound(r, res, 0, MLKEM_Q);
@@ -134,19 +135,22 @@ __contract__(
}
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
- return mlk_rej_uniform_scalar(r, target, offset, buf, buflen);
+ return mlk_rej_uniform_c(r, target, offset, buf, buflen);
}
#ifndef MLKEM_GEN_MATRIX_NBLOCKS
-#define MLKEM_GEN_MATRIX_NBLOCKS \
- ((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + MLK_XOF_RATE) / MLK_XOF_RATE)
+#define MLKEM_GEN_MATRIX_NBLOCKS \
+ ((12 * MLKEM_N / 8 * ((uint32_t)1 << 12) / MLKEM_Q + MLK_XOF_RATE) / \
+ MLK_XOF_RATE)
#endif
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
/* Reference: Does not exist in the reference implementation @[REF].
* - x4-batched version of `rej_uniform()` from the
* reference implementation, leveraging x4-batched Keccak-f1600. */
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
{
/* Temporary buffers for XOF output before rejection sampling */
@@ -167,10 +171,10 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
*/
mlk_xof_x4_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &statex);
buflen = MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE;
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, 0, buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, 0, buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, 0, buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, 0, buf[3], buflen);
/*
* So long as not all matrix entries have been generated, squeeze
@@ -180,20 +184,24 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
ctr[3] < MLKEM_N)
__loop__(
- assigns(ctr, statex, memory_slice(vec, sizeof(mlk_poly) * 4), object_whole(buf[0]),
- object_whole(buf[1]), object_whole(buf[2]), object_whole(buf[3]))
+ assigns(ctr, statex,
+ memory_slice(vec0, sizeof(mlk_poly)),
+ memory_slice(vec1, sizeof(mlk_poly)),
+ memory_slice(vec2, sizeof(mlk_poly)),
+ memory_slice(vec3, sizeof(mlk_poly)),
+ object_whole(buf))
invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
- invariant(array_bound(vec[0].coeffs, 0, ctr[0], 0, MLKEM_Q))
- invariant(array_bound(vec[1].coeffs, 0, ctr[1], 0, MLKEM_Q))
- invariant(array_bound(vec[2].coeffs, 0, ctr[2], 0, MLKEM_Q))
- invariant(array_bound(vec[3].coeffs, 0, ctr[3], 0, MLKEM_Q)))
+ invariant(array_bound(vec0->coeffs, 0, ctr[0], 0, MLKEM_Q))
+ invariant(array_bound(vec1->coeffs, 0, ctr[1], 0, MLKEM_Q))
+ invariant(array_bound(vec2->coeffs, 0, ctr[2], 0, MLKEM_Q))
+ invariant(array_bound(vec3->coeffs, 0, ctr[3], 0, MLKEM_Q)))
{
mlk_xof_x4_squeezeblocks(buf, 1, &statex);
- ctr[0] = mlk_rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf[0], buflen);
- ctr[1] = mlk_rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf[1], buflen);
- ctr[2] = mlk_rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf[2], buflen);
- ctr[3] = mlk_rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf[3], buflen);
+ ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, ctr[0], buf[0], buflen);
+ ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, ctr[1], buf[1], buflen);
+ ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, ctr[2], buf[2], buflen);
+ ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, ctr[3], buf[3], buflen);
}
mlk_xof_x4_release(&statex);
@@ -202,6 +210,7 @@ void mlk_poly_rej_uniform_x4(mlk_poly *vec,
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
}
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
MLK_INTERNAL_API
void mlk_poly_rej_uniform(mlk_poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
@@ -284,7 +293,7 @@ void mlk_poly_cbd2(mlk_poly *r, const uint8_t buf[2 * MLKEM_N / 4])
{
const int16_t a = (d >> (4 * j + 0)) & 0x3;
const int16_t b = (d >> (4 * j + 2)) & 0x3;
- r->coeffs[8 * i + j] = a - b;
+ r->coeffs[8 * i + j] = (int16_t)(a - b);
}
}
}
@@ -336,7 +345,7 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4])
{
const int16_t a = (d >> (6 * j + 0)) & 0x7;
const int16_t b = (d >> (6 * j + 3)) & 0x7;
- r->coeffs[4 * i + j] = a - b;
+ r->coeffs[4 * i + j] = (int16_t)(a - b);
}
}
}
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/sampling.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/sampling.h
index 2cf43c889b..24c26b34a5 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/sampling.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/sampling.h
@@ -15,8 +15,6 @@
#ifndef MLK_SAMPLING_H
#define MLK_SAMPLING_H
-#include
-#include
#include "cbmc.h"
#include "common.h"
#include "poly.h"
@@ -58,6 +56,7 @@ MLK_INTERNAL_API
void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_ETA1 == 3 */
+#if !defined(MLK_CONFIG_SERIAL_FIPS202_ONLY)
#define mlk_poly_rej_uniform_x4 MLK_NAMESPACE(poly_rej_uniform_x4)
/*************************************************
* Name: mlk_poly_rej_uniform_x4
@@ -65,8 +64,8 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
* Description: Generate four polynomials using rejection sampling
* on (pseudo-)uniformly random bytes sampled from a seed.
*
- * Arguments: - mlk_poly *vec:
- * Pointer to an array of 4 polynomials to be sampled.
+ * Arguments: - mlk_poly *vec0, *vec1, *vec2, *vec3:
+ * Pointers to 4 polynomials to be sampled.
* - uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)]:
* Pointer consecutive array of seed buffers of size
* MLKEM_SYMBYTES + 2 each, plus padding for alignment.
@@ -75,16 +74,24 @@ void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
*
**************************************************/
MLK_INTERNAL_API
-void mlk_poly_rej_uniform_x4(mlk_poly *vec,
+void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
+ mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
__contract__(
- requires(memory_no_alias(vec, sizeof(mlk_poly) * 4))
+ requires(memory_no_alias(vec0, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec1, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec2, sizeof(mlk_poly)))
+ requires(memory_no_alias(vec3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, 4 * MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)))
- assigns(memory_slice(vec, sizeof(mlk_poly) * 4))
- ensures(array_bound(vec[0].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[1].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[2].coeffs, 0, MLKEM_N, 0, MLKEM_Q))
- ensures(array_bound(vec[3].coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+ assigns(memory_slice(vec0, sizeof(mlk_poly)))
+ assigns(memory_slice(vec1, sizeof(mlk_poly)))
+ assigns(memory_slice(vec2, sizeof(mlk_poly)))
+ assigns(memory_slice(vec3, sizeof(mlk_poly)))
+ ensures(array_bound(vec0->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec1->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec2->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
+ ensures(array_bound(vec3->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
+#endif /* !MLK_CONFIG_SERIAL_FIPS202_ONLY */
#define mlk_poly_rej_uniform MLK_NAMESPACE(poly_rej_uniform)
/*************************************************
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/symmetric.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/symmetric.h
index 985bfeab37..68d7e1a0cd 100644
--- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/symmetric.h
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/mlkem/src/symmetric.h
@@ -15,12 +15,13 @@
#ifndef MLK_SYMMETRIC_H
#define MLK_SYMMETRIC_H
-#include