diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 2e78d911ba..62de9a6d3b 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -6,6 +6,7 @@ self-hosted-runner: # actionlint 1.7.7 does not recognize macos-15-intel yet # TODO: Remove when upgrading to >= 1.7.8 - macos-15-intel + - windows-11-arm # PQCP runners - pqcp-arm64 - pqcp-ppc64 diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml deleted file mode 100644 index f28879790d..0000000000 --- a/.github/workflows/all.yml +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: CI -permissions: - contents: read -on: - workflow_dispatch: - push: - branches: ["main"] - pull_request: - branches: ["main"] - types: [ "opened", "synchronize" ] - -jobs: - base: - name: Base - permissions: - contents: 'read' - id-token: 'write' - uses: ./.github/workflows/base.yml - secrets: inherit - lint-markdown: - name: Lint Markdown - permissions: - contents: 'read' - id-token: 'write' - uses: ./.github/workflows/lint_markdown.yml - nix: - name: Nix - permissions: - actions: 'write' - contents: 'read' - id-token: 'write' - uses: ./.github/workflows/nix.yml - secrets: inherit - riscv: - name: RISC-V - permissions: - contents: 'read' - id-token: 'write' - needs: [ base ] - uses: ./.github/workflows/riscv.yml - ci: - name: Extended - permissions: - contents: 'read' - id-token: 'write' - needs: [ base, nix ] - uses: ./.github/workflows/ci.yml - secrets: inherit - cbmc: - name: CBMC - permissions: - contents: 'read' - id-token: 'write' - pull-requests: 'write' - needs: [ base, nix ] - uses: ./.github/workflows/cbmc.yml - secrets: inherit - oqs_integration: - name: libOQS - permissions: - contents: 'read' - id-token: 'write' - needs: [ base ] - uses: ./.github/workflows/integration-liboqs.yml - secrets: inherit - pavona_integration: - name: Pavona - permissions: - contents: 'read' - id-token: 'write' - needs: 
[ base ] - uses: ./.github/workflows/integration-pavona.yml - secrets: inherit - awslc_integration: - name: AWS-LC - permissions: - contents: 'read' - id-token: 'write' - needs: [ base ] - uses: ./.github/workflows/integration-awslc.yml - with: - commit: v5.0.0 - secrets: inherit - ct-test: - name: Constant-time - permissions: - contents: 'read' - id-token: 'write' - needs: [ base, nix ] - uses: ./.github/workflows/ct-tests.yml - secrets: inherit - slothy: - name: SLOTHY - permissions: - contents: 'read' - id-token: 'write' - needs: [ base, nix ] - uses: ./.github/workflows/slothy.yml - secrets: inherit - baremetal: - name: Baremetal - permissions: - contents: 'read' - id-token: 'write' - needs: [ base ] - uses: ./.github/workflows/baremetal.yml - secrets: inherit diff --git a/.github/workflows/baremetal.yml b/.github/workflows/baremetal.yml deleted file mode 100644 index 5be619b388..0000000000 --- a/.github/workflows/baremetal.yml +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) The mldsa-native project authors -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: Baremetal -permissions: - contents: read -on: - workflow_call: - workflow_dispatch: - -jobs: - baremetal_tests: - name: Baremetal tests (${{ matrix.target.name }}) - strategy: - fail-fast: false - matrix: - target: - - runner: ubuntu-latest - name: 'M55-AN547' - makefile: test/baremetal/platform/m55-an547/platform.mk - nix-shell: cross-arm-embedded - func: true - kat: true - acvp: true - wycheproof: false - alloc: true - bench: true - opt: all - - runner: ubuntu-latest - name: 'M33-AN524' - makefile: test/baremetal/platform/m33-an524/platform.mk - nix-shell: cross-arm-embedded - func: true - kat: true - acvp: true - alloc: true - bench: true - opt: no_opt - - runner: ubuntu-latest - name: 'AVR ATmega128RFR2 (modified for 32K RAM)' - makefile: test/baremetal/platform/avr/platform.mk - nix-shell: cross-avr - func: true - kat: true - acvp: true - 
wycheproof: false - alloc: false - bench: false - opt: no_opt - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: baremetal build + test - uses: ./.github/actions/functest - env: - EXTRA_MAKEFILE: ${{ matrix.target.makefile }} - with: - nix-shell: ${{ matrix.target.nix-shell }} - gh_token: ${{ secrets.GITHUB_TOKEN }} - opt: ${{ matrix.target.opt }} - func: ${{ matrix.target.func }} - kat: ${{ matrix.target.kat }} - acvp: ${{ matrix.target.acvp }} - wycheproof: ${{ matrix.target.wycheproof }} - examples: false - stack: false - alloc: ${{ matrix.target.alloc }} - rng_fail: true - - name: Baremetal bench - if: ${{ matrix.target.bench }} - uses: ./.github/actions/bench - env: - EXTRA_MAKEFILE: ${{ matrix.target.makefile }} - with: - name: ${{ matrix.target.name }} - nix-shell: ${{ matrix.target.nix-shell }} - gh_token: ${{ secrets.GITHUB_TOKEN }} - perf: PMU - opt: true - store_results: false - - baremetal_aarch64_virt: - name: AArch64-virt no-MMU (${{ matrix.fips202_backend }}) - strategy: - fail-fast: false - matrix: - fips202_backend: - - x1_scalar - - x1_v84a - - x2_v84a - - x4_v8a_scalar - - x4_v8a_v84a_scalar - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: baremetal build + test - uses: ./.github/actions/functest - env: - EXTRA_MAKEFILE: test/baremetal/platform/aarch64-virt/platform.mk - with: - nix-shell: cross-aarch64-embedded - gh_token: ${{ secrets.GITHUB_TOKEN }} - opt: opt - func: true - kat: true - acvp: true - examples: false - stack: false - alloc: false - rng_fail: true - extra_args: '--fips202-aarch64-backend=${{ matrix.fips202_backend }}' diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 75ca6df91a..d8398ce1b6 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -5,323 +5,28 @@ name: Base permissions: contents: read on: - workflow_call: 
workflow_dispatch: + push: + branches: ["main"] + pull_request: + branches: ["main"] jobs: - lint: + quickcheck-windows-arm64: strategy: fail-fast: false matrix: - system: [ubuntu-latest, ubuntu-24.04-arm] - name: Linting - runs-on: ${{ matrix.system }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/lint - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - cross-prefix: "aarch64-unknown-linux-gnu-" - quickcheck: - strategy: - fail-fast: false - matrix: - external: - - ${{ github.repository_owner != 'pq-code-package' }} - target: - - runner: ubuntu-24.04-arm - name: 'aarch64' - - runner: ubuntu-latest - name: 'x86_64' - - runner: macos-latest - name: 'macos (aarch64)' - - runner: macos-15-intel - name: 'macos (x86_64)' - exclude: - - {external: true, - target: { - runner: ubuntu-24.04-arm, - name: 'aarch64' - }} - name: Quickcheck (${{ matrix.target.name }}) - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: make quickcheck - run: | - OPT=0 make quickcheck - make clean >/dev/null - OPT=1 make quickcheck - - uses: ./.github/actions/setup-os - - name: tests func - run: | - ./scripts/tests func --check-namespace - quickcheck-acvp: - strategy: - fail-fast: false - matrix: - external: - - ${{ github.repository_owner != 'pq-code-package' }} - target: - - runner: ubuntu-24.04-arm - name: 'aarch64' - - runner: ubuntu-latest - name: 'x86_64' - acvp-version: [v1.1.0.40, v1.1.0.41, v1.1.0.42] - exclude: - - {external: true, - target: { - runner: ubuntu-24.04-arm, - name: 'aarch64' - }} - name: Quickcheck ACVP (${{ matrix.target.name }}, ${{ matrix.acvp-version }}) - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Run ACVP test - run: | - ./scripts/tests acvp --version ${{ matrix.acvp-version }} - quickcheck_bench: - strategy: - fail-fast: false - 
matrix: - external: - - ${{ github.repository_owner != 'pq-code-package' }} - target: - - runner: ubuntu-24.04-arm - name: 'aarch64' - - runner: ubuntu-latest - name: 'x86_64' - - runner: macos-latest - name: 'macos (aarch64)' - - runner: macos-15-intel - name: 'macos (x86_64)' - exclude: - - {external: true, - target: { - runner: ubuntu-24.04-arm, - name: 'aarch64' - }} - name: Quickcheck bench (${{ matrix.target.name }}) - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: "tests bench (cycles: NO)" - run: | - ./scripts/tests bench -c NO - - name: "tests bench (build only, cycles: PMU)" - if: ${{ matrix.target.name != 'macos (aarch64)' && matrix.target.name != 'macos (x86_64)' }} - run: | - make clean - ./scripts/tests bench -c PMU --no-run - - name: "tests bench (build only, cycles: PERF)" - if: ${{ matrix.target.name != 'macos (aarch64)' && matrix.target.name != 'macos (x86_64)' }} - run: | - make clean - ./scripts/tests bench -c PERF --no-run - - name: "tests bench (build only, cycles: MAC)" - if: ${{ matrix.target.name == 'macos (aarch64)' || matrix.target.name == 'macos (x86_64)' }} - run: | - make clean - ./scripts/tests bench -c MAC --no-run - - name: tests bench components - run: | - make clean - ./scripts/tests bench --components -c NO - quickcheck-c90: - strategy: - fail-fast: false - matrix: - external: - - ${{ github.repository_owner != 'pq-code-package' }} - target: - - runner: ubuntu-24.04-arm - name: 'aarch64' - - runner: ubuntu-latest - name: 'x86_64' - exclude: - - {external: true, - target: { - runner: ubuntu-24.04-arm, - name: 'aarch64' - }} - name: Quickcheck C90 (${{ matrix.target.name }}) - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: make quickcheck - run: | - OPT=0 CFLAGS=-std=c90 make quickcheck - make clean >/dev/null - OPT=1 CFLAGS=-std=c90 make quickcheck - - uses: 
./.github/actions/setup-apt - - name: tests func - run: | - ./scripts/tests func --cflags="-std=c90" --check-namespace - - name: tests bench - run: | - ./scripts/tests bench -c NO --cflags="-std=c90" - - name: tests bench components - run: | - ./scripts/tests bench --components -c NO --cflags="-std=c90" - quickcheck-windows: - strategy: - fail-fast: false - matrix: - system: [windows-2025, windows-2022] - name: Quickcheck ${{ matrix.system }} - runs-on: ${{ matrix.system }} + opt: [0, 1] + name: Quickcheck windows-11-arm (OPT=${{ matrix.opt }}) + runs-on: windows-11-arm steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + with: + arch: arm64 - name: Build test shell: powershell run: | # print compiler version cl - nmake /f ./Makefile.Microsoft_nmake quickcheck - quickcheck-windows-mingw-w64: - strategy: - fail-fast: false - matrix: - # Oldest available + 3 latest; intermediate versions run in legacy-compilers. 
- mingw-version: [5.4.0, 13.2.0, 14.2.0, 15.2.0] - name: Quickcheck (Mingw-w64 ${{ matrix.mingw-version }}) - runs-on: windows-latest - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Install MinGW-w64 - run: choco install mingw --version=${{ matrix.mingw-version }} -y - shell: cmd - - name: make quickcheck - shell: bash - run: | - CC=gcc OPT=0 make quickcheck - CC=gcc make clean >/dev/null - CC=gcc OPT=1 make quickcheck - - name: make quickcheck (AVX2) - shell: bash - run: | - CC=gcc make clean >/dev/null - CC=gcc CFLAGS="-mavx2 -mbmi2" make quickcheck - quickcheck-lib: - name: Quickcheck lib - strategy: - matrix: - system: [macos-latest, macos-15-intel, ubuntu-latest, ubuntu-24.04-arm] - runs-on: ${{ matrix.system }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: make lib - run: | - make lib - examples: - name: Examples - strategy: - matrix: - system: [macos-latest, macos-15-intel, ubuntu-latest, ubuntu-24.04-arm] - runs-on: ${{ matrix.system }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: basic - run: | - CFLAGS="-O0" make run -C examples/basic - - name: basic_deterministic - run: | - CFLAGS="-O0" make run -C examples/basic_deterministic - - name: bring_your_own_fips202 - run: | - CFLAGS="-O0" make run -C examples/bring_your_own_fips202 - - name: bring_your_own_fips202_static - run: | - CFLAGS="-O0" make run -C examples/bring_your_own_fips202_static - - name: custom_backend - run: | - CFLAGS="-O0" make run -C examples/custom_backend - - name: monolithic_build - run: | - CFLAGS="-O0" make run -C examples/monolithic_build - - name: monolithic_build_native - run: | - CFLAGS="-O0" make run -C examples/monolithic_build_native - - name: monolithic_build_multilevel - run: | - CFLAGS="-O0" make run -C examples/monolithic_build_multilevel - - name: monolithic_build_multilevel_native - run: | - CFLAGS="-O0" make run -C 
examples/monolithic_build_multilevel_native - - name: multilevel_build - run: | - CFLAGS="-O0" make run -C examples/multilevel_build - - name: multilevel_build_native - run: | - CFLAGS="-O0" make run -C examples/multilevel_build_native - simpasm: - strategy: - fail-fast: false - matrix: - backend: - - arg: '--aarch64-clean' - name: Clean - - arg: '' - name: Optimized - simplify: - - arg: '' - name: Simplified - - arg: '--no-simplify' - name: Unmodified - runs-on: ubuntu-24.04-arm - name: AArch64 dev backend (${{ matrix.backend.name }}, ${{ matrix.simplify.name }}) - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Reinstate and test backend - uses: ./.github/actions/setup-shell - with: - nix-shell: 'ci' - gh_token: ${{ secrets.GITHUB_TOKEN }} - script: | - ./scripts/autogen ${{ matrix.backend.arg }} ${{ matrix.simplify.arg }} - make clean - OPT=1 make quickcheck - x86_64_intel_syntax: - name: x86_64 Intel syntax - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Generate with Intel syntax and test - uses: ./.github/actions/setup-shell - with: - nix-shell: 'ci' - gh_token: ${{ secrets.GITHUB_TOKEN }} - script: | - ./scripts/autogen --x86-64-syntax intel - make clean - ./scripts/tests all - scan-build: - strategy: - fail-fast: false - matrix: - external: - - ${{ github.repository_owner != 'pq-code-package' }} - target: - - runner: ubuntu-24.04-arm - name: 'aarch64' - - runner: ubuntu-latest - name: 'x86_64' - name: scan-build (${{ matrix.target.name }}) - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/setup-apt - with: - packages: clang-tools clang - - name: make quickcheck - run: | - scan-build --status-bugs make quickcheck OPT=0 - make clean >/dev/null - scan-build --status-bugs make quickcheck OPT=1 - symlink-check: - runs-on: ubuntu-latest - steps: 
- - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: donatj/symlink-check-action@b3c737d0fd4e52752f0b8c71a03f3f775fa015cb + nmake /f ./Makefile.Microsoft_nmake OPT=${{ matrix.opt }} quickcheck diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml deleted file mode 100644 index aee2718cb2..0000000000 --- a/.github/workflows/bench.yml +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: Bench -on: - workflow_dispatch: - push: - branches: ["main"] - pull_request: - branches: ["main"] - types: [ "labeled" ] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -permissions: - contents: read - pull-requests: read - -jobs: - bench: - permissions: - contents: write - pull-requests: write - name: ${{ matrix.target.name }} - strategy: - fail-fast: true - matrix: - target: - - system: rpi4 - name: Arm Cortex-A72 (Raspberry Pi 4) benchmarks - bench_pmu: PMU - archflags: -mcpu=cortex-a72 -DMLK_SYS_AARCH64_SLOW_BARREL_SHIFTER - cflags: "-flto -DMLK_FORCE_AARCH64" - ldflags: "-flto" - bench_extra_args: "" - nix_shell: bench - - system: rpi5 - name: Arm Cortex-A76 (Raspberry Pi 5) benchmarks - bench_pmu: PERF - archflags: "-mcpu=cortex-a76 -march=armv8.2-a" - cflags: "-flto -DMLK_FORCE_AARCH64" - ldflags: "-flto" - bench_extra_args: "" - nix_shell: bench - cross_prefix: "" - - system: a55 - name: Arm Cortex-A55 (Snapdragon 888) benchmarks - bench_pmu: PERF - archflags: "-mcpu=cortex-a55 -march=armv8.2-a" - cflags: "-flto -DMLK_FORCE_AARCH64 -DMLK_CONFIG_FIPS202_BACKEND_FILE=\\\\\\\"fips202/native/aarch64/x1_scalar.h\\\\\\\"" - ldflags: "-flto -static" - bench_extra_args: -w exec-on-a55 - nix_shell: bench - - system: bpi - name: SpacemiT K1 8 (Banana Pi F3) benchmarks - bench_pmu: PERF - archflags: "-march=rv64imafdcv_zicsr_zifencei" - cflags: "" - ldflags: "-static" - bench_extra_args: -w exec-on-bpi 
- cross_prefix: riscv64-unknown-linux-gnu- - nix_shell: cross-riscv64 - - system: m1-mac-mini - name: Mac Mini (M1, 2020) benchmarks - bench_pmu: MAC - archflags: "-mcpu=apple-m1 -march=armv8.4-a+sha3" - cflags: "-flto" - ldflags: "-flto" - bench_extra_args: "-r" - nix_shell: bench - - system: pqcp-ppc64 - name: ppc64le (POWER10) benchmarks - bench_pmu: PERF - archflags: "-mcpu=native" - cflags: "-flto -DMLK_FORCE_PPC64LE" - ldflags: "-flto" - bench_extra_args: "-r" - nix_shell: '' - cross_prefix: "" - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork && (github.event.label.name == 'benchmark' || github.ref == 'refs/heads/main') - runs-on: self-hosted-${{ matrix.target.system }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/bench - with: - name: ${{ matrix.target.name }} - cflags: ${{ matrix.target.cflags }} - archflags: ${{ matrix.target.archflags }} - ldflags: ${{ matrix.target.ldflags }} - perf: ${{ matrix.target.bench_pmu }} - store_results: ${{ github.repository_owner == 'pq-code-package' && github.ref == 'refs/heads/main' }} - bench_extra_args: ${{ matrix.target.bench_extra_args }} - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - nix-shell: ${{ matrix.target.nix_shell }} - cross_prefix: ${{ matrix.target.cross_prefix }} - - ec2_all: - name: ${{ matrix.target.name }} - permissions: - contents: write - pull-requests: write - id-token: write - strategy: - fail-fast: false - matrix: - target: - - name: Graviton2 - ec2_instance_type: t4g.small - ec2_ami: ubuntu-latest (aarch64) - archflags: -mcpu=cortex-a76 -march=armv8.2-a - cflags: "-flto -DMLK_FORCE_AARCH64" - ldflags: "-flto" - perf: PERF - - name: Graviton3 - ec2_instance_type: c7g.medium - ec2_ami: ubuntu-latest (aarch64) - archflags: -march=armv8.4-a+sha3 - cflags: "-flto -DMLK_FORCE_AARCH64" - ldflags: "-flto" - perf: PERF - - name: Graviton4 - ec2_instance_type: c8g.medium - ec2_ami: ubuntu-latest 
(aarch64) - archflags: -march=armv9-a+sha3 - cflags: "-flto -DMLK_FORCE_AARCH64" - ldflags: "-flto" - perf: PERF - - name: AMD EPYC 4th gen (c7a) - ec2_instance_type: c7a.medium - ec2_ami: ubuntu-latest (x86_64) - archflags: -mavx2 -mbmi2 -mpopcnt -march=znver4 - cflags: "-flto -DMLK_FORCE_X86_64" - ldflags: "-flto" - perf: PMU - - name: Intel Xeon 4th gen (c7i) - ec2_instance_type: c7i.metal-24xl - ec2_ami: ubuntu-latest (x86_64) - archflags: -mavx2 -mbmi2 -mpopcnt -march=sapphirerapids - cflags: "-flto -DMLK_FORCE_X86_64" - ldflags: "-flto" - perf: PMU - - name: AMD EPYC 3rd gen (c6a) - ec2_instance_type: c6a.large - ec2_ami: ubuntu-latest (x86_64) - archflags: -mavx2 -mbmi2 -mpopcnt -march=znver3 - cflags: "-flto -DMLK_FORCE_X86_64" - ldflags: "-flto" - perf: PMU - - name: Intel Xeon 3rd gen (c6i) - ec2_instance_type: c6i.large - ec2_ami: ubuntu-latest (x86_64) - archflags: -mavx2 -mbmi2 -mpopcnt -march=icelake-server - cflags: "-flto -DMLK_FORCE_X86_64" - ldflags: "-flto" - perf: PMU - uses: ./.github/workflows/bench_ec2_reusable.yml - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork && (github.event.label.name == 'benchmark' || github.ref == 'refs/heads/main') - with: - ec2_instance_type: ${{ matrix.target.ec2_instance_type }} - ec2_ami: ${{ matrix.target.ec2_ami }} - archflags: ${{ matrix.target.archflags }} - cflags: ${{ matrix.target.cflags }} - ldflags: ${{ matrix.target.ldflags }} - opt: "all" - store_results: ${{ github.repository_owner == 'pq-code-package' && github.ref == 'refs/heads/main' }} # Only store optimized results - name: ${{ matrix.target.name }} - perf: ${{ matrix.target.perf }} - secrets: - AWS_GITHUB_TOKEN: ${{ secrets.AWS_GITHUB_TOKEN }} diff --git a/.github/workflows/bench_ec2_any.yml b/.github/workflows/bench_ec2_any.yml deleted file mode 100644 index 232e877917..0000000000 --- a/.github/workflows/bench_ec2_any.yml +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) The mlkem-native project 
authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: bench-ec2-any -permissions: - contents: read -on: - workflow_dispatch: - inputs: - name: - description: Alternative name of instance - default: Graviton2 - ec2_instance_type: - description: Type if EC2 instance to benchmark on - default: t4g.small - ec2_ami: - description: AMI ID - type: choice - options: - - ubuntu-latest (x86_64) - - ubuntu-latest (aarch64) - - ubuntu-latest (custom AMI) - default: ubuntu-latest (aarch64) - ec2_ami_id: - description: AMI ID - required: false - default: ami-096ea6a12ea24a797 - cflags: - description: Custom CFLAGS for compilation - default: - archflags: - description: Custom ARCH flags for compilation - default: '' - ldflags: - description: Custom LDFLAGS for linking - default: '' - opt: - description: Benchmark optimized, non-optimized, or both - type: choice - options: - - all - - opt - - no_opt - bench_extra_args: - description: Additional command line to be appended to `tests bench` script - default: '' - compiler: - description: Compiler to use. When unset, default nix shell is used. 
- default: '' -jobs: - bench-ec2-any: - name: Ad-hoc benchmark on $${{ inputs.ec2_instance_type }} - permissions: - contents: 'read' - id-token: 'write' - uses: ./.github/workflows/bench_ec2_reusable.yml - with: - ec2_instance_type: ${{ inputs.ec2_instance_type }} - ec2_ami: ${{ inputs.ec2_ami }} - ec2_ami_id: ${{ inputs.ec2_ami_id }} - cflags: ${{ inputs.cflags }} - ldflags: ${{ inputs.ldflags }} - archflags: ${{ inputs.archflags }} - opt: ${{ inputs.opt }} - name: ${{ inputs.name }} - store_results: false - bench_extra_args: ${{ inputs.bench_extra_args }} - compiler: ${{ inputs.compiler }} - secrets: - AWS_GITHUB_TOKEN: ${{ secrets.AWS_GITHUB_TOKEN }} diff --git a/.github/workflows/bench_ec2_reusable.yml b/.github/workflows/bench_ec2_reusable.yml deleted file mode 100644 index 21e760e8ce..0000000000 --- a/.github/workflows/bench_ec2_reusable.yml +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: bench-ec2-reusable -on: - workflow_call: - inputs: - name: - type: string - description: Alternative name of instance - default: Graviton2 - ec2_instance_type: - type: string - description: Type if EC2 instance to benchmark on - default: t4g.small - ec2_ami: - type: string - description: Textual description of AMI - default: ubuntu-latest (aarch64) - ec2_ami_id: - type: string - description: AMI ID - default: ami-096ea6a12ea24a797 - cflags: - type: string - description: Custom CFLAGS for compilation - default: "" - archflags: - type: string - description: Custom ARCH flags for compilation - default: -mcpu=neoverse-n1 -march=armv8.2-a - ldflags: - type: string - description: Custom LDFLAGS for linking - default: "" - opt: - type: string - description: Runs with optimized code if enabled (opt, no_opt, all) - default: "opt" - perf: - type: string - description: Method by which clock cycles should be measured (PMU | PERF) - default: PERF - store_results: - type: boolean - description: 
Indicates if results should be pushed to github pages - default: false - verbose: - description: Determine for the log verbosity - type: boolean - default: false - bench_extra_args: - type: string - description: Additional command line to be appended to `bench` script - default: '' - compiler: - type: string - description: Compiler to use. When unset, default nix shell is used. - default: '' - additional_packages: - type: string - description: Additional packages to install when custom compiler is used. - default: '' - aws_region: - type: string - default: "us-east-1" - alert_threshold: - type: string - description: "Set alert threshold in percentage for benchmark result" - default: "103%" - secrets: - AWS_GITHUB_TOKEN: - description: GitHub token used for storing benchmark results - required: true -env: - AWS_ROLE: arn:aws:iam::559050233797:role/mlkem-c-aarch64-gh-action - AMI_UBUNTU_LATEST_X86_64: ami-0e86e20dae9224db8 - AMI_UBUNTU_LATEST_AARCH64: ami-096ea6a12ea24a797 - -permissions: - contents: read - pull-requests: read - -jobs: - start-ec2-runner: - name: Start ${{ inputs.name }} (${{ inputs.ec2_instance_type }}) - permissions: - contents: 'read' - id-token: 'write' - runs-on: ubuntu-latest - if: ${{ always() }} # The point is to make this step non-cancellable, - # avoiding race conditions where an instance is started, - # but isn't yet done registering as a runner and reporting back. 
- outputs: - label: ${{ steps.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Determine AMI ID - id: det_ami_id - run: | - if [[ "${{ inputs.ec2_ami }}" == "ubuntu-latest (x86_64)" ]]; then - AMI_ID=${{ env.AMI_UBUNTU_LATEST_X86_64 }} - elif [[ "${{ inputs.ec2_ami }}" == "ubuntu-latest (aarch64)" ]]; then - AMI_ID=${{ env.AMI_UBUNTU_LATEST_AARCH64 }} - elif [[ "${{ inputs.ec2_ami }}" == "ubuntu-latest (custom AMI)" ]]; then - AMI_ID=${{ inputs.ec2_ami_id }} - fi - echo "Using AMI ID: $AMI_ID" - echo "AMI_ID=$AMI_ID" >> "$GITHUB_OUTPUT" - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0 - with: - role-to-assume: ${{ env.AWS_ROLE }} - aws-region: ${{ inputs.aws_region }} - - name: Start EC2 runner - id: start-ec2-runner - uses: machulav/ec2-github-runner@343a1b2ae682e681c3cec9a235d882da17ff04ef # v2.6.1 - with: - mode: start - github-token: ${{ secrets.AWS_GITHUB_TOKEN }} - ec2-instance-type: ${{ inputs.ec2_instance_type }} - availability-zones-config: >- - [{"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-07b2729e5e065962f","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0c7739cbd02c2c1d2","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0d69987f97f50fc1d","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0d077bf47a0eef46e","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0019f164593f6df43","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID 
}}","subnetId":"subnet-0f0d1e7667a264a0e","securityGroupId":"sg-0ab2e297196c8c381"}] - bench_nix: - name: Bench (nix) - permissions: - contents: write - pull-requests: write - runs-on: ${{ needs.start-ec2-runner.outputs.label }} - needs: start-ec2-runner # required to start the main job when the runner is ready - if: ${{ inputs.compiler == '' }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/bench - if: ${{ inputs.opt == 'all' || inputs.opt == 'opt' }} - with: - nix-verbose: ${{ inputs.verbose }} - name: ${{ inputs.name }} - cflags: ${{ inputs.cflags }} - archflags: ${{ inputs.archflags }} - ldflags: ${{ inputs.ldflags }} - opt: true - perf: ${{ inputs.perf }} - store_results: ${{ inputs.store_results }} - bench_extra_args: ${{ inputs.bench_extra_args }} - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - - uses: ./.github/actions/bench - if: ${{ inputs.opt == 'all' || inputs.opt == 'no_opt' }} - with: - nix-verbose: ${{ inputs.verbose }} - name: ${{ inputs.name }} (no-opt) - cflags: ${{ inputs.cflags }} - archflags: ${{ inputs.archflags }} - ldflags: ${{ inputs.ldflags }} - opt: false - perf: ${{ inputs.perf }} - store_results: ${{ inputs.store_results }} - bench_extra_args: ${{ inputs.bench_extra_args }} - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - alert_threshold: ${{ inputs.alert_threshold }} - bench_custom: - name: Bench (custom compiler) - permissions: - contents: write - pull-requests: write - runs-on: ${{ needs.start-ec2-runner.outputs.label }} - needs: start-ec2-runner # required to start the main job when the runner is ready - if: ${{ inputs.compiler != '' }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/setup-apt - with: - packages: ${{ inputs.additional_packages }} - - name: Set compiler - run: | - echo "CC=${{ inputs.compiler }}" >> "$GITHUB_ENV" - - uses: ./.github/actions/bench - if: ${{ inputs.opt == 'all' || inputs.opt 
== 'opt' }} - with: - nix-shell: 'bench' - custom_shell: 'bash' - nix-cache: false - nix-verbose: ${{ inputs.verbose }} - name: ${{ inputs.name }} (${{ inputs.compiler }}) - cflags: ${{ inputs.cflags }} - archflags: ${{ inputs.archflags }} - ldflags: ${{ inputs.ldflags }} - opt: true - perf: ${{ inputs.perf }} - store_results: ${{ inputs.store_results }} - bench_extra_args: ${{ inputs.bench_extra_args }} - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - - uses: ./.github/actions/bench - if: ${{ inputs.opt == 'all' || inputs.opt == 'no_opt' }} - with: - nix-shell: 'bench' - custom_shell: 'bash' - nix-cache: false - nix-verbose: ${{ inputs.verbose }} - name: ${{ inputs.name }} (${{ inputs.compiler }}) (no-opt) - cflags: ${{ inputs.cflags }} - archflags: ${{ inputs.archflags }} - ldflags: ${{ inputs.ldflags }} - opt: false - perf: ${{ inputs.perf }} - store_results: ${{ inputs.store_results }} - bench_extra_args: ${{ inputs.bench_extra_args }} - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - alert_threshold: ${{ inputs.alert_threshold }} - stop-ec2-runner: - name: Stop ${{ inputs.name }} (${{ inputs.ec2_instance_type }}) - permissions: - contents: 'read' - id-token: 'write' - needs: - - start-ec2-runner - - bench_nix # required to wait when the main job is done - - bench_custom # required to wait when the main job is done - runs-on: ubuntu-latest - if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0 - with: - role-to-assume: ${{ env.AWS_ROLE }} - aws-region: ${{ inputs.aws_region }} - - name: Stop EC2 runner - uses: machulav/ec2-github-runner@343a1b2ae682e681c3cec9a235d882da17ff04ef # v2.6.1 - with: - mode: stop - github-token: ${{ secrets.AWS_GITHUB_TOKEN }} - label: ${{ needs.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }} diff 
--git a/.github/workflows/cbmc.yml b/.github/workflows/cbmc.yml deleted file mode 100644 index 749f265ba4..0000000000 --- a/.github/workflows/cbmc.yml +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: CBMC -permissions: - contents: read -on: - workflow_call: - workflow_dispatch: - -jobs: - cbmc_k2: - name: CBMC (ML-KEM-512) - if: ${{ github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork }} - permissions: - contents: 'read' - id-token: 'write' - pull-requests: 'write' - uses: ./.github/workflows/ci_ec2_reusable.yml - with: - name: CBMC (MLKEM-512) - ec2_instance_type: r8g.xlarge - ec2_ami: ubuntu-latest (aarch64) - ec2_volume_size: 20 - compile_mode: native - opt: no_opt - lint: false - verbose: true - test: false - cbmc: true - cbmc_mlkem_k: 2 - secrets: inherit - cbmc_k3: - name: CBMC (ML-KEM-768) - if: ${{ github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork }} - permissions: - contents: 'read' - id-token: 'write' - pull-requests: 'write' - uses: ./.github/workflows/ci_ec2_reusable.yml - with: - name: CBMC (MLKEM-768) - ec2_instance_type: r8g.xlarge - ec2_ami: ubuntu-latest (aarch64) - ec2_volume_size: 20 - compile_mode: native - opt: no_opt - lint: false - verbose: true - test: false - cbmc: true - cbmc_mlkem_k: 3 - secrets: inherit - cbmc_k4: - name: CBMC (ML-KEM-1024) - if: ${{ github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork }} - permissions: - contents: 'read' - id-token: 'write' - pull-requests: 'write' - uses: ./.github/workflows/ci_ec2_reusable.yml - with: - name: CBMC (MLKEM-1024) - ec2_instance_type: r8g.xlarge - ec2_ami: ubuntu-latest (aarch64) - ec2_volume_size: 20 - compile_mode: native - opt: no_opt - lint: false - verbose: true - test: false - cbmc: true - cbmc_mlkem_k: 4 - secrets: inherit diff --git a/.github/workflows/ci.yml 
b/.github/workflows/ci.yml deleted file mode 100644 index 4c2f4bd4c1..0000000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,807 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: Extended -permissions: - contents: read -on: - workflow_call: - workflow_dispatch: - -jobs: - build_kat: - strategy: - fail-fast: false - matrix: - external: - - ${{ github.repository_owner != 'pq-code-package' }} - target: - - runner: macos-latest - name: 'MacOS (aarch64)' - arch: mac - mode: native - nix_shell: ci - - runner: macos-15-intel - name: 'MacOS (x86_64)' - arch: mac - mode: native - nix_shell: ci - - runner: ubuntu-24.04-arm - name: 'ubuntu-latest (aarch64)' - arch: aarch64 - mode: native - nix_shell: ci - - runner: ubuntu-24.04-arm - name: 'ubuntu-latest (aarch64)' - arch: x86_64 - mode: cross-x86_64 - nix_shell: cross-x86_64 - - runner: ubuntu-24.04-arm - name: 'ubuntu-latest (aarch64)' - arch: riscv64 - mode: cross-riscv64 - nix_shell: cross-riscv64 - vlen: 128 - - runner: ubuntu-24.04-arm - name: 'ubuntu-latest (aarch64)' - arch: riscv64 - mode: cross-riscv64 - nix_shell: cross-riscv64 - vlen: 256 - - runner: ubuntu-24.04-arm - name: 'ubuntu-latest (aarch64)' - arch: riscv64 - mode: cross-riscv64 - nix_shell: cross-riscv64 - vlen: 512 - - runner: ubuntu-24.04-arm - name: 'ubuntu-latest (aarch64)' - arch: riscv64 - mode: cross-riscv64 - nix_shell: cross-riscv64 - vlen: 1024 - - runner: ubuntu-24.04-arm - name: 'ubuntu-latest (aarch64)' - arch: riscv32 - mode: cross-riscv32 - nix_shell: cross-riscv32 - - runner: ubuntu-24.04-arm - name: 'ubuntu-latest (ppc64le)' - arch: ppc64le - mode: cross-ppc64le - nix_shell: cross-ppc64le - - runner: ubuntu-latest - name: 'ubuntu-latest (x86_64)' - arch: x86_64 - mode: native - nix_shell: ci - - runner: ubuntu-latest - name: 'ubuntu-latest (x86_64)' - arch: aarch64 - mode: cross-aarch64 - nix_shell: cross-aarch64 - - runner: ubuntu-latest - name: 'ubuntu-latest 
(x86_64)' - arch: aarch64_be - mode: cross-aarch64_be - nix_shell: cross-aarch64_be - exclude: - - {external: true, - target: { - runner: ubuntu-24.04-arm, - name: 'ubuntu-latest (aarch64)', - arch: aarch64, - mode: native, - nix_shell: ci - }} - - {external: true, - target: { - runner: ubuntu-24.04-arm, - name: 'ubuntu-latest (aarch64)', - arch: x86_64, - mode: cross-x86_64, - nix_shell: cross-x86_64 - }} - - {external: true, - target: { - runner: ubuntu-24.04-arm, - name: 'ubuntu-latest (aarch64)', - arch: riscv32, - mode: cross-riscv32, - nix_shell: cross-riscv32 - }} - - {external: true, - target: { - runner: ubuntu-24.04-arm, - name: 'ubuntu-latest (ppc64le)', - arch: ppc64le, - mode: cross-ppc64le, - nix_shell: cross-ppc64le - }} - - {external: true, - target: { - runner: ubuntu-latest, - name: 'ubuntu-latest (x86_64)', - arch: x86_64, - mode: native, - nix_shell: ci - }} - - {external: true, - target: { - runner: ubuntu-latest, - name: 'ubuntu-latest (x86_64)', - arch: aarch64, - mode: cross-aarch64, - nix_shell: cross-aarch64 - }} - - {external: true, - target: { - runner: ubuntu-latest, - name: 'ubuntu-latest (x86_64)', - arch: aarch64_be, - mode: cross-aarch64_be, - nix_shell: cross-aarch64_be - }} - name: Functional tests (${{ matrix.target.arch }}${{ matrix.target.mode != 'native' && ', cross' || ''}}${{ matrix.target.vlen && format(', VLEN={0}', matrix.target.vlen) || '' }}) - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: build + test (no-opt) - uses: ./.github/actions/multi-functest - # no-opt exercises only the C fallback, so RVV VLEN is irrelevant; run once for riscv64 - if: ${{ matrix.target.arch != 'riscv64' || matrix.target.vlen == 128 }} - with: - nix-shell: ${{ matrix.target.nix_shell }} - nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: ${{ matrix.target.mode }} - exec_wrapper: ${{ 
matrix.target.vlen && format('qemu-riscv64 -cpu rv64,v=true,vlen={0}', matrix.target.vlen) || '' }} - opt: 'no_opt' - - name: build + test (+debug+memsan+ubsan, native) - uses: ./.github/actions/multi-functest - if: ${{ matrix.target.mode == 'native' }} - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - cflags: "-DMLKEM_DEBUG -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all" - ldflags: "-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all" - check_namespace: 'false' - - name: build + test (cross, opt) - uses: ./.github/actions/multi-functest - # There is no native code yet on riscv32 or AArch64_be, so no point running opt tests - if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} - with: - nix-shell: ${{ matrix.target.nix_shell }} - nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: ${{ matrix.target.mode }} - exec_wrapper: ${{ matrix.target.vlen && format('qemu-riscv64 -cpu rv64,v=true,vlen={0}', matrix.target.vlen) || '' }} - opt: 'opt' - - name: build + test (cross, opt, +debug) - uses: ./.github/actions/multi-functest - # There is no native code yet on riscv32 or AArch64_be, so no point running opt tests - if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} - with: - nix-shell: ${{ matrix.target.nix_shell }} - nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: ${{ matrix.target.mode }} - exec_wrapper: ${{ matrix.target.vlen && format('qemu-riscv64 -cpu rv64,v=true,vlen={0}', matrix.target.vlen) || '' }} - cflags: "-DMLKEM_DEBUG" - opt: 'opt' - - name: build + test (cross, opt, C90) - uses: ./.github/actions/multi-functest - if: ${{ matrix.target.arch == 'riscv64' }} - with: - nix-shell: ${{ matrix.target.nix_shell }} - nix-cache: 
'true' - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: ${{ matrix.target.mode }} - exec_wrapper: ${{ matrix.target.vlen && format('qemu-riscv64 -cpu rv64,v=true,vlen={0}', matrix.target.vlen) || '' }} - cflags: "-std=c90" - opt: 'opt' - ppc64le_tests: - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork - name: Functional tests (ppc64le/POWER10, native) - runs-on: pqcp-ppc64 - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: build + test - uses: ./.github/actions/multi-functest - with: - nix-shell: '' - gh_token: ${{ secrets.GITHUB_TOKEN }} - cflags: "-DMLK_FORCE_PPC64LE" - - name: build + test (+debug) - uses: ./.github/actions/multi-functest - with: - nix-shell: '' - gh_token: ${{ secrets.GITHUB_TOKEN }} - cflags: "-DMLKEM_DEBUG -DMLK_FORCE_PPC64LE" - backend_tests: - name: AArch64 FIPS202 backends (${{ matrix.backend }}) - strategy: - fail-fast: false - matrix: - backend: [x1_scalar, x1_v84a, x2_v84a, x4_v8a_scalar, x4_v8a_v84a_scalar] - runs-on: macos-latest - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: build + test - uses: ./.github/actions/multi-functest - with: - nix-shell: 'ci' - nix-cache: 'false' - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: 'native' - opt: 'opt' - examples: 'false' - cflags: "-DMLKEM_DEBUG -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all" - ldflags: "-fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all" - check_namespace: 'false' - extra_args: "--fips202-aarch64-backend ${{ matrix.backend }}" - compiler_tests: - name: Compiler tests (${{ matrix.compiler.family }}-${{ matrix.compiler.version }}, ${{ matrix.target.name }}, ${{ matrix.cflags }}) - strategy: - fail-fast: false - matrix: - cflags: [ "-O0", "-Os", "-O3" ] - target: - - runner: ubuntu-24.04-arm - name: 'aarch64' - - runner: ubuntu-latest - name: 'x86_64' - - runner: macos-latest - name: 'macos' 
- # Oldest available + 3 latest per family; intermediate versions run in legacy-compilers. - compiler: - - family: gcc - version: "4.8" - shell: gcc48 - darwin: False - opt: all - examples: true - - family: gcc - version: "14" - shell: gcc14 - darwin: True - opt: all - examples: true - - family: gcc - version: "15" - shell: gcc15 - darwin: True - opt: all - examples: true - - family: gcc - version: "16" - shell: gcc16 - # TODO: re-add once gcc16 is no longer broken in nixpkgs-unstable - darwin: False - opt: all - examples: true - - family: clang - version: "6" - shell: clang6 - darwin: False - opt: all - examples: true - - family: clang - version: "20" - shell: clang20 - darwin: True - opt: all - examples: true - - family: clang - version: "21" - shell: clang21 - darwin: True - opt: all - examples: true - - family: clang - version: "22" - shell: clang22 - darwin: True - opt: all - examples: true - # CPU flags are not correctly passed to the zig assembler - # https://github.com/ziglang/zig/issues/23576 - # We therefore only test the C backend - # - # We omit all examples since there is currently no way to run - # only those examples not involving native code. - # - # zig 0.16 still appears affected on Linux (CPU-feature - # predefines look to be dropped in assembler-with-cpp mode). 
- - family: zig - version: "0.10" - shell: zig0_10 - darwin: False - opt: no_opt - examples: False - - family: zig - version: "0.14" - shell: zig0_14 - darwin: True - opt: no_opt - examples: False - - family: zig - version: "0.15" - shell: zig0_15 - darwin: True - opt: no_opt - examples: False - - family: zig - version: "0.16" - shell: zig0_16 - darwin: True - opt: no_opt - examples: False - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: native build+functest (default) - if: ${{ matrix.compiler.darwin || matrix.target.runner != 'macos-latest' }} - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "${{ matrix.cflags }}" - - name: native build+functest (C90) - if: ${{ matrix.compiler.darwin || matrix.target.runner != 'macos-latest' }} - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c90 ${{ matrix.cflags }}" - - name: native build+functest (C99) - if: ${{ matrix.compiler.darwin || matrix.target.runner != 'macos-latest' }} - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c99 ${{ matrix.cflags }}" - - name: native build+functest (C11) - if: ${{ matrix.compiler.darwin || matrix.target.runner != 'macos-latest' }} - uses: 
./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c11 ${{ matrix.cflags }}" - - name: native build+functest (C17) - if: ${{ (matrix.compiler.darwin || matrix.target.runner != 'macos-latest') && - (matrix.compiler.family == 'zig' || (matrix.compiler.family == 'gcc' && matrix.compiler.version >= 8) || (matrix.compiler.family == 'clang' && matrix.compiler.version >= 7)) }} - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c17 ${{ matrix.cflags }}" - - name: native build+functest (C23) - if: ${{ (matrix.compiler.darwin || matrix.target.runner != 'macos-latest') && - ((matrix.compiler.family == 'zig' && matrix.compiler.version >= 0.14) || (matrix.compiler.family == 'gcc' && matrix.compiler.version >= 14) || (matrix.compiler.family == 'clang' && matrix.compiler.version >= 18)) }} - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c23 ${{ matrix.cflags }}" - stack_analysis: - name: Stack analysis (${{ matrix.target.name }}, ${{ matrix.cflags }}) - strategy: - fail-fast: false - matrix: - external: - - ${{ github.repository_owner != 'pq-code-package' }} - target: - - runner: ubuntu-latest - name: x86_64 - - runner: ubuntu-24.04-arm - name: aarch64 - cflags: ['-O3', '-Os'] - exclude: - - external: true - runs-on: ${{ 
matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Stack analysis - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - nix-shell: valgrind-varlat_gcc15 - nix-cache: false - opt: all - cflags: "${{ matrix.cflags }}" - func: false - kat: false - acvp: false - wycheproof: false - examples: false - stack: true - check_namespace: false - unit_valgrind: - name: Unit tests + valgrind (${{ matrix.target.name }}, ${{ matrix.cflags }}) - strategy: - fail-fast: false - matrix: - external: - - ${{ github.repository_owner != 'pq-code-package' }} - target: - - runner: ubuntu-latest - name: x86_64 - - runner: ubuntu-24.04-arm - name: aarch64 - cflags: ['-O3', '-Os'] - exclude: - - external: true - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Unit tests under valgrind - uses: ./.github/actions/functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - nix-shell: valgrind-varlat_gcc15 - nix-cache: false - opt: opt - cflags: "${{ matrix.cflags }} -std=c11 -D_GNU_SOURCE -DMLK_CONFIG_FILE=\\\\\\\"../test/configs/custom_heap_alloc_config.h\\\\\\\"" - func: false - kat: false - acvp: false - wycheproof: false - examples: false - stack: false - unit: true - alloc: false - rng_fail: false - check_namespace: false - # Disable AArch64 SHA3 extension: valgrind cannot emulate it - extra_env: "MK_COMPILER_SUPPORTS_SHA3=0" - exec_wrapper: "valgrind --error-exitcode=1" - config_variations: - name: Non-standard configurations - strategy: - fail-fast: false - matrix: - external: - - ${{ github.repository_owner != 'pq-code-package' }} - target: - - runner: ubuntu-24.04-arm - name: 'ubuntu-latest (aarch64)' - - runner: ubuntu-latest - name: 'ubuntu-latest (x86_64)' - exclude: - - {external: true, - target: { - runner: ubuntu-24.04-arm, - name: 'ubuntu-latest (aarch64)', - }} - - 
{external: true, - target: { - runner: ubuntu-latest, - name: 'ubuntu-latest (x86_64)', - }} - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: "Config Variations" - uses: ./.github/actions/config-variations - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - check-cf-protections: - name: Test control-flow protections (${{ matrix.compiler.name }}, x86_64) - strategy: - fail-fast: false - matrix: - compiler: - - name: gcc-14 - shell: gcc14 - - name: gcc-15 - shell: gcc15 - - name: clang-19 - shell: clang19 - # On AArch64 -fcf-protection is not supported anyway - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Test control-flow protections - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - cflags: "-Wl,-z,cet-report=error -fcf-protection=full" - func: true - kat: true - acvp: true - nix-shell: ${{ matrix.compiler.shell }} - # ensure that kem.h and mlkem_native.h; api.h and native backends are compatible - check-apis: - strategy: - fail-fast: false - matrix: - external: - - ${{ github.repository_owner != 'pq-code-package' }} - target: - - runner: ubuntu-24.04-arm - name: 'aarch64' - - runner: ubuntu-latest - name: 'x86_64' - exclude: - - {external: true, - target: { - runner: ubuntu-24.04-arm, - name: 'aarch64' - }} - name: Check API consistency - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: make quickcheck - run: | - OPT=0 CFLAGS="-Imlkem -DMLK_CHECK_APIS -Wno-redundant-decls" make quickcheck - make clean >/dev/null - OPT=1 CFLAGS="-Imlkem -DMLK_CHECK_APIS -Wno-redundant-decls" make quickcheck - - uses: ./.github/actions/setup-apt - - name: tests func - run: | - ./scripts/tests func --cflags="-Imlkem -DMLK_CHECK_APIS -Wno-redundant-decls" - ec2_functests: - strategy: - 
fail-fast: false - matrix: - target: - - name: AMD EPYC 4th gen (t3a) - ec2_instance_type: t3a.small - ec2_ami: ubuntu-latest (x86_64) - ec2_volume_size: 30 - compile_mode: native - opt: all - config_variations: 'native-cap-CPUID_AVX2' - - name: Intel Xeon 4th gen (t3) - ec2_instance_type: t3.small - ec2_ami: ubuntu-latest (x86_64) - ec2_volume_size: 30 - compile_mode: native - opt: all - config_variations: 'native-cap-CPUID_AVX2' - - name: Graviton2 (c6g.medium) - ec2_instance_type: c6g.medium - ec2_ami: ubuntu-latest (aarch64) - ec2_volume_size: 20 - compile_mode: native - opt: all - config_variations: 'native-cap-ON native-cap-OFF native-cap-ID_AA64PFR1_EL1' - - name: Graviton3 (c7g.medium) - ec2_instance_type: c7g.medium - ec2_ami: ubuntu-latest (aarch64) - ec2_volume_size: 20 - compile_mode: native - opt: all - config_variations: 'native-cap-ID_AA64PFR1_EL1' - name: Platform tests (${{ matrix.target.name }}) - permissions: - contents: 'read' - id-token: 'write' - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork - uses: ./.github/workflows/ci_ec2_reusable.yml - with: - name: ${{ matrix.target.name }} - ec2_instance_type: ${{ matrix.target.ec2_instance_type }} - ec2_ami: ${{ matrix.target.ec2_ami }} - ec2_volume_size: ${{ matrix.target.ec2_volume_size }} - compile_mode: ${{ matrix.target.compile_mode }} - opt: ${{ matrix.target.opt }} - config_variations: ${{ matrix.target.config_variations || '' }} - functest: true - kattest: true - acvptest: true - lint: false - verbose: true - secrets: inherit - compatibility_tests: - strategy: - max-parallel: 4 - fail-fast: false - matrix: - container: - - id: debian:bullseye - nix_shell: '' - - id: debian:bookworm - nix_shell: '' - - id: nixos/nix:latest - nix_shell: 'nix-shell -p python3 gcc gnumake perl' - name: Compatibility tests (${{ matrix.container.id }}) - runs-on: ubuntu-latest - container: - ${{ matrix.container.id }} - steps: - # We're not using the checkout action 
here because on it's not supported - # on all containers we want to test. Resort to a manual checkout. - - # We can't hoist this into an action since calling an action can only - # be done after checkout. - - name: Manual checkout - shell: bash - run: | - if (which yum > /dev/null); then - yum install git -y - elif (which apt > /dev/null); then - apt update - apt install git -y - fi - - git config --global --add safe.directory "$GITHUB_WORKSPACE" - git init - git remote add origin "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY" - git fetch origin --depth 1 "$GITHUB_SHA" - git checkout FETCH_HEAD - - uses: ./.github/actions/setup-os - with: - sudo: "" - - name: make quickcheck - shell: bash - run: | - if [ -n "${{ matrix.container.nix_shell }}" ]; then - ${{ matrix.container.nix_shell }} --run "CC=gcc OPT=0 make quickcheck && make clean >/dev/null && CC=gcc OPT=1 make quickcheck" - else - CC=gcc OPT=0 make quickcheck - make clean >/dev/null - CC=gcc OPT=1 make quickcheck - fi - - name: Functional Tests - uses: ./.github/actions/multi-functest - with: - nix-shell: "" - custom_shell: ${{ matrix.container.nix_shell && format('{0} --run \"bash -e {{0}}\"', matrix.container.nix_shell) || 'bash' }} - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - ec2_compatibilitytests: - strategy: - max-parallel: 8 - fail-fast: false - matrix: - container: - - id: amazonlinux-2-aarch:base - - id: amazonlinux-2-aarch:gcc-7x - - id: amazonlinux-2-aarch:clang-7x - - id: amazonlinux-2023-aarch:base - - id: amazonlinux-2023-aarch:gcc-11x - - id: amazonlinux-2023-aarch:clang-15x - - id: amazonlinux-2023-aarch:clang-15x-sanitizer - # - id: amazonlinux-2023-aarch:cryptofuzz Not yet supported - - id: ubuntu-22.04-aarch:gcc-12x - - id: ubuntu-22.04-aarch:gcc-11x - - id: ubuntu-20.04-aarch:gcc-8x - - id: ubuntu-20.04-aarch:gcc-7x - - id: ubuntu-20.04-aarch:clang-9x - - id: ubuntu-20.04-aarch:clang-8x - - id: ubuntu-20.04-aarch:clang-7x-bm-framework - - id: ubuntu-20.04-aarch:clang-7x - - id: 
ubuntu-20.04-aarch:clang-10x - - id: ubuntu-22.04-aarch:base - - id: ubuntu-20.04-aarch:base - name: Compatibility tests (${{ matrix.container.id }}) - permissions: - contents: 'read' - id-token: 'write' - uses: ./.github/workflows/ci_ec2_container.yml - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork - with: - container: ${{ matrix.container.id }} - name: ${{ matrix.container.id }} - ec2_instance_type: t4g.small - ec2_ami: ubuntu-latest (custom AMI) - ec2_ami_id: ami-0c9bc1901ef0d1066 # Has docker images preinstalled - compile_mode: native - opt: all - functest: true - kattest: true - acvptest: true - lint: false - verbose: true - cflags: "-O0" - secrets: inherit - check_autogenerated_files: - strategy: - fail-fast: false - matrix: - target: - - system: macos-latest - nix_shell: 'cross-autogen' - nix_cache: 'true' - extra_args: '--force-cross' - # TODO: This does not yet work (#1304) - # - system: macos-15-intel - # nix_cache: 'false' - # nix_shell: 'ci' - - system: ubuntu-latest - nix_shell: 'cross-autogen' - nix_cache: 'true' - extra_args: '--force-cross' - - system: ubuntu-24.04-arm - nix_shell: 'cross-autogen' - nix_cache: 'true' - extra_args: '--force-cross' - runs-on: ${{ matrix.target.system }} - name: Check autogenerated files - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/setup-shell - with: - nix-shell: ${{ matrix.target.nix_shell }} - nix-cache: ${{ matrix.target.nix_cache }} - gh_token: ${{ secrets.GITHUB_TOKEN }} - script: | - python3 ./scripts/autogen --dry-run ${{ matrix.target.extra_args }} - check_hol_light_object_code: - strategy: - fail-fast: false - matrix: - target: - - system: macos-latest - nix_cache: 'true' - nix_shell: 'hol_light-cross-x86_64' - extra_args: '--force-cross aarch64 x86_64' - # TODO: autogen does not yet work on macos15-intel (#1304) - # - system: macos-15-intel - # nix_cache: 'false' - # nix_shell: 'ci' - - 
system: ubuntu-latest - nix_shell: 'hol_light-cross-aarch64' - nix_cache: 'true' - extra_args: '--force-cross aarch64 x86_64' - - system: ubuntu-24.04-arm - nix_shell: 'hol_light-cross-x86_64' - nix_cache: 'true' - extra_args: '--force-cross aarch64 x86_64' - runs-on: ${{ matrix.target.system }} - name: Check object code in HOL-Light proofs - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/setup-shell - with: - nix-shell: ${{ matrix.target.nix_shell }} - nix-cache: ${{ matrix.target.nix_cache }} - gh_token: ${{ secrets.GITHUB_TOKEN }} - script: | - python3 ./scripts/autogen --dry-run --update-hol-light-bytecode ${{ matrix.target.extra_args }} diff --git a/.github/workflows/ci_ec2_any.yml b/.github/workflows/ci_ec2_any.yml deleted file mode 100644 index 2662954b3e..0000000000 --- a/.github/workflows/ci_ec2_any.yml +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: ci-ec2-any -permissions: - contents: read -on: - workflow_dispatch: - inputs: - name: - description: Alternative name of instance - default: Graviton2 - ec2_instance_type: - description: Type if EC2 instance to run on - default: t4g.small - ec2_ami: - description: AMI ID - type: choice - options: - - ubuntu-latest (x86_64) - - ubuntu-latest (aarch64) - - ubuntu-latest (custom AMI) - default: ubuntu-latest (aarch64) - ec2_ami_id: - description: AMI ID - default: ami-096ea6a12ea24a797 - cflags: - description: Custom CFLAGS for compilation - default: - verbose: - description: Determine for the log verbosity - type: boolean - default: false - compile_mode: - description: Indicates the desired compilation mode (native or cross compilation), or `all` to perform both types, or `none` to skip compilation and functional testing. 
- type: choice - options: - - all - - native - - cross - - none - default: all - opt: - description: Determine whether to compile and run the opt/no_opt binary or both. - type: choice - options: - - all - - opt - - no_opt - default: all - cbmc: - description: Whether to run CBMC proofs - type: boolean - default: false -jobs: - ci-ec2-any: - name: Ad-hoc CI on ${{ inputs.ec2_instance_type }} - uses: ./.github/workflows/ci_ec2_reusable.yml - with: - name: ${{ inputs.name }} - ec2_instance_type: ${{ inputs.ec2_instance_type }} - ec2_ami: ${{ inputs.ec2_ami }} - ec2_ami_id: ${{ inputs.ec2_ami_id }} - cflags: ${{ inputs.cflags }} - compile_mode: ${{ inputs.compile_mode }} - opt: ${{ inputs.opt }} - functest: ${{ inputs.compile_mode != 'none' }} - kattest: ${{ inputs.compile_mode != 'none' }} - acvptest: ${{ inputs.compile_mode != 'none' }} - lint: true - cbmc: ${{ inputs.cbmc }} - verbose: ${{ inputs.verbose }} - secrets: inherit diff --git a/.github/workflows/ci_ec2_container.yml b/.github/workflows/ci_ec2_container.yml deleted file mode 100644 index 3142e3bb4a..0000000000 --- a/.github/workflows/ci_ec2_container.yml +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: ci-ec2-reusable -permissions: - contents: read -on: - workflow_call: - inputs: - name: - type: string - description: Alternative name of instance - default: Graviton2 - ec2_instance_type: - type: string - description: Type if EC2 instance to benchmark on - default: t4g.small - ec2_ami: - type: string - description: Textual description of AMI - default: ubuntu-latest (aarch64) - ec2_ami_id: - type: string - description: AMI ID - default: ami-096ea6a12ea24a797 - cflags: - type: string - description: Custom CFLAGS for compilation - default: "" - verbose: - description: Determine for the log verbosity - type: boolean - default: false - compile_mode: - type: string - description: either all, native, cross or none - 
default: all - opt: - type: string - description: either all, opt or no_opt - default: all - functest: - type: boolean - default: true - kattest: - type: boolean - default: true - acvptest: - type: boolean - default: true - lint: - type: boolean - default: true - cbmc: - type: boolean - default: false - cbmc_mlkem_k: - type: string - default: 2 - container: - type: string - default: '' -env: - AWS_ROLE: arn:aws:iam::559050233797:role/mlkem-c-aarch64-gh-action - AWS_REGION: us-east-1 - AMI_UBUNTU_LATEST_X86_64: ami-0e86e20dae9224db8 - AMI_UBUNTU_LATEST_AARCH64: ami-096ea6a12ea24a797 -jobs: - start-ec2-runner: - name: Start instance (${{ inputs.ec2_instance_type }}) - permissions: - contents: 'read' - id-token: 'write' - runs-on: ubuntu-latest - if: ${{ always() }} # The point is to make this step non-cancellable, - # avoiding race conditions where an instance is started, - # but isn't yet done registering as a runner and reporting back. - outputs: - label: ${{ steps.remember-runner.outputs.label }} - ec2-instance-id: ${{ steps.remember-runner.outputs.ec2-instance-id }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Determine AMI ID - id: det_ami_id - run: | - if [[ "${{ inputs.ec2_ami }}" == "ubuntu-latest (x86_64)" ]]; then - AMI_ID=${{ env.AMI_UBUNTU_LATEST_X86_64 }} - elif [[ "${{ inputs.ec2_ami }}" == "ubuntu-latest (aarch64)" ]]; then - AMI_ID=${{ env.AMI_UBUNTU_LATEST_AARCH64 }} - elif [[ "${{ inputs.ec2_ami }}" == "ubuntu-latest (custom AMI)" ]]; then - AMI_ID=${{ inputs.ec2_ami_id }} - fi - echo "Using AMI ID: $AMI_ID" - echo "AMI_ID=$AMI_ID" >> "$GITHUB_OUTPUT" - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0 - with: - role-to-assume: ${{ env.AWS_ROLE }} - aws-region: ${{ env.AWS_REGION }} - - name: Start EC2 runner - id: start-ec2-runner-first - continue-on-error: true - uses: 
machulav/ec2-github-runner@343a1b2ae682e681c3cec9a235d882da17ff04ef # v2.6.1 - with: - mode: start - github-token: ${{ secrets.AWS_GITHUB_TOKEN }} - ec2-instance-type: ${{ inputs.ec2_instance_type }} - availability-zones-config: >- - [{"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-07b2729e5e065962f","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0c7739cbd02c2c1d2","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0d69987f97f50fc1d","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0d077bf47a0eef46e","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0019f164593f6df43","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0f0d1e7667a264a0e","securityGroupId":"sg-0ab2e297196c8c381"}] - - name: Start EC2 runner (wait before retry) - if: steps.start-ec2-runner-first.outcome == 'failure' - shell: bash - run: | - sleep 30 # Wait 30s before retrying - sleep $((1 + RANDOM % 30)) - - name: Start EC2 runner (retry) - id: start-ec2-runner-second - if: steps.start-ec2-runner-first.outcome == 'failure' - uses: machulav/ec2-github-runner@343a1b2ae682e681c3cec9a235d882da17ff04ef # v2.6.1 - with: - mode: start - github-token: ${{ secrets.AWS_GITHUB_TOKEN }} - ec2-instance-type: ${{ inputs.ec2_instance_type }} - availability-zones-config: >- - [{"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-07b2729e5e065962f","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0c7739cbd02c2c1d2","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID 
}}","subnetId":"subnet-0d69987f97f50fc1d","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0d077bf47a0eef46e","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0019f164593f6df43","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0f0d1e7667a264a0e","securityGroupId":"sg-0ab2e297196c8c381"}] - - name: Remember runner - id: remember-runner - shell: bash - run: | - if [[ "${{ steps.start-ec2-runner-first.outcome }}" == "failure" ]]; then - echo "label=${{ steps.start-ec2-runner-second.outputs.label }}" >> "$GITHUB_OUTPUT" - echo "ec2-instance-id=${{ steps.start-ec2-runner-second.outputs.ec2-instance-id }}" >> "$GITHUB_OUTPUT" - else - echo "label=${{ steps.start-ec2-runner-first.outputs.label }}" >> "$GITHUB_OUTPUT" - echo "ec2-instance-id=${{ steps.start-ec2-runner-first.outputs.ec2-instance-id }}" >> "$GITHUB_OUTPUT" - fi - - tests: - name: Run tests - needs: start-ec2-runner - if: ${{ inputs.container != '' }} - runs-on: ${{ needs.start-ec2-runner.outputs.label }} - container: - localhost:5000/${{ inputs.container }} - steps: - # We're not using the checkout action here because on it's not supported - # on all containers we want to test. Resort to a manual checkout. - # - # We can't hoist this into an action since calling an action can only - # be done after checkout. 
- - name: Manual checkout - shell: bash - run: | - if /usr/bin/which yum; then - yum install git -y - elif /usr/bin/which apt; then - apt update - apt install git -y - fi - - git init - git remote add origin "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY" - git fetch origin --depth 1 "$GITHUB_SHA" - git checkout FETCH_HEAD - - uses: ./.github/actions/setup-os - with: - sudo: "" - - name: make quickcheck - run: | - OPT=0 make quickcheck - make clean >/dev/null - OPT=1 make quickcheck - - name: Functional Tests - uses: ./.github/actions/multi-functest - with: - nix-shell: "" - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - cflags: ${{ inputs.cflags }} - compile_mode: ${{ inputs.compile_mode }} - opt: ${{ inputs.opt }} - func: ${{ inputs.functest }} - kat: ${{ inputs.kattest }} - acvp: ${{ inputs.acvptest }} - stop-ec2-runner: - name: Stop instance (${{ inputs.ec2_instance_type }}) - permissions: - contents: 'read' - id-token: 'write' - needs: - - start-ec2-runner - - tests - runs-on: ubuntu-latest - if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0 - with: - role-to-assume: ${{ env.AWS_ROLE }} - aws-region: ${{ env.AWS_REGION }} - - name: Stop EC2 runner - uses: machulav/ec2-github-runner@343a1b2ae682e681c3cec9a235d882da17ff04ef # v2.6.1 - with: - mode: stop - github-token: ${{ secrets.AWS_GITHUB_TOKEN }} - label: ${{ needs.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/ci_ec2_reusable.yml b/.github/workflows/ci_ec2_reusable.yml deleted file mode 100644 index fbaceb9cec..0000000000 --- a/.github/workflows/ci_ec2_reusable.yml +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: ci-ec2-reusable -permissions: - 
contents: read -on: - workflow_call: - inputs: - name: - type: string - description: Alternative name of instance - default: Graviton2 - ec2_instance_type: - type: string - description: Type if EC2 instance to benchmark on - default: t4g.small - ec2_ami: - type: string - description: Textual description of AMI - default: ubuntu-latest (aarch64) - ec2_ami_id: - type: string - description: AMI ID - default: ami-096ea6a12ea24a797 - ec2_volume_size: - type: string - default: "" - cflags: - type: string - description: Custom CFLAGS for compilation - default: "" - verbose: - description: Determine for the log verbosity - type: boolean - default: false - compile_mode: - type: string - description: either all, native, cross or none - default: all - opt: - type: string - description: either all, opt or no_opt - default: all - functest: - type: boolean - default: true - kattest: - type: boolean - default: true - acvptest: - type: boolean - default: true - lint: - type: boolean - default: true - test: - type: boolean - default: true - config_variations: - type: string - description: List of configuration variation tests to run (space-separated IDs) or empty for no tests - default: '' - cbmc: - type: boolean - default: false - slothy: - type: boolean - default: false - cbmc_mlkem_k: - type: string - default: 2 -env: - AWS_ROLE: arn:aws:iam::559050233797:role/mlkem-c-aarch64-gh-action - AWS_REGION: us-east-1 - AMI_UBUNTU_LATEST_X86_64: ami-0e86e20dae9224db8 - AMI_UBUNTU_LATEST_AARCH64: ami-096ea6a12ea24a797 -jobs: - start-ec2-runner: - name: Start instance (${{ inputs.ec2_instance_type }}) - permissions: - contents: 'read' - id-token: 'write' - runs-on: ubuntu-latest - if: ${{ always() }} # The point is to make this step non-cancellable, - # avoiding race conditions where an instance is started, - # but isn't yet done registering as a runner and reporting back. 
- outputs: - label: ${{ steps.remember-runner.outputs.label }} - ec2-instance-id: ${{ steps.remember-runner.outputs.ec2-instance-id }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Determine AMI ID - id: det_ami_id - run: | - if [[ "${{ inputs.ec2_ami }}" == "ubuntu-latest (x86_64)" ]]; then - AMI_ID=${{ env.AMI_UBUNTU_LATEST_X86_64 }} - elif [[ "${{ inputs.ec2_ami }}" == "ubuntu-latest (aarch64)" ]]; then - AMI_ID=${{ env.AMI_UBUNTU_LATEST_AARCH64 }} - elif [[ "${{ inputs.ec2_ami }}" == "ubuntu-latest (custom AMI)" ]]; then - AMI_ID=${{ inputs.ec2_ami_id }} - fi - echo "Using AMI ID: $AMI_ID" - echo "AMI_ID=$AMI_ID" >> "$GITHUB_OUTPUT" - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0 - with: - role-to-assume: ${{ env.AWS_ROLE }} - aws-region: ${{ env.AWS_REGION }} - - name: Start EC2 runner - id: start-ec2-runner-first - continue-on-error: true - uses: machulav/ec2-github-runner@343a1b2ae682e681c3cec9a235d882da17ff04ef # v2.6.1 - with: - mode: start - github-token: ${{ secrets.AWS_GITHUB_TOKEN }} - ec2-instance-type: ${{ inputs.ec2_instance_type }} - ec2-volume-size: ${{ inputs.ec2_volume_size }} - availability-zones-config: >- - [{"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-07b2729e5e065962f","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0c7739cbd02c2c1d2","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0d69987f97f50fc1d","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0d077bf47a0eef46e","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0019f164593f6df43","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ 
steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0f0d1e7667a264a0e","securityGroupId":"sg-0ab2e297196c8c381"}] - - name: Start EC2c runner (wait before retry) - if: steps.start-ec2-runner-first.outcome == 'failure' - shell: bash - run: | - sleep 30 # Wait 30s before retrying - sleep $((1 + RANDOM % 30)) - - name: Start EC2 runner (retry) - id: start-ec2-runner-second - if: steps.start-ec2-runner-first.outcome == 'failure' - uses: machulav/ec2-github-runner@343a1b2ae682e681c3cec9a235d882da17ff04ef # v2.6.1 - with: - mode: start - github-token: ${{ secrets.AWS_GITHUB_TOKEN }} - ec2-instance-type: ${{ inputs.ec2_instance_type }} - ec2-volume-size: ${{ inputs.ec2_volume_size }} - availability-zones-config: >- - [{"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-07b2729e5e065962f","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0c7739cbd02c2c1d2","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0d69987f97f50fc1d","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0d077bf47a0eef46e","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0019f164593f6df43","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ steps.det_ami_id.outputs.AMI_ID }}","subnetId":"subnet-0f0d1e7667a264a0e","securityGroupId":"sg-0ab2e297196c8c381"}] - - name: Remember runner - id: remember-runner - shell: bash - run: | - if [[ "${{ steps.start-ec2-runner-first.outcome }}" == "failure" ]]; then - echo "label=${{ steps.start-ec2-runner-second.outputs.label }}" >> "$GITHUB_OUTPUT" - echo "ec2-instance-id=${{ steps.start-ec2-runner-second.outputs.ec2-instance-id }}" >> "$GITHUB_OUTPUT" - else - echo "label=${{ steps.start-ec2-runner-first.outputs.label }}" >> "$GITHUB_OUTPUT" - echo "ec2-instance-id=${{ 
steps.start-ec2-runner-first.outputs.ec2-instance-id }}" >> "$GITHUB_OUTPUT" - fi - - tests: - name: Run tests - needs: start-ec2-runner - runs-on: ${{ needs.start-ec2-runner.outputs.label }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Linting - if: ${{ inputs.lint }} - uses: ./.github/actions/lint - with: - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - nix-verbose: ${{ inputs.verbose }} - - name: Functional Tests - if: ${{ inputs.test }} - uses: ./.github/actions/multi-functest - with: - nix-shell: ${{ (inputs.compile_mode == 'cross' || inputs.compile_mode == 'all') && 'cross' || 'ci' }} - nix-cache: ${{ inputs.cbmc || inputs.compile_mode == 'cross' || inputs.compile_mode == 'all' }} - nix-verbose: ${{ inputs.verbose }} - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - cflags: ${{ inputs.cflags }} - compile_mode: ${{ inputs.compile_mode }} - opt: ${{ inputs.opt }} - func: ${{ inputs.functest }} - kat: ${{ inputs.kattest }} - acvp: ${{ inputs.acvptest }} - - name: Config Variations - if: ${{ inputs.config_variations != '' && (success() || failure()) }} - uses: ./.github/actions/config-variations - with: - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - tests: ${{ inputs.config_variations }} - opt: opt - - name: CBMC - if: ${{ inputs.cbmc && (success() || failure()) }} - uses: ./.github/actions/cbmc - with: - nix-shell: cbmc - nix-verbose: ${{ inputs.verbose }} - mlkem_k: ${{ inputs.cbmc_mlkem_k }} - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - - name: SLOTHY - if: ${{ inputs.slothy }} - uses: ./.github/actions/setup-shell - with: - gh_token: ${{ secrets.AWS_GITHUB_TOKEN }} - nix-cache: true - nix-shell: slothy - script: | - autogen --slothy - tests all --opt opt - # Force testing of SLOTHY-optimized Keccak variants - # We can't run the examples here because some of them also specify the backend - make clean - tests all --no-examples --opt opt 
--cflags="-DMLK_CONFIG_FIPS202_BACKEND_FILE=\\\"fips202/native/aarch64/x1_scalar.h\\\"" - make clean - tests all --no-examples --opt opt --cflags="-DMLK_CONFIG_FIPS202_BACKEND_FILE=\\\"fips202/native/aarch64/x4_v8a_scalar.h\\\"" - make clean - tests all --no-examples --opt opt --cflags="-march=armv8.4-a+sha3 -DMLK_CONFIG_FIPS202_BACKEND_FILE=\\\"fips202/native/aarch64/x4_v8a_v84a_scalar.h\\\"" - stop-ec2-runner: - name: Stop instance (${{ inputs.ec2_instance_type }}) - permissions: - contents: 'read' - id-token: 'write' - needs: - - start-ec2-runner - - tests - runs-on: ubuntu-latest - if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0 - with: - role-to-assume: ${{ env.AWS_ROLE }} - aws-region: ${{ env.AWS_REGION }} - - name: Stop EC2 runner - uses: machulav/ec2-github-runner@343a1b2ae682e681c3cec9a235d882da17ff04ef # v2.6.1 - with: - mode: stop - github-token: ${{ secrets.AWS_GITHUB_TOKEN }} - label: ${{ needs.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/ct-tests.yml b/.github/workflows/ct-tests.yml deleted file mode 100644 index 36ac58c1ba..0000000000 --- a/.github/workflows/ct-tests.yml +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: Constant-time tests -permissions: - contents: read -on: - workflow_call: - workflow_dispatch: - -jobs: - check-ct-varlat: - # Using the patched Valgrind from the KyberSlash paper to detect divisions - # In case the patch no longer applies after an update, we may want to switch back - # to stock valgrind added in https://github.com/pq-code-package/mlkem-native/pull/687 - name: CT test ${{ matrix.nix-shell }} ${{ matrix.system }} - strategy: - fail-fast: false - 
max-parallel: 10 - matrix: - system: [ubuntu-latest, ubuntu-24.04-arm] - # Oldest available + 3 latest per family; intermediate versions run in legacy-compilers. - nix-shell: - - valgrind-varlat_clang6 - - valgrind-varlat_clang20 - - valgrind-varlat_clang21 - - valgrind-varlat_clang22 - - valgrind-varlat_gcc48 - - valgrind-varlat_gcc14 - - valgrind-varlat_gcc15 - - valgrind-varlat_gcc16 - runs-on: ${{ matrix.system }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Setup nix - uses: ./.github/actions/setup-shell - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - nix-shell: ${{ matrix.nix-shell }} - nix-cache: true - - name: Build and run test (-Oz) - # -Oz got introduced in gcc12 - if: ${{ matrix.nix-shell != 'valgrind-varlat_gcc48' }} - uses: ./.github/actions/ct-test - with: - cflags: -Oz -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-Os) - uses: ./.github/actions/ct-test - with: - cflags: -Os -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-O3) - uses: ./.github/actions/ct-test - with: - cflags: -O3 -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-Ofast) - # -Ofast got deprecated in clang19; -O3 -ffast-math should be used instead - if: ${{ matrix.nix-shell != 'valgrind-varlat_clang19' && matrix.nix-shell != 'valgrind-varlat_clang20' && matrix.nix-shell != 'valgrind-varlat_clang21' && matrix.nix-shell != 'valgrind-varlat_clang22'}} - uses: ./.github/actions/ct-test - with: - cflags: -Ofast -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-O3 -ffast-math) - uses: ./.github/actions/ct-test - with: - cflags: -O3 -ffast-math -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-O2) - uses: ./.github/actions/ct-test - with: - cflags: -O2 -DMLK_CONFIG_KEYGEN_PCT 
- valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-O1) - uses: ./.github/actions/ct-test - with: - cflags: -O1 -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-O0) - uses: ./.github/actions/ct-test - with: - cflags: -O0 -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes diff --git a/.github/workflows/hol_light.yml b/.github/workflows/hol_light.yml deleted file mode 100644 index c381c287d0..0000000000 --- a/.github/workflows/hol_light.yml +++ /dev/null @@ -1,272 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: HOL-Light -permissions: - contents: read -on: - push: - branches: ["main"] - paths: - - '.github/workflows/hol_light.yml' - - 'proofs/hol_light/aarch64/Makefile' - - 'proofs/hol_light/aarch64/**/*.S' - - 'proofs/hol_light/aarch64/**/*.ml' - - 'proofs/hol_light/x86_64/Makefile' - - 'proofs/hol_light/x86_64/**/*.S' - - 'proofs/hol_light/x86_64/**/*.ml' - - 'flake.nix' - - 'flake.lock' - - 'nix/hol_light/*' - - 'nix/s2n_bignum/*' - pull_request: - branches: ["main"] - paths: - - '.github/workflows/hol_light.yml' - - 'proofs/hol_light/aarch64/Makefile' - - 'proofs/hol_light/aarch64/**/*.S' - - 'proofs/hol_light/aarch64/**/*.ml' - - 'proofs/hol_light/x86_64/Makefile' - - 'proofs/hol_light/x86_64/**/*.S' - - 'proofs/hol_light/x86_64/**/*.ml' - - 'flake.nix' - - 'flake.lock' - - 'nix/hol_light/*' - - 'nix/s2n_bignum/*' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - # The proofs also check that the byte code is up to date, - # but we use this as a fast path to not even start the proofs - # if the byte code needs updating. 
- hol_light_bytecode: - name: AArch64 HOL-Light bytecode check - runs-on: ubuntu-24.04-arm - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - with: - fetch-depth: 0 - - uses: ./.github/actions/setup-shell - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - nix-shell: 'hol_light' - script: | - autogen --update-hol-light-bytecode --dry-run - hol_light_interactive: - name: AArch64 HOL-Light interactive shell test - runs-on: ubuntu-24.04-arm - needs: [ hol_light_bytecode ] - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - with: - fetch-depth: 0 - - uses: ./.github/actions/setup-shell - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - nix-shell: 'hol_light' - script: | - make -C proofs/hol_light/aarch64 mlkem/poly_tobytes_aarch64_asm.o - echo 'needs "mlkem_native/aarch64/proofs/poly_tobytes_aarch64_asm.ml";;' | hol.sh - hol_light_proofs: - needs: [ hol_light_bytecode ] - strategy: - fail-fast: false - matrix: - proof: - # Dependencies on {name}.{S,ml} are implicit - - name: ntt_aarch64_asm - needs: ["mlkem_specs.ml", "mlkem_utils.ml", "mlkem_zetas.ml"] - - name: intt_aarch64_asm - needs: ["mlkem_specs.ml", "mlkem_utils.ml", "mlkem_zetas.ml"] - - name: poly_tomont_aarch64_asm - needs: ["mlkem_specs.ml", "mlkem_utils.ml"] - - name: poly_mulcache_compute_aarch64_asm - needs: ["mlkem_specs.ml", "mlkem_utils.ml", "mlkem_zetas.ml"] - - name: poly_reduce_aarch64_asm - needs: ["mlkem_specs.ml", "mlkem_utils.ml"] - - name: polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm - needs: ["mlkem_specs.ml", "mlkem_utils.ml"] - - name: polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm - needs: ["mlkem_specs.ml", "mlkem_utils.ml"] - - name: polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm - needs: ["mlkem_specs.ml", 
"mlkem_utils.ml"] - - name: poly_tobytes_aarch64_asm - needs: ["mlkem_specs.ml", "mlkem_utils.ml" ] - - name: rej_uniform_aarch64_asm - needs: ["mlkem_specs.ml", "mlkem_utils.ml", "mlkem_rej_uniform_table.ml"] - - name: keccak_f1600_x1_scalar_aarch64_asm - needs: ["keccak_specs.ml"] - - name: keccak_f1600_x1_v84a_aarch64_asm - needs: ["keccak_specs.ml"] - - name: keccak_f1600_x2_v84a_aarch64_asm - needs: ["keccak_specs.ml"] - - name: keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm - needs: ["keccak_specs.ml"] - - name: keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm - needs: ["keccak_specs.ml"] - name: AArch64 HOL Light proof for ${{ matrix.proof.name }}.S - runs-on: ubuntu-24.04-arm - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - with: - fetch-depth: 0 - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6 - - name: Check if dependencies changed - id: check_run - shell: bash - env: - CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} - run: | - run_needed=0 - dependencies="${{ join(matrix.proof.needs, ' ') }} ${{ format('{0}.S {0}.ml', matrix.proof.name) }}" - for changed in $CHANGED_FILES; do - for needs in $dependencies; do - if [[ "$changed" == *"$needs" ]]; then - run_needed=1 - fi - done - done - - # Always re-run upon change to nix files for HOL-Light - if [[ "$CHANGED_FILES" == *"nix/"* ]] || [[ "$CHANGED_FILES" == *"hol_light.yml"* ]] || [[ "$CHANGED_FILES" == *"flake"* ]] || [[ "$CHANGED_FILES" == *"proofs/hol_light/aarch64/Makefile"* ]]; then - run_needed=1 - fi - - echo "run_needed=${run_needed}" >> "$GITHUB_OUTPUT" - - uses: ./.github/actions/setup-shell - if: | - steps.check_run.outputs.run_needed == '1' - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - nix-shell: 'hol_light' - script: | - tests hol_light -p ${{ 
matrix.proof.name }} --verbose - - # x86_64 proofs - hol_light_bytecode_x86_64: - name: x86_64 HOL-Light bytecode check - runs-on: ubuntu-latest - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - with: - fetch-depth: 0 - - uses: ./.github/actions/setup-shell - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - nix-shell: 'hol_light' - script: | - autogen --update-hol-light-bytecode --dry-run - hol_light_interactive_x86_64: - name: x86_64 HOL-Light interactive shell test - runs-on: ubuntu-latest - needs: [ hol_light_bytecode_x86_64 ] - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - with: - fetch-depth: 0 - - uses: ./.github/actions/setup-shell - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - nix-shell: 'hol_light' - script: | - make -C proofs/hol_light/x86_64 mlkem/polyvec_basemul_acc_montgomery_cached_k2_avx2_asm.o - echo 'needs "mlkem_native/x86_64/proofs/polyvec_basemul_acc_montgomery_cached_k2_avx2_asm.ml";;' | hol.sh - hol_light_proofs_x86_64: - needs: [ hol_light_bytecode_x86_64 ] - strategy: - fail-fast: false - matrix: - proof: - # Dependencies on {name}.{S,ml} are implicit - - name: ntt_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_zetas.ml"] - - name: intt_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_zetas.ml"] - - name: polyvec_basemul_acc_montgomery_cached_k2_avx2_asm - needs: ["mlkem_specs.ml"] - - name: polyvec_basemul_acc_montgomery_cached_k3_avx2_asm - needs: ["mlkem_specs.ml"] - - name: polyvec_basemul_acc_montgomery_cached_k4_avx2_asm - needs: ["mlkem_specs.ml"] - - name: reduce_avx2_asm - needs: ["mlkem_specs.ml"] - - name: ntttobytes_avx2_asm - needs: ["mlkem_specs.ml"] - - name: rej_uniform_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_rej_uniform_table.ml"] - - name: nttfrombytes_avx2_asm 
- needs: ["mlkem_specs.ml"] - - name: tomont_avx2_asm - needs: ["mlkem_specs.ml"] - - name: nttunpack_avx2_asm - needs: ["mlkem_specs.ml"] - - name: poly_mulcache_compute_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_zetas.ml"] - - name: poly_compress_d4_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_compress_consts.ml"] - - name: poly_decompress_d4_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_compress_consts.ml"] - - name: poly_compress_d5_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_compress_consts.ml"] - - name: poly_decompress_d5_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_compress_consts.ml"] - - name: poly_compress_d10_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_compress_consts.ml"] - - name: poly_decompress_d10_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_compress_consts.ml"] - - name: poly_compress_d11_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_compress_consts.ml"] - - name: poly_decompress_d11_avx2_asm - needs: ["mlkem_specs.ml", "mlkem_compress_consts.ml"] - - name: keccak_f1600_x4_avx2_asm - needs: ["keccak_utils.ml", "keccak_spec.ml", "keccak_f1600_x4_avx2_constants.ml", "keccak_constants.ml"] - name: x86_64 HOL Light proof for ${{ matrix.proof.name }}.S - runs-on: ubuntu-latest - if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - with: - fetch-depth: 0 - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6 - - name: Check if dependencies changed - id: check_run - shell: bash - env: - CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} - run: | - run_needed=0 - dependencies="${{ join(matrix.proof.needs, ' ') }} ${{ format('{0}.S {0}.ml', matrix.proof.name) }}" - for changed in $CHANGED_FILES; do - for needs in $dependencies; do - if [[ "$changed" == *"$needs" ]]; then - run_needed=1 - fi - done - done - - # Always re-run upon change to nix 
files for HOL-Light - if [[ "$CHANGED_FILES" == *"nix/"* ]] || [[ "$CHANGED_FILES" == *"hol_light.yml"* ]] || [[ "$CHANGED_FILES" == *"flake"* ]] || [[ "$CHANGED_FILES" == *"proofs/hol_light/x86_64/Makefile"* ]]; then - run_needed=1 - fi - - echo "run_needed=${run_needed}" >> "$GITHUB_OUTPUT" - - uses: ./.github/actions/setup-shell - if: | - steps.check_run.outputs.run_needed == '1' - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - nix-shell: 'hol_light' - script: | - tests hol_light -p ${{ matrix.proof.name }} --verbose diff --git a/.github/workflows/integration-awslc-main.yml b/.github/workflows/integration-awslc-main.yml deleted file mode 100644 index 8193451aed..0000000000 --- a/.github/workflows/integration-awslc-main.yml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: AWS-LC (HEAD) -permissions: - contents: read -on: - schedule: - # Run daily at 04:00 UTC - - cron: '0 4 * * *' - workflow_dispatch: - -jobs: - awslc_head: - name: AWS-LC (HEAD) - permissions: - contents: 'read' - id-token: 'write' - uses: ./.github/workflows/integration-awslc.yml - with: - commit: main - secrets: inherit diff --git a/.github/workflows/integration-awslc.yml b/.github/workflows/integration-awslc.yml deleted file mode 100644 index ca82a63036..0000000000 --- a/.github/workflows/integration-awslc.yml +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: AWS-LC -permissions: - contents: read -on: - workflow_call: - inputs: - commit: - type: string - description: Commit to test against - default: main - repository: - type: string - description: Repository to fetch - default: aws/aws-lc - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ inputs.commit }} - cancel-in-progress: true - -env: - DEPENDENCIES: 'cmake golang unifdef' - -jobs: - aws_lc_integration_fips: - strategy: - fail-fast: 
false - matrix: - system: [ubuntu-latest, ubuntu-24.04-arm] - fips: [0,1] - name: AWS-LC FIPS test (${{ matrix.system }}, FIPS=${{ matrix.fips }}) - runs-on: ${{ matrix.system }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/setup-os - with: - packages: ${{ env.DEPENDENCIES }} - - uses: ./.github/actions/setup-aws-lc - with: - repository: ${{ inputs.repository }} - commit: ${{ inputs.commit }} - gh_token: ${{ secrets.GITHUB_TOKEN }} - - name: Apply pre-import patch if present - run: | - cd "$AWSLC_DIR" - if [ -f "$GITHUB_WORKSPACE/integration/aws-lc/pre_import.patch" ]; then - git apply "$GITHUB_WORKSPACE/integration/aws-lc/pre_import.patch" - fi - - name: Run importer - run: | - cd "$AWSLC_DIR/crypto/fipsmodule/ml_kem" - GITHUB_REPOSITORY=$GITHUB_REPOSITORY GITHUB_SHA=$GITHUB_SHA ./importer.sh --force - - name: Apply post-import patch if present - run: | - cd "$AWSLC_DIR" - if [ -f "$GITHUB_WORKSPACE/integration/aws-lc/post_import.patch" ]; then - git apply "$GITHUB_WORKSPACE/integration/aws-lc/post_import.patch" - fi - - name: Build+Test AWS-LC (FIPS=${{ matrix.fips }}) - run: | - cd "$AWSLC_DIR" - mkdir build - cd build - cmake -DFIPS=${{ matrix.fips }} .. - cd .. 
- - cmake --build ./build --target all - cmake --build ./build --target run_tests - posix: - # This is a partial parallelization of the run_posix_tests.sh script - strategy: - max-parallel: 16 - fail-fast: false - matrix: - system: [ubuntu-latest, ubuntu-24.04-arm] - test: - - name: Debug mode - flags: -DENABLE_DILITHIUM=ON - - name: Release mode - flags: -DCMAKE_BUILD_TYPE=Release -DENABLE_DILITHIUM=ON - - name: Small compilation - flags: -DOPENSSL_SMALL=1 -DCMAKE_BUILD_TYPE=Release -DENABLE_DILITHIUM=ON - - name: No-ASM - flags: -DOPENSSL_NO_ASM=1 -DCMAKE_BUILD_TYPE=Release -DENABLE_DILITHIUM=ON - - name: Shared - flags: -DBUILD_SHARED_LIBS=1 -DCMAKE_BUILD_TYPE=Release -DENABLE_DILITHIUM=ON - - name: Pre-Gen ASM - flags: -DDISABLE_PERL=ON -DENABLE_DILITHIUM=ON - - name: DIT - flags: -DENABLE_DATA_INDEPENDENT_TIMING=ON -DCMAKE_BUILD_TYPE=Release -DENABLE_DILITHIUM=ON - name: Posix test (${{ matrix.test.name }}, ${{ matrix.system }}) - runs-on: ${{ matrix.system }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/setup-os - with: - packages: ${{ env.DEPENDENCIES }} - - uses: ./.github/actions/setup-aws-lc - with: - repository: ${{ inputs.repository }} - commit: ${{ inputs.commit }} - gh_token: ${{ secrets.GITHUB_TOKEN }} - - name: Apply pre-import patch if present - run: | - cd "$AWSLC_DIR" - if [ -f "$GITHUB_WORKSPACE/integration/aws-lc/pre_import.patch" ]; then - git apply "$GITHUB_WORKSPACE/integration/aws-lc/pre_import.patch" - fi - - name: Run importer - run: | - cd "$AWSLC_DIR/crypto/fipsmodule/ml_kem" - GITHUB_REPOSITORY=$GITHUB_REPOSITORY GITHUB_SHA=$GITHUB_SHA ./importer.sh --force - - name: Apply post-import patch if present - run: | - cd "$AWSLC_DIR" - if [ -f "$GITHUB_WORKSPACE/integration/aws-lc/post_import.patch" ]; then - git apply "$GITHUB_WORKSPACE/integration/aws-lc/post_import.patch" - fi - - name: Run test - run: | - cd "$AWSLC_DIR" - source tests/ci/common_posix_setup.sh - 
build_and_test ${{ matrix.test.flags }} - prefix: - # This is a parallelization of the run_prefix_tests.sh script - strategy: - max-parallel: 8 - fail-fast: false - matrix: - system: [ubuntu-latest, ubuntu-24.04-arm, macos-latest, macos-15-intel] - test: - - name: Prefix+Debug - flags: - - name: Prefix+Release - flags: -DCMAKE_BUILD_TYPE=Release - name: Prefix test (${{ matrix.test.name }}, ${{ matrix.system }}) - runs-on: ${{ matrix.system }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/setup-os - with: - packages: ${{ env.DEPENDENCIES }} - - uses: ./.github/actions/setup-aws-lc - with: - repository: ${{ inputs.repository }} - commit: ${{ inputs.commit }} - gh_token: ${{ secrets.GITHUB_TOKEN }} - - name: Apply pre-import patch if present - run: | - cd "$AWSLC_DIR" - if [ -f "$GITHUB_WORKSPACE/integration/aws-lc/pre_import.patch" ]; then - git apply "$GITHUB_WORKSPACE/integration/aws-lc/pre_import.patch" - fi - - name: Run importer - run: | - cd "$AWSLC_DIR/crypto/fipsmodule/ml_kem" - GITHUB_REPOSITORY=$GITHUB_REPOSITORY GITHUB_SHA=$GITHUB_SHA ./importer.sh --force - - name: Apply post-import patch if present - run: | - cd "$AWSLC_DIR" - if [ -f "$GITHUB_WORKSPACE/integration/aws-lc/post_import.patch" ]; then - git apply "$GITHUB_WORKSPACE/integration/aws-lc/post_import.patch" - fi - - name: Run test - run: | - cd "$AWSLC_DIR" - source tests/ci/common_posix_setup.sh - build_prefix_and_test ${{ matrix.test.flags }} diff --git a/.github/workflows/integration-liboqs.yml b/.github/workflows/integration-liboqs.yml deleted file mode 100644 index e4fbcdcce9..0000000000 --- a/.github/workflows/integration-liboqs.yml +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: Test liboqs integration -permissions: - contents: read -on: - workflow_call: - workflow_dispatch: - -jobs: - oqs_basic_build: - strategy: - fail-fast: 
false - matrix: - include: - # x86 - - system: ubuntu-latest - name: Haswell - # ubuntu-latest may have AVX512 instructions that are not supported by valgrind - # We explicitly compile for an older uarch to work around valgrind failures - # TODO: switch this back to auto once valgrind supports AVX512 well - flags: -DOQS_DIST_BUILD=OFF -DOQS_OPT_TARGET=haswell -DCMAKE_BUILD_TYPE=Debug -DOQS_ENABLE_TEST_CONSTANT_TIME=ON - - system: ubuntu-latest - name: C - flags: -DOQS_DIST_BUILD=OFF -DOQS_OPT_TARGET=generic -DCMAKE_BUILD_TYPE=Debug -DOQS_ENABLE_TEST_CONSTANT_TIME=ON - # AArch64 - - system: ubuntu-24.04-arm - name: Auto - flags: -DOQS_DIST_BUILD=OFF -DOQS_OPT_TARGET=auto -DCMAKE_BUILD_TYPE=Debug -DOQS_ENABLE_TEST_CONSTANT_TIME=ON - - system: ubuntu-24.04-arm - name: C - flags: -DOQS_DIST_BUILD=OFF -DOQS_OPT_TARGET=generic -DCMAKE_BUILD_TYPE=Debug -DOQS_ENABLE_TEST_CONSTANT_TIME=ON - name: Build (${{ matrix.name }}, ${{ matrix.system }}) - runs-on: ${{ matrix.system }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/setup-os - with: - packages: 'cmake python3-jinja2 python3-tabulate python3-git python3-pytest valgrind' - - uses: ./.github/actions/setup-oqs - with: - commit: 'f986aea60a9f3cb4055474aa212538bb0b14f1fe' # main (2026-05-27) - gh_token: ${{ secrets.GITHUB_TOKEN }} - - name: Apply patch - run: | - cd "$LIBOQS_DIR" - # Adjust commit in importer script - sed -i "/name: mlkem-native/,/preserve_folder_structure/s%git_url: .*%git_url: $GITHUB_SERVER_URL/$GITHUB_REPOSITORY%" scripts/copy_from_upstream/copy_from_upstream.yml - sed -i "/name: mlkem-native/,/preserve_folder_structure/s/git_branch: .*/git_branch: $GITHUB_SHA/" scripts/copy_from_upstream/copy_from_upstream.yml - sed -i "/name: mlkem-native/,/preserve_folder_structure/s/git_commit: .*/git_commit: $GITHUB_SHA/" scripts/copy_from_upstream/copy_from_upstream.yml - # TODO: Remove once patch is removed upstream - # Remove patch - sed -i 
"/name: mlkem-native/,/preserve_folder_structure/{/patches:/d}" scripts/copy_from_upstream/copy_from_upstream.yml - # TODO: Remove one it has been removed upstream - # Remove CT test suppressions - echo "" > tests/constant_time/kem/passes/ml_kem - # Temporarily remove oldpqclean because of build failures in its SHA3 assembly - yq e -i 'del(.kems[] | select(.name == "kyber"))' scripts/copy_from_upstream/copy_from_upstream.yml - yq e -i 'del(.sigs[] | select(.name == "dilithium"))' scripts/copy_from_upstream/copy_from_upstream.yml - git diff >> "$GITHUB_STEP_SUMMARY"; - - name: Configure - run: | - cd "$LIBOQS_DIR" - git config --global user.name "pqcp_ci" - git config --global user.email "ci@pqcp.org" - git config --global --add safe.directory "$LIBOQS_DIR" - - name: Import mlkem-native - run: | - cd "$LIBOQS_DIR/scripts/copy_from_upstream" - ./copy_from_upstream.py copy - - name: Build libOQS - run: | - cd "$LIBOQS_DIR" - mkdir build - cd build - cmake ${{ matrix.flags }} .. - make -j"$(nproc)" - - name: Run KEM-test - run: | - cd "$LIBOQS_DIR" - python3 -m pytest -k ML-KEM diff --git a/.github/workflows/integration-pavona.yml b/.github/workflows/integration-pavona.yml deleted file mode 100644 index f6e06f5714..0000000000 --- a/.github/workflows/integration-pavona.yml +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: Pavona -permissions: - contents: read -on: - workflow_call: - workflow_dispatch: - -env: - AWS_ROLE: arn:aws:iam::559050233797:role/mlkem-c-aarch64-gh-action - AWS_REGION: us-east-1 - AMI_UBUNTU_X86_64: ami-05cf1e9f73fbad2e2 # Ubuntu 24.04 LTS (2026-04-24) - -jobs: - start-ec2-runner: - name: Start EC2 instance - permissions: - contents: 'read' - id-token: 'write' - runs-on: ubuntu-latest - if: ${{ always() && github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork }} # Make this step non-cancellable to avoid orphaned 
instances - outputs: - label: ${{ steps.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0 - with: - role-to-assume: ${{ env.AWS_ROLE }} - aws-region: ${{ env.AWS_REGION }} - - - name: Start EC2 runner - id: start-ec2-runner - uses: machulav/ec2-github-runner@343a1b2ae682e681c3cec9a235d882da17ff04ef # v2.6.1 - with: - mode: start - github-token: ${{ secrets.AWS_GITHUB_TOKEN }} - ec2-instance-type: c7i.2xlarge - ec2-volume-size: 32 - availability-zones-config: >- - [{"imageId":"${{ env.AMI_UBUNTU_X86_64 }}","subnetId":"subnet-07b2729e5e065962f","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ env.AMI_UBUNTU_X86_64 }}","subnetId":"subnet-0c7739cbd02c2c1d2","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ env.AMI_UBUNTU_X86_64 }}","subnetId":"subnet-0d69987f97f50fc1d","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ env.AMI_UBUNTU_X86_64 }}","subnetId":"subnet-0d077bf47a0eef46e","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ env.AMI_UBUNTU_X86_64 }}","subnetId":"subnet-0019f164593f6df43","securityGroupId":"sg-0ab2e297196c8c381"}, - {"imageId":"${{ env.AMI_UBUNTU_X86_64 }}","subnetId":"subnet-0f0d1e7667a264a0e","securityGroupId":"sg-0ab2e297196c8c381"}] - - pavona_test: - name: Pavona ML-KEM Test (verilator) - needs: start-ec2-runner - if: ${{ github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork }} - runs-on: ${{ needs.start-ec2-runner.outputs.label }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/setup-pavona - with: - pavona-repository: https://github.com/pavona/pavona - pavona-commit: 96b8bca4c1025e3b599b53b912ed6afc5a098115 # main 
(2026-06-01) - - - name: Patch mlkem-native dependency - run: | - cd "$PAVONA_DIR" - - # Calculate sha256 of mlkem-native at the new commit - SHA256=$(curl -sL "https://github.com/$GITHUB_REPOSITORY/archive/$GITHUB_SHA.tar.gz" | sha256sum | cut -d' ' -f1) - - # Update the extensions.bzl file with new commit and sha256 - sed -i \ - -e "s|sha256 = \"[^\"]*\"|sha256 = \"$SHA256\"|" \ - -e "s|strip_prefix = \"mlkem-native-[^\"]*\"|strip_prefix = \"mlkem-native-$GITHUB_SHA\"|" \ - -e "s|archive/[^/]*.tar.gz|archive/$GITHUB_SHA.tar.gz|" \ - third_party/mlkem_native/extensions.bzl - - # Show the changes - echo "=== Patched extensions.bzl ===" - cat third_party/mlkem_native/extensions.bzl - - - name: Patch functest to only test deterministic API - run: | - cd "$PAVONA_DIR" - # speed-up tests in CI by only running deterministic tests - git apply "$GITHUB_WORKSPACE/integration/pavona/derand-only.patch" - - - name: Run mlkem functest - run: | - cd "$PAVONA_DIR" - # Run the test - ./bazelisk.sh test \ - --test_output=streamed \ - --disk_cache="$BAZEL_CACHE_DIR" \ - --registry=https://raw.githubusercontent.com/bazelbuild/bazel-central-registry/main/ \ - //sw/device/tests/crypto:mlkem_functest_sim_verilator - - stop-ec2-runner: - name: Stop EC2 instance - permissions: - contents: 'read' - id-token: 'write' - needs: - - start-ec2-runner - - pavona_test # required to wait when the main job is done - runs-on: ubuntu-latest - if: ${{ always() && github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork }} # required to stop the runner even if errors occur - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@e7f100cf4c008499ea8adda475de1042d6975c7b # v6.2.0 - with: - role-to-assume: ${{ env.AWS_ROLE }} - aws-region: ${{ env.AWS_REGION }} - - - name: Stop EC2 runner - uses: machulav/ec2-github-runner@343a1b2ae682e681c3cec9a235d882da17ff04ef # v2.6.1 - with: - mode: stop - github-token: ${{ secrets.AWS_GITHUB_TOKEN 
}} - label: ${{ needs.start-ec2-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/legacy-compilers.yml b/.github/workflows/legacy-compilers.yml deleted file mode 100644 index b8b8e786f1..0000000000 --- a/.github/workflows/legacy-compilers.yml +++ /dev/null @@ -1,614 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: Legacy compilers -permissions: - contents: read -on: - schedule: - - cron: '0 4 * * *' - pull_request: - branches: ["main"] - types: [opened, synchronize, reopened, labeled] - workflow_dispatch: - -jobs: - compiler_tests_linux: - name: Compiler tests (${{ matrix.compiler.family }}-${{ matrix.compiler.version }}, ${{ matrix.target.name }}, ${{ matrix.cflags }}) - if: >- - github.event_name == 'schedule' || - github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, 'legacy-compiler-tests')) - strategy: - fail-fast: false - matrix: - cflags: [ "-O0", "-Os", "-O3" ] - target: - - runner: ubuntu-24.04-arm - name: 'aarch64' - - runner: ubuntu-latest - name: 'x86_64' - # All available compiler versions are tested here, including those also - # covered by ci.yml on every PR. - # C17 support starts at gcc 8, clang 7, all zig; C23 at gcc 14, clang 18, - # zig 0.14 (encoded in the C17/C23 step conditions below). 
- compiler: - - family: gcc - version: "4.8" - shell: gcc48 - opt: all - examples: true - - family: gcc - version: "4.9" - shell: gcc49 - opt: all - examples: true - - family: gcc - version: "6" - shell: gcc6 - opt: all - examples: true - - family: gcc - version: "7" - shell: gcc7 - opt: all - examples: true - - family: gcc - version: "8" - shell: gcc8 - opt: all - examples: true - - family: gcc - version: "9" - shell: gcc9 - opt: all - examples: true - - family: gcc - version: "10" - shell: gcc10 - opt: all - examples: true - - family: gcc - version: "11" - shell: gcc11 - opt: all - examples: true - - family: gcc - version: "12" - shell: gcc12 - opt: all - examples: true - - family: gcc - version: "13" - shell: gcc13 - opt: all - examples: true - - family: gcc - version: "14" - shell: gcc14 - opt: all - examples: true - - family: gcc - version: "15" - shell: gcc15 - opt: all - examples: true - - family: gcc - version: "16" - shell: gcc16 - opt: all - examples: true - - family: clang - version: "6" - shell: clang6 - opt: all - examples: true - - family: clang - version: "7" - shell: clang7 - opt: all - examples: true - - family: clang - version: "8" - shell: clang8 - opt: all - examples: true - - family: clang - version: "9" - shell: clang9 - opt: all - examples: true - - family: clang - version: "10" - shell: clang10 - opt: all - examples: true - - family: clang - version: "11" - shell: clang11 - opt: all - examples: true - - family: clang - version: "12" - shell: clang12 - opt: all - examples: true - - family: clang - version: "13" - shell: clang13 - opt: all - examples: true - - family: clang - version: "14" - shell: clang14 - opt: all - examples: true - - family: clang - version: "15" - shell: clang15 - opt: all - examples: true - - family: clang - version: "16" - shell: clang16 - opt: all - examples: true - - family: clang - version: "17" - shell: clang17 - opt: all - examples: true - - family: clang - version: "18" - shell: clang18 - opt: all - examples: true 
- - family: clang - version: "19" - shell: clang19 - opt: all - examples: true - - family: clang - version: "20" - shell: clang20 - opt: all - examples: true - - family: clang - version: "21" - shell: clang21 - opt: all - examples: true - - family: clang - version: "22" - shell: clang22 - opt: all - examples: true - # CPU flags are not correctly passed to the zig assembler - # https://github.com/ziglang/zig/issues/23576 - # We therefore only test the C backend, which is why opt is set to - # no_opt and examples are omitted (there is currently no way to run - # only those examples not involving native code). - - family: zig - version: "0.10" - shell: zig0_10 - opt: no_opt - examples: False - - family: zig - version: "0.11" - shell: zig0_11 - opt: no_opt - examples: False - - family: zig - version: "0.12" - shell: zig0_12 - opt: no_opt - examples: False - - family: zig - version: "0.13" - shell: zig0_13 - opt: no_opt - examples: False - - family: zig - version: "0.14" - shell: zig0_14 - opt: no_opt - examples: False - - family: zig - version: "0.15" - shell: zig0_15 - opt: no_opt - examples: False - - family: zig - version: "0.16" - shell: zig0_16 - opt: no_opt - examples: False - runs-on: ${{ matrix.target.runner }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: native build+functest (default) - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "${{ matrix.cflags }}" - - name: native build+functest (C90) - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ 
matrix.compiler.shell }} - cflags: "-std=c90 ${{ matrix.cflags }}" - - name: native build+functest (C99) - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c99 ${{ matrix.cflags }}" - - name: native build+functest (C11) - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c11 ${{ matrix.cflags }}" - - name: native build+functest (C17) - if: ${{ matrix.compiler.family == 'zig' || (matrix.compiler.family == 'gcc' && matrix.compiler.version >= 8) || (matrix.compiler.family == 'clang' && matrix.compiler.version >= 7) }} - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c17 ${{ matrix.cflags }}" - - name: native build+functest (C23) - if: ${{ (matrix.compiler.family == 'zig' && matrix.compiler.version >= 0.14) || (matrix.compiler.family == 'gcc' && matrix.compiler.version >= 14) || (matrix.compiler.family == 'clang' && matrix.compiler.version >= 18) }} - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c23 ${{ matrix.cflags }}" - - compiler_tests_macos: - name: 
Compiler tests (${{ matrix.compiler.family }}-${{ matrix.compiler.version }}, macos, ${{ matrix.cflags }}) - if: >- - github.event_name == 'schedule' || - github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, 'legacy-compiler-tests')) - strategy: - fail-fast: false - matrix: - cflags: [ "-O0", "-Os", "-O3" ] - # Subset of compiler_tests_linux known to build on aarch64-darwin. - compiler: - - family: gcc - version: "11" - shell: gcc11 - opt: all - examples: true - - family: gcc - version: "14" - shell: gcc14 - opt: all - examples: true - - family: gcc - version: "15" - shell: gcc15 - opt: all - examples: true - # TODO: re-add once gcc16 is no longer broken in nixpkgs-unstable - # - family: gcc - # version: "16" - # shell: gcc16 - # opt: all - # examples: true - - family: clang - version: "18" - shell: clang18 - opt: all - examples: true - - family: clang - version: "19" - shell: clang19 - opt: all - examples: true - - family: clang - version: "20" - shell: clang20 - opt: all - examples: true - - family: clang - version: "21" - shell: clang21 - opt: all - examples: true - - family: clang - version: "22" - shell: clang22 - opt: all - examples: true - - family: zig - version: "0.12" - shell: zig0_12 - opt: no_opt - examples: False - - family: zig - version: "0.13" - shell: zig0_13 - opt: no_opt - examples: False - - family: zig - version: "0.14" - shell: zig0_14 - opt: no_opt - examples: False - - family: zig - version: "0.15" - shell: zig0_15 - opt: no_opt - examples: False - - family: zig - version: "0.16" - shell: zig0_16 - opt: no_opt - examples: False - runs-on: macos-latest - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: native build+functest (default) - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: 
${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "${{ matrix.cflags }}" - - name: native build+functest (C90) - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c90 ${{ matrix.cflags }}" - - name: native build+functest (C99) - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c99 ${{ matrix.cflags }}" - - name: native build+functest (C11) - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c11 ${{ matrix.cflags }}" - - name: native build+functest (C17) - if: ${{ matrix.compiler.family == 'zig' || (matrix.compiler.family == 'gcc' && matrix.compiler.version >= 8) || (matrix.compiler.family == 'clang' && matrix.compiler.version >= 7) }} - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c17 ${{ matrix.cflags }}" - - name: native build+functest (C23) - if: ${{ (matrix.compiler.family == 'zig' && matrix.compiler.version >= 0.14) || (matrix.compiler.family == 'gcc' && matrix.compiler.version >= 
14) || (matrix.compiler.family == 'clang' && matrix.compiler.version >= 18) }} - uses: ./.github/actions/multi-functest - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - compile_mode: native - func: true - kat: false - acvp: false - wycheproof: false - examples: ${{ matrix.compiler.examples }} - opt: ${{ matrix.compiler.opt }} - nix-shell: ${{ matrix.compiler.shell }} - cflags: "-std=c23 ${{ matrix.cflags }}" - - compiler_tests_windows_mingw: - name: Compiler tests (Mingw-w64 ${{ matrix.mingw-version }}, x86_64) - if: >- - github.event_name == 'schedule' || - github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, 'legacy-compiler-tests')) - strategy: - fail-fast: false - matrix: - mingw-version: - - "5.4.0" - - "6.4.0" - - "7.5.0" - - "8.5.0" - - "9.4.0" - - "10.3.0" - - "11.2.0" - - "12.2.0" - - "13.2.0" - - "14.2.0" - - "15.2.0" - runs-on: windows-latest - steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Install MinGW-w64 - run: choco install mingw --version=${{ matrix.mingw-version }} -y - shell: cmd - - name: make quickcheck - shell: bash - run: | - CC=gcc OPT=0 make quickcheck - CC=gcc make clean >/dev/null - CC=gcc OPT=1 make quickcheck - - name: make quickcheck (AVX2) - shell: bash - run: | - CC=gcc make clean >/dev/null - CC=gcc CFLAGS="-mavx2 -mbmi2" make quickcheck - - check-ct-varlat-legacy: - name: Constant-time test ${{ matrix.compiler.family }}-${{ matrix.compiler.version }} ${{ matrix.system }} - if: >- - github.event_name == 'schedule' || - github.event_name == 'workflow_dispatch' || - (github.event_name == 'pull_request' && - contains(github.event.pull_request.labels.*.name, 'legacy-compiler-tests')) - strategy: - fail-fast: false - max-parallel: 10 - matrix: - system: [ubuntu-latest, ubuntu-24.04-arm] - compiler: - - { family: gcc, version: "4.8", shell: gcc48 } - - { family: gcc, version: "4.9", shell: gcc49 } - - { 
family: gcc, version: "6", shell: gcc6 } - - { family: gcc, version: "7", shell: gcc7 } - - { family: gcc, version: "8", shell: gcc8 } - - { family: gcc, version: "9", shell: gcc9 } - - { family: gcc, version: "10", shell: gcc10 } - - { family: gcc, version: "11", shell: gcc11 } - - { family: gcc, version: "12", shell: gcc12 } - - { family: gcc, version: "13", shell: gcc13 } - - { family: gcc, version: "14", shell: gcc14 } - - { family: gcc, version: "15", shell: gcc15 } - - { family: gcc, version: "16", shell: gcc16 } - - { family: clang, version: "6", shell: clang6 } - - { family: clang, version: "7", shell: clang7 } - - { family: clang, version: "8", shell: clang8 } - - { family: clang, version: "9", shell: clang9 } - - { family: clang, version: "10", shell: clang10 } - - { family: clang, version: "11", shell: clang11 } - - { family: clang, version: "12", shell: clang12 } - - { family: clang, version: "13", shell: clang13 } - - { family: clang, version: "14", shell: clang14 } - - { family: clang, version: "15", shell: clang15 } - - { family: clang, version: "16", shell: clang16 } - - { family: clang, version: "17", shell: clang17 } - - { family: clang, version: "18", shell: clang18 } - - { family: clang, version: "19", shell: clang19 } - - { family: clang, version: "20", shell: clang20 } - - { family: clang, version: "21", shell: clang21 } - - { family: clang, version: "22", shell: clang22 } - runs-on: ${{ matrix.system }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Setup nix - uses: ./.github/actions/setup-shell - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - nix-shell: "valgrind-varlat_${{ matrix.compiler.shell }}" - nix-cache: true - - name: Build and run test (-Oz) - # -Oz got introduced in gcc12 - if: ${{ matrix.compiler.family != 'gcc' || matrix.compiler.version >= 12 }} - uses: ./.github/actions/ct-test - with: - cflags: -Oz -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - 
name: Build and run test (-Os) - uses: ./.github/actions/ct-test - with: - cflags: -Os -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-O3) - uses: ./.github/actions/ct-test - with: - cflags: -O3 -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-Ofast) - # -Ofast got deprecated in clang19; -O3 -ffast-math should be used instead - if: ${{ !(matrix.compiler.family == 'clang' && matrix.compiler.version >= 19) }} - uses: ./.github/actions/ct-test - with: - cflags: -Ofast -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-O3 -ffast-math) - uses: ./.github/actions/ct-test - with: - cflags: -O3 -ffast-math -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-O2) - uses: ./.github/actions/ct-test - with: - cflags: -O2 -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-O1) - uses: ./.github/actions/ct-test - with: - cflags: -O1 -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes - - name: Build and run test (-O0) - uses: ./.github/actions/ct-test - with: - cflags: -O0 -DMLK_CONFIG_KEYGEN_PCT - valgrind_flags: --variable-latency-errors=yes diff --git a/.github/workflows/lint_markdown.yml b/.github/workflows/lint_markdown.yml deleted file mode 100644 index cd3858cafe..0000000000 --- a/.github/workflows/lint_markdown.yml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: Lint-Markdown -permissions: - contents: read -on: - workflow_call: - workflow_dispatch: - -jobs: - lint-markdown-link: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: Check markdown links - run: | - npm install -g markdown-link-check@3.14.2 - find . 
-name '*.md' -print0 | xargs -0 -P16 -n1 markdown-link-check -q -c .github/mlc_config.json diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml deleted file mode 100644 index ebc105e075..0000000000 --- a/.github/workflows/nix.yml +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# Copyright (c) The mldsa-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: Nix -permissions: - contents: read -on: - workflow_call: - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - check_modified_files: - runs-on: ubuntu-latest - outputs: - run_needed: ${{ steps.check_run.outputs.run_needed }} - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - if: github.event_name != 'workflow_dispatch' - with: - fetch-depth: 0 - - name: Get changed files - if: github.event_name != 'workflow_dispatch' - id: changed-files - uses: tj-actions/changed-files@9426d40962ed5378910ee2e21d5f8c6fcbf2dd96 # v47.0.6 - - name: Check if dependencies changed - id: check_run - shell: bash - env: - CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} - run: | - if [[ ${{ (github.event_name == 'workflow_dispatch' && '1') || '0' }} == "1" ]]; then - run_needed=1 - else - run_needed=0 - dependencies="flake.lock flake.nix nix/" - for changed in $CHANGED_FILES; do - for needs in $dependencies; do - if [[ "$changed" == "$needs"* ]]; then - run_needed=1 - fi - done - done - fi - echo "run_needed=${run_needed}" >> "$GITHUB_OUTPUT" - - build_nix_cache: - needs: [ check_modified_files ] - if: ${{ needs.check_modified_files.outputs.run_needed == '1' && github.ref == 'refs/heads/main' }} - permissions: - actions: 'write' - contents: 'read' - strategy: - fail-fast: false - matrix: - runner: [ ubuntu-24.04, ubuntu-24.04-arm, macos-latest ] - name: build nix cache (${{ matrix.runner }}) - runs-on: ${{ matrix.runner }} - steps: - - 
uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - uses: ./.github/actions/setup-nix - with: - cache: true - verbose: true - save_cache: true - devShell: ci - gh_token: ${{ secrets.GITHUB_TOKEN }} - script: | - # We only run cross-compilation checks for x86 on macos-latest, - # so restrict caching to the corresponding cross shell. - if [[ ${{ runner.os }} == 'macOS' ]]; then - nix develop .#cross-x86_64 --profile tmp-cross - else - nix develop .#cross --profile tmp-cross - # GH ubuntu-24.04 image tend to run outof space - if [[ ${{ matrix.runner }} == 'ubuntu-24.04' ]]; then - nix-collect-garbage - fi - fi - nix develop --profile tmp - nix-collect-garbage - develop_environment: - needs: [ check_modified_files ] - if: ${{ needs.check_modified_files.outputs.run_needed == '1' }} - strategy: - fail-fast: false - matrix: - target: - # nixpkgs requires 2.18 since August 2025, see - # https://github.com/NixOS/nixpkgs/pull/428076 - # TODO: Re-enable tests on Ubuntu 22 once nix has been updated to >= 2.18 - # - runner: ubuntu-22.04 - # container: - # install: 'apt' - - runner: ubuntu-latest - container: nixos/nix:2.18.0 - install: 'native' - - runner: ubuntu-24.04 - container: - install: 'apt' - - runner: macos-latest - container: - install: 'installer' - - runner: ubuntu-22.04 - container: - install: 'installer' - - runner: ubuntu-24.04 - container: - install: 'installer' - name: nix setup test (${{ matrix.target.container != '' && matrix.target.container || matrix.target.runner }}, nix via ${{ matrix.target.install }}) - runs-on: ${{ matrix.target.runner }} - container: - ${{ matrix.target.container }} - steps: - - name: Install git - shell: bash - run: | - if ! 
which git >/dev/null 2>&1; then - ${{ matrix.target.container == '' && 'sudo' || '' }} apt update - ${{ matrix.target.container == '' && 'sudo' || '' }} apt install git -y - fi - - name: Manual checkout - shell: bash - run: | - git init - git config --global --add safe.directory "$GITHUB_WORKSPACE" - git remote add origin "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY" - git fetch origin --depth 1 "$GITHUB_SHA" - git checkout FETCH_HEAD - - uses: ./.github/actions/setup-nix - if: ${{ matrix.target.container == '' }} - with: - gh_token: ${{ secrets.GITHUB_TOKEN }} - devShell: default - verbose: true - cache: true - install: ${{ matrix.target.install }} - - name: nix develop (in container) - if: ${{ matrix.target.container != '' }} - run: | - nix develop --experimental-features "nix-command flakes" --access-tokens "github.com=${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/riscv.yml b/.github/workflows/riscv.yml deleted file mode 100644 index c0d5197530..0000000000 --- a/.github/workflows/riscv.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: RISC-V -permissions: - contents: read -on: - workflow_call: - workflow_dispatch: -jobs: - quickcheck: - name: Quickcheck (riscv) - if: ${{ github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork }} - # via https://riseproject-dev.github.io/riscv-runner/ - runs-on: ubuntu-24.04-riscv - steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - name: make quickcheck - run: | - OPT=0 make quickcheck - make clean >/dev/null - OPT=1 make quickcheck - - uses: ./.github/actions/setup-os - - name: tests func - run: | - ./scripts/tests func --check-namespace diff --git a/.github/workflows/scorecard.yaml b/.github/workflows/scorecard.yaml deleted file mode 100644 index 539b0b0b6c..0000000000 --- a/.github/workflows/scorecard.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# 
SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: OSSF Scorecard analysis -on: - # Only the default branch is supported. - branch_protection_rule: - schedule: - # Weekly on Saturdays. - - cron: '30 1 * * 6' - push: - branches: [ main ] - -# Declare default permissions as read only. -permissions: read-all - -jobs: - analysis: - name: Scorecard analysis - runs-on: ubuntu-latest - permissions: - # Needed if using Code scanning alerts - security-events: write - # Needed for GitHub OIDC token if publish_results is true - id-token: write - - steps: - - name: "Checkout code" - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - with: - persist-credentials: false - - - name: "Run analysis" - uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 - with: - results_file: results.sarif - results_format: sarif - # (Optional) fine-grained personal access token. Uncomment the `repo_token` line below if: - # - you want to enable the Branch-Protection check on a *public* repository, or - # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-fine-grained-pat-optional. - # repo_token: ${{ secrets.SCORECARD_TOKEN }} - - # Publish the results for public repositories to enable scorecard badges. For more details, see - # https://github.com/ossf/scorecard-action#publishing-results. - # For private repositories, `publish_results` will automatically be set to `false`, regardless - # of the value entered here. - publish_results: true - - # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF - # format to the repository Actions tab. 
- - name: "Upload artifact" - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: SARIF file - path: results.sarif - retention-days: 5 - - # required for Code scanning alerts - - name: "Upload SARIF results to code scanning" - uses: github/codeql-action/upload-sarif@592977e6ae857384aa79bb31e7a1d62d63449ec5 # v2.16.3 - with: - sarif_file: results.sarif diff --git a/.github/workflows/slothy.yml b/.github/workflows/slothy.yml deleted file mode 100644 index 3573d7ef7d..0000000000 --- a/.github/workflows/slothy.yml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) The mlkem-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -name: SLOTHY re-optimization tests -permissions: - contents: read -on: - workflow_call: - workflow_dispatch: - -jobs: - slothy: - name: SLOTHY - if: ${{ github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork }} - permissions: - contents: 'read' - id-token: 'write' - uses: ./.github/workflows/ci_ec2_reusable.yml - with: - name: SLOTHY - ec2_instance_type: c8g.8xlarge - ec2_ami: ubuntu-latest (aarch64) - ec2_volume_size: 20 - lint: false - test: false - cbmc: false - slothy: true - secrets: inherit diff --git a/Makefile.Microsoft_nmake b/Makefile.Microsoft_nmake index 1803015c74..b2af8112f3 100644 --- a/Makefile.Microsoft_nmake +++ b/Makefile.Microsoft_nmake @@ -1,15 +1,44 @@ # SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT -CFLAGS = /nologo /O2 /Imlkem /Imlkem/src /Imlkem/src/fips202 /Imlkem/src/fips202/native /Imlkem/src/sys /Imlkem/src/native - -OBJ_FILES = .\mlkem\*.obj \ - .\mlkem\fips202\*.obj +CFLAGS_BASE = /nologo /O2 /Imlkem /Imlkem/src /Imlkem/src/fips202 /Imlkem/src/fips202/native /Imlkem/src/sys /Imlkem/src/native BUILD_DIR = test\build MLKEM512_BUILD_DIR = $(BUILD_DIR)\mlkem512 MLKEM768_BUILD_DIR = $(BUILD_DIR)\mlkem768 MLKEM1024_BUILD_DIR = $(BUILD_DIR)\mlkem1024 +# OPT=1 enables the native AArch64 backend: .asm sources are 
preprocessed +# with cl /EP and assembled with armasm64. Only supported when targeting +# ARM64. +!IFNDEF OPT +OPT = 0 +!ENDIF + +!IF "$(OPT)" == "1" +CFLAGS = $(CFLAGS_BASE) /DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH /DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 +ASM_PP_FLAGS = /D__ASSEMBLER__ /DMLK_ASM_ARMASM /EP /TC +NATIVE_OBJ_512 = $(MLKEM512_BUILD_DIR)\mlkem\native\*.obj $(MLKEM512_BUILD_DIR)\mlkem\fips202\native\*.obj +NATIVE_OBJ_768 = $(MLKEM768_BUILD_DIR)\mlkem\native\*.obj $(MLKEM768_BUILD_DIR)\mlkem\fips202\native\*.obj +NATIVE_OBJ_1024 = $(MLKEM1024_BUILD_DIR)\mlkem\native\*.obj $(MLKEM1024_BUILD_DIR)\mlkem\fips202\native\*.obj +NATIVE_ASM_OBJ_512 = $(MLKEM512_BUILD_DIR)\mlkem\native_asm\*.obj +NATIVE_ASM_OBJ_768 = $(MLKEM768_BUILD_DIR)\mlkem\native_asm\*.obj +NATIVE_ASM_OBJ_1024 = $(MLKEM1024_BUILD_DIR)\mlkem\native_asm\*.obj +NATIVE_DEPS_512 = $(NATIVE_OBJ_512) native_asm_512 +NATIVE_DEPS_768 = $(NATIVE_OBJ_768) native_asm_768 +NATIVE_DEPS_1024 = $(NATIVE_OBJ_1024) native_asm_1024 +!ELSE +CFLAGS = $(CFLAGS_BASE) +NATIVE_OBJ_512 = +NATIVE_OBJ_768 = +NATIVE_OBJ_1024 = +NATIVE_ASM_OBJ_512 = +NATIVE_ASM_OBJ_768 = +NATIVE_ASM_OBJ_1024 = +NATIVE_DEPS_512 = +NATIVE_DEPS_768 = +NATIVE_DEPS_1024 = +!ENDIF + OBJ_FILES_512 = $(MLKEM512_BUILD_DIR)\mlkem\*.obj \ $(MLKEM512_BUILD_DIR)\mlkem\fips202\*.obj OBJ_FILES_768 = $(MLKEM768_BUILD_DIR)\mlkem\*.obj \ @@ -17,11 +46,6 @@ OBJ_FILES_768 = $(MLKEM768_BUILD_DIR)\mlkem\*.obj \ OBJ_FILES_1024 = $(MLKEM1024_BUILD_DIR)\mlkem\*.obj \ $(MLKEM1024_BUILD_DIR)\mlkem\fips202\*.obj -# NOTE: We currently only build code for non-opt code, as we haven't yet made the assembly compatible on Windows -!IFNDEF OPT -OPT = 0 -!ENDIF - {test/notrandombytes}.c{$(BUILD_DIR)\randombytes}.obj:: @if NOT EXIST $(BUILD_DIR)\randombytes mkdir $(BUILD_DIR)\randombytes $(CC) $(CFLAGS) /c /Fo$(BUILD_DIR)\randombytes\ $< @@ -35,6 +59,18 @@ OPT = 0 @if NOT EXIST $(MLKEM512_BUILD_DIR)\mlkem\fips202 mkdir $(MLKEM512_BUILD_DIR)\mlkem\fips202 $(CC) $(CFLAGS) /D 
MLK_CONFIG_PARAMETER_SET=512 /c /Fo$(MLKEM512_BUILD_DIR)\mlkem\fips202\ $< +{mlkem\src\native\aarch64\src}.c{$(MLKEM512_BUILD_DIR)\mlkem\native}.obj:: + @if NOT EXIST $(MLKEM512_BUILD_DIR)\mlkem\native mkdir $(MLKEM512_BUILD_DIR)\mlkem\native + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=512 /c /Fo$(MLKEM512_BUILD_DIR)\mlkem\native\ $< + +{mlkem\src\fips202\native\aarch64\src}.c{$(MLKEM512_BUILD_DIR)\mlkem\fips202\native}.obj:: + @if NOT EXIST $(MLKEM512_BUILD_DIR)\mlkem\fips202\native mkdir $(MLKEM512_BUILD_DIR)\mlkem\fips202\native + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=512 /c /Fo$(MLKEM512_BUILD_DIR)\mlkem\fips202\native\ $< + +native_asm_512: + @if NOT EXIST $(MLKEM512_BUILD_DIR)\mlkem\native_asm mkdir $(MLKEM512_BUILD_DIR)\mlkem\native_asm + for %f in (mlkem\src\native\aarch64\src\*.asm mlkem\src\fips202\native\aarch64\src\*.asm) do @( $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=512 $(ASM_PP_FLAGS) %f > $(MLKEM512_BUILD_DIR)\mlkem\native_asm\%~nf.i && armasm64 -nologo $(MLKEM512_BUILD_DIR)\mlkem\native_asm\%~nf.i -o $(MLKEM512_BUILD_DIR)\mlkem\native_asm\%~nf.obj ) + {test\src}.c{$(MLKEM512_BUILD_DIR)\test}.obj:: @if NOT EXIST $(MLKEM512_BUILD_DIR)\test mkdir $(MLKEM512_BUILD_DIR)\test $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=512 /c /Fo$(MLKEM512_BUILD_DIR)\test\ $< @@ -48,6 +84,18 @@ OPT = 0 @if NOT EXIST $(MLKEM768_BUILD_DIR)\mlkem\fips202 mkdir $(MLKEM768_BUILD_DIR)\mlkem\fips202 $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=768 /c /Fo$(MLKEM768_BUILD_DIR)\mlkem\fips202\ $< +{mlkem\src\native\aarch64\src}.c{$(MLKEM768_BUILD_DIR)\mlkem\native}.obj:: + @if NOT EXIST $(MLKEM768_BUILD_DIR)\mlkem\native mkdir $(MLKEM768_BUILD_DIR)\mlkem\native + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=768 /c /Fo$(MLKEM768_BUILD_DIR)\mlkem\native\ $< + +{mlkem\src\fips202\native\aarch64\src}.c{$(MLKEM768_BUILD_DIR)\mlkem\fips202\native}.obj:: + @if NOT EXIST $(MLKEM768_BUILD_DIR)\mlkem\fips202\native mkdir $(MLKEM768_BUILD_DIR)\mlkem\fips202\native + $(CC) $(CFLAGS) 
/D MLK_CONFIG_PARAMETER_SET=768 /c /Fo$(MLKEM768_BUILD_DIR)\mlkem\fips202\native\ $< + +native_asm_768: + @if NOT EXIST $(MLKEM768_BUILD_DIR)\mlkem\native_asm mkdir $(MLKEM768_BUILD_DIR)\mlkem\native_asm + for %f in (mlkem\src\native\aarch64\src\*.asm mlkem\src\fips202\native\aarch64\src\*.asm) do @( $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=768 $(ASM_PP_FLAGS) %f > $(MLKEM768_BUILD_DIR)\mlkem\native_asm\%~nf.i && armasm64 -nologo $(MLKEM768_BUILD_DIR)\mlkem\native_asm\%~nf.i -o $(MLKEM768_BUILD_DIR)\mlkem\native_asm\%~nf.obj ) + {test\src}.c{$(MLKEM768_BUILD_DIR)\test}.obj:: @if NOT EXIST $(MLKEM768_BUILD_DIR)\test mkdir $(MLKEM768_BUILD_DIR)\test $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=768 /c /Fo$(MLKEM768_BUILD_DIR)\test\ $< @@ -61,6 +109,18 @@ OPT = 0 @if NOT EXIST $(MLKEM1024_BUILD_DIR)\mlkem\fips202 mkdir $(MLKEM1024_BUILD_DIR)\mlkem\fips202 $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=1024 /c /Fo$(MLKEM1024_BUILD_DIR)\mlkem\fips202\ $< +{mlkem\src\native\aarch64\src}.c{$(MLKEM1024_BUILD_DIR)\mlkem\native}.obj:: + @if NOT EXIST $(MLKEM1024_BUILD_DIR)\mlkem\native mkdir $(MLKEM1024_BUILD_DIR)\mlkem\native + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=1024 /c /Fo$(MLKEM1024_BUILD_DIR)\mlkem\native\ $< + +{mlkem\src\fips202\native\aarch64\src}.c{$(MLKEM1024_BUILD_DIR)\mlkem\fips202\native}.obj:: + @if NOT EXIST $(MLKEM1024_BUILD_DIR)\mlkem\fips202\native mkdir $(MLKEM1024_BUILD_DIR)\mlkem\fips202\native + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=1024 /c /Fo$(MLKEM1024_BUILD_DIR)\mlkem\fips202\native\ $< + +native_asm_1024: + @if NOT EXIST $(MLKEM1024_BUILD_DIR)\mlkem\native_asm mkdir $(MLKEM1024_BUILD_DIR)\mlkem\native_asm + for %f in (mlkem\src\native\aarch64\src\*.asm mlkem\src\fips202\native\aarch64\src\*.asm) do @( $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=1024 $(ASM_PP_FLAGS) %f > $(MLKEM1024_BUILD_DIR)\mlkem\native_asm\%~nf.i && armasm64 -nologo $(MLKEM1024_BUILD_DIR)\mlkem\native_asm\%~nf.i -o $(MLKEM1024_BUILD_DIR)\mlkem\native_asm\%~nf.obj ) + 
{test\src}.c{$(MLKEM1024_BUILD_DIR)\test}.obj:: @if NOT EXIST $(MLKEM1024_BUILD_DIR)\test mkdir $(MLKEM1024_BUILD_DIR)\test $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=1024 /c /Fo$(MLKEM1024_BUILD_DIR)\test\ $< @@ -81,34 +141,34 @@ OPT = 0 $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=1024 /c /Fo$(MLKEM1024_BUILD_DIR)\test\acvp\ $< # compile functional test for mlkem512 -test_mlkem512: $(OBJ_FILES_512) $(MLKEM512_BUILD_DIR)\test\test_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj +test_mlkem512: $(OBJ_FILES_512) $(NATIVE_DEPS_512) $(MLKEM512_BUILD_DIR)\test\test_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj @if NOT EXIST $(MLKEM512_BUILD_DIR)\bin mkdir $(MLKEM512_BUILD_DIR)\bin - $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=512 /Fe$(MLKEM512_BUILD_DIR)\bin\test_mlkem512 $** /link + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=512 /Fe$(MLKEM512_BUILD_DIR)\bin\test_mlkem512 $(OBJ_FILES_512) $(NATIVE_OBJ_512) $(NATIVE_ASM_OBJ_512) $(MLKEM512_BUILD_DIR)\test\test_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj /link # compile functional test for mlkem768 -test_mlkem768: $(OBJ_FILES_768) $(MLKEM768_BUILD_DIR)\test\test_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj +test_mlkem768: $(OBJ_FILES_768) $(NATIVE_DEPS_768) $(MLKEM768_BUILD_DIR)\test\test_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj @if NOT EXIST $(MLKEM768_BUILD_DIR)\bin mkdir $(MLKEM768_BUILD_DIR)\bin - $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=768 /Fe$(MLKEM768_BUILD_DIR)\bin\test_mlkem768 $** /link + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=768 /Fe$(MLKEM768_BUILD_DIR)\bin\test_mlkem768 $(OBJ_FILES_768) $(NATIVE_OBJ_768) $(NATIVE_ASM_OBJ_768) $(MLKEM768_BUILD_DIR)\test\test_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj /link # compile functional test for mlkem1024 -test_mlkem1024: $(OBJ_FILES_1024) $(MLKEM1024_BUILD_DIR)\test\test_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj +test_mlkem1024: $(OBJ_FILES_1024) $(NATIVE_DEPS_1024) 
$(MLKEM1024_BUILD_DIR)\test\test_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj @if NOT EXIST $(MLKEM1024_BUILD_DIR)\bin mkdir $(MLKEM1024_BUILD_DIR)\bin - $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=1024 /Fe$(MLKEM1024_BUILD_DIR)\bin\test_mlkem1024 $** /link + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=1024 /Fe$(MLKEM1024_BUILD_DIR)\bin\test_mlkem1024 $(OBJ_FILES_1024) $(NATIVE_OBJ_1024) $(NATIVE_ASM_OBJ_1024) $(MLKEM1024_BUILD_DIR)\test\test_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj /link # compile acvp test for mlkem512 -acvp_mlkem512: $(OBJ_FILES_512) $(MLKEM512_BUILD_DIR)\test\acvp\acvp_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj +acvp_mlkem512: $(OBJ_FILES_512) $(NATIVE_DEPS_512) $(MLKEM512_BUILD_DIR)\test\acvp\acvp_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj @if NOT EXIST $(MLKEM512_BUILD_DIR)\bin mkdir $(MLKEM512_BUILD_DIR)\bin - $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=512 /Fe$(MLKEM512_BUILD_DIR)\bin\acvp_mlkem512 $** /link + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=512 /Fe$(MLKEM512_BUILD_DIR)\bin\acvp_mlkem512 $(OBJ_FILES_512) $(NATIVE_OBJ_512) $(NATIVE_ASM_OBJ_512) $(MLKEM512_BUILD_DIR)\test\acvp\acvp_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj /link # compile acvp test for mlkem768 -acvp_mlkem768: $(OBJ_FILES_768) $(MLKEM768_BUILD_DIR)\test\acvp\acvp_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj +acvp_mlkem768: $(OBJ_FILES_768) $(NATIVE_DEPS_768) $(MLKEM768_BUILD_DIR)\test\acvp\acvp_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj @if NOT EXIST $(MLKEM768_BUILD_DIR)\bin mkdir $(MLKEM768_BUILD_DIR)\bin - $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=768 /Fe$(MLKEM768_BUILD_DIR)\bin\acvp_mlkem768 $** /link + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=768 /Fe$(MLKEM768_BUILD_DIR)\bin\acvp_mlkem768 $(OBJ_FILES_768) $(NATIVE_OBJ_768) $(NATIVE_ASM_OBJ_768) $(MLKEM768_BUILD_DIR)\test\acvp\acvp_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj /link # compile acvp test for mlkem1024 
-acvp_mlkem1024: $(OBJ_FILES_1024) $(MLKEM1024_BUILD_DIR)\test\acvp\acvp_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj +acvp_mlkem1024: $(OBJ_FILES_1024) $(NATIVE_DEPS_1024) $(MLKEM1024_BUILD_DIR)\test\acvp\acvp_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj @if NOT EXIST $(MLKEM1024_BUILD_DIR)\bin mkdir $(MLKEM1024_BUILD_DIR)\bin - $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=1024 /Fe$(MLKEM1024_BUILD_DIR)\bin\acvp_mlkem1024 $** /link + $(CC) $(CFLAGS) /D MLK_CONFIG_PARAMETER_SET=1024 /Fe$(MLKEM1024_BUILD_DIR)\bin\acvp_mlkem1024 $(OBJ_FILES_1024) $(NATIVE_OBJ_1024) $(NATIVE_ASM_OBJ_1024) $(MLKEM1024_BUILD_DIR)\test\acvp\acvp_mlkem.obj $(BUILD_DIR)\randombytes\notrandombytes.obj /link acvp: acvp_mlkem512 acvp_mlkem768 acvp_mlkem1024 diff --git a/mlkem/src/common.h b/mlkem/src/common.h index 463029bde9..d9e6e8083b 100644 --- a/mlkem/src/common.h +++ b/mlkem/src/common.h @@ -77,7 +77,10 @@ * -fcf-protection=), we add an endbr64 instruction at every global function * label. See sys.h for more details */ -#if defined(MLK_SYS_X86_64) +#if defined(MLK_ASM_ARMASM) +/* armasm64 labels are colon-free */ +#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) +#elif defined(MLK_SYS_X86_64) #define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) : MLK_CET_ENDBR #elif defined(MLK_SYS_ARMV81M_MVE) /* clang-format off */ diff --git a/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.asm b/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.asm new file mode 100644 index 0000000000..52e7c12b65 --- /dev/null +++ b/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.asm @@ -0,0 +1,351 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. 
+ */ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// Author: Hanno Becker +// Author: Matthias Kannwischer + +/*yaml + Name: keccak_f1600_x1_scalar_aarch64_asm + Description: AArch64 scalar implementation of Keccak-f[1600] permutation for single state + Signature: void mlk_keccak_f1600_x1_scalar_aarch64_asm(uint64_t state[25], const uint64_t rc[24]) + ABI: + x0: + type: buffer + size_bytes: 200 + permissions: read/write + c_parameter: uint64_t state[25] + description: Keccak state (25 x uint64_t) + x1: + type: buffer + size_bytes: 192 + permissions: read-only + c_parameter: uint64_t const *rc + description: Round constants (24 x uint64_t) + Stack: + bytes: 128 + description: register preservation and temporary storage +*/ + +#include "../../../../common.h" +#if defined(MLK_FIPS202_AARCH64_NEED_X1_SCALAR) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/fips202/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(keccak_f1600_x1_scalar_aarch64_asm) +MLK_ASM_FN_SYMBOL(keccak_f1600_x1_scalar_aarch64_asm) + + sub sp, sp, #0x80 + stp x19, x20, [sp, #0x20] + stp x21, x22, [sp, #0x30] + stp x23, x24, [sp, #0x40] + stp x25, x26, [sp, #0x50] + stp x27, x28, [sp, #0x60] + stp x29, x30, [sp, #0x70] + +Lkeccak_f1600_x1_scalar_initial + mov x26, x1 + str x1, [sp, #0x8] + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + str x0, [sp] + eor x30, x24, x25 + eor x27, x9, x10 + eor x0, x30, x21 + eor x26, x27, x6 + eor x27, x26, x7 + eor x29, x0, x22 + eor x26, x29, x23 + eor x29, x4, x5 + eor x30, x29, x1 + eor x0, x27, x8 + eor x29, x30, x2 + eor x30, x19, x20 + eor x30, x30, x16 + eor x27, x26, x0, ror #63 + eor x4, x4, x27 + eor x30, x30, x17 + eor x30, x30, x28 + eor x29, x29, x3 + eor x0, x0, x30, ror #63 + eor x30, x30, x29, ror #63 + eor x22, x22, x30 + eor x23, x23, x30 + str x23, [sp, #0x18] + eor x23, x14, x15 + eor x14, x14, x0 + eor x23, x23, x11 + eor x15, x15, x0 + eor x1, x1, x27 + eor x23, x23, x12 + eor x23, x23, x13 + eor x11, x11, x0 + eor x29, x29, x23, ror #63 + eor x23, x23, x26, ror #63 + eor x26, x13, x0 + eor x13, x28, x23 + eor x28, x24, x30 + eor x24, x16, x23 + eor x16, x21, x30 + eor x21, x25, x30 + eor x30, x19, x23 + eor x19, x20, x23 + eor x20, x17, x23 + eor x17, x12, x0 + eor x0, x2, x27 + eor x2, x6, x29 + eor x6, x8, x29 + bic x8, x28, x13, ror #47 + eor x12, x3, x27 + bic x3, x13, x17, ror #19 + eor x5, x5, x27 + ldr x27, [sp, #0x18] + bic x25, x17, x2, ror #5 + eor x9, x9, x29 + eor x23, x25, x5, ror #52 + eor x3, x3, x2, ror #24 + eor x8, x8, x17, ror #2 + eor x17, x10, x29 + bic x25, x12, x22, 
ror #47 + eor x29, x7, x29 + bic x10, x4, x27, ror #2 + bic x7, x5, x28, ror #10 + eor x10, x10, x20, ror #50 + eor x13, x7, x13, ror #57 + bic x7, x2, x5, ror #47 + eor x2, x25, x24, ror #39 + bic x25, x20, x11, ror #57 + bic x5, x17, x4, ror #25 + eor x25, x25, x17, ror #53 + bic x17, x11, x17, ror #60 + eor x28, x7, x28, ror #57 + bic x7, x9, x12, ror #42 + eor x7, x7, x22, ror #25 + bic x22, x22, x24, ror #56 + bic x24, x24, x15, ror #31 + eor x22, x22, x15, ror #23 + bic x20, x27, x20, ror #48 + bic x15, x15, x9, ror #16 + eor x12, x15, x12, ror #58 + eor x15, x5, x27, ror #27 + eor x5, x20, x11, ror #41 + ldr x11, [sp, #0x8] + eor x20, x17, x4, ror #21 + eor x17, x24, x9, ror #47 + mov x24, #0x1 // =1 + bic x9, x0, x16, ror #9 + str x24, [sp, #0x10] + bic x24, x29, x1, ror #44 + bic x27, x1, x21, ror #50 + bic x4, x26, x29, ror #63 + eor x1, x1, x4, ror #21 + ldr x11, [x11] + bic x4, x21, x30, ror #57 + eor x21, x24, x21, ror #30 + eor x24, x9, x19, ror #44 + bic x9, x14, x6, ror #5 + eor x9, x9, x0, ror #43 + bic x0, x6, x0, ror #38 + eor x1, x1, x11 + eor x11, x4, x26, ror #35 + eor x4, x0, x16, ror #47 + bic x0, x16, x19, ror #35 + eor x16, x27, x30, ror #43 + bic x27, x30, x26, ror #42 + bic x26, x19, x14, ror #41 + eor x19, x0, x14, ror #12 + eor x14, x26, x6, ror #46 + eor x6, x27, x29, ror #41 + +Lkeccak_f1600_x1_scalar_loop + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + eor x26, x8, x9, ror #57 + eor x27, x0, x14, ror #10 + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + eor x30, x23, x22, ror #50 + eor x0, x26, x10, ror #31 + eor x29, x29, x19, ror #37 + eor x27, x27, x12, ror #5 + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + eor x26, x30, x21, ror #26 + eor x26, x26, x25, ror #15 + ror x30, x27, #0x3e + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + eor x16, x30, x16 + eor x28, x30, x28, ror #63 + str x28, [sp, #0x18] + eor x29, x29, x17, ror #36 + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + eor x29, x29, 
x20, ror #2 + eor x28, x28, x4, ror #54 + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + eor x28, x28, x5, ror #25 + ror x0, x0, #0x38 + eor x0, x0, x29, ror #63 + eor x27, x28, x27, ror #61 + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + eor x29, x30, x20, ror #2 + eor x20, x26, x3, ror #39 + eor x11, x0, x11, ror #50 + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + eor x24, x28, x24, ror #28 + eor x1, x30, x17, ror #36 + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + eor x8, x27, x8, ror #56 + eor x17, x27, x7, ror #19 + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + eor x4, x26, x4, ror #54 + eor x0, x0, x12, ror #3 + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + eor x26, x26, x5, ror #25 + eor x2, x7, x16, ror #39 + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + eor x7, x7, x22, ror #25 + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + eor x30, x27, x6, ror #43 + eor x22, x20, x15, ror #23 + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + bic x5, x13, x17, ror #63 + eor x5, x21, x5, ror #21 + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + bic x21, x21, x25, ror #50 + bic x20, x27, x4, ror #25 + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + eor x21, x17, x25, ror #30 + bic x19, x25, x19, ror #57 + ldr x25, [sp, #0x10] + eor x17, x10, x9, ror #47 + ldr x9, [sp, #0x8] + eor x15, x20, x28, ror #27 + bic x20, x4, x28, ror #2 + eor x10, x20, x1, ror #50 + bic x20, x11, x27, ror #60 + eor x20, x20, x4, ror #21 + bic x4, x28, x1, ror #48 + bic x1, x1, x11, ror #57 + ldr x28, [x9, x25, lsl #3] + ldr x9, [sp, #0x18] + add x25, x25, #0x1 + str x25, [sp, #0x10] + cmp x25, #0x17 + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + eor x11, x19, x13, ror #35 + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + bic x27, x24, x9, ror #47 + bic x19, x23, x3, ror #9 + bic 
x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic x29, x3, x29, ror #35 + eor x13, x13, x9, ror #57 + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + bic x14, x14, x8, ror #5 + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + b.le Lkeccak_f1600_x1_scalar_loop + ror x6, x6, #0x2b + ror x11, x11, #0x32 + ror x21, x21, #0x14 + ror x2, x2, #0x3d + ror x7, x7, #0x13 + ror x12, x12, #0x3 + ror x17, x17, #0x24 + ror x22, x22, #0x2c + ror x3, x3, #0x27 + ror x8, x8, #0x38 + ror x13, x13, #0x2e + ror x28, x28, #0x3f + ror x23, x23, #0x3a + ror x4, x4, #0x36 + ror x9, x9, #0x31 + ror x14, x14, #0x8 + ror x19, x19, #0x25 + ror x24, x24, #0x1c + ror x5, x5, #0x19 + ror x10, x10, #0x17 + ror x15, x15, #0x3e + ror x20, x20, #0x2 + ror x25, x25, #0x9 + ldr x0, [sp] + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + ldp x19, x20, [sp, #0x20] + ldp x21, x22, [sp, #0x30] + ldp x23, x24, [sp, #0x40] + ldp x25, x26, [sp, #0x50] + ldp x27, x28, [sp, #0x60] + ldp x29, x30, [sp, #0x70] + add sp, sp, #0x80 + ret + +MLK_ASM_FN_SIZE(keccak_f1600_x1_scalar_aarch64_asm) + +#endif /* MLK_FIPS202_AARCH64_NEED_X1_SCALAR && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +#endif + + END diff --git a/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.asm b/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.asm new file mode 100644 index 0000000000..04c1a4e2aa --- /dev/null +++ b/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.asm @@ -0,0 +1,187 @@ +/* + 
* WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. + */ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [HYBRID] + * Hybrid scalar/vector implementations of Keccak and SPHINCS+ on AArch64 + * Becker, Kannwischer + * https://eprint.iacr.org/2022/1243 + */ + +/*yaml + Name: keccak_f1600_x1_v84a_aarch64_asm + Description: AArch64 ARMv8.4-A implementation of Keccak-f[1600] permutation for single state + Signature: void mlk_keccak_f1600_x1_v84a_aarch64_asm(uint64_t state[25], const uint64_t rc[24]) + ABI: + x0: + type: buffer + size_bytes: 200 + permissions: read/write + c_parameter: uint64_t state[25] + description: Keccak state (25 x uint64_t) + x1: + type: buffer + size_bytes: 192 + permissions: read-only + c_parameter: const uint64_t rc[24] + description: Round constants (24 x uint64_t) + Stack: + bytes: 64 + description: register preservation +*/ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// +// This implementation is essentially from the paper @[HYBRID]. +// The only difference is interleaving/deinterleaving of Keccak state +// during load and store, so that the caller need not do this. +// + +#include "../../../../common.h" +#if defined(MLK_FIPS202_AARCH64_NEED_X1_V84A) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#if defined(__ARM_FEATURE_SHA3) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/fips202/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(keccak_f1600_x1_v84a_aarch64_asm) +MLK_ASM_FN_SYMBOL(keccak_f1600_x1_v84a_aarch64_asm) + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + ldp d0, d1, [x0] + ldp d2, d3, [x0, #0x10] + ldp d4, d5, [x0, #0x20] + ldp d6, d7, [x0, #0x30] + ldp d8, d9, [x0, #0x40] + ldp d10, d11, [x0, #0x50] + ldp d12, d13, [x0, #0x60] + ldp d14, d15, [x0, #0x70] + ldp d16, d17, [x0, #0x80] + ldp d18, d19, [x0, #0x90] + ldp d20, d21, [x0, #0xa0] + ldp d22, d23, [x0, #0xb0] + ldr d24, [x0, #0xc0] + mov x2, #0x18 // =24 + +Lkeccak_f1600_x1_v84a_loop + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor3 v29.16b, v1.16b, v6.16b, v11.16b + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor3 v30.16b, v30.16b, v15.16b, v20.16b + eor3 v29.16b, v29.16b, v16.16b, v21.16b + eor3 v28.16b, v28.16b, v17.16b, v22.16b + eor3 v27.16b, v27.16b, v18.16b, v23.16b + eor3 v26.16b, v26.16b, v19.16b, v24.16b + rax1 v25.2d, v30.2d, v28.2d + rax1 v28.2d, v28.2d, v26.2d + rax1 v26.2d, v26.2d, v29.2d + rax1 v29.2d, v29.2d, v27.2d + rax1 v27.2d, v27.2d, v30.2d + eor v30.16b, v0.16b, v26.16b + xar v0.2d, v2.2d, v29.2d, #0x2 + xar v2.2d, v12.2d, v29.2d, #0x15 + xar v12.2d, v13.2d, v28.2d, #0x27 + xar v13.2d, v19.2d, v27.2d, #0x38 + xar v19.2d, v23.2d, v28.2d, #0x8 + xar v23.2d, v15.2d, v26.2d, #0x17 + xar v15.2d, v1.2d, v25.2d, #0x3f + xar v1.2d, v8.2d, v28.2d, #0x9 + xar v8.2d, v16.2d, v25.2d, #0x13 + xar v16.2d, v7.2d, v29.2d, #0x3a + xar v7.2d, v10.2d, v26.2d, #0x3d + xar v10.2d, v3.2d, v28.2d, #0x24 + xar v3.2d, v18.2d, v28.2d, #0x2b + xar v18.2d, v17.2d, v29.2d, #0x31 + xar v17.2d, v11.2d, v25.2d, #0x36 + xar v11.2d, v9.2d, v27.2d, #0x2c + xar v9.2d, v22.2d, v29.2d, #0x3 + xar v22.2d, v14.2d, v27.2d, #0x19 + xar v14.2d, v20.2d, v26.2d, #0x2e + xar v20.2d, v4.2d, v27.2d, #0x25 + xar v4.2d, v24.2d, v27.2d, 
#0x32 + xar v24.2d, v21.2d, v25.2d, #0x3e + xar v21.2d, v5.2d, v26.2d, #0x1c + xar v27.2d, v6.2d, v25.2d, #0x14 + ld1r { v31.2d }, [x1], #8 + bcax v5.16b, v10.16b, v7.16b, v11.16b + bcax v6.16b, v11.16b, v8.16b, v7.16b + bcax v7.16b, v7.16b, v9.16b, v8.16b + bcax v8.16b, v8.16b, v10.16b, v9.16b + bcax v9.16b, v9.16b, v11.16b, v10.16b + bcax v10.16b, v15.16b, v12.16b, v16.16b + bcax v11.16b, v16.16b, v13.16b, v12.16b + bcax v12.16b, v12.16b, v14.16b, v13.16b + bcax v13.16b, v13.16b, v15.16b, v14.16b + bcax v14.16b, v14.16b, v16.16b, v15.16b + bcax v15.16b, v20.16b, v17.16b, v21.16b + bcax v16.16b, v21.16b, v18.16b, v17.16b + bcax v17.16b, v17.16b, v19.16b, v18.16b + bcax v18.16b, v18.16b, v20.16b, v19.16b + bcax v19.16b, v19.16b, v21.16b, v20.16b + bcax v20.16b, v0.16b, v22.16b, v1.16b + bcax v21.16b, v1.16b, v23.16b, v22.16b + bcax v22.16b, v22.16b, v24.16b, v23.16b + bcax v23.16b, v23.16b, v0.16b, v24.16b + bcax v24.16b, v24.16b, v1.16b, v0.16b + bcax v0.16b, v30.16b, v2.16b, v27.16b + bcax v1.16b, v27.16b, v3.16b, v2.16b + bcax v2.16b, v2.16b, v4.16b, v3.16b + bcax v3.16b, v3.16b, v30.16b, v4.16b + bcax v4.16b, v4.16b, v27.16b, v30.16b + eor v0.16b, v0.16b, v31.16b + sub x2, x2, #0x1 + cbnz x2, Lkeccak_f1600_x1_v84a_loop + stp d0, d1, [x0] + stp d2, d3, [x0, #0x10] + stp d4, d5, [x0, #0x20] + stp d6, d7, [x0, #0x30] + stp d8, d9, [x0, #0x40] + stp d10, d11, [x0, #0x50] + stp d12, d13, [x0, #0x60] + stp d14, d15, [x0, #0x70] + stp d16, d17, [x0, #0x80] + stp d18, d19, [x0, #0x90] + stp d20, d21, [x0, #0xa0] + stp d22, d23, [x0, #0xb0] + str d24, [x0, #0xc0] + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret + +MLK_ASM_FN_SIZE(keccak_f1600_x1_v84a_aarch64_asm) + +#endif /* __ARM_FEATURE_SHA3 */ + +#endif /* MLK_FIPS202_AARCH64_NEED_X1_V84A && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +#if defined(__ELF__) +#endif + + END diff --git 
a/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.asm b/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.asm new file mode 100644 index 0000000000..5371eda67c --- /dev/null +++ b/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.asm @@ -0,0 +1,242 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. + */ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [HYBRID] + * Hybrid scalar/vector implementations of Keccak and SPHINCS+ on AArch64 + * Becker, Kannwischer + * https://eprint.iacr.org/2022/1243 + */ + +/*yaml + Name: keccak_f1600_x2_v84a_aarch64_asm + Description: AArch64 ARMv8.4-A implementation of Keccak-f[1600] permutation for two sequential states + Signature: void mlk_keccak_f1600_x2_v84a_aarch64_asm(uint64_t state[50], const uint64_t rc[24]) + ABI: + x0: + type: buffer + size_bytes: 400 + permissions: read/write + c_parameter: uint64_t state[50] + description: Two sequential Keccak states (state0[25], state1[25]) + x1: + type: buffer + size_bytes: 192 + permissions: read-only + c_parameter: const uint64_t rc[24] + description: Round constants (24 x uint64_t) + Stack: + bytes: 64 + description: register preservation +*/ + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// +// This implementation is essentially from the paper @[HYBRID]. +// The only difference is interleaving/deinterleaving of Keccak state +// during load and store, so that the caller need not do this. 
+//
+
+#include "../../../../common.h"
+#if defined(MLK_FIPS202_AARCH64_NEED_X2_V84A) && \
+    !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+#if defined(__ARM_FEATURE_SHA3)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/fips202/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+        AREA |.text|,CODE,READONLY,ALIGN=2
+        EXPORT MLK_ASM_NAMESPACE(keccak_f1600_x2_v84a_aarch64_asm)
+MLK_ASM_FN_SYMBOL(keccak_f1600_x2_v84a_aarch64_asm) // Keccak-f1600 on two 200-byte states at x0 and x0+200; x1 is read as 24 64-bit round constants
+
+        sub sp, sp, #0x40 // reserve 64 bytes of stack for d8-d15
+        stp d8, d9, [sp] // save d8-d15; they are reloaded right before ret
+        stp d10, d11, [sp, #0x10]
+        stp d12, d13, [sp, #0x20]
+        stp d14, d15, [sp, #0x30]
+        add x2, x0, #0xc8 // x2 = x0 + 200 bytes (25 * 8): start of the second state
+        ldp q25, q26, [x0], #0x20 // words 0-3 of the first state, x0 += 32
+        ld1 { v27.2d, v28.2d }, [x2], #32 // words 0-3 of the second state, x2 += 32
+        trn1 v0.2d, v25.2d, v27.2d // vN = { word N of first state, word N of second state }
+        trn2 v1.2d, v25.2d, v27.2d
+        trn1 v2.2d, v26.2d, v28.2d
+        trn2 v3.2d, v26.2d, v28.2d
+        ldp q25, q26, [x0], #0x20 // words 4-7
+        ld1 { v27.2d, v28.2d }, [x2], #32
+        trn1 v4.2d, v25.2d, v27.2d
+        trn2 v5.2d, v25.2d, v27.2d
+        trn1 v6.2d, v26.2d, v28.2d
+        trn2 v7.2d, v26.2d, v28.2d
+        ldp q25, q26, [x0], #0x20 // words 8-11
+        ld1 { v27.2d, v28.2d }, [x2], #32
+        trn1 v8.2d, v25.2d, v27.2d
+        trn2 v9.2d, v25.2d, v27.2d
+        trn1 v10.2d, v26.2d, v28.2d
+        trn2 v11.2d, v26.2d, v28.2d
+        ldp q25, q26, [x0], #0x20 // words 12-15
+        ld1 { v27.2d, v28.2d }, [x2], #32
+        trn1 v12.2d, v25.2d, v27.2d
+        trn2 v13.2d, v25.2d, v27.2d
+        trn1 v14.2d, v26.2d, v28.2d
+        trn2 v15.2d, v26.2d, v28.2d
+        ldp q25, q26, [x0], #0x20 // words 16-19
+        ld1 { v27.2d, v28.2d }, [x2], #32
+        trn1 v16.2d, v25.2d, v27.2d
+        trn2 v17.2d, v25.2d, v27.2d
+        trn1 v18.2d, v26.2d, v28.2d
+        trn2 v19.2d, v26.2d, v28.2d
+        ldp q25, q26, [x0], #0x20 // words 20-23
+        ld1 { v27.2d, v28.2d }, [x2], #32
+        trn1 v20.2d, v25.2d, v27.2d
+        trn2 v21.2d, v25.2d, v27.2d
+        trn1 v22.2d, v26.2d, v28.2d
+        trn2 v23.2d, v26.2d, v28.2d
+        ldr d25, [x0] // word 24 of each state (x0 is now base + 0xc0)
+        ldr d27, [x2]
+        trn1 v24.2d, v25.2d, v27.2d
+        mov x2, #0x18 // =24; x2 is reused as the round counter
+
+Lkeccak_f1600_x2_v84a_loop // one Keccak-f1600 round per iteration, both states in parallel
+        eor3 v30.16b, v0.16b, v5.16b, v10.16b // theta: v30 = v0 ^ v5 ^ v10 (column parity, first half)
+        eor3 v29.16b, v1.16b, v6.16b, v11.16b
+        eor3 v28.16b, v2.16b, v7.16b, v12.16b
+        eor3 v27.16b, v3.16b, v8.16b, v13.16b
+        eor3 v26.16b, v4.16b, v9.16b, v14.16b
+        eor3 v30.16b, v30.16b, v15.16b, v20.16b // fold in the remaining two words of each column
+        eor3 v29.16b, v29.16b, v16.16b, v21.16b
+        eor3 v28.16b, v28.16b, v17.16b, v22.16b
+        eor3 v27.16b, v27.16b, v18.16b, v23.16b
+        eor3 v26.16b, v26.16b, v19.16b, v24.16b
+        rax1 v25.2d, v30.2d, v28.2d // v25 = v30 ^ rol64(v28, 1); the five rax1 results are the per-column XOR masks
+        rax1 v28.2d, v28.2d, v26.2d
+        rax1 v26.2d, v26.2d, v29.2d
+        rax1 v29.2d, v29.2d, v27.2d
+        rax1 v27.2d, v27.2d, v30.2d
+        eor v30.16b, v0.16b, v26.16b // word 0 only gets its mask XORed in (no rotation); kept in v30
+        xar v0.2d, v2.2d, v29.2d, #0x2 // rho/pi: dst = ror64(src ^ mask, imm), written to the permuted register
+        xar v2.2d, v12.2d, v29.2d, #0x15
+        xar v12.2d, v13.2d, v28.2d, #0x27
+        xar v13.2d, v19.2d, v27.2d, #0x38
+        xar v19.2d, v23.2d, v28.2d, #0x8
+        xar v23.2d, v15.2d, v26.2d, #0x17
+        xar v15.2d, v1.2d, v25.2d, #0x3f
+        xar v1.2d, v8.2d, v28.2d, #0x9
+        xar v8.2d, v16.2d, v25.2d, #0x13
+        xar v16.2d, v7.2d, v29.2d, #0x3a
+        xar v7.2d, v10.2d, v26.2d, #0x3d
+        xar v10.2d, v3.2d, v28.2d, #0x24
+        xar v3.2d, v18.2d, v28.2d, #0x2b
+        xar v18.2d, v17.2d, v29.2d, #0x31
+        xar v17.2d, v11.2d, v25.2d, #0x36
+        xar v11.2d, v9.2d, v27.2d, #0x2c
+        xar v9.2d, v22.2d, v29.2d, #0x3
+        xar v22.2d, v14.2d, v27.2d, #0x19
+        xar v14.2d, v20.2d, v26.2d, #0x2e
+        xar v20.2d, v4.2d, v27.2d, #0x25
+        xar v4.2d, v24.2d, v27.2d, #0x32
+        xar v24.2d, v21.2d, v25.2d, #0x3e
+        xar v21.2d, v5.2d, v26.2d, #0x1c
+        xar v27.2d, v6.2d, v25.2d, #0x14 // result parked in v27 (the mask in v27 is no longer needed)
+        ld1r { v31.2d }, [x1], #8 // load the next round constant into both lanes, x1 += 8
+        bcax v5.16b, v10.16b, v7.16b, v11.16b // chi: dst = a ^ (b & ~c)
+        bcax v6.16b, v11.16b, v8.16b, v7.16b
+        bcax v7.16b, v7.16b, v9.16b, v8.16b
+        bcax v8.16b, v8.16b, v10.16b, v9.16b
+        bcax v9.16b, v9.16b, v11.16b, v10.16b
+        bcax v10.16b, v15.16b, v12.16b, v16.16b
+        bcax v11.16b, v16.16b, v13.16b, v12.16b
+        bcax v12.16b, v12.16b, v14.16b, v13.16b
+        bcax v13.16b, v13.16b, v15.16b, v14.16b
+        bcax v14.16b, v14.16b, v16.16b, v15.16b
+        bcax v15.16b, v20.16b, v17.16b, v21.16b
+        bcax v16.16b, v21.16b, v18.16b, v17.16b
+        bcax v17.16b, v17.16b, v19.16b, v18.16b
+        bcax v18.16b, v18.16b, v20.16b, v19.16b
+        bcax v19.16b, v19.16b, v21.16b, v20.16b
+        bcax v20.16b, v0.16b, v22.16b, v1.16b
+        bcax v21.16b, v1.16b, v23.16b, v22.16b
+        bcax v22.16b, v22.16b, v24.16b, v23.16b
+        bcax v23.16b, v23.16b, v0.16b, v24.16b
+        bcax v24.16b, v24.16b, v1.16b, v0.16b
+        bcax v0.16b, v30.16b, v2.16b, v27.16b
+        bcax v1.16b, v27.16b, v3.16b, v2.16b
+        bcax v2.16b, v2.16b, v4.16b, v3.16b
+        bcax v3.16b, v3.16b, v30.16b, v4.16b
+        bcax v4.16b, v4.16b, v27.16b, v30.16b
+        eor v0.16b, v0.16b, v31.16b // iota: XOR the round constant into word 0 of both states
+        sub x2, x2, #0x1
+        cbnz x2, Lkeccak_f1600_x2_v84a_loop // repeat until the counter reaches zero (24 rounds)
+        sub x0, x0, #0xc0 // undo the six 0x20 post-increments: x0 = state base again
+        add x2, x0, #0xc8 // x2 = start of the second state
+        trn1 v25.2d, v0.2d, v1.2d // lane 0 of consecutive words: goes back to the first state
+        trn1 v26.2d, v2.2d, v3.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v0.2d, v1.2d // lane 1 of consecutive words: goes back to the second state
+        trn2 v28.2d, v2.2d, v3.2d
+        st1 { v27.2d, v28.2d }, [x2], #32
+        trn1 v25.2d, v4.2d, v5.2d
+        trn1 v26.2d, v6.2d, v7.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v4.2d, v5.2d
+        trn2 v28.2d, v6.2d, v7.2d
+        st1 { v27.2d, v28.2d }, [x2], #32
+        trn1 v25.2d, v8.2d, v9.2d
+        trn1 v26.2d, v10.2d, v11.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v8.2d, v9.2d
+        trn2 v28.2d, v10.2d, v11.2d
+        st1 { v27.2d, v28.2d }, [x2], #32
+        trn1 v25.2d, v12.2d, v13.2d
+        trn1 v26.2d, v14.2d, v15.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v12.2d, v13.2d
+        trn2 v28.2d, v14.2d, v15.2d
+        st1 { v27.2d, v28.2d }, [x2], #32
+        trn1 v25.2d, v16.2d, v17.2d
+        trn1 v26.2d, v18.2d, v19.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v16.2d, v17.2d
+        trn2 v28.2d, v18.2d, v19.2d
+        st1 { v27.2d, v28.2d }, [x2], #32
+        trn1 v25.2d, v20.2d, v21.2d
+        trn1 v26.2d, v22.2d, v23.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v20.2d, v21.2d
+        trn2 v28.2d, v22.2d, v23.2d
+        st1 { v27.2d, v28.2d }, [x2], #32
+        str d24, [x0] // word 24 of the first state (low lane of v24)
+        trn2 v25.2d, v24.2d, v24.2d // move the high lane of v24 into the low lane
+        str d25, [x2] // word 24 of the second state
+        ldp d8, d9, [sp] // restore d8-d15 and release the stack frame
+        ldp d10, d11, [sp, #0x10]
+        ldp d12, d13, [sp, #0x20]
+        ldp d14, d15, [sp, #0x30]
+        add sp, sp, #0x40
+        ret
+
+MLK_ASM_FN_SIZE(keccak_f1600_x2_v84a_aarch64_asm)
+
+#endif /* __ARM_FEATURE_SHA3 */
+
+#endif /* MLK_FIPS202_AARCH64_NEED_X2_V84A && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
+        */
+
+#if defined(__ELF__)
+#endif
+
+        END
diff --git
a/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.asm b/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.asm
new file mode 100644
index 0000000000..4feddf7cc2
--- /dev/null
+++ b/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.asm
@@ -0,0 +1,1036 @@
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * mlkem/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly.
+ */
+/*
+ * Copyright (c) The mlkem-native project authors
+ * Copyright (c) 2021-2022 Arm Limited
+ * Copyright (c) 2022 Matthias Kannwischer
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+// Author: Hanno Becker
+// Author: Matthias Kannwischer
+
+/*yaml
+   Name: keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm
+   Description: AArch64 hybrid scalar/vector implementation of Keccak-f[1600] permutation for four sequential states
+   Signature: void mlk_keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm(uint64_t state[100], const uint64_t rc[24])
+   ABI:
+     x0:
+       type: buffer
+       size_bytes: 800
+       permissions: read/write
+       c_parameter: uint64_t state[100]
+       description: Four sequential Keccak states (state0[25], state1[25], state2[25], state3[25])
+     x1:
+       type: buffer
+       size_bytes: 192
+       permissions: read-only
+       c_parameter: const uint64_t rc[24]
+       description: Round constants (24 x uint64_t)
+   Stack:
+     bytes: 224
+     description: register preservation and temporary storage
+*/
+
+#include "../../../../common.h"
+#if defined(MLK_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID) && \
+    !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/fips202/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+
+        AREA |.text|,CODE,READONLY,ALIGN=2
+        EXPORT MLK_ASM_NAMESPACE(keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm)
+MLK_ASM_FN_SYMBOL(keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm) // states 0 and 1 are processed in v0-v24 (one per 64-bit lane), states 2 and 3 one after the other in general-purpose registers
+
+        sub sp, sp, #0xe0 // reserve 0xe0 (224) bytes: scratch slots at 0x00-0x20 and 0xd0, saved registers at 0x30-0xc8
+        stp x19, x20, [sp, #0x30] // save x19-x30 and d8-d15; all are reloaded before ret
+        stp x21, x22, [sp, #0x40]
+        stp x23, x24, [sp, #0x50]
+        stp x25, x26, [sp, #0x60]
+        stp x27, x28, [sp, #0x70]
+        stp x29, x30, [sp, #0x80]
+        stp d8, d9, [sp, #0x90]
+        stp d10, d11, [sp, #0xa0]
+        stp d12, d13, [sp, #0xb0]
+        stp d14, d15, [sp, #0xc0]
+        mov x29, x1 // x29 = rc pointer
+        mov x30, #0x0 // =0
+        str x30, [sp, #0x20] // [sp+0x20] = 0: pass flag, tested after the round loop (0 = state 2 pass, 1 = state 3 pass)
+        str x29, [sp, #0x8] // [sp+0x8] = rc base for the indexed scalar round-constant loads
+        str x29, [sp, #0x10] // [sp+0x10] = rc cursor for the vector ld1r loads (post-incremented, never reset)
+        str x0, [sp] // [sp] = state base pointer (x0 is reused as a temporary in the rounds)
+        add x4, x0, #0xc8 // x4 = x0 + 200 bytes (25 * 8): start of state 1
+        ldp q25, q26, [x0], #0x20 // words 0-3 of state 0, x0 += 32
+        ld1 { v27.2d, v28.2d }, [x4], #32 // words 0-3 of state 1, x4 += 32
+        trn1 v0.2d, v25.2d, v27.2d // vN = { word N of state 0, word N of state 1 }
+        trn2 v1.2d, v25.2d, v27.2d
+        trn1 v2.2d, v26.2d, v28.2d
+        trn2 v3.2d, v26.2d, v28.2d
+        ldp q25, q26, [x0], #0x20
+        ld1 { v27.2d, v28.2d }, [x4], #32
+        trn1 v4.2d, v25.2d, v27.2d
+        trn2 v5.2d, v25.2d, v27.2d
+        trn1 v6.2d, v26.2d, v28.2d
+        trn2 v7.2d, v26.2d, v28.2d
+        ldp q25, q26, [x0], #0x20
+        ld1 { v27.2d, v28.2d }, [x4], #32
+        trn1 v8.2d, v25.2d, v27.2d
+        trn2 v9.2d, v25.2d, v27.2d
+        trn1 v10.2d, v26.2d, v28.2d
+        trn2 v11.2d, v26.2d, v28.2d
+        ldp q25, q26, [x0], #0x20
+        ld1 { v27.2d, v28.2d }, [x4], #32
+        trn1 v12.2d, v25.2d, v27.2d
+        trn2 v13.2d, v25.2d, v27.2d
+        trn1 v14.2d, v26.2d, v28.2d
+        trn2 v15.2d, v26.2d, v28.2d
+        ldp q25, q26, [x0], #0x20
+        ld1 { v27.2d, v28.2d }, [x4], #32
+        trn1 v16.2d, v25.2d, v27.2d
+        trn2 v17.2d, v25.2d, v27.2d
+        trn1 v18.2d, v26.2d, v28.2d
+        trn2 v19.2d, v26.2d, v28.2d
+        ldp q25, q26, [x0], #0x20
+        ld1 { v27.2d, v28.2d }, [x4], #32
+        trn1 v20.2d, v25.2d, v27.2d
+        trn2 v21.2d, v25.2d, v27.2d
+        trn1 v22.2d, v26.2d, v28.2d
+        trn2 v23.2d, v26.2d, v28.2d
+        ldr d25, [x0] // word 24 of state 0 and state 1
+        ldr d27, [x4]
+        trn1 v24.2d, v25.2d, v27.2d
+        sub x0, x0, #0xc0 // undo the six 0x20 post-increments: x0 = state base
+        add x0, x0, #0x190 // x0 = base + 400 bytes (2 * 200): start of state 2
+        ldp x1, x6, [x0] // load the 25 words of state 2 into general-purpose registers
+        ldp x11, x16, [x0, #0x10]
+        ldp x21, x2, [x0, #0x20]
+        ldp x7, x12, [x0, #0x30]
+        ldp x17, x22, [x0, #0x40]
+        ldp x3, x8, [x0, #0x50]
+        ldp x13, x28, [x0, #0x60]
+        ldp x23, x4, [x0, #0x70]
+        ldp x9, x14, [x0, #0x80]
+        ldp x19, x24, [x0, #0x90]
+        ldp x5, x10, [x0, #0xa0]
+        ldp x15, x20, [x0, #0xb0]
+        ldr x25, [x0, #0xc0]
+        sub x0, x0, #0x190
+
+Lkeccak_f1600_x4_v8a_scalar_hybrid_initial // entered once per scalar state: by fall-through for state 2 and by the branch below for state 3
+        eor x30, x24, x25
+        eor x27, x9, x10
+        eor v30.16b, v0.16b, v5.16b
+        eor v30.16b, v30.16b, v10.16b
+        eor x0, x30, x21
+        eor v30.16b, v30.16b, v15.16b
+        eor x26, x27, x6
+        eor x27, x26, x7
+        eor v30.16b, v30.16b, v20.16b
+        eor x29, x0, x22
+        eor v29.16b, v1.16b, v6.16b
+        eor x26, x29, x23
+        eor v29.16b, v29.16b, v11.16b
+        eor x29, x4, x5
+        eor x30, x29, x1
+        eor v29.16b, v29.16b, v16.16b
+        eor x0, x27, x8
+        eor v29.16b, v29.16b, v21.16b
+        eor x29, x30, x2
+        eor v28.16b, v2.16b, v7.16b
+        eor x30, x19, x20
+        eor x30, x30, x16
+        eor v28.16b, v28.16b, v12.16b
+        eor x27, x26, x0, ror #63
+        eor v28.16b, v28.16b, v17.16b
+        eor x4, x4, x27
+        eor v28.16b, v28.16b, v22.16b
+        eor x30, x30, x17
+        eor x30, x30, x28
+        eor v27.16b, v3.16b, v8.16b
+        eor x29, x29, x3
+        eor v27.16b, v27.16b, v13.16b
+        eor x0, x0, x30, ror #63
+        eor v27.16b, v27.16b, v18.16b
+        eor x30, x30, x29, ror #63
+        eor x22, x22, x30
+        eor v27.16b, v27.16b, v23.16b
+        eor x23, x23, x30
+        eor v26.16b, v4.16b, v9.16b
+        str x23, [sp, #0xd0] // spill to the scratch slot at [sp+0xd0]; reloaded into x27 below
+        eor v26.16b, v26.16b, v14.16b
+        eor x23, x14, x15
+        eor x14, x14, x0
+        eor v26.16b, v26.16b, v19.16b
+        eor x23, x23, x11
+        eor v26.16b, v26.16b, v24.16b
+        eor x15, x15, x0
+        eor x1, x1, x27
+        add v31.2d, v28.2d, v28.2d // add + sri #63 builds rol64(v28, 1) in v31
+        eor x23, x23, x12
+        sri v31.2d, v28.2d, #0x3f
+        eor x23, x23, x13
+        eor v25.16b, v31.16b, v30.16b
+        eor x11, x11, x0
+        eor x29, x29, x23, ror #63
+        add v31.2d, v26.2d, v26.2d
+        eor x23, x23, x26, ror #63
+        sri v31.2d, v26.2d, #0x3f
+        eor x26, x13, x0
+        eor v28.16b, v31.16b, v28.16b
+        eor x13, x28, x23
+        eor x28, x24, x30
+        add v31.2d, v29.2d, v29.2d
+        eor x24, x16, x23
+        sri v31.2d, v29.2d, #0x3f
+        eor x16, x21, x30
+        eor v26.16b, v31.16b, v26.16b
+        eor x21, x25, x30
+        eor x30, x19, x23
+        add v31.2d, v27.2d, v27.2d
+        eor x19, x20, x23
+        sri v31.2d, v27.2d, #0x3f
+        eor x20, x17, x23
+        eor v29.16b, v31.16b, v29.16b
+        eor x17, x12, x0
+        eor x0, x2, x27
+        add v31.2d, v30.2d, v30.2d
+        eor x2, x6, x29
+        sri v31.2d, v30.2d, #0x3f
+        eor x6, x8, x29
+        eor v27.16b, v31.16b, v27.16b
+        bic x8, x28, x13, ror #47
+        eor x12, x3, x27
+        eor v30.16b, v0.16b, v26.16b
+        bic x3, x13, x17, ror #19
+        eor v31.16b, v2.16b, v29.16b
+        eor x5, x5, x27
+        ldr x27, [sp, #0xd0] // reload the value spilled above
+        shl v0.2d, v31.2d, #0x3e // shl #n + sri #(64-n) builds rol64(v31, n) in the destination
+        bic x25, x17, x2, ror #5
+        sri v0.2d, v31.2d, #0x2
+        eor x9, x9, x29
+        eor v31.16b, v12.16b, v29.16b
+        eor x23, x25, x5, ror #52
+        eor x3, x3, x2, ror #24
+        shl v2.2d, v31.2d, #0x2b
+        eor x8, x8, x17, ror #2
+        sri v2.2d, v31.2d, #0x15
+        eor x17, x10, x29
+        eor v31.16b, v13.16b, v28.16b
+        bic x25, x12, x22, ror #47
+        eor x29, x7, x29
+        shl v12.2d, v31.2d, #0x19
+        bic x10, x4, x27, ror #2
+        sri v12.2d, v31.2d, #0x27
+        bic x7, x5, x28, ror #10
+        eor v31.16b, v19.16b, v27.16b
+        eor x10, x10, x20, ror #50
+        eor x13, x7, x13, ror #57
+        shl v13.2d, v31.2d, #0x8
+        bic x7, x2, x5, ror #47
+        sri v13.2d, v31.2d, #0x38
+        eor x2, x25, x24, ror #39
+        eor v31.16b, v23.16b, v28.16b
+        bic x25, x20, x11, ror #57
+        bic x5, x17, x4, ror #25
+        shl v19.2d, v31.2d, #0x38
+        eor x25, x25, x17, ror #53
+        sri v19.2d, v31.2d, #0x8
+        bic x17, x11, x17, ror #60
+        eor v31.16b, v15.16b, v26.16b
+        eor x28, x7, x28, ror #57
+        bic x7, x9, x12, ror #42
+        shl v23.2d, v31.2d, #0x29
+        eor x7, x7, x22, ror #25
+        sri v23.2d, v31.2d, #0x17
+        bic x22, x22, x24, ror #56
+        bic x24, x24, x15, ror #31
+        eor v31.16b, v1.16b, v25.16b
+        eor x22, x22, x15, ror #23
+        shl v15.2d, v31.2d, #0x1
+        bic x20, x27, x20, ror #48
+        sri v15.2d, v31.2d, #0x3f
+        bic x15, x15, x9, ror #16
+        eor x12, x15, x12, ror #58
+        eor v31.16b, v8.16b, v28.16b
+        eor x15, x5, x27, ror #27
+        shl v1.2d, v31.2d, #0x37
+        eor x5, x20, x11, ror #41
+        sri v1.2d, v31.2d, #0x9
+        ldr x11, [sp, #0x8] // x11 = rc base
+        eor x20, x17, x4, ror #21
+        eor v31.16b, v16.16b, v25.16b
+        eor x17, x24, x9, ror #47
+        shl v8.2d, v31.2d, #0x2d
+        mov x24, #0x1 // =1
+        sri v8.2d, v31.2d, #0x13
+        bic x9, x0, x16, ror #9
+        str x24, [sp, #0x18] // [sp+0x18] = 1: index of the next scalar round constant
+        eor v31.16b, v7.16b, v29.16b
+        bic x24, x29, x1, ror #44
+        shl v16.2d, v31.2d, #0x6
+        bic x27, x1, x21, ror #50
+        sri v16.2d, v31.2d, #0x3a
+        bic x4, x26, x29, ror #63
+        eor x1, x1, x4, ror #21
+        eor v31.16b, v10.16b, v26.16b
+        ldr x11, [x11] // x11 = rc[0]
+        shl v7.2d, v31.2d, #0x3
+        bic x4, x21, x30, ror #57
+        sri v7.2d, v31.2d, #0x3d
+        eor x21, x24, x21, ror #30
+        eor x24, x9, x19, ror #44
+        eor v31.16b, v3.16b, v28.16b
+        bic x9, x14, x6, ror #5
+        shl v10.2d, v31.2d, #0x1c
+        eor x9, x9, x0, ror #43
+        sri v10.2d, v31.2d, #0x24
+        bic x0, x6, x0, ror #38
+        eor x1, x1, x11 // XOR rc[0] into x1 (first scalar round of this pass)
+        eor v31.16b, v18.16b, v28.16b
+        eor x11, x4, x26, ror #35
+        shl v3.2d, v31.2d, #0x15
+        eor x4, x0, x16, ror #47
+        bic x0, x16, x19, ror #35
+        sri v3.2d, v31.2d, #0x2b
+        eor x16, x27, x30, ror #43
+        eor v31.16b, v17.16b, v29.16b
+        bic x27, x30, x26, ror #42
+        shl v18.2d, v31.2d, #0xf
+        bic x26, x19, x14, ror #41
+        eor x19, x0, x14, ror #12
+        sri v18.2d, v31.2d, #0x31
+        eor x14, x26, x6, ror #46
+        eor v31.16b, v11.16b, v25.16b
+        eor x6, x27, x29, ror #41
+        shl v17.2d, v31.2d, #0xa
+        eor x0, x15, x11, ror #52 // scalar code from here matches the opening of the loop body below (second scalar round of this pass)
+        eor x0, x0, x13, ror #48
+        sri v17.2d, v31.2d, #0x36
+        eor x26, x8, x9, ror #57
+        eor v31.16b, v9.16b, v27.16b
+        eor x27, x0, x14, ror #10
+        shl v11.2d, v31.2d, #0x14
+        eor x29, x16, x28, ror #63
+        eor x26, x26, x6, ror #51
+        sri v11.2d, v31.2d, #0x2c
+        eor x30, x23, x22, ror #50
+        eor v31.16b, v22.16b, v29.16b
+        eor x0, x26, x10, ror #31
+        shl v9.2d, v31.2d, #0x3d
+        eor x29, x29, x19, ror #37
+        eor x27, x27, x12, ror #5
+        sri v9.2d, v31.2d, #0x3
+        eor x30, x30, x24, ror #34
+        eor v31.16b, v14.16b, v27.16b
+        eor x0, x0, x7, ror #27
+        shl v22.2d, v31.2d, #0x27
+        eor x26, x30, x21, ror #26
+        eor x26, x26, x25, ror #15
+        sri v22.2d, v31.2d, #0x19
+        ror x30, x27, #0x3e
+        eor v31.16b, v20.16b, v26.16b
+        eor x30, x30, x26, ror #57
+        ror x26, x26, #0x3a
+        shl v14.2d, v31.2d, #0x12
+        eor x16, x30, x16
+        sri v14.2d, v31.2d, #0x2e
+        eor x28, x30, x28, ror #63
+        eor v31.16b, v4.16b, v27.16b
+        str x28, [sp, #0xd0] // spill to scratch; reloaded into x9 below
+        eor x29, x29, x17, ror #36
+        shl v20.2d, v31.2d, #0x1b
+        eor x28, x1, x2, ror #61
+        sri v20.2d, v31.2d, #0x25
+        eor x19, x30, x19, ror #37
+        eor v31.16b, v24.16b, v27.16b
+        eor x29, x29, x20, ror #2
+        eor x28, x28, x4, ror #54
+        shl v4.2d, v31.2d, #0xe
+        eor x26, x26, x0, ror #55
+        sri v4.2d, v31.2d, #0x32
+        eor x28, x28, x3, ror #39
+        eor v31.16b, v21.16b, v25.16b
+        eor x28, x28, x5, ror #25
+        ror x0, x0, #0x38
+        shl v24.2d, v31.2d, #0x2
+        eor x0, x0, x29, ror #63
+        sri v24.2d, v31.2d, #0x3e
+        eor x27, x28, x27, ror #61
+        eor v31.16b, v5.16b, v26.16b
+        eor x13, x0, x13, ror #46
+        eor x28, x29, x28, ror #63
+        shl v21.2d, v31.2d, #0x24
+        eor x29, x30, x20, ror #2
+        sri v21.2d, v31.2d, #0x1c
+        eor x20, x26, x3, ror #39
+        eor v31.16b, v6.16b, v25.16b
+        eor x11, x0, x11, ror #50
+        eor x25, x28, x25, ror #9
+        shl v27.2d, v31.2d, #0x2c
+        eor x3, x28, x21, ror #20
+        sri v27.2d, v31.2d, #0x14
+        eor x21, x26, x1
+        eor x9, x27, x9, ror #49
+        bic v31.16b, v7.16b, v11.16b // bic + eor pairs: dst = a ^ (b & ~c)
+        eor x24, x28, x24, ror #28
+        eor v5.16b, v31.16b, v10.16b
+        eor x1, x30, x17, ror #36
+        bic v31.16b, v8.16b, v7.16b
+        eor x14, x0, x14, ror #8
+        eor x22, x28, x22, ror #44
+        eor v6.16b, v31.16b, v11.16b
+        eor x8, x27, x8, ror #56
+        bic v31.16b, v9.16b, v8.16b
+        eor x17, x27, x7, ror #19
+        eor v7.16b, v31.16b, v7.16b
+        eor x15, x0, x15, ror #62
+        bic x7, x20, x22, ror #47
+        bic v31.16b, v10.16b, v9.16b
+        eor x4, x26, x4, ror #54
+        eor v8.16b, v31.16b, v8.16b
+        eor x0, x0, x12, ror #3
+        bic v31.16b, v11.16b, v10.16b
+        eor x28, x28, x23, ror #58
+        eor x23, x26, x2, ror #61
+        eor v9.16b, v31.16b, v9.16b
+        eor x26, x26, x5, ror #25
+        bic v31.16b, v12.16b, v16.16b
+        eor x2, x7, x16, ror #39
+        eor v10.16b, v31.16b, v15.16b
+        bic x7, x9, x20, ror #42
+        bic x30, x15, x9, ror #16
+        bic v31.16b, v13.16b, v12.16b
+        eor x7, x7, x22, ror #25
+        eor v11.16b, v31.16b, v16.16b
+        eor x12, x30, x20, ror #58
+        bic v31.16b, v14.16b, v13.16b
+        bic x20, x22, x16, ror #56
+        eor x30, x27, x6, ror #43
+        eor v12.16b, v31.16b, v12.16b
+        eor x22, x20, x15, ror #23
+        bic v31.16b, v15.16b, v14.16b
+        bic x6, x19, x13, ror #42
+        eor v13.16b, v31.16b, v13.16b
+        eor x6, x6, x17, ror #41
+        bic x5, x13, x17, ror #63
+        bic v31.16b, v16.16b, v15.16b
+        eor x5, x21, x5, ror #21
+        eor v14.16b, v31.16b, v14.16b
+        bic x17, x17, x21, ror #44
+        eor x27, x27, x10, ror #23
+        bic v31.16b, v17.16b, v21.16b
+        bic x21, x21, x25, ror #50
+        eor v15.16b, v31.16b, v20.16b
+        bic x20, x27, x4, ror #25
+        bic v31.16b, v18.16b, v17.16b
+        bic x10, x16, x15, ror #31
+        eor x16, x21, x19, ror #43
+        eor v16.16b, v31.16b, v21.16b
+        eor x21, x17, x25, ror #30
+        bic v31.16b, v19.16b, v18.16b
+        bic x19, x25, x19, ror #57
+        eor v17.16b, v31.16b, v17.16b
+        ldr x25, [sp, #0x18] // x25 = scalar round-constant index
+        eor x17, x10, x9, ror #47
+        bic v31.16b, v20.16b, v19.16b
+        ldr x9, [sp, #0x8] // x9 = rc base
+        eor v18.16b, v31.16b, v18.16b
+        eor x15, x20, x28, ror #27
+        bic v31.16b, v21.16b, v20.16b
+        bic x20, x4, x28, ror #2
+        eor x10, x20, x1, ror #50
+        eor v19.16b, v31.16b, v19.16b
+        bic x20, x11, x27, ror #60
+        bic v31.16b, v22.16b, v1.16b
+        eor x20, x20, x4, ror #21
+        eor v20.16b, v31.16b, v0.16b
+        bic x4, x28, x1, ror #48
+        bic x1, x1, x11, ror #57
+        bic v31.16b, v23.16b, v22.16b
+        ldr x28, [x9, x25, lsl #3] // x28 = rc[index]
+        eor v21.16b, v31.16b, v1.16b
+        ldr x9, [sp, #0xd0] // reload the value spilled above
+        bic v31.16b, v24.16b, v23.16b
+        add x25, x25, #0x1 // index += 1
+        str x25, [sp, #0x18]
+        eor v22.16b, v31.16b, v22.16b
+        cmp x25, #0x17 // compare the index with 23; an identical cmp runs again inside the loop body before the b.le
+        bic v31.16b, v0.16b, v24.16b
+        eor x25, x1, x27, ror #53
+        bic x27, x30, x26, ror #47
+        eor v23.16b, v31.16b, v23.16b
+        eor x1, x5, x28 // XOR the scalar round constant into x1
+        bic v31.16b, v1.16b, v0.16b
+        eor x5, x4, x11, ror #41
+        eor v24.16b, v31.16b, v24.16b
+        eor x11, x19, x13, ror #35
+        bic x13, x26, x24, ror #10
+        bic v31.16b, v2.16b, v27.16b
+        eor x28, x27, x24, ror #57
+        eor v0.16b, v31.16b, v30.16b
+        bic x27, x24, x9, ror #47
+        bic v31.16b, v3.16b, v2.16b
+        bic x19, x23, x3, ror #9
+        bic x4, x29, x14, ror #41
+        eor v1.16b, v31.16b, v27.16b
+        eor x24, x19, x29, ror #44
+        bic v31.16b, v4.16b, v3.16b
+        bic x29, x3, x29, ror #35
+        eor v2.16b, v31.16b, v2.16b
+        eor x13, x13, x9, ror #57
+        eor x19, x29, x14, ror #12
+        bic v31.16b, v30.16b, v4.16b
+        bic x29, x9, x0, ror #19
+        eor v3.16b, v31.16b, v3.16b
+        bic x14, x14, x8, ror #5
+        bic v31.16b, v27.16b, v30.16b
+        eor x9, x14, x23, ror #43
+        eor x14, x4, x8, ror #46
+        eor v4.16b, v31.16b, v4.16b
+        bic x23, x8, x23, ror #38
+        eor x8, x27, x0, ror #2
+        eor x4, x23, x3, ror #47
+        bic x3, x0, x30, ror #5
+        eor x23, x3, x26, ror #52
+        eor x3, x29, x30, ror #24
+        ldr x30, [sp, #0x10] // x30 = vector rc cursor
+        ld1r { v28.2d }, [x30], #8 // load the next round constant into both lanes, cursor += 8
+        str x30, [sp, #0x10]
+        eor v0.16b, v0.16b, v28.16b // XOR the round constant into word 0 of states 0 and 1
+
+Lkeccak_f1600_x4_v8a_scalar_hybrid_loop // each iteration advances the scalar round-constant index twice and issues one vector ld1r
+        eor x0, x15, x11, ror #52
+        eor x0, x0, x13, ror #48
+        eor v30.16b, v0.16b, v5.16b
+        eor v30.16b, v30.16b, v10.16b
+        eor x26, x8, x9, ror #57
+        eor v30.16b, v30.16b, v15.16b
+        eor x27, x0, x14, ror #10
+        eor x29, x16, x28, ror #63
+        eor v30.16b, v30.16b, v20.16b
+        eor x26, x26, x6, ror #51
+        eor v29.16b, v1.16b, v6.16b
+        eor x30, x23, x22, ror #50
+        eor v29.16b, v29.16b, v11.16b
+        eor x0, x26, x10, ror #31
+        eor x29, x29, x19, ror #37
+        eor v29.16b, v29.16b, v16.16b
+        eor x27, x27, x12, ror #5
+        eor v29.16b, v29.16b, v21.16b
+        eor x30, x30, x24, ror #34
+        eor x0, x0, x7, ror #27
+        eor v28.16b, v2.16b, v7.16b
+        eor x26, x30, x21, ror #26
+        eor v28.16b, v28.16b, v12.16b
+        eor x26, x26, x25, ror #15
+        eor v28.16b, v28.16b, v17.16b
+        ror x30, x27, #0x3e
+        eor x30, x30, x26, ror #57
+        eor v28.16b, v28.16b, v22.16b
+        ror x26, x26, #0x3a
+        eor v27.16b, v3.16b, v8.16b
+        eor x16, x30, x16
+        eor v27.16b, v27.16b, v13.16b
+        eor x28, x30, x28, ror #63
+        str x28, [sp, #0xd0] // spill to scratch; reloaded into x9 below
+        eor v27.16b, v27.16b, v18.16b
+        eor x29, x29, x17, ror #36
+        eor v27.16b, v27.16b, v23.16b
+        eor x28, x1, x2, ror #61
+        eor x19, x30, x19, ror #37
+        eor v26.16b, v4.16b, v9.16b
+        eor x29, x29, x20, ror #2
+        eor v26.16b, v26.16b, v14.16b
+        eor x28, x28, x4, ror #54
+        eor v26.16b, v26.16b, v19.16b
+        eor x26, x26, x0, ror #55
+        eor x28, x28, x3, ror #39
+        eor v26.16b, v26.16b, v24.16b
+        eor x28, x28, x5, ror #25
+        add v31.2d, v28.2d, v28.2d
+        ror x0, x0, #0x38
+        eor x0, x0, x29, ror #63
+        sri v31.2d, v28.2d, #0x3f
+        eor x27, x28, x27, ror #61
+        eor v25.16b, v31.16b, v30.16b
+        eor x13, x0, x13, ror #46
+        add v31.2d, v26.2d, v26.2d
+        eor x28, x29, x28, ror #63
+        eor x29, x30, x20, ror #2
+        sri v31.2d, v26.2d, #0x3f
+        eor x20, x26, x3, ror #39
+        eor v28.16b, v31.16b, v28.16b
+        eor x11, x0, x11, ror #50
+        add v31.2d, v29.2d, v29.2d
+        eor x25, x28, x25, ror #9
+        eor x3, x28, x21, ror #20
+        sri v31.2d, v29.2d, #0x3f
+        eor x21, x26, x1
+        eor v26.16b, v31.16b, v26.16b
+        eor x9, x27, x9, ror #49
+        eor x24, x28, x24, ror #28
+        add v31.2d, v27.2d, v27.2d
+        eor x1, x30, x17, ror #36
+        sri v31.2d, v27.2d, #0x3f
+        eor x14, x0, x14, ror #8
+        eor v29.16b, v31.16b, v29.16b
+        eor x22, x28, x22, ror #44
+        eor x8, x27, x8, ror #56
+        add v31.2d, v30.2d, v30.2d
+        eor x17, x27, x7, ror #19
+        sri v31.2d, v30.2d, #0x3f
+        eor x15, x0, x15, ror #62
+        bic x7, x20, x22, ror #47
+        eor v27.16b, v31.16b, v27.16b
+        eor x4, x26, x4, ror #54
+        eor v30.16b, v0.16b, v26.16b
+        eor x0, x0, x12, ror #3
+        eor v31.16b, v2.16b, v29.16b
+        eor x28, x28, x23, ror #58
+        eor x23, x26, x2, ror #61
+        shl v0.2d, v31.2d, #0x3e
+        eor x26, x26, x5, ror #25
+        sri v0.2d, v31.2d, #0x2
+        eor x2, x7, x16, ror #39
+        eor v31.16b, v12.16b, v29.16b
+        bic x7, x9, x20, ror #42
+        bic x30, x15, x9, ror #16
+        shl v2.2d, v31.2d, #0x2b
+        eor x7, x7, x22, ror #25
+        sri v2.2d, v31.2d, #0x15
+        eor x12, x30, x20, ror #58
+        bic x20, x22, x16, ror #56
+        eor v31.16b, v13.16b, v28.16b
+        eor x30, x27, x6, ror #43
+        shl v12.2d, v31.2d, #0x19
+        eor x22, x20, x15, ror #23
+        sri v12.2d, v31.2d, #0x27
+        bic x6, x19, x13, ror #42
+        eor x6, x6, x17, ror #41
+        eor v31.16b, v19.16b, v27.16b
+        bic x5, x13, x17, ror #63
+        shl v13.2d, v31.2d, #0x8
+        eor x5, x21, x5, ror #21
+        sri v13.2d, v31.2d, #0x38
+        bic x17, x17, x21, ror #44
+        eor x27, x27, x10, ror #23
+        eor v31.16b, v23.16b, v28.16b
+        bic x21, x21, x25, ror #50
+        shl v19.2d, v31.2d, #0x38
+        bic x20, x27, x4, ror #25
+        bic x10, x16, x15, ror #31
+        sri v19.2d, v31.2d, #0x8
+        eor x16, x21, x19, ror #43
+        eor v31.16b, v15.16b, v26.16b
+        eor x21, x17, x25, ror #30
+        shl v23.2d, v31.2d, #0x29
+        bic x19, x25, x19, ror #57
+        ldr x25, [sp, #0x18] // x25 = scalar round-constant index
+        sri v23.2d, v31.2d, #0x17
+        eor x17, x10, x9, ror #47
+        eor v31.16b, v1.16b, v25.16b
+        ldr x9, [sp, #0x8] // x9 = rc base
+        eor x15, x20, x28, ror #27
+        shl v15.2d, v31.2d, #0x1
+        bic x20, x4, x28, ror #2
+        sri v15.2d, v31.2d, #0x3f
+        eor x10, x20, x1, ror #50
+        eor v31.16b, v8.16b, v28.16b
+        bic x20, x11, x27, ror #60
+        eor x20, x20, x4, ror #21
+        shl v1.2d, v31.2d, #0x37
+        bic x4, x28, x1, ror #48
+        sri v1.2d, v31.2d, #0x9
+        bic x1, x1, x11, ror #57
+        eor v31.16b, v16.16b, v25.16b
+        ldr x28, [x9, x25, lsl #3] // x28 = rc[index]
+        ldr x9, [sp, #0xd0] // reload the value spilled above
+        shl v8.2d, v31.2d, #0x2d
+        add x25, x25, #0x1 // index += 1 (first of two increments per iteration)
+        sri v8.2d, v31.2d, #0x13
+        str x25, [sp, #0x18]
+        cmp x25, #0x17 // flags are overwritten by the second cmp further down
+        eor v31.16b, v7.16b, v29.16b
+        eor x25, x1, x27, ror #53
+        shl v16.2d, v31.2d, #0x6
+        bic x27, x30, x26, ror #47
+        sri v16.2d, v31.2d, #0x3a
+        eor x1, x5, x28 // XOR the scalar round constant into x1
+        eor x5, x4, x11, ror #41
+        eor v31.16b, v10.16b, v26.16b
+        eor x11, x19, x13, ror #35
+        shl v7.2d, v31.2d, #0x3
+        bic x13, x26, x24, ror #10
+        eor x28, x27, x24, ror #57
+        sri v7.2d, v31.2d, #0x3d
+        bic x27, x24, x9, ror #47
+        eor v31.16b, v3.16b, v28.16b
+        bic x19, x23, x3, ror #9
+        shl v10.2d, v31.2d, #0x1c
+        bic x4, x29, x14, ror #41
+        eor x24, x19, x29, ror #44
+        sri v10.2d, v31.2d, #0x24
+        bic x29, x3, x29, ror #35
+        eor v31.16b, v18.16b, v28.16b
+        eor x13, x13, x9, ror #57
+        shl v3.2d, v31.2d, #0x15
+        eor x19, x29, x14, ror #12
+        bic x29, x9, x0, ror #19
+        sri v3.2d, v31.2d, #0x2b
+        bic x14, x14, x8, ror #5
+        eor v31.16b, v17.16b, v29.16b
+        eor x9, x14, x23, ror #43
+        eor x14, x4, x8, ror #46
+        shl v18.2d, v31.2d, #0xf
+        bic x23, x8, x23, ror #38
+        sri v18.2d, v31.2d, #0x31
+        eor x8, x27, x0, ror #2
+        eor v31.16b, v11.16b, v25.16b
+        eor x4, x23, x3, ror #47
+        bic x3, x0, x30, ror #5
+        shl v17.2d, v31.2d, #0xa
+        eor x23, x3, x26, ror #52
+        sri v17.2d, v31.2d, #0x36
+        eor x3, x29, x30, ror #24
+        eor x0, x15, x11, ror #52 // the scalar instruction sequence from the top of the loop starts over here (second scalar round of this iteration)
+        eor v31.16b, v9.16b, v27.16b
+        eor x0, x0, x13, ror #48
+        shl v11.2d, v31.2d, #0x14
+        eor x26, x8, x9, ror #57
+        sri v11.2d, v31.2d, #0x2c
+        eor x27, x0, x14, ror #10
+        eor x29, x16, x28, ror #63
+        eor v31.16b, v22.16b, v29.16b
+        eor x26, x26, x6, ror #51
+        shl v9.2d, v31.2d, #0x3d
+        eor x30, x23, x22, ror #50
+        sri v9.2d, v31.2d, #0x3
+        eor x0, x26, x10, ror #31
+        eor x29, x29, x19, ror #37
+        eor v31.16b, v14.16b, v27.16b
+        eor x27, x27, x12, ror #5
+        shl v22.2d, v31.2d, #0x27
+        eor x30, x30, x24, ror #34
+        eor x0, x0, x7, ror #27
+        sri v22.2d, v31.2d, #0x19
+        eor x26, x30, x21, ror #26
+        eor v31.16b, v20.16b, v26.16b
+        eor x26, x26, x25, ror #15
+        shl v14.2d, v31.2d, #0x12
+        ror x30, x27, #0x3e
+        eor x30, x30, x26, ror #57
+        sri v14.2d, v31.2d, #0x2e
+        ror x26, x26, #0x3a
+        eor v31.16b, v4.16b, v27.16b
+        eor x16, x30, x16
+        shl v20.2d, v31.2d, #0x1b
+        eor x28, x30, x28, ror #63
+        str x28, [sp, #0xd0] // spill to scratch; reloaded into x9 below
+        sri v20.2d, v31.2d, #0x25
+        eor x29, x29, x17, ror #36
+        eor v31.16b, v24.16b, v27.16b
+        eor x28, x1, x2, ror #61
+        eor x19, x30, x19, ror #37
+        shl v4.2d, v31.2d, #0xe
+        eor x29, x29, x20, ror #2
+        sri v4.2d, v31.2d, #0x32
+        eor x28, x28, x4, ror #54
+        eor v31.16b, v21.16b, v25.16b
+        eor x26, x26, x0, ror #55
+        eor x28, x28, x3, ror #39
+        shl v24.2d, v31.2d, #0x2
+        eor x28, x28, x5, ror #25
+        sri v24.2d, v31.2d, #0x3e
+        ror x0, x0, #0x38
+        eor x0, x0, x29, ror #63
+        eor v31.16b, v5.16b, v26.16b
+        eor x27, x28, x27, ror #61
+        shl v21.2d, v31.2d, #0x24
+        eor x13, x0, x13, ror #46
+        sri v21.2d, v31.2d, #0x1c
+        eor x28, x29, x28, ror #63
+        eor x29, x30, x20, ror #2
+        eor v31.16b, v6.16b, v25.16b
+        eor x20, x26, x3, ror #39
+        shl v27.2d, v31.2d, #0x2c
+        eor x11, x0, x11, ror #50
+        sri v27.2d, v31.2d, #0x14
+        eor x25, x28, x25, ror #9
+        eor x3, x28, x21, ror #20
+        bic v31.16b, v7.16b, v11.16b
+        eor x21, x26, x1
+        eor v5.16b, v31.16b, v10.16b
+        eor x9, x27, x9, ror #49
+        eor x24, x28, x24, ror #28
+        bic v31.16b, v8.16b, v7.16b
+        eor x1, x30, x17, ror #36
+        eor v6.16b, v31.16b, v11.16b
+        eor x14, x0, x14, ror #8
+        bic v31.16b, v9.16b, v8.16b
+        eor x22, x28, x22, ror #44
+        eor x8, x27, x8, ror #56
+        eor v7.16b, v31.16b, v7.16b
+        eor x17, x27, x7, ror #19
+        bic v31.16b, v10.16b, v9.16b
+        eor x15, x0, x15, ror #62
+        bic x7, x20, x22, ror #47
+        eor v8.16b, v31.16b, v8.16b
+        eor x4, x26, x4, ror #54
+        bic v31.16b, v11.16b, v10.16b
+        eor x0, x0, x12, ror #3
+        eor v9.16b, v31.16b, v9.16b
+        eor x28, x28, x23, ror #58
+        eor x23, x26, x2, ror #61
+        bic v31.16b, v12.16b, v16.16b
+        eor x26, x26, x5, ror #25
+        eor v10.16b, v31.16b, v15.16b
+        eor x2, x7, x16, ror #39
+        bic v31.16b, v13.16b, v12.16b
+        bic x7, x9, x20, ror #42
+        bic x30, x15, x9, ror #16
+        eor v11.16b, v31.16b, v16.16b
+        eor x7, x7, x22, ror #25
+        bic v31.16b, v14.16b, v13.16b
+        eor x12, x30, x20, ror #58
+        bic x20, x22, x16, ror #56
+        eor v12.16b, v31.16b, v12.16b
+        eor x30, x27, x6, ror #43
+        bic v31.16b, v15.16b, v14.16b
+        eor x22, x20, x15, ror #23
+        eor v13.16b, v31.16b, v13.16b
+        bic x6, x19, x13, ror #42
+        eor x6, x6, x17, ror #41
+        bic v31.16b, v16.16b, v15.16b
+        bic x5, x13, x17, ror #63
+        eor v14.16b, v31.16b, v14.16b
+        eor x5, x21, x5, ror #21
+        bic v31.16b, v17.16b, v21.16b
+        bic x17, x17, x21, ror #44
+        eor x27, x27, x10, ror #23
+        eor v15.16b, v31.16b, v20.16b
+        bic x21, x21, x25, ror #50
+        bic v31.16b, v18.16b, v17.16b
+        bic x20, x27, x4, ror #25
+        bic x10, x16, x15, ror #31
+        eor v16.16b, v31.16b, v21.16b
+        eor x16, x21, x19, ror #43
+        bic v31.16b, v19.16b, v18.16b
+        eor x21, x17, x25, ror #30
+        eor v17.16b, v31.16b, v17.16b
+        bic x19, x25, x19, ror #57
+        ldr x25, [sp, #0x18] // x25 = scalar round-constant index
+        bic v31.16b, v20.16b, v19.16b
+        eor x17, x10, x9, ror #47
+        eor v18.16b, v31.16b, v18.16b
+        ldr x9, [sp, #0x8] // x9 = rc base
+        eor x15, x20, x28, ror #27
+        bic v31.16b, v21.16b, v20.16b
+        bic x20, x4, x28, ror #2
+        eor v19.16b, v31.16b, v19.16b
+        eor x10, x20, x1, ror #50
+        bic v31.16b, v22.16b, v1.16b
+        bic x20, x11, x27, ror #60
+        eor x20, x20, x4, ror #21
+        eor v20.16b, v31.16b, v0.16b
+        bic x4, x28, x1, ror #48
+        bic v31.16b, v23.16b, v22.16b
+        bic x1, x1, x11, ror #57
+        eor v21.16b, v31.16b, v1.16b
+        ldr x28, [x9, x25, lsl #3] // x28 = rc[index]
+        ldr x9, [sp, #0xd0] // reload the value spilled above
+        bic v31.16b, v24.16b, v23.16b
+        add x25, x25, #0x1 // index += 1 (second increment of this iteration)
+        eor v22.16b, v31.16b, v22.16b
+        str x25, [sp, #0x18]
+        cmp x25, #0x17 // this comparison with 23 is the one the b.le at the loop end tests
+        bic v31.16b, v0.16b, v24.16b
+        eor x25, x1, x27, ror #53
+        eor v23.16b, v31.16b, v23.16b
+        bic x27, x30, x26, ror #47
+        bic v31.16b, v1.16b, v0.16b
+        eor x1, x5, x28 // XOR the scalar round constant into x1
+        eor x5, x4, x11, ror #41
+        eor v24.16b, v31.16b, v24.16b
+        eor x11, x19, x13, ror #35
+        bic v31.16b, v2.16b, v27.16b
+        bic x13, x26, x24, ror #10
+        eor x28, x27, x24, ror #57
+        eor v0.16b, v31.16b, v30.16b
+        bic x27, x24, x9, ror #47
+        bic v31.16b, v3.16b, v2.16b
+        bic x19, x23, x3, ror #9
+        eor v1.16b, v31.16b, v27.16b
+        bic x4, x29, x14, ror #41
+        eor x24, x19, x29, ror #44
+        bic v31.16b, v4.16b, v3.16b
+        bic x29, x3, x29, ror #35
+        eor v2.16b, v31.16b, v2.16b
+        eor x13, x13, x9, ror #57
+        bic v31.16b, v30.16b, v4.16b
+        eor x19, x29, x14, ror #12
+        bic x29, x9, x0, ror #19
+        eor v3.16b, v31.16b, v3.16b
+        bic x14, x14, x8, ror #5
+        bic v31.16b, v27.16b, v30.16b
+        eor x9, x14, x23, ror #43
+        eor x14, x4, x8, ror #46
+        eor v4.16b, v31.16b, v4.16b
+        bic x23, x8, x23, ror #38
+        eor x8, x27, x0, ror #2
+        eor x4, x23, x3, ror #47
+        bic x3, x0, x30, ror #5
+        eor x23, x3, x26, ror #52
+        eor x3, x29, x30, ror #24
+        ldr x30, [sp, #0x10] // x30 = vector rc cursor
+        ld1r { v28.2d }, [x30], #8 // load the next round constant into both lanes, cursor += 8
+        str x30, [sp, #0x10]
+        eor v0.16b, v0.16b, v28.16b // XOR the round constant into word 0 of states 0 and 1
+
+Lkeccak_f1600_x4_v8a_scalar_hybrid_loop_end
+        b.le Lkeccak_f1600_x4_v8a_scalar_hybrid_loop // repeat while the scalar index is <= 23 (flags from the last cmp x25, #0x17)
+        ror x2, x2, #0x3d // final per-register rotations before the scalar state is stored; presumably these undo rotations the rounds deferred via their ror-shifted operands - TODO confirm against the dev source
+        ror x3, x3, #0x27
+        ror x4, x4, #0x36
+        ror x5, x5, #0x19
+        ror x6, x6, #0x2b
+        ror x7, x7, #0x13
+        ror x8, x8, #0x38
+        ror x9, x9, #0x31
+        ror x10, x10, #0x17
+        ror x11, x11, #0x32
+        ror x12, x12, #0x3
+        ror x13, x13, #0x2e
+        ror x14, x14, #0x8
+        ror x15, x15, #0x3e
+        ror x17, x17, #0x24
+        ror x28, x28, #0x3f
+        ror x19, x19, #0x25
+        ror x20, x20, #0x2
+        ror x21, x21, #0x14
+        ror x22, x22, #0x2c
+        ror x23, x23, #0x3a
+        ror x24, x24, #0x1c
+        ror x25, x25, #0x9
+        ldr x30, [sp, #0x20] // pass flag
+        cmp x30, #0x1
+        b.eq Lkeccak_f1600_x4_v8a_scalar_hybrid_done // flag == 1: the state 3 pass just finished
+        mov x30, #0x1 // =1
+        str x30, [sp, #0x20] // mark that the next pass is the state 3 pass
+        ldr x0, [sp] // x0 = state base pointer
+        add x0, x0, #0x190 // base + 400: write the scalar registers back to state 2
+        stp x1, x6, [x0]
+        stp x11, x16, [x0, #0x10]
+        stp x21, x2, [x0, #0x20]
+        stp x7, x12, [x0, #0x30]
+        stp x17, x22, [x0, #0x40]
+        stp x3, x8, [x0, #0x50]
+        stp x13, x28, [x0, #0x60]
+        stp x23, x4, [x0, #0x70]
+        stp x9, x14, [x0, #0x80]
+        stp x19, x24, [x0, #0x90]
+        stp x5, x10, [x0, #0xa0]
+        stp x15, x20, [x0, #0xb0]
+        str x25, [x0, #0xc0]
+        sub x0, x0, #0x190
+        add x0, x0, #0x258 // base + 600 bytes (3 * 200): load state 3 into the scalar registers
+        ldp x1, x6, [x0]
+        ldp x11, x16, [x0, #0x10]
+        ldp x21, x2, [x0, #0x20]
+        ldp x7, x12, [x0, #0x30]
+        ldp x17, x22, [x0, #0x40]
+        ldp x3, x8, [x0, #0x50]
+        ldp x13, x28, [x0, #0x60]
+        ldp x23, x4, [x0, #0x70]
+        ldp x9, x14, [x0, #0x80]
+        ldp x19, x24, [x0, #0x90]
+        ldp x5, x10, [x0, #0xa0]
+        ldp x15, x20, [x0, #0xb0]
+        ldr x25, [x0, #0xc0]
+        sub x0, x0, #0x258
+        b Lkeccak_f1600_x4_v8a_scalar_hybrid_initial // second pass; the scalar index is reset there, the vector rc cursor is not
+
+Lkeccak_f1600_x4_v8a_scalar_hybrid_done
+        ldr x0, [sp] // x0 = state base pointer
+        add x0, x0, #0x258 // base + 600: write the scalar registers back to state 3
+        stp x1, x6, [x0]
+        stp x11, x16, [x0, #0x10]
+        stp x21, x2, [x0, #0x20]
+        stp x7, x12, [x0, #0x30]
+        stp x17, x22, [x0, #0x40]
+        stp x3, x8, [x0, #0x50]
+        stp x13, x28, [x0, #0x60]
+        stp x23, x4, [x0, #0x70]
+        stp x9, x14, [x0, #0x80]
+        stp x19, x24, [x0, #0x90]
+        stp x5, x10, [x0, #0xa0]
+        stp x15, x20, [x0, #0xb0]
+        str x25, [x0, #0xc0]
+        sub x0, x0, #0x258 // x0 = state base (state 0)
+        add x4, x0, #0xc8 // x4 = start of state 1
+        trn1 v25.2d, v0.2d, v1.2d // lane 0 of consecutive words: goes back to state 0
+        trn1 v26.2d, v2.2d, v3.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v0.2d, v1.2d // lane 1 of consecutive words: goes back to state 1
+        trn2 v28.2d, v2.2d, v3.2d
+        st1 { v27.2d, v28.2d }, [x4], #32
+        trn1 v25.2d, v4.2d, v5.2d
+        trn1 v26.2d, v6.2d, v7.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v4.2d, v5.2d
+        trn2 v28.2d, v6.2d, v7.2d
+        st1 { v27.2d, v28.2d }, [x4], #32
+        trn1 v25.2d, v8.2d, v9.2d
+        trn1 v26.2d, v10.2d, v11.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v8.2d, v9.2d
+        trn2 v28.2d, v10.2d, v11.2d
+        st1 { v27.2d, v28.2d }, [x4], #32
+        trn1 v25.2d, v12.2d, v13.2d
+        trn1 v26.2d, v14.2d, v15.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v12.2d, v13.2d
+        trn2 v28.2d, v14.2d, v15.2d
+        st1 { v27.2d, v28.2d }, [x4], #32
+        trn1 v25.2d, v16.2d, v17.2d
+        trn1 v26.2d, v18.2d, v19.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v16.2d, v17.2d
+        trn2 v28.2d, v18.2d, v19.2d
+        st1 { v27.2d, v28.2d }, [x4], #32
+        trn1 v25.2d, v20.2d, v21.2d
+        trn1 v26.2d, v22.2d, v23.2d
+        stp q25, q26, [x0], #0x20
+        trn2 v27.2d, v20.2d, v21.2d
+        trn2 v28.2d, v22.2d, v23.2d
+        st1 { v27.2d, v28.2d }, [x4], #32
+        str d24, [x0] // word 24 of state 0 (low lane of v24)
+        trn2 v25.2d, v24.2d, v24.2d // move the high lane of v24 into the low lane
+        str d25, [x4] // word 24 of state 1
+        ldp d8, d9, [sp, #0x90] // restore d8-d15 and x19-x30, then release the stack frame
+        ldp d10, d11, [sp, #0xa0]
+        ldp d12, d13, [sp, #0xb0]
+        ldp d14, d15, [sp, #0xc0]
+        ldp x19, x20, [sp, #0x30]
+        ldp x21, x22, [sp, #0x40]
+        ldp x23, x24, [sp, #0x50]
+        ldp x25, x26, [sp, #0x60]
+        ldp x27, x28, [sp, #0x70]
+        ldp x29, x30, [sp, #0x80]
+        add sp, sp, #0xe0
+        ret
+
+MLK_ASM_FN_SIZE(keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm)
+
+#endif /* MLK_FIPS202_AARCH64_NEED_X4_V8A_SCALAR_HYBRID && \
+          !MLK_CONFIG_MULTILEVEL_NO_SHARED */
+
+#if defined(__ELF__)
+#endif
+
+        END
diff --git a/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.asm b/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.asm
new file mode 100644
index 0000000000..7609ca0db8
--- /dev/null
+++ b/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.asm
@@ -0,0 +1,946 @@
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * mlkem/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly.
+ */ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +// Author: Hanno Becker +// Author: Matthias Kannwischer + +/*yaml + Name: keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm + Description: AArch64 hybrid scalar/vector implementation of Keccak-f[1600] permutation for four sequential states with ARMv8.4-A optimizations + Signature: void mlk_keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm(uint64_t state[100], const uint64_t rc[24]) + ABI: + x0: + type: buffer + size_bytes: 800 + permissions: read/write + c_parameter: uint64_t state[100] + description: Four sequential Keccak states (state0[25], state1[25], state2[25], state3[25]) + x1: + type: buffer + size_bytes: 192 + permissions: read-only + c_parameter: const uint64_t rc[24] + description: Round constants (24 x uint64_t) + Stack: + bytes: 224 + description: register preservation and temporary storage +*/ + +#include "../../../../common.h" +#if defined(MLK_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#if defined(__ARM_FEATURE_SHA3) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/fips202/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm) +MLK_ASM_FN_SYMBOL(keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm) + + sub sp, sp, #0xe0 + stp x19, x20, [sp, #0x30] + stp x21, x22, [sp, #0x40] + stp x23, x24, [sp, #0x50] + stp x25, x26, [sp, #0x60] + stp x27, x28, [sp, #0x70] + stp x29, x30, [sp, #0x80] + stp d8, d9, [sp, #0x90] + stp d10, d11, [sp, #0xa0] + stp d12, d13, [sp, #0xb0] + stp d14, d15, [sp, #0xc0] + mov x29, x1 + mov x30, #0x0 // =0 + str x30, [sp, #0x20] + str x29, [sp, #0x8] + str x29, [sp, #0x10] + str x0, [sp] + add x4, x0, #0xc8 + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v0.2d, v25.2d, v27.2d + trn2 v1.2d, v25.2d, v27.2d + trn1 v2.2d, v26.2d, v28.2d + trn2 v3.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v4.2d, v25.2d, v27.2d + trn2 v5.2d, v25.2d, v27.2d + trn1 v6.2d, v26.2d, v28.2d + trn2 v7.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v8.2d, v25.2d, v27.2d + trn2 v9.2d, v25.2d, v27.2d + trn1 v10.2d, v26.2d, v28.2d + trn2 v11.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v12.2d, v25.2d, v27.2d + trn2 v13.2d, v25.2d, v27.2d + trn1 v14.2d, v26.2d, v28.2d + trn2 v15.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v16.2d, v25.2d, v27.2d + trn2 v17.2d, v25.2d, v27.2d + trn1 v18.2d, v26.2d, v28.2d + trn2 v19.2d, v26.2d, v28.2d + ldp q25, q26, [x0], #0x20 + ld1 { v27.2d, v28.2d }, [x4], #32 + trn1 v20.2d, v25.2d, v27.2d + trn2 v21.2d, v25.2d, v27.2d + trn1 v22.2d, v26.2d, v28.2d + trn2 v23.2d, v26.2d, v28.2d + ldr d25, [x0] + ldr d27, [x4] + trn1 v24.2d, v25.2d, v27.2d + sub x0, x0, #0xc0 + add x0, x0, #0x190 + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, 
x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + sub x0, x0, #0x190 + +Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_initial + eor x30, x24, x25 + eor x27, x9, x10 + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor v30.16b, v30.16b, v15.16b + eor x0, x30, x21 + eor x26, x27, x6 + eor v30.16b, v30.16b, v20.16b + eor x27, x26, x7 + eor x29, x0, x22 + eor3 v29.16b, v1.16b, v6.16b, v11.16b + eor x26, x29, x23 + eor x29, x4, x5 + eor v29.16b, v29.16b, v16.16b + eor x30, x29, x1 + eor x0, x27, x8 + eor v29.16b, v29.16b, v21.16b + eor x29, x30, x2 + eor x30, x19, x20 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x30, x30, x16 + eor x27, x26, x0, ror #63 + eor v28.16b, v28.16b, v17.16b + eor x4, x4, x27 + eor x30, x30, x17 + eor v28.16b, v28.16b, v22.16b + eor x30, x30, x28 + eor x29, x29, x3 + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor x0, x0, x30, ror #63 + eor x30, x30, x29, ror #63 + eor v27.16b, v27.16b, v18.16b + eor x22, x22, x30 + eor v27.16b, v27.16b, v23.16b + eor x23, x23, x30 + str x23, [sp, #0xd0] + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor x23, x14, x15 + eor x14, x14, x0 + eor v26.16b, v26.16b, v19.16b + eor x23, x23, x11 + eor x15, x15, x0 + eor v26.16b, v26.16b, v24.16b + eor x1, x1, x27 + eor x23, x23, x12 + rax1 v25.2d, v30.2d, v28.2d + eor x23, x23, x13 + eor x11, x11, x0 + add v31.2d, v26.2d, v26.2d + eor x29, x29, x23, ror #63 + eor x23, x23, x26, ror #63 + sri v31.2d, v26.2d, #0x3f + eor x26, x13, x0 + eor x13, x28, x23 + eor v28.16b, v31.16b, v28.16b + eor x28, x24, x30 + eor x24, x16, x23 + rax1 v26.2d, v26.2d, v29.2d + eor x16, x21, x30 + eor x21, x25, x30 + add v31.2d, v27.2d, v27.2d + eor x30, x19, x23 + sri v31.2d, v27.2d, #0x3f + eor x19, x20, x23 + eor x20, x17, x23 + eor v29.16b, v31.16b, v29.16b + eor x17, x12, x0 + eor x0, x2, x27 + rax1 v27.2d, v27.2d, v30.2d + eor x2, x6, x29 + eor x6, x8, x29 + eor v30.16b, v0.16b, 
v26.16b + bic x8, x28, x13, ror #47 + eor x12, x3, x27 + eor v31.16b, v2.16b, v29.16b + bic x3, x13, x17, ror #19 + eor x5, x5, x27 + shl v0.2d, v31.2d, #0x3e + ldr x27, [sp, #0xd0] + bic x25, x17, x2, ror #5 + sri v0.2d, v31.2d, #0x2 + eor x9, x9, x29 + eor x23, x25, x5, ror #52 + xar v2.2d, v12.2d, v29.2d, #0x15 + eor x3, x3, x2, ror #24 + eor x8, x8, x17, ror #2 + eor v31.16b, v13.16b, v28.16b + eor x17, x10, x29 + bic x25, x12, x22, ror #47 + shl v12.2d, v31.2d, #0x19 + eor x29, x7, x29 + bic x10, x4, x27, ror #2 + sri v12.2d, v31.2d, #0x27 + bic x7, x5, x28, ror #10 + xar v13.2d, v19.2d, v27.2d, #0x38 + eor x10, x10, x20, ror #50 + eor x13, x7, x13, ror #57 + eor v31.16b, v23.16b, v28.16b + bic x7, x2, x5, ror #47 + eor x2, x25, x24, ror #39 + shl v19.2d, v31.2d, #0x38 + bic x25, x20, x11, ror #57 + bic x5, x17, x4, ror #25 + sri v19.2d, v31.2d, #0x8 + eor x25, x25, x17, ror #53 + bic x17, x11, x17, ror #60 + xar v23.2d, v15.2d, v26.2d, #0x17 + eor x28, x7, x28, ror #57 + bic x7, x9, x12, ror #42 + eor v31.16b, v1.16b, v25.16b + eor x7, x7, x22, ror #25 + bic x22, x22, x24, ror #56 + shl v15.2d, v31.2d, #0x1 + bic x24, x24, x15, ror #31 + eor x22, x22, x15, ror #23 + sri v15.2d, v31.2d, #0x3f + bic x20, x27, x20, ror #48 + bic x15, x15, x9, ror #16 + xar v1.2d, v8.2d, v28.2d, #0x9 + eor x12, x15, x12, ror #58 + eor x15, x5, x27, ror #27 + eor v31.16b, v16.16b, v25.16b + eor x5, x20, x11, ror #41 + shl v8.2d, v31.2d, #0x2d + ldr x11, [sp, #0x8] + eor x20, x17, x4, ror #21 + sri v8.2d, v31.2d, #0x13 + eor x17, x24, x9, ror #47 + mov x24, #0x1 // =1 + xar v16.2d, v7.2d, v29.2d, #0x3a + bic x9, x0, x16, ror #9 + str x24, [sp, #0x18] + eor v31.16b, v10.16b, v26.16b + bic x24, x29, x1, ror #44 + bic x27, x1, x21, ror #50 + shl v7.2d, v31.2d, #0x3 + bic x4, x26, x29, ror #63 + eor x1, x1, x4, ror #21 + sri v7.2d, v31.2d, #0x3d + ldr x11, [x11] + bic x4, x21, x30, ror #57 + xar v10.2d, v3.2d, v28.2d, #0x24 + eor x21, x24, x21, ror #30 + eor x24, x9, x19, ror #44 + eor 
v31.16b, v18.16b, v28.16b + bic x9, x14, x6, ror #5 + eor x9, x9, x0, ror #43 + shl v3.2d, v31.2d, #0x15 + bic x0, x6, x0, ror #38 + eor x1, x1, x11 + sri v3.2d, v31.2d, #0x2b + eor x11, x4, x26, ror #35 + eor x4, x0, x16, ror #47 + xar v18.2d, v17.2d, v29.2d, #0x31 + bic x0, x16, x19, ror #35 + eor v31.16b, v11.16b, v25.16b + eor x16, x27, x30, ror #43 + bic x27, x30, x26, ror #42 + shl v17.2d, v31.2d, #0xa + bic x26, x19, x14, ror #41 + eor x19, x0, x14, ror #12 + sri v17.2d, v31.2d, #0x36 + eor x14, x26, x6, ror #46 + eor x6, x27, x29, ror #41 + xar v11.2d, v9.2d, v27.2d, #0x2c + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + eor v31.16b, v22.16b, v29.16b + eor x26, x8, x9, ror #57 + eor x27, x0, x14, ror #10 + shl v9.2d, v31.2d, #0x3d + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + sri v9.2d, v31.2d, #0x3 + eor x30, x23, x22, ror #50 + eor x0, x26, x10, ror #31 + xar v22.2d, v14.2d, v27.2d, #0x19 + eor x29, x29, x19, ror #37 + eor x27, x27, x12, ror #5 + eor v31.16b, v20.16b, v26.16b + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + shl v14.2d, v31.2d, #0x12 + eor x26, x30, x21, ror #26 + sri v14.2d, v31.2d, #0x2e + eor x26, x26, x25, ror #15 + ror x30, x27, #0x3e + xar v20.2d, v4.2d, v27.2d, #0x25 + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + eor v31.16b, v24.16b, v27.16b + eor x16, x30, x16 + eor x28, x30, x28, ror #63 + shl v4.2d, v31.2d, #0xe + str x28, [sp, #0xd0] + eor x29, x29, x17, ror #36 + sri v4.2d, v31.2d, #0x32 + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + xar v24.2d, v21.2d, v25.2d, #0x3e + eor x29, x29, x20, ror #2 + eor x28, x28, x4, ror #54 + eor v31.16b, v5.16b, v26.16b + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + shl v21.2d, v31.2d, #0x24 + eor x28, x28, x5, ror #25 + ror x0, x0, #0x38 + sri v21.2d, v31.2d, #0x1c + eor x0, x0, x29, ror #63 + eor x27, x28, x27, ror #61 + xar v27.2d, v6.2d, v25.2d, #0x14 + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + bic v31.16b, v7.16b, 
v11.16b + eor x29, x30, x20, ror #2 + eor v5.16b, v31.16b, v10.16b + eor x20, x26, x3, ror #39 + eor x11, x0, x11, ror #50 + bcax v6.16b, v11.16b, v8.16b, v7.16b + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + bic v31.16b, v9.16b, v8.16b + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + eor v7.16b, v31.16b, v7.16b + eor x24, x28, x24, ror #28 + eor x1, x30, x17, ror #36 + bcax v8.16b, v8.16b, v10.16b, v9.16b + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + bic v31.16b, v11.16b, v10.16b + eor x8, x27, x8, ror #56 + eor x17, x27, x7, ror #19 + eor v9.16b, v31.16b, v9.16b + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + bcax v10.16b, v15.16b, v12.16b, v16.16b + eor x4, x26, x4, ror #54 + eor x0, x0, x12, ror #3 + bic v31.16b, v13.16b, v12.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + eor v11.16b, v31.16b, v16.16b + eor x26, x26, x5, ror #25 + bcax v12.16b, v12.16b, v14.16b, v13.16b + eor x2, x7, x16, ror #39 + bic x7, x9, x20, ror #42 + bic v31.16b, v15.16b, v14.16b + bic x30, x15, x9, ror #16 + eor x7, x7, x22, ror #25 + eor v13.16b, v31.16b, v13.16b + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + bic v31.16b, v16.16b, v15.16b + eor x30, x27, x6, ror #43 + eor x22, x20, x15, ror #23 + eor v14.16b, v31.16b, v14.16b + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + bcax v15.16b, v20.16b, v17.16b, v21.16b + bic x5, x13, x17, ror #63 + eor x5, x21, x5, ror #21 + bic v31.16b, v18.16b, v17.16b + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + eor v16.16b, v31.16b, v21.16b + bic x21, x21, x25, ror #50 + bic x20, x27, x4, ror #25 + bcax v17.16b, v17.16b, v19.16b, v18.16b + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + bic v31.16b, v20.16b, v19.16b + eor x21, x17, x25, ror #30 + bic x19, x25, x19, ror #57 + eor v18.16b, v31.16b, v18.16b + ldr x25, [sp, #0x18] + bcax v19.16b, v19.16b, v21.16b, v20.16b + eor x17, x10, x9, ror #47 + ldr x9, [sp, #0x8] + bic v31.16b, v22.16b, v1.16b + eor x15, 
x20, x28, ror #27 + bic x20, x4, x28, ror #2 + eor v20.16b, v31.16b, v0.16b + eor x10, x20, x1, ror #50 + bic x20, x11, x27, ror #60 + bcax v21.16b, v1.16b, v23.16b, v22.16b + eor x20, x20, x4, ror #21 + bic x4, x28, x1, ror #48 + bic v31.16b, v24.16b, v23.16b + bic x1, x1, x11, ror #57 + ldr x28, [x9, x25, lsl #3] + eor v22.16b, v31.16b, v22.16b + ldr x9, [sp, #0xd0] + add x25, x25, #0x1 + bcax v23.16b, v23.16b, v0.16b, v24.16b + str x25, [sp, #0x18] + cmp x25, #0x17 + bic v31.16b, v1.16b, v0.16b + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + eor v24.16b, v31.16b, v24.16b + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + bcax v0.16b, v30.16b, v2.16b, v27.16b + eor x11, x19, x13, ror #35 + bic v31.16b, v3.16b, v2.16b + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + eor v1.16b, v31.16b, v27.16b + bic x27, x24, x9, ror #47 + bic x19, x23, x3, ror #9 + bcax v2.16b, v2.16b, v4.16b, v3.16b + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic v31.16b, v30.16b, v4.16b + bic x29, x3, x29, ror #35 + eor x13, x13, x9, ror #57 + eor v3.16b, v31.16b, v3.16b + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + bcax v4.16b, v4.16b, v27.16b, v30.16b + bic x14, x14, x8, ror #5 + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + ldr x30, [sp, #0x10] + ld1r { v28.2d }, [x30], #8 + str x30, [sp, #0x10] + eor v0.16b, v0.16b, v28.16b + +Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_loop + eor x0, x15, x11, ror #52 + eor x0, x0, x13, ror #48 + eor3 v30.16b, v0.16b, v5.16b, v10.16b + eor v30.16b, v30.16b, v15.16b + eor x26, x8, x9, ror #57 + eor x27, x0, x14, ror #10 + eor v30.16b, v30.16b, v20.16b + eor x29, x16, x28, ror #63 + eor x26, x26, x6, ror #51 + eor3 v29.16b, v1.16b, v6.16b, v11.16b + eor x30, x23, x22, ror #50 + eor x0, x26, x10, ror #31 + eor v29.16b, v29.16b, v16.16b + eor x29, 
x29, x19, ror #37 + eor x27, x27, x12, ror #5 + eor v29.16b, v29.16b, v21.16b + eor x30, x30, x24, ror #34 + eor x0, x0, x7, ror #27 + eor3 v28.16b, v2.16b, v7.16b, v12.16b + eor x26, x30, x21, ror #26 + eor x26, x26, x25, ror #15 + eor v28.16b, v28.16b, v17.16b + ror x30, x27, #0x3e + eor x30, x30, x26, ror #57 + eor v28.16b, v28.16b, v22.16b + ror x26, x26, #0x3a + eor x16, x30, x16 + eor3 v27.16b, v3.16b, v8.16b, v13.16b + eor x28, x30, x28, ror #63 + str x28, [sp, #0xd0] + eor v27.16b, v27.16b, v18.16b + eor x29, x29, x17, ror #36 + eor x28, x1, x2, ror #61 + eor v27.16b, v27.16b, v23.16b + eor x19, x30, x19, ror #37 + eor x29, x29, x20, ror #2 + eor3 v26.16b, v4.16b, v9.16b, v14.16b + eor x28, x28, x4, ror #54 + eor x26, x26, x0, ror #55 + eor v26.16b, v26.16b, v19.16b + eor x28, x28, x3, ror #39 + eor x28, x28, x5, ror #25 + eor v26.16b, v26.16b, v24.16b + ror x0, x0, #0x38 + eor x0, x0, x29, ror #63 + rax1 v25.2d, v30.2d, v28.2d + eor x27, x28, x27, ror #61 + eor x13, x0, x13, ror #46 + add v31.2d, v26.2d, v26.2d + eor x28, x29, x28, ror #63 + eor x29, x30, x20, ror #2 + sri v31.2d, v26.2d, #0x3f + eor x20, x26, x3, ror #39 + eor x11, x0, x11, ror #50 + eor v28.16b, v31.16b, v28.16b + eor x25, x28, x25, ror #9 + eor x3, x28, x21, ror #20 + rax1 v26.2d, v26.2d, v29.2d + eor x21, x26, x1 + add v31.2d, v27.2d, v27.2d + eor x9, x27, x9, ror #49 + eor x24, x28, x24, ror #28 + sri v31.2d, v27.2d, #0x3f + eor x1, x30, x17, ror #36 + eor x14, x0, x14, ror #8 + eor v29.16b, v31.16b, v29.16b + eor x22, x28, x22, ror #44 + eor x8, x27, x8, ror #56 + rax1 v27.2d, v27.2d, v30.2d + eor x17, x27, x7, ror #19 + eor x15, x0, x15, ror #62 + eor v30.16b, v0.16b, v26.16b + bic x7, x20, x22, ror #47 + eor x4, x26, x4, ror #54 + eor v31.16b, v2.16b, v29.16b + eor x0, x0, x12, ror #3 + eor x28, x28, x23, ror #58 + shl v0.2d, v31.2d, #0x3e + eor x23, x26, x2, ror #61 + eor x26, x26, x5, ror #25 + sri v0.2d, v31.2d, #0x2 + eor x2, x7, x16, ror #39 + bic x7, x9, x20, ror #42 + xar 
v2.2d, v12.2d, v29.2d, #0x15 + bic x30, x15, x9, ror #16 + eor x7, x7, x22, ror #25 + eor v31.16b, v13.16b, v28.16b + eor x12, x30, x20, ror #58 + bic x20, x22, x16, ror #56 + shl v12.2d, v31.2d, #0x19 + eor x30, x27, x6, ror #43 + eor x22, x20, x15, ror #23 + sri v12.2d, v31.2d, #0x27 + bic x6, x19, x13, ror #42 + eor x6, x6, x17, ror #41 + xar v13.2d, v19.2d, v27.2d, #0x38 + bic x5, x13, x17, ror #63 + eor x5, x21, x5, ror #21 + eor v31.16b, v23.16b, v28.16b + bic x17, x17, x21, ror #44 + eor x27, x27, x10, ror #23 + shl v19.2d, v31.2d, #0x38 + bic x21, x21, x25, ror #50 + bic x20, x27, x4, ror #25 + sri v19.2d, v31.2d, #0x8 + bic x10, x16, x15, ror #31 + eor x16, x21, x19, ror #43 + xar v23.2d, v15.2d, v26.2d, #0x17 + eor x21, x17, x25, ror #30 + bic x19, x25, x19, ror #57 + eor v31.16b, v1.16b, v25.16b + ldr x25, [sp, #0x18] + eor x17, x10, x9, ror #47 + shl v15.2d, v31.2d, #0x1 + ldr x9, [sp, #0x8] + sri v15.2d, v31.2d, #0x3f + eor x15, x20, x28, ror #27 + bic x20, x4, x28, ror #2 + xar v1.2d, v8.2d, v28.2d, #0x9 + eor x10, x20, x1, ror #50 + bic x20, x11, x27, ror #60 + eor v31.16b, v16.16b, v25.16b + eor x20, x20, x4, ror #21 + bic x4, x28, x1, ror #48 + shl v8.2d, v31.2d, #0x2d + bic x1, x1, x11, ror #57 + ldr x28, [x9, x25, lsl #3] + sri v8.2d, v31.2d, #0x13 + ldr x9, [sp, #0xd0] + add x25, x25, #0x1 + xar v16.2d, v7.2d, v29.2d, #0x3a + str x25, [sp, #0x18] + cmp x25, #0x17 + eor v31.16b, v10.16b, v26.16b + eor x25, x1, x27, ror #53 + bic x27, x30, x26, ror #47 + shl v7.2d, v31.2d, #0x3 + eor x1, x5, x28 + eor x5, x4, x11, ror #41 + sri v7.2d, v31.2d, #0x3d + eor x11, x19, x13, ror #35 + bic x13, x26, x24, ror #10 + xar v10.2d, v3.2d, v28.2d, #0x24 + eor x28, x27, x24, ror #57 + bic x27, x24, x9, ror #47 + eor v31.16b, v18.16b, v28.16b + bic x19, x23, x3, ror #9 + bic x4, x29, x14, ror #41 + shl v3.2d, v31.2d, #0x15 + eor x24, x19, x29, ror #44 + bic x29, x3, x29, ror #35 + sri v3.2d, v31.2d, #0x2b + eor x13, x13, x9, ror #57 + eor x19, x29, x14, ror #12 + 
xar v18.2d, v17.2d, v29.2d, #0x31 + bic x29, x9, x0, ror #19 + bic x14, x14, x8, ror #5 + eor v31.16b, v11.16b, v25.16b + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + shl v17.2d, v31.2d, #0xa + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + sri v17.2d, v31.2d, #0x36 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + xar v11.2d, v9.2d, v27.2d, #0x2c + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + eor v31.16b, v22.16b, v29.16b + eor x0, x15, x11, ror #52 + shl v9.2d, v31.2d, #0x3d + eor x0, x0, x13, ror #48 + eor x26, x8, x9, ror #57 + sri v9.2d, v31.2d, #0x3 + eor x27, x0, x14, ror #10 + eor x29, x16, x28, ror #63 + xar v22.2d, v14.2d, v27.2d, #0x19 + eor x26, x26, x6, ror #51 + eor x30, x23, x22, ror #50 + eor v31.16b, v20.16b, v26.16b + eor x0, x26, x10, ror #31 + eor x29, x29, x19, ror #37 + shl v14.2d, v31.2d, #0x12 + eor x27, x27, x12, ror #5 + eor x30, x30, x24, ror #34 + sri v14.2d, v31.2d, #0x2e + eor x0, x0, x7, ror #27 + eor x26, x30, x21, ror #26 + xar v20.2d, v4.2d, v27.2d, #0x25 + eor x26, x26, x25, ror #15 + ror x30, x27, #0x3e + eor v31.16b, v24.16b, v27.16b + eor x30, x30, x26, ror #57 + ror x26, x26, #0x3a + shl v4.2d, v31.2d, #0xe + eor x16, x30, x16 + eor x28, x30, x28, ror #63 + sri v4.2d, v31.2d, #0x32 + str x28, [sp, #0xd0] + eor x29, x29, x17, ror #36 + xar v24.2d, v21.2d, v25.2d, #0x3e + eor x28, x1, x2, ror #61 + eor x19, x30, x19, ror #37 + eor v31.16b, v5.16b, v26.16b + eor x29, x29, x20, ror #2 + eor x28, x28, x4, ror #54 + shl v21.2d, v31.2d, #0x24 + eor x26, x26, x0, ror #55 + eor x28, x28, x3, ror #39 + sri v21.2d, v31.2d, #0x1c + eor x28, x28, x5, ror #25 + ror x0, x0, #0x38 + xar v27.2d, v6.2d, v25.2d, #0x14 + eor x0, x0, x29, ror #63 + eor x27, x28, x27, ror #61 + bic v31.16b, v7.16b, v11.16b + eor x13, x0, x13, ror #46 + eor x28, x29, x28, ror #63 + eor v5.16b, v31.16b, v10.16b + eor x29, x30, x20, ror #2 + eor x20, x26, x3, ror #39 + bcax v6.16b, v11.16b, v8.16b, v7.16b + eor x11, x0, x11, ror #50 + 
eor x25, x28, x25, ror #9 + bic v31.16b, v9.16b, v8.16b + eor x3, x28, x21, ror #20 + eor v7.16b, v31.16b, v7.16b + eor x21, x26, x1 + eor x9, x27, x9, ror #49 + bcax v8.16b, v8.16b, v10.16b, v9.16b + eor x24, x28, x24, ror #28 + eor x1, x30, x17, ror #36 + bic v31.16b, v11.16b, v10.16b + eor x14, x0, x14, ror #8 + eor x22, x28, x22, ror #44 + eor v9.16b, v31.16b, v9.16b + eor x8, x27, x8, ror #56 + eor x17, x27, x7, ror #19 + bcax v10.16b, v15.16b, v12.16b, v16.16b + eor x15, x0, x15, ror #62 + bic x7, x20, x22, ror #47 + bic v31.16b, v13.16b, v12.16b + eor x4, x26, x4, ror #54 + eor x0, x0, x12, ror #3 + eor v11.16b, v31.16b, v16.16b + eor x28, x28, x23, ror #58 + eor x23, x26, x2, ror #61 + bcax v12.16b, v12.16b, v14.16b, v13.16b + eor x26, x26, x5, ror #25 + eor x2, x7, x16, ror #39 + bic v31.16b, v15.16b, v14.16b + bic x7, x9, x20, ror #42 + bic x30, x15, x9, ror #16 + eor v13.16b, v31.16b, v13.16b + eor x7, x7, x22, ror #25 + eor x12, x30, x20, ror #58 + bic v31.16b, v16.16b, v15.16b + bic x20, x22, x16, ror #56 + eor x30, x27, x6, ror #43 + eor v14.16b, v31.16b, v14.16b + eor x22, x20, x15, ror #23 + bic x6, x19, x13, ror #42 + bcax v15.16b, v20.16b, v17.16b, v21.16b + eor x6, x6, x17, ror #41 + bic x5, x13, x17, ror #63 + bic v31.16b, v18.16b, v17.16b + eor x5, x21, x5, ror #21 + bic x17, x17, x21, ror #44 + eor v16.16b, v31.16b, v21.16b + eor x27, x27, x10, ror #23 + bic x21, x21, x25, ror #50 + bcax v17.16b, v17.16b, v19.16b, v18.16b + bic x20, x27, x4, ror #25 + bic x10, x16, x15, ror #31 + bic v31.16b, v20.16b, v19.16b + eor x16, x21, x19, ror #43 + eor x21, x17, x25, ror #30 + eor v18.16b, v31.16b, v18.16b + bic x19, x25, x19, ror #57 + ldr x25, [sp, #0x18] + bcax v19.16b, v19.16b, v21.16b, v20.16b + eor x17, x10, x9, ror #47 + bic v31.16b, v22.16b, v1.16b + ldr x9, [sp, #0x8] + eor x15, x20, x28, ror #27 + eor v20.16b, v31.16b, v0.16b + bic x20, x4, x28, ror #2 + eor x10, x20, x1, ror #50 + bcax v21.16b, v1.16b, v23.16b, v22.16b + bic x20, x11, x27, 
ror #60 + eor x20, x20, x4, ror #21 + bic v31.16b, v24.16b, v23.16b + bic x4, x28, x1, ror #48 + bic x1, x1, x11, ror #57 + eor v22.16b, v31.16b, v22.16b + ldr x28, [x9, x25, lsl #3] + ldr x9, [sp, #0xd0] + bcax v23.16b, v23.16b, v0.16b, v24.16b + add x25, x25, #0x1 + str x25, [sp, #0x18] + bic v31.16b, v1.16b, v0.16b + cmp x25, #0x17 + eor x25, x1, x27, ror #53 + eor v24.16b, v31.16b, v24.16b + bic x27, x30, x26, ror #47 + eor x1, x5, x28 + bcax v0.16b, v30.16b, v2.16b, v27.16b + eor x5, x4, x11, ror #41 + eor x11, x19, x13, ror #35 + bic v31.16b, v3.16b, v2.16b + bic x13, x26, x24, ror #10 + eor x28, x27, x24, ror #57 + eor v1.16b, v31.16b, v27.16b + bic x27, x24, x9, ror #47 + bic x19, x23, x3, ror #9 + bcax v2.16b, v2.16b, v4.16b, v3.16b + bic x4, x29, x14, ror #41 + eor x24, x19, x29, ror #44 + bic v31.16b, v30.16b, v4.16b + bic x29, x3, x29, ror #35 + eor x13, x13, x9, ror #57 + eor v3.16b, v31.16b, v3.16b + eor x19, x29, x14, ror #12 + bic x29, x9, x0, ror #19 + bcax v4.16b, v4.16b, v27.16b, v30.16b + bic x14, x14, x8, ror #5 + eor x9, x14, x23, ror #43 + eor x14, x4, x8, ror #46 + bic x23, x8, x23, ror #38 + eor x8, x27, x0, ror #2 + eor x4, x23, x3, ror #47 + bic x3, x0, x30, ror #5 + eor x23, x3, x26, ror #52 + eor x3, x29, x30, ror #24 + ldr x30, [sp, #0x10] + ld1r { v28.2d }, [x30], #8 + str x30, [sp, #0x10] + eor v0.16b, v0.16b, v28.16b + +Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_loop_end + b.le Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_loop + ror x2, x2, #0x3d + ror x3, x3, #0x27 + ror x4, x4, #0x36 + ror x5, x5, #0x19 + ror x6, x6, #0x2b + ror x7, x7, #0x13 + ror x8, x8, #0x38 + ror x9, x9, #0x31 + ror x10, x10, #0x17 + ror x11, x11, #0x32 + ror x12, x12, #0x3 + ror x13, x13, #0x2e + ror x14, x14, #0x8 + ror x15, x15, #0x3e + ror x17, x17, #0x24 + ror x28, x28, #0x3f + ror x19, x19, #0x25 + ror x20, x20, #0x2 + ror x21, x21, #0x14 + ror x22, x22, #0x2c + ror x23, x23, #0x3a + ror x24, x24, #0x1c + ror x25, x25, #0x9 + ldr x30, [sp, #0x20] + cmp x30, 
#0x1 + b.eq Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_done + mov x30, #0x1 // =1 + str x30, [sp, #0x20] + ldr x0, [sp] + add x0, x0, #0x190 + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + sub x0, x0, #0x190 + add x0, x0, #0x258 + ldp x1, x6, [x0] + ldp x11, x16, [x0, #0x10] + ldp x21, x2, [x0, #0x20] + ldp x7, x12, [x0, #0x30] + ldp x17, x22, [x0, #0x40] + ldp x3, x8, [x0, #0x50] + ldp x13, x28, [x0, #0x60] + ldp x23, x4, [x0, #0x70] + ldp x9, x14, [x0, #0x80] + ldp x19, x24, [x0, #0x90] + ldp x5, x10, [x0, #0xa0] + ldp x15, x20, [x0, #0xb0] + ldr x25, [x0, #0xc0] + sub x0, x0, #0x258 + b Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_initial + +Lkeccak_f1600_x4_v8a_v84a_scalar_hybrid_done + ldr x0, [sp] + add x0, x0, #0x258 + stp x1, x6, [x0] + stp x11, x16, [x0, #0x10] + stp x21, x2, [x0, #0x20] + stp x7, x12, [x0, #0x30] + stp x17, x22, [x0, #0x40] + stp x3, x8, [x0, #0x50] + stp x13, x28, [x0, #0x60] + stp x23, x4, [x0, #0x70] + stp x9, x14, [x0, #0x80] + stp x19, x24, [x0, #0x90] + stp x5, x10, [x0, #0xa0] + stp x15, x20, [x0, #0xb0] + str x25, [x0, #0xc0] + sub x0, x0, #0x258 + add x4, x0, #0xc8 + trn1 v25.2d, v0.2d, v1.2d + trn1 v26.2d, v2.2d, v3.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v0.2d, v1.2d + trn2 v28.2d, v2.2d, v3.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v4.2d, v5.2d + trn1 v26.2d, v6.2d, v7.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v4.2d, v5.2d + trn2 v28.2d, v6.2d, v7.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v8.2d, v9.2d + trn1 v26.2d, v10.2d, v11.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v8.2d, v9.2d + trn2 v28.2d, v10.2d, v11.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v12.2d, v13.2d + trn1 v26.2d, v14.2d, v15.2d 
+ stp q25, q26, [x0], #0x20 + trn2 v27.2d, v12.2d, v13.2d + trn2 v28.2d, v14.2d, v15.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v16.2d, v17.2d + trn1 v26.2d, v18.2d, v19.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v16.2d, v17.2d + trn2 v28.2d, v18.2d, v19.2d + st1 { v27.2d, v28.2d }, [x4], #32 + trn1 v25.2d, v20.2d, v21.2d + trn1 v26.2d, v22.2d, v23.2d + stp q25, q26, [x0], #0x20 + trn2 v27.2d, v20.2d, v21.2d + trn2 v28.2d, v22.2d, v23.2d + st1 { v27.2d, v28.2d }, [x4], #32 + str d24, [x0] + trn2 v25.2d, v24.2d, v24.2d + str d25, [x4] + ldp d8, d9, [sp, #0x90] + ldp d10, d11, [sp, #0xa0] + ldp d12, d13, [sp, #0xb0] + ldp d14, d15, [sp, #0xc0] + ldp x19, x20, [sp, #0x30] + ldp x21, x22, [sp, #0x40] + ldp x23, x24, [sp, #0x50] + ldp x25, x26, [sp, #0x60] + ldp x27, x28, [sp, #0x70] + ldp x29, x30, [sp, #0x80] + add sp, sp, #0xe0 + ret + +MLK_ASM_FN_SIZE(keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm) + +#endif /* __ARM_FEATURE_SHA3 */ + +#endif /* MLK_FIPS202_AARCH64_NEED_X4_V8A_V84A_SCALAR_HYBRID && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +#endif + + END diff --git a/mlkem/src/native/aarch64/src/intt_aarch64_asm.asm b/mlkem/src/native/aarch64/src/intt_aarch64_asm.asm new file mode 100644 index 0000000000..1f34120712 --- /dev/null +++ b/mlkem/src/native/aarch64/src/intt_aarch64_asm.asm @@ -0,0 +1,612 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/native/aarch64/src/intt_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. 
+ */ +/* Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Hanno Becker + * Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [NeonNTT] + * Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 + * Becker, Hwang, Kannwischer, Yang, Yang + * https://eprint.iacr.org/2021/986 + * + * - [SLOTHY_Paper] + * Fast and Clean: Auditable high-performance assembly via constraint solving + * Abdulrahman, Becker, Kannwischer, Klein + * https://eprint.iacr.org/2022/1303 + */ + +/*yaml + Name: intt_aarch64_asm + Description: AArch64 ML-KEM inverse NTT following @[NeonNTT] and @[SLOTHY_Paper] + Signature: void mlk_intt_aarch64_asm(int16_t p[256], const int16_t twiddles12345[80], const int16_t twiddles56[384]) + ABI: + x0: + type: buffer + size_bytes: 512 + permissions: read/write + c_parameter: int16_t p[256] + description: Input/output polynomial + x1: + type: buffer + size_bytes: 160 + permissions: read-only + c_parameter: const int16_t twiddles12345[80] + description: Twiddle factors for layers 1-5 + x2: + type: buffer + size_bytes: 768 + permissions: read-only + c_parameter: const int16_t twiddles56[384] + description: Twiddle factors for layers 6-7 + Stack: + bytes: 64 + description: saving callee-saved Neon registers +*/ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/aarch64_opt/src/intt_aarch64_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(intt_aarch64_asm) +MLK_ASM_FN_SYMBOL(intt_aarch64_asm) + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w5, #0xd01 // =3329 + mov v7.h[0], w5 + mov w5, #0x4ebf // =20159 + mov v7.h[1], w5 + mov w5, #0x200 // =512 + dup v29.8h, w5 + mov w5, #0x13b0 // =5040 + dup v30.8h, w5 + mov x3, x0 + mov x4, #0x8 // =8 + ldr q13, [x3, #0x20] + ldr q8, [x3, #0x30] + ldr q6, [x3] + ldr q16, [x3, #0x10] + ldr q4, [x3, #0x50] + ldr q11, [x3, #0x40] + ldr q3, [x3, #0x70] + trn1 v23.4s, v13.4s, v8.4s + ldr q0, [x3, #0x60] + trn2 v19.4s, v6.4s, v16.4s + trn2 v21.4s, v13.4s, v8.4s + trn1 v6.4s, v6.4s, v16.4s + ldr q24, [x2, #0x20] + trn1 v10.2d, v19.2d, v21.2d + ldr q16, [x2], #0x60 + trn1 v5.2d, v6.2d, v23.2d + trn1 v28.4s, v0.4s, v3.4s + trn2 v18.2d, v6.2d, v23.2d + mul v31.8h, v10.8h, v29.8h + trn2 v13.4s, v0.4s, v3.4s + ldur q14, [x2, #-0x50] + sqrdmulh v26.8h, v18.8h, v30.8h + ldur q20, [x2, #-0x20] + mul v17.8h, v18.8h, v29.8h + trn2 v18.2d, v19.2d, v21.2d + mul v9.8h, v18.8h, v29.8h + trn1 v12.4s, v11.4s, v4.4s + sqrdmulh v22.8h, v18.8h, v30.8h + sqrdmulh v3.8h, v10.8h, v30.8h + sqrdmulh v25.8h, v5.8h, v30.8h + mls v9.8h, v22.8h, v7.h[0] + mls v17.8h, v26.8h, v7.h[0] + trn2 v26.4s, v11.4s, v4.4s + mul v8.8h, v5.8h, v29.8h + trn1 v10.2d, v26.2d, v13.2d + ldur q11, [x2, #-0x10] + mls v31.8h, v3.8h, v7.h[0] + trn1 v6.2d, v12.2d, v28.2d + trn2 v3.2d, v26.2d, v13.2d + ldur q4, [x2, #-0x30] + mls v8.8h, v25.8h, v7.h[0] + sub v19.8h, v17.8h, v9.8h + trn2 v13.2d, v12.2d, v28.2d + sqrdmulh v1.8h, v3.8h, v30.8h + add v9.8h, v17.8h, v9.8h + mul v18.8h, v19.8h, v20.8h + add v28.8h, v8.8h, v31.8h + sqrdmulh v20.8h, v19.8h, v11.8h + sub v12.8h, v28.8h, v9.8h + sub v23.8h, v8.8h, v31.8h + sqrdmulh v11.8h, v13.8h, v30.8h + sqrdmulh v5.8h, v23.8h, v4.8h + mul v0.8h, v23.8h, v24.8h + mul v2.8h, v13.8h, v29.8h + mls v0.8h, v5.8h, v7.h[0] + add 
v24.8h, v28.8h, v9.8h + mls v18.8h, v20.8h, v7.h[0] + sqrdmulh v15.8h, v6.8h, v30.8h + sqrdmulh v25.8h, v12.8h, v14.8h + mul v21.8h, v12.8h, v16.8h + sub v23.8h, v0.8h, v18.8h + sqrdmulh v8.8h, v23.8h, v14.8h + mul v23.8h, v23.8h, v16.8h + mls v21.8h, v25.8h, v7.h[0] + mls v23.8h, v8.8h, v7.h[0] + mul v14.8h, v3.8h, v29.8h + add v3.8h, v0.8h, v18.8h + trn2 v4.4s, v24.4s, v3.4s + mls v14.8h, v1.8h, v7.h[0] + trn1 v9.4s, v24.4s, v3.4s + trn2 v12.4s, v21.4s, v23.4s + mls v2.8h, v11.8h, v7.h[0] + trn1 v28.4s, v21.4s, v23.4s + ldr q11, [x1], #0x10 + mul v31.8h, v10.8h, v29.8h + trn1 v25.2d, v4.2d, v12.2d + trn1 v20.2d, v9.2d, v28.2d + ldr q23, [x2, #0x50] + trn2 v13.2d, v4.2d, v12.2d + sqrdmulh v21.8h, v10.8h, v30.8h + trn2 v4.2d, v9.2d, v28.2d + ldr q9, [x2, #0x40] + mul v27.8h, v6.8h, v29.8h + add v26.8h, v20.8h, v25.8h + sub v3.8h, v2.8h, v14.8h + sqdmulh v12.8h, v26.8h, v7.h[1] + add v5.8h, v4.8h, v13.8h + sub v8.8h, v4.8h, v13.8h + add v10.8h, v2.8h, v14.8h + sqdmulh v6.8h, v5.8h, v7.h[1] + ldr q2, [x2, #0x10] + mls v27.8h, v15.8h, v7.h[0] + ldr q15, [x2, #0x20] + srshr v12.8h, v12.8h, #0xb + mls v31.8h, v21.8h, v7.h[0] + srshr v6.8h, v6.8h, #0xb + sqrdmulh v23.8h, v3.8h, v23.8h + mls v26.8h, v12.8h, v7.h[0] + add v21.8h, v27.8h, v31.8h + mls v5.8h, v6.8h, v7.h[0] + sub v6.8h, v27.8h, v31.8h + sub v14.8h, v21.8h, v10.8h + ldr q27, [x2], #0x60 + mul v3.8h, v3.8h, v9.8h + mls v3.8h, v23.8h, v7.h[0] + ldur q13, [x2, #-0x30] + sub v12.8h, v26.8h, v5.8h + add v5.8h, v26.8h, v5.8h + sqrdmulh v31.8h, v8.8h, v11.h[5] + sqrdmulh v19.8h, v12.8h, v11.h[1] + mul v24.8h, v12.8h, v11.h[0] + sqrdmulh v13.8h, v6.8h, v13.8h + mls v24.8h, v19.8h, v7.h[0] + sub x4, x4, #0x2 + +Lintt_layer4567_start + add v16.8h, v21.8h, v10.8h + mul v18.8h, v6.8h, v15.8h + sub v19.8h, v20.8h, v25.8h + ldr q21, [x3, #0xa0] + str q5, [x3], #0x40 + mls v18.8h, v13.8h, v7.h[0] + sqrdmulh v15.8h, v14.8h, v2.8h + ldr q10, [x3, #0x50] + ldr q12, [x3, #0x40] + stur q24, [x3, #-0x20] + mul v5.8h, v8.8h, 
v11.h[4] + sub v0.8h, v18.8h, v3.8h + ldr q24, [x3, #0x70] + mls v5.8h, v31.8h, v7.h[0] + ldr q26, [x2, #0x50] + trn2 v1.4s, v12.4s, v10.4s + add v6.8h, v18.8h, v3.8h + sqrdmulh v20.8h, v0.8h, v2.8h + trn1 v13.4s, v12.4s, v10.4s + trn1 v18.4s, v16.4s, v6.4s + mul v22.8h, v0.8h, v27.8h + trn1 v17.4s, v21.4s, v24.4s + sqrdmulh v0.8h, v19.8h, v11.h[3] + trn1 v25.2d, v13.2d, v17.2d + mls v22.8h, v20.8h, v7.h[0] + trn2 v21.4s, v21.4s, v24.4s + mul v24.8h, v25.8h, v29.8h + trn2 v28.2d, v13.2d, v17.2d + sqrdmulh v4.8h, v25.8h, v30.8h + trn2 v3.2d, v1.2d, v21.2d + mul v17.8h, v28.8h, v29.8h + sqrdmulh v31.8h, v28.8h, v30.8h + ldr q2, [x2, #0x10] + mls v24.8h, v4.8h, v7.h[0] + mul v4.8h, v19.8h, v11.h[2] + ldr q19, [x2, #0x40] + mls v4.8h, v0.8h, v7.h[0] + mul v0.8h, v14.8h, v27.8h + mls v0.8h, v15.8h, v7.h[0] + sub v8.8h, v4.8h, v5.8h + mul v12.8h, v3.8h, v29.8h + mul v23.8h, v8.8h, v11.h[0] + trn2 v28.4s, v16.4s, v6.4s + sqrdmulh v10.8h, v8.8h, v11.h[1] + trn1 v9.4s, v0.4s, v22.4s + trn2 v22.4s, v0.4s, v22.4s + ldr q11, [x1], #0x10 + mls v17.8h, v31.8h, v7.h[0] + trn1 v20.2d, v18.2d, v9.2d + trn2 v14.2d, v18.2d, v9.2d + ldr q15, [x2, #0x20] + trn1 v6.2d, v1.2d, v21.2d + sqrdmulh v9.8h, v3.8h, v30.8h + trn1 v25.2d, v28.2d, v22.2d + trn2 v16.2d, v28.2d, v22.2d + mls v23.8h, v10.8h, v7.h[0] + add v1.8h, v20.8h, v25.8h + sqrdmulh v21.8h, v6.8h, v30.8h + add v8.8h, v14.8h, v16.8h + ldr q27, [x2], #0x60 + sqdmulh v28.8h, v8.8h, v7.h[1] + mls v12.8h, v9.8h, v7.h[0] + sqdmulh v31.8h, v1.8h, v7.h[1] + mul v0.8h, v6.8h, v29.8h + sub v10.8h, v17.8h, v12.8h + mls v0.8h, v21.8h, v7.h[0] + srshr v21.8h, v28.8h, #0xb + srshr v13.8h, v31.8h, #0xb + sqrdmulh v22.8h, v10.8h, v26.8h + mls v8.8h, v21.8h, v7.h[0] + mls v1.8h, v13.8h, v7.h[0] + add v21.8h, v24.8h, v0.8h + stur q23, [x3, #-0x10] + sub v6.8h, v24.8h, v0.8h + mul v3.8h, v10.8h, v19.8h + add v0.8h, v4.8h, v5.8h + sqdmulh v13.8h, v0.8h, v7.h[1] + ldur q10, [x2, #-0x30] + add v5.8h, v1.8h, v8.8h + mls v3.8h, v22.8h, v7.h[0] + sub 
v8.8h, v1.8h, v8.8h + mul v24.8h, v8.8h, v11.h[0] + sqrdmulh v8.8h, v8.8h, v11.h[1] + srshr v1.8h, v13.8h, #0xb + sqrdmulh v13.8h, v6.8h, v10.8h + mls v0.8h, v1.8h, v7.h[0] + add v10.8h, v17.8h, v12.8h + mls v24.8h, v8.8h, v7.h[0] + sub v8.8h, v14.8h, v16.8h + sqrdmulh v31.8h, v8.8h, v11.h[5] + sub v14.8h, v21.8h, v10.8h + stur q0, [x3, #-0x30] + subs x4, x4, #0x1 + cbnz x4, Lintt_layer4567_start + mul v15.8h, v6.8h, v15.8h + sub v22.8h, v20.8h, v25.8h + add v4.8h, v21.8h, v10.8h + str q24, [x3, #0x20] + mls v15.8h, v13.8h, v7.h[0] + str q5, [x3], #0x40 + ldr q9, [x1], #0x10 + sqrdmulh v28.8h, v14.8h, v2.8h + mul v16.8h, v14.8h, v27.8h + sub v18.8h, v15.8h, v3.8h + add v15.8h, v15.8h, v3.8h + sqrdmulh v0.8h, v18.8h, v2.8h + trn2 v24.4s, v4.4s, v15.4s + trn1 v2.4s, v4.4s, v15.4s + mul v18.8h, v18.8h, v27.8h + mls v16.8h, v28.8h, v7.h[0] + mls v18.8h, v0.8h, v7.h[0] + mul v23.8h, v8.8h, v11.h[4] + sqrdmulh v12.8h, v22.8h, v11.h[3] + trn1 v17.4s, v16.4s, v18.4s + trn2 v4.4s, v16.4s, v18.4s + mls v23.8h, v31.8h, v7.h[0] + trn2 v3.2d, v2.2d, v17.2d + trn2 v6.2d, v24.2d, v4.2d + mul v26.8h, v22.8h, v11.h[2] + trn1 v28.2d, v2.2d, v17.2d + mls v26.8h, v12.8h, v7.h[0] + add v25.8h, v3.8h, v6.8h + sub v18.8h, v3.8h, v6.8h + trn1 v24.2d, v24.2d, v4.2d + sqdmulh v1.8h, v25.8h, v7.h[1] + sub v27.8h, v28.8h, v24.8h + sqrdmulh v2.8h, v18.8h, v9.h[5] + add v28.8h, v28.8h, v24.8h + mul v24.8h, v27.8h, v9.h[2] + sqdmulh v12.8h, v28.8h, v7.h[1] + mul v20.8h, v18.8h, v9.h[4] + mls v20.8h, v2.8h, v7.h[0] + srshr v1.8h, v1.8h, #0xb + sqrdmulh v19.8h, v27.8h, v9.h[3] + srshr v15.8h, v12.8h, #0xb + mls v25.8h, v1.8h, v7.h[0] + add v8.8h, v26.8h, v23.8h + sub v4.8h, v26.8h, v23.8h + mls v28.8h, v15.8h, v7.h[0] + mls v24.8h, v19.8h, v7.h[0] + mul v2.8h, v4.8h, v11.h[0] + sub v19.8h, v28.8h, v25.8h + sqrdmulh v15.8h, v4.8h, v11.h[1] + add v25.8h, v28.8h, v25.8h + sub v10.8h, v24.8h, v20.8h + str q25, [x3], #0x40 + sqrdmulh v22.8h, v19.8h, v9.h[1] + add v28.8h, v24.8h, v20.8h + sqrdmulh 
v25.8h, v10.8h, v9.h[1] + mul v27.8h, v19.8h, v9.h[0] + mul v26.8h, v10.8h, v9.h[0] + sqdmulh v20.8h, v28.8h, v7.h[1] + sqdmulh v16.8h, v8.8h, v7.h[1] + mls v26.8h, v25.8h, v7.h[0] + mls v2.8h, v15.8h, v7.h[0] + srshr v15.8h, v20.8h, #0xb + srshr v1.8h, v16.8h, #0xb + mls v27.8h, v22.8h, v7.h[0] + mls v28.8h, v15.8h, v7.h[0] + mls v8.8h, v1.8h, v7.h[0] + stur q27, [x3, #-0x20] + stur q2, [x3, #-0x50] + stur q28, [x3, #-0x30] + stur q26, [x3, #-0x10] + stur q8, [x3, #-0x70] + mov x4, #0x4 // =4 + ldr q0, [x1], #0x20 + ldur q1, [x1, #-0x10] + ldr q26, [x0] + ldr q13, [x0, #0x40] + ldr q28, [x0, #0xc0] + ldr q2, [x0, #0x140] + ldr q6, [x0, #0x80] + ldr q9, [x0, #0x100] + ldr q29, [x0, #0x1c0] + ldr q23, [x0, #0x180] + sub v17.8h, v26.8h, v13.8h + add v4.8h, v26.8h, v13.8h + ldr q25, [x0, #0xd0] + ldr q24, [x0, #0x50] + add v5.8h, v6.8h, v28.8h + mul v19.8h, v17.8h, v0.h[6] + sub v10.8h, v6.8h, v28.8h + ldr q30, [x0, #0x150] + sqrdmulh v12.8h, v17.8h, v0.h[7] + add v17.8h, v9.8h, v2.8h + sub v28.8h, v9.8h, v2.8h + ldr q2, [x0, #0x90] + sub v26.8h, v23.8h, v29.8h + sqrdmulh v31.8h, v10.8h, v1.h[1] + add v22.8h, v23.8h, v29.8h + ldr q3, [x0, #0x110] + sqrdmulh v9.8h, v28.8h, v1.h[3] + sub v20.8h, v4.8h, v5.8h + sub v27.8h, v17.8h, v22.8h + ldr q29, [x0, #0x10] + add v16.8h, v4.8h, v5.8h + sqrdmulh v4.8h, v26.8h, v1.h[5] + add v6.8h, v17.8h, v22.8h + ldr q22, [x0, #0x1d0] + mul v8.8h, v28.8h, v1.h[2] + sub v21.8h, v2.8h, v25.8h + sub v5.8h, v16.8h, v6.8h + mul v17.8h, v26.8h, v1.h[4] + mul v26.8h, v10.8h, v1.h[0] + mls v26.8h, v31.8h, v7.h[0] + mls v17.8h, v4.8h, v7.h[0] + mls v19.8h, v12.8h, v7.h[0] + mls v8.8h, v9.8h, v7.h[0] + sqrdmulh v10.8h, v27.8h, v0.h[5] + sub v12.8h, v19.8h, v26.8h + add v9.8h, v19.8h, v26.8h + sqrdmulh v26.8h, v20.8h, v0.h[3] + sub v11.8h, v8.8h, v17.8h + add v14.8h, v8.8h, v17.8h + sqrdmulh v13.8h, v12.8h, v0.h[3] + add v23.8h, v9.8h, v14.8h + sqrdmulh v28.8h, v11.8h, v0.h[5] + sub v19.8h, v9.8h, v14.8h + mul v17.8h, v27.8h, v0.h[4] + str q23, 
[x0, #0x40] + mul v14.8h, v20.8h, v0.h[2] + mul v8.8h, v11.8h, v0.h[4] + mul v4.8h, v12.8h, v0.h[2] + mls v14.8h, v26.8h, v7.h[0] + mls v17.8h, v10.8h, v7.h[0] + mls v8.8h, v28.8h, v7.h[0] + mls v4.8h, v13.8h, v7.h[0] + sub v10.8h, v14.8h, v17.8h + add v20.8h, v14.8h, v17.8h + sqrdmulh v28.8h, v5.8h, v0.h[1] + mul v18.8h, v5.8h, v0.h[0] + str q20, [x0, #0x80] + sub v13.8h, v4.8h, v8.8h + mul v23.8h, v10.8h, v0.h[0] + mul v17.8h, v19.8h, v0.h[0] + sqrdmulh v9.8h, v13.8h, v0.h[1] + mls v18.8h, v28.8h, v7.h[0] + sqrdmulh v10.8h, v10.8h, v0.h[1] + sub x4, x4, #0x2 + +Lintt_layer123_start + sub v12.8h, v3.8h, v30.8h + mul v11.8h, v21.8h, v1.h[0] + add v28.8h, v4.8h, v8.8h + ldr q20, [x0, #0x190] + add v27.8h, v16.8h, v6.8h + sqrdmulh v8.8h, v12.8h, v1.h[3] + add v16.8h, v29.8h, v24.8h + str q28, [x0, #0xc0] + mls v23.8h, v10.8h, v7.h[0] + str q27, [x0], #0x10 + add v15.8h, v20.8h, v22.8h + str q18, [x0, #0xf0] + mul v14.8h, v13.8h, v0.h[0] + add v2.8h, v2.8h, v25.8h + sub v26.8h, v20.8h, v22.8h + mul v4.8h, v12.8h, v1.h[2] + sub v5.8h, v16.8h, v2.8h + str q23, [x0, #0x170] + add v20.8h, v3.8h, v30.8h + sqrdmulh v27.8h, v26.8h, v1.h[5] + add v16.8h, v16.8h, v2.8h + mul v18.8h, v26.8h, v1.h[4] + sub v31.8h, v20.8h, v15.8h + mls v4.8h, v8.8h, v7.h[0] + sub v28.8h, v29.8h, v24.8h + mls v18.8h, v27.8h, v7.h[0] + ldr q22, [x0, #0x1d0] + mul v26.8h, v28.8h, v0.h[6] + mul v2.8h, v5.8h, v0.h[2] + sub v12.8h, v4.8h, v18.8h + sqrdmulh v24.8h, v28.8h, v0.h[7] + mls v14.8h, v9.8h, v7.h[0] + sqrdmulh v10.8h, v12.8h, v0.h[5] + mls v26.8h, v24.8h, v7.h[0] + ldr q24, [x0, #0x50] + mul v8.8h, v12.8h, v0.h[4] + str q14, [x0, #0x1b0] + add v28.8h, v4.8h, v18.8h + sqrdmulh v5.8h, v5.8h, v0.h[3] + add v6.8h, v20.8h, v15.8h + sqrdmulh v3.8h, v19.8h, v0.h[1] + sub v13.8h, v16.8h, v6.8h + sqrdmulh v12.8h, v21.8h, v1.h[1] + sqrdmulh v21.8h, v13.8h, v0.h[1] + sqrdmulh v27.8h, v31.8h, v0.h[5] + ldr q25, [x0, #0xd0] + mls v11.8h, v12.8h, v7.h[0] + mul v23.8h, v31.8h, v0.h[4] + mul v18.8h, v13.8h, 
v0.h[0] + add v30.8h, v26.8h, v11.8h + sub v13.8h, v26.8h, v11.8h + mls v23.8h, v27.8h, v7.h[0] + add v12.8h, v30.8h, v28.8h + sub v19.8h, v30.8h, v28.8h + mls v2.8h, v5.8h, v7.h[0] + str q12, [x0, #0x40] + sqrdmulh v26.8h, v13.8h, v0.h[3] + mls v8.8h, v10.8h, v7.h[0] + ldr q30, [x0, #0x150] + sub v20.8h, v2.8h, v23.8h + mul v4.8h, v13.8h, v0.h[2] + add v13.8h, v2.8h, v23.8h + mls v4.8h, v26.8h, v7.h[0] + ldr q2, [x0, #0x90] + mul v23.8h, v20.8h, v0.h[0] + ldr q29, [x0, #0x10] + sqrdmulh v10.8h, v20.8h, v0.h[1] + str q13, [x0, #0x80] + sub v13.8h, v4.8h, v8.8h + mls v17.8h, v3.8h, v7.h[0] + ldr q3, [x0, #0x110] + mls v18.8h, v21.8h, v7.h[0] + sub v21.8h, v2.8h, v25.8h + sqrdmulh v9.8h, v13.8h, v0.h[1] + str q17, [x0, #0x130] + mul v17.8h, v19.8h, v0.h[0] + subs x4, x4, #0x1 + cbnz x4, Lintt_layer123_start + mls v23.8h, v10.8h, v7.h[0] + ldr q11, [x0, #0x190] + str q18, [x0, #0x100] + add v27.8h, v3.8h, v30.8h + mul v13.8h, v13.8h, v0.h[0] + sub v5.8h, v29.8h, v24.8h + add v14.8h, v16.8h, v6.8h + mls v13.8h, v9.8h, v7.h[0] + add v10.8h, v11.8h, v22.8h + str q23, [x0, #0x180] + sub v20.8h, v11.8h, v22.8h + sub v23.8h, v27.8h, v10.8h + sqrdmulh v16.8h, v21.8h, v1.h[1] + sqrdmulh v31.8h, v23.8h, v0.h[5] + str q13, [x0, #0x1c0] + add v13.8h, v4.8h, v8.8h + mul v18.8h, v21.8h, v1.h[0] + str q13, [x0, #0xc0] + sqrdmulh v13.8h, v19.8h, v0.h[1] + sqrdmulh v28.8h, v20.8h, v1.h[5] + str q14, [x0], #0x10 + mul v4.8h, v20.8h, v1.h[4] + mls v17.8h, v13.8h, v7.h[0] + sub v13.8h, v3.8h, v30.8h + sqrdmulh v8.8h, v13.8h, v1.h[3] + mul v12.8h, v13.8h, v1.h[2] + mls v4.8h, v28.8h, v7.h[0] + mls v12.8h, v8.8h, v7.h[0] + mls v18.8h, v16.8h, v7.h[0] + str q17, [x0, #0x130] + sqrdmulh v15.8h, v5.8h, v0.h[7] + add v11.8h, v27.8h, v10.8h + mul v16.8h, v5.8h, v0.h[6] + sub v8.8h, v12.8h, v4.8h + sqrdmulh v28.8h, v8.8h, v0.h[5] + add v13.8h, v2.8h, v25.8h + mls v16.8h, v15.8h, v7.h[0] + add v26.8h, v12.8h, v4.8h + mul v8.8h, v8.8h, v0.h[4] + add v4.8h, v29.8h, v24.8h + mls v8.8h, v28.8h, 
v7.h[0] + sub v20.8h, v4.8h, v13.8h + add v14.8h, v4.8h, v13.8h + add v12.8h, v16.8h, v18.8h + sqrdmulh v22.8h, v20.8h, v0.h[3] + add v27.8h, v14.8h, v11.8h + sub v13.8h, v16.8h, v18.8h + mul v4.8h, v20.8h, v0.h[2] + str q27, [x0], #0x10 + sub v24.8h, v12.8h, v26.8h + sqrdmulh v3.8h, v13.8h, v0.h[3] + mul v13.8h, v13.8h, v0.h[2] + sqrdmulh v27.8h, v24.8h, v0.h[1] + mls v13.8h, v3.8h, v7.h[0] + mul v9.8h, v24.8h, v0.h[0] + mls v9.8h, v27.8h, v7.h[0] + add v30.8h, v13.8h, v8.8h + sub v13.8h, v13.8h, v8.8h + mls v4.8h, v22.8h, v7.h[0] + str q30, [x0, #0xb0] + sqrdmulh v16.8h, v13.8h, v0.h[1] + str q9, [x0, #0x130] + mul v9.8h, v13.8h, v0.h[0] + add v13.8h, v12.8h, v26.8h + str q13, [x0, #0x30] + mul v13.8h, v23.8h, v0.h[4] + sub v23.8h, v14.8h, v11.8h + mls v13.8h, v31.8h, v7.h[0] + mls v9.8h, v16.8h, v7.h[0] + mul v30.8h, v23.8h, v0.h[0] + sub v24.8h, v4.8h, v13.8h + add v13.8h, v4.8h, v13.8h + sqrdmulh v23.8h, v23.8h, v0.h[1] + str q9, [x0, #0x1b0] + str q13, [x0, #0x70] + sqrdmulh v13.8h, v24.8h, v0.h[1] + mul v21.8h, v24.8h, v0.h[0] + mls v30.8h, v23.8h, v7.h[0] + mls v21.8h, v13.8h, v7.h[0] + str q30, [x0, #0xf0] + str q21, [x0, #0x170] + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret + +MLK_ASM_FN_SIZE(intt_aarch64_asm) + +#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +#endif + + END diff --git a/mlkem/src/native/aarch64/src/ntt_aarch64_asm.asm b/mlkem/src/native/aarch64/src/ntt_aarch64_asm.asm new file mode 100644 index 0000000000..cb0686bd6a --- /dev/null +++ b/mlkem/src/native/aarch64/src/ntt_aarch64_asm.asm @@ -0,0 +1,546 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/native/aarch64/src/ntt_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. 
+ */ +/* Copyright (c) 2022 Arm Limited + * Copyright (c) 2022 Hanno Becker + * Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [NeonNTT] + * Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 + * Becker, Hwang, Kannwischer, Yang, Yang + * https://eprint.iacr.org/2021/986 + * + * - [SLOTHY_Paper] + * Fast and Clean: Auditable high-performance assembly via constraint solving + * Abdulrahman, Becker, Kannwischer, Klein + * https://eprint.iacr.org/2022/1303 + */ + +/*yaml + Name: ntt_aarch64_asm + Description: AArch64 ML-KEM forward NTT following @[NeonNTT] and @[SLOTHY_Paper] + Signature: void mlk_ntt_aarch64_asm(int16_t p[256], const int16_t twiddles12345[80], const int16_t twiddles56[384]) + ABI: + x0: + type: buffer + size_bytes: 512 + permissions: read/write + c_parameter: int16_t p[256] + description: Input/output polynomial + x1: + type: buffer + size_bytes: 160 + permissions: read-only + c_parameter: const int16_t twiddles12345[80] + description: Twiddle factors for layers 1-5 + x2: + type: buffer + size_bytes: 768 + permissions: read-only + c_parameter: const int16_t twiddles56[384] + description: Twiddle factors for layers 6-7 + Stack: + bytes: 64 + description: saving callee-saved Neon registers +*/ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/aarch64_opt/src/ntt_aarch64_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(ntt_aarch64_asm) +MLK_ASM_FN_SYMBOL(ntt_aarch64_asm) + /* v7.h[0] = 3329 (#0xd01), the same modulus constant that poly_reduce and poly_mulcache_compute load; every mls in this routine multiplies by v7.h[0]. v7.h[1] = 20159. */ + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w5, #0xd01 // =3329 + mov v7.h[0], w5 + mov w5, #0x4ebf // =20159 + mov v7.h[1], w5 + mov x3, x0 + mov x4, #0x4 // =4 + ldr q0, [x1], #0x20 + ldur q1, [x1, #-0x10] + ldr q21, [x0, #0x40] + ldr q5, [x0, #0x1c0] + ldr q30, [x0, #0x110] + ldr q24, [x0, #0x140] + ldr q12, [x0, #0x80] + sqrdmulh v9.8h, v5.8h, v0.h[1] + mul v23.8h, v5.8h, v0.h[0] + sqrdmulh v17.8h, v24.8h, v0.h[1] + ldr q13, [x0, #0xc0] + mls v23.8h, v9.8h, v7.h[0] + mul v8.8h, v24.8h, v0.h[0] + mls v8.8h, v17.8h, v7.h[0] + add v9.8h, v13.8h, v23.8h + sub v10.8h, v13.8h, v23.8h + mul v11.8h, v30.8h, v0.h[0] + ldr q13, [x0, #0x180] + sqrdmulh v28.8h, v9.8h, v0.h[3] + sub v29.8h, v21.8h, v8.8h + mul v26.8h, v9.8h, v0.h[2] + add v8.8h, v21.8h, v8.8h + mul v2.8h, v13.8h, v0.h[0] + mls v26.8h, v28.8h, v7.h[0] + mul v28.8h, v10.8h, v0.h[4] + sqrdmulh v23.8h, v10.8h, v0.h[5] + add v22.8h, v8.8h, v26.8h + sqrdmulh v10.8h, v13.8h, v0.h[1] + sqrdmulh v21.8h, v22.8h, v0.h[7] + ldr q13, [x0, #0x100] + mul v16.8h, v22.8h, v0.h[6] + mls v28.8h, v23.8h, v7.h[0] + mls v2.8h, v10.8h, v7.h[0] + sqrdmulh v23.8h, v13.8h, v0.h[1] + sub v10.8h, v29.8h, v28.8h + add v17.8h, v29.8h, v28.8h + mls v16.8h, v21.8h, v7.h[0] + sub v18.8h, v12.8h, v2.8h + ldr q29, [x0] + sqrdmulh v14.8h, v17.8h, v1.h[3] + add v22.8h, v12.8h, v2.8h + sqrdmulh v9.8h, v18.8h, v0.h[5] + mul v21.8h, v13.8h, v0.h[0] + ldr q13, [x0, #0x150] + mul v5.8h, v18.8h, v0.h[4] + mls v5.8h, v9.8h, v7.h[0] + mul v18.8h, v13.8h, v0.h[0] + mls v21.8h, v23.8h, v7.h[0] + sqrdmulh v2.8h, v13.8h, v0.h[1] + mul v13.8h, v17.8h, v1.h[2] + sub v4.8h, v29.8h, v21.8h + mls v13.8h, v14.8h, v7.h[0] + add v25.8h, v29.8h, v21.8h + add v6.8h, v4.8h, v5.8h + sqrdmulh v15.8h, v22.8h, v0.h[3] + sub v21.8h, v4.8h, 
v5.8h + sub v5.8h, v8.8h, v26.8h + mul v23.8h, v22.8h, v0.h[2] + add v28.8h, v6.8h, v13.8h + sub v13.8h, v6.8h, v13.8h + mul v4.8h, v5.8h, v1.h[0] + sub x4, x4, #0x2 + +Lntt_layer123_start + mls v23.8h, v15.8h, v7.h[0] + ldr q6, [x0, #0x190] + ldr q15, [x0, #0x90] + ldr q19, [x0, #0x10] + mul v22.8h, v10.8h, v1.h[4] + ldr q24, [x0, #0x50] + str q13, [x0, #0x140] + sqrdmulh v13.8h, v6.8h, v0.h[1] + sub v20.8h, v25.8h, v23.8h + sqrdmulh v3.8h, v30.8h, v0.h[1] + str q28, [x0, #0x100] + ldr q30, [x0, #0x120] + mul v8.8h, v6.8h, v0.h[0] + sqrdmulh v27.8h, v10.8h, v1.h[5] + mls v11.8h, v3.8h, v7.h[0] + mls v18.8h, v2.8h, v7.h[0] + ldr q31, [x0, #0x160] + sqrdmulh v10.8h, v5.8h, v1.h[1] + mls v8.8h, v13.8h, v7.h[0] + ldr q13, [x0, #0x1d0] + sub v14.8h, v24.8h, v18.8h + add v9.8h, v24.8h, v18.8h + sqrdmulh v2.8h, v31.8h, v0.h[1] + mls v4.8h, v10.8h, v7.h[0] + add v10.8h, v25.8h, v23.8h + sub v24.8h, v19.8h, v11.8h + add v25.8h, v19.8h, v11.8h + sqrdmulh v28.8h, v13.8h, v0.h[1] + mul v11.8h, v30.8h, v0.h[0] + mul v17.8h, v13.8h, v0.h[0] + sub v13.8h, v10.8h, v16.8h + sub v6.8h, v15.8h, v8.8h + mls v17.8h, v28.8h, v7.h[0] + str q13, [x0, #0x40] + mls v22.8h, v27.8h, v7.h[0] + ldr q13, [x0, #0xd0] + add v26.8h, v20.8h, v4.8h + mul v18.8h, v31.8h, v0.h[0] + add v27.8h, v10.8h, v16.8h + str q26, [x0, #0x80] + sqrdmulh v31.8h, v6.8h, v0.h[5] + add v3.8h, v21.8h, v22.8h + str q27, [x0], #0x10 + mul v26.8h, v6.8h, v0.h[4] + add v6.8h, v13.8h, v17.8h + sub v5.8h, v13.8h, v17.8h + str q3, [x0, #0x170] + sub v17.8h, v21.8h, v22.8h + sqrdmulh v10.8h, v6.8h, v0.h[3] + sub v13.8h, v20.8h, v4.8h + add v20.8h, v15.8h, v8.8h + sqrdmulh v12.8h, v5.8h, v0.h[5] + str q13, [x0, #0xb0] + mul v8.8h, v6.8h, v0.h[2] + str q17, [x0, #0x1b0] + mls v8.8h, v10.8h, v7.h[0] + mul v29.8h, v5.8h, v0.h[4] + mls v29.8h, v12.8h, v7.h[0] + sub v5.8h, v9.8h, v8.8h + add v3.8h, v9.8h, v8.8h + sqrdmulh v15.8h, v20.8h, v0.h[3] + mul v4.8h, v5.8h, v1.h[0] + add v6.8h, v14.8h, v29.8h + sqrdmulh v9.8h, v3.8h, 
v0.h[7] + sqrdmulh v12.8h, v6.8h, v1.h[3] + sub v10.8h, v14.8h, v29.8h + mul v23.8h, v6.8h, v1.h[2] + mls v26.8h, v31.8h, v7.h[0] + mls v23.8h, v12.8h, v7.h[0] + mul v16.8h, v3.8h, v0.h[6] + add v13.8h, v24.8h, v26.8h + sub v21.8h, v24.8h, v26.8h + mls v16.8h, v9.8h, v7.h[0] + add v28.8h, v13.8h, v23.8h + sub v13.8h, v13.8h, v23.8h + mul v23.8h, v20.8h, v0.h[2] + subs x4, x4, #0x1 + cbnz x4, Lntt_layer123_start + sqrdmulh v3.8h, v5.8h, v1.h[1] + mls v23.8h, v15.8h, v7.h[0] + ldr q5, [x0, #0x190] + mul v29.8h, v10.8h, v1.h[4] + mls v4.8h, v3.8h, v7.h[0] + sub v19.8h, v25.8h, v23.8h + sqrdmulh v31.8h, v5.8h, v0.h[1] + sqrdmulh v6.8h, v30.8h, v0.h[1] + sub v3.8h, v19.8h, v4.8h + mul v5.8h, v5.8h, v0.h[0] + str q3, [x0, #0xc0] + sqrdmulh v12.8h, v10.8h, v1.h[5] + mls v18.8h, v2.8h, v7.h[0] + ldr q3, [x0, #0x1d0] + mls v5.8h, v31.8h, v7.h[0] + sqrdmulh v10.8h, v3.8h, v0.h[1] + mls v11.8h, v6.8h, v7.h[0] + ldr q31, [x0, #0x90] + mul v30.8h, v3.8h, v0.h[0] + mls v30.8h, v10.8h, v7.h[0] + sub v10.8h, v31.8h, v5.8h + mls v29.8h, v12.8h, v7.h[0] + ldr q6, [x0, #0xd0] + sqrdmulh v15.8h, v10.8h, v0.h[5] + mul v17.8h, v10.8h, v0.h[4] + add v10.8h, v6.8h, v30.8h + sub v6.8h, v6.8h, v30.8h + sqrdmulh v12.8h, v10.8h, v0.h[3] + sub v27.8h, v21.8h, v29.8h + sqrdmulh v3.8h, v6.8h, v0.h[5] + mul v10.8h, v10.8h, v0.h[2] + ldr q20, [x0, #0x50] + mls v10.8h, v12.8h, v7.h[0] + mul v2.8h, v6.8h, v0.h[4] + add v6.8h, v20.8h, v18.8h + add v5.8h, v31.8h, v5.8h + mls v2.8h, v3.8h, v7.h[0] + sub v31.8h, v6.8h, v10.8h + sqrdmulh v12.8h, v5.8h, v0.h[3] + sub v22.8h, v20.8h, v18.8h + add v6.8h, v6.8h, v10.8h + mul v20.8h, v31.8h, v1.h[0] + add v30.8h, v22.8h, v2.8h + sqrdmulh v3.8h, v6.8h, v0.h[7] + sqrdmulh v10.8h, v30.8h, v1.h[3] + mul v9.8h, v30.8h, v1.h[2] + ldr q30, [x0, #0x10] + mls v17.8h, v15.8h, v7.h[0] + mls v9.8h, v10.8h, v7.h[0] + mul v15.8h, v6.8h, v0.h[6] + add v24.8h, v30.8h, v11.8h + sub v10.8h, v22.8h, v2.8h + mls v15.8h, v3.8h, v7.h[0] + add v6.8h, v19.8h, v4.8h + add v22.8h, 
v25.8h, v23.8h + sqrdmulh v3.8h, v10.8h, v1.h[5] + str q13, [x0, #0x140] + sub v19.8h, v30.8h, v11.8h + add v25.8h, v22.8h, v16.8h + mul v5.8h, v5.8h, v0.h[2] + sub v13.8h, v22.8h, v16.8h + str q28, [x0, #0x100] + mls v5.8h, v12.8h, v7.h[0] + str q13, [x0, #0x40] + str q6, [x0, #0x80] + add v21.8h, v21.8h, v29.8h + sqrdmulh v13.8h, v31.8h, v1.h[1] + str q25, [x0], #0x10 + add v12.8h, v19.8h, v17.8h + sub v31.8h, v19.8h, v17.8h + mul v30.8h, v10.8h, v1.h[4] + str q21, [x0, #0x170] + add v21.8h, v24.8h, v5.8h + add v6.8h, v12.8h, v9.8h + mls v30.8h, v3.8h, v7.h[0] + str q27, [x0, #0x1b0] + sub v10.8h, v21.8h, v15.8h + sub v12.8h, v12.8h, v9.8h + mls v20.8h, v13.8h, v7.h[0] + str q6, [x0, #0x100] + str q10, [x0, #0x40] + sub v13.8h, v24.8h, v5.8h + add v3.8h, v21.8h, v15.8h + str q12, [x0, #0x140] + sub v10.8h, v31.8h, v30.8h + add v21.8h, v31.8h, v30.8h + str q3, [x0], #0x10 + add v12.8h, v13.8h, v20.8h + sub v13.8h, v13.8h, v20.8h + str q21, [x0, #0x170] + str q10, [x0, #0x1b0] + str q12, [x0, #0x70] + str q13, [x0, #0xb0] + mov x0, x3 + mov x4, #0x8 // =8 + ldr q2, [x0, #0x20] + ldr q13, [x1], #0x10 + ldr q30, [x0, #0x30] + ldr q25, [x2, #0x40] + ldr q5, [x0] + ldr q18, [x0, #0x60] + ldr q12, [x0, #0x70] + sqrdmulh v17.8h, v2.8h, v13.h[1] + ldr q4, [x1], #0x10 + ldr q23, [x0, #0x10] + sqrdmulh v21.8h, v30.8h, v13.h[1] + ldr q24, [x2, #0x20] + ldr q9, [x2], #0x60 + mul v10.8h, v30.8h, v13.h[0] + mul v11.8h, v2.8h, v13.h[0] + mls v10.8h, v21.8h, v7.h[0] + sqrdmulh v29.8h, v12.8h, v4.h[1] + mul v1.8h, v12.8h, v4.h[0] + add v21.8h, v23.8h, v10.8h + sub v10.8h, v23.8h, v10.8h + mul v8.8h, v18.8h, v4.h[0] + sqrdmulh v23.8h, v21.8h, v13.h[3] + mul v2.8h, v21.8h, v13.h[2] + mls v1.8h, v29.8h, v7.h[0] + mls v2.8h, v23.8h, v7.h[0] + ldur q15, [x2, #-0x50] + sqrdmulh v0.8h, v10.8h, v13.h[5] + mls v11.8h, v17.8h, v7.h[0] + ldr q29, [x0, #0x50] + mul v23.8h, v10.8h, v13.h[4] + mls v23.8h, v0.8h, v7.h[0] + sub v16.8h, v29.8h, v1.8h + add v3.8h, v5.8h, v11.8h + sub v31.8h, v5.8h, 
v11.8h + sqrdmulh v22.8h, v16.8h, v4.h[5] + add v30.8h, v3.8h, v2.8h + sub v0.8h, v3.8h, v2.8h + sqrdmulh v28.8h, v18.8h, v4.h[1] + add v21.8h, v31.8h, v23.8h + sub v19.8h, v31.8h, v23.8h + mul v26.8h, v16.8h, v4.h[4] + trn2 v3.4s, v30.4s, v0.4s + ldur q23, [x2, #-0x10] + trn2 v18.4s, v21.4s, v19.4s + mls v26.8h, v22.8h, v7.h[0] + trn1 v13.4s, v30.4s, v0.4s + mls v8.8h, v28.8h, v7.h[0] + trn2 v31.2d, v3.2d, v18.2d + trn1 v11.4s, v21.4s, v19.4s + add v27.8h, v29.8h, v1.8h + sqrdmulh v6.8h, v31.8h, v15.8h + trn1 v2.2d, v13.2d, v11.2d + trn2 v13.2d, v13.2d, v11.2d + mul v1.8h, v31.8h, v9.8h + ldr q11, [x0, #0x40] + sqrdmulh v29.8h, v13.8h, v15.8h + mls v1.8h, v6.8h, v7.h[0] + trn1 v6.2d, v3.2d, v18.2d + mul v17.8h, v13.8h, v9.8h + sub v13.8h, v11.8h, v8.8h + sqrdmulh v10.8h, v27.8h, v4.h[3] + sub v12.8h, v13.8h, v26.8h + sub v18.8h, v6.8h, v1.8h + mls v17.8h, v29.8h, v7.h[0] + add v30.8h, v6.8h, v1.8h + add v6.8h, v13.8h, v26.8h + ldur q13, [x2, #-0x30] + sqrdmulh v16.8h, v18.8h, v23.8h + trn1 v28.4s, v6.4s, v12.4s + mul v23.8h, v18.8h, v25.8h + ldr q25, [x2, #0x10] + add v20.8h, v2.8h, v17.8h + mul v0.8h, v30.8h, v24.8h + sqrdmulh v29.8h, v30.8h, v13.8h + sub v30.8h, v2.8h, v17.8h + mls v23.8h, v16.8h, v7.h[0] + sub x4, x4, #0x2 + +Lntt_layer4567_start + ldr q19, [x2, #0x50] + sub v31.8h, v30.8h, v23.8h + mls v0.8h, v29.8h, v7.h[0] + add v16.8h, v11.8h, v8.8h + ldr q18, [x0, #0xa0] + trn2 v14.4s, v6.4s, v12.4s + mul v26.8h, v27.8h, v4.h[2] + ldr q4, [x1], #0x10 + ldr q24, [x2, #0x40] + ldr q21, [x0, #0xb0] + mls v26.8h, v10.8h, v7.h[0] + add v23.8h, v30.8h, v23.8h + sub v15.8h, v20.8h, v0.8h + ldr q9, [x0, #0x90] + add v10.8h, v20.8h, v0.8h + mul v8.8h, v18.8h, v4.h[0] + ldr q1, [x2], #0x60 + trn1 v27.4s, v23.4s, v31.4s + sqrdmulh v12.8h, v18.8h, v4.h[1] + trn1 v5.4s, v10.4s, v15.4s + sub v30.8h, v16.8h, v26.8h + trn2 v13.2d, v5.2d, v27.2d + sqrdmulh v2.8h, v21.8h, v4.h[1] + add v29.8h, v16.8h, v26.8h + mul v0.8h, v21.8h, v4.h[0] + str q13, [x0, #0x20] + trn1 v11.4s, 
v29.4s, v30.4s + mls v8.8h, v12.8h, v7.h[0] + trn2 v26.4s, v29.4s, v30.4s + trn2 v6.2d, v11.2d, v28.2d + mls v0.8h, v2.8h, v7.h[0] + trn2 v16.2d, v26.2d, v14.2d + trn1 v26.2d, v26.2d, v14.2d + trn1 v20.2d, v5.2d, v27.2d + sqrdmulh v29.8h, v6.8h, v25.8h + trn2 v15.4s, v10.4s, v15.4s + sqrdmulh v13.8h, v16.8h, v25.8h + str q20, [x0], #0x40 + sub v30.8h, v9.8h, v0.8h + add v27.8h, v9.8h, v0.8h + mul v17.8h, v6.8h, v1.8h + sqrdmulh v22.8h, v30.8h, v4.h[5] + mul v18.8h, v16.8h, v1.8h + mls v18.8h, v13.8h, v7.h[0] + mul v2.8h, v30.8h, v4.h[4] + mls v2.8h, v22.8h, v7.h[0] + trn2 v22.4s, v23.4s, v31.4s + sub v3.8h, v26.8h, v18.8h + ldur q25, [x2, #-0x30] + mls v17.8h, v29.8h, v7.h[0] + trn2 v31.2d, v15.2d, v22.2d + trn1 v20.2d, v15.2d, v22.2d + add v16.8h, v26.8h, v18.8h + sqrdmulh v26.8h, v3.8h, v19.8h + trn1 v21.2d, v11.2d, v28.2d + ldr q11, [x0, #0x40] + sqrdmulh v29.8h, v16.8h, v25.8h + stur q20, [x0, #-0x30] + add v20.8h, v21.8h, v17.8h + stur q31, [x0, #-0x10] + mul v23.8h, v3.8h, v24.8h + ldr q25, [x2, #0x10] + sub v13.8h, v11.8h, v8.8h + mls v23.8h, v26.8h, v7.h[0] + ldur q1, [x2, #-0x40] + sub v12.8h, v13.8h, v2.8h + add v6.8h, v13.8h, v2.8h + sqrdmulh v10.8h, v27.8h, v4.h[3] + sub v30.8h, v21.8h, v17.8h + mul v0.8h, v16.8h, v1.8h + trn1 v28.4s, v6.4s, v12.4s + subs x4, x4, #0x1 + cbnz x4, Lntt_layer4567_start + add v22.8h, v11.8h, v8.8h + mul v27.8h, v27.8h, v4.h[2] + trn2 v17.4s, v6.4s, v12.4s + ldr q15, [x2], #0x60 + mls v27.8h, v10.8h, v7.h[0] + add v4.8h, v30.8h, v23.8h + sub v18.8h, v30.8h, v23.8h + ldur q6, [x2, #-0x30] + mls v0.8h, v29.8h, v7.h[0] + ldur q12, [x2, #-0x40] + ldur q24, [x2, #-0x20] + ldur q2, [x2, #-0x10] + trn1 v9.4s, v4.4s, v18.4s + add v10.8h, v22.8h, v27.8h + sub v13.8h, v22.8h, v27.8h + sub v1.8h, v20.8h, v0.8h + trn2 v21.4s, v10.4s, v13.4s + add v27.8h, v20.8h, v0.8h + trn2 v3.2d, v21.2d, v17.2d + trn1 v13.4s, v10.4s, v13.4s + trn1 v31.4s, v27.4s, v1.4s + sqrdmulh v10.8h, v3.8h, v25.8h + trn2 v5.2d, v13.2d, v28.2d + trn1 v13.2d, 
v13.2d, v28.2d + trn1 v21.2d, v21.2d, v17.2d + sqrdmulh v17.8h, v5.8h, v25.8h + trn2 v30.2d, v31.2d, v9.2d + mul v25.8h, v3.8h, v15.8h + str q30, [x0, #0x20] + trn2 v30.4s, v4.4s, v18.4s + mls v25.8h, v10.8h, v7.h[0] + trn2 v3.4s, v27.4s, v1.4s + mul v20.8h, v5.8h, v15.8h + trn2 v10.2d, v3.2d, v30.2d + mls v20.8h, v17.8h, v7.h[0] + str q10, [x0, #0x30] + sub v18.8h, v21.8h, v25.8h + add v10.8h, v21.8h, v25.8h + trn1 v3.2d, v3.2d, v30.2d + sqrdmulh v30.8h, v18.8h, v2.8h + mul v12.8h, v10.8h, v12.8h + sqrdmulh v6.8h, v10.8h, v6.8h + str q3, [x0, #0x10] + add v21.8h, v13.8h, v20.8h + mul v10.8h, v18.8h, v24.8h + sub v13.8h, v13.8h, v20.8h + mls v10.8h, v30.8h, v7.h[0] + mls v12.8h, v6.8h, v7.h[0] + trn1 v30.2d, v31.2d, v9.2d + sub v3.8h, v13.8h, v10.8h + add v6.8h, v13.8h, v10.8h + add v10.8h, v21.8h, v12.8h + sub v21.8h, v21.8h, v12.8h + trn2 v13.4s, v6.4s, v3.4s + trn1 v12.4s, v10.4s, v21.4s + trn2 v21.4s, v10.4s, v21.4s + trn1 v3.4s, v6.4s, v3.4s + str q30, [x0], #0x40 + trn2 v10.2d, v21.2d, v13.2d + trn1 v13.2d, v21.2d, v13.2d + trn2 v21.2d, v12.2d, v3.2d + trn1 v3.2d, v12.2d, v3.2d + str q10, [x0, #0x30] + str q13, [x0, #0x10] + str q3, [x0], #0x40 + stur q21, [x0, #-0x20] + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret + +MLK_ASM_FN_SIZE(ntt_aarch64_asm) + +#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +#endif + + END diff --git a/mlkem/src/native/aarch64/src/poly_mulcache_compute_aarch64_asm.asm b/mlkem/src/native/aarch64/src/poly_mulcache_compute_aarch64_asm.asm new file mode 100644 index 0000000000..8e7573fa01 --- /dev/null +++ b/mlkem/src/native/aarch64/src/poly_mulcache_compute_aarch64_asm.asm @@ -0,0 +1,129 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/native/aarch64/src/poly_mulcache_compute_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. 
+ */ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/*yaml + Name: poly_mulcache_compute_aarch64_asm + Description: Compute multiplication cache for polynomial + Signature: void mlk_poly_mulcache_compute_aarch64_asm(int16_t cache[128], const int16_t mlk_poly[256], const int16_t zetas[128], const int16_t zetas_twisted[128]) + ABI: + x0: + type: buffer + size_bytes: 256 + permissions: write-only + c_parameter: int16_t cache[128] + description: Output cache + x1: + type: buffer + size_bytes: 512 + permissions: read-only + c_parameter: const int16_t mlk_poly[256] + description: Input polynomial + x2: + type: buffer + size_bytes: 256 + permissions: read-only + c_parameter: const int16_t zetas[128] + description: Zeta values + x3: + type: buffer + size_bytes: 256 + permissions: read-only + c_parameter: const int16_t zetas_twisted[128] + description: Twisted zeta values + Stack: + bytes: 0 +*/ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/aarch64_opt/src/poly_mulcache_compute_aarch64_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(poly_mulcache_compute_aarch64_asm) +MLK_ASM_FN_SYMBOL(poly_mulcache_compute_aarch64_asm) + /* v6 = 3329 in all lanes; every mls below multiplies by v6.h[0]. v7 = 20159 is overwritten by the ldur q7 in the prologue before anything reads it, so that constant is unused in this routine. */ + mov w5, #0xd01 // =3329 + dup v6.8h, w5 + mov w5, #0x4ebf // =20159 + dup v7.8h, w5 + mov x4, #0x10 // =16 + ldr q0, [x1], #0x20 + ldur q2, [x1, #-0x10] + ldr q19, [x1], #0x20 + ldr q29, [x3], #0x10 + ldur q16, [x1, #-0x10] + ldr q18, [x2], #0x10 + ldr q26, [x1], #0x20 + ldr q25, [x2], #0x10 + uzp2 v5.8h, v0.8h, v2.8h + ldr q28, [x3], #0x10 + ldur q7, [x1, #-0x10] + ldr q2, [x1], #0x20 + uzp2 v27.8h, v19.8h, v16.8h + sqrdmulh v16.8h, v5.8h, v29.8h + ldr q17, [x3], #0x10 + ldr q19, [x3], #0x10 + mul v5.8h, v5.8h, v18.8h + uzp2 v29.8h, v26.8h, v7.8h + mul v26.8h, v27.8h, v25.8h + sqrdmulh v4.8h, v27.8h, v28.8h + mls v5.8h, v16.8h, v6.h[0] + lsr x4, x4, #1 + sub x4, x4, #0x2 + /* x4 = (16 >> 1) - 2 = 6 loop trips. Each trip does two 16-byte post-indexed stores to x0; the four stores after the loop complete the output. */ +Lpoly_mulcache_compute_loop_start + str q5, [x0], #0x10 + sqrdmulh v22.8h, v29.8h, v17.8h + ldr q28, [x2], #0x10 + ldur q24, [x1, #-0x10] + ldr q0, [x1], #0x20 + mls v26.8h, v4.8h, v6.h[0] + ldur q16, [x1, #-0x10] + ldr q17, [x3], #0x10 + mul v5.8h, v29.8h, v28.8h + uzp2 v23.8h, v2.8h, v24.8h + ldr q18, [x2], #0x10 + mls v5.8h, v22.8h, v6.h[0] + uzp2 v29.8h, v0.8h, v16.8h + sqrdmulh v4.8h, v23.8h, v19.8h + ldr q2, [x1], #0x20 + ldr q19, [x3], #0x10 + str q26, [x0], #0x10 + mul v26.8h, v23.8h, v18.8h + subs x4, x4, #0x1 + cbnz x4, Lpoly_mulcache_compute_loop_start + mls v26.8h, v4.8h, v6.h[0] + str q5, [x0], #0x10 + ldr q5, [x2], #0x10 + ldur q4, [x1, #-0x10] + sqrdmulh v16.8h, v29.8h, v17.8h + ldr q0, [x2], #0x10 + mul v29.8h, v29.8h, v5.8h + uzp2 v18.8h, v2.8h, v4.8h + str q26, [x0], #0x10 + sqrdmulh v17.8h, v18.8h, v19.8h + mls v29.8h, v16.8h, v6.h[0] + mul v26.8h, v18.8h, v0.8h + mls v26.8h, v17.8h, v6.h[0] + str q29, [x0], #0x10 + str q26, [x0], #0x10 + ret + +MLK_ASM_FN_SIZE(poly_mulcache_compute_aarch64_asm) + +#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +#endif + + END 
diff --git a/mlkem/src/native/aarch64/src/poly_reduce_aarch64_asm.asm b/mlkem/src/native/aarch64/src/poly_reduce_aarch64_asm.asm new file mode 100644 index 0000000000..6a6ca15656 --- /dev/null +++ b/mlkem/src/native/aarch64/src/poly_reduce_aarch64_asm.asm @@ -0,0 +1,152 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/native/aarch64/src/poly_reduce_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. + */ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/*yaml + Name: poly_reduce_aarch64_asm + Description: Barrett reduction of polynomial coefficients + Signature: void mlk_poly_reduce_aarch64_asm(int16_t p[256]) + ABI: + x0: + type: buffer + size_bytes: 512 + permissions: read/write + c_parameter: int16_t p[256] + description: Input/output polynomial + Stack: + bytes: 0 +*/ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/aarch64_opt/src/poly_reduce_aarch64_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(poly_reduce_aarch64_asm) +MLK_ASM_FN_SYMBOL(poly_reduce_aarch64_asm) + /* v3 = 3329 and v4 = 20159 in all lanes. Each coefficient vector x is processed as: t = srshr(sqdmulh(x, v4.h[0]), 11); x = x - t * v3.h[0]; then sshr #15 builds an all-ones mask in the lanes that came out negative, and the and/add pair adds 3329 back to exactly those lanes. */ + mov w2, #0xd01 // =3329 + dup v3.8h, w2 + mov w2, #0x4ebf // =20159 + dup v4.8h, w2 + mov x1, #0x8 // =8 + ldr q21, [x0], #0x40 + ldur q18, [x0, #-0x20] + ldur q0, [x0, #-0x30] + ldur q5, [x0, #-0x10] + ldr q26, [x0], #0x40 + sqdmulh v17.8h, v21.8h, v4.h[0] + sqdmulh v27.8h, v18.8h, v4.h[0] + sqdmulh v22.8h, v0.8h, v4.h[0] + srshr v17.8h, v17.8h, #0xb + sqdmulh v23.8h, v5.8h, v4.h[0] + srshr v29.8h, v27.8h, #0xb + mls v21.8h, v17.8h, v3.h[0] + srshr v17.8h, v22.8h, #0xb + mls v18.8h, v29.8h, v3.h[0] + srshr v22.8h, v23.8h, #0xb + mls v0.8h, v17.8h, v3.h[0] + sshr v2.8h, v21.8h, #0xf + mls v5.8h, v22.8h, v3.h[0] + sshr v29.8h, v18.8h, #0xf + and v19.16b, v3.16b, v2.16b + sqdmulh v2.8h, v26.8h, v4.h[0] + sshr v31.8h, v0.8h, #0xf + add v17.8h, v21.8h, v19.8h + and v21.16b, v3.16b, v29.16b + and v31.16b, v3.16b, v31.16b + sub x1, x1, #0x2 + /* x1 = 8 - 2 = 6 loop trips; each trip advances x0 by 0x40 bytes through the post-indexed ldr and stores four result vectors at negative offsets from x0. */ +Lpoly_reduce_loop_start + add v21.8h, v18.8h, v21.8h + ldur q18, [x0, #-0x20] + add v25.8h, v0.8h, v31.8h + ldur q0, [x0, #-0x30] + stur q21, [x0, #-0x60] + sshr v28.8h, v5.8h, #0xf + stur q17, [x0, #-0x80] + srshr v23.8h, v2.8h, #0xb + sqdmulh v30.8h, v18.8h, v4.h[0] + stur q25, [x0, #-0x70] + and v22.16b, v3.16b, v28.16b + sqdmulh v7.8h, v0.8h, v4.h[0] + add v16.8h, v5.8h, v22.8h + ldur q5, [x0, #-0x10] + mls v26.8h, v23.8h, v3.h[0] + stur q16, [x0, #-0x50] + srshr v6.8h, v30.8h, #0xb + srshr v1.8h, v7.8h, #0xb + sqdmulh v19.8h, v5.8h, v4.h[0] + mls v18.8h, v6.8h, v3.h[0] + sshr v24.8h, v26.8h, #0xf + mls v0.8h, v1.8h, v3.h[0] + and v27.16b, v3.16b, v24.16b + srshr v29.8h, v19.8h, #0xb + add v17.8h, v26.8h, v27.8h + ldr q26, [x0], #0x40 + sshr v1.8h, v18.8h, #0xf + mls v5.8h, v29.8h, v3.h[0] + sshr v20.8h, v0.8h, #0xf + and v21.16b, v3.16b, v1.16b + and v31.16b, v3.16b, v20.16b + sqdmulh v2.8h, v26.8h, v4.h[0] + subs x1, x1, #0x1 + cbnz x1, Lpoly_reduce_loop_start + add v28.8h, v0.8h, 
v31.8h + ldur q29, [x0, #-0x10] + add v21.8h, v18.8h, v21.8h + srshr v18.8h, v2.8h, #0xb + sshr v2.8h, v5.8h, #0xf + ldur q16, [x0, #-0x20] + stur q17, [x0, #-0x80] + ldur q0, [x0, #-0x30] + and v2.16b, v3.16b, v2.16b + sqdmulh v24.8h, v29.8h, v4.h[0] + stur q28, [x0, #-0x70] + stur q21, [x0, #-0x60] + add v31.8h, v5.8h, v2.8h + sqdmulh v6.8h, v16.8h, v4.h[0] + stur q31, [x0, #-0x50] + sqdmulh v17.8h, v0.8h, v4.h[0] + srshr v22.8h, v24.8h, #0xb + mls v26.8h, v18.8h, v3.h[0] + srshr v31.8h, v6.8h, #0xb + mls v29.8h, v22.8h, v3.h[0] + srshr v19.8h, v17.8h, #0xb + mls v16.8h, v31.8h, v3.h[0] + sshr v7.8h, v26.8h, #0xf + mls v0.8h, v19.8h, v3.h[0] + and v5.16b, v3.16b, v7.16b + sshr v22.8h, v29.8h, #0xf + add v27.8h, v26.8h, v5.8h + and v26.16b, v3.16b, v22.16b + sshr v20.8h, v16.8h, #0xf + stur q27, [x0, #-0x40] + and v2.16b, v3.16b, v20.16b + sshr v23.8h, v0.8h, #0xf + add v18.8h, v29.8h, v26.8h + add v31.8h, v16.8h, v2.8h + and v29.16b, v3.16b, v23.16b + stur q18, [x0, #-0x10] + add v25.8h, v0.8h, v29.8h + stur q31, [x0, #-0x20] + stur q25, [x0, #-0x30] + ret + +MLK_ASM_FN_SIZE(poly_reduce_aarch64_asm) + +#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +#endif + + END diff --git a/mlkem/src/native/aarch64/src/poly_tobytes_aarch64_asm.asm b/mlkem/src/native/aarch64/src/poly_tobytes_aarch64_asm.asm new file mode 100644 index 0000000000..90b4de7c94 --- /dev/null +++ b/mlkem/src/native/aarch64/src/poly_tobytes_aarch64_asm.asm @@ -0,0 +1,119 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/native/aarch64/src/poly_tobytes_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. 
+ */ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/*yaml + Name: poly_tobytes_aarch64_asm + Description: Convert polynomial to byte representation + Signature: void mlk_poly_tobytes_aarch64_asm(uint8_t r[384], const int16_t a[256]) + ABI: + x0: + type: buffer + size_bytes: 384 + permissions: write-only + c_parameter: uint8_t r[384] + description: Output byte array + x1: + type: buffer + size_bytes: 512 + permissions: read-only + c_parameter: const int16_t a[256] + description: Input polynomial + Stack: + bytes: 0 +*/ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/aarch64_opt/src/poly_tobytes_aarch64_asm.S using scripts/simpasm. Do not modify it directly. + */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(poly_tobytes_aarch64_asm) +MLK_ASM_FN_SYMBOL(poly_tobytes_aarch64_asm) + + mov x2, #0x10 // =16 + ldr q5, [x1, #0x10] + ldr q3, [x1], #0x20 + ldr q29, [x1], #0x20 + ldur q2, [x1, #-0x10] + ldr q27, [x1, #0x10] + ldr q23, [x1, #0x30] + ldr q17, [x1], #0x20 + ldr q16, [x1], #0x20 + uzp2 v26.8h, v3.8h, v5.8h + uzp1 v19.8h, v3.8h, v5.8h + uzp2 v0.8h, v29.8h, v2.8h + uzp1 v1.8h, v29.8h, v2.8h + xtn v5.8b, v26.8h + shrn v3.8b, v19.8h, #0x8 + shrn v4.8b, v26.8h, #0x4 + xtn v18.8b, v0.8h + shrn v30.8b, v0.8h, #0x4 + xtn v28.8b, v1.8h + shrn v29.8b, v1.8h, #0x8 + sli v3.8b, v5.8b, #0x4 + xtn v2.8b, v19.8h + sli v29.8b, v18.8b, #0x4 + lsr x2, x2, #1 + sub x2, x2, #0x2 + +Lpoly_tobytes_loop_start + uzp1 v25.8h, v17.8h, v27.8h + uzp2 v31.8h, v17.8h, v27.8h + uzp1 v24.8h, v16.8h, v23.8h + uzp2 v6.8h, v16.8h, v23.8h + st3 { v2.8b, v3.8b, v4.8b }, [x0], #24 + shrn v3.8b, v25.8h, #0x8 + ldr q17, [x1], #0x20 + shrn v4.8b, v31.8h, #0x4 + xtn v21.8b, v6.8h + ldr q23, [x1, #0x10] + st3 { v28.8b, v29.8b, v30.8b }, [x0], #24 + shrn v29.8b, 
v24.8h, #0x8 + ldur q27, [x1, #-0x10] + xtn v20.8b, v31.8h + ldr q16, [x1], #0x20 + sli v29.8b, v21.8b, #0x4 + xtn v2.8b, v25.8h + sli v3.8b, v20.8b, #0x4 + xtn v28.8b, v24.8h + shrn v30.8b, v6.8h, #0x4 + subs x2, x2, #0x1 + cbnz x2, Lpoly_tobytes_loop_start + uzp2 v7.8h, v17.8h, v27.8h + uzp1 v25.8h, v17.8h, v27.8h + uzp2 v0.8h, v16.8h, v23.8h + st3 { v2.8b, v3.8b, v4.8b }, [x0], #24 + st3 { v28.8b, v29.8b, v30.8b }, [x0], #24 + shrn v21.8b, v25.8h, #0x8 + uzp1 v2.8h, v16.8h, v23.8h + shrn v22.8b, v7.8h, #0x4 + shrn v4.8b, v0.8h, #0x4 + xtn v28.8b, v7.8h + xtn v27.8b, v0.8h + shrn v3.8b, v2.8h, #0x8 + sli v21.8b, v28.8b, #0x4 + xtn v2.8b, v2.8h + sli v3.8b, v27.8b, #0x4 + xtn v20.8b, v25.8h + st3 { v20.8b, v21.8b, v22.8b }, [x0], #24 + st3 { v2.8b, v3.8b, v4.8b }, [x0], #24 + ret + +MLK_ASM_FN_SIZE(poly_tobytes_aarch64_asm) + +#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +#endif + + END diff --git a/mlkem/src/native/aarch64/src/poly_tomont_aarch64_asm.asm b/mlkem/src/native/aarch64/src/poly_tomont_aarch64_asm.asm new file mode 100644 index 0000000000..0d30d4abe2 --- /dev/null +++ b/mlkem/src/native/aarch64/src/poly_tomont_aarch64_asm.asm @@ -0,0 +1,98 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/native/aarch64/src/poly_tomont_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. 
+ */ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/*yaml + Name: poly_tomont_aarch64_asm + Description: Convert polynomial to Montgomery domain + Signature: void mlk_poly_tomont_aarch64_asm(int16_t p[256]) + ABI: + x0: + type: buffer + size_bytes: 512 + permissions: read/write + c_parameter: int16_t p[256] + description: Input/output polynomial + Stack: + bytes: 0 +*/ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/aarch64_opt/src/poly_tomont_aarch64_asm.S using scripts/simpasm. Do not modify it directly. + */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(poly_tomont_aarch64_asm) +MLK_ASM_FN_SYMBOL(poly_tomont_aarch64_asm) + + mov w2, #0xd01 // =3329 + dup v4.8h, w2 + mov w2, #-0x414 // =-1044 + dup v2.8h, w2 + mov w2, #-0x2824 // =-10276 + dup v3.8h, w2 + mov x1, #0x8 // =8 + ldr q18, [x0, #0x20] + ldr q0, [x0, #0x10] + ldr q16, [x0], #0x40 + sqrdmulh v23.8h, v0.8h, v3.8h + mul v26.8h, v0.8h, v2.8h + sqrdmulh v19.8h, v16.8h, v3.8h + mls v26.8h, v23.8h, v4.h[0] + mul v29.8h, v16.8h, v2.8h + ldur q16, [x0, #-0x10] + mls v29.8h, v19.8h, v4.h[0] + stur q26, [x0, #-0x30] + sqrdmulh v26.8h, v18.8h, v3.8h + mul v18.8h, v18.8h, v2.8h + stur q29, [x0, #-0x40] + sqrdmulh v29.8h, v16.8h, v3.8h + mls v18.8h, v26.8h, v4.h[0] + sub x1, x1, #0x1 + +Lpoly_tomont_loop + ldr q19, [x0, #0x10] + mul v26.8h, v16.8h, v2.8h + ldr q23, [x0, #0x20] + ldr q17, [x0], #0x40 + mls v26.8h, v29.8h, v4.h[0] + ldur q16, [x0, #-0x10] + sqrdmulh v28.8h, v19.8h, v3.8h + stur q18, [x0, #-0x60] + mul v0.8h, v19.8h, v2.8h + stur q26, [x0, #-0x50] + sqrdmulh v24.8h, v23.8h, v3.8h + mul v18.8h, v23.8h, v2.8h + sqrdmulh v22.8h, v17.8h, v3.8h + mul v26.8h, v17.8h, v2.8h + mls v0.8h, v28.8h, v4.h[0] + mls v26.8h, v22.8h, v4.h[0] + sqrdmulh v29.8h, v16.8h, v3.8h + 
stur q0, [x0, #-0x30] + mls v18.8h, v24.8h, v4.h[0] + stur q26, [x0, #-0x40] + sub x1, x1, #0x1 + cbnz x1, Lpoly_tomont_loop + mul v16.8h, v16.8h, v2.8h + stur q18, [x0, #-0x20] + mls v16.8h, v29.8h, v4.h[0] + stur q16, [x0, #-0x10] + ret + +MLK_ASM_FN_SIZE(poly_tomont_aarch64_asm) + +#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + +#if defined(__ELF__) +#endif + + END diff --git a/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm.asm b/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm.asm new file mode 100644 index 0000000000..06ca1febdc --- /dev/null +++ b/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm.asm @@ -0,0 +1,245 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. + */ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [NeonNTT] + * Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 + * Becker, Hwang, Kannwischer, Yang, Yang + * https://eprint.iacr.org/2021/986 + */ + +/*yaml + Name: polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm + Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=2 + Signature: void mlk_polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm(int16_t r[256], const int16_t a[512], const int16_t b[512], const int16_t b_cache[256]) + ABI: + x0: + type: buffer + size_bytes: 512 + permissions: write-only + c_parameter: int16_t r[256] + description: Output polynomial + x1: + type: buffer + size_bytes: 1024 + permissions: read-only + c_parameter: const int16_t a[512] + description: Input polynomial vector a + x2: + type: buffer + size_bytes: 1024 + permissions: read-only 
+ c_parameter: const int16_t b[512] + description: Input polynomial vector b + x3: + type: buffer + size_bytes: 512 + permissions: read-only + c_parameter: const int16_t b_cache[256] + description: Cached values for b + Stack: + bytes: 64 + description: saving callee-saved Neon registers +*/ + +/* Re-implementation of asymmetric base multiplication following @[NeonNTT] */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm.S using scripts/simpasm. Do not modify it directly. + */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm) +MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm) + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w14, #0xd01 // =3329 + dup v0.8h, w14 + mov w14, #0xcff // =3327 + dup v2.8h, w14 + add x4, x1, #0x200 + add x5, x2, #0x200 + add x6, x3, #0x100 + mov x13, #0x10 // =16 + ldr q12, [x1], #0x20 + ldur q9, [x1, #-0x10] + ldr q22, [x2], #0x20 + ldur q30, [x2, #-0x10] + ldr q6, [x5], #0x20 + ldr q7, [x4, #0x10] + ldr q8, [x4], #0x20 + ldur q23, [x5, #-0x10] + uzp1 v16.8h, v12.8h, v9.8h + uzp2 v14.8h, v12.8h, v9.8h + uzp2 v13.8h, v22.8h, v30.8h + uzp1 v18.8h, v22.8h, v30.8h + ld1 { v27.8h }, [x3], #16 + ld1 { v17.8h }, [x6], #16 + smull2 v4.4s, v16.8h, v18.8h + ldr q31, [x1, #0x10] + smull v19.4s, v16.4h, v13.4h + ldr q24, [x1], #0x20 + smlal v19.4s, v14.4h, v18.4h + ldr q22, [x2], #0x20 + smlal2 v4.4s, v14.8h, v27.8h + uzp2 v5.8h, v6.8h, v23.8h + smull2 v29.4s, v16.8h, v13.8h + uzp2 v26.8h, v8.8h, v7.8h + smlal2 v29.4s, v14.8h, v18.8h + uzp1 v30.8h, v24.8h, v31.8h + uzp1 v8.8h, v8.8h, v7.8h + smull 
v11.4s, v16.4h, v18.4h + smlal v11.4s, v14.4h, v27.4h + ldur q1, [x2, #-0x10] + uzp1 v28.8h, v6.8h, v23.8h + smlal2 v29.4s, v8.8h, v5.8h + ldr q25, [x5], #0x20 + smlal v19.4s, v8.4h, v5.4h + ldr q3, [x4, #0x10] + smlal2 v29.4s, v26.8h, v28.8h + uzp1 v27.8h, v22.8h, v1.8h + smlal v19.4s, v26.4h, v28.4h + ldr q12, [x4], #0x20 + smlal2 v4.4s, v8.8h, v28.8h + ldur q21, [x5, #-0x10] + smlal2 v4.4s, v26.8h, v17.8h + smlal v11.4s, v8.4h, v28.4h + ld1 { v15.8h }, [x6], #16 + smlal v11.4s, v26.4h, v17.4h + ld1 { v20.8h }, [x3], #16 + uzp1 v28.8h, v19.8h, v29.8h + smull2 v23.4s, v30.8h, v27.8h + smull v26.4s, v30.4h, v27.4h + uzp2 v16.8h, v22.8h, v1.8h + mul v28.8h, v28.8h, v2.8h + uzp1 v10.8h, v11.8h, v4.8h + smull2 v8.4s, v30.8h, v16.8h + mul v13.8h, v10.8h, v2.8h + smlal v19.4s, v28.4h, v0.4h + smlal2 v29.4s, v28.8h, v0.8h + smull v18.4s, v30.4h, v16.4h + uzp1 v30.8h, v25.8h, v21.8h + smlal v11.4s, v13.4h, v0.4h + uzp2 v6.8h, v24.8h, v31.8h + uzp1 v16.8h, v12.8h, v3.8h + smlal2 v4.4s, v13.8h, v0.8h + uzp2 v17.8h, v25.8h, v21.8h + smlal2 v8.4s, v6.8h, v27.8h + uzp2 v12.8h, v12.8h, v3.8h + smlal v18.4s, v6.4h, v27.4h + uzp2 v9.8h, v19.8h, v29.8h + smlal2 v8.4s, v16.8h, v17.8h + smlal2 v8.4s, v12.8h, v30.8h + uzp2 v19.8h, v11.8h, v4.8h + sub x13, x13, #0x2 + +Lpolyvec_basemul_acc_montgomery_cached_k2_loop_start + smlal v18.4s, v16.4h, v17.4h + ldr q7, [x4], #0x20 + ldr q10, [x2, #0x10] + smlal v18.4s, v12.4h, v30.4h + smlal2 v23.4s, v6.8h, v20.8h + ldr q14, [x2], #0x20 + smlal2 v23.4s, v16.8h, v30.8h + zip1 v25.8h, v19.8h, v9.8h + zip2 v3.8h, v19.8h, v9.8h + smlal2 v23.4s, v12.8h, v15.8h + smlal v26.4s, v6.4h, v20.4h + uzp1 v5.8h, v18.8h, v8.8h + uzp2 v21.8h, v14.8h, v10.8h + smlal v26.4s, v16.4h, v30.4h + str q25, [x0], #0x20 + mul v29.8h, v5.8h, v2.8h + uzp1 v24.8h, v14.8h, v10.8h + stur q3, [x0, #-0x10] + smlal v26.4s, v12.4h, v15.4h + ld1 { v15.8h }, [x6], #16 + ldr q28, [x1, #0x10] + ldr q11, [x1], #0x20 + ldr q13, [x5], #0x20 + ldur q27, [x4, #-0x10] + smlal2 v8.4s, 
v29.8h, v0.8h + ldur q22, [x5, #-0x10] + smlal v18.4s, v29.4h, v0.4h + uzp1 v4.8h, v26.8h, v23.8h + uzp1 v1.8h, v11.8h, v28.8h + uzp2 v6.8h, v11.8h, v28.8h + uzp1 v16.8h, v7.8h, v27.8h + mul v31.8h, v4.8h, v2.8h + uzp2 v17.8h, v13.8h, v22.8h + ld1 { v20.8h }, [x3], #16 + uzp2 v9.8h, v18.8h, v8.8h + smull2 v8.4s, v1.8h, v21.8h + uzp1 v30.8h, v13.8h, v22.8h + smlal2 v8.4s, v6.8h, v24.8h + smlal2 v8.4s, v16.8h, v17.8h + uzp2 v12.8h, v7.8h, v27.8h + smlal v26.4s, v31.4h, v0.4h + smlal2 v23.4s, v31.8h, v0.8h + smull v18.4s, v1.4h, v21.4h + smlal v18.4s, v6.4h, v24.4h + smlal2 v8.4s, v12.8h, v30.8h + uzp2 v19.8h, v26.8h, v23.8h + smull2 v23.4s, v1.8h, v24.8h + smull v26.4s, v1.4h, v24.4h + subs x13, x13, #0x1 + cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k2_loop_start + smlal v26.4s, v6.4h, v20.4h + smlal2 v23.4s, v6.8h, v20.8h + smlal v26.4s, v16.4h, v30.4h + smlal2 v23.4s, v16.8h, v30.8h + smlal v26.4s, v12.4h, v15.4h + smlal2 v23.4s, v12.8h, v15.8h + smlal v18.4s, v16.4h, v17.4h + smlal v18.4s, v12.4h, v30.4h + zip1 v12.8h, v19.8h, v9.8h + str q12, [x0], #0x20 + uzp1 v12.8h, v26.8h, v23.8h + mul v6.8h, v12.8h, v2.8h + uzp1 v12.8h, v18.8h, v8.8h + mul v12.8h, v12.8h, v2.8h + smlal v26.4s, v6.4h, v0.4h + smlal2 v23.4s, v6.8h, v0.8h + smlal2 v8.4s, v12.8h, v0.8h + smlal v18.4s, v12.4h, v0.4h + zip2 v12.8h, v19.8h, v9.8h + uzp2 v6.8h, v26.8h, v23.8h + stur q12, [x0, #-0x10] + uzp2 v12.8h, v18.8h, v8.8h + zip2 v1.8h, v6.8h, v12.8h + zip1 v12.8h, v6.8h, v12.8h + str q1, [x0, #0x10] + str q12, [x0], #0x20 + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret + +MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm) + + +#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2) */ + +#if defined(__ELF__) +#endif + + END diff --git 
a/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm.asm b/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm.asm new file mode 100644 index 0000000000..7ddea198bf --- /dev/null +++ b/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm.asm @@ -0,0 +1,298 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. + */ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [NeonNTT] + * Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 + * Becker, Hwang, Kannwischer, Yang, Yang + * https://eprint.iacr.org/2021/986 + */ + +/*yaml + Name: polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm + Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=3 + Signature: void mlk_polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm(int16_t r[256], const int16_t a[768], const int16_t b[768], const int16_t b_cache[384]) + ABI: + x0: + type: buffer + size_bytes: 512 + permissions: write-only + c_parameter: int16_t r[256] + description: Output polynomial + x1: + type: buffer + size_bytes: 1536 + permissions: read-only + c_parameter: const int16_t a[768] + description: Input polynomial vector a + x2: + type: buffer + size_bytes: 1536 + permissions: read-only + c_parameter: const int16_t b[768] + description: Input polynomial vector b + x3: + type: buffer + size_bytes: 768 + permissions: read-only + c_parameter: const int16_t b_cache[384] + description: Cached values for b + Stack: + bytes: 64 + description: saving callee-saved Neon registers +*/ + +/* Re-implementation of asymmetric base multiplication following @[NeonNTT] */ + +#include "../../../common.h" +#if 
defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm.S using scripts/simpasm. Do not modify it directly. + */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm) +MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm) + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w14, #0xd01 // =3329 + dup v0.8h, w14 + mov w14, #0xcff // =3327 + dup v2.8h, w14 + add x4, x1, #0x200 + add x5, x2, #0x200 + add x6, x3, #0x100 + add x7, x1, #0x400 + add x8, x2, #0x400 + add x9, x3, #0x200 + mov x13, #0x10 // =16 + ldr q6, [x7], #0x20 + ldr q19, [x2, #0x10] + ldr q23, [x1], #0x20 + ldur q14, [x1, #-0x10] + ldr q17, [x2], #0x20 + ldr q11, [x4, #0x10] + ldur q28, [x7, #-0x10] + ld1 { v30.8h }, [x3], #16 + ldr q26, [x4], #0x20 + ldr q16, [x8, #0x10] + uzp1 v8.8h, v23.8h, v14.8h + ldr q22, [x5, #0x10] + ldr q18, [x5], #0x20 + uzp1 v20.8h, v17.8h, v19.8h + uzp2 v24.8h, v23.8h, v14.8h + ldr q31, [x8], #0x20 + smull2 v4.4s, v8.8h, v20.8h + uzp1 v25.8h, v26.8h, v11.8h + smull v13.4s, v8.4h, v20.4h + ld1 { v23.8h }, [x6], #16 + uzp1 v1.8h, v18.8h, v22.8h + smlal v13.4s, v24.4h, v30.4h + smlal2 v4.4s, v24.8h, v30.8h + uzp2 v5.8h, v26.8h, v11.8h + smlal2 v4.4s, v25.8h, v1.8h + uzp1 v29.8h, v6.8h, v28.8h + smlal2 v4.4s, v5.8h, v23.8h + ld1 { v7.8h }, [x9], #16 + smlal v13.4s, v25.4h, v1.4h + uzp2 v17.8h, v17.8h, v19.8h + uzp1 v27.8h, v31.8h, v16.8h + smlal v13.4s, v5.4h, v23.4h + uzp2 v22.8h, v18.8h, v22.8h + smull v18.4s, v8.4h, v17.4h + uzp2 v28.8h, v6.8h, v28.8h + smlal v13.4s, v29.4h, v27.4h + smlal2 v4.4s, v29.8h, v27.8h + uzp2 v26.8h, v31.8h, v16.8h + smlal2 v4.4s, v28.8h, v7.8h + ldr 
q3, [x7, #0x10] + smlal v13.4s, v28.4h, v7.4h + ldr q7, [x1], #0x20 + smlal v18.4s, v24.4h, v20.4h + ldr q15, [x2], #0x20 + smlal v18.4s, v25.4h, v22.4h + smull2 v8.4s, v8.8h, v17.8h + ldur q17, [x1, #-0x10] + uzp1 v23.8h, v13.8h, v4.8h + smlal v18.4s, v5.4h, v1.4h + smlal2 v8.4s, v24.8h, v20.8h + ld1 { v16.8h }, [x3], #16 + mul v23.8h, v23.8h, v2.8h + ldr q19, [x5, #0x10] + ldr q14, [x4, #0x10] + ldr q11, [x4], #0x20 + ldur q20, [x2, #-0x10] + smlal2 v8.4s, v25.8h, v22.8h + smlal2 v8.4s, v5.8h, v1.8h + ldr q22, [x5], #0x20 + uzp1 v1.8h, v7.8h, v17.8h + smlal v18.4s, v29.4h, v26.4h + smlal v13.4s, v23.4h, v0.4h + uzp2 v31.8h, v11.8h, v14.8h + uzp1 v21.8h, v15.8h, v20.8h + smlal2 v4.4s, v23.8h, v0.8h + ld1 { v9.8h }, [x6], #16 + smlal v18.4s, v28.4h, v27.4h + smlal2 v8.4s, v29.8h, v26.8h + ldr q25, [x7], #0x20 + smull v26.4s, v1.4h, v21.4h + uzp1 v24.8h, v22.8h, v19.8h + smlal2 v8.4s, v28.8h, v27.8h + uzp2 v28.8h, v7.8h, v17.8h + uzp1 v29.8h, v11.8h, v14.8h + smull2 v23.4s, v1.8h, v21.8h + ldr q27, [x8], #0x20 + smlal2 v23.4s, v28.8h, v16.8h + ldur q11, [x8, #-0x10] + smlal2 v23.4s, v29.8h, v24.8h + uzp2 v7.8h, v13.8h, v4.8h + uzp2 v19.8h, v22.8h, v19.8h + ld1 { v4.8h }, [x9], #16 + smlal2 v23.4s, v31.8h, v9.8h + uzp1 v13.8h, v25.8h, v3.8h + uzp1 v14.8h, v18.8h, v8.8h + smlal v26.4s, v28.4h, v16.4h + uzp2 v17.8h, v27.8h, v11.8h + uzp2 v20.8h, v15.8h, v20.8h + mul v14.8h, v14.8h, v2.8h + sub x13, x13, #0x2 + +Lpolyvec_basemul_acc_montgomery_cached_k3_loop_start + uzp1 v6.8h, v27.8h, v11.8h + smlal v26.4s, v29.4h, v24.4h + uzp2 v16.8h, v25.8h, v3.8h + smlal v26.4s, v31.4h, v9.4h + ldr q3, [x7, #0x10] + smlal v26.4s, v13.4h, v6.4h + smlal2 v8.4s, v14.8h, v0.8h + ldr q27, [x8], #0x20 + smlal v18.4s, v14.4h, v0.4h + ldr q25, [x7], #0x20 + smlal2 v23.4s, v13.8h, v6.8h + ldr q11, [x1], #0x20 + smlal2 v23.4s, v16.8h, v4.8h + smlal v26.4s, v16.4h, v4.4h + ldur q22, [x1, #-0x10] + uzp2 v30.8h, v18.8h, v8.8h + smull v18.4s, v1.4h, v20.4h + smlal v18.4s, v28.4h, v21.4h + ldr 
q14, [x2], #0x20 + smlal v18.4s, v29.4h, v19.4h + zip1 v5.8h, v7.8h, v30.8h + uzp1 v4.8h, v26.8h, v23.8h + smull2 v8.4s, v1.8h, v20.8h + zip2 v10.8h, v7.8h, v30.8h + smlal v18.4s, v31.4h, v24.4h + mul v12.8h, v4.8h, v2.8h + ldr q4, [x5, #0x10] + ldr q20, [x4, #0x10] + ldr q1, [x4], #0x20 + ldur q30, [x2, #-0x10] + smlal2 v8.4s, v28.8h, v21.8h + smlal2 v8.4s, v29.8h, v19.8h + ldr q19, [x5], #0x20 + smlal2 v8.4s, v31.8h, v24.8h + ld1 { v15.8h }, [x3], #16 + uzp2 v31.8h, v1.8h, v20.8h + smlal v26.4s, v12.4h, v0.4h + smlal2 v23.4s, v12.8h, v0.8h + uzp1 v21.8h, v14.8h, v30.8h + uzp1 v29.8h, v1.8h, v20.8h + uzp1 v1.8h, v11.8h, v22.8h + smlal2 v8.4s, v13.8h, v17.8h + ld1 { v9.8h }, [x6], #16 + smlal v18.4s, v13.4h, v17.4h + uzp1 v24.8h, v19.8h, v4.8h + uzp2 v7.8h, v26.8h, v23.8h + smull v26.4s, v1.4h, v21.4h + smlal v18.4s, v16.4h, v6.4h + uzp2 v19.8h, v19.8h, v4.8h + smlal2 v8.4s, v16.8h, v6.8h + uzp2 v28.8h, v11.8h, v22.8h + smull2 v23.4s, v1.8h, v21.8h + uzp1 v13.8h, v25.8h, v3.8h + smlal2 v23.4s, v28.8h, v15.8h + ldur q11, [x8, #-0x10] + smlal2 v23.4s, v29.8h, v24.8h + ld1 { v4.8h }, [x9], #16 + smlal2 v23.4s, v31.8h, v9.8h + uzp1 v12.8h, v18.8h, v8.8h + uzp2 v20.8h, v14.8h, v30.8h + smlal v26.4s, v28.4h, v15.4h + str q5, [x0], #0x20 + mul v14.8h, v12.8h, v2.8h + stur q10, [x0, #-0x10] + uzp2 v17.8h, v27.8h, v11.8h + subs x13, x13, #0x1 + cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k3_loop_start + uzp2 v3.8h, v25.8h, v3.8h + smull2 v16.4s, v1.8h, v20.8h + smull v25.4s, v1.4h, v20.4h + uzp1 v22.8h, v27.8h, v11.8h + smlal2 v16.4s, v28.8h, v21.8h + smlal v25.4s, v28.4h, v21.4h + smlal2 v16.4s, v29.8h, v19.8h + smlal v25.4s, v29.4h, v19.4h + smlal2 v16.4s, v31.8h, v24.8h + smlal v25.4s, v31.4h, v24.4h + smlal v25.4s, v13.4h, v17.4h + smlal2 v16.4s, v13.8h, v17.8h + smlal2 v16.4s, v3.8h, v22.8h + smlal v25.4s, v3.4h, v22.4h + smlal2 v23.4s, v13.8h, v22.8h + smlal v26.4s, v29.4h, v24.4h + smlal v26.4s, v31.4h, v9.4h + smlal v26.4s, v13.4h, v22.4h + uzp1 v10.8h, v25.8h, 
v16.8h + smlal2 v23.4s, v3.8h, v4.8h + smlal v26.4s, v3.4h, v4.4h + mul v13.8h, v10.8h, v2.8h + smlal v18.4s, v14.4h, v0.4h + smlal2 v8.4s, v14.8h, v0.8h + uzp1 v3.8h, v26.8h, v23.8h + mul v24.8h, v3.8h, v2.8h + uzp2 v17.8h, v18.8h, v8.8h + smlal v25.4s, v13.4h, v0.4h + smlal2 v16.4s, v13.8h, v0.8h + zip1 v21.8h, v7.8h, v17.8h + zip2 v20.8h, v7.8h, v17.8h + smlal2 v23.4s, v24.8h, v0.8h + str q21, [x0], #0x20 + smlal v26.4s, v24.4h, v0.4h + uzp2 v13.8h, v25.8h, v16.8h + stur q20, [x0, #-0x10] + uzp2 v23.8h, v26.8h, v23.8h + zip1 v18.8h, v23.8h, v13.8h + zip2 v13.8h, v23.8h, v13.8h + str q18, [x0], #0x20 + stur q13, [x0, #-0x10] + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret + +MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm) + +#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3) */ + +#if defined(__ELF__) +#endif + + END diff --git a/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm.asm b/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm.asm new file mode 100644 index 0000000000..ed2e735dac --- /dev/null +++ b/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm.asm @@ -0,0 +1,352 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. 
+ */ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [NeonNTT] + * Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 + * Becker, Hwang, Kannwischer, Yang, Yang + * https://eprint.iacr.org/2021/986 + */ + +/*yaml + Name: polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm + Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=4 + Signature: void mlk_polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm(int16_t r[256], const int16_t a[1024], const int16_t b[1024], const int16_t b_cache[512]) + ABI: + x0: + type: buffer + size_bytes: 512 + permissions: write-only + c_parameter: int16_t r[256] + description: Output polynomial + x1: + type: buffer + size_bytes: 2048 + permissions: read-only + c_parameter: const int16_t a[1024] + description: Input polynomial vector a + x2: + type: buffer + size_bytes: 2048 + permissions: read-only + c_parameter: const int16_t b[1024] + description: Input polynomial vector b + x3: + type: buffer + size_bytes: 1024 + permissions: read-only + c_parameter: const int16_t b_cache[512] + description: Cached values for b + Stack: + bytes: 64 + description: saving callee-saved Neon registers +*/ + +/* Re-implementation of asymmetric base multiplication following @[NeonNTT] */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_AARCH64) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && (defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + + AREA |.text|,CODE,READONLY,ALIGN=2 + EXPORT MLK_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm) +MLK_ASM_FN_SYMBOL(polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm) + + sub sp, sp, #0x40 + stp d8, d9, [sp] + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + mov w14, #0xd01 // =3329 + dup v0.8h, w14 + mov w14, #0xcff // =3327 + dup v2.8h, w14 + add x4, x1, #0x200 + add x5, x2, #0x200 + add x6, x3, #0x100 + add x7, x1, #0x400 + add x8, x2, #0x400 + add x9, x3, #0x200 + add x10, x1, #0x600 + add x11, x2, #0x600 + add x12, x3, #0x300 + mov x13, #0x10 // =16 + ldr q28, [x1], #0x20 + ldur q5, [x1, #-0x10] + ldr q31, [x2], #0x20 + ldur q27, [x2, #-0x10] + ldr q7, [x5], #0x20 + ldr q10, [x4], #0x20 + ldur q18, [x5, #-0x10] + ldur q9, [x4, #-0x10] + uzp1 v11.8h, v28.8h, v5.8h + uzp2 v19.8h, v28.8h, v5.8h + uzp2 v4.8h, v31.8h, v27.8h + uzp1 v1.8h, v31.8h, v27.8h + ldr q29, [x7], #0x20 + ldr q28, [x8, #0x10] + uzp1 v24.8h, v10.8h, v9.8h + uzp1 v17.8h, v7.8h, v18.8h + uzp2 v7.8h, v7.8h, v18.8h + ldr q21, [x8], #0x20 + uzp2 v27.8h, v10.8h, v9.8h + ldur q6, [x7, #-0x10] + smull v18.4s, v11.4h, v4.4h + ld1 { v9.8h }, [x3], #16 + smull2 v8.4s, v11.8h, v4.8h + ldr q16, [x11], #0x20 + smlal2 v8.4s, v19.8h, v1.8h + ldur q14, [x11, #-0x10] + smlal v18.4s, v19.4h, v1.4h + uzp1 v10.8h, v21.8h, v28.8h + smlal v18.4s, v24.4h, v7.4h + ldr q4, [x10], #0x20 + smlal2 v8.4s, v24.8h, v7.8h + ld1 { v12.8h }, [x6], #16 + smull2 v23.4s, v11.8h, v1.8h + uzp2 v13.8h, v29.8h, v6.8h + smull v26.4s, v11.4h, v1.4h + uzp1 v29.8h, v29.8h, v6.8h + smlal v26.4s, v19.4h, v9.4h + ldur q15, [x10, #-0x10] + smlal2 v23.4s, v19.8h, v9.8h + uzp2 v9.8h, v21.8h, v28.8h + smlal v18.4s, v27.4h, v17.4h + uzp2 v6.8h, v16.8h, v14.8h + uzp1 v21.8h, v16.8h, v14.8h + smlal2 v8.4s, v27.8h, v17.8h + smlal2 v8.4s, v29.8h, v9.8h + uzp1 v30.8h, v4.8h, v15.8h + uzp2 v16.8h, v4.8h, v15.8h + smlal v18.4s, v29.4h, v9.4h + smlal2 v8.4s, v13.8h, v10.8h + ld1 { v15.8h 
}, [x9], #16 + smlal v18.4s, v13.4h, v10.4h + ldr q11, [x4], #0x20 + smlal v18.4s, v30.4h, v6.4h + ldr q7, [x2], #0x20 + smlal2 v8.4s, v30.8h, v6.8h + ld1 { v9.8h }, [x12], #16 + smlal2 v23.4s, v24.8h, v17.8h + ldur q4, [x2, #-0x10] + smlal v26.4s, v24.4h, v17.4h + ldur q25, [x4, #-0x10] + smlal2 v8.4s, v16.8h, v21.8h + ldr q5, [x5], #0x20 + smlal v18.4s, v16.4h, v21.4h + ldur q22, [x5, #-0x10] + smlal v26.4s, v27.4h, v12.4h + ldr q19, [x1, #0x10] + smlal v26.4s, v29.4h, v10.4h + ld1 { v20.8h }, [x3], #16 + smlal v26.4s, v13.4h, v15.4h + uzp1 v24.8h, v7.8h, v4.8h + smlal2 v23.4s, v27.8h, v12.8h + uzp1 v28.8h, v18.8h, v8.8h + smlal v26.4s, v30.4h, v21.4h + uzp2 v27.8h, v11.8h, v25.8h + smlal2 v23.4s, v29.8h, v10.8h + uzp2 v31.8h, v7.8h, v4.8h + smlal2 v23.4s, v13.8h, v15.8h + uzp1 v14.8h, v5.8h, v22.8h + uzp1 v17.8h, v11.8h, v25.8h + smlal v26.4s, v16.4h, v9.4h + mul v29.8h, v28.8h, v2.8h + sub x13, x13, #0x2 + +Lpolyvec_basemul_acc_montgomery_cached_k4_loop_start + smlal2 v23.4s, v30.8h, v21.8h + ldr q11, [x1], #0x20 + uzp2 v15.8h, v5.8h, v22.8h + smlal v18.4s, v29.4h, v0.4h + ldr q12, [x7], #0x20 + smlal2 v8.4s, v29.8h, v0.8h + ldur q3, [x7, #-0x10] + ldr q21, [x8], #0x20 + uzp1 v29.8h, v11.8h, v19.8h + ldur q13, [x8, #-0x10] + uzp2 v5.8h, v11.8h, v19.8h + smlal2 v23.4s, v16.8h, v9.8h + uzp2 v28.8h, v18.8h, v8.8h + smull2 v8.4s, v29.8h, v31.8h + smlal2 v8.4s, v5.8h, v24.8h + uzp1 v7.8h, v12.8h, v3.8h + smlal2 v8.4s, v17.8h, v15.8h + uzp2 v11.8h, v21.8h, v13.8h + uzp1 v4.8h, v26.8h, v23.8h + smlal2 v8.4s, v27.8h, v14.8h + smlal2 v8.4s, v7.8h, v11.8h + mul v6.8h, v4.8h, v2.8h + ldr q19, [x11], #0x20 + uzp2 v25.8h, v12.8h, v3.8h + ldr q12, [x10], #0x20 + smull v18.4s, v29.4h, v31.4h + ldur q3, [x10, #-0x10] + smlal v18.4s, v5.4h, v24.4h + uzp1 v4.8h, v21.8h, v13.8h + smlal v18.4s, v17.4h, v15.4h + ldur q13, [x11, #-0x10] + ld1 { v1.8h }, [x6], #16 + smlal v26.4s, v6.4h, v0.4h + smlal2 v23.4s, v6.8h, v0.8h + ld1 { v10.8h }, [x9], #16 + smlal v18.4s, v27.4h, v14.4h + 
uzp1 v30.8h, v12.8h, v3.8h + smlal2 v8.4s, v25.8h, v4.8h + uzp2 v31.8h, v19.8h, v13.8h + smlal v18.4s, v7.4h, v11.4h + ld1 { v9.8h }, [x12], #16 + smlal v18.4s, v25.4h, v4.4h + uzp1 v21.8h, v19.8h, v13.8h + uzp2 v16.8h, v12.8h, v3.8h + smlal v18.4s, v30.4h, v31.4h + smlal2 v8.4s, v30.8h, v31.8h + uzp2 v31.8h, v26.8h, v23.8h + smlal2 v8.4s, v16.8h, v21.8h + smlal v18.4s, v16.4h, v21.4h + zip1 v15.8h, v31.8h, v28.8h + ldr q19, [x1, #0x10] + smull2 v23.4s, v29.8h, v24.8h + smull v26.4s, v29.4h, v24.4h + ldr q3, [x2, #0x10] + smlal v26.4s, v5.4h, v20.4h + ldr q11, [x2], #0x20 + uzp1 v6.8h, v18.8h, v8.8h + smlal v26.4s, v17.4h, v14.4h + smlal v26.4s, v27.4h, v1.4h + zip2 v13.8h, v31.8h, v28.8h + smlal v26.4s, v7.4h, v4.4h + str q15, [x0], #0x20 + smlal v26.4s, v25.4h, v10.4h + stur q13, [x0, #-0x10] + mul v29.8h, v6.8h, v2.8h + uzp1 v24.8h, v11.8h, v3.8h + uzp2 v31.8h, v11.8h, v3.8h + ldr q11, [x4], #0x20 + smlal2 v23.4s, v5.8h, v20.8h + ldur q28, [x4, #-0x10] + smlal2 v23.4s, v17.8h, v14.8h + ldr q5, [x5], #0x20 + smlal2 v23.4s, v27.8h, v1.8h + ldur q22, [x5, #-0x10] + smlal v26.4s, v30.4h, v21.4h + ld1 { v20.8h }, [x3], #16 + smlal v26.4s, v16.4h, v9.4h + uzp1 v17.8h, v11.8h, v28.8h + smlal2 v23.4s, v7.8h, v4.8h + uzp2 v27.8h, v11.8h, v28.8h + smlal2 v23.4s, v25.8h, v10.8h + uzp1 v14.8h, v5.8h, v22.8h + subs x13, x13, #0x1 + cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k4_loop_start + smlal v18.4s, v29.4h, v0.4h + ldr q11, [x1], #0x20 + uzp2 v28.8h, v5.8h, v22.8h + smlal2 v23.4s, v30.8h, v21.8h + smlal2 v8.4s, v29.8h, v0.8h + ldr q15, [x8, #0x10] + smlal2 v23.4s, v16.8h, v9.8h + ldr q21, [x8], #0x20 + uzp1 v22.8h, v11.8h, v19.8h + uzp2 v12.8h, v11.8h, v19.8h + ldr q1, [x7, #0x10] + ld1 { v6.8h }, [x6], #16 + uzp2 v3.8h, v18.8h, v8.8h + smull v9.4s, v22.4h, v31.4h + smull2 v18.4s, v22.8h, v31.8h + ldr q16, [x7], #0x20 + smull v19.4s, v22.4h, v24.4h + uzp1 v30.8h, v21.8h, v15.8h + uzp2 v25.8h, v21.8h, v15.8h + smull2 v8.4s, v22.8h, v24.8h + smlal v19.4s, v12.4h, 
v20.4h + ldr q13, [x10, #0x10] + smlal2 v8.4s, v12.8h, v20.8h + uzp1 v29.8h, v16.8h, v1.8h + smlal2 v18.4s, v12.8h, v24.8h + ldr q5, [x10], #0x20 + smlal v9.4s, v12.4h, v24.4h + ldr q4, [x11], #0x20 + smlal v9.4s, v17.4h, v28.4h + ldur q22, [x11, #-0x10] + smlal2 v18.4s, v17.8h, v28.8h + uzp2 v16.8h, v16.8h, v1.8h + smlal v19.4s, v17.4h, v14.4h + ld1 { v28.8h }, [x9], #16 + smlal2 v8.4s, v17.8h, v14.8h + uzp1 v7.8h, v5.8h, v13.8h + smlal v9.4s, v27.4h, v14.4h + uzp1 v17.8h, v4.8h, v22.8h + smlal2 v18.4s, v27.8h, v14.8h + uzp2 v12.8h, v5.8h, v13.8h + uzp2 v21.8h, v4.8h, v22.8h + smlal v19.4s, v27.4h, v6.4h + smlal2 v8.4s, v27.8h, v6.8h + ld1 { v15.8h }, [x12], #16 + smlal v19.4s, v29.4h, v30.4h + uzp1 v20.8h, v26.8h, v23.8h + smlal v9.4s, v29.4h, v25.4h + smlal2 v18.4s, v29.8h, v25.8h + smlal2 v8.4s, v29.8h, v30.8h + smlal v19.4s, v16.4h, v28.4h + smlal2 v8.4s, v16.8h, v28.8h + smlal2 v18.4s, v16.8h, v30.8h + smlal v9.4s, v16.4h, v30.4h + smlal v9.4s, v7.4h, v21.4h + smlal2 v18.4s, v7.8h, v21.8h + smlal2 v8.4s, v7.8h, v17.8h + smlal v19.4s, v7.4h, v17.4h + smlal v19.4s, v12.4h, v15.4h + smlal2 v8.4s, v12.8h, v15.8h + smlal2 v18.4s, v12.8h, v17.8h + smlal v9.4s, v12.4h, v17.4h + mul v6.8h, v20.8h, v2.8h + uzp1 v4.8h, v19.8h, v8.8h + mul v17.8h, v4.8h, v2.8h + uzp1 v12.8h, v9.8h, v18.8h + smlal v26.4s, v6.4h, v0.4h + mul v21.8h, v12.8h, v2.8h + smlal2 v23.4s, v6.8h, v0.8h + smlal2 v8.4s, v17.8h, v0.8h + smlal v19.4s, v17.4h, v0.4h + smlal2 v18.4s, v21.8h, v0.8h + uzp2 v23.8h, v26.8h, v23.8h + smlal v9.4s, v21.4h, v0.4h + zip2 v12.8h, v23.8h, v3.8h + zip1 v22.8h, v23.8h, v3.8h + uzp2 v14.8h, v19.8h, v8.8h + uzp2 v18.8h, v9.8h, v18.8h + str q12, [x0, #0x10] + str q22, [x0], #0x20 + zip2 v24.8h, v14.8h, v18.8h + zip1 v21.8h, v14.8h, v18.8h + str q24, [x0, #0x10] + str q21, [x0], #0x20 + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret + 
+MLK_ASM_FN_SIZE(polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm) + +#endif /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4) */ + +#if defined(__ELF__) +#endif + + END diff --git a/mlkem/src/native/aarch64/src/rej_uniform_aarch64_asm.asm b/mlkem/src/native/aarch64/src/rej_uniform_aarch64_asm.asm new file mode 100644 index 0000000000..dc92e3ec08 --- /dev/null +++ b/mlkem/src/native/aarch64/src/rej_uniform_aarch64_asm.asm @@ -0,0 +1,226 @@ +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * mlkem/src/native/aarch64/src/rej_uniform_aarch64_asm.S using scripts/simpasm --translate-armasm. Do not modify it directly. + */ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/*yaml + Name: rej_uniform_aarch64_asm + Description: Run rejection sampling on uniform random bytes to generate uniform random integers mod q + Signature: uint64_t mlk_rej_uniform_aarch64_asm(int16_t r[256], const uint8_t *buf, unsigned buflen, const uint8_t table[4096]) + ABI: + x0: + type: buffer + size_bytes: 512 + permissions: write-only + c_parameter: int16_t r[256] + description: Output buffer + x1: + type: buffer + size_bytes: x2 + permissions: read-only + c_parameter: const uint8_t *buf + description: Input buffer + x2: + type: scalar + c_parameter: unsigned buflen + description: Length of input buffer (must be multiple of 24) + test_with: 504 # MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE + x3: + type: buffer + size_bytes: 4096 + permissions: read-only + c_parameter: const uint8_t table[4096] + description: Lookup table + Stack: + bytes: 576 + description: register preservation and temporary storage +*/ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_AARCH64) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * 
dev/aarch64_opt/src/rej_uniform_aarch64_asm.S using scripts/simpasm. Do not modify it directly.
+ */
+// Rejection sampling: unpack 12-bit candidates from the bytes at x1 (x2 = bytes available), keep those below 3329, write 256 int16 slots to x0 and return the accepted count (capped at 256) in x0.
+        AREA |.text|,CODE,READONLY,ALIGN=2
+        EXPORT MLK_ASM_NAMESPACE(rej_uniform_aarch64_asm)
+MLK_ASM_FN_SYMBOL(rej_uniform_aarch64_asm)
+// Setup: stack scratch buffer, per-lane bit weights (v31) and the rejection bound (v30).
+        sub sp, sp, #0x240 // reserve 0x240 = 576 bytes of stack scratch
+        mov x7, #0x1 // =1
+        movk x7, #0x2, lsl #16
+        movk x7, #0x4, lsl #32
+        movk x7, #0x8, lsl #48
+        mov v31.d[0], x7 // halfword lanes 0..3 of v31 = 1, 2, 4, 8
+        mov x7, #0x10 // =16
+        movk x7, #0x20, lsl #16
+        movk x7, #0x40, lsl #32
+        movk x7, #0x80, lsl #48
+        mov v31.d[1], x7 // halfword lanes 4..7 of v31 = 16, 32, 64, 128
+        mov w11, #0xd01 // =3329
+        dup v30.8h, w11 // v30 = 3329 in every halfword lane
+        mov x8, sp // x8 = base of the scratch buffer (kept for the final copy)
+        mov x7, x8 // x7 = running write pointer into scratch
+        mov x11, #0x0 // =0
+        eor v16.16b, v16.16b, v16.16b // v16 = 0
+// Zero the first 512 bytes of scratch: 64 bytes per pass, x11 steps by 0x20 until it reaches 0x100 (8 passes).
+Lrej_uniform_initial_zero
+        str q16, [x7], #0x40
+        stur q16, [x7, #-0x30]
+        stur q16, [x7, #-0x20]
+        stur q16, [x7, #-0x10]
+        add x11, x11, #0x20
+        cmp x11, #0x100
+        b.lt Lrej_uniform_initial_zero
+        mov x7, x8 // rewind write pointer to the start of scratch
+        mov x9, #0x0 // =0 ; x9 = number of accepted values so far
+        mov x4, #0x100 // =256 ; x4 = maximum number of outputs
+        cmp x2, #0x30
+        b.lo Lrej_uniform_loop48_end // fewer than 48 input bytes: skip the main loop
+// Main loop: consume 48 input bytes (32 candidates) per iteration.
+Lrej_uniform_loop48
+        cmp x9, x4
+        b.hs Lrej_uniform_memory_copy // already have >= 256 values: finish
+        sub x2, x2, #0x30
+        ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48 // de-interleave 3-byte groups into v0/v1/v2, advance x1
+        zip1 v4.16b, v0.16b, v1.16b // halfwords = byte0 | byte1 << 8
+        zip2 v5.16b, v0.16b, v1.16b
+        zip1 v6.16b, v1.16b, v2.16b // halfwords = byte1 | byte2 << 8
+        zip2 v7.16b, v1.16b, v2.16b
+        bic v4.8h, #0xf0, lsl #8 // clear bits 12..15: keep the low 12 bits
+        bic v5.8h, #0xf0, lsl #8
+        ushr v6.8h, v6.8h, #0x4 // drop the low 4 bits: keep the high 12 bits
+        ushr v7.8h, v7.8h, #0x4
+        zip1 v16.8h, v4.8h, v6.8h // interleave both candidate streams into v16..v19
+        zip2 v17.8h, v4.8h, v6.8h
+        zip1 v18.8h, v5.8h, v7.8h
+        zip2 v19.8h, v5.8h, v7.8h
+        cmhi v4.8h, v30.8h, v16.8h // lane = all-ones where 3329 > candidate (unsigned)
+        cmhi v5.8h, v30.8h, v17.8h
+        cmhi v6.8h, v30.8h, v18.8h
+        cmhi v7.8h, v30.8h, v19.8h
+        and v4.16b, v4.16b, v31.16b // accepted lanes keep their own bit weight, others become 0
+        and v5.16b, v5.16b, v31.16b
+        and v6.16b, v6.16b, v31.16b
+        and v7.16b, v7.16b, v31.16b
+        uaddlv s20, v4.8h // sum of lanes = 8-bit mask of accepted lanes
+        uaddlv s21, v5.8h
+        uaddlv s22, v6.8h
+        uaddlv s23, v7.8h
+        fmov w12, s20
+        fmov w13, s21
+        fmov w14, s22
+        fmov w15, s23
+        ldr q24, [x3, x12, lsl #4] // load the 16-byte table row at x3 + mask * 16
+        ldr q25, [x3, x13, lsl #4]
+        ldr q26, [x3, x14, lsl #4]
+        ldr q27, [x3, x15, lsl #4]
+        cnt v4.16b, v4.16b // per-byte popcount: 1 for each accepted lane
+        cnt v5.16b, v5.16b
+        cnt v6.16b, v6.16b
+        cnt v7.16b, v7.16b
+        uaddlv s20, v4.8h // sum of lanes = number of accepted lanes
+        uaddlv s21, v5.8h
+        uaddlv s22, v6.8h
+        uaddlv s23, v7.8h
+        fmov w12, s20
+        fmov w13, s21
+        fmov w14, s22
+        fmov w15, s23
+        tbl v16.16b, { v16.16b }, v24.16b // permute bytes by the table row; presumably packs accepted lanes first (table contents not visible here)
+        tbl v17.16b, { v17.16b }, v25.16b
+        tbl v18.16b, { v18.16b }, v26.16b
+        tbl v19.16b, { v19.16b }, v27.16b
+        st1 { v16.8h }, [x7] // always store 16 bytes ...
+        add x7, x7, x12, lsl #1 // ... but advance by 2 bytes per accepted lane only
+        st1 { v17.8h }, [x7]
+        add x7, x7, x13, lsl #1
+        st1 { v18.8h }, [x7]
+        add x7, x7, x14, lsl #1
+        st1 { v19.8h }, [x7]
+        add x7, x7, x15, lsl #1
+        add x12, x12, x13
+        add x14, x14, x15
+        add x9, x9, x12 // x9 += accepted lanes of all four vectors
+        add x9, x9, x14
+        cmp x2, #0x30
+        b.hs Lrej_uniform_loop48 // at least 48 bytes left: iterate again
+// Tail: process at most one further 24-byte block (16 candidates) with the same steps on two vectors.
+Lrej_uniform_loop48_end
+        cmp x9, x4
+        b.hs Lrej_uniform_memory_copy
+        cmp x2, #0x18
+        b.lo Lrej_uniform_memory_copy // fewer than 24 bytes left: nothing more to sample
+        sub x2, x2, #0x18
+        ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24
+        zip1 v4.16b, v0.16b, v1.16b
+        zip1 v5.16b, v1.16b, v2.16b
+        bic v4.8h, #0xf0, lsl #8
+        ushr v5.8h, v5.8h, #0x4
+        zip1 v16.8h, v4.8h, v5.8h
+        zip2 v17.8h, v4.8h, v5.8h
+        cmhi v4.8h, v30.8h, v16.8h
+        cmhi v5.8h, v30.8h, v17.8h
+        and v4.16b, v4.16b, v31.16b
+        and v5.16b, v5.16b, v31.16b
+        uaddlv s20, v4.8h
+        uaddlv s21, v5.8h
+        fmov w12, s20
+        fmov w13, s21
+        ldr q24, [x3, x12, lsl #4]
+        ldr q25, [x3, x13, lsl #4]
+        cnt v4.16b, v4.16b
+        cnt v5.16b, v5.16b
+        uaddlv s20, v4.8h
+        uaddlv s21, v5.8h
+        fmov w12, s20
+        fmov w13, s21
+        tbl v16.16b, { v16.16b }, v24.16b
+        tbl v17.16b, { v17.16b }, v25.16b
+        st1 { v16.8h }, [x7]
+        add x7, x7, x12, lsl #1
+        st1 { v17.8h }, [x7]
+        add x7, x7, x13, lsl #1
+        add x9, x9, x12
+        add x9, x9, x13
+// Finish: clamp the count to 256 and copy the first 512 bytes of scratch to the output buffer at x0.
+Lrej_uniform_memory_copy
+        cmp x9, x4
+        csel x9, x9, x4, lo // x9 = min(x9, 256)
+        mov x11, #0x0 // =0
+        mov x7, x8 // read pointer = start of scratch
+// Copy loop: 64 bytes per pass, 8 passes. NOTE(review): the remaining 64 scratch bytes are presumably headroom for full-width stores near the 256-value limit - confirm.
+Lrej_uniform_final_copy
+        ldr q16, [x7], #0x40
+        ldur q17, [x7, #-0x30]
+        ldur q18, [x7, #-0x20]
+        ldur q19, [x7, #-0x10]
+        str q16, [x0], #0x40
+        stur q17, [x0, #-0x30]
+        stur q18, [x0, #-0x20]
+        stur q19, [x0, #-0x10]
+        add x11, x11, #0x20
+        cmp x11, #0x100
+        b.lt Lrej_uniform_final_copy
+        mov x0, x9 // return value = clamped number of accepted values
+        b Lrej_uniform_return // branches to the label that immediately follows
+
+Lrej_uniform_return
+        add sp, sp, #0x240 // release the scratch buffer
+        ret
+
def translate_armasm(asm, asm_input):
    """Convert flat GAS assembly lines into armasm64 syntax.

    The input is the simplified assembly produced by simplify(); the result
    is meant to be run through `cl /EP` and then assembled with armasm64, so
    preprocessor directives and C comments are passed through untouched.

    Args:
        asm: iterable of source lines (without line terminators).
        asm_input: name of the source file, quoted in the generated
            "auto-derived" warning header.

    Returns:
        List of output lines: warning header, translated body, final END.
    """
    pad = " " * 8
    export_re = re.compile(r"^\s*\.global\s+(.*)$")
    label_re = re.compile(r"^([a-zA-Z0-9_]+):\s*$")
    # Directives without an armasm64 counterpart: DWARF CFI, the ELF
    # non-executable-stack section marker, and section-start alignment
    # (the latter is already expressed by the AREA attributes).
    dropped_prefixes = (".cfi_", ".section", ".balign")

    translated = [
        "/*",
        " * WARNING: This file is auto-derived from the mlkem-native source file",
        f" * {asm_input} using scripts/simpasm --translate-armasm. Do not modify it directly.",
        " */",
    ]

    for source_line in asm:
        directive = source_line.strip()
        if directive.startswith(dropped_prefixes):
            continue

        export = export_re.search(source_line)
        label = label_re.search(source_line)
        if directive == ".text":
            translated.append(pad + "AREA |.text|,CODE,READONLY,ALIGN=2")
        elif export is not None:
            translated.append(pad + "EXPORT " + export.group(1))
        elif label is not None:
            # armasm64 labels sit in column 0 and carry no trailing colon
            translated.append(label.group(1))
        else:
            translated.append(source_line)

    translated.append(pad + "END")
    return translated
parser.add_argument("--objdump", type=str, default="objdump") parser.add_argument("--strip", type=str, default="llvm-strip") parser.add_argument("--cflags", type=str) + parser.add_argument( + "--translate-armasm", + action="store_true", + help="Translate simplified assembly into armasm64 syntax (.asm)", + ) parser.add_argument("--cfify", action="store_true", help="Apply CFI directives") parser.add_argument( "--arch", @@ -477,6 +524,25 @@ def _main(): else: logger.setLevel(logging.INFO) + if args.translate_armasm is True: + + def translate_file(asm_input, asm_output): + if asm_output is None: + asm_output = str(pathlib.Path(asm_input).with_suffix(".asm")) + with open(asm_input, "r") as f: + asm = f.read().split("\n") + translated = translate_armasm(asm, asm_input) + with open(asm_output, "w+") as f: + f.write("\n".join(translated) + "\n") + logger.info(f"Translated {asm_input} -> {asm_output} (armasm64)") + + if args.input is not None: + translate_file(args.input, args.output) + if args.directory is not None: + for f in pathlib.Path(args.directory).glob("*.S"): + translate_file(f, None) + return + if args.input is not None: simplify(logger, args, args.input, args.output) if args.directory is not None: