mldsa-native/dev/x86_64/src/poly_use_hint_32_avx2.c at main · pq-code-package/mldsa-native · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/*
 * Copyright (c) The mldsa-native project authors
 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
 */

/* References
 * ==========
 *
 * - [REF_AVX2]
 *   CRYSTALS-Dilithium optimized AVX2 implementation
 *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
 *   https://github.com/pq-crystals/dilithium/tree/master/avx2
 */

/*
 * This file is derived from the public domain
 * AVX2 Dilithium implementation @[REF_AVX2].
 */

#include "../../../common.h"

#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
    !defined(MLD_CONFIG_NO_VERIFY_API) &&          \
    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
     (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))

#include <immintrin.h>
#include "arith_native_x86_64.h"
#include "consts.h"

/*
 * Lane-wise select between two __m256i vectors, treating them as eight
 * 32-bit lanes: for each lane, the result is taken from b if the most
 * significant bit of the corresponding lane of mask is set, and from a
 * otherwise.
 *
 * The integer vectors are reinterpreted as packed single-precision floats
 * (the casts are pure reinterpretations, no value conversion takes place)
 * so that _mm256_blendv_ps can perform the selection, and the result is
 * reinterpreted back to __m256i. Each argument is expanded exactly once.
 */
#define MLD_MM256_BLENDV_EPI32(a, b, mask)                     \
  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
                                       _mm256_castsi256_ps(b), \
                                       _mm256_castsi256_ps(mask)))

/*
 * In-place "use hint" step on a polynomial, AVX2 version for the variant
 * with alpha = 2 * ((MLDSA_Q - 1) / 32).
 *
 * Each coefficient of a is decomposed into a high part f1 and a low part f0
 * with respect to alpha; the high part is then moved by +hint if f0 > 0 and
 * by -hint otherwise, reduced modulo 16, and written back to a.
 *
 * a:    coefficients, read and overwritten. On return every processed
 *       coefficient lies in [0, 15] (result of the final AND with 15).
 *       NOTE(review): the decompose arithmetic presumably expects inputs
 *       in [0, MLDSA_Q) -- confirm against the caller.
 * hint: per-coefficient hint values, read only.
 *       NOTE(review): presumably each entry is 0 or 1 -- confirm against
 *       the caller.
 *
 * The loop handles 8 coefficients per iteration for MLDSA_N / 8 iterations.
 * Both arrays are accessed with aligned 256-bit loads/stores
 * (_mm256_load_si256 / _mm256_store_si256), so a and hint must be 32-byte
 * aligned.
 */
void mld_poly_use_hint_32_avx2(int32_t *a, const int32_t *hint)
{
  unsigned int i;
  __m256i f, f0, f1, h, t;
  /* Coefficients strictly above this bound get f1 = 0 and f0 decremented. */
  const __m256i q_bound = _mm256_set1_epi32(31 * ((MLDSA_Q - 1) / 32));
  /* check-magic: 1025 == floor(2**22 / 4092) */
  const __m256i v = _mm256_set1_epi32(1025);
  const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32));
  const __m256i off = _mm256_set1_epi32(127);
  const __m256i shift = _mm256_set1_epi32(512);
  /* Low four bits: reduces the final high part modulo 16. */
  const __m256i mask = _mm256_set1_epi32(15);
  const __m256i zero = _mm256_setzero_si256();

  for (i = 0; i < MLDSA_N / 8; i++)
  {
    f = _mm256_load_si256((const __m256i *)&a[8 * i]);
    h = _mm256_load_si256((const __m256i *)&hint[8 * i]);

    /* Reference:
     * - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop.
     * - Our implementation of decompose() is slightly different from that in
     *   @[REF_AVX2]. See poly_decompose_32_avx2.c for more information.
     */
    /* f1, f0 = decompose(f) */
    /* f1 = (f + 127) >> 7, then scaled via 16-bit multiplies. Since v and
     * shift are broadcast as 32-bit values, their upper 16-bit halves are
     * zero, so the upper half of every 32-bit lane of f1 ends up zero. */
    f1 = _mm256_add_epi32(f, off);
    f1 = _mm256_srli_epi32(f1, 7);
    f1 = _mm256_mulhi_epu16(f1, v);
    f1 = _mm256_mulhrs_epi16(f1, shift);
    /* t is all-ones (-1) in lanes where f > q_bound, zero elsewhere. */
    t = _mm256_cmpgt_epi32(f, q_bound);
    /* f0 = f - f1 * alpha, using f1 before it is cleared below. */
    f0 = _mm256_mullo_epi32(f1, alpha);
    f0 = _mm256_sub_epi32(f, f0);
    /* In lanes where f > q_bound: f1 = 0 and f0 = f0 - 1 (adding t == -1). */
    f1 = _mm256_andnot_si256(t, f1);
    f0 = _mm256_add_epi32(f0, t);

    /* Reference: The reference avx2 implementation checks a0 >= 0, which is
     * different from the specification and the reference C implementation. We
     * follow the specification and check a0 > 0.
     */
    /* h = (f0 > 0) ? h : -h */
    /* f0 is reused as a lane mask (all-ones where f0 > 0). t becomes 0 in
     * those lanes and h elsewhere; doubling it and subtracting from h yields
     * h - 0 = h or h - 2h = -h respectively. */
    f0 = _mm256_cmpgt_epi32(f0, zero);
    t = MLD_MM256_BLENDV_EPI32(h, zero, f0);
    t = _mm256_slli_epi32(t, 1);
    h = _mm256_sub_epi32(h, t);

    /* f1 = (f1 + h) % 16 */
    /* The AND with 15 keeps the result in [0, 15] even when f1 + h is
     * negative (e.g. -1 becomes 15). */
    f1 = _mm256_add_epi32(f1, h);
    f1 = _mm256_and_si256(f1, mask);

    _mm256_store_si256((__m256i *)&a[8 * i], f1);
  }
}

#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API &&     \
         !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                                  \
         (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
         || MLD_CONFIG_PARAMETER_SET == 87) */

/* NOTE(review): presumably emits a placeholder so that this translation unit
 * is not empty when the AVX2 use-hint routine is compiled out. MLD_EMPTY_CU
 * is defined outside this file -- confirm. */
MLD_EMPTY_CU(avx2_poly_use_hint_32)

#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API &&   \
          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                                  \
          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
          || MLD_CONFIG_PARAMETER_SET == 87)) */

/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
 * Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef MLD_MM256_BLENDV_EPI32