Skip to content

Commit 8d040f4

Browse files
committed
sd_fft_ctx: Vectorize the first initialization
1 parent 0dec066 commit 8d040f4

1 file changed

Lines changed: 30 additions & 9 deletions

File tree

src/fft_small/sd_fft_ctx.c

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ void sd_fft_ctx_init_prime(sd_fft_ctx_t Q, ulong pp)
5757
{
5858
ulong N, i, k, l, init_depth, two_power_depth;
5959
double * t;
60-
double n, ninv;
60+
double n, ninv, w;
6161

6262
if (!fft_small_mulmod_satisfies_bounds(pp))
6363
flint_throw(FLINT_ERROR, "FFT prime %wu does not satisfy bounds for arithmetic", pp);
@@ -70,6 +70,8 @@ void sd_fft_ctx_init_prime(sd_fft_ctx_t Q, ulong pp)
7070
flint_throw(FLINT_ERROR, "Input %wu is either 2 or not a prime", pp);
7171
Q->primitive_2power_root = sd_fft_ctx_primitive_2power_root(pp, two_power_depth, Q->mod);
7272
init_depth = n_min(two_power_depth, SD_FFT_CTX_W2TAB_INIT);
73+
if (init_depth < 4)
74+
flint_throw(FLINT_ERROR, "Input %wu does not support FFT context initialization", pp);
7375

7476
n = Q->p;
7577
ninv = Q->pinv;
@@ -91,18 +93,37 @@ void sd_fft_ctx_init_prime(sd_fft_ctx_t Q, ulong pp)
9193

9294
Q->w2tab[0] = t;
9395
t[0] = 1;
94-
for (k = 1, l = 1; k < init_depth; k++, l *= 2)
96+
97+
{
98+
ulong ww = sd_fft_ctx_w2tab_root(Q, two_power_depth, 3);
99+
w = vec1d_reduce_0n_to_pmhn(ww, n);
100+
double w2 = vec1d_reduce_pm1n_to_pmhn(vec1d_mulmod(w, w, n, ninv), n);
101+
102+
Q->w2tab[1] = t + 1;
103+
t[1] = vec1d_reduce_pm1n_to_pmhn(vec1d_mulmod(w2, w2, n, ninv), n);
104+
105+
Q->w2tab[2] = t + 2;
106+
t[2] = w2;
107+
t[3] = vec1d_reduce_pm1n_to_pmhn(vec1d_mulmod(t[1], w2, n, ninv), n);
108+
}
109+
110+
vec4d n4 = vec4d_set_d(n);
111+
vec4d ninv4 = vec4d_set_d(ninv);
112+
113+
for (k = 3, l = 4; k < init_depth; k++, l *= 2)
95114
{
96-
ulong ww = sd_fft_ctx_w2tab_root(Q, two_power_depth, k);
97-
double w = vec1d_set_d(vec1d_reduce_0n_to_pmhn(ww, n));
98115
double* curr = t + l;
116+
vec4d w4 = vec4d_set_d(w);
99117
Q->w2tab[k] = curr;
100118
i = 0; do {
101-
vec1d x = vec1d_load(t + i);
102-
x = vec1d_mulmod(x, w, n, ninv);
103-
x = vec1d_reduce_pm1n_to_pmhn(x, n);
104-
vec1d_store(curr + i, x);
105-
} while (i += 1, i < l);
119+
vec4d x = vec4d_load_aligned(t + i);
120+
x = vec4d_mulmod(x, w4, n4, ninv4);
121+
x = vec4d_reduce_pm1n_to_pmhn(x, n4);
122+
vec4d_store_aligned(curr + i, x);
123+
} while (i += 4, i < l);
124+
125+
if (k + 1 < init_depth)
126+
w = vec1d_reduce_0n_to_pmhn(sd_fft_ctx_w2tab_root(Q, two_power_depth, k + 1), n);
106127
}
107128

108129
#if FLINT_USES_PTHREAD

0 commit comments

Comments
 (0)