@@ -57,7 +57,7 @@ void sd_fft_ctx_init_prime(sd_fft_ctx_t Q, ulong pp)
5757{
5858 ulong N , i , k , l , init_depth , two_power_depth ;
5959 double * t ;
60- double n , ninv ;
60+ double n , ninv , w ;
6161
6262 if (!fft_small_mulmod_satisfies_bounds (pp ))
6363 flint_throw (FLINT_ERROR , "FFT prime %wu does not satisfy bounds for arithmetic" , pp );
@@ -70,6 +70,8 @@ void sd_fft_ctx_init_prime(sd_fft_ctx_t Q, ulong pp)
7070 flint_throw (FLINT_ERROR , "Input %wu is either 2 or not a prime" , pp );
7171 Q -> primitive_2power_root = sd_fft_ctx_primitive_2power_root (pp , two_power_depth , Q -> mod );
7272 init_depth = n_min (two_power_depth , SD_FFT_CTX_W2TAB_INIT );
73+ if (init_depth < 4 )
74+ flint_throw (FLINT_ERROR , "Input %wu does not support FFT context initialization" , pp );
7375
7476 n = Q -> p ;
7577 ninv = Q -> pinv ;
@@ -91,18 +93,37 @@ void sd_fft_ctx_init_prime(sd_fft_ctx_t Q, ulong pp)
9193
9294 Q -> w2tab [0 ] = t ;
9395 t [0 ] = 1 ;
94- for (k = 1 , l = 1 ; k < init_depth ; k ++ , l *= 2 )
96+
97+ {
98+ ulong ww = sd_fft_ctx_w2tab_root (Q , two_power_depth , 3 );
99+ w = vec1d_reduce_0n_to_pmhn (ww , n );
100+ double w2 = vec1d_reduce_pm1n_to_pmhn (vec1d_mulmod (w , w , n , ninv ), n );
101+
102+ Q -> w2tab [1 ] = t + 1 ;
103+ t [1 ] = vec1d_reduce_pm1n_to_pmhn (vec1d_mulmod (w2 , w2 , n , ninv ), n );
104+
105+ Q -> w2tab [2 ] = t + 2 ;
106+ t [2 ] = w2 ;
107+ t [3 ] = vec1d_reduce_pm1n_to_pmhn (vec1d_mulmod (t [1 ], w2 , n , ninv ), n );
108+ }
109+
110+ vec4d n4 = vec4d_set_d (n );
111+ vec4d ninv4 = vec4d_set_d (ninv );
112+
113+ for (k = 3 , l = 4 ; k < init_depth ; k ++ , l *= 2 )
95114 {
96- ulong ww = sd_fft_ctx_w2tab_root (Q , two_power_depth , k );
97- double w = vec1d_set_d (vec1d_reduce_0n_to_pmhn (ww , n ));
98115 double * curr = t + l ;
116+ vec4d w4 = vec4d_set_d (w );
99117 Q -> w2tab [k ] = curr ;
100118 i = 0 ; do {
101- vec1d x = vec1d_load (t + i );
102- x = vec1d_mulmod (x , w , n , ninv );
103- x = vec1d_reduce_pm1n_to_pmhn (x , n );
104- vec1d_store (curr + i , x );
105- } while (i += 1 , i < l );
119+ vec4d x = vec4d_load_aligned (t + i );
120+ x = vec4d_mulmod (x , w4 , n4 , ninv4 );
121+ x = vec4d_reduce_pm1n_to_pmhn (x , n4 );
122+ vec4d_store_aligned (curr + i , x );
123+ } while (i += 4 , i < l );
124+
125+ if (k + 1 < init_depth )
126+ w = vec1d_reduce_0n_to_pmhn (sd_fft_ctx_w2tab_root (Q , two_power_depth , k + 1 ), n );
106127 }
107128
108129#if FLINT_USES_PTHREAD
0 commit comments