Skip to content

Commit 38bfd07

Browse files
Merge pull request #2678 from user202729/sd-fft-doc
sd_fft: Add lots of documentation
2 parents 6af13f8 + 3cb7de2 commit 38bfd07

9 files changed

Lines changed: 77 additions & 28 deletions

File tree

doc/source/fft.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,8 @@ Radix 2 transforms
203203
space, but their value is irrelevant. The value of ``trunc`` must be
204204
divisible by 2.
205205

206+
This is a truncated FFT variant in the sense of [vdH2004]_.
207+
206208
.. function:: void fft_truncate1(mp_limb_t ** ii, mp_size_t n, flint_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2, mp_size_t trunc)
207209

208210
As for ``fft_radix2`` except that only the first ``trunc``
@@ -267,6 +269,8 @@ Radix 2 transforms
267269
up to ``trunc`` being careful to note that this involves doubling the
268270
coefficients from ``trunc - n`` up to ``n``.
269271

272+
This inverse truncated transform follows the approach of [vdH2004]_.
273+
270274
.. function:: void ifft_truncate1(mp_limb_t ** ii, mp_size_t n, flint_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2, mp_size_t trunc)
271275

272276
Computes the first ``trunc`` coefficients of the radix 2 inverse

doc/source/fft_small.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
This module currently requires building FLINT with support for
77
AVX2 or NEON instructions.
88

9+
The truncated FFT routines used internally by this module follow the
10+
approach described by van der Hoeven in [vdH2004]_.
11+
912
Integer multiplication
1013
--------------------------------------------------------------------------------
1114

@@ -19,6 +22,7 @@ Integer multiplication
1922
.. function:: void mpn_ctx_init(mpn_ctx_t R, ulong p)
2023

2124
Initialize multiplication context object with initial prime ``p``.
25+
Usually ``p`` is a constant provided by :func:`get_default_mpn_ctx`.
2226

2327
.. function:: void mpn_ctx_clear(mpn_ctx_t R)
2428

doc/source/machine_vectors.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -235,13 +235,13 @@ Other assumptions are not yet documented.
235235
vec4d vec4d_reduce_pm1n_to_pmhn(vec4d a, vec4d n)
236236
vec8d vec8d_reduce_pm1n_to_pmhn(vec8d a, vec8d n)
237237

238-
Return `a \bmod n` reduced to `[-n/2, n/2]` given given `a \in [-n,n]`.
238+
Return `a \bmod n` reduced to `[-n/2, n/2]` given `a \in [-n,n]`.
239239

240240
.. function:: vec1d vec1d_reduce_2n_to_n(vec1d a, vec1d n)
241241
vec4d vec4d_reduce_2n_to_n(vec4d a, vec4d n)
242242
vec8d vec8d_reduce_2n_to_n(vec8d a, vec8d n)
243243

244-
Return `a \bmod n` reduced to `[0,n)` given given `a \in [0,2n)`.
244+
Return `a \bmod n` reduced to `[0,n)` given `a \in [0,2n)`.
245245

246246
.. function:: vec1d vec1d_reduce_to_0n(vec1d a, vec1d n, vec1d ninv)
247247
vec4d vec4d_reduce_to_0n(vec4d a, vec4d n, vec4d ninv)

doc/source/references.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,8 @@ References
357357
358358
.. [vdH1995] \J. van der Hoeven, "Automatic numerical expansions". Proc. of the conference Real numbers and computers (1995), 261-274. https://www.texmacs.org/joris/ane/ane-abs.html
359359
360+
.. [vdH2004] \J. van der Hoeven, "Truncated FFTs", ISSAC 2004 notes, https://www.texmacs.org/joris/issac04/issac04.html
361+
360362
.. [vdH2006] \J. van der Hoeven, "Computations with effective real numbers". Theoretical Computer Science, Volume 351, Issue 1, 14 February 2006, Pages 52-60. https://doi.org/10.1016/j.tcs.2005.09.060
361363
362364
All referenced works: [AbbottBronsteinMulders1999]_, [Apostol1997]_, [Ari2011]_, [Ari2012]_, [Arn2010]_, [Arn2012]_, [ArnoldMonagan2011]_,

src/fft_small.h

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -84,23 +84,8 @@ FLINT_FORCE_INLINE ulong n_trailing_zeros(ulong x) {
8484
return x == 0 ? FLINT_BITS : flint_ctz(x);
8585
}
8686

87-
/*
88-
nbits is a mess
89-
90-
without assuming x != 0:
91-
on x86 we want 64 - LZCNT
92-
on arm we want 64 - CLZ
93-
94-
the problem is gcc decided that __builtin_clz is undefined on zero
95-
input even though both instructions LZCNT and CLZ are defined
96-
97-
assuming x != 0:
98-
on x86 we want BSR + 1
99-
*/
10087
FLINT_FORCE_INLINE ulong n_nbits(ulong x) {
101-
if (x == 0)
102-
return 0;
103-
return FLINT_BITS - flint_clz(x);
88+
return FLINT_BIT_COUNT(x);
10489
}
10590

10691
FLINT_FORCE_INLINE ulong n_nbits_nz(ulong x) {
@@ -123,9 +108,30 @@ FLINT_FORCE_INLINE slong z_max(slong a, slong b) {return FLINT_MAX(a, b);}
123108
int fft_small_mulmod_satisfies_bounds(ulong n);
124109

125110
/*
111+
Let `e(theta) = exp(2*pi*i*theta)`, let `++` denotes concatenation, and `*`
112+
denotes pointwise multiplication.
113+
Consider an infinite sequence `w` of complex numbers defined as follows.
114+
115+
w_0 = {1}
116+
w_i = w_{i-1} ++ e(2^-i) * w_{i-1}
117+
118+
For example
119+
120+
w_1 = {1, -1}
121+
w_2 = {1, -1, i, -i}
122+
w_3 = {1, -1, i, -i, e(1/8), e(5/8), e(3/8), e(7/8)}
123+
124+
Note that the `2i+1`-th element of `w` is the negation of the `2i`-th element,
125+
so we only need to store every other element of `w`. Specific elements of `w`
126+
can be directly accessed with `sd_fft_ctx_w`.
127+
128+
In actual implementation, we work modulo a prime `p` such that `p-1` has
129+
high, but finite, 2-valuation. Most of the calculation works there as well,
130+
except the sequence `w` cannot be infinite.
131+
126132
The twiddle factors are split across SD_FFT_CTX_W2TAB_SIZE tables:
127133
128-
[0] = {e(1)} original index 0
134+
[0] = {e(0)} original index 0
129135
[1] = {e(1/4)} original index 1
130136
[2] = {e(1/8), e(3/8)} original index 2,3
131137
[3] = {e(1/16), e(5/16), e(3/16), e(7/16)} original index 4,5,6,7
@@ -177,9 +183,13 @@ do { \
177183
#define SD_FFT_CTX_W2TAB_INIT 12
178184
#define SD_FFT_CTX_W2TAB_SIZE 40
179185

180-
/* This context is the one expected to sit in a global position */
186+
/*
187+
This context is the one expected to sit in a global position.
188+
One container of this context is the thread_local storage for mpn_ctx_t.
189+
See sd_fft_ctx_init_prime for the meaning of each field.
190+
*/
181191
typedef struct {
182-
double p;
192+
double p; /* the FFT prime */
183193
double pinv;
184194
nmod_t mod;
185195
ulong primitive_root;
@@ -189,6 +199,8 @@ typedef struct {
189199
unsigned int w2tab_depth;
190200
#endif
191201
double* w2tab[SD_FFT_CTX_W2TAB_SIZE];
202+
/* see above for the layout of w2tab. Each entry is an integer
203+
in the range [-p/2, p/2]. */
192204
#if FLINT_USES_PTHREAD
193205
pthread_mutex_t mutex;
194206
#endif
@@ -214,7 +226,7 @@ FLINT_FORCE_INLINE double* sd_fft_ctx_blk_index(double* d, ulong I)
214226
/*
215227
location of the bit-reversed eval:
216228
with out_data = fft(in_data) of length 2^L, then
217-
eval_poly(in_data, sd_fft_ctx_w(, i)) = out_data[sd_fft_ctx_trunc_index(L, i)]
229+
eval_poly(in_data, sd_fft_ctx_w(Q, i)) = out_data[sd_fft_ctx_trunc_index(L, i)]
218230
*/
219231
FLINT_FORCE_INLINE ulong sd_fft_ctx_trunc_index(ulong L, ulong i)
220232
{
@@ -257,6 +269,7 @@ void sd_fft_ctx_point_sqr(const sd_fft_ctx_t Q,
257269
w = {e(0), e(1/2), e(1/4), e(3/4), e(1/8), e(5/8), e(3/8), e(7/8), ...}
258270
Only the terms of even index are explicitly stored, and they are split
259271
among several tables.
272+
The recursive definition of the infinite sequence `w` can be found above.
260273
*/
261274

262275
/* look up w[2*j] */
@@ -275,7 +288,7 @@ FLINT_FORCE_INLINE double sd_fft_ctx_w2inv(const sd_fft_ctx_t Q, ulong j)
275288
return (j == 0) ? -1.0 : Q->w2tab[j_bits][j_mr];
276289
}
277290

278-
/* look up w[j] */
291+
/* look up w[j]. Definition of w above */
279292
FLINT_FORCE_INLINE double sd_fft_ctx_w(const sd_fft_ctx_t Q, ulong j)
280293
{
281294
double r = sd_fft_ctx_w2(Q, j/2);

src/fft_small/mpn_mul.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,11 @@ DEFINE_IT(7, 6, 5)
896896
DEFINE_IT(8, 7, 6)
897897
#undef DEFINE_IT
898898

899+
/*
900+
Specialized helper function, currently only called from mpn_ctx_init.
901+
Assume p is odd and p - 1 has high 2-valuation, return some number q
902+
(not necessarily prime) less than p such that q - 1 has high 2-valuation.
903+
*/
899904
static ulong next_fft_number(ulong p)
900905
{
901906
ulong bits, l, q;
@@ -969,10 +974,6 @@ static void fill_vec_two_pow_tab(
969974
flint_aligned_free(ps);
970975
}
971976

972-
973-
974-
975-
976977
void mpn_ctx_init(mpn_ctx_t R, ulong p)
977978
{
978979
ulong i;

src/fft_small/sd_fft.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,6 +1010,15 @@ static void sd_fft_trunc_internal(
10101010

10111011
/********************* interface functions ***********************/
10121012

1013+
/*
1014+
compute a truncated FFT inline in the array `d`, assuming terms beyond `itrunc`
1015+
first terms are zero. The output satisfies
1016+
eval_poly(in_data, sd_fft_ctx_w(Q, i)) = out_data[sd_fft_ctx_trunc_index(L, i)]
1017+
Usually it only make sense to have `otrunc >= itrunc`.
1018+
The array `d` need to have size at least the smallest power of 2 larger than or equal
1019+
to `n_max(itrunc, otrunc)`. See [vdH2004]_.
1020+
This function is tested in `test/t-sd_fft.c`.
1021+
*/
10131022
void sd_fft_trunc(
10141023
sd_fft_ctx_t Q,
10151024
double* d,

src/fft_small/sd_fft_ctx.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ void sd_fft_ctx_clear(sd_fft_ctx_t Q)
2525
#endif
2626
}
2727

28+
/*
29+
Initialize FFT context.
30+
pp is a prime with at most ~ 50 bits (exactly representable with a `double`)
31+
such that pp - 1 has sufficiently high 2-valuation.
32+
Used in sd_fft_trunc, sd_ifft_trunc, sd_fft_ctx_point_mul, etc.
33+
*/
2834
void sd_fft_ctx_init_prime(sd_fft_ctx_t Q, ulong pp)
2935
{
3036
ulong N, i, k, l;
@@ -47,7 +53,12 @@ void sd_fft_ctx_init_prime(sd_fft_ctx_t Q, ulong pp)
4753
2^(SD_FFT_CTX_W2TAB_INIT-1) entries: 1, e(1/4), e(1/8), e(3/8), ...
4854
4955
Q->w2tab[j] is itself a table of length 2^(j-1) containing 2^(j+1) st
50-
roots of unity.
56+
roots of unity. More documentation on the layout of w2tab can be found
57+
before the definition of SD_FFT_CTX_W2TAB_SIZE.
58+
59+
All entries in w2tab are exactly-representable integers modulo pp, but
60+
they're stored as `double` to make use of the vectorized functions in
61+
machine_vectors.h.
5162
*/
5263
N = n_pow2(SD_FFT_CTX_W2TAB_INIT - 1);
5364
t = (double*) flint_aligned_alloc(4096, n_round_up(N*sizeof(double), 4096));

src/fft_small/sd_ifft.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1497,6 +1497,11 @@ static void sd_ifft_trunc_internal(
14971497

14981498
/********************* interface functions ***********************/
14991499

1500+
/*
1501+
Truncated inverse Fourier transform, inverse of sd_fft_trunc, assume the entries
1502+
past `trunc` first entries of the _output_ are zero. See [vdH2004]_.
1503+
The array `d` need to have size at least `n_pow2(L)`.
1504+
*/
15001505
void sd_ifft_trunc(
15011506
sd_fft_ctx_t Q,
15021507
double* d,

0 commit comments

Comments
 (0)