Skip to content

Commit 689eba5

Browse files
Update series compositions cutoffs; tweaks to KL composition code
1 parent 75f4c50 commit 689eba5

5 files changed

Lines changed: 142 additions & 48 deletions

File tree

src/fmpq_poly/compose_series_kinoshita_li.c

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -73,16 +73,6 @@ _fmpq_poly_compose_series_kinoshita_li(fmpz *res, fmpz_t rden,
7373
fmpz *Qchain_num = _fmpz_vec_init(qchain_total);
7474
fmpz *Qchain_den = _fmpz_vec_init(L + 1); /* one denominator per level */
7575

76-
/* ---- Build P_num = f.reverse(n-1), P_den = den1 ---- */
77-
fmpz *P_num = _fmpz_vec_init(n);
78-
fmpz_t P_den;
79-
fmpz_init_set(P_den, den1);
80-
{
81-
slong glen = FLINT_MIN(len1, n);
82-
for (slong i = 0; i < glen; i++)
83-
fmpz_set(P_num + (n - 1 - i), poly1 + i);
84-
}
85-
8676
/* ---- Initialise Q_0 = 1 - y*g(x) ---- */
8777
{
8878
fmpz *Q0_num = Qchain_num + qoff_arr[0];
@@ -175,11 +165,20 @@ _fmpq_poly_compose_series_kinoshita_li(fmpz *res, fmpz_t rden,
175165
slong yn_W = n;
176166

177167
{
178-
slong jmax = FLINT_MIN(ydeg_arr[L], n);
179-
_fmpq_poly_div_series(W_num, W_den,
180-
P_num, P_den, n,
168+
/* ---- Build P_num = f.reverse(n-1), P_den = den1 ---- */
169+
slong glen = FLINT_MIN(n, len1);
170+
fmpz *P_num = _fmpz_vec_init(glen);
171+
172+
_fmpz_poly_reverse(P_num, poly1, glen, glen);
173+
174+
slong jmax = FLINT_MIN(ydeg_arr[L], glen);
175+
_fmpq_poly_div_series(W_num + n - glen, W_den,
176+
P_num, den1, glen,
181177
Qchain_num + qoff_arr[L], Qchain_den + L, jmax,
182-
n);
178+
glen);
179+
/* Low (n - glen) coefficients of W are a priori zero. */
180+
181+
_fmpz_vec_clear(P_num, glen);
183182
}
184183

185184
/* ================================================================
@@ -278,8 +277,6 @@ _fmpq_poly_compose_series_kinoshita_li(fmpz *res, fmpz_t rden,
278277

279278
_fmpz_vec_clear(W_num, W_alloc);
280279
fmpz_clear(W_den);
281-
_fmpz_vec_clear(P_num, n);
282-
fmpz_clear(P_den);
283280
_fmpz_vec_clear(Qchain_num, qchain_total);
284281
_fmpz_vec_clear(Qchain_den, L + 1);
285282

src/fmpz_poly/compose_series.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,23 @@
1515
#include "fmpz_vec.h"
1616
#include "fmpz_mat.h"
1717
#include "fmpz_poly.h"
18+
#include "gr.h"
19+
#include "gr_poly.h"
1820

1921
void
2022
_fmpz_poly_compose_series(fmpz * res, const fmpz * poly1, slong len1,
2123
const fmpz * poly2, slong len2, slong n)
2224
{
2325
if (len1 <= 10)
2426
_fmpz_poly_compose_series_horner(res, poly1, len1, poly2, len2, n);
25-
else
27+
else if (n < 500)
2628
_fmpz_poly_compose_series_brent_kung(res, poly1, len1, poly2, len2, n);
29+
else
30+
{
31+
gr_ctx_t ctx;
32+
gr_ctx_init_fmpz(ctx);
33+
GR_MUST_SUCCEED(_gr_poly_compose_series_kinoshita_li(res, poly1, len1, poly2, len2, n, ctx));
34+
}
2735
}
2836

2937
void

src/gr_poly/compose_series.c

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
(at your option) any later version. See <https://www.gnu.org/licenses/>.
1111
*/
1212

13+
#include <math.h>
1314
#include "gr_vec.h"
1415
#include "gr_poly.h"
1516

@@ -82,13 +83,31 @@ _gr_poly_compose_series(gr_ptr res, gr_srcptr poly1, slong len1,
8283
{
8384
status |= _gr_poly_compose_series_horner(res, poly1, len1, poly2, len2, n, ctx);
8485
}
85-
else if (len1 * len1 < n || (len1 - 1) * (len2 - 1) + 1 < 4 * n) /* todo: tune these cutoffs */
86+
else if (len1 * len1 < n || (len1 - 1) * (len2 - 1) + 1 < 4 * n) /* todo: tune this */
8687
{
8788
status |= _gr_poly_compose_series_divconquer(res, poly1, len1, poly2, len2, n, ctx);
8889
}
8990
else
9091
{
91-
status |= _gr_poly_compose_series_brent_kung(res, poly1, len1, poly2, len2, n, ctx);
92+
/* Try to figure out a reasonable cutoff for Kinoshita-Li.
93+
The following fallback is OK for fmpz and
94+
nmod with 64-bit coefficients.
95+
See also _gr_poly_revert_series. */
96+
slong cutoff = 500;
97+
98+
/* Tuned for arb and acb. */
99+
if (n >= 100 && gr_ctx_has_real_prec(ctx) == T_TRUE)
100+
{
101+
GR_MUST_SUCCEED(gr_ctx_get_real_prec(&cutoff, ctx));
102+
103+
cutoff = 15000 / sqrt(FLINT_MAX(64, cutoff));
104+
cutoff = FLINT_MAX(cutoff, 100);
105+
}
106+
107+
if (n < cutoff)
108+
status |= _gr_poly_compose_series_brent_kung(res, poly1, len1, poly2, len2, n, ctx);
109+
else
110+
status |= _gr_poly_compose_series_kinoshita_li(res, poly1, len1, poly2, len2, n, ctx);
92111
}
93112

94113
return status;

src/gr_poly/compose_series_kinoshita_li.c

Lines changed: 78 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,9 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
133133
return status;
134134
}
135135

136+
len1 = FLINT_MIN(len1, n);
137+
len2 = FLINT_MIN(len2, n);
138+
136139
/* ---- Compute level count L = ceil(log2(n)) ---- */
137140
L = 0;
138141
{ slong tmp = n; while (tmp > 1) { tmp = (tmp + 1) / 2; L++; } }
@@ -174,17 +177,54 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
174177
qoff_arr[L] = qchain_total;
175178
qchain_total += xn_arr[L] * ydeg_arr[L];
176179

177-
/* ---- Allocate Qchain ---- */
180+
/* ---- Compute scratch_total: maximum KS working memory across all levels ---- */
181+
/*
182+
* Both passes use three temporary buffers laid out consecutively in a
183+
* single pre-allocated scratch vector, avoiding GR_TMP_INIT_VEC and
184+
* GR_TMP_CLEAR_VEC calls inside the loops.
185+
*
186+
* Downward level k: Qks (qks_len) | Qneg_ks (qks_len) | Rks (rks_len).
187+
* Upward level k: Qneg_ks (qneg_ks_len) | W_ks (W_ks_len) | R_rks (R_rks_len).
188+
*
189+
* scratch_total = max over all levels of the per-level total size.
190+
*/
191+
slong scratch_total = 0;
192+
{
193+
slong yn_W_pre = n;
194+
for (k = 0; k < L; k++)
195+
{
196+
slong xn = xn_arr[k];
197+
slong ydeg = ydeg_arr[k];
198+
slong xnh = xn_arr[k + 1];
199+
slong s = 2 * ydeg - 1;
200+
slong qks_len = (xn - 1) * s + ydeg;
201+
slong rks_len = s * (2 * xnh - 1);
202+
scratch_total = FLINT_MAX(scratch_total, 2 * qks_len + rks_len);
203+
}
204+
for (k = L - 1; k >= 0; k--)
205+
{
206+
slong xn = xn_arr[k];
207+
slong xnh = xn_arr[k + 1];
208+
slong ydeg = ydeg_arr[k];
209+
slong ylo = ylo_arr[k];
210+
slong ylor = ylo_arr[k + 1];
211+
slong yn_r = n - ylor;
212+
slong yslice_lo = ylo - ylor;
213+
slong s = ydeg + yn_W_pre - 1;
214+
slong qneg_ks_len = (xn - 1) * s + ydeg;
215+
slong W_ks_len = 2 * (xnh - 1) * s + yn_W_pre;
216+
slong R_rks_len = (xn - 1) * s + yn_r - yslice_lo;
217+
scratch_total = FLINT_MAX(scratch_total, qneg_ks_len + W_ks_len + R_rks_len);
218+
yn_W_pre = n - ylo;
219+
}
220+
}
221+
222+
/* ---- Allocate Qchain and scratch ---- */
178223
gr_ptr Qchain;
179224
GR_TMP_INIT_VEC(Qchain, qchain_total, ctx);
180225

181-
/* ---- Build P(y) = f.reverse(n-1) ---- */
182-
gr_ptr P;
183-
GR_TMP_INIT_VEC(P, n, ctx);
184-
{
185-
slong glen = FLINT_MIN(len1, n);
186-
status |= _gr_poly_reverse(GR_ENTRY(P, n - glen, sz), poly1, glen, glen, ctx);
187-
}
226+
gr_ptr scratch;
227+
GR_TMP_INIT_VEC(scratch, scratch_total, ctx);
188228

189229
/* ---- Initialise Q_0 = 1 - y*g(x) ---- */
190230
{
@@ -214,10 +254,15 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
214254
gr_srcptr Qk = GR_ENTRY(Qchain, qoff_arr[k], sz);
215255
gr_ptr Qk1 = GR_ENTRY(Qchain, qoff_arr[k + 1], sz);
216256

217-
gr_ptr Qks, Qneg_ks, Rks;
218-
GR_TMP_INIT_VEC(Qks, qks_len, ctx);
219-
GR_TMP_INIT_VEC(Qneg_ks, qks_len, ctx);
220-
GR_TMP_INIT_VEC(Rks, rks_len, ctx);
257+
gr_ptr Qks = GR_ENTRY(scratch, 0, sz);
258+
gr_ptr Qneg_ks = GR_ENTRY(scratch, qks_len, sz);
259+
gr_ptr Rks = GR_ENTRY(scratch, 2 * qks_len, sz);
260+
261+
/* Qks and Qneg_ks must be zeroed before packing: the KS encoding
262+
* occupies only qks_len of the s*xn positions; gap slots between
263+
* columns must be zero for the multiply to be correct. */
264+
status |= _gr_vec_zero(Qks, qks_len, ctx);
265+
status |= _gr_vec_zero(Qneg_ks, qks_len, ctx);
221266

222267
/* Pack Q and Q(-x,y): x^i y^j -> flat index i*s + j */
223268
for (slong j = 0; j < ydeg; j++)
@@ -231,19 +276,15 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
231276
status |= gr_set(GR_ENTRY(Qneg_ks, i * s + j, sz), src, ctx);
232277
}
233278

279+
/* Rks is fully written by mullow; no zeroing needed. */
234280
status |= _gr_poly_mullow(Rks, Qks, qks_len, Qneg_ks, qks_len, rks_len, ctx);
235281

236-
GR_TMP_CLEAR_VEC(Qks, qks_len, ctx);
237-
GR_TMP_CLEAR_VEC(Qneg_ks, qks_len, ctx);
238-
239282
/* Unpack even-x columns -> Q_{k+1}.
240283
* [x^{2i} y^j] lives at flat index 2*i*s + j in the y-major layout. */
241284
for (slong j = 0; j < ydA; j++)
242285
for (slong i = 0; i < xnh; i++)
243286
status |= gr_set(GR_ENTRY(Qk1, j * xnh + i, sz),
244287
GR_ENTRY(Rks, 2 * i * s + j, sz), ctx);
245-
246-
GR_TMP_CLEAR_VEC(Rks, rks_len, ctx);
247288
}
248289

249290
/* ================================================================
@@ -268,9 +309,16 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
268309
slong yn_W = n; /* ylo_arr[L] == 0 always */
269310

270311
{
312+
/* Build P(y) = f.reverse(n-1) into scratch, which is at least n elements. */
313+
gr_ptr P = scratch;
314+
slong glen = FLINT_MIN(len1, n);
315+
status |= _gr_poly_reverse(P, poly1, glen, glen, ctx);
316+
271317
gr_srcptr QL = GR_ENTRY(Qchain, qoff_arr[L], sz);
272-
slong jmax = FLINT_MIN(ydeg_arr[L], n);
273-
status |= _gr_poly_div_series(W, P, n, QL, jmax, n, ctx);
318+
slong jmax = FLINT_MIN(ydeg_arr[L], glen);
319+
320+
status |= _gr_poly_div_series(GR_ENTRY(W, n - glen, sz), P, glen, QL, jmax, glen, ctx);
321+
/* Low (n - glen) coefficients of W are a priori zero. */
274322
}
275323

276324
/* ================================================================
@@ -313,10 +361,16 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
313361
slong R_rks_len = (xn - 1) * s + yn_r - yslice_lo;
314362

315363
gr_srcptr Qk = GR_ENTRY(Qchain, qoff_arr[k], sz);
316-
gr_ptr Qneg_ks, W_ks, R_rks;
317-
GR_TMP_INIT_VEC(Qneg_ks, qneg_ks_len, ctx);
318-
GR_TMP_INIT_VEC(W_ks, W_ks_len, ctx);
319-
GR_TMP_INIT_VEC(R_rks, R_rks_len, ctx);
364+
365+
gr_ptr Qneg_ks = GR_ENTRY(scratch, 0, sz);
366+
gr_ptr W_ks = GR_ENTRY(scratch, qneg_ks_len, sz);
367+
gr_ptr R_rks = GR_ENTRY(scratch, qneg_ks_len + W_ks_len, sz);
368+
369+
/* Qneg_ks and W_ks must be zeroed before packing: KS gap slots
370+
* between columns must be zero for the multiply to be correct.
371+
* R_rks is fully written by mulmid; no zeroing needed. */
372+
status |= _gr_vec_zero(Qneg_ks, qneg_ks_len, ctx);
373+
status |= _gr_vec_zero(W_ks, W_ks_len, ctx);
320374

321375
/* Pack Qneg_k: x^i y^j -> flat index i*s + j, with sign flip for odd i */
322376
for (slong j = 0; j < ydeg; j++)
@@ -340,9 +394,6 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
340394
W_ks, W_ks_len,
341395
yslice_lo, yslice_lo + R_rks_len, ctx);
342396

343-
GR_TMP_CLEAR_VEC(Qneg_ks, qneg_ks_len, ctx);
344-
GR_TMP_CLEAR_VEC(W_ks, W_ks_len, ctx);
345-
346397
/* Unpack y-slice [yslice_lo, yn_r) -> W (in place).
347398
* R_rks[k] holds product coefficient yslice_lo+k.
348399
* R[jsrc][i] was at i*s+jsrc; it is now at i*s+jsrc-yslice_lo in R_rks. */
@@ -355,16 +406,14 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
355406
GR_ENTRY(R_rks, i * s + jsrc - yslice_lo, sz), ctx);
356407
}
357408

358-
GR_TMP_CLEAR_VEC(R_rks, R_rks_len, ctx);
359-
360409
yn_W = new_yn_W;
361410
}
362411

363412
/* W[0..n) holds the result f(g(x)) mod x^n. */
364413
status |= _gr_vec_set(res, W, n, ctx);
365414

366415
GR_TMP_CLEAR_VEC(W, W_alloc, ctx);
367-
GR_TMP_CLEAR_VEC(P, n, ctx);
416+
GR_TMP_CLEAR_VEC(scratch, scratch_total, ctx);
368417
GR_TMP_CLEAR_VEC(Qchain, qchain_total, ctx);
369418

370419
flint_free(xn_arr);

src/gr_poly/revert_series.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
(at your option) any later version. See <https://www.gnu.org/licenses/>.
1010
*/
1111

12+
#include <math.h>
1213
#include "gr_vec.h"
1314
#include "gr_poly.h"
1415
#include "ulong_extras.h"
@@ -189,6 +190,26 @@ _gr_poly_revert_series(gr_ptr res, gr_srcptr f, slong flen, slong n, gr_ctx_t ct
189190
{
190191
int status;
191192

193+
/* Estimate composition cutoff for Kinoshita-Li; see _gr_poly_compose_series. */
194+
if (n >= 100)
195+
{
196+
slong cutoff = 500;
197+
198+
if (n >= 100 && gr_ctx_has_real_prec(ctx) == T_TRUE)
199+
{
200+
GR_MUST_SUCCEED(gr_ctx_get_real_prec(&cutoff, ctx));
201+
cutoff = 15000 / sqrt(FLINT_MAX(64, cutoff));
202+
cutoff = FLINT_MAX(cutoff, 100);
203+
}
204+
205+
/* Guess that the optimal cutoff for reversion is quite a bit higher
206+
than that for composition. */
207+
cutoff *= 5;
208+
209+
if (n >= cutoff)
210+
return _gr_poly_revert_series_newton(res, f, flen, n, ctx);
211+
}
212+
192213
status = _gr_poly_revert_series_lagrange_fast(res, f, flen, n, ctx);
193214

194215
/* Newton may succeed where Lagrange fails in finite characteristic */

0 commit comments

Comments
 (0)