flintlib
diff --git a/‎doc/source/fmpq_poly.rst‎
Lines changed: 27 additions & 10 deletions b/‎doc/source/fmpq_poly.rst‎
Lines changed: 27 additions & 10 deletions
diff --git a/‎doc/source/gr_poly.rst‎
Lines changed: 4 additions & 1 deletion b/‎doc/source/gr_poly.rst‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎doc/source/references.rst‎
Lines changed: 2 additions & 0 deletions b/‎doc/source/references.rst‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/fmpq_poly.h‎
Lines changed: 7 additions & 0 deletions b/‎src/fmpq_poly.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/fmpq_poly/compose_series.c‎
Lines changed: 13 additions & 2 deletions b/‎src/fmpq_poly/compose_series.c‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎src/fmpq_poly/compose_series_brent_kung.c‎
Lines changed: 69 additions & 85 deletions b/‎src/fmpq_poly/compose_series_brent_kung.c‎
Lines changed: 69 additions & 85 deletions
diff --git a/‎src/fmpq_poly/compose_series_horner.c‎
Lines changed: 1 addition & 7 deletions b/‎src/fmpq_poly/compose_series_horner.c‎
Lines changed: 1 addition & 7 deletions
@@ -1516,8 +1516,6 @@ Power series composition
     of the inputs and the output.
 
     This implementation uses Brent-Kung algorithm 2.1 [BrentKung1978]_.
-    The default ``fmpz_poly`` composition algorithm is automatically
-    used when the composition can be performed over the integers.
 
 .. function:: void fmpq_poly_compose_series_brent_kung(fmpq_poly_t res, const fmpq_poly_t poly1, const fmpq_poly_t poly2, slong n)
 
@@ -1526,8 +1524,25 @@ Power series composition
     to be zero.
 
     This implementation uses Brent-Kung algorithm 2.1 [BrentKung1978]_.
-    The default ``fmpz_poly`` composition algorithm is automatically
-    used when the composition can be performed over the integers.
+
+.. function:: void _fmpq_poly_compose_series_kinoshita_li(fmpz * res, fmpz_t den, const fmpz * poly1, const fmpz_t den1, slong len1, const fmpz * poly2, const fmpz_t den2, slong len2, slong n)
+
+    Sets ``(res, den, n)`` to the composition of
+    ``(poly1, den1, len1)`` and ``(poly2, den2, len2)`` modulo `x^n`,
+    where the constant term of ``poly2`` is required to be zero.
+
+    Assumes that ``len1, len2, n > 0``, that ``len1, len2 <= n``,
+    and that ``res`` has space for ``n`` coefficients.
+
+    This implementation uses the Kinoshita-Li algorithm [KL2024]_.
+
+.. function:: void fmpq_poly_compose_series_kinoshita_li(fmpq_poly_t res, const fmpq_poly_t poly1, const fmpq_poly_t poly2, slong n)
+
+    Sets ``res`` to the composition of ``poly1`` and ``poly2``
+    modulo `x^n`, where the constant term of ``poly2`` is required
+    to be zero.
+
+    This implementation uses the Kinoshita-Li algorithm [KL2024]_.
 
 .. function:: void _fmpq_poly_compose_series(fmpz * res, fmpz_t den, const fmpz * poly1, const fmpz_t den1, slong len1, const fmpz * poly2, const fmpz_t den2, slong len2, slong n)
 
@@ -1540,10 +1555,10 @@ Power series composition
     space for ``n`` coefficients. Does not support aliasing between any
     of the inputs and the output.
 
-    This implementation automatically switches between the Horner scheme
-    and Brent-Kung algorithm 2.1 depending on the size of the inputs.
-    The default ``fmpz_poly`` composition algorithm is automatically
-    used when the composition can be performed over the integers.
+    This implementation automatically switches between the Horner scheme,
+    Brent-Kung algorithm 2.1 and the Kinoshita-Li algorithm depending on the
+    size of the inputs. The default ``fmpz_poly`` composition algorithm is
+    automatically used when the composition can be performed over the integers.
 
 .. function:: void fmpq_poly_compose_series(fmpq_poly_t res, const fmpq_poly_t poly1, const fmpq_poly_t poly2, slong n)
 
@@ -1641,7 +1656,8 @@ Power series reversion
     the linear term is required to be nonzero. Assumes that `n > 0`.
     Does not support aliasing between any of the inputs and the output.
 
-    This implementation defaults to using Newton iteration.
+    This implementation chooses between fast Lagrange inversion and
+    Newton iteration depending on the inputs.
     The default ``fmpz_poly`` reversion algorithm is automatically
     used when the reversion can be performed over the integers.
 
@@ -1651,7 +1667,8 @@ Power series reversion
     The constant term of ``poly2`` is required to be zero and
     the linear term is required to be nonzero.
 
-    This implementation defaults to using Newton iteration.
+    This implementation chooses between fast Lagrange inversion and
+    Newton iteration depending on the inputs.
     The default ``fmpz_poly`` reversion algorithm is automatically
     used when the reversion can be performed over the integers.
 
 
@@ -798,13 +798,16 @@ Power series composition and reversion
               int gr_poly_compose_series_brent_kung(gr_poly_t res, const gr_poly_t poly1, const gr_poly_t poly2, slong n, gr_ctx_t ctx)
               int _gr_poly_compose_series_divconquer(gr_ptr res, gr_srcptr poly1, slong len1, gr_srcptr poly2, slong len2, slong n, gr_ctx_t ctx)
               int gr_poly_compose_series_divconquer(gr_poly_t res, const gr_poly_t poly1, const gr_poly_t poly2, slong n, gr_ctx_t ctx)
+              int _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1, gr_srcptr poly2, slong len2, slong n, gr_ctx_t ctx)
+              int gr_poly_compose_series_kinoshita_li(gr_poly_t res, const gr_poly_t poly1, const gr_poly_t poly2, slong n, gr_ctx_t ctx)
               int _gr_poly_compose_series(gr_ptr res, gr_srcptr poly1, slong len1, gr_srcptr poly2, slong len2, slong n, gr_ctx_t ctx)
               int gr_poly_compose_series(gr_poly_t res, const gr_poly_t poly1, const gr_poly_t poly2, slong n, gr_ctx_t ctx)
 
     Sets *res* to the power series composition `h(x) = f(g(x))` truncated
     to order `O(x^n)` where `f` is given by *poly1* and `g` is given by *poly2*,
     respectively using Horner's rule, the Brent-Kung baby step-giant step
-    algorithm [BrentKung1978]_, divide-and-conquer, and an automatic choice between the algorithms.
+    algorithm [BrentKung1978]_, divide-and-conquer, the quasilinear-complexity
+    Kinoshita-Li algorithm [KL2024]_, and an automatic choice between the algorithms.
 
     The default algorithm also handles short input and
     special-form input `g = ax^n` efficiently.
 
@@ -217,6 +217,8 @@ References
 
 .. [Kar1998] \E. A. Karatsuba, "Fast evaluation of the Hurwitz zeta function and Dirichlet L-series", Problems of Information Transmission 34:4 (1998), 342-353, http://www.mathnet.ru/php/archive.phtml?wshow=paper&jrnid=ppi&paperid=425&option_lang=eng
 
+.. [KL2024] \Y. Kinoshita and B. Li, "Power series composition in near-linear time", 2024, https://arxiv.org/abs/2404.05177
+
 .. [Knu1997] \Knuth, D. E. The Art of Computer Programming, volume 2: Seminumerical algorithms, 1997
 
 .. [Kob2010] \A. Kobel, "Certified Complex Numerical Root Finding", Seminar on Computational Geometry and Geometric Computing (2010), http://www.mpi-inf.mpg.de/departments/d1/teaching/ss10/Seminar_CGGC/Slides/02_Kobel_NRS.pdf
 
@@ -683,6 +683,13 @@ void _fmpq_poly_compose_series_brent_kung(fmpz * res, fmpz_t den,
 void fmpq_poly_compose_series_brent_kung(fmpq_poly_t res,
                     const fmpq_poly_t poly1, const fmpq_poly_t poly2, slong n);
 
+void _fmpq_poly_compose_series_kinoshita_li(fmpz * res, fmpz_t den,
+        const fmpz * poly1, const fmpz_t den1, slong len1,
+        const fmpz * poly2, const fmpz_t den2, slong len2, slong n);
+
+void fmpq_poly_compose_series_kinoshita_li(fmpq_poly_t res,
+                    const fmpq_poly_t poly1, const fmpq_poly_t poly2, slong n);
+
 void _fmpq_poly_compose_series(fmpz * res, fmpz_t den,
         const fmpz * poly1, const fmpz_t den1, slong len1,
         const fmpz * poly2, const fmpz_t den2, slong len2, slong n);
 
@@ -19,12 +19,23 @@ _fmpq_poly_compose_series(fmpz * res, fmpz_t den, const fmpz * poly1,
         const fmpz_t den1, slong len1, const fmpz * poly2,
         const fmpz_t den2, slong len2, slong n)
 {
-    if (len1 <= 20)
+    if (fmpz_is_one(den2))
+    {
+        _fmpz_poly_compose_series(res, poly1, len1, poly2, len2, n);
+        fmpz_set(den, den1);
+        _fmpq_poly_canonicalise(res, den, n);
+        return;
+    }
+
+    if (FLINT_MIN(len1, n) < 8)
         _fmpq_poly_compose_series_horner(res, den, poly1, den1, len1,
                 poly2, den2, len2, n);
-    else
+    else if (n < 250)
         _fmpq_poly_compose_series_brent_kung(res, den, poly1, den1, len1,
                 poly2, den2, len2, n);
+    else
+        _fmpq_poly_compose_series_kinoshita_li(res, den, poly1, den1, len1,
+                poly2, den2, len2, n);
 }
 
 void
 
@@ -1,6 +1,6 @@
 /*
     Copyright (C) 2010 Sebastian Pancratz
-    Copyright (C) 2011 Fredrik Johansson
+    Copyright (C) 2011, 2026 Fredrik Johansson
 
     This file is part of FLINT.
 
@@ -14,48 +14,19 @@
 #include "fmpz.h"
 #include "fmpz_vec.h"
 #include "fmpz_poly.h"
-#include "fmpq.h"
-#include "fmpq_mat.h"
+#include "fmpz_mat.h"
 #include "fmpq_poly.h"
 
-static void
-_fmpq_mat_get_row(fmpz * rnum, fmpz_t den, fmpq_mat_t A, slong i)
-{
-    slong j;
-    fmpz_t t;
-    fmpz_init(t);
-    fmpz_one(den);
-
-    for (j = 0; j < fmpq_mat_ncols(A); j++)
-        fmpz_lcm(den, den, fmpq_mat_entry_den(A, i, j));
-
-    for (j = 0; j < fmpq_mat_ncols(A); j++)
-    {
-        fmpz_divexact(t, den, fmpq_mat_entry_den(A, i, j));
-        fmpz_mul(rnum + j, fmpq_mat_entry_num(A, i, j), t);
-    }
-
-    fmpz_clear(t);
-}
-
-
 void
 _fmpq_poly_compose_series_brent_kung(fmpz * res, fmpz_t den, const fmpz * poly1,
         const fmpz_t den1, slong len1, const fmpz * poly2,
         const fmpz_t den2, slong len2, slong n)
 {
-    fmpq_mat_t A, B, C;
-    fmpz_t tden, uden, hden;
-    fmpz *t, *u, *h, *swap;
-    slong i, j, m;
-
-    if (fmpz_is_one(den2))
-    {
-        _fmpz_poly_compose_series(res, poly1, len1, poly2, len2, n);
-        fmpz_set(den, den1);
-        _fmpq_poly_canonicalise(res, den, n);
-        return;
-    }
+    fmpz_mat_t A, B, C;
+    fmpz *A_den, *t, *h;
+    fmpz_t lcd, scale;
+    fmpz_t C_den, tden, hden;
+    slong i, m;
 
     if (n == 1)
     {
@@ -67,85 +38,98 @@ _fmpq_poly_compose_series_brent_kung(fmpz * res, fmpz_t den, const fmpz * poly1,
 
     m = n_sqrt(n) + 1;
 
-    fmpq_mat_init(A, m, n);
-    fmpq_mat_init(B, m, m);
-    fmpq_mat_init(C, m, n);
+    fmpz_mat_init(A, m, n);
+    fmpz_mat_init(B, m, m);
+    fmpz_mat_init(C, m, n);
+    A_den = _fmpz_vec_init(m);
 
+    fmpz_init(lcd);
+    fmpz_init(scale);
+    fmpz_init(C_den);
     fmpz_init(tden);
-    fmpz_init(uden);
     fmpz_init(hden);
     h = _fmpz_vec_init(n);
     t = _fmpz_vec_init(n);
-    u = _fmpz_vec_init(n);
 
-    /* Set rows of B to the segments of poly1 */
+    /* Set rows of B to the segments of poly1 with common denominator den1. */
     for (i = 0; i < len1; i++)
+        fmpz_set(fmpz_mat_entry(B, i / m, i % m), poly1 + i);
+    /* Remark: if the poly is non-canonical e.g. due to being a truncation
+       of a longer power series, it could be helpful to remove its content
+       here. We could also consider removing content of B row by row. */
+
+    /* Set rows of A to the numerators of powers of poly2 with corresponding
+       denominators in A_den[i]. */
+    fmpz_one(fmpz_mat_entry(A, 0, 0));
+    fmpz_one(A_den + 0);
+    _fmpz_vec_set(fmpz_mat_row(A, 1), poly2, len2);
+    fmpz_set(A_den + 1, den2);
+    /* Optional: may improve performance if poly2 is non-canonical e.g. due
+       to being a truncation of a longer power series. */
+    _fmpq_poly_canonicalise(fmpz_mat_row(A, 1), A_den + 1, n);
+
+    for (i = 2; i < m; i++)
     {
-        fmpz_set(fmpq_mat_entry_num(B, i / m, i % m), poly1 + i);
-        fmpz_set(fmpq_mat_entry_den(B, i / m, i % m), den1);
-        fmpq_canonicalise(fmpq_mat_entry(B, i / m, i % m));
+        fmpz * Ai = fmpz_mat_row(A, i);
+        fmpz * Ai1 = fmpz_mat_row(A, i - 1);
+        fmpz * Ai_den = A_den + i;
+        fmpz * Ai1_den = A_den + i - 1;
+
+        _fmpq_poly_mullow(Ai, Ai_den, Ai1, Ai1_den, n, poly2, den2, len2, n);
+        _fmpq_poly_canonicalise(Ai, Ai_den, n);
     }
 
-    /* Set rows of A to powers of poly2 */
-    fmpq_set_si(fmpq_mat_entry(A, 0, 0), WORD(1), WORD(1));
+    /* Compute h = poly2 ^ m */
+    _fmpq_poly_mullow(h, hden, fmpz_mat_row(A, m - 1), A_den + m - 1, n, poly2, den2, len2, n);
+    _fmpq_poly_canonicalise(h, hden, n);
 
-    for (i = 0; i < len2; i++)
-    {
-        fmpz_set(fmpq_mat_entry_num(A, 1, i), poly2 + i);
-        fmpz_set(fmpq_mat_entry_den(A, 1, i), den2);
-        fmpq_canonicalise(fmpq_mat_entry(A, 1, i));
-    }
+    /* Matrix multiply C = B * A over the integers.
+       B (m x m) has common denominator den1.
+       A (m x n) has row denominator A_den[i].
 
-    _fmpz_vec_set(h, poly2, len2);
-    fmpz_set(hden, den2);
+       We could remove gcd of A columnwise before multiplying, but
+       this is slower for small to moderate n where we want to use Brent-Kung
+       and faster only for large n where we want to use Kinoshita-Li instead. */
+    fmpz_one(lcd);
+    for (i = 0; i < m; i++)
+        fmpz_lcm(lcd, lcd, A_den + i);
 
-    for (i = 2; i < m; i++)
+    for (i = 0; i < m; i++)
     {
-        _fmpq_poly_mullow(t, tden, h, hden, n, poly2, den2, len2, n);
-        _fmpq_poly_canonicalise(t, tden, n);
-
-        for (j = 0; j < n; j++)
-        {
-            fmpz_set(fmpq_mat_entry_num(A, i, j), t + j);
-            fmpz_set(fmpq_mat_entry_den(A, i, j), tden);
-            fmpq_canonicalise(fmpq_mat_entry(A, i, j));
-        }
-        swap = t; t = h; h = swap;
-        fmpz_swap(hden, tden);
+        fmpz_divexact(scale, lcd, A_den + i);
+        if (!fmpz_is_one(scale))
+            _fmpz_vec_scalar_mul_fmpz(fmpz_mat_row(A, i), fmpz_mat_row(A, i), n, scale);
     }
 
-    /* Compute h = poly2 ^ m */
-    _fmpq_poly_mullow(t, tden, h, hden, n, poly2, den2, len2, n);
-    _fmpq_poly_canonicalise(t, tden, n);
-    swap = t; t = h; h = swap;
-    fmpz_swap(hden, tden);
+    fmpz_mat_mul(C, B, A);
+    fmpz_mat_clear(A);
+    _fmpz_vec_clear(A_den, m);
+    fmpz_mat_clear(B);
 
-    /* Matrix multiply */
-    fmpq_mat_mul(C, B, A);
-    fmpq_mat_clear(A);
-    fmpq_mat_clear(B);
+    fmpz_mul(C_den, den1, lcd);
 
+    _fmpz_vec_set(res, fmpz_mat_row(C, m - 1), n);
+    fmpz_set(den, C_den);
+    _fmpq_poly_canonicalise(res, den, n);
     /* Evaluate block composition using the Horner scheme */
-    _fmpq_mat_get_row(res, den, C, m - 1);
-
     for (i = m - 2; i >= 0; i--)
     {
         _fmpq_poly_mullow(t, tden, res, den, n, h, hden, n, n);
-        /* we could canonicalise t here, but it does not seem to make
-           much of a difference */
-        _fmpq_mat_get_row(u, uden, C, i);
-        _fmpq_poly_add(res, den, t, tden, n, u, uden, n);
+        /* Remark: we could canonicalise t and/or fmpz_mat_row(C, i)
+           here; in practice this seems to be slower at least for moderate n. */
+        _fmpq_poly_add(res, den, t, tden, n, fmpz_mat_row(C, i), C_den, n);
     }
 
     _fmpq_poly_canonicalise(res, den, n);
 
-    fmpq_mat_clear(C);
+    fmpz_mat_clear(C);
 
     _fmpz_vec_clear(t, n);
-    _fmpz_vec_clear(u, n);
     _fmpz_vec_clear(h, n);
+    fmpz_clear(lcd);
+    fmpz_clear(scale);
+    fmpz_clear(C_den);
     fmpz_clear(tden);
-    fmpz_clear(uden);
     fmpz_clear(hden);
 }
 
 
@@ -20,13 +20,7 @@ _fmpq_poly_compose_series_horner(fmpz * res, fmpz_t den, const fmpz * poly1,
         const fmpz_t den1, slong len1, const fmpz * poly2,
         const fmpz_t den2, slong len2, slong n)
 {
-    if (fmpz_is_one(den2))
-    {
-        _fmpz_poly_compose_series(res, poly1, len1, poly2, len2, n);
-        fmpz_set(den, den1);
-        _fmpq_poly_canonicalise(res, den, n);
-    }
-    else if (n == 1)
+    if (n == 1)
     {
         fmpz_set(res, poly1);
         fmpz_set(den, den1);