@@ -133,6 +133,9 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
133133 return status ;
134134 }
135135
136+ len1 = FLINT_MIN (len1 , n );
137+ len2 = FLINT_MIN (len2 , n );
138+
136139 /* ---- Compute level count L = ceil(log2(n)) ---- */
137140 L = 0 ;
138141 { slong tmp = n ; while (tmp > 1 ) { tmp = (tmp + 1 ) / 2 ; L ++ ; } }
@@ -174,17 +177,54 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
174177 qoff_arr [L ] = qchain_total ;
175178 qchain_total += xn_arr [L ] * ydeg_arr [L ];
176179
177- /* ---- Allocate Qchain ---- */
180+ /* ---- Compute scratch_total: maximum KS working memory across all levels ---- */
181+ /*
182+ * Both passes use three temporary buffers laid out consecutively in a
183+ * single pre-allocated scratch vector, avoiding GR_TMP_INIT_VEC and
184+ * GR_TMP_CLEAR_VEC calls inside the loops.
185+ *
186+ * Downward level k: Qks (qks_len) | Qneg_ks (qks_len) | Rks (rks_len).
187+ * Upward level k: Qneg_ks (qneg_ks_len) | W_ks (W_ks_len) | R_rks (R_rks_len).
188+ *
189+ * scratch_total = max over all levels of the per-level total size.
190+ */
191+ slong scratch_total = 0 ;
192+ {
193+ slong yn_W_pre = n ;
194+ for (k = 0 ; k < L ; k ++ )
195+ {
196+ slong xn = xn_arr [k ];
197+ slong ydeg = ydeg_arr [k ];
198+ slong xnh = xn_arr [k + 1 ];
199+ slong s = 2 * ydeg - 1 ;
200+ slong qks_len = (xn - 1 ) * s + ydeg ;
201+ slong rks_len = s * (2 * xnh - 1 );
202+ scratch_total = FLINT_MAX (scratch_total , 2 * qks_len + rks_len );
203+ }
204+ for (k = L - 1 ; k >= 0 ; k -- )
205+ {
206+ slong xn = xn_arr [k ];
207+ slong xnh = xn_arr [k + 1 ];
208+ slong ydeg = ydeg_arr [k ];
209+ slong ylo = ylo_arr [k ];
210+ slong ylor = ylo_arr [k + 1 ];
211+ slong yn_r = n - ylor ;
212+ slong yslice_lo = ylo - ylor ;
213+ slong s = ydeg + yn_W_pre - 1 ;
214+ slong qneg_ks_len = (xn - 1 ) * s + ydeg ;
215+ slong W_ks_len = 2 * (xnh - 1 ) * s + yn_W_pre ;
216+ slong R_rks_len = (xn - 1 ) * s + yn_r - yslice_lo ;
217+ scratch_total = FLINT_MAX (scratch_total , qneg_ks_len + W_ks_len + R_rks_len );
218+ yn_W_pre = n - ylo ;
219+ }
220+ }
221+
222+ /* ---- Allocate Qchain and scratch ---- */
178223 gr_ptr Qchain ;
179224 GR_TMP_INIT_VEC (Qchain , qchain_total , ctx );
180225
181- /* ---- Build P(y) = f.reverse(n-1) ---- */
182- gr_ptr P ;
183- GR_TMP_INIT_VEC (P , n , ctx );
184- {
185- slong glen = FLINT_MIN (len1 , n );
186- status |= _gr_poly_reverse (GR_ENTRY (P , n - glen , sz ), poly1 , glen , glen , ctx );
187- }
226+ gr_ptr scratch ;
227+ GR_TMP_INIT_VEC (scratch , scratch_total , ctx );
188228
189229 /* ---- Initialise Q_0 = 1 - y*g(x) ---- */
190230 {
@@ -214,10 +254,15 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
214254 gr_srcptr Qk = GR_ENTRY (Qchain , qoff_arr [k ], sz );
215255 gr_ptr Qk1 = GR_ENTRY (Qchain , qoff_arr [k + 1 ], sz );
216256
217- gr_ptr Qks , Qneg_ks , Rks ;
218- GR_TMP_INIT_VEC (Qks , qks_len , ctx );
219- GR_TMP_INIT_VEC (Qneg_ks , qks_len , ctx );
220- GR_TMP_INIT_VEC (Rks , rks_len , ctx );
257+ gr_ptr Qks = GR_ENTRY (scratch , 0 , sz );
258+ gr_ptr Qneg_ks = GR_ENTRY (scratch , qks_len , sz );
259+ gr_ptr Rks = GR_ENTRY (scratch , 2 * qks_len , sz );
260+
261+ /* Qks and Qneg_ks must be zeroed before packing: the KS encoding
262+ * occupies only qks_len of the s*xn positions; gap slots between
263+ * columns must be zero for the multiply to be correct. */
264+ status |= _gr_vec_zero (Qks , qks_len , ctx );
265+ status |= _gr_vec_zero (Qneg_ks , qks_len , ctx );
221266
222267 /* Pack Q and Q(-x,y): x^i y^j -> flat index i*s + j */
223268 for (slong j = 0 ; j < ydeg ; j ++ )
@@ -231,19 +276,15 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
231276 status |= gr_set (GR_ENTRY (Qneg_ks , i * s + j , sz ), src , ctx );
232277 }
233278
279+ /* Rks is fully written by mullow; no zeroing needed. */
234280 status |= _gr_poly_mullow (Rks , Qks , qks_len , Qneg_ks , qks_len , rks_len , ctx );
235281
236- GR_TMP_CLEAR_VEC (Qks , qks_len , ctx );
237- GR_TMP_CLEAR_VEC (Qneg_ks , qks_len , ctx );
238-
239282 /* Unpack even-x columns -> Q_{k+1}.
240283 * [x^{2i} y^j] lives at flat index 2*i*s + j in the y-major layout. */
241284 for (slong j = 0 ; j < ydA ; j ++ )
242285 for (slong i = 0 ; i < xnh ; i ++ )
243286 status |= gr_set (GR_ENTRY (Qk1 , j * xnh + i , sz ),
244287 GR_ENTRY (Rks , 2 * i * s + j , sz ), ctx );
245-
246- GR_TMP_CLEAR_VEC (Rks , rks_len , ctx );
247288 }
248289
249290 /* ================================================================
@@ -268,9 +309,16 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
268309 slong yn_W = n ; /* ylo_arr[L] == 0 always */
269310
270311 {
312+ /* Build P(y) = f.reverse(n-1) into scratch, which is at least n elements. */
313+ gr_ptr P = scratch ;
314+ slong glen = FLINT_MIN (len1 , n );
315+ status |= _gr_poly_reverse (P , poly1 , glen , glen , ctx );
316+
271317 gr_srcptr QL = GR_ENTRY (Qchain , qoff_arr [L ], sz );
272- slong jmax = FLINT_MIN (ydeg_arr [L ], n );
273- status |= _gr_poly_div_series (W , P , n , QL , jmax , n , ctx );
318+ slong jmax = FLINT_MIN (ydeg_arr [L ], glen );
319+
320+ status |= _gr_poly_div_series (GR_ENTRY (W , n - glen , sz ), P , glen , QL , jmax , glen , ctx );
321+ /* Low (n - glen) coefficients of W are a priori zero. */
274322 }
275323
276324 /* ================================================================
@@ -313,10 +361,16 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
313361 slong R_rks_len = (xn - 1 ) * s + yn_r - yslice_lo ;
314362
315363 gr_srcptr Qk = GR_ENTRY (Qchain , qoff_arr [k ], sz );
316- gr_ptr Qneg_ks , W_ks , R_rks ;
317- GR_TMP_INIT_VEC (Qneg_ks , qneg_ks_len , ctx );
318- GR_TMP_INIT_VEC (W_ks , W_ks_len , ctx );
319- GR_TMP_INIT_VEC (R_rks , R_rks_len , ctx );
364+
365+ gr_ptr Qneg_ks = GR_ENTRY (scratch , 0 , sz );
366+ gr_ptr W_ks = GR_ENTRY (scratch , qneg_ks_len , sz );
367+ gr_ptr R_rks = GR_ENTRY (scratch , qneg_ks_len + W_ks_len , sz );
368+
369+ /* Qneg_ks and W_ks must be zeroed before packing: KS gap slots
370+ * between columns must be zero for the multiply to be correct.
371+ * R_rks is fully written by mulmid; no zeroing needed. */
372+ status |= _gr_vec_zero (Qneg_ks , qneg_ks_len , ctx );
373+ status |= _gr_vec_zero (W_ks , W_ks_len , ctx );
320374
321375 /* Pack Qneg_k: x^i y^j -> flat index i*s + j, with sign flip for odd i */
322376 for (slong j = 0 ; j < ydeg ; j ++ )
@@ -340,9 +394,6 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
340394 W_ks , W_ks_len ,
341395 yslice_lo , yslice_lo + R_rks_len , ctx );
342396
343- GR_TMP_CLEAR_VEC (Qneg_ks , qneg_ks_len , ctx );
344- GR_TMP_CLEAR_VEC (W_ks , W_ks_len , ctx );
345-
346397 /* Unpack y-slice [yslice_lo, yn_r) -> W (in place).
347398 * R_rks[k] holds product coefficient yslice_lo+k.
348399 * R[jsrc][i] was at i*s+jsrc; it is now at i*s+jsrc-yslice_lo in R_rks. */
@@ -355,16 +406,14 @@ _gr_poly_compose_series_kinoshita_li(gr_ptr res, gr_srcptr poly1, slong len1,
355406 GR_ENTRY (R_rks , i * s + jsrc - yslice_lo , sz ), ctx );
356407 }
357408
358- GR_TMP_CLEAR_VEC (R_rks , R_rks_len , ctx );
359-
360409 yn_W = new_yn_W ;
361410 }
362411
363412 /* W[0..n) holds the result f(g(x)) mod x^n. */
364413 status |= _gr_vec_set (res , W , n , ctx );
365414
366415 GR_TMP_CLEAR_VEC (W , W_alloc , ctx );
367- GR_TMP_CLEAR_VEC (P , n , ctx );
416+ GR_TMP_CLEAR_VEC (scratch , scratch_total , ctx );
368417 GR_TMP_CLEAR_VEC (Qchain , qchain_total , ctx );
369418
370419 flint_free (xn_arr );
0 commit comments