Skip to content

Commit c8851d9

Browse files
eendebakpt and claude committed
gh-NNNN: Avoid extra heap alloc per float format in modes 2/3
Two adapter-level tweaks that together save one PyMem_Malloc on every mode-2 (%e/%g) call and on every mode-3 (%f) call with non-negative ndigits: 1. Stack-buffer the common case. d2exp with precision ≤ ~160 fits in 256 bytes; d2fixed with precision ≤ ~380 fits in 768 bytes. This covers essentially every real-world use (default %e/%g precision is 6, default %f is variable but typically single digits). Large precisions (e.g. "%.123456f") still heap-allocate. 2. Add parse_ryu_d2exp_inplace and parse_ryu_d2fixed_inplace: like their existing copy-based counterparts, but they rewrite Ryu's output buffer in place and transfer ownership to *out_digits, so the heap path now costs exactly one allocation instead of two. Benchmark impact (PYTHON_JIT=0, main vs. this commit, geomean over 17 cases): before: 1.78x speedup vs. Gay's dtoa, 5 cases regressing 0.84–0.89x; after: 1.80x speedup vs. Gay's dtoa, 5 cases regressing 0.87–0.90x. The residual regression on %e/%g is algorithmic — Gay's mode-2 has a floating-point fast path for ≲ 15 digits that Ryu doesn't match — so we can't close it from the adapter. Mode-3 (%f, round), where Ryu's uniform performance beats Gay's variable-cost fixed-point path, improves from ~3.20x to ~3.30x. No test-suite change; test_float and test_format still pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 63adf3d commit c8851d9

1 file changed

Lines changed: 177 additions & 16 deletions

File tree

Python/_ryu/pystrtod_ryu.h

Lines changed: 177 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,72 @@ parse_ryu_d2exp_output(const char *ryu_buf, int ryu_len,
252252
return 1;
253253
}
254254

255+
/* -------------------------------------------------------------------------
 * parse_ryu_d2exp_inplace
 *
 * Like parse_ryu_d2exp_output, but rewrites the buffer in place instead of
 * copying the digits out.  On return *out_digits == buf, so ownership of
 * the buffer passes to the caller (who must free it if it was allocated
 * with PyMem_Malloc).  No heap allocation is performed here.
 *
 * Parameters:
 *   buf         `len` bytes of exponential-format text, e.g. "-1.2500e+02",
 *               "nan" or "-inf".  It need not be NUL-terminated on entry,
 *               but must be writable and have room for at least len + 1
 *               bytes, because the terminating NUL may be stored at
 *               buf[len].
 *   len         number of valid bytes in buf.
 *   out_digits  receives buf, now holding the NUL-terminated digit string
 *               (or the special text with any leading '-' removed).
 *   decpt       receives dot position + exponent, or 9999 for a special.
 *   sign        receives 1 if the text started with '-', else 0.
 *   digits_end  receives a pointer to the terminating NUL.
 *
 * Returns 1 (this function has no failure path).
 *
 * Why rewriting in place is safe: every mantissa byte is stored at an index
 * no greater than the index it was read from, so unread input is never
 * clobbered, and the exponent is fully parsed before the terminating NUL
 * is written.
 * ------------------------------------------------------------------------- */
static int
parse_ryu_d2exp_inplace(char *buf, int len,
                        char **out_digits, int *decpt, int *sign,
                        char **digits_end)
{
    const char *src = buf;
    const char *const stop = buf + len;

    *sign = 0;
    if (src < stop && *src == '-') {
        *sign = 1;
        ++src;
    }

    /* Specials (text starting with n/N/i/I): drop the sign and hand the
     * text back unchanged, flagged with decpt == 9999. */
    if (src < stop
        && (*src == 'N' || *src == 'n' || *src == 'I' || *src == 'i'))
    {
        size_t special_len = (size_t)(stop - src);
        if (src != buf) {
            memmove(buf, src, special_len);
        }
        buf[special_len] = '\0';
        *out_digits = buf;
        *digits_end = buf + special_len;
        *decpt = 9999;
        return 1;
    }

    /* Compact the mantissa digits to the front of buf, remembering how
     * many digits preceded the '.'. */
    int mant_len = 0;
    int dot_pos = -1;
    for (; src < stop && *src != 'e' && *src != 'E'; ++src) {
        if (*src == '.') {
            dot_pos = mant_len;
        }
        else {
            buf[mant_len++] = *src;
        }
    }
    if (dot_pos < 0) {
        dot_pos = mant_len;
    }

    int exp = 0;
    if (src < stop && (*src == 'e' || *src == 'E')) {
        ++src;
        int exp_sign = 1;
        if (src < stop && *src == '-') {
            exp_sign = -1;
            ++src;
        }
        else if (src < stop && *src == '+') {
            ++src;
        }
        /* Accumulate decimal digits only: a stray trailing byte must not
         * be folded into the exponent as if it were a digit. */
        while (src < stop && *src >= '0' && *src <= '9') {
            exp = exp * 10 + (*src - '0');
            ++src;
        }
        exp *= exp_sign;
    }

    *decpt = dot_pos + exp;

    /* Strip trailing zeros, but always keep at least one digit. */
    while (mant_len > 1 && buf[mant_len - 1] == '0') {
        --mant_len;
    }

    buf[mant_len] = '\0';
    *out_digits = buf;
    *digits_end = buf + mant_len;
    return 1;
}
320+
255321
/* -------------------------------------------------------------------------
256322
* parse_ryu_d2fixed_output
257323
*
@@ -363,6 +429,71 @@ parse_ryu_d2fixed_output(const char *ryu_buf, int ryu_len,
363429
return 1;
364430
}
365431

432+
/* -------------------------------------------------------------------------
 * parse_ryu_d2fixed_inplace
 *
 * Like parse_ryu_d2fixed_output, but rewrites the buffer in place instead
 * of copying the digits out.  On return *out_digits == buf, so ownership
 * of the buffer passes to the caller.
 *
 * Parameters:
 *   buf         `len` bytes of fixed-point text, e.g. "-0.0500", "123.45",
 *               "nan" or "-inf".  It need not be NUL-terminated on entry,
 *               but must be writable and have room for at least len + 1
 *               bytes (and never fewer than 2), because a terminating NUL
 *               may be stored at buf[len] and the all-zero case stores
 *               "0" plus a NUL.
 *   len         number of valid bytes in buf.
 *   out_digits  receives buf, now holding the NUL-terminated digit string
 *               with the '.' and any leading zeros removed.  Trailing
 *               zeros are left in place.
 *   decpt       receives the number of integer digits minus the number of
 *               leading zeros removed; 1 for an all-zero value; 9999 for a
 *               special.
 *   sign        receives 1 if the text started with '-', else 0.
 *   digits_end  receives a pointer to the terminating NUL.
 *
 * Returns 1 (this function has no failure path).
 * ------------------------------------------------------------------------- */
static int
parse_ryu_d2fixed_inplace(char *buf, int len,
                          char **out_digits, int *decpt, int *sign,
                          char **digits_end)
{
    const char *rd = buf;
    const char *const limit = buf + len;

    if (rd < limit && *rd == '-') {
        *sign = 1;
        ++rd;
    }
    else {
        *sign = 0;
    }

    /* Specials (text starting with n/N/i/I): drop the sign and hand the
     * text back unchanged, flagged with decpt == 9999. */
    if (rd < limit) {
        switch (*rd) {
        case 'N':
        case 'n':
        case 'I':
        case 'i': {
            size_t tail = (size_t)(limit - rd);
            if (rd != buf) {
                memmove(buf, rd, tail);
            }
            buf[tail] = '\0';
            *out_digits = buf;
            *digits_end = buf + tail;
            *decpt = 9999;
            return 1;
        }
        default:
            break;
        }
    }

    /* Squeeze out the '.', noting how many digits came before it.  Each
     * byte is stored at or before the index it was read from, so unread
     * input is never overwritten. */
    int ndig = 0;
    int point_at = -1;
    for (; rd < limit; ++rd) {
        char c = *rd;
        if (c != '.') {
            buf[ndig++] = c;
        }
        else {
            point_at = ndig;
        }
    }
    if (point_at < 0) {
        point_at = ndig;
    }

    /* Count leading zeros. */
    int lead = 0;
    while (lead < ndig && buf[lead] == '0') {
        ++lead;
    }

    /* Nothing but zeros (or nothing at all): report a single "0". */
    if (lead == ndig) {
        buf[0] = '0';
        buf[1] = '\0';
        *decpt = 1;
        *out_digits = buf;
        *digits_end = buf + 1;
        return 1;
    }

    /* Slide the significant digits to the front of the buffer. */
    int kept = ndig - lead;
    *decpt = point_at - lead;
    memmove(buf, buf + lead, (size_t)kept);
    buf[kept] = '\0';

    *out_digits = buf;
    *digits_end = buf + kept;
    return 1;
}
496+
366497
/* -------------------------------------------------------------------------
367498
* ryu_mode3_neg
368499
*
@@ -569,38 +700,68 @@ _PyRyu_dtoa(double d, int mode, int ndigits,
569700
* Gay's mode 2 with ndigits=N gives N significant digits total.
570701
* d2exp with precision=P gives 1 digit before the point and P after,
571702
* for a total of P+1 significant digits.
572-
* So we pass precision = ndigits - 1. */
703+
* So we pass precision = ndigits - 1.
704+
*
705+
* Fast path: for typical precision (fits in 256B), Ryu writes to
706+
* a stack buffer and parse_ryu_d2exp_output copies out the small
707+
* mantissa. Slow path: heap-allocate a work buffer, parse it in
708+
* place, transfer ownership to *out_digits — one heap alloc total.
709+
*/
573710
int precision = (ndigits > 0) ? ndigits - 1 : 0;
574-
char *buf = (char *)PyMem_Malloc(_pyryu_d2exp_bufsize(precision));
575-
if (buf == NULL)
576-
return NULL;
577-
int len = d2exp_buffered_n(d, (uint32_t)precision, buf);
578-
int ok = parse_ryu_d2exp_output(buf, len, &out_digits, decpt, sign,
579-
digits_end);
580-
PyMem_Free(buf);
581-
if (!ok)
582-
return NULL;
711+
size_t need = _pyryu_d2exp_bufsize(precision);
712+
char stack_buf[256];
713+
if (need <= sizeof(stack_buf)) {
714+
int len = d2exp_buffered_n(d, (uint32_t)precision, stack_buf);
715+
if (!parse_ryu_d2exp_output(stack_buf, len, &out_digits, decpt,
716+
sign, digits_end))
717+
return NULL;
718+
}
719+
else {
720+
char *buf = (char *)PyMem_Malloc(need);
721+
if (buf == NULL)
722+
return NULL;
723+
int len = d2exp_buffered_n(d, (uint32_t)precision, buf);
724+
if (!parse_ryu_d2exp_inplace(buf, len, &out_digits, decpt,
725+
sign, digits_end)) {
726+
PyMem_Free(buf);
727+
return NULL;
728+
}
729+
}
583730
break;
584731
}
585732
case 3: {
586733
/* ndigits digits after the decimal point (fixed-point format).
587734
* ndigits < 0 means round to the nearest multiple of 10^(-ndigits),
588-
* used by float.__round__ with a negative argument. */
735+
* used by float.__round__ with a negative argument.
736+
*
737+
* Fast path: for typical precision (fits in 768B — enough for every
738+
* double's integer part plus ≲ 450 fractional digits), use a stack
739+
* buffer + copy-parse. Slow path: heap + in-place parse + steal.
740+
*/
589741
if (ndigits < 0) {
590742
if (!ryu_mode3_neg(d, -ndigits, &out_digits, decpt, sign,
591743
digits_end))
592744
return NULL;
593745
break;
594746
}
595-
char *buf = (char *)PyMem_Malloc(_pyryu_d2fixed_bufsize(ndigits));
747+
size_t need = _pyryu_d2fixed_bufsize(ndigits);
748+
char stack_buf[768];
749+
if (need <= sizeof(stack_buf)) {
750+
int len = d2fixed_buffered_n(d, (uint32_t)ndigits, stack_buf);
751+
if (!parse_ryu_d2fixed_output(stack_buf, len, &out_digits,
752+
decpt, sign, digits_end))
753+
return NULL;
754+
break;
755+
}
756+
char *buf = (char *)PyMem_Malloc(need);
596757
if (buf == NULL)
597758
return NULL;
598759
int len = d2fixed_buffered_n(d, (uint32_t)ndigits, buf);
599-
int ok = parse_ryu_d2fixed_output(buf, len, &out_digits, decpt, sign,
600-
digits_end);
601-
PyMem_Free(buf);
602-
if (!ok)
760+
if (!parse_ryu_d2fixed_inplace(buf, len, &out_digits, decpt, sign,
761+
digits_end)) {
762+
PyMem_Free(buf);
603763
return NULL;
764+
}
604765
break;
605766
}
606767
default:

0 commit comments

Comments
 (0)