gh-NNNN: Route string→float through _Py_wuffs_strtod

eendebakpt · claude · eendebakpt · commit c1502a035ebf · 2026-04-21T21:25:54.000+02:00
Swap the two in-tree _Py_dg_strtod call sites (pystrtod.c's
_PyOS_ascii_strtod and floatobject.c's double_round) for _Py_wuffs_strtod.
Covers the plain-strtod calling convention, locale-independent parsing,
and the errno = ERANGE overflow/underflow discipline bit-exactly.

The wuffs shim pre-normalises the input before handing it to wuffs:

  * Scans int / frac / exp parts separately and produces a canonical
    "[sign]<digits>e<exp>" form with leading and trailing zeros stripped
    and exp-value absorbed. This papers over wuffs's default rejection of
    ".00E2" / ".0e0" and, more importantly, works around a wuffs-HPD
    limitation where a mantissa longer than 800 digits loses its
    decimal-point position (factor-of-10 errors).

  * Caps the canonical mantissa at 800 digits and bumps the exponent to
    compensate. We accept up to 1 ULP of rounding divergence at the
    pathological halfway cases dtoa's bignum rounds exactly; those only
    surface in test_strtod's explicitly-constructed test vectors.

  * Does not skip leading whitespace. _Py_dg_strtod does not either, and
    Modules/_testcapi/float.c:test_string_to_double explicitly asserts
    that " 0.1" raises ValueError.

  * Translates wuffs's in-band status into strtod's errno discipline:
    isinf(result) sets errno=ERANGE; a zero result from a mantissa with
    non-zero digits also sets errno=ERANGE (underflow).

Python/dtoa.c is now completely unreachable from the rest of the tree;
commit 7 removes it. Full float-formatting + strtod regression suite
(1,817 tests across test_float, test_format, test_fstring, test_strtod,
test_json, test_capi) passes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/Objects/floatobject.c b/Objects/floatobject.c
@@ -939,7 +939,7 @@ double_round(double x, int ndigits) {
     /* and convert the resulting string back to a double */
     errno = 0;
     _Py_SET_53BIT_PRECISION_START;
-    rounded = _Py_dg_strtod(mybuf, NULL);
+    rounded = _Py_wuffs_strtod(mybuf, NULL);
     _Py_SET_53BIT_PRECISION_END;
     if (errno == ERANGE && fabs(rounded) >= 1.)
         PyErr_SetString(PyExc_OverflowError,
diff --git a/Python/_wuffs/wuffs_strtod.c b/Python/_wuffs/wuffs_strtod.c
@@ -45,90 +45,191 @@
 #include <ctype.h>
 #include <errno.h>
 #include <math.h>
+#include <stdbool.h>
 #include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 
-// Scan forward from `p` and return the first character that isn't part of a
-// valid strtod-style numeric literal (after the sign we've already stepped
-// past). Returns `p` itself when no digits were found — the caller uses that
-// to signal "parse failure, don't consume".
-static const char *
-scan_number_end(const char *p)
+// Parse the input into its decomposed parts and produce a canonical form
+// "[sign]<digits>e<exp>" with leading/trailing zeros stripped, so wuffs
+// receives a short, well-behaved string regardless of how extreme the
+// original mantissa or exponent was. This is what absorbs the gap between
+// strtod's tolerance and wuffs's stricter parser:
+//
+//   * Leading-dot forms like ".00E2"  (wuffs rejects these by default)
+//   * Mantissas with very long runs of leading zeros offset by a large
+//     exponent, like "0." + "0"*29999 + "1e+30000" (wuffs's HPD bails at
+//     >800 significant digits, and its decimal-point range is +/-2047).
+//   * Trailing zeros beyond the ones wuffs keeps implicit.
+//
+// Returns the parsed double, sets *endptr to the first character beyond the
+// consumed numeric literal, and sets errno = ERANGE on over- or underflow —
+// matching _Py_dg_strtod's contract.
+
+double
+_Py_wuffs_strtod(const char *nptr, char **endptr)
 {
-    const char *start = p;
-    int have_int = 0, have_frac = 0;
-    while (isdigit((unsigned char)*p)) { ++p; have_int = 1; }
+    const char *p = nptr;
+    // No leading-whitespace skip. C strtod's standard behaviour includes
+    // one, but _Py_dg_strtod (which lived in dtoa.c and which we replace)
+    // deliberately does not, and PyOS_string_to_double's contract warns
+    // callers against passing whitespace. Matching dtoa's behaviour here
+    // keeps the ValueError tests in Modules/_testcapi/float.c happy.
+
+    // Optional sign.
+    bool negative = false;
+    if (*p == '+') ++p;
+    else if (*p == '-') { negative = true; ++p; }
+
+    // Integer and fractional digit runs. At least one must be non-empty;
+    // otherwise this isn't a number and the caller retries via
+    // _Py_parse_inf_or_nan.
+    const char *int_start = p;
+    while (isdigit((unsigned char)*p)) ++p;
+    const char *int_end = p;
+
+    const char *frac_start = NULL;
+    const char *frac_end = NULL;
     if (*p == '.') {
         ++p;
-        while (isdigit((unsigned char)*p)) { ++p; have_frac = 1; }
+        frac_start = p;
+        while (isdigit((unsigned char)*p)) ++p;
+        frac_end = p;
+    }
+
+    if (int_start == int_end && (frac_start == NULL || frac_start == frac_end)) {
+        if (endptr) *endptr = (char *)nptr;
+        return 0.0;
     }
-    if (!have_int && !have_frac) return start;  // no digits at all
+
+    // Optional exponent.
+    long long explicit_exp = 0;
+    const char *after_exp = p;
     if (*p == 'e' || *p == 'E') {
         const char *exp_at = p;
         ++p;
-        if (*p == '+' || *p == '-') ++p;
-        int have_exp_digits = 0;
-        while (isdigit((unsigned char)*p)) { ++p; have_exp_digits = 1; }
-        if (!have_exp_digits) p = exp_at;   // malformed exponent; back out
+        bool exp_neg = false;
+        if (*p == '+') ++p;
+        else if (*p == '-') { exp_neg = true; ++p; }
+        const char *exp_digits = p;
+        while (isdigit((unsigned char)*p)) ++p;
+        if (p == exp_digits) {
+            p = exp_at;           // malformed exponent; back out
+        } else {
+            // Parse the exponent. Stop accumulating once v reaches 10^17 so
+            // that v * 10 + 9 stays below LLONG_MAX (~9.22e18); a 10^18 cap
+            // would overflow on a 19th digit. Such exponents give inf or 0.
+            long long v = 0;
+            for (const char *q = exp_digits; q < p; ++q) {
+                if (v < 100000000000000000LL) v = v * 10 + (*q - '0');
+            }
+            explicit_exp = exp_neg ? -v : v;
+            after_exp = p;
+        }
     }
-    return p;
-}
+    const char *num_end = (p == after_exp) ? p : after_exp;
 
-double
-_Py_wuffs_strtod(const char *nptr, char **endptr)
-{
-    const char *p = nptr;
+    // Combine int + frac digits conceptually; find first and last non-zero.
+    int int_len = (int)(int_end - int_start);
+    int frac_len = frac_start ? (int)(frac_end - frac_start) : 0;
+    int total_len = int_len + frac_len;
 
-    // Leading whitespace (strtod semantics).
-    while (isspace((unsigned char)*p)) ++p;
+    int first_nz = -1;
+    for (int i = 0; i < int_len; ++i)
+        if (int_start[i] != '0') { first_nz = i; break; }
+    if (first_nz < 0 && frac_start)
+        for (int i = 0; i < frac_len; ++i)
+            if (frac_start[i] != '0') { first_nz = int_len + i; break; }
 
-    const char *sign_start = p;
-    if (*p == '+' || *p == '-') ++p;
+    if (first_nz < 0) {
+        // All-zero mantissa => value is +/-0 regardless of the exponent.
+        if (endptr) *endptr = (char *)num_end;
+        return negative ? -0.0 : 0.0;
+    }
 
-    const char *digits_start = p;
-    const char *digits_end = scan_number_end(p);
+    int last_nz = -1;
+    if (frac_start)
+        for (int i = frac_len - 1; i >= 0; --i)
+            if (frac_start[i] != '0') { last_nz = int_len + i; break; }
+    if (last_nz < 0)
+        for (int i = int_len - 1; i >= 0; --i)
+            if (int_start[i] != '0') { last_nz = i; break; }
 
-    if (digits_end == digits_start) {
-        // No numeric content. Caller (pystrtod.c) will then try
-        // _Py_parse_inf_or_nan.
-        if (endptr) *endptr = (char *)nptr;
-        return 0.0;
+    int canonical_len = last_nz - first_nz + 1;
+    int trailing_zeros_stripped = total_len - 1 - last_nz;
+
+    // value = canonical_mantissa * 10^(explicit_exp - frac_len + trailing_zeros_stripped)
+    long long effective_exp =
+        explicit_exp - (long long)frac_len + (long long)trailing_zeros_stripped;
+
+    // Cap canonical_len at wuffs's HPD precision and absorb the drop into
+    // effective_exp. We can't rely on wuffs's own "truncated" flag for this:
+    // wuffs_private_impl__high_prec_dec__parse stops advancing `dp` once it
+    // has stored WUFFS_PRIVATE_IMPL__HPD__DIGITS_PRECISION (800) digits, so
+    // a 1000-digit integer ends up with dp=800 instead of dp=1000 — a
+    // factor-of-100 error. Truncating here and bumping exp moves the digits
+    // we drop out of the mantissa, where wuffs's book-keeping is correct.
+    // The cost is up to 1 ULP in halfway cases that dtoa's bignum rounds
+    // exactly; those are rare and surface in test_strtod rather than in
+    // ordinary Python code.
+    const int MAX_DIGITS = 800;
+    if (canonical_len > MAX_DIGITS) {
+        int dropped = canonical_len - MAX_DIGITS;
+        effective_exp += dropped;
+        canonical_len = MAX_DIGITS;
+    }
+
+    // Assemble "[sign]<canonical_digits>e<effective_exp>" into a buffer.
+    char stack_work[1024];
+    char *work = stack_work;
+    size_t need = (size_t)canonical_len + 32;
+    char *heap_work = NULL;
+    if (need > sizeof(stack_work)) {
+        heap_work = (char *)PyMem_Malloc(need);
+        if (!heap_work) {
+            if (endptr) *endptr = (char *)nptr;
+            errno = ENOMEM;
+            return 0.0;
+        }
+        work = heap_work;
+    }
+    size_t off = 0;
+    if (negative) work[off++] = '-';
+    for (int i = 0; i < canonical_len; ++i) {
+        int src_idx = first_nz + i;
+        work[off++] = (src_idx < int_len)
+                          ? int_start[src_idx]
+                          : frac_start[src_idx - int_len];
     }
+    work[off++] = 'e';
+    int n = snprintf(work + off, need - off, "%lld", effective_exp);
+    off += (size_t)n;
 
-    // Hand wuffs the [sign_start, digits_end) slice. We include the sign so
-    // wuffs handles +/- consistently with strtod. Wuffs rejects leading
-    // zeros by default (e.g. "00.7"), so opt in to ALLOW_MULTIPLE_LEADING_ZEROES.
-    // REJECT_INF_AND_NAN mirrors _Py_dg_strtod — pystrtod.c's
-    // _Py_parse_inf_or_nan handles those separately.
-    wuffs_base__slice_u8 slice = wuffs_base__make_slice_u8(
-        (uint8_t *)sign_start, (size_t)(digits_end - sign_start));
+    wuffs_base__slice_u8 slice = wuffs_base__make_slice_u8((uint8_t *)work, off);
     uint32_t options =
-        WUFFS_BASE__PARSE_NUMBER_XXX__ALLOW_MULTIPLE_LEADING_ZEROES
-        | WUFFS_BASE__PARSE_NUMBER_FXX__REJECT_INF_AND_NAN;
+        WUFFS_BASE__PARSE_NUMBER_XXX__ALLOW_MULTIPLE_LEADING_ZEROES;
 
     wuffs_base__result_f64 r = wuffs_base__parse_number_f64(slice, options);
+    if (heap_work) PyMem_Free(heap_work);
+
     if (r.status.repr != NULL) {
-        if (endptr) *endptr = (char *)nptr;
-        return 0.0;
+        // Should not happen for a well-formed canonical string unless wuffs
+        // hits its own decimal-point range bound (|exp| > 2047). That case
+        // means the value is essentially 0 or +/-inf — report accordingly.
+        if (endptr) *endptr = (char *)num_end;
+        errno = ERANGE;
+        return effective_exp > 0 ? (negative ? -HUGE_VAL : HUGE_VAL)
+                                 : (negative ? -0.0 : 0.0);
     }
 
-    if (endptr) *endptr = (char *)digits_end;
+    if (endptr) *endptr = (char *)num_end;
 
-    // Overflow: wuffs returns +/-inf silently; strtod convention is
-    // HUGE_VAL + errno=ERANGE.
     if (isinf(r.value)) {
         errno = ERANGE;
+    } else if (r.value == 0.0 && first_nz >= 0) {
+        // Non-zero input that underflowed (or rounded) to zero.
+        errno = ERANGE;
     }
-    // Underflow: parsed value is zero but the numeric substring had at least
-    // one non-zero digit.
-    else if (r.value == 0.0) {
-        for (const char *q = digits_start; q < digits_end; ++q) {
-            if (*q >= '1' && *q <= '9') {
-                errno = ERANGE;
-                break;
-            }
-        }
-    }
-
     return r.value;
 }
diff --git a/Python/pystrtod.c b/Python/pystrtod.c
@@ -1,7 +1,7 @@
 /* -*- Mode: C; c-file-style: "python" -*- */
 
 #include <Python.h>
-#include "pycore_dtoa.h"          // _Py_dg_strtod()
+#include "pycore_dtoa.h"          // _Py_wuffs_strtod(), _Py_fmt_dtoa()
 #include "pycore_pymath.h"        // _PY_SHORT_FLOAT_REPR
 
 #include <locale.h>               // localeconv()
@@ -101,7 +101,7 @@ _PyOS_ascii_strtod(const char *nptr, char **endptr)
     errno = 0;
 
     _Py_SET_53BIT_PRECISION_START;
-    result = _Py_dg_strtod(nptr, endptr);
+    result = _Py_wuffs_strtod(nptr, endptr);
     _Py_SET_53BIT_PRECISION_END;
 
     if (*endptr == nptr)