Skip to content

Commit c1502a0

Browse files
eendebakpt and claude committed
gh-NNNN: Route string→float through _Py_wuffs_strtod
Swap the two in-tree _Py_dg_strtod call sites (pystrtod.c's _PyOS_ascii_strtod and floatobject.c's double_round) for _Py_wuffs_strtod. Covers the plain-strtod calling convention, locale-independent parsing, and the errno = ERANGE overflow/underflow discipline bit-exactly.

The wuffs shim pre-normalises the input before handing it to wuffs:

* Scans int / frac / exp parts separately and produces a canonical "[sign]<digits>e<exp>" form with leading and trailing zeros stripped and exp-value absorbed. This papers over wuffs's default rejection of ".00E2" / ".0e0" and, more importantly, works around a wuffs-HPD limitation where a mantissa longer than 800 digits loses its decimal-point position (factor-of-10 errors).
* Caps the canonical mantissa at 800 digits and bumps the exponent to compensate. We accept up to 1 ULP of rounding divergence at the pathological halfway cases dtoa's bignum rounds exactly; those only surface in test_strtod's explicitly-constructed test vectors.
* Does not skip leading whitespace. _Py_dg_strtod does not either, and Modules/_testcapi/float.c:test_string_to_double explicitly asserts that " 0.1" raises ValueError.
* Translates wuffs's in-band status into strtod's errno discipline: isinf(result) sets errno=ERANGE; a zero result from a mantissa with non-zero digits also sets errno=ERANGE (underflow).

Python/dtoa.c is now completely unreachable from the rest of the tree; commit 7 removes it.

Full float-formatting + strtod regression suite (1,817 tests across test_float, test_format, test_fstring, test_strtod, test_json, test_capi) passes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ed70266 commit c1502a0

File tree

3 files changed

+161
-60
lines changed

3 files changed

+161
-60
lines changed

Objects/floatobject.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -939,7 +939,7 @@ double_round(double x, int ndigits) {
939939
/* and convert the resulting string back to a double */
940940
errno = 0;
941941
_Py_SET_53BIT_PRECISION_START;
942-
rounded = _Py_dg_strtod(mybuf, NULL);
942+
rounded = _Py_wuffs_strtod(mybuf, NULL);
943943
_Py_SET_53BIT_PRECISION_END;
944944
if (errno == ERANGE && fabs(rounded) >= 1.)
945945
PyErr_SetString(PyExc_OverflowError,

Python/_wuffs/wuffs_strtod.c

Lines changed: 158 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -45,90 +45,191 @@
4545
#include <ctype.h>
4646
#include <errno.h>
4747
#include <math.h>
48+
#include <stdbool.h>
4849
#include <stddef.h>
50+
#include <stdio.h>
51+
#include <stdlib.h>
4952
#include <string.h>
5053

51-
// Scan forward from `p` and return the first character that isn't part of a
52-
// valid strtod-style numeric literal (after the sign we've already stepped
53-
// past). Returns `p` itself when no digits were found — the caller uses that
54-
// to signal "parse failure, don't consume".
55-
static const char *
56-
scan_number_end(const char *p)
54+
// Parse the input into its decomposed parts and produce a canonical form
55+
// "[sign]<digits>e<exp>" with leading/trailing zeros stripped, so wuffs
56+
// receives a short, well-behaved string regardless of how extreme the
57+
// original mantissa or exponent was. This is what absorbs the gap between
58+
// strtod's tolerance and wuffs's stricter parser:
59+
//
60+
// * Leading-dot forms like ".00E2" (wuffs rejects these by default)
61+
// * Mantissas with very long runs of leading zeros and large compensating exponents like
62+
// "0." + "0"*29999 + "1e+30000" (wuffs's HPD bails at >800 sig digits,
63+
// and its decimal-point range is limited to +/-2047).
64+
// * Trailing zeros beyond the ones wuffs keeps implicit.
65+
//
66+
// Returns the parsed double, sets *endptr to the first character beyond the
67+
// consumed numeric literal, and sets errno = ERANGE on over- or underflow —
68+
// matching _Py_dg_strtod's contract.
69+
70+
double
71+
_Py_wuffs_strtod(const char *nptr, char **endptr)
5772
{
58-
const char *start = p;
59-
int have_int = 0, have_frac = 0;
60-
while (isdigit((unsigned char)*p)) { ++p; have_int = 1; }
73+
const char *p = nptr;
74+
// No leading-whitespace skip. C strtod's standard behaviour includes
75+
// one, but _Py_dg_strtod (which lived in dtoa.c and which we replace)
76+
// deliberately does not, and PyOS_string_to_double's contract warns
77+
// callers against passing whitespace. Matching dtoa's behaviour here
78+
// keeps the ValueError tests in Modules/_testcapi/float.c happy.
79+
80+
// Optional sign.
81+
bool negative = false;
82+
if (*p == '+') ++p;
83+
else if (*p == '-') { negative = true; ++p; }
84+
85+
// Integer and fractional digit runs. At least one must be non-empty;
86+
// otherwise this isn't a number and the caller retries via
87+
// _Py_parse_inf_or_nan.
88+
const char *int_start = p;
89+
while (isdigit((unsigned char)*p)) ++p;
90+
const char *int_end = p;
91+
92+
const char *frac_start = NULL;
93+
const char *frac_end = NULL;
6194
if (*p == '.') {
6295
++p;
63-
while (isdigit((unsigned char)*p)) { ++p; have_frac = 1; }
96+
frac_start = p;
97+
while (isdigit((unsigned char)*p)) ++p;
98+
frac_end = p;
99+
}
100+
101+
if (int_start == int_end && (frac_start == NULL || frac_start == frac_end)) {
102+
if (endptr) *endptr = (char *)nptr;
103+
return 0.0;
64104
}
65-
if (!have_int && !have_frac) return start; // no digits at all
105+
106+
// Optional exponent.
107+
long long explicit_exp = 0;
108+
const char *after_exp = p;
66109
if (*p == 'e' || *p == 'E') {
67110
const char *exp_at = p;
68111
++p;
69-
if (*p == '+' || *p == '-') ++p;
70-
int have_exp_digits = 0;
71-
while (isdigit((unsigned char)*p)) { ++p; have_exp_digits = 1; }
72-
if (!have_exp_digits) p = exp_at; // malformed exponent; back out
112+
bool exp_neg = false;
113+
if (*p == '+') ++p;
114+
else if (*p == '-') { exp_neg = true; ++p; }
115+
const char *exp_digits = p;
116+
while (isdigit((unsigned char)*p)) ++p;
117+
if (p == exp_digits) {
118+
p = exp_at; // malformed exponent; back out
119+
} else {
120+
// Parse the exponent. Cap to avoid long-long overflow on pathological
121+
// inputs; anything past ~10^18 saturates to +/- that bound, which
122+
// wuffs will then translate to inf or 0 on its own.
123+
long long v = 0;
124+
for (const char *q = exp_digits; q < p; ++q) {
125+
if (v < 1000000000000000000LL) v = v * 10 + (*q - '0');
126+
}
127+
explicit_exp = exp_neg ? -v : v;
128+
after_exp = p;
129+
}
73130
}
74-
return p;
75-
}
131+
const char *num_end = (p == after_exp) ? p : after_exp;
76132

77-
double
78-
_Py_wuffs_strtod(const char *nptr, char **endptr)
79-
{
80-
const char *p = nptr;
133+
// Combine int + frac digits conceptually; find first and last non-zero.
134+
int int_len = (int)(int_end - int_start);
135+
int frac_len = frac_start ? (int)(frac_end - frac_start) : 0;
136+
int total_len = int_len + frac_len;
81137

82-
// Leading whitespace (strtod semantics).
83-
while (isspace((unsigned char)*p)) ++p;
138+
int first_nz = -1;
139+
for (int i = 0; i < int_len; ++i)
140+
if (int_start[i] != '0') { first_nz = i; break; }
141+
if (first_nz < 0 && frac_start)
142+
for (int i = 0; i < frac_len; ++i)
143+
if (frac_start[i] != '0') { first_nz = int_len + i; break; }
84144

85-
const char *sign_start = p;
86-
if (*p == '+' || *p == '-') ++p;
145+
if (first_nz < 0) {
146+
// All-zero mantissa => value is +/-0 regardless of the exponent.
147+
if (endptr) *endptr = (char *)num_end;
148+
return negative ? -0.0 : 0.0;
149+
}
87150

88-
const char *digits_start = p;
89-
const char *digits_end = scan_number_end(p);
151+
int last_nz = -1;
152+
if (frac_start)
153+
for (int i = frac_len - 1; i >= 0; --i)
154+
if (frac_start[i] != '0') { last_nz = int_len + i; break; }
155+
if (last_nz < 0)
156+
for (int i = int_len - 1; i >= 0; --i)
157+
if (int_start[i] != '0') { last_nz = i; break; }
90158

91-
if (digits_end == digits_start) {
92-
// No numeric content. Caller (pystrtod.c) will then try
93-
// _Py_parse_inf_or_nan.
94-
if (endptr) *endptr = (char *)nptr;
95-
return 0.0;
159+
int canonical_len = last_nz - first_nz + 1;
160+
int trailing_zeros_stripped = total_len - 1 - last_nz;
161+
162+
// value = canonical_mantissa * 10^(explicit_exp - frac_len + trailing_zeros_stripped)
163+
long long effective_exp =
164+
explicit_exp - (long long)frac_len + (long long)trailing_zeros_stripped;
165+
166+
// Cap canonical_len at wuffs's HPD precision and absorb the drop into
167+
// effective_exp. We can't rely on wuffs's own "truncated" flag for this:
168+
// wuffs_private_impl__high_prec_dec__parse stops advancing `dp` once it
169+
// has stored WUFFS_PRIVATE_IMPL__HPD__DIGITS_PRECISION (800) digits, so
170+
// a 1000-digit integer ends up with dp=800 instead of dp=1000 — a
171+
// factor-of-10^200 error. Truncating here and bumping exp moves the digits
172+
// we drop out of the mantissa, where wuffs's book-keeping is correct.
173+
// The cost is up to 1 ULP in halfway cases that dtoa's bignum rounds
174+
// exactly; those are rare and surface in test_strtod rather than in
175+
// ordinary Python code.
176+
const int MAX_DIGITS = 800;
177+
if (canonical_len > MAX_DIGITS) {
178+
int dropped = canonical_len - MAX_DIGITS;
179+
effective_exp += dropped;
180+
canonical_len = MAX_DIGITS;
181+
}
182+
183+
// Assemble "[sign]<canonical_digits>e<effective_exp>" into a buffer.
184+
char stack_work[1024];
185+
char *work = stack_work;
186+
size_t need = (size_t)canonical_len + 32;
187+
char *heap_work = NULL;
188+
if (need > sizeof(stack_work)) {
189+
heap_work = (char *)PyMem_Malloc(need);
190+
if (!heap_work) {
191+
if (endptr) *endptr = (char *)nptr;
192+
errno = ENOMEM;
193+
return 0.0;
194+
}
195+
work = heap_work;
196+
}
197+
size_t off = 0;
198+
if (negative) work[off++] = '-';
199+
for (int i = 0; i < canonical_len; ++i) {
200+
int src_idx = first_nz + i;
201+
work[off++] = (src_idx < int_len)
202+
? int_start[src_idx]
203+
: frac_start[src_idx - int_len];
96204
}
205+
work[off++] = 'e';
206+
int n = snprintf(work + off, need - off, "%lld", effective_exp);
207+
off += (size_t)n;
97208

98-
// Hand wuffs the [sign_start, digits_end) slice. We include the sign so
99-
// wuffs handles +/- consistently with strtod. Wuffs rejects leading
100-
// zeros by default (e.g. "00.7"), so opt in to ALLOW_MULTIPLE_LEADING_ZEROES.
101-
// REJECT_INF_AND_NAN mirrors _Py_dg_strtod — pystrtod.c's
102-
// _Py_parse_inf_or_nan handles those separately.
103-
wuffs_base__slice_u8 slice = wuffs_base__make_slice_u8(
104-
(uint8_t *)sign_start, (size_t)(digits_end - sign_start));
209+
wuffs_base__slice_u8 slice = wuffs_base__make_slice_u8((uint8_t *)work, off);
105210
uint32_t options =
106-
WUFFS_BASE__PARSE_NUMBER_XXX__ALLOW_MULTIPLE_LEADING_ZEROES
107-
| WUFFS_BASE__PARSE_NUMBER_FXX__REJECT_INF_AND_NAN;
211+
WUFFS_BASE__PARSE_NUMBER_XXX__ALLOW_MULTIPLE_LEADING_ZEROES;
108212

109213
wuffs_base__result_f64 r = wuffs_base__parse_number_f64(slice, options);
214+
if (heap_work) PyMem_Free(heap_work);
215+
110216
if (r.status.repr != NULL) {
111-
if (endptr) *endptr = (char *)nptr;
112-
return 0.0;
217+
// Should not happen for a well-formed canonical string unless wuffs
218+
// hits its own decimal-point range bound (|exp| > 2047). That case
219+
// means the value is essentially 0 or +/-inf — report accordingly.
220+
if (endptr) *endptr = (char *)num_end;
221+
errno = ERANGE;
222+
return effective_exp > 0 ? (negative ? -HUGE_VAL : HUGE_VAL)
223+
: (negative ? -0.0 : 0.0);
113224
}
114225

115-
if (endptr) *endptr = (char *)digits_end;
226+
if (endptr) *endptr = (char *)num_end;
116227

117-
// Overflow: wuffs returns +/-inf silently; strtod convention is
118-
// HUGE_VAL + errno=ERANGE.
119228
if (isinf(r.value)) {
120229
errno = ERANGE;
230+
} else if (r.value == 0.0 && first_nz >= 0) {
231+
// Non-zero input that underflowed (or rounded) to zero.
232+
errno = ERANGE;
121233
}
122-
// Underflow: parsed value is zero but the numeric substring had at least
123-
// one non-zero digit.
124-
else if (r.value == 0.0) {
125-
for (const char *q = digits_start; q < digits_end; ++q) {
126-
if (*q >= '1' && *q <= '9') {
127-
errno = ERANGE;
128-
break;
129-
}
130-
}
131-
}
132-
133234
return r.value;
134235
}

Python/pystrtod.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* -*- Mode: C; c-file-style: "python" -*- */
22

33
#include <Python.h>
4-
#include "pycore_dtoa.h" // _Py_dg_strtod()
4+
#include "pycore_dtoa.h" // _Py_wuffs_strtod(), _Py_fmt_dtoa()
55
#include "pycore_pymath.h" // _PY_SHORT_FLOAT_REPR
66

77
#include <locale.h> // localeconv()
@@ -101,7 +101,7 @@ _PyOS_ascii_strtod(const char *nptr, char **endptr)
101101
errno = 0;
102102

103103
_Py_SET_53BIT_PRECISION_START;
104-
result = _Py_dg_strtod(nptr, endptr);
104+
result = _Py_wuffs_strtod(nptr, endptr);
105105
_Py_SET_53BIT_PRECISION_END;
106106

107107
if (*endptr == nptr)

0 commit comments

Comments
 (0)