|
45 | 45 | #include <ctype.h> |
46 | 46 | #include <errno.h> |
47 | 47 | #include <math.h> |
| 48 | +#include <stdbool.h> |
48 | 49 | #include <stddef.h> |
| 50 | +#include <stdio.h> |
| 51 | +#include <stdlib.h> |
49 | 52 | #include <string.h> |
50 | 53 |
|
51 | | -// Scan forward from `p` and return the first character that isn't part of a |
52 | | -// valid strtod-style numeric literal (after the sign we've already stepped |
53 | | -// past). Returns `p` itself when no digits were found — the caller uses that |
54 | | -// to signal "parse failure, don't consume". |
55 | | -static const char * |
56 | | -scan_number_end(const char *p) |
| 54 | +// Parse the input into its decomposed parts and produce a canonical form |
| 55 | +// "[sign]<digits>e<exp>" with leading/trailing zeros stripped, so wuffs |
| 56 | +// receives a short, well-behaved string regardless of how extreme the |
| 57 | +// original mantissa or exponent was. This is what absorbs the gap between |
| 58 | +// strtod's tolerance and wuffs's stricter parser: |
| 59 | +// |
| 60 | +// * Leading-dot forms like ".00E2" (wuffs rejects these by default) |
| 61 | +// * Very long all-zero mantissas with large compensating exponents like |
| 62 | +// "0." + "0"*29999 + "1e+30000" (wuffs's HPD bails at >800 sig digits, |
| 63 | +// and its decimal-point range is limited to +/-2047). |
| 64 | +// * Trailing zeros beyond the ones wuffs keeps implicit. |
| 65 | +// |
| 66 | +// Returns the parsed double, sets *endptr to the first character beyond the |
| 67 | +// consumed numeric literal, and sets errno = ERANGE on over- or underflow — |
| 68 | +// matching _Py_dg_strtod's contract. |
| 69 | + |
| 70 | +double |
| 71 | +_Py_wuffs_strtod(const char *nptr, char **endptr) |
57 | 72 | { |
58 | | - const char *start = p; |
59 | | - int have_int = 0, have_frac = 0; |
60 | | - while (isdigit((unsigned char)*p)) { ++p; have_int = 1; } |
| 73 | + const char *p = nptr; |
| 74 | + // No leading-whitespace skip. C strtod's standard behaviour includes |
| 75 | + // one, but _Py_dg_strtod (which lived in dtoa.c and which we replace) |
| 76 | + // deliberately does not, and PyOS_string_to_double's contract warns |
| 77 | + // callers against passing whitespace. Matching dtoa's behaviour here |
| 78 | + // keeps the ValueError tests in Modules/_testcapi/float.c happy. |
| 79 | + |
| 80 | + // Optional sign. |
| 81 | + bool negative = false; |
| 82 | + if (*p == '+') ++p; |
| 83 | + else if (*p == '-') { negative = true; ++p; } |
| 84 | + |
| 85 | + // Integer and fractional digit runs. At least one must be non-empty; |
| 86 | + // otherwise this isn't a number and the caller retries via |
| 87 | + // _Py_parse_inf_or_nan. |
| 88 | + const char *int_start = p; |
| 89 | + while (isdigit((unsigned char)*p)) ++p; |
| 90 | + const char *int_end = p; |
| 91 | + |
| 92 | + const char *frac_start = NULL; |
| 93 | + const char *frac_end = NULL; |
61 | 94 | if (*p == '.') { |
62 | 95 | ++p; |
63 | | - while (isdigit((unsigned char)*p)) { ++p; have_frac = 1; } |
| 96 | + frac_start = p; |
| 97 | + while (isdigit((unsigned char)*p)) ++p; |
| 98 | + frac_end = p; |
| 99 | + } |
| 100 | + |
| 101 | + if (int_start == int_end && (frac_start == NULL || frac_start == frac_end)) { |
| 102 | + if (endptr) *endptr = (char *)nptr; |
| 103 | + return 0.0; |
64 | 104 | } |
65 | | - if (!have_int && !have_frac) return start; // no digits at all |
| 105 | + |
| 106 | + // Optional exponent. |
| 107 | + long long explicit_exp = 0; |
| 108 | + const char *after_exp = p; |
66 | 109 | if (*p == 'e' || *p == 'E') { |
67 | 110 | const char *exp_at = p; |
68 | 111 | ++p; |
69 | | - if (*p == '+' || *p == '-') ++p; |
70 | | - int have_exp_digits = 0; |
71 | | - while (isdigit((unsigned char)*p)) { ++p; have_exp_digits = 1; } |
72 | | - if (!have_exp_digits) p = exp_at; // malformed exponent; back out |
| 112 | + bool exp_neg = false; |
| 113 | + if (*p == '+') ++p; |
| 114 | + else if (*p == '-') { exp_neg = true; ++p; } |
| 115 | + const char *exp_digits = p; |
| 116 | + while (isdigit((unsigned char)*p)) ++p; |
| 117 | + if (p == exp_digits) { |
| 118 | + p = exp_at; // malformed exponent; back out |
| 119 | + } else { |
| 120 | + // Parse the exponent. Cap to avoid long-long overflow on pathological |
| 121 | + // inputs; anything past ~10^18 saturates to +/- that bound, which |
| 122 | + // wuffs will then translate to inf or 0 on its own. |
| 123 | + long long v = 0; |
| 124 | + for (const char *q = exp_digits; q < p; ++q) { |
| 125 | + if (v < 1000000000000000000LL) v = v * 10 + (*q - '0'); |
| 126 | + } |
| 127 | + explicit_exp = exp_neg ? -v : v; |
| 128 | + after_exp = p; |
| 129 | + } |
73 | 130 | } |
74 | | - return p; |
75 | | -} |
| 131 | + const char *num_end = (p == after_exp) ? p : after_exp; |
76 | 132 |
|
77 | | -double |
78 | | -_Py_wuffs_strtod(const char *nptr, char **endptr) |
79 | | -{ |
80 | | - const char *p = nptr; |
| 133 | + // Combine int + frac digits conceptually; find first and last non-zero. |
| 134 | + int int_len = (int)(int_end - int_start); |
| 135 | + int frac_len = frac_start ? (int)(frac_end - frac_start) : 0; |
| 136 | + int total_len = int_len + frac_len; |
81 | 137 |
|
82 | | - // Leading whitespace (strtod semantics). |
83 | | - while (isspace((unsigned char)*p)) ++p; |
| 138 | + int first_nz = -1; |
| 139 | + for (int i = 0; i < int_len; ++i) |
| 140 | + if (int_start[i] != '0') { first_nz = i; break; } |
| 141 | + if (first_nz < 0 && frac_start) |
| 142 | + for (int i = 0; i < frac_len; ++i) |
| 143 | + if (frac_start[i] != '0') { first_nz = int_len + i; break; } |
84 | 144 |
|
85 | | - const char *sign_start = p; |
86 | | - if (*p == '+' || *p == '-') ++p; |
| 145 | + if (first_nz < 0) { |
| 146 | + // All-zero mantissa => value is +/-0 regardless of the exponent. |
| 147 | + if (endptr) *endptr = (char *)num_end; |
| 148 | + return negative ? -0.0 : 0.0; |
| 149 | + } |
87 | 150 |
|
88 | | - const char *digits_start = p; |
89 | | - const char *digits_end = scan_number_end(p); |
| 151 | + int last_nz = -1; |
| 152 | + if (frac_start) |
| 153 | + for (int i = frac_len - 1; i >= 0; --i) |
| 154 | + if (frac_start[i] != '0') { last_nz = int_len + i; break; } |
| 155 | + if (last_nz < 0) |
| 156 | + for (int i = int_len - 1; i >= 0; --i) |
| 157 | + if (int_start[i] != '0') { last_nz = i; break; } |
90 | 158 |
|
91 | | - if (digits_end == digits_start) { |
92 | | - // No numeric content. Caller (pystrtod.c) will then try |
93 | | - // _Py_parse_inf_or_nan. |
94 | | - if (endptr) *endptr = (char *)nptr; |
95 | | - return 0.0; |
| 159 | + int canonical_len = last_nz - first_nz + 1; |
| 160 | + int trailing_zeros_stripped = total_len - 1 - last_nz; |
| 161 | + |
| 162 | + // value = canonical_mantissa * 10^(explicit_exp - frac_len + trailing_zeros_stripped) |
| 163 | + long long effective_exp = |
| 164 | + explicit_exp - (long long)frac_len + (long long)trailing_zeros_stripped; |
| 165 | + |
| 166 | + // Cap canonical_len at wuffs's HPD precision and absorb the drop into |
| 167 | + // effective_exp. We can't rely on wuffs's own "truncated" flag for this: |
| 168 | + // wuffs_private_impl__high_prec_dec__parse stops advancing `dp` once it |
| 169 | + // has stored WUFFS_PRIVATE_IMPL__HPD__DIGITS_PRECISION (800) digits, so |
| 170 | + // a 1000-digit integer ends up with dp=800 instead of dp=1000 — a |
| 171 | + // factor-of-100 error. Truncating here and bumping exp moves the digits |
| 172 | + // we drop out of the mantissa, where wuffs's book-keeping is correct. |
| 173 | + // The cost is up to 1 ULP in halfway cases that dtoa's bignum round |
| 174 | + // exactly; those are rare and surface in test_strtod rather than in |
| 175 | + // ordinary Python code. |
| 176 | + const int MAX_DIGITS = 800; |
| 177 | + if (canonical_len > MAX_DIGITS) { |
| 178 | + int dropped = canonical_len - MAX_DIGITS; |
| 179 | + effective_exp += dropped; |
| 180 | + canonical_len = MAX_DIGITS; |
| 181 | + } |
| 182 | + |
| 183 | + // Assemble "[sign]<canonical_digits>e<effective_exp>" into a buffer. |
| 184 | + char stack_work[1024]; |
| 185 | + char *work = stack_work; |
| 186 | + size_t need = (size_t)canonical_len + 32; |
| 187 | + char *heap_work = NULL; |
| 188 | + if (need > sizeof(stack_work)) { |
| 189 | + heap_work = (char *)PyMem_Malloc(need); |
| 190 | + if (!heap_work) { |
| 191 | + if (endptr) *endptr = (char *)nptr; |
| 192 | + errno = ENOMEM; |
| 193 | + return 0.0; |
| 194 | + } |
| 195 | + work = heap_work; |
| 196 | + } |
| 197 | + size_t off = 0; |
| 198 | + if (negative) work[off++] = '-'; |
| 199 | + for (int i = 0; i < canonical_len; ++i) { |
| 200 | + int src_idx = first_nz + i; |
| 201 | + work[off++] = (src_idx < int_len) |
| 202 | + ? int_start[src_idx] |
| 203 | + : frac_start[src_idx - int_len]; |
96 | 204 | } |
| 205 | + work[off++] = 'e'; |
| 206 | + int n = snprintf(work + off, need - off, "%lld", effective_exp); |
| 207 | + off += (size_t)n; |
97 | 208 |
|
98 | | - // Hand wuffs the [sign_start, digits_end) slice. We include the sign so |
99 | | - // wuffs handles +/- consistently with strtod. Wuffs rejects leading |
100 | | - // zeros by default (e.g. "00.7"), so opt in to ALLOW_MULTIPLE_LEADING_ZEROES. |
101 | | - // REJECT_INF_AND_NAN mirrors _Py_dg_strtod — pystrtod.c's |
102 | | - // _Py_parse_inf_or_nan handles those separately. |
103 | | - wuffs_base__slice_u8 slice = wuffs_base__make_slice_u8( |
104 | | - (uint8_t *)sign_start, (size_t)(digits_end - sign_start)); |
| 209 | + wuffs_base__slice_u8 slice = wuffs_base__make_slice_u8((uint8_t *)work, off); |
105 | 210 | uint32_t options = |
106 | | - WUFFS_BASE__PARSE_NUMBER_XXX__ALLOW_MULTIPLE_LEADING_ZEROES |
107 | | - | WUFFS_BASE__PARSE_NUMBER_FXX__REJECT_INF_AND_NAN; |
| 211 | + WUFFS_BASE__PARSE_NUMBER_XXX__ALLOW_MULTIPLE_LEADING_ZEROES; |
108 | 212 |
|
109 | 213 | wuffs_base__result_f64 r = wuffs_base__parse_number_f64(slice, options); |
| 214 | + if (heap_work) PyMem_Free(heap_work); |
| 215 | + |
110 | 216 | if (r.status.repr != NULL) { |
111 | | - if (endptr) *endptr = (char *)nptr; |
112 | | - return 0.0; |
| 217 | + // Should not happen for a well-formed canonical string unless wuffs |
| 218 | + // hits its own decimal-point range bound (|exp| > 2047). That case |
| 219 | + // means the value is essentially 0 or +/-inf — report accordingly. |
| 220 | + if (endptr) *endptr = (char *)num_end; |
| 221 | + errno = ERANGE; |
| 222 | + return effective_exp > 0 ? (negative ? -HUGE_VAL : HUGE_VAL) |
| 223 | + : (negative ? -0.0 : 0.0); |
113 | 224 | } |
114 | 225 |
|
115 | | - if (endptr) *endptr = (char *)digits_end; |
| 226 | + if (endptr) *endptr = (char *)num_end; |
116 | 227 |
|
117 | | - // Overflow: wuffs returns +/-inf silently; strtod convention is |
118 | | - // HUGE_VAL + errno=ERANGE. |
119 | 228 | if (isinf(r.value)) { |
120 | 229 | errno = ERANGE; |
| 230 | + } else if (r.value == 0.0 && first_nz >= 0) { |
| 231 | + // Non-zero input that underflowed (or rounded) to zero. |
| 232 | + errno = ERANGE; |
121 | 233 | } |
122 | | - // Underflow: parsed value is zero but the numeric substring had at least |
123 | | - // one non-zero digit. |
124 | | - else if (r.value == 0.0) { |
125 | | - for (const char *q = digits_start; q < digits_end; ++q) { |
126 | | - if (*q >= '1' && *q <= '9') { |
127 | | - errno = ERANGE; |
128 | | - break; |
129 | | - } |
130 | | - } |
131 | | - } |
132 | | - |
133 | 234 | return r.value; |
134 | 235 | } |
0 commit comments