Skip to content

Commit 9b78bdd

Browse files
gpshead and claude committed
Simplify base85 canonical check using integer division
Replace the re-encode-and-compare loops with a quotient comparison: two divisions by 85**n_pad tell us whether the decoded uint32 and the zero-padded output bytes share the same leading base-85 digits.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 308433a commit 9b78bdd

File tree

1 file changed

+31
-42
lines changed

1 file changed

+31
-42
lines changed

Modules/binascii.c

Lines changed: 31 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,9 @@ static const _Py_ALIGNED_DEF(64, unsigned char) table_b2a_base85_a85[] =
244244
#define BASE85_A85_Z 0x00000000
245245
#define BASE85_A85_Y 0x20202020
246246

247+
/* 85**0 through 85**4, used for canonical encoding checks. */
248+
static const uint32_t pow85[] = {
    /* Index is the exponent: pow85[k] == 85**k.  All entries fit in 32 bits. */
    [0] = 1,
    [1] = 85,
    [2] = 7225,
    [3] = 614125,
    [4] = 52200625,
};
249+
247250

248251
static const _Py_ALIGNED_DEF(64, unsigned char) table_a2b_base32[] = {
249252
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
@@ -1178,7 +1181,20 @@ binascii_a2b_ascii85_impl(PyObject *module, Py_buffer *data, int foldspaces,
11781181
*bin_data++ = (leftchar >> (24 - 8 * i)) & 0xff;
11791182
}
11801183

1181-
/* Reject non-canonical encodings in the final group. */
1184+
/* Reject non-canonical encodings in the final group.
1185+
*
1186+
* A partial group of N chars (2-4) encodes N-1 bytes. The
1187+
* decoder pads missing chars with digit 84 (the maximum).
1188+
* The encoder produces the unique N chars for those bytes by
1189+
* zero-padding the bytes to a uint32 and taking the leading
1190+
* N base-85 digits. Two encodings are equivalent iff they
1191+
* yield the same leading digits, i.e. the same quotient when
1192+
* the decoded uint32 is divided by 85**(5-N).
1193+
*
1194+
* So we zero the bottom (4-chunk_len) bytes of leftchar to
1195+
* get the canonical uint32 ("canonical_top") and compare
1196+
* quotients. A 1-char group (chunk_len==0) is always
1197+
* non-canonical since no conforming encoder produces it. */
11821198
if (canonical && chunk_len < 4) {
11831199
if (chunk_len == 0) {
11841200
state = get_binascii_state(module);
@@ -1188,23 +1204,12 @@ binascii_a2b_ascii85_impl(PyObject *module, Py_buffer *data, int foldspaces,
11881204
}
11891205
goto error;
11901206
}
1191-
uint32_t canon = 0;
1192-
for (Py_ssize_t i = chunk_len; i > 0; i--) {
1193-
canon = (canon << 8) | bin_data[-i];
1194-
}
1195-
canon <<= (4 - chunk_len) * 8;
1196-
unsigned char digits[5];
1197-
uint32_t tmp = canon;
1198-
for (int i = 4; i >= 0; i--) {
1199-
digits[i] = tmp % 85;
1200-
tmp /= 85;
1201-
}
1202-
uint32_t expected = 0;
1203-
for (int i = 0; i < 5; i++) {
1204-
expected = expected * 85
1205-
+ (i <= chunk_len ? digits[i] : 84);
1206-
}
1207-
if (expected != leftchar) {
1207+
int n_pad = 4 - chunk_len;
1208+
uint32_t canonical_top =
1209+
(leftchar >> (n_pad * 8)) << (n_pad * 8);
1210+
if (canonical_top / pow85[n_pad]
1211+
!= leftchar / pow85[n_pad])
1212+
{
12081213
state = get_binascii_state(module);
12091214
if (state != NULL) {
12101215
PyErr_SetString(state->Error,
@@ -1461,39 +1466,23 @@ binascii_a2b_base85_impl(PyObject *module, Py_buffer *data,
14611466
*bin_data++ = (leftchar >> (24 - 8 * i)) & 0xff;
14621467
}
14631468

1464-
/* Reject non-canonical encodings in the final group. */
1469+
/* Reject non-canonical encodings in the final group.
1470+
* See the comment in a2b_ascii85 for the full explanation. */
14651471
if (canonical && chunk_len < 4) {
14661472
if (chunk_len == 0) {
1467-
/* 1-char partial group is never produced by a conforming
1468-
* encoder. */
14691473
state = get_binascii_state(module);
14701474
if (state != NULL) {
14711475
PyErr_SetString(state->Error,
14721476
"Non-canonical Base85 group size");
14731477
}
14741478
goto error;
14751479
}
1476-
/* Re-encode the output bytes to verify canonical form.
1477-
* Build the canonical uint32 from output bytes (zero-padded). */
1478-
uint32_t canon = 0;
1479-
for (Py_ssize_t i = chunk_len; i > 0; i--) {
1480-
canon = (canon << 8) | bin_data[-i];
1481-
}
1482-
canon <<= (4 - chunk_len) * 8;
1483-
/* Extract first (chunk_len + 1) base85 digits. */
1484-
unsigned char digits[5];
1485-
uint32_t tmp = canon;
1486-
for (int i = 4; i >= 0; i--) {
1487-
digits[i] = tmp % 85;
1488-
tmp /= 85;
1489-
}
1490-
/* Reconstruct expected value: canonical digits + 84-padding. */
1491-
uint32_t expected = 0;
1492-
for (int i = 0; i < 5; i++) {
1493-
expected = expected * 85
1494-
+ (i <= chunk_len ? digits[i] : 84);
1495-
}
1496-
if (expected != leftchar) {
1480+
int n_pad = 4 - chunk_len;
1481+
uint32_t canonical_top =
1482+
(leftchar >> (n_pad * 8)) << (n_pad * 8);
1483+
if (canonical_top / pow85[n_pad]
1484+
!= leftchar / pow85[n_pad])
1485+
{
14971486
state = get_binascii_state(module);
14981487
if (state != NULL) {
14991488
PyErr_SetString(state->Error,

0 commit comments

Comments
 (0)