Skip to content

Commit 2a1d91d

Browse files
gpshead and claude committed
Enforce z-abbreviation for all-zero groups in ascii85 canonical mode
When canonical=True, reject '!!!!!' (five zero digits) in favor of the 'z' abbreviation. The PLRM spec defines 'z' as the representation for all-zero groups, so '!!!!!' is a non-canonical encoding.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent b5391bd commit 2a1d91d

File tree

2 files changed

+49
-21
lines changed

2 files changed

+49
-21
lines changed

Lib/test/test_binascii.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -924,6 +924,21 @@ def test_ascii85_canonical(self):
924924
self.assertEqual(
925925
binascii.a2b_ascii85(b'@:E_W', canonical=True), b'abcd')
926926

927+
# 'z' is the canonical form for all-zero groups per the PLRM.
928+
# '!!!!!' decodes identically but is non-canonical.
929+
self.assertEqual(binascii.a2b_ascii85(b'!!!!!'), b'\x00' * 4)
930+
self.assertEqual(binascii.a2b_ascii85(b'z'), b'\x00' * 4)
931+
self.assertEqual(
932+
binascii.a2b_ascii85(b'z', canonical=True), b'\x00' * 4)
933+
with self.assertRaises(binascii.Error):
934+
binascii.a2b_ascii85(b'!!!!!', canonical=True)
935+
# Multiple groups: z + !!!!! should fail
936+
with self.assertRaises(binascii.Error):
937+
binascii.a2b_ascii85(b'z!!!!!', canonical=True)
938+
# Multiple z groups are fine
939+
self.assertEqual(
940+
binascii.a2b_ascii85(b'zz', canonical=True), b'\x00' * 8)
941+
927942
# Empty input is valid
928943
self.assertEqual(binascii.a2b_ascii85(b'', canonical=True), b'')
929944

@@ -935,6 +950,7 @@ def test_ascii85_canonical(self):
935950
@hypothesis.given(payload=hypothesis.strategies.binary())
936951
@hypothesis.example(b'')
937952
@hypothesis.example(b'\x00')
953+
@hypothesis.example(b'\x00\x00\x00\x00') # triggers z abbreviation
938954
@hypothesis.example(b'\xff\xff')
939955
@hypothesis.example(b'abc')
940956
def test_ascii85_canonical_roundtrip(self, payload):

Modules/binascii.c

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1124,6 +1124,7 @@ binascii_a2b_ascii85_impl(PyObject *module, Py_buffer *data, int foldspaces,
11241124

11251125
uint32_t leftchar = 0;
11261126
int group_pos = 0;
1127+
int from_z = 0; /* true when current group came from 'z' shorthand */
11271128
for (; ascii_len > 0 || group_pos != 0; ascii_len--, ascii_data++) {
11281129
/* Shift (in radix-85) data or padding into our buffer. */
11291130
unsigned char this_digit;
@@ -1159,6 +1160,7 @@ binascii_a2b_ascii85_impl(PyObject *module, Py_buffer *data, int foldspaces,
11591160
goto error;
11601161
}
11611162
leftchar = this_ch == 'y' ? BASE85_A85_Y : BASE85_A85_Z;
1163+
from_z = (this_ch == 'z');
11621164
group_pos = 5;
11631165
}
11641166
else if (!ignorechar(this_ch, ignorechars, ignorecache)) {
@@ -1193,35 +1195,45 @@ binascii_a2b_ascii85_impl(PyObject *module, Py_buffer *data, int foldspaces,
11931195
*bin_data++ = (leftchar >> (24 - 8 * i)) & 0xff;
11941196
}
11951197

1196-
/* Reject non-canonical encodings in the final group.
1197-
*
1198-
* A partial group of N chars (2-4) encodes N-1 bytes. The
1199-
* decoder pads missing chars with digit 84 (the maximum).
1200-
* The encoder produces the unique N chars for those bytes by
1201-
* zero-padding the bytes to a uint32 and taking the leading
1202-
* N base-85 digits. Two encodings are equivalent iff they
1203-
* yield the same leading digits, i.e. the same quotient when
1204-
* the decoded uint32 is divided by 85**(5-N).
1205-
*
1206-
* So we zero the bottom (4-chunk_len) bytes of leftchar to
1207-
* get the canonical uint32 ("canonical_top") and compare
1208-
* quotients. */
1209-
if (canonical && chunk_len < 4) {
1210-
int n_pad = 4 - chunk_len;
1211-
uint32_t canonical_top =
1212-
(leftchar >> (n_pad * 8)) << (n_pad * 8);
1213-
if (canonical_top / pow85[n_pad]
1214-
!= leftchar / pow85[n_pad])
1215-
{
1198+
if (canonical) {
1199+
/* The PLRM spec requires all-zero groups to use the 'z'
1200+
* abbreviation. Reject '!!!!!' (five zero digits). */
1201+
if (chunk_len == 4 && leftchar == 0 && !from_z) {
12161202
state = get_binascii_state(module);
12171203
if (state != NULL) {
12181204
PyErr_SetString(state->Error,
1219-
"Non-zero padding bits");
1205+
"Non-canonical encoding, "
1206+
"use 'z' for all-zero groups");
12201207
}
12211208
goto error;
12221209
}
1210+
/* Reject non-canonical partial groups.
1211+
*
1212+
* A partial group of N chars (2-4) encodes N-1 bytes.
1213+
* The decoder pads missing chars with digit 84 (the max).
1214+
* The encoder produces the unique N chars for those bytes
1215+
* by zero-padding the bytes to a uint32 and taking the
1216+
* leading N base-85 digits. Two encodings are equivalent
1217+
* iff they yield the same quotient when divided by
1218+
* 85**(5-N). */
1219+
if (chunk_len < 4) {
1220+
int n_pad = 4 - chunk_len;
1221+
uint32_t canonical_top =
1222+
(leftchar >> (n_pad * 8)) << (n_pad * 8);
1223+
if (canonical_top / pow85[n_pad]
1224+
!= leftchar / pow85[n_pad])
1225+
{
1226+
state = get_binascii_state(module);
1227+
if (state != NULL) {
1228+
PyErr_SetString(state->Error,
1229+
"Non-zero padding bits");
1230+
}
1231+
goto error;
1232+
}
1233+
}
12231234
}
12241235

1236+
from_z = 0;
12251237
group_pos = 0;
12261238
leftchar = 0;
12271239
}

0 commit comments

Comments
 (0)