Enforce z-abbreviation for all-zero groups in ascii85 canonical mode

gpshead · claude · gpshead · commit 2a1d91d3a26c · 2026-04-05T19:31:09.000Z
When canonical=True, reject '!!!!!' (five zero digits) in favor of
the 'z' abbreviation. The PostScript Language Reference Manual (PLRM)
defines 'z' as the representation for all-zero groups, so '!!!!!' is
a non-canonical encoding.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py
@@ -924,6 +924,21 @@ def test_ascii85_canonical(self):
         self.assertEqual(
             binascii.a2b_ascii85(b'@:E_W', canonical=True), b'abcd')
 
+        # 'z' is the canonical form for all-zero groups per the PLRM.
+        # '!!!!!' decodes identically but is non-canonical.
+        self.assertEqual(binascii.a2b_ascii85(b'!!!!!'), b'\x00' * 4)
+        self.assertEqual(binascii.a2b_ascii85(b'z'), b'\x00' * 4)
+        self.assertEqual(
+            binascii.a2b_ascii85(b'z', canonical=True), b'\x00' * 4)
+        with self.assertRaises(binascii.Error):
+            binascii.a2b_ascii85(b'!!!!!', canonical=True)
+        # Multiple groups: z + !!!!! should fail
+        with self.assertRaises(binascii.Error):
+            binascii.a2b_ascii85(b'z!!!!!', canonical=True)
+        # Multiple z groups are fine
+        self.assertEqual(
+            binascii.a2b_ascii85(b'zz', canonical=True), b'\x00' * 8)
+
         # Empty input is valid
         self.assertEqual(binascii.a2b_ascii85(b'', canonical=True), b'')
 
@@ -935,6 +950,7 @@ def test_ascii85_canonical(self):
     @hypothesis.given(payload=hypothesis.strategies.binary())
     @hypothesis.example(b'')
     @hypothesis.example(b'\x00')
+    @hypothesis.example(b'\x00\x00\x00\x00')  # triggers z abbreviation
     @hypothesis.example(b'\xff\xff')
     @hypothesis.example(b'abc')
     def test_ascii85_canonical_roundtrip(self, payload):
diff --git a/Modules/binascii.c b/Modules/binascii.c
@@ -1124,6 +1124,7 @@ binascii_a2b_ascii85_impl(PyObject *module, Py_buffer *data, int foldspaces,
 
     uint32_t leftchar = 0;
     int group_pos = 0;
+    int from_z = 0;  /* true when current group came from 'z' shorthand */
     for (; ascii_len > 0 || group_pos != 0; ascii_len--, ascii_data++) {
         /* Shift (in radix-85) data or padding into our buffer. */
         unsigned char this_digit;
@@ -1159,6 +1160,7 @@ binascii_a2b_ascii85_impl(PyObject *module, Py_buffer *data, int foldspaces,
                 goto error;
             }
             leftchar = this_ch == 'y' ? BASE85_A85_Y : BASE85_A85_Z;
+            from_z = (this_ch == 'z');
             group_pos = 5;
         }
         else if (!ignorechar(this_ch, ignorechars, ignorecache)) {
@@ -1193,35 +1195,45 @@ binascii_a2b_ascii85_impl(PyObject *module, Py_buffer *data, int foldspaces,
             *bin_data++ = (leftchar >> (24 - 8 * i)) & 0xff;
         }
 
-        /* Reject non-canonical encodings in the final group.
-         *
-         * A partial group of N chars (2-4) encodes N-1 bytes.  The
-         * decoder pads missing chars with digit 84 (the maximum).
-         * The encoder produces the unique N chars for those bytes by
-         * zero-padding the bytes to a uint32 and taking the leading
-         * N base-85 digits.  Two encodings are equivalent iff they
-         * yield the same leading digits, i.e. the same quotient when
-         * the decoded uint32 is divided by 85**(5-N).
-         *
-         * So we zero the bottom (4-chunk_len) bytes of leftchar to
-         * get the canonical uint32 ("canonical_top") and compare
-         * quotients. */
-        if (canonical && chunk_len < 4) {
-            int n_pad = 4 - chunk_len;
-            uint32_t canonical_top =
-                (leftchar >> (n_pad * 8)) << (n_pad * 8);
-            if (canonical_top / pow85[n_pad]
-                    != leftchar / pow85[n_pad])
-            {
+        if (canonical) {
+            /* The PLRM spec requires all-zero groups to use the 'z'
+             * abbreviation.  Reject '!!!!!' (five zero digits). */
+            if (chunk_len == 4 && leftchar == 0 && !from_z) {
                 state = get_binascii_state(module);
                 if (state != NULL) {
                     PyErr_SetString(state->Error,
-                                    "Non-zero padding bits");
+                                    "Non-canonical encoding, "
+                                    "use 'z' for all-zero groups");
                 }
                 goto error;
             }
+            /* Reject non-canonical partial groups.
+             *
+             * A partial group of N chars (2-4) encodes N-1 bytes.
+             * The decoder pads missing chars with digit 84 (the max).
+             * The encoder produces the unique N chars for those bytes
+             * by zero-padding the bytes to a uint32 and taking the
+             * leading N base-85 digits.  Two encodings are equivalent
+             * iff they yield the same quotient when divided by
+             * 85**(5-N). */
+            if (chunk_len < 4) {
+                int n_pad = 4 - chunk_len;
+                uint32_t canonical_top =
+                    (leftchar >> (n_pad * 8)) << (n_pad * 8);
+                if (canonical_top / pow85[n_pad]
+                        != leftchar / pow85[n_pad])
+                {
+                    state = get_binascii_state(module);
+                    if (state != NULL) {
+                        PyErr_SetString(state->Error,
+                                        "Non-zero padding bits");
+                    }
+                    goto error;
+                }
+            }
         }
 
+        from_z = 0;
         group_pos = 0;
         leftchar = 0;
     }