Fine-tuned checksum codecs

dhondta · dhondta · commit 16293996c86b · 2026-03-24T22:40:52.000+01:00
diff --git a/src/codext/__common__.py b/src/codext/__common__.py
@@ -1,5 +1,6 @@
 # -*- coding: UTF-8 -*-
 import _codecs
+import builtins
 import codecs
 import hashlib
 import json
@@ -20,22 +21,6 @@
 from random import randint
 from string import *
 from types import FunctionType, ModuleType
-try:  # Python2
-    import __builtin__ as builtins
-except ImportError:
-    import builtins
-try:  # Python2
-    from inspect import getfullargspec
-except ImportError:
-    from inspect import getargspec as getfullargspec
-try:  # Python2
-    from string import maketrans
-except ImportError:
-    maketrans = str.maketrans
-try:  # Python3
-    from importlib import reload
-except ImportError:
-    pass
 try:
     import re._parser as sre_parse
 except ImportError:
@@ -44,6 +29,8 @@
 # from Python 3.11, 'sre_parse' is bound as '_parser' ; monkey-patch it for backward-compatibility
 re.sre_parse = sre_parse
 
+maketrans = str.maketrans
+
 
 __all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess",
            "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "hashlib", "i2s",
@@ -277,6 +264,7 @@ def getregentry(encoding):
                 # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ;
                 #  in this case, if fenc/fdec is a decorated function, execute it with no arg
                 if len(args) == 0:
+                    from inspect import getfullargspec
                     if fenc and len(getfullargspec(fenc).args) == 1:
                         fenc = fenc()
                     if fdec and len(getfullargspec(fdec).args) == 1:
@@ -767,6 +755,7 @@ def remove(name):
 
 def reset():
     """ Reset codext's local registry of search functions and macros. """
+    from importlib import reload
     global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS  # noqa: F824
     clear()
     d = os.path.dirname(__file__)
@@ -1142,9 +1131,8 @@ def generate_string_from_regex(regex):
 
 def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=REPEAT_MAX, yield_max=YIELD_MAX):
     """ Utility function to generate strings from a regex pattern. """
-    i = 0
-    for result in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max):
-        yield result
+    for r in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max):
+        yield r
 
 
 # guess feature objects
diff --git a/src/codext/checksums/adler.py b/src/codext/checksums/adler.py
@@ -3,10 +3,10 @@
 
 This is a codec for computing checksums, for use with other codecs in encoding chains.
 
-These codecs:
-- transform strings from str to str
-- transform strings from bytes to bytes
-- transform file content from str to bytes (write)
+This codec:
+- transforms strings from str to str
+- transforms strings from bytes to bytes
+- transforms file content from str to bytes (write)
 """
 from zlib import adler32
 
diff --git a/src/codext/checksums/crc.py b/src/codext/checksums/crc.py
@@ -3,10 +3,10 @@
 
 This is a codec for computing checksums, for use with other codecs in encoding chains.
 
-These codecs:
-- transform strings from str to str
-- transform strings from bytes to bytes
-- transform file content from str to bytes (write)
+This codec:
+- transforms strings from str to str
+- transforms strings from bytes to bytes
+- transforms file content from str to bytes (write)
 """
 from ..__common__ import add
 
@@ -212,7 +212,7 @@
     },
 }
 
-_pattern = lambda n="": r"^crc" + str(n) + r"(|[-_]?(?:%s))$" % "|".join(x for x in CRC[n].keys() if len(x) > 0)
+_pattern = lambda n="": rf"^crc(?:[-_]?){n}(|[-_]?(?:{'|'.join(x for x in CRC[n].keys() if len(x) > 0)}))$"
 _rev_int = lambda i, l=None: int(bin(i)[2:].zfill(l or len(bin(i)[2:]))[::-1], 2)
 
 
diff --git a/src/codext/checksums/luhn.py b/src/codext/checksums/luhn.py
@@ -1,96 +1,33 @@
 # -*- coding: UTF-8 -*-
 """Luhn Codec - Luhn Mod N checksum algorithm.
 
-The Luhn algorithm, also known as the "modulus 10" algorithm, is a simple checksum
-formula used to validate identification numbers (e.g. credit card numbers, IMEI
-numbers). Encoding appends a check character; decoding verifies the check character
-and strips it.
-
-The Luhn Mod N generalization extends the algorithm to alphabets of arbitrary size N.
-When called as 'luhn' or 'luhn-10', the standard decimal alphabet (0-9, N=10) is
-used. When called as 'luhn-<N>' for 2 ≤ N ≤ 36, the first N characters of
-'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' form the alphabet.
+This is a codec for computing checksums, for use with other codecs in encoding chains.
 
 This codec:
-- en/decodes strings from str to str
-- en/decodes strings from bytes to bytes
-- decodes file content to str (read)
-- encodes file content from str to bytes (write)
-
-Reference: https://en.wikipedia.org/wiki/Luhn_algorithm
-          https://bitcoinwiki.org/wiki/luhn-mod-n-algorithm
+- transforms strings from str to str
+- transforms strings from bytes to bytes
+- transforms file content from str to bytes (write)
 """
 from ..__common__ import *
 
 
-__examples__ = {
-    'enc(luhn|luhn-10|luhn10)': {
-        '7992739871': '79927398713',
-        '':           '',
-        '0':          '00',
-        '1':          '18',
-    },
-    'dec(luhn|luhn-10|luhn10)': {
-        '79927398713': '7992739871',
-        '':            '',
-        '00':          '0',
-        '18':          '1',
-    },
-    'enc-dec(luhn)':    ['123456789', '0' * 10, '9999999999999999'],
-    'enc-dec(luhn-16)': ['0123456789ABCDEF', 'DEADBEEF'],
-    'enc-dec(luhn-36)': ['HELLO', 'WORLD123'],
-}
-
-_FULL_ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-
-
-def _luhn_encode(n=""):
-    mod = n if isinstance(n, int) else 10
-    alphabet = _FULL_ALPHABET[:mod]
-
-    def _encode(text, errors="strict"):
-        text = ensure_str(text).upper() if mod > 10 else ensure_str(text)
-        if not text:
+def luhn(n=""):
+    alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"[:(mod := n if isinstance(n, int) else 10)]
+    def encode(data, errors="strict"):
+        total, data = 0, ensure_str(data).upper() if mod > 10 else ensure_str(data)  # normalize BEFORE validating
+        if not (data := "".join(c if c in alphabet else handle_error("luhn", errors, kind="character")(c, i, data)
+                                for i, c in enumerate(data))):
             return "", 0
-        for pos, c in enumerate(text):
-            if c not in alphabet:
-                handle_error("luhn", errors, kind="character")(c, pos, text)
-        total = 0
-        for i, c in enumerate(reversed(text)):
+        for i, c in enumerate(reversed(data)):
             code = alphabet.index(c)
             if i % 2 == 0:
                 d = code * 2
                 code = d % mod + d // mod
             total += code
         check = (mod - total % mod) % mod
-        return text + alphabet[check], len(b(text))
-
-    return _encode
-
-
-def _luhn_decode(n=""):
-    mod = n if isinstance(n, int) else 10
-    alphabet = _FULL_ALPHABET[:mod]
-
-    def _decode(text, errors="strict"):
-        text = ensure_str(text).upper() if mod > 10 else ensure_str(text)
-        if not text:
-            return "", 0
-        for pos, c in enumerate(text):
-            if c not in alphabet:
-                handle_error("luhn", errors, decode=True, kind="character")(c, pos, text)
-        total = 0
-        for i, c in enumerate(reversed(text)):
-            code = alphabet.index(c)
-            if i % 2 == 1:
-                d = code * 2
-                code = d % mod + d // mod
-            total += code
-        if total % mod != 0:
-            handle_error("luhn", errors, decode=True)(text[-1], len(text) - 1, text[:-1])
-        return text[:-1], len(b(text))
+        return alphabet[check], len(b(data))  # only the check character is returned, not the payload
+    return encode
 
-    return _decode
 
+add("luhn", luhn, pattern=r"^luhn[-_]?([2-9]|[12]\d|3[0-6])?$", guess=None)  # N limited to 2..36 (alphabet size)
 
-add("luhn", _luhn_encode, _luhn_decode, pattern=r"^luhn[-_]?(\d{1,2})?$", guess=None)
diff --git a/tests/test_manual.py b/tests/test_manual.py
@@ -96,6 +96,17 @@ def test_codec_case_related_manips(self):
         self.assertRaises(NotImplementedError, codecs.decode, STR, "slug")
         self.assertRaises(NotImplementedError, codecs.decode, STR, "snake")
     
+    def test_codec_checksum_functions(self):
+        from codext.checksums.crc import CRC
+        for n, variants in CRC.items():
+            for name, params in variants.items():
+                enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-")
+                self.assertEqual(codecs.encode("123456789", enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5])
+        from codext.checksums.luhn import luhn
+        for s, r in [("", ""), ("0", "0"), ("1", "8"), ("7992739871", "3")]:
+            self.assertEqual(codecs.encode(s, "luhn"), r)
+        self.assertEqual(codecs.encode("-", "luhn", errors="ignore"), "")
+    
     def test_codec_dummy_str_manips(self):
         STR = "this is a test"
         self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht")
@@ -109,7 +120,6 @@ def test_codec_dummy_str_manips(self):
         self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200")
     
     def test_codec_hash_functions(self):
-        from codext.checksums.crc import CRC
         STR = b"This is a test string!"
         for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]:
             self.assertIsNotNone(codecs.encode(STR, h))
@@ -145,13 +155,6 @@ def test_codec_hash_functions(self):
                 h = "crypt-" + m
                 self.assertIsNotNone(codecs.encode(STR, h))
                 self.assertRaises(NotImplementedError, codecs.decode, STR, h)
-        # CRC checks
-        STR = "123456789"
-        for n, variants in CRC.items():
-            for name, params in variants.items():
-                enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-")
-                print(enc)
-                self.assertEqual(codecs.encode(STR, enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5])
     
     def test_codec_markdown(self):
         HTM = "<h1>Test title</h1>\n\n<p>Test paragraph</p>\n"