Skip to content

Commit 93ee965

Browse files
committed
[mypyc] Add str.isalnum() primitive (python#20852)
Added a `str.isalnum()` primitive, modeled on the existing `str.isspace()` primitive. One thing worth pointing out: the speedup over CPython shrinks as the string gets longer, and for long non-ASCII strings the compiled version is actually slower than CPython:

| All-alphanumeric | mypyc (s) | Python (s) | Speedup |
|--------|----------:|------------:|--------:|
| length 1 (`'a'`) | 0.645 | 2.036 | 3.16x |
| length 10 (`'abcde12345'`) | 1.026 | 2.607 | 2.54x |
| length 100 (`'a' * 100`) | 3.599 | 7.848 | 2.18x |
| length 1 (non-ASCII UCS-1: U+00E9 `é`) | 0.816 | 1.976 | 2.42x |
| length 10 (non-ASCII UCS-1: U+00E9 * 10) | 2.091 | 2.587 | 1.24x |
| length 100 (non-ASCII UCS-1: U+00E9 * 100) | 14.298 | 7.814 | 0.55x |

<br />

| Non-alphanumeric (early exit) | mypyc (s) | Python (s) | Speedup |
|--------|----------:|------------:|--------:|
| length 1 (`' '`) | 0.622 | 2.006 | 3.22x |
| length 100 (`'!' * 100`) | 0.617 | 2.024 | 3.28x |
| length 100 (`'a' * 99 + '!'`) | 3.453 | 10.246 | 2.97x |

<br />

I'm not entirely sure how to interpret this. One possible explanation is that [Py_UNICODE_ISALNUM](https://github.com/python/cpython/blob/175ab31377d9e616efb95168099d8c2c9036504a/Include/cpython/unicodeobject.h#L769) expands to four function calls per character, and those calls may be better optimized inside CPython itself because it is built with PGO and LTO (?)
1 parent 10beda8 commit 93ee965

7 files changed

Lines changed: 77 additions & 0 deletions

File tree

mypyc/doc/str_operations.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ Methods
3838
* ``s1.find(s2: str)``
3939
* ``s1.find(s2: str, start: int)``
4040
* ``s1.find(s2: str, start: int, end: int)``
41+
* ``s.isspace()``
42+
* ``s.isalnum()``
4143
* ``s.join(x: Iterable)``
4244
* ``s.lstrip()``
4345
* ``s.lstrip(chars: str)``

mypyc/lib-rt/CPy.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,7 @@ Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged st
772772
CPyTagged CPyStr_Ord(PyObject *obj);
773773
PyObject *CPyStr_Multiply(PyObject *str, CPyTagged count);
774774
bool CPyStr_IsSpace(PyObject *str);
775+
bool CPyStr_IsAlnum(PyObject *str);
775776

776777
// Bytes operations
777778

mypyc/lib-rt/str_ops.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,3 +654,26 @@ bool CPyStr_IsSpace(PyObject *str) {
654654
}
655655
return true;
656656
}
657+
658+
bool CPyStr_IsAlnum(PyObject *str) {
659+
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
660+
if (len == 0) return false;
661+
662+
if (PyUnicode_IS_ASCII(str)) {
663+
const Py_UCS1 *data = PyUnicode_1BYTE_DATA(str);
664+
for (Py_ssize_t i = 0; i < len; i++) {
665+
if (!Py_ISALNUM(data[i]))
666+
return false;
667+
}
668+
return true;
669+
}
670+
671+
int kind = PyUnicode_KIND(str);
672+
const void *data = PyUnicode_DATA(str);
673+
for (Py_ssize_t i = 0; i < len; i++) {
674+
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
675+
if (!Py_UNICODE_ISALNUM(ch))
676+
return false;
677+
}
678+
return true;
679+
}

mypyc/primitives/str_ops.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,15 @@
383383
error_kind=ERR_NEVER,
384384
)
385385

386+
method_op(
387+
name="isalnum",
388+
arg_types=[str_rprimitive],
389+
return_type=bool_rprimitive,
390+
c_function_name="CPyStr_IsAlnum",
391+
error_kind=ERR_NEVER,
392+
)
393+
394+
386395
# obj.decode()
387396
method_op(
388397
name="decode",

mypyc/test-data/fixtures/ir.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def removesuffix(self, suffix: str, /) -> str: ...
130130
def islower(self) -> bool: ...
131131
def count(self, substr: str, start: Optional[int] = None, end: Optional[int] = None) -> int: pass
132132
def isspace(self) -> bool: ...
133+
def isalnum(self) -> bool: ...
133134

134135
class float:
135136
def __init__(self, x: object) -> None: pass

mypyc/test-data/irbuild-str.test

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,3 +803,14 @@ def is_space(x):
803803
L0:
804804
r0 = CPyStr_IsSpace(x)
805805
return r0
806+
807+
[case testStrIsAlnum]
808+
def is_alnum(x: str) -> bool:
809+
return x.isalnum()
810+
[out]
811+
def is_alnum(x):
812+
x :: str
813+
r0 :: bool
814+
L0:
815+
r0 = CPyStr_IsAlnum(x)
816+
return r0

mypyc/test-data/run-strings.test

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1123,3 +1123,33 @@ def test_isspace() -> None:
11231123
c = chr(i)
11241124
a: Any = c
11251125
assert c.isspace() == a.isspace()
1126+
1127+
[case testIsAlnum]
1128+
def test_isalnum_basic() -> None:
1129+
assert "abc".isalnum()
1130+
assert "ABC".isalnum()
1131+
assert "abc123".isalnum()
1132+
assert "123".isalnum()
1133+
assert not "".isalnum()
1134+
assert not " ".isalnum()
1135+
assert not "abc!".isalnum()
1136+
assert not "hello world".isalnum()
1137+
assert not "abc-123".isalnum()
1138+
1139+
def test_isalnum_unicode() -> None:
1140+
# Single chars: letters and digits from various scripts
1141+
assert "\u00E9".isalnum() # é (UCS-1 Latin letter)
1142+
assert "\u0660".isalnum() # ٠ (UCS-2 Arabic-Indic digit)
1143+
assert "\u4E2D".isalnum() # 中 (UCS-2 CJK ideograph)
1144+
assert "\U00010400".isalnum() # 𐐀 (UCS-4 Deseret capital letter long I)
1145+
assert not "\u2000".isalnum() # EN QUAD (whitespace)
1146+
assert not "\u0021".isalnum() # !
1147+
assert not "\u00B6".isalnum() # ¶ (pilcrow sign, punctuation)
1148+
1149+
# Mixed Unicode letters and digits — all alnum
1150+
assert "\u00E9\u0660".isalnum() # é٠
1151+
assert "\u4E2D\u0041\u0660".isalnum() # 中A٠
1152+
1153+
# Unicode letter/digit mixed with punctuation — not alnum
1154+
assert not "\u00E9!".isalnum()
1155+
assert not "\u4E2D\u2000".isalnum() # CJK + whitespace

0 commit comments

Comments
 (0)