Skip to content

Commit 6bcd02e

Browse files
authored
[mypyc] Use cached ASCII characters in CPyStr_GetItem (#21035)
For characters < 256, use `PyUnicode_FromOrdinal()`, which returns CPython's cached single-character Latin-1 string objects instead of allocating a new PyUnicode object on every `str[i]` access. This avoids allocation and deallocation overhead in character-scanning hot loops. Characters >= 256 (BMP, supplementary) keep the original `PyUnicode_New` allocation path unchanged.

I ran the following micro-benchmark: scan a 50k-character string with `s[i]` in a loop (the benchmark was repeated 5000 times):

| String type            | Before (ms/iter) | After (ms/iter) | Speedup         |
|------------------------|------------------|-----------------|-----------------|
| ASCII (0–127)          | 0.651            | 0.166           | **3.9x (-75%)** |
| Latin-1 (128–255)      | 0.752            | 0.162           | **4.6x (-78%)** |
| BMP (256–65535)        | 0.901            | 0.809           | no change       |
| Supplementary (>65535) | 0.842            | 0.743           | no change       |
| Mixed (25% each)       | 0.817            | 0.542           | **1.5x (-34%)** |

This was coauthored with @tobymao
1 parent 0183a21 commit 6bcd02e

File tree

2 files changed

+41
-4
lines changed

2 files changed

+41
-4
lines changed

mypyc/lib-rt/str_ops.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,16 @@ PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index) {
109109
enum PyUnicode_Kind kind = (enum PyUnicode_Kind)PyUnicode_KIND(str);
110110
void *data = PyUnicode_DATA(str);
111111
Py_UCS4 ch = PyUnicode_READ(kind, data, n);
112+
if (ch < 256) {
113+
// Latin-1 single-char strings are cached by CPython, so
114+
// PyUnicode_FromOrdinal returns the cached object (with a
115+
// new reference) instead of allocating a new string each time.
116+
return PyUnicode_FromOrdinal(ch);
117+
}
112118
PyObject *unicode = PyUnicode_New(1, ch);
113119
if (unicode == NULL)
114120
return NULL;
115-
116-
if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
117-
PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
118-
} else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
121+
if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
119122
PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
120123
} else {
121124
assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

mypyc/test-data/run-strings.test

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,40 @@ def test_getitem() -> None:
264264
with assertRaises(IndexError, "string index out of range"):
265265
getitem(s, -4)
266266

267+
def test_getitem_unicode() -> None:
268+
# ASCII - cached by CPython's Latin-1 table
269+
ascii_s = "Hello"
270+
assert getitem(ascii_s, 0) == "H"
271+
assert getitem(ascii_s, 4) == "o"
272+
273+
# Latin-1 (>127, <256) - also cached
274+
latin1_s = "\xe9\xfc\xf1" # éüñ
275+
assert getitem(latin1_s, 0) == "\xe9"
276+
assert getitem(latin1_s, 1) == "\xfc"
277+
assert getitem(latin1_s, 2) == "\xf1"
278+
279+
# BMP (>255) - not cached, allocated fresh
280+
bmp_s = "\u4e2d\u6587" # 中文
281+
assert getitem(bmp_s, 0) == "\u4e2d"
282+
assert getitem(bmp_s, 1) == "\u6587"
283+
284+
# Supplementary plane (>65535)
285+
emoji_s = "\U0001f600\U0001f601" # 😀😁
286+
assert getitem(emoji_s, 0) == "\U0001f600"
287+
assert getitem(emoji_s, 1) == "\U0001f601"
288+
289+
# Mixed string with all kinds
290+
mixed = "A\xe9\u4e2d\U0001f600"
291+
assert getitem(mixed, 0) == "A"
292+
assert getitem(mixed, 1) == "\xe9"
293+
assert getitem(mixed, 2) == "\u4e2d"
294+
assert getitem(mixed, 3) == "\U0001f600"
295+
296+
# Identity check: ASCII/Latin-1 chars should return cached objects
297+
a1 = getitem("abc", 0)
298+
a2 = getitem("axyz", 0)
299+
assert a1 is a2 # both 'a', should be same cached object
300+
267301
def test_find() -> None:
268302
s = "abcab"
269303
assert find(s, "Hello") == -1

0 commit comments

Comments
 (0)