[mypyc] Use cached ASCII characters in CPyStr_GetItem (#21035)

VaggelisD · web-flow · commit 6bcd02e37b5e · 2026-03-18T16:15:35.000Z
For code points < 256, use `PyUnicode_FromOrdinal()`, which returns CPython's cached single-character Latin-1 string objects instead of allocating a new PyUnicode object on every `str[i]` access. This avoids allocation and deallocation overhead in character-scanning hot loops. Characters >= 256 (BMP and supplementary planes) keep the original `PyUnicode_New` allocation path unchanged.

I ran the following micro-benchmark: scan a 50k-character string with `s[i]` in a loop, repeating the benchmark 5000 times:

| String type              | Before (ms/iter) | After (ms/iter) | Speedup         |
|--------------------------|------------------|-----------------|-----------------|
| ASCII (0–127)            | 0.651            | 0.166           | **3.9x (-75%)** |
| Latin-1 (128–255)        | 0.752            | 0.162           | **4.6x (-78%)** |
| BMP (256–65535)          | 0.901            | 0.809           | no change       |
| Supplementary (>65535)   | 0.842            | 0.743           | no change       |
| Mixed (25% each)         | 0.817            | 0.542           | **1.5x (-34%)** |

This was co-authored with @tobymao.
diff --git a/mypyc/lib-rt/str_ops.c b/mypyc/lib-rt/str_ops.c
@@ -109,13 +109,16 @@ PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index) {
             enum PyUnicode_Kind kind = (enum PyUnicode_Kind)PyUnicode_KIND(str);
             void *data = PyUnicode_DATA(str);
             Py_UCS4 ch = PyUnicode_READ(kind, data, n);
+            if (ch < 256) {
+                // Latin-1 single-char strings are cached by CPython, so
+                // PyUnicode_FromOrdinal returns the cached object (with a
+                // new reference) instead of allocating a new string each time.
+                return PyUnicode_FromOrdinal(ch);
+            }
             PyObject *unicode = PyUnicode_New(1, ch);
             if (unicode == NULL)
                 return NULL;
-
-            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
-                PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
-            } else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
+            if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
                 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
             } else {
                 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
diff --git a/mypyc/test-data/run-strings.test b/mypyc/test-data/run-strings.test
@@ -264,6 +264,40 @@ def test_getitem() -> None:
     with assertRaises(IndexError, "string index out of range"):
         getitem(s, -4)
 
+def test_getitem_unicode() -> None:
+    # ASCII - cached by CPython's Latin-1 table
+    ascii_s = "Hello"
+    assert getitem(ascii_s, 0) == "H"
+    assert getitem(ascii_s, 4) == "o"
+
+    # Latin-1 (>127, <256) - also cached
+    latin1_s = "\xe9\xfc\xf1"  # éüñ
+    assert getitem(latin1_s, 0) == "\xe9"
+    assert getitem(latin1_s, 1) == "\xfc"
+    assert getitem(latin1_s, 2) == "\xf1"
+
+    # BMP (>255) - not cached, allocated fresh
+    bmp_s = "\u4e2d\u6587"  # 中文
+    assert getitem(bmp_s, 0) == "\u4e2d"
+    assert getitem(bmp_s, 1) == "\u6587"
+
+    # Supplementary plane (>65535)
+    emoji_s = "\U0001f600\U0001f601"  # 😀😁
+    assert getitem(emoji_s, 0) == "\U0001f600"
+    assert getitem(emoji_s, 1) == "\U0001f601"
+
+    # Mixed string with all kinds
+    mixed = "A\xe9\u4e2d\U0001f600"
+    assert getitem(mixed, 0) == "A"
+    assert getitem(mixed, 1) == "\xe9"
+    assert getitem(mixed, 2) == "\u4e2d"
+    assert getitem(mixed, 3) == "\U0001f600"
+
+    # Identity check: ASCII/Latin-1 chars should return cached objects
+    a1 = getitem("abc", 0)
+    a2 = getitem("axyz", 0)
+    assert a1 is a2  # both 'a', should be same cached object
+
 def test_find() -> None:
     s = "abcab"
     assert find(s, "Hello") == -1