Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mypyc/doc/str_operations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ Methods
* ``s.isalnum()``
* ``s.isdigit()``
* ``s.isspace()``
* ``s.lower()``
* ``s.join(x: Iterable)``
* ``s.lstrip()``
* ``s.lstrip(chars: str)``
Expand All @@ -65,6 +66,7 @@ Methods
* ``s.splitlines(keepends: bool)``
* ``s1.startswith(s2: str)``
* ``s1.startswith(t: tuple[str, ...])``
* ``s.upper()``
* ``s.strip()``
* ``s.strip(chars: str)``

Expand Down
2 changes: 2 additions & 0 deletions mypyc/lib-rt/CPy.h
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,8 @@ Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start)
Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end);
CPyTagged CPyStr_Ord(PyObject *obj);
PyObject *CPyStr_Multiply(PyObject *str, CPyTagged count);
PyObject *CPyStr_Lower(PyObject *str);
PyObject *CPyStr_Upper(PyObject *str);
bool CPyStr_IsSpace(PyObject *str);
bool CPyStr_IsAlnum(PyObject *str);
bool CPyStr_IsDigit(PyObject *str);
Expand Down
2 changes: 2 additions & 0 deletions mypyc/lib-rt/static_data.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ intern_strings(void) {
INTERN_STRING(endswith, "endswith");
INTERN_STRING(get_type_hints, "get_type_hints");
INTERN_STRING(keys, "keys");
INTERN_STRING(lower, "lower");
INTERN_STRING(items, "items");
INTERN_STRING(join, "join");
INTERN_STRING(register_, "register");
Expand All @@ -66,6 +67,7 @@ intern_strings(void) {
INTERN_STRING(throw_, "throw");
INTERN_STRING(translate, "translate");
INTERN_STRING(update, "update");
INTERN_STRING(upper, "upper");
INTERN_STRING(values, "values");
return 0;
}
Expand Down
2 changes: 2 additions & 0 deletions mypyc/lib-rt/static_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ typedef struct mypyc_interned_str_struct {
PyObject *endswith;
PyObject *get_type_hints;
PyObject *keys;
PyObject *lower;
PyObject *items;
PyObject *join;
PyObject *register_;
Expand All @@ -58,6 +59,7 @@ typedef struct mypyc_interned_str_struct {
PyObject *throw_;
PyObject *translate;
PyObject *update;
PyObject *upper;
PyObject *values;
} mypyc_interned_str_struct;

Expand Down
76 changes: 75 additions & 1 deletion mypyc/lib-rt/str_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

// The _PyUnicode_CheckConsistency definition has been moved to the internal API
// https://github.com/python/cpython/pull/106398
#if defined(Py_DEBUG) && defined(CPY_3_13_FEATURES)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was a mistake, as far as I can tell: CPY_3_13_FEATURES is always defined by us, so `defined(CPY_3_13_FEATURES)` would always evaluate to true.

#if defined(Py_DEBUG) && CPY_3_13_FEATURES
#include "internal/pycore_unicodeobject.h"
#endif

Expand Down Expand Up @@ -678,6 +678,80 @@ bool CPyStr_IsAlnum(PyObject *str) {
return true;
}

// ASCII-only case mappers. Py_TOLOWER / Py_TOUPPER are macros, so they are
// wrapped in real functions here to obtain something that can be passed
// around as an `int (*)(unsigned char)` callback.
static int CPy_ASCII_Lower(unsigned char ch) {
    return Py_TOLOWER(ch);
}

static int CPy_ASCII_Upper(unsigned char ch) {
    return Py_TOUPPER(ch);
}
Comment thread
VaggelisD marked this conversation as resolved.
Outdated

// Shared implementation behind str.lower() and str.upper().
//
//   self         - the str to convert (not consumed; the caller keeps its
//                  reference).
//   ascii_func   - maps one ASCII byte to its lower/upper-case form.
//   method_name  - (3.13+ only) interned name of the str method to call for
//                  non-ASCII input.
//   unicode_func - (pre-3.13 only) full case mapping for one code point;
//                  writes up to 3 code points into its output buffer and
//                  returns how many were written.
//
// Returns a new reference, or NULL with an exception set on failure.
static PyObject *CPyStr_ChangeCase(PyObject *self,
                                   int (*ascii_func)(unsigned char),
#if CPY_3_13_FEATURES
                                   PyObject *method_name
#else
                                   int (*unicode_func)(Py_UCS4, Py_UCS4 *)
#endif
) {
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    // Only an exact str may be returned as-is: str.lower()/upper() always
    // produce an exact str, even when called on a subclass instance. An empty
    // subclass instance simply goes through the ASCII path below.
    if (len == 0 && PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return self;
    }

    // ASCII fast path: 1-to-1, no expansion possible
    if (PyUnicode_IS_ASCII(self)) {
        PyObject *res = PyUnicode_New(len, 127);
        if (res == NULL) return NULL;
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
        Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res);
        for (Py_ssize_t i = 0; i < len; i++) {
            res_data[i] = (Py_UCS1)ascii_func(data[i]);
        }
        return res;
    }

#if CPY_3_13_FEATURES
    // On 3.13+, _PyUnicode_ToLowerFull/ToUpperFull are no longer exported,
    // so fall back to CPython's method implementation for non-ASCII strings.
    return PyObject_CallMethodNoArgs(self, method_name);
#else
    // General Unicode: unicode_func handles 1-to-N expansion.
    // Worst case: each codepoint expands to 3 (per Unicode standard).
    // The tmp buffer is short-lived, and PyUnicode_FromKindAndData
    // compacts the result to the optimal string kind automatically.
    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    // Reject lengths for which the buffer size computation would overflow
    // (CPython guards its own case-conversion buffer the same way).
    if ((size_t)len > (size_t)PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
        PyErr_SetString(PyExc_OverflowError, "string is too long");
        return NULL;
    }
    Py_UCS4 *tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * (size_t)len);
    if (tmp == NULL) return PyErr_NoMemory();

    // Lowercasing GREEK CAPITAL LETTER SIGMA (U+03A3) is context sensitive:
    // it becomes U+03C2 in Final_Sigma position and U+03C3 otherwise. The
    // per-code-point mapping function cannot see that context, so hand such
    // strings to CPython's str.lower(), which implements the rule.
    const int lowering = (unicode_func == _PyUnicode_ToLowerFull);

    Py_UCS4 mapped[3];
    Py_ssize_t out_len = 0;
    for (Py_ssize_t i = 0; i < len; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (lowering && ch == 0x3A3) {
            PyMem_Free(tmp);
            return PyObject_CallMethodNoArgs(self, mypyc_interned_str.lower);
        }
        int n = unicode_func(ch, mapped);
        for (int j = 0; j < n; j++) {
            tmp[out_len++] = mapped[j];
        }
    }

    PyObject *res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, tmp, out_len);
    PyMem_Free(tmp);
    return res;
#endif
}

// str.lower(): returns a new reference to the lower-cased string, or NULL
// with an exception set.
PyObject *CPyStr_Lower(PyObject *self) {
    // The last argument depends on the Python version: the interned method
    // name on 3.13+, CPython's full lower-case mapping function before that.
    return CPyStr_ChangeCase(self, CPy_ASCII_Lower,
#if CPY_3_13_FEATURES
                             mypyc_interned_str.lower
#else
                             _PyUnicode_ToLowerFull
#endif
    );
}

// str.upper(): returns a new reference to the upper-cased string, or NULL
// with an exception set.
PyObject *CPyStr_Upper(PyObject *self) {
    // The last argument depends on the Python version: the interned method
    // name on 3.13+, CPython's full upper-case mapping function before that.
    return CPyStr_ChangeCase(self, CPy_ASCII_Upper,
#if CPY_3_13_FEATURES
                             mypyc_interned_str.upper
#else
                             _PyUnicode_ToUpperFull
#endif
    );
}

bool CPyStr_IsDigit(PyObject *str) {
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
if (len == 0) return false;
Expand Down
18 changes: 18 additions & 0 deletions mypyc/primitives/str_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,24 @@
error_kind=ERR_NEG_INT,
)

# str.lower()
# Zero-argument method: the single entry in arg_types is the receiver.
# Calls on a str receiver are compiled to a direct call of the C function
# named below, which yields a str.
# NOTE(review): ERR_MAGIC presumably means failure is signalled through a
# special return value (NULL for object results) -- confirm against the
# error-kind definitions.
method_op(
name="lower",
arg_types=[str_rprimitive],
return_type=str_rprimitive,
c_function_name="CPyStr_Lower",
error_kind=ERR_MAGIC,
)

# str.upper()
# Zero-argument method: the single entry in arg_types is the receiver.
# Calls on a str receiver are compiled to a direct call of the C function
# named below, which yields a str.
# NOTE(review): ERR_MAGIC presumably means failure is signalled through a
# special return value (NULL for object results) -- confirm against the
# error-kind definitions.
method_op(
name="upper",
arg_types=[str_rprimitive],
return_type=str_rprimitive,
c_function_name="CPyStr_Upper",
error_kind=ERR_MAGIC,
)

method_op(
name="isspace",
arg_types=[str_rprimitive],
Expand Down
1 change: 1 addition & 0 deletions mypyc/test-data/fixtures/ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def lstrip(self, item: Optional[str] = None) -> str: pass
def rstrip(self, item: Optional[str] = None) -> str: pass
def join(self, x: Iterable[str]) -> str: pass
def format(self, *args: Any, **kwargs: Any) -> str: ...
def lower(self) -> str: ...
def upper(self) -> str: ...
def startswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
def endswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
Expand Down
20 changes: 20 additions & 0 deletions mypyc/test-data/irbuild-str.test
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,26 @@ L0:
r0 = CPyStr_Multiply(s, n)
return r0

[case testStrLower]
def do_lower(s: str) -> str:
return s.lower()
[out]
def do_lower(s):
s, r0 :: str
L0:
r0 = CPyStr_Lower(s)
return r0

[case testStrUpper]
def do_upper(s: str) -> str:
return s.upper()
[out]
def do_upper(s):
s, r0 :: str
L0:
r0 = CPyStr_Upper(s)
return r0

[case testStrIsSpace]
def is_space(x: str) -> bool:
return x.isspace()
Expand Down
45 changes: 45 additions & 0 deletions mypyc/test-data/run-strings.test
Original file line number Diff line number Diff line change
Expand Up @@ -1258,6 +1258,51 @@ FMT: Final = "{} {}"
def test_format() -> None:
assert FMT.format(400 + 20, "roll" + "up") == "420 rollup"

[case testLowerAndUpper]
from typing import Any

def test_lower_basic() -> None:
    """Check str.lower() on empty, lower-case, upper-case, mixed and digit ASCII input."""
    cases = [
        ("", ""),
        ("hello", "hello"),
        ("HELLO", "hello"),
        ("Hello World", "hello world"),
        ("123", "123"),
        ("ABC123", "abc123"),
    ]
    for text, expected in cases:
        assert text.lower() == expected

def test_upper_basic() -> None:
    """Check str.upper() on empty, upper-case, lower-case, mixed and digit ASCII input."""
    cases = [
        ("", ""),
        ("HELLO", "HELLO"),
        ("hello", "HELLO"),
        ("Hello World", "HELLO WORLD"),
        ("123", "123"),
        ("abc123", "ABC123"),
    ]
    for text, expected in cases:
        assert text.upper() == expected

def test_lower_unicode() -> None:
    """Check str.lower() on non-ASCII input, including context-sensitive sigma."""
    assert "\u00C9".lower() == "\u00E9"  # É -> é
    assert "\u0391\u0392".lower() == "\u03B1\u03B2"  # ΑΒ -> αβ
    assert "\u4E2D\u6587".lower() == "\u4E2D\u6587"  # CJK (no case)
    assert "\U0001F600".lower() == "\U0001F600"  # Emoji (no case)
    # Greek capital sigma lowers to final sigma (U+03C2) only at the end of a
    # word; this needs multi-character input, single code points cannot show it.
    assert "\u0391\u03A3".lower() == "\u03B1\u03C2"  # ΑΣ -> ας (final sigma)
    assert "\u03A3\u0391".lower() == "\u03C3\u03B1"  # ΣΑ -> σα (not final)
    assert "\u03A3".lower() == "\u03C3"  # lone Σ -> σ (no preceding cased letter)

def test_upper_unicode() -> None:
    """Check str.upper() on non-ASCII input: cased letters change, caseless text does not."""
    expectations = {
        "\u00E9": "\u00C9",  # é -> É
        "\u03B1\u03B2": "\u0391\u0392",  # αβ -> ΑΒ
        "\u4E2D\u6587": "\u4E2D\u6587",  # CJK (no case)
        "\U0001F600": "\U0001F600",  # Emoji (no case)
    }
    for source, expected in expectations.items():
        assert source.upper() == expected

def test_expansion() -> None:
    """Check case mappings whose result has a different length than the input."""
    # 1-to-N expansion cases
    dotted_capital_i = "\u0130"
    ffi_ligature = "\uFB03"
    sharp_s = "\u00DF"

    # İ -> i + combining dot above
    assert dotted_capital_i.lower() == "\u0069\u0307"
    # ffi ligature stays lowercase
    assert ffi_ligature.lower() == "\uFB03"
    # ß -> SS
    assert sharp_s.upper() == "SS"
    # ffi ligature -> FFI
    assert ffi_ligature.upper() == "FFI"

def test_comprehensive() -> None:
# Walk every code point in range(0x110000) and check that lower()/upper()
# give the same result through two differently typed receivers.
for i in range(0x110000):
c = chr(i)
# Same object as `c`, but annotated as Any.
# NOTE(review): presumably this makes the compiler use a generic method call
# for `a` while `c` goes through the str primitive -- confirm.
a: Any = c
assert c.lower() == a.lower(), f"lower mismatch at U+{i:04X}"
assert c.upper() == a.upper(), f"upper mismatch at U+{i:04X}"

[case testIsSpace]
from typing import Any

Expand Down