Skip to content

Commit b3fbc77

Browse files
committed
[mypyc] Add native char type + codepoint fast paths for str ops
Adds a first-class `char` native type to mypyc, modeled on i64: stored unboxed as an int32 codepoint, with -1 as the empty-string sentinel, and bidirectional str<->char promotion. Unblocks codepoint-level fast paths in per-char loops.

Core type plumbing:
- MYPYC_NATIVE_CHAR_NAMES alongside MYPYC_NATIVE_INT_NAMES
- str <-> char bidirectional _promote in semanal_classprop
- str covers char in subtypes.covers_at_runtime + overlap in meet
- char_rprimitive (int32, is_native_int, error_overlap=False)
- mypy_extensions.char stub

Boxing / unboxing:
- CPyChar_FromObject (accepts 0/1-char str, -113 on type error)
- CPyChar_ToStr (uses interned empty-str singleton for -1)
- bool(char) checks != -1, not != 0, so "\0" stays truthy

Codegen fast paths:
- try_specialize_codepoint_compare in transform_comparison_expr handles char/char, char/s[i], char/0-or-1-char-literal, and s[i]/literal uniformly, compiling to an int compare of the codepoint
- ord(s[i]) refactored to share the codepoint read path
- char.isspace/isdigit/isalnum/isalpha/isidentifier/upper method_ops route to codepoint-taking C helpers in str_extra_ops.h
- CPyChar_IsIdentifier delegates to PyUnicode_IsIdentifier for non-ASCII (correct XID_Start handling rather than the Py_UNICODE_ISALPHA approximation)
- CPyChar_Upper falls back to str.upper() for non-ASCII, returning the original codepoint when upper() produces multiple chars (e.g. ß -> SS), since char holds one codepoint

New IR transform pass (runs after lower_ir, before dep collection):
- char_str_index_fold: folds Unbox(CPyStr_GetItem(s, i) -> char) to a direct CPyStr_GetCharAt int32 read, avoiding the 1-char PyObject alloc

Also adds str.isalpha() method_op via CPyStr_IsAlpha.
Tests:
- run-char.test covers boxing/unboxing, bool semantics (NUL is truthy, empty is falsy), equality, classification methods (including non-ASCII XID_Start for isidentifier), upper (including ß -> ß pinning for the multi-char fallback), str promotion, concatenation, s[i]=="x" specialization, ord, and astral-plane codepoints.
- char stub added to test-data/unit/lib-stub/mypy_extensions.pyi so the test harness can resolve the type.
1 parent 5b05d32 commit b3fbc77

20 files changed

Lines changed: 785 additions & 22 deletions

File tree

mypy/meet.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
)
1818
from mypy.typeops import is_recursive_pair, make_simplified_union, tuple_fallback
1919
from mypy.types import (
20+
MYPYC_NATIVE_CHAR_NAMES,
2021
MYPYC_NATIVE_INT_NAMES,
2122
TUPLE_LIKE_INSTANCE_NAMES,
2223
AnyType,
@@ -592,6 +593,8 @@ def _type_object_overlap(left: Type, right: Type) -> bool:
592593

593594
if right.type.fullname == "builtins.int" and left.type.fullname in MYPYC_NATIVE_INT_NAMES:
594595
return True
596+
if right.type.fullname == "builtins.str" and left.type.fullname in MYPYC_NATIVE_CHAR_NAMES:
597+
return True
595598

596599
# Two unrelated types cannot be partially overlapping: they're disjoint.
597600
if left.type.has_base(right.type.fullname):

mypy/semanal_classprop.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
Var,
2323
)
2424
from mypy.options import Options
25-
from mypy.types import MYPYC_NATIVE_INT_NAMES, Instance, ProperType
25+
from mypy.types import MYPYC_NATIVE_CHAR_NAMES, MYPYC_NATIVE_INT_NAMES, Instance, ProperType
2626

2727
# Hard coded type promotions (shared between all Python versions).
2828
# These add extra ad-hoc edges to the subtyping relation. For example,
@@ -184,5 +184,11 @@ def add_type_promotion(
184184
assert isinstance(int_sym.node, TypeInfo)
185185
int_sym.node._promote.append(Instance(defn.info, []))
186186
defn.info.alt_promote = Instance(int_sym.node, [])
187+
# Same pattern for str <-> char (mypyc native char type).
188+
if defn.fullname in MYPYC_NATIVE_CHAR_NAMES:
189+
str_sym = builtin_names["str"]
190+
assert isinstance(str_sym.node, TypeInfo)
191+
str_sym.node._promote.append(Instance(defn.info, []))
192+
defn.info.alt_promote = Instance(str_sym.node, [])
187193
if promote_targets:
188194
defn.info._promote.extend(promote_targets)

mypy/subtypes.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from mypy.options import Options
3737
from mypy.state import state
3838
from mypy.types import (
39+
MYPYC_NATIVE_CHAR_NAMES,
3940
MYPYC_NATIVE_INT_NAMES,
4041
TUPLE_LIKE_INSTANCE_NAMES,
4142
TYPED_NAMEDTUPLE_NAMES,
@@ -2166,6 +2167,10 @@ def covers_at_runtime(item: Type, supertype: Type) -> bool:
21662167
# "int" covers all native int types
21672168
if item.type.fullname in MYPYC_NATIVE_INT_NAMES:
21682169
return True
2170+
elif isinstance(item, Instance) and supertype.type.fullname == "builtins.str":
2171+
# "str" covers the native char type
2172+
if item.type.fullname in MYPYC_NATIVE_CHAR_NAMES:
2173+
return True
21692174
# TODO: Add more special cases.
21702175
return False
21712176

mypy/types.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,10 @@
190190
"mypy_extensions.u8",
191191
)
192192

193+
# Mypyc native char type (compatible with builtins.str, bidirectionally;
194+
# stored unboxed as an int32 codepoint with -1 as the empty sentinel).
195+
MYPYC_NATIVE_CHAR_NAMES: Final = ("mypy_extensions.char",)
196+
193197
DATACLASS_TRANSFORM_NAMES: Final = (
194198
"typing.dataclass_transform",
195199
"typing_extensions.dataclass_transform",

mypy/typeshed/stubs/mypy-extensions/mypy_extensions.pyi

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,3 +220,27 @@ class u8:
220220
def __gt__(self, x: u8) -> bool: ...
221221
def __index__(self) -> int: ...
222222
def __eq__(self, x: object) -> bool: ...
223+
224+
# char represents at most one Unicode codepoint. At the type-check level it's
225+
# bidirectionally compatible with str via the mypyc native-string promotion
226+
# mechanism. Under mypyc a char is stored unboxed as an int32 codepoint
227+
# (with -1 for the empty sentinel).
228+
class char:
229+
def __new__(cls, __x: str = ...) -> char: ...
230+
def __eq__(self, x: object) -> bool: ...
231+
def __ne__(self, x: object) -> bool: ...
232+
def __hash__(self) -> int: ...
233+
# Mixed char/str concat. Result is str.
234+
def __add__(self, x: str) -> str: ...
235+
def __radd__(self, x: str) -> str: ...
236+
def isspace(self) -> bool: ...
237+
def isdigit(self) -> bool: ...
238+
def isalnum(self) -> bool: ...
239+
def isalpha(self) -> bool: ...
240+
def isidentifier(self) -> bool: ...
241+
# Case conversion. Returns the original codepoint if the Unicode
242+
# result is multi-char (e.g. ß.upper() = "SS") since char holds one
243+
# codepoint.
244+
def upper(self) -> char: ...
245+
def lower(self) -> char: ...
246+
def strip(self, __chars: str | None = ...) -> str: ...

mypyc/codegen/emit.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
is_bool_or_bit_rprimitive,
4545
is_bytearray_rprimitive,
4646
is_bytes_rprimitive,
47+
is_char_rprimitive,
4748
is_dict_rprimitive,
4849
is_fixed_width_rtype,
4950
is_float_rprimitive,
@@ -1045,6 +1046,13 @@ def emit_unbox(
10451046
self.emit_line(f"{dest} = CPyLong_AsInt16({src});")
10461047
if not isinstance(error, AssignHandler):
10471048
self.emit_unbox_failure_with_overlapping_error_value(dest, typ, failure)
1049+
elif is_char_rprimitive(typ):
1050+
assert not optional
1051+
if declare_dest:
1052+
self.emit_line(f"int32_t {dest};")
1053+
self.emit_line(f"{dest} = CPyChar_FromObject({src});")
1054+
if not isinstance(error, AssignHandler):
1055+
self.emit_unbox_failure_with_overlapping_error_value(dest, typ, failure)
10481056
elif is_uint8_rprimitive(typ):
10491057
# Whether we are borrowing or not makes no difference.
10501058
assert not optional # Not supported for overlapping error values
@@ -1192,6 +1200,8 @@ def emit_box(
11921200
self.emit_inc_ref(dest, object_rprimitive)
11931201
elif is_int32_rprimitive(typ) or is_int16_rprimitive(typ) or is_uint8_rprimitive(typ):
11941202
self.emit_line(f"{declaration}{dest} = PyLong_FromLong({src});")
1203+
elif is_char_rprimitive(typ):
1204+
self.emit_line(f"{declaration}{dest} = CPyChar_ToStr({src});")
11951205
elif is_int64_rprimitive(typ):
11961206
self.emit_line(f"{declaration}{dest} = PyLong_FromLongLong({src});")
11971207
elif is_float_rprimitive(typ):

mypyc/codegen/emitmodule.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
from mypyc.irbuild.prepare import load_type_map
6767
from mypyc.namegen import NameGenerator, exported_name
6868
from mypyc.options import CompilerOptions
69+
from mypyc.transform.char_str_index_fold import do_char_str_index_fold
6970
from mypyc.transform.copy_propagation import do_copy_propagation
7071
from mypyc.transform.exceptions import insert_exception_handling
7172
from mypyc.transform.flag_elimination import do_flag_elimination
@@ -265,11 +266,14 @@ def compile_scc_to_ir(
265266

266267
# Switch to lower abstraction level IR.
267268
lower_ir(fn, compiler_options)
269+
# Run char_str_index_fold before dependency collection so the new
270+
# str_extra_ops.h primitives it introduces are picked up.
271+
do_char_str_index_fold(fn, compiler_options)
268272
# Calculate implicit module dependencies (needed for librt)
269273
deps = find_implicit_op_dependencies(fn)
270274
if deps is not None:
271275
module.dependencies.update(deps)
272-
# Perform optimizations.
276+
# Remaining optimizations.
273277
do_copy_propagation(fn, compiler_options)
274278
do_flag_elimination(fn, compiler_options)
275279

mypyc/ir/rtypes.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,20 @@ def __hash__(self) -> int:
394394
error_overlap=True,
395395
)
396396

397+
# char: single Unicode codepoint stored as int32; -1 empty sentinel, -113
398+
# error sentinel. Distinct from int32_rprimitive so specializers can route
399+
# char-typed operations through codepoint primitives.
400+
char_rprimitive: Final = RPrimitive(
401+
"char",
402+
is_unboxed=True,
403+
is_refcounted=False,
404+
is_native_int=True,
405+
is_signed=True,
406+
ctype="int32_t",
407+
size=4,
408+
error_overlap=False,
409+
)
410+
397411
# The following unsigned native int types (u16, u32, u64) are not
398412
# exposed to the user. They are for internal use within mypyc only.
399413

@@ -592,13 +606,18 @@ def is_fixed_width_rtype(rtype: RType) -> TypeGuard[RPrimitive]:
592606
or is_int32_rprimitive(rtype)
593607
or is_int16_rprimitive(rtype)
594608
or is_uint8_rprimitive(rtype)
609+
or is_char_rprimitive(rtype)
595610
)
596611

597612

598613
def is_uint8_rprimitive(rtype: RType) -> TypeGuard[RPrimitive]:
599614
return rtype is uint8_rprimitive
600615

601616

617+
def is_char_rprimitive(rtype: RType) -> TypeGuard[RPrimitive]:
618+
return rtype is char_rprimitive
619+
620+
602621
def is_uint32_rprimitive(rtype: RType) -> TypeGuard[RPrimitive]:
603622
return rtype is uint32_rprimitive
604623

mypyc/irbuild/expression.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,18 +79,24 @@
7979
from mypyc.ir.rtypes import (
8080
RInstance,
8181
RTuple,
82+
RType,
8283
RVec,
8384
bool_rprimitive,
85+
char_rprimitive,
8486
int_rprimitive,
8587
is_any_int,
88+
is_char_rprimitive,
8689
is_fixed_width_rtype,
8790
is_int64_rprimitive,
8891
is_int_rprimitive,
8992
is_list_rprimitive,
9093
is_none_rprimitive,
9194
is_object_rprimitive,
95+
is_str_rprimitive,
96+
is_tagged,
9297
object_rprimitive,
9398
set_rprimitive,
99+
short_int_rprimitive,
94100
)
95101
from mypyc.irbuild.ast_helpers import is_borrow_friendly_expr, process_conditional
96102
from mypyc.irbuild.builder import IRBuilder, int_borrow_friendly_op
@@ -115,6 +121,7 @@
115121
apply_method_specialization,
116122
translate_object_new,
117123
translate_object_setattr,
124+
try_emit_str_index_as_int,
118125
)
119126
from mypyc.irbuild.vec import (
120127
vec_append,
@@ -843,6 +850,66 @@ def precompute_set_literal(builder: IRBuilder, s: SetExpr) -> Value | None:
843850
return None
844851

845852

853+
def _codepoint_kind(builder: IRBuilder, expr: Expression, expr_type: RType) -> str | None:
    """Decide whether ``expr`` can be read as a single codepoint.

    This only inspects types; no IR is emitted.

    Returns:
        "char" when ``expr_type`` is the native char primitive, "index" when
        ``expr`` is an index expression on a str base whose index is a tagged
        or fixed-width integer type, and None otherwise.
    """
    if is_char_rprimitive(expr_type):
        return "char"
    # Anything that is not ``<str>[<int-like>]`` is not a codepoint candidate.
    if not isinstance(expr, IndexExpr):
        return None
    if not is_str_rprimitive(builder.node_type(expr.base)):
        return None
    index_type = builder.node_type(expr.index)
    int_like_index = is_tagged(index_type) or is_fixed_width_rtype(index_type)
    return "index" if int_like_index else None
865+
866+
867+
def _emit_codepoint_value(
    builder: IRBuilder, expr: Expression, kind: str
) -> tuple[Value, RType]:
    """Emit IR that reads ``expr`` as a codepoint.

    ``kind`` must be a non-None result of ``_codepoint_kind`` for the same
    expression.

    Returns:
        A ``(value, rtype)`` pair: the emitted value together with
        ``char_rprimitive`` for the "char" kind, or ``short_int_rprimitive``
        for an indexed read.
    """
    if kind != "char":
        # Indexed read: go through the str-index-as-int helper.
        assert isinstance(expr, IndexExpr)
        codepoint = try_emit_str_index_as_int(builder, expr)
        assert codepoint is not None  # _codepoint_kind guarantees this
        return codepoint, short_int_rprimitive
    return builder.accept(expr), char_rprimitive
877+
878+
879+
def try_specialize_codepoint_compare(
    builder: IRBuilder, op: str, lhs: Expression, rhs: Expression, line: int
) -> Value | None:
    """Try to compile ``lhs == rhs`` / ``lhs != rhs`` as an integer compare.

    The specialization applies when at least one operand is a codepoint (a
    ``char`` value or ``s[i]`` on a str) and the other operand is either
    another codepoint or an expression that constant-folds to a str of
    length 0 or 1.

    Returns:
        The result of the integer comparison, or None when the
        specialization does not apply. No IR is emitted before a None return.
    """
    if op != "==" and op != "!=":
        return None

    left_kind = _codepoint_kind(builder, lhs, builder.node_type(lhs))
    right_kind = _codepoint_kind(builder, rhs, builder.node_type(rhs))

    # Both operands are codepoints: compare them directly, left first.
    if left_kind is not None and right_kind is not None:
        left_val = _emit_codepoint_value(builder, lhs, left_kind)[0]
        right_val = _emit_codepoint_value(builder, rhs, right_kind)[0]
        return builder.binary_op(left_val, right_val, op, line)

    # Exactly one operand may be a codepoint; the other must be a literal.
    if left_kind is not None:
        codepoint_expr, codepoint_kind, other_expr = lhs, left_kind, rhs
    elif right_kind is not None:
        codepoint_expr, codepoint_kind, other_expr = rhs, right_kind, lhs
    else:
        return None

    literal = constant_fold_expr(builder, other_expr)
    if not isinstance(literal, str):
        return None  # Nothing emitted so far, so bailing out is safe.
    if len(literal) > 1:
        return None

    operand, operand_type = _emit_codepoint_value(builder, codepoint_expr, codepoint_kind)
    # The empty string is encoded as -1, the char empty sentinel.
    expected = ord(literal) if literal else -1
    return builder.binary_op(operand, Integer(expected, operand_type, line), op, line)
911+
912+
846913
def transform_comparison_expr(builder: IRBuilder, e: ComparisonExpr) -> Value:
847914
# x in (...)/[...]
848915
# x not in (...)/[...]
@@ -853,6 +920,15 @@ def transform_comparison_expr(builder: IRBuilder, e: ComparisonExpr) -> Value:
853920
return result
854921

855922
if len(e.operators) == 1:
923+
# Codepoint fast path: char/char, char/s[i], or codepoint/1-char-literal
924+
# -> int compare instead of PyUnicode_Compare.
925+
if first_op in ("==", "!="):
926+
result = try_specialize_codepoint_compare(
927+
builder, first_op, e.operands[0], e.operands[1], e.line
928+
)
929+
if result is not None:
930+
return result
931+
856932
# Special some common simple cases
857933
if first_op in ("is", "is not"):
858934
right_expr = e.operands[1]

mypyc/irbuild/ll_builder.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@
104104
is_bool_or_bit_rprimitive,
105105
is_bytes_rprimitive,
106106
is_c_py_ssize_t_rprimitive,
107+
is_char_rprimitive,
107108
is_dict_rprimitive,
108109
is_fixed_width_rtype,
109110
is_float_rprimitive,
@@ -685,6 +686,7 @@ def coerce_short_int_to_fixed_width(self, src: Value, target_type: RType, line:
685686
def coerce_fixed_width_to_int(self, src: Value, line: int) -> Value:
686687
if (
687688
(is_int32_rprimitive(src.type) and PLATFORM_SIZE == 8)
689+
or (is_char_rprimitive(src.type) and PLATFORM_SIZE == 8)
688690
or is_int16_rprimitive(src.type)
689691
or is_uint8_rprimitive(src.type)
690692
):
@@ -717,7 +719,7 @@ def coerce_fixed_width_to_int(self, src: Value, line: int) -> Value:
717719
self.activate_block(slow)
718720
if is_int64_rprimitive(src_type):
719721
conv_op = int64_to_int_op
720-
elif is_int32_rprimitive(src_type):
722+
elif is_int32_rprimitive(src_type) or is_char_rprimitive(src_type):
721723
assert PLATFORM_SIZE == 4
722724
conv_op = ssize_t_to_int_op
723725
else:
@@ -2126,6 +2128,12 @@ def bool_value(self, value: Value) -> Value:
21262128
elif is_runtime_subtype(value.type, int_rprimitive):
21272129
zero = Integer(0, short_int_rprimitive)
21282130
result = self.comparison_op(value, zero, ComparisonOp.NEQ, value.line)
2131+
elif is_char_rprimitive(value.type):
2132+
# char is falsy only for the empty sentinel (-1). A codepoint of
2133+
# 0 (NUL) is a valid non-empty char and must be truthy, matching
2134+
# str bool semantics where ``"\0"`` is truthy.
2135+
empty = Integer(-1, value.type)
2136+
result = self.add(ComparisonOp(value, empty, ComparisonOp.NEQ))
21292137
elif is_fixed_width_rtype(value.type):
21302138
zero = Integer(0, value.type)
21312139
result = self.add(ComparisonOp(value, zero, ComparisonOp.NEQ))

0 commit comments

Comments
 (0)