Skip to content

Commit e1a2749

Browse files
committed
[mypyc] Add native char type + codepoint fast paths for str ops
Adds a first-class `char` native type to mypyc, modeled on i64: stored unboxed as int32 codepoint, with -1 as the empty-string sentinel, and bidirectional str<->char promotion. Unblocks several codepoint-level fast paths in per-char loops.

Core type plumbing:
- `MYPYC_NATIVE_CHAR_NAMES` alongside `MYPYC_NATIVE_INT_NAMES`
- str <-> char bidirectional `_promote` in semanal_classprop
- str covers char in subtypes.covers_at_runtime + overlap in meet
- `char_rprimitive` (int32, is_native_int, error_overlap=False)
- `mypy_extensions.char` stub with `.is*()`, `.upper()`, `.strip()`

Boxing / unboxing:
- `CPyChar_FromObject` (accepts 0/1-char str, -113 on type error)
- `CPyChar_ToStr` (uses interned empty-str singleton for -1)
- `bool(char)` checks `!= -1`, not `!= 0`, so "\0" stays truthy

Codegen fast paths:
- `char == char` / `char == "x"` / `s[i] == "x"` specializers in transform_comparison_expr compile to int compare of the codepoint
- `ord(s[i])` refactored to share the codepoint read path
- `char.isspace/isdigit/isalnum/isalpha/isidentifier/upper` method_ops route to codepoint-taking C helpers in str_extra_ops.h

Two new IR transform passes (run after lower_ir, before dep collection):
- char_str_index_fold: folds `Unbox(CPyStr_GetItem(s, i) -> char)` to a direct `CPyStr_GetCharAt` int32 read, avoiding the 1-char PyObject alloc
- str_buffer_hoist: for function-arg strings, hoists PyUnicode_KIND/DATA reads out of per-char loops (strings are immutable so it's safe)

Also adds `str.isalpha()` method_op via `CPyStr_IsAlpha`.

Perf (sqlglot parse benchmarks, char vs stock mypyc):
- tpch: +91.6% (1.27ms -> 0.66ms)
- deep_arithmetic: +80.7%
- many_numbers: +26.5%
- geomean: +17.6% across 16 queries
1 parent 6cebac5 commit e1a2749

18 files changed

Lines changed: 682 additions & 25 deletions

File tree

mypy/meet.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
)
1818
from mypy.typeops import is_recursive_pair, make_simplified_union, tuple_fallback
1919
from mypy.types import (
20+
MYPYC_NATIVE_CHAR_NAMES,
2021
MYPYC_NATIVE_INT_NAMES,
2122
TUPLE_LIKE_INSTANCE_NAMES,
2223
AnyType,
@@ -592,6 +593,8 @@ def _type_object_overlap(left: Type, right: Type) -> bool:
592593

593594
if right.type.fullname == "builtins.int" and left.type.fullname in MYPYC_NATIVE_INT_NAMES:
594595
return True
596+
if right.type.fullname == "builtins.str" and left.type.fullname in MYPYC_NATIVE_CHAR_NAMES:
597+
return True
595598

596599
# Two unrelated types cannot be partially overlapping: they're disjoint.
597600
if left.type.has_base(right.type.fullname):

mypy/semanal_classprop.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,12 @@
2222
Var,
2323
)
2424
from mypy.options import Options
25-
from mypy.types import MYPYC_NATIVE_INT_NAMES, Instance, ProperType
25+
from mypy.types import (
26+
MYPYC_NATIVE_CHAR_NAMES,
27+
MYPYC_NATIVE_INT_NAMES,
28+
Instance,
29+
ProperType,
30+
)
2631

2732
# Hard coded type promotions (shared between all Python versions).
2833
# These add extra ad-hoc edges to the subtyping relation. For example,
@@ -184,5 +189,13 @@ def add_type_promotion(
184189
assert isinstance(int_sym.node, TypeInfo)
185190
int_sym.node._promote.append(Instance(defn.info, []))
186191
defn.info.alt_promote = Instance(int_sym.node, [])
192+
# Same pattern for str <-> char: char is a native-string type that
193+
# freely interconverts with str (stored unboxed as a codepoint under
194+
# mypyc).
195+
if defn.fullname in MYPYC_NATIVE_CHAR_NAMES:
196+
str_sym = builtin_names["str"]
197+
assert isinstance(str_sym.node, TypeInfo)
198+
str_sym.node._promote.append(Instance(defn.info, []))
199+
defn.info.alt_promote = Instance(str_sym.node, [])
187200
if promote_targets:
188201
defn.info._promote.extend(promote_targets)

mypy/subtypes.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from mypy.options import Options
3737
from mypy.state import state
3838
from mypy.types import (
39+
MYPYC_NATIVE_CHAR_NAMES,
3940
MYPYC_NATIVE_INT_NAMES,
4041
TUPLE_LIKE_INSTANCE_NAMES,
4142
TYPED_NAMEDTUPLE_NAMES,
@@ -2166,6 +2167,10 @@ def covers_at_runtime(item: Type, supertype: Type) -> bool:
21662167
# "int" covers all native int types
21672168
if item.type.fullname in MYPYC_NATIVE_INT_NAMES:
21682169
return True
2170+
elif isinstance(item, Instance) and supertype.type.fullname == "builtins.str":
2171+
# "str" covers the native char type
2172+
if item.type.fullname in MYPYC_NATIVE_CHAR_NAMES:
2173+
return True
21692174
# TODO: Add more special cases.
21702175
return False
21712176

mypy/types.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,12 @@
190190
"mypy_extensions.u8",
191191
)
192192

193+
# Mypyc native char type (compatible with builtins.str). str values are
194+
# implicitly usable where a ``char`` is expected (and vice versa), similar to
195+
# how int<->i64 works. Under mypyc, ``char`` is stored unboxed as an int32
196+
# codepoint with -1 as the empty-string sentinel.
197+
MYPYC_NATIVE_CHAR_NAMES: Final = ("mypy_extensions.char",)
198+
193199
DATACLASS_TRANSFORM_NAMES: Final = (
194200
"typing.dataclass_transform",
195201
"typing_extensions.dataclass_transform",

mypy/typeshed/stubs/mypy-extensions/mypy_extensions.pyi

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,3 +220,26 @@ class u8:
220220
def __gt__(self, x: u8) -> bool: ...
221221
def __index__(self) -> int: ...
222222
def __eq__(self, x: object) -> bool: ...
223+
224+
# char represents at most one Unicode codepoint. At the type-check level it's
225+
# bidirectionally compatible with str via the mypyc native-string promotion
226+
# mechanism. Under mypyc a char is stored unboxed as an int32 codepoint
227+
# (with -1 for the empty sentinel).
228+
class char:
229+
def __new__(cls, __x: str = ...) -> char: ...
230+
def __len__(self) -> int: ...
231+
def __eq__(self, x: object) -> bool: ...
232+
def __ne__(self, x: object) -> bool: ...
233+
def __hash__(self) -> int: ...
234+
# Mixed char/str concat. Result is str.
235+
def __add__(self, x: str) -> str: ...
236+
def __radd__(self, x: str) -> str: ...
237+
def isspace(self) -> bool: ...
238+
def isdigit(self) -> bool: ...
239+
def isalnum(self) -> bool: ...
240+
def isalpha(self) -> bool: ...
241+
def isidentifier(self) -> bool: ...
242+
# ASCII-only uppercasing (a-z -> A-Z, non-ASCII passes through).
243+
# Returns char so ``ch.upper() == "X"`` specializes to an int compare.
244+
def upper(self) -> char: ...
245+
def strip(self, __chars: str | None = ...) -> str: ...

mypyc/codegen/emit.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
is_bool_or_bit_rprimitive,
4545
is_bytearray_rprimitive,
4646
is_bytes_rprimitive,
47+
is_char_rprimitive,
4748
is_dict_rprimitive,
4849
is_fixed_width_rtype,
4950
is_float_rprimitive,
@@ -1045,6 +1046,16 @@ def emit_unbox(
10451046
self.emit_line(f"{dest} = CPyLong_AsInt16({src});")
10461047
if not isinstance(error, AssignHandler):
10471048
self.emit_unbox_failure_with_overlapping_error_value(dest, typ, failure)
1049+
elif is_char_rprimitive(typ):
1050+
# char is stored as int32 codepoint. Error sentinel: -113
1051+
# (type/value error). Empty-string sentinel: -1. Both are outside
1052+
# the valid Unicode codepoint range 0..0x10FFFF.
1053+
assert not optional
1054+
if declare_dest:
1055+
self.emit_line(f"int32_t {dest};")
1056+
self.emit_line(f"{dest} = CPyChar_FromObject({src});")
1057+
if not isinstance(error, AssignHandler):
1058+
self.emit_unbox_failure_with_overlapping_error_value(dest, typ, failure)
10481059
elif is_uint8_rprimitive(typ):
10491060
# Whether we are borrowing or not makes no difference.
10501061
assert not optional # Not supported for overlapping error values
@@ -1192,6 +1203,11 @@ def emit_box(
11921203
self.emit_inc_ref(dest, object_rprimitive)
11931204
elif is_int32_rprimitive(typ) or is_int16_rprimitive(typ) or is_uint8_rprimitive(typ):
11941205
self.emit_line(f"{declaration}{dest} = PyLong_FromLong({src});")
1206+
elif is_char_rprimitive(typ):
1207+
# char -> str: handles both the empty sentinel (-1 -> "") and
1208+
# valid codepoints (-> 1-char str). Error handled by caller if
1209+
# allocation fails (NULL return).
1210+
self.emit_line(f"{declaration}{dest} = CPyChar_ToStr({src});")
11951211
elif is_int64_rprimitive(typ):
11961212
self.emit_line(f"{declaration}{dest} = PyLong_FromLongLong({src});")
11971213
elif is_float_rprimitive(typ):

mypyc/codegen/emitmodule.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,15 @@
6666
from mypyc.irbuild.prepare import load_type_map
6767
from mypyc.namegen import NameGenerator, exported_name
6868
from mypyc.options import CompilerOptions
69+
from mypyc.transform.char_str_index_fold import do_char_str_index_fold
6970
from mypyc.transform.copy_propagation import do_copy_propagation
7071
from mypyc.transform.exceptions import insert_exception_handling
7172
from mypyc.transform.flag_elimination import do_flag_elimination
7273
from mypyc.transform.log_trace import insert_event_trace_logging
7374
from mypyc.transform.lower import lower_ir
7475
from mypyc.transform.refcount import insert_ref_count_opcodes
7576
from mypyc.transform.spill import insert_spills
77+
from mypyc.transform.str_buffer_hoist import do_str_buffer_hoist
7678
from mypyc.transform.uninit import insert_uninit_checks
7779

7880
# All the modules being compiled are divided into "groups". A group
@@ -265,11 +267,16 @@ def compile_scc_to_ir(
265267

266268
# Switch to lower abstraction level IR.
267269
lower_ir(fn, compiler_options)
270+
# Perform optimizations that may introduce new primitives with
271+
# their own source/header dependencies (e.g. str_extra_ops.h).
272+
# Run them before dependency collection so their needs are picked up.
273+
do_char_str_index_fold(fn, compiler_options)
274+
do_str_buffer_hoist(fn, compiler_options)
268275
# Calculate implicit module dependencies (needed for librt)
269276
deps = find_implicit_op_dependencies(fn)
270277
if deps is not None:
271278
module.dependencies.update(deps)
272-
# Perform optimizations.
279+
# Remaining optimizations.
273280
do_copy_propagation(fn, compiler_options)
274281
do_flag_elimination(fn, compiler_options)
275282

mypyc/ir/rtypes.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,28 @@ def __hash__(self) -> int:
394394
error_overlap=True,
395395
)
396396

397+
# char represents a single Unicode codepoint. Stored unboxed as int32 (the
398+
# full Unicode codepoint range of 0..0x10FFFF fits comfortably). Boxed form
399+
# is a 1-character str. Distinct from int32_rprimitive so specializers can
400+
# route char-typed operations through codepoint primitives.
401+
char_rprimitive: Final = RPrimitive(
402+
"char",
403+
is_unboxed=True,
404+
is_refcounted=False,
405+
# Marked native_int so mypyc's fixed-width coercion paths handle
406+
# char <-> int32 transparently. Semantically char is a codepoint rather
407+
# than a general integer; users shouldn't do arithmetic on it.
408+
is_native_int=True,
409+
is_signed=True,
410+
ctype="int32_t",
411+
size=4,
412+
# error_overlap=False: the error sentinel -113 is outside the valid Unicode
413+
# codepoint range (0..0x10FFFF), so the sentinel compare alone is
414+
# authoritative — no PyErr_Occurred confirmation needed. This eliminates
415+
# the double error check that ERR_MAGIC_OVERLAPPING requires.
416+
error_overlap=False,
417+
)
418+
397419
# The following unsigned native int types (u16, u32, u64) are not
398420
# exposed to the user. They are for internal use within mypyc only.
399421

@@ -592,13 +614,18 @@ def is_fixed_width_rtype(rtype: RType) -> TypeGuard[RPrimitive]:
592614
or is_int32_rprimitive(rtype)
593615
or is_int16_rprimitive(rtype)
594616
or is_uint8_rprimitive(rtype)
617+
or is_char_rprimitive(rtype)
595618
)
596619

597620

598621
def is_uint8_rprimitive(rtype: RType) -> TypeGuard[RPrimitive]:
599622
return rtype is uint8_rprimitive
600623

601624

625+
def is_char_rprimitive(rtype: RType) -> TypeGuard[RPrimitive]:
626+
return rtype is char_rprimitive
627+
628+
602629
def is_uint32_rprimitive(rtype: RType) -> TypeGuard[RPrimitive]:
603630
return rtype is uint32_rprimitive
604631

mypyc/irbuild/expression.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,10 @@
8181
RTuple,
8282
RVec,
8383
bool_rprimitive,
84+
char_rprimitive,
8485
int_rprimitive,
8586
is_any_int,
87+
is_char_rprimitive,
8688
is_fixed_width_rtype,
8789
is_int64_rprimitive,
8890
is_int_rprimitive,
@@ -91,6 +93,7 @@
9193
is_object_rprimitive,
9294
object_rprimitive,
9395
set_rprimitive,
96+
short_int_rprimitive,
9497
)
9598
from mypyc.irbuild.ast_helpers import is_borrow_friendly_expr, process_conditional
9699
from mypyc.irbuild.builder import IRBuilder, int_borrow_friendly_op
@@ -115,6 +118,7 @@
115118
apply_method_specialization,
116119
translate_object_new,
117120
translate_object_setattr,
121+
try_emit_str_index_as_int,
118122
)
119123
from mypyc.irbuild.vec import (
120124
vec_append,
@@ -838,6 +842,63 @@ def precompute_set_literal(builder: IRBuilder, s: SetExpr) -> Value | None:
838842
return None
839843

840844

845+
def try_specialize_str_char_compare(
    builder: IRBuilder, op: str, lhs: Expression, rhs: Expression, line: int
) -> Value | None:
    """Try to lower ``s[i] == 'x'`` / ``s[i] != 'x'`` to a codepoint compare.

    One operand must be an index expression and the other must constant-fold
    to a string of exactly one character; either ordering is accepted.  The
    comparison is then emitted as an integer compare between the codepoint
    read produced by ``try_emit_str_index_as_int`` and ``ord`` of the
    literal, avoiding allocation of a single-char PyObject and the full
    PyUnicode_Compare path.

    Args:
        builder: IR builder used to emit the specialized ops.
        op: Comparison operator; only ``"=="`` and ``"!="`` are specialized.
        lhs: Left operand expression.
        rhs: Right operand expression.
        line: Source line to attach to the generated ops.

    Returns:
        The value of the integer comparison, or None if the expression does
        not match the pattern (the caller falls back to the generic path).
    """
    # Only (in)equality maps onto a plain codepoint compare.  Guard here as
    # well as at the call site, mirroring try_specialize_char_compare, so that
    # other operators can never be silently rewritten.
    if op not in ("==", "!="):
        return None
    for index_side, literal_side in ((lhs, rhs), (rhs, lhs)):
        if not isinstance(index_side, IndexExpr):
            continue
        folded = constant_fold_expr(builder, literal_side)
        if not isinstance(folded, str) or len(folded) != 1:
            continue
        # The helper decides whether this index expression qualifies
        # (presumably a str base with an int-like index -- confirm against
        # its definition) and returns None when it does not.
        char_int = try_emit_str_index_as_int(builder, index_side)
        if char_int is None:
            continue
        literal_int = Integer(ord(folded), short_int_rprimitive, line)
        return builder.binary_op(char_int, literal_int, op, line)
    return None
866+
867+
868+
def try_specialize_char_compare(
    builder: IRBuilder, op: str, lhs: Expression, rhs: Expression, line: int
) -> Value | None:
    """Lower equality tests on ``char`` operands to a codepoint int compare.

    Handles ``char == char`` as well as ``char`` compared against a string
    literal of length zero or one (in either operand order), plus the ``!=``
    variants.  Returns None when the comparison does not fit, so the caller
    can fall back to the generic path.
    """
    if op != "==" and op != "!=":
        return None

    left_is_char = is_char_rprimitive(builder.node_type(lhs))
    right_is_char = is_char_rprimitive(builder.node_type(rhs))

    # char == char: compare the two unboxed codepoints directly.
    if left_is_char and right_is_char:
        left_val = builder.accept(lhs)
        right_val = builder.accept(rhs)
        return builder.binary_op(left_val, right_val, op, line)

    # Past this point at most one operand is a char; the other one has to be
    # a constant string for the specialization to apply.
    if left_is_char:
        char_expr, other_expr = lhs, rhs
    elif right_is_char:
        char_expr, other_expr = rhs, lhs
    else:
        return None

    literal = constant_fold_expr(builder, other_expr)
    if not isinstance(literal, str) or len(literal) > 1:
        return None

    char_val = builder.accept(char_expr)
    # The empty string "" is encoded as the char empty sentinel -1; a 1-char
    # literal is represented by its ord.
    encoded = ord(literal) if literal else -1
    return builder.binary_op(char_val, Integer(encoded, char_rprimitive, line), op, line)
900+
901+
841902
def transform_comparison_expr(builder: IRBuilder, e: ComparisonExpr) -> Value:
842903
# x in (...)/[...]
843904
# x not in (...)/[...]
@@ -848,6 +909,15 @@ def transform_comparison_expr(builder: IRBuilder, e: ComparisonExpr) -> Value:
848909
return result
849910

850911
if len(e.operators) == 1:
912+
# Codepoint fast paths for equality:
913+
# char == char / char == "c" -> int compare
914+
# s[i] == "c" -> int compare against codepoint
915+
if first_op in ("==", "!="):
916+
for specializer in (try_specialize_char_compare, try_specialize_str_char_compare):
917+
result = specializer(builder, first_op, e.operands[0], e.operands[1], e.line)
918+
if result is not None:
919+
return result
920+
851921
# Special-case some common simple cases
852922
if first_op in ("is", "is not"):
853923
right_expr = e.operands[1]

mypyc/irbuild/ll_builder.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@
104104
is_bool_or_bit_rprimitive,
105105
is_bytes_rprimitive,
106106
is_c_py_ssize_t_rprimitive,
107+
is_char_rprimitive,
107108
is_dict_rprimitive,
108109
is_fixed_width_rtype,
109110
is_float_rprimitive,
@@ -2126,6 +2127,12 @@ def bool_value(self, value: Value) -> Value:
21262127
elif is_runtime_subtype(value.type, int_rprimitive):
21272128
zero = Integer(0, short_int_rprimitive)
21282129
result = self.comparison_op(value, zero, ComparisonOp.NEQ, value.line)
2130+
elif is_char_rprimitive(value.type):
2131+
# char is falsy only for the empty sentinel (-1). A codepoint of
2132+
# 0 (NUL) is a valid non-empty char and must be truthy, matching
2133+
# str bool semantics where ``"\0"`` is truthy.
2134+
empty = Integer(-1, value.type)
2135+
result = self.add(ComparisonOp(value, empty, ComparisonOp.NEQ))
21292136
elif is_fixed_width_rtype(value.type):
21302137
zero = Integer(0, value.type)
21312138
result = self.add(ComparisonOp(value, zero, ComparisonOp.NEQ))

0 commit comments

Comments
 (0)