Skip to content

Commit 6fba903

Browse files
davidfstr and claude committed
TypeForm: Add 7 more early-reject filters to semanal.py's try_parse_as_type_expression()
These filters reduce mypy's wall-clock slowdown when checking the mypy codebase after the introduction of TypeForm from +2.03% to +1.21%, as measured with `misc/perf_compare.py`. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 208f9b6 commit 6fba903

1 file changed

Lines changed: 130 additions & 1 deletion

File tree

mypy/semanal.py

Lines changed: 130 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,52 @@
375375
# string literal as a type expression.
376376
_MULTIPLE_WORDS_NONTYPE_RE = re.compile(r'\s*[^\s.\'"|\[]+\s+[^\s.\'"|\[]')
377377

378+
# Matches any valid Python identifier, including identifiers with Unicode characters.
379+
#
380+
# [^\d\W] = word character that is not a digit
381+
# \w = word character
382+
# \Z = match end of string; does not allow a trailing \n, unlike $
383+
_IDENTIFIER_RE = re.compile(r"^[^\d\W]\w*\Z", re.UNICODE)
384+
385+
# Matches if the string contains at least one identifier-start character
386+
# (letter or underscore).
387+
_CONTAINS_IDENTIFIER_RE = re.compile(r"[^\W\d]", re.UNICODE)
388+
389+
# Matches a dotted identifier (e.g. 'builtins.tuple', 'typing.Mapping', 'a.b.c').
390+
_DOTTED_IDENTIFIER_RE = re.compile(r"^[^\d\W]\w*(\.[^\d\W]\w*)+\Z", re.UNICODE)
391+
392+
# Matches a dotted name (one or more identifier components joined by '.').
393+
# Accepts a bare identifier with zero dots. Used to extract every
394+
# dotted identifier from inside a stringified type expression.
395+
_CONTAINED_DOTTED_IDENTIFIER_RE = re.compile(r"[^\W\d]\w*(?:\.[^\W\d]\w*)*", re.UNICODE)
396+
397+
# Matches several patterns that never appear in valid type expressions
398+
# NOTE: Allows '*' (for PEP 646 Unpack) and '+' (for Literal[+N])
399+
_NONTYPE_PATTERN_RE = re.compile(
400+
# Characters never valid in a type expression
401+
r"[!:/<>@%$^?;&~`\\]|"
402+
# '-' not directly preceded by '[' (which can occur in Literal[-N])
403+
# NOTE: Incorrectly rejects multi-element cases like Literal[-1, -2].
404+
# Such cases can appear in stringified type expressions but are
405+
# expected to be rare in practice.
406+
r"(?<!\[)-|"
407+
# Leading '.' (incomplete dotted name, file extension, etc)
408+
r"^\.|"
409+
# Trailing '.' (incomplete dotted name, file extension, etc)
410+
r"\.$"
411+
)
412+
413+
# Matches if the first character of the string is invalid as the start of
414+
# a type expression
415+
_NONTYPE_FIRST_CHAR_RE = re.compile(
416+
# Any non-word char other than '*' (which is reserved for PEP 646 Unpack:
417+
# 'tuple[int, *Ts]') or whitespace
418+
r"\A[^\s*\w]|"
419+
# A digit
420+
r"\A\d",
421+
re.UNICODE,
422+
)
423+
378424

379425
class SemanticAnalyzer(
380426
NodeVisitor[None], SemanticAnalyzerInterface, SemanticAnalyzerPluginInterface, SplittingVisitor
@@ -8090,6 +8136,26 @@ def try_parse_as_type_expression(self, maybe_type_expr: Expression) -> None:
80908136
return
80918137
elif isinstance(maybe_type_expr, StrExpr):
80928138
str_value = maybe_type_expr.value # cache
8139+
# (TODO: Experiment with the ordering of all the following filters,
8140+
# to frontload those most efficient at rejecting early.)
8141+
# Filter out string literals with no identifier-start characters
8142+
# (pure punctuation/digits/whitespace) which cannot be type expressions
8143+
if not _CONTAINS_IDENTIFIER_RE.search(str_value):
8144+
maybe_type_expr.as_type = None
8145+
return
8146+
# Filter out string literals whose first character cannot start a
8147+
# valid type expression (a digit, or punctuation other than '*').
8148+
# Leading whitespace is not skipped: such strings pass this filter.
8149+
if _NONTYPE_FIRST_CHAR_RE.match(str_value):
8150+
maybe_type_expr.as_type = None
8151+
return
8152+
# Filter out string literals with common patterns that could not
8153+
# possibly be in a type expression
8154+
if _MULTIPLE_WORDS_NONTYPE_RE.match(str_value):
8155+
# A common pattern in string literals containing a sentence,
8156+
# which cannot be a type expression.
8157+
maybe_type_expr.as_type = None
8158+
return
80938159
# Filter out string literals which look like an identifier but
80948160
# cannot be a type expression, for a few common reasons
80958161
if str_value.isidentifier():
@@ -8116,7 +8182,40 @@ def try_parse_as_type_expression(self, maybe_type_expr: Expression) -> None:
81168182
# 2. unbound_paramspec: f'ParamSpec "{name}" is unbound' [codes.VALID_TYPE]
81178183
maybe_type_expr.as_type = None
81188184
return
8119-
else: # does not look like an identifier
8185+
if (
8186+
isinstance(node, Var)
8187+
and isinstance(get_proper_type(node.type), Instance)
8188+
and not self.var_is_typing_special_form(node)
8189+
):
8190+
# Var whose declared type is a concrete instance: it is
8191+
# a value (local, parameter, module-level constant),
8192+
# not a type expression.
8193+
maybe_type_expr.as_type = None
8194+
return
8195+
if isinstance(node, (FuncDef, OverloadedFuncDef, MypyFile)):
8196+
# Functions and modules are never type expressions.
8197+
maybe_type_expr.as_type = None
8198+
return
8199+
elif _DOTTED_IDENTIFIER_RE.fullmatch(str_value):
8200+
# Dotted-name string (e.g. "builtins.tuple", "typing.Mapping").
8201+
# Look up the leftmost component; if it can't possibly be a
8202+
# type prefix, bail. Mirrors the IndexExpr-with-MemberExpr-base
8203+
# filter logic below.
8204+
leftmost = str_value.split(".", 1)[0]
8205+
sym = self.lookup(leftmost, UnboundType(leftmost), suppress_errors=True)
8206+
if sym is None:
8207+
# Leftmost component does not refer to anything in scope
8208+
maybe_type_expr.as_type = None
8209+
return
8210+
node = sym.node # cache
8211+
if isinstance(node, PlaceholderNode) and not node.becomes_typeinfo:
8212+
maybe_type_expr.as_type = None
8213+
return
8214+
if isinstance(node, Var) and not self.var_is_typing_special_form(node):
8215+
# Leftmost component is a Var: cannot be a type prefix
8216+
maybe_type_expr.as_type = None
8217+
return
8218+
else: # does not look like an identifier or dotted identifier
81208219
if '"' in str_value or "'" in str_value:
81218220
# Only valid inside a Literal[...] or Annotated[..., ...] type
81228221
if "[" not in str_value:
@@ -8135,6 +8234,34 @@ def try_parse_as_type_expression(self, maybe_type_expr: Expression) -> None:
81358234
# But cannot be a type expression.
81368235
maybe_type_expr.as_type = None
81378236
return
8237+
# Skip some checks when a non-zero even number of single or double quotes
8238+
# signals a possible Literal[...] component, whose quoted content
8239+
# could contain anything: symbols or identifiers that would be
8240+
# incorrectly processed by some checks.
8241+
sq = str_value.count("'")
8242+
dq = str_value.count('"')
8243+
if not ((sq > 0 and sq % 2 == 0) or (dq > 0 and dq % 2 == 0)):
8244+
# Filter out string literals containing characters or boundary
8245+
# patterns that never appear in valid type expressions
8246+
# (e.g. '/', ':', '<', '>', '@', leading/trailing '.').
8247+
if _NONTYPE_PATTERN_RE.search(str_value):
8248+
maybe_type_expr.as_type = None
8249+
return
8250+
# A string that can spell a valid type must contain 1+ dotted names,
8251+
# all of whose leftmost identifiers must exist in the local scope.
8252+
found = False
8253+
for m in _CONTAINED_DOTTED_IDENTIFIER_RE.finditer(str_value):
8254+
found = True
8255+
leftmost = m.group().split(".", 1)[0]
8256+
if (
8257+
self.lookup(leftmost, UnboundType(leftmost), suppress_errors=True)
8258+
is None
8259+
):
8260+
maybe_type_expr.as_type = None
8261+
return
8262+
if not found:
8263+
maybe_type_expr.as_type = None
8264+
return
81388265
elif isinstance(maybe_type_expr, IndexExpr):
81398266
if isinstance(maybe_type_expr.base, NameExpr):
81408267
if isinstance(
@@ -8208,6 +8335,8 @@ def var_is_typing_special_form(var: Var) -> bool:
82088335
"typing.Literal",
82098336
"typing_extensions.Literal",
82108337
"typing.Optional",
8338+
"typing.Self",
8339+
"typing_extensions.Self",
82118340
"typing.TypeGuard",
82128341
"typing_extensions.TypeGuard",
82138342
"typing.TypeIs",

0 commit comments

Comments
 (0)