Skip to content

Commit 208f9b6

Browse files
davidfstrclaude
andcommitted
TypeForm: Add instrumentation of full parses done in semanal.py's try_parse_as_type_expression()
Specifically: - If you set MYPY_TYPEFORM_PROFILE_FULL_PARSE environment variable, mypy will output a .tsv to that filepath which characterizes the kinds of Expressions that try_parse_as_type_expression() in semanal.py was forced to do a full parse of, which was not rejected early. - A misc/analyze_typeform_full_parse_profile.py script is added which takes those .tsvs and prints an expression-time summary (by total time) plus top-N descriptors per FAIL class. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent cb0ae01 commit 208f9b6

3 files changed

Lines changed: 256 additions & 17 deletions

File tree

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Aggregate the full-parse profile log produced by mypy's
4+
SemanticAnalyzer.try_parse_as_type_expression() when run with
5+
MYPY_TYPEFORM_PROFILE_FULL_PARSE set.
6+
7+
Usage:
8+
# 1. Run mypy with the profile env var set; per-PID log files are
9+
# written as "<path>.<pid>":
10+
MYPY_TYPEFORM_PROFILE_FULL_PARSE=/tmp/tf.log \\
11+
python3 -m mypy --no-incremental -p your_package
12+
13+
# 2. Aggregate one or more per-PID files:
14+
python3 misc/analyze_typeform_full_parse_profile.py /tmp/tf.log.*
15+
16+
# Optional: limit per-descriptor breakdown to top N rows per class.
17+
python3 misc/analyze_typeform_full_parse_profile.py --top 20 /tmp/tf.log.*
18+
19+
The script summarizes which (outcome, kind, subkind) classes account for
20+
the most full-parse time, and lists the top descriptors within each
21+
FAIL class -- the populations worth targeting with cheaper pre-filters
22+
upstream in try_parse_as_type_expression.
23+
24+
See also:
25+
- mypy/semanal.py: SemanticAnalyzer.try_parse_as_type_expression()
26+
- mypy/semanal.py: _log_typeform_full_parse() (TSV schema docstring)
27+
- misc/analyze_typeform_stats.py (aggregate counters via --dump-build-stats)
28+
"""
29+
30+
from __future__ import annotations
31+
32+
import argparse
33+
import statistics
34+
import sys
35+
from collections import defaultdict
36+
from collections.abc import Iterable
37+
38+
39+
def read_rows(paths: Iterable[str]) -> list[tuple[str, str, str, str, int]]:
40+
rows: list[tuple[str, str, str, str, int]] = []
41+
for path in paths:
42+
with open(path) as f:
43+
for line in f:
44+
# Skip header lines (each per-PID file starts with one).
45+
if line.startswith("outcome\t"):
46+
continue
47+
parts = line.rstrip("\n").split("\t")
48+
if len(parts) < 5:
49+
continue
50+
outcome, kind, subkind, desc, dur_ns_str = parts[:5]
51+
try:
52+
dur_ns = int(dur_ns_str)
53+
except ValueError:
54+
continue
55+
rows.append((outcome, kind, subkind, desc, dur_ns))
56+
return rows
57+
58+
59+
def print_class_summary(rows: list[tuple[str, str, str, str, int]]) -> None:
60+
buckets: dict[tuple[str, str, str], list[int]] = defaultdict(list)
61+
total_ns = 0
62+
for outcome, kind, subkind, _desc, dur_ns in rows:
63+
buckets[(outcome, kind, subkind)].append(dur_ns)
64+
total_ns += dur_ns
65+
66+
print("Class summary (by total time):")
67+
print("=" * 80)
68+
print(f"{'count':>7} {'total_ms':>10} {'mean_us':>9} {'med_us':>9} {'pct':>6} class")
69+
print("-" * 80)
70+
ordered = sorted(
71+
(
72+
(sum(d), len(d), statistics.mean(d), statistics.median(d), key)
73+
for key, d in buckets.items()
74+
),
75+
reverse=True,
76+
)
77+
for total, n, mean, med, key in ordered:
78+
pct = (100 * total / total_ns) if total_ns else 0
79+
outcome, kind, subkind = key
80+
print(
81+
f"{n:>7} {total/1e6:>10.2f} {mean/1e3:>9.1f} {med/1e3:>9.1f} "
82+
f"{pct:>5.1f}% {outcome} {kind} {subkind}"
83+
)
84+
print("-" * 80)
85+
print(f"TOTAL: {len(rows):,} events, {total_ns/1e6:.2f} ms")
86+
87+
88+
def print_fail_descriptors(rows: list[tuple[str, str, str, str, int]], top_n: int) -> None:
89+
# Group FAIL rows by (kind, subkind) class, then by descriptor within each.
90+
by_class: dict[tuple[str, str, str], dict[str, list[int]]] = defaultdict(
91+
lambda: defaultdict(list)
92+
)
93+
for outcome, kind, subkind, desc, dur_ns in rows:
94+
if outcome != "FAIL":
95+
continue
96+
by_class[(outcome, kind, subkind)][desc].append(dur_ns)
97+
98+
# Order classes by total FAIL time, descending.
99+
class_totals = sorted(
100+
((sum(sum(d) for d in descs.values()), key, descs) for key, descs in by_class.items()),
101+
reverse=True,
102+
)
103+
for total_ns, key, descs in class_totals:
104+
outcome, kind, subkind = key
105+
print()
106+
print(
107+
f"Top {top_n} descriptors in {outcome} {kind} {subkind} "
108+
f"(class total {total_ns/1e6:.2f} ms):"
109+
)
110+
print("-" * 80)
111+
print(f"{'count':>6} {'total_ms':>10} {'mean_us':>9} descriptor")
112+
rows_d = sorted(
113+
((sum(d), len(d), statistics.mean(d), desc) for desc, d in descs.items()), reverse=True
114+
)
115+
for tot, n, mean, desc in rows_d[:top_n]:
116+
print(f"{n:>6} {tot/1e6:>10.3f} {mean/1e3:>9.1f} {desc!r}")
117+
if len(rows_d) > top_n:
118+
print(f"... {len(rows_d) - top_n} more descriptors")
119+
120+
121+
def main() -> None:
122+
parser = argparse.ArgumentParser(
123+
formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__
124+
)
125+
parser.add_argument(
126+
"files", nargs="+", help="One or more per-PID profile files (e.g. /tmp/tf.log.*)"
127+
)
128+
parser.add_argument(
129+
"--top",
130+
type=int,
131+
default=20,
132+
help="Max number of descriptors to list per FAIL class (default: 20)",
133+
)
134+
args = parser.parse_args()
135+
136+
rows = read_rows(args.files)
137+
if not rows:
138+
print("No data rows found in input files.", file=sys.stderr)
139+
sys.exit(1)
140+
141+
print_class_summary(rows)
142+
print_fail_descriptors(rows, args.top)
143+
144+
145+
if __name__ == "__main__":
146+
main()

misc/analyze_typeform_stats.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
2727
See also:
2828
- mypy/semanal.py: SemanticAnalyzer.try_parse_as_type_expression()
29-
- mypy/semanal.py: DEBUG_TYPE_EXPRESSION_FULL_PARSE_FAILURES
3029
"""
3130

3231
import re
@@ -70,6 +69,7 @@ def analyze_stats(output: str) -> None:
7069
print(
7170
f" - Expensive failed full parses: {failures:,} ({(failures / total * 100):.1f}% of all calls)"
7271
)
72+
print(" - Analyze further with misc/analyze_typeform_full_parse_profile.py")
7373

7474

7575
if __name__ == "__main__":

mypy/semanal.py

Lines changed: 109 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,12 @@
5050

5151
from __future__ import annotations
5252

53+
import os
5354
import re
55+
import time
5456
from collections.abc import Callable, Collection, Iterable, Iterator
5557
from contextlib import contextmanager
56-
from typing import Any, Final, TypeAlias as _TypeAlias, TypeGuard, TypeVar, cast
58+
from typing import Any, Final, TextIO, TypeAlias as _TypeAlias, TypeGuard, TypeVar, cast
5759
from typing_extensions import assert_never
5860

5961
from mypy import errorcodes as codes, message_registry
@@ -320,11 +322,18 @@
320322
T = TypeVar("T")
321323

322324

323-
# Whether to print diagnostic information for failed full parses
324-
# in SemanticAnalyzer.try_parse_as_type_expression().
325+
# Instrumentation: If non-None, every expression that reaches the expensive
326+
# full-parse block of SemanticAnalyzer.try_parse_as_type_expression()
327+
# is logged to a .tsv by log_typeform_full_parse().
325328
#
326-
# See also: misc/analyze_typeform_stats.py
327-
DEBUG_TYPE_EXPRESSION_FULL_PARSE_FAILURES: Final = False
329+
# See also:
330+
# - misc/analyze_typeform_full_parse_profile.py
331+
# - misc/analyze_typeform_stats.py
332+
_TYPEFORM_PROFILE_FULL_PARSE_PATH: Final = os.environ.get("MYPY_TYPEFORM_PROFILE_FULL_PARSE")
333+
_typeform_full_parse_log_file: TextIO | None = None
334+
335+
# TSV column names for the full-parse profile log
336+
_TYPEFORM_PROFILE_FULL_PARSE_HEADER = "outcome\tkind\tsubkind\tdescriptor\tdur_ns\n"
328337

329338

330339
FUTURE_IMPORTS: Final = {
@@ -8164,6 +8173,9 @@ def try_parse_as_type_expression(self, maybe_type_expr: Expression) -> None:
81648173
else:
81658174
assert_never(maybe_type_expr)
81668175

8176+
full_parse_t0 = (
8177+
time.perf_counter_ns() if _TYPEFORM_PROFILE_FULL_PARSE_PATH is not None else 0
8178+
)
81678179
with self.isolated_error_analysis():
81688180
try:
81698181
t = self.expr_to_analyzed_type(maybe_type_expr)
@@ -8173,17 +8185,6 @@ def try_parse_as_type_expression(self, maybe_type_expr: Expression) -> None:
81738185
# Not a type expression
81748186
t = None
81758187

8176-
if DEBUG_TYPE_EXPRESSION_FULL_PARSE_FAILURES and t is None:
8177-
original_flushed_files = set(self.errors.flushed_files) # save
8178-
try:
8179-
errors = self.errors.new_messages() # capture
8180-
finally:
8181-
self.errors.flushed_files = original_flushed_files # restore
8182-
8183-
print(
8184-
f"SA.try_parse_as_type_expression: Full parse failure: {maybe_type_expr}, errors={errors!r}"
8185-
)
8186-
81878188
# Count full parse attempts for profiling
81888189
if t is not None:
81898190
self.type_expression_full_parse_success_count += 1
@@ -8192,6 +8193,12 @@ def try_parse_as_type_expression(self, maybe_type_expr: Expression) -> None:
81928193

81938194
maybe_type_expr.as_type = t
81948195

8196+
if _TYPEFORM_PROFILE_FULL_PARSE_PATH is not None:
8197+
full_parse_t1 = time.perf_counter_ns()
8198+
self.log_typeform_full_parse(
8199+
maybe_type_expr, t is not None, full_parse_t1 - full_parse_t0
8200+
)
8201+
81958202
@staticmethod
81968203
def var_is_typing_special_form(var: Var) -> bool:
81978204
return var.fullname.startswith("typing") and var.fullname in [
@@ -8208,6 +8215,92 @@ def var_is_typing_special_form(var: Var) -> bool:
82088215
"typing.Union",
82098216
]
82108217

8218+
@staticmethod
8219+
def log_typeform_full_parse(expr: Expression, ok: bool, dur_ns: int) -> None:
8220+
"""Log one entry into the full-parse block of try_parse_as_type_expression.
8221+
8222+
Active only when the MYPY_TYPEFORM_PROFILE_FULL_PARSE environment variable
8223+
is set to a file path. Each mypy process (worker) writes to its own file
8224+
named "<path>.<pid>" to avoid contention; concatenating those files yields
8225+
the complete profile. Aggregate with misc/analyze_typeform_full_parse_profile.py.
8226+
8227+
Output is tab-separated with one row per full-parse attempt:
8228+
8229+
outcome "OK" if as_type was set, "FAIL" if the full parse rejected
8230+
the expression (either by raising TypeTranslationError or by
8231+
emitting errors during analysis).
8232+
kind AST node kind: StrExpr | IndexExpr | OpExpr | (other).
8233+
subkind For StrExpr: "ident", "dotident", or "other" (based on the
8234+
string's shape). For IndexExpr: "Name" or "Member" (base
8235+
kind). For OpExpr: always "|" (no other op reaches here).
8236+
descriptor Short, type-specific identifier for the expression:
8237+
StrExpr -> the string value, truncated to 80 chars
8238+
(with " (N)" suffix when truncated).
8239+
IndexExpr -> the full stringified expression (str(expr),
8240+
with tabs/newlines escaped).
8241+
OpExpr -> the full stringified expression (str(expr),
8242+
with tabs/newlines escaped).
8243+
dur_ns Wall-clock nanoseconds spent in the full-parse block for
8244+
this expression (measured around expr_to_analyzed_type
8245+
plus the surrounding isolated_error_analysis ctx).
8246+
8247+
The first line of each file is the column header (same as above).
8248+
"""
8249+
global _typeform_full_parse_log_file
8250+
if _typeform_full_parse_log_file is None:
8251+
assert _TYPEFORM_PROFILE_FULL_PARSE_PATH is not None
8252+
_typeform_full_parse_log_file = open(
8253+
f"{_TYPEFORM_PROFILE_FULL_PARSE_PATH}.{os.getpid()}", "a", buffering=1
8254+
)
8255+
_typeform_full_parse_log_file.write(_TYPEFORM_PROFILE_FULL_PARSE_HEADER)
8256+
outcome = "OK" if ok else "FAIL"
8257+
if isinstance(expr, StrExpr):
8258+
raw = expr.value
8259+
val = (
8260+
raw[:80]
8261+
.replace("\\", "\\\\")
8262+
.replace("\t", "\\t")
8263+
.replace("\n", "\\n")
8264+
.replace("\r", "\\r")
8265+
)
8266+
if len(raw) > 80:
8267+
val += f" ({len(raw)})"
8268+
if _IDENTIFIER_RE.fullmatch(raw):
8269+
subkind = "ident"
8270+
elif _DOTTED_IDENTIFIER_RE.fullmatch(raw):
8271+
subkind = "dotident"
8272+
else:
8273+
subkind = "other"
8274+
line = f"{outcome}\tStrExpr\t{subkind}\t{val}\t{dur_ns}\n"
8275+
elif isinstance(expr, IndexExpr):
8276+
base = expr.base
8277+
if isinstance(base, NameExpr):
8278+
subkind = "Name"
8279+
elif isinstance(base, MemberExpr):
8280+
subkind = "Member"
8281+
else:
8282+
subkind = type(base).__name__
8283+
desc = (
8284+
str(expr)
8285+
.replace("\\", "\\\\")
8286+
.replace("\t", "\\t")
8287+
.replace("\n", "\\n")
8288+
.replace("\r", "\\r")
8289+
)
8290+
line = f"{outcome}\tIndexExpr\t{subkind}\t{desc}\t{dur_ns}\n"
8291+
elif isinstance(expr, OpExpr):
8292+
desc = (
8293+
str(expr)
8294+
.replace("\\", "\\\\")
8295+
.replace("\t", "\\t")
8296+
.replace("\n", "\\n")
8297+
.replace("\r", "\\r")
8298+
)
8299+
line = f"{outcome}\tOpExpr\t|\t{desc}\t{dur_ns}\n"
8300+
else:
8301+
line = f"{outcome}\t{type(expr).__name__}\t\t\t{dur_ns}\n"
8302+
_typeform_full_parse_log_file.write(line)
8303+
82118304
@contextmanager
82128305
def isolated_error_analysis(self) -> Iterator[None]:
82138306
"""

0 commit comments

Comments
 (0)