Skip to content

Commit 4772c73

Browse files
committed
fix(perf): harden scanner root filtering and optimize report snippet/explain paths
1 parent 7f7eeb7 commit 4772c73

5 files changed

Lines changed: 175 additions & 54 deletions

File tree

CHANGELOG.md

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Changelog
22

3-
## [2.0.0b1] - 2026-03-09
3+
## [2.0.0b1]
44

55
CodeClone 2.0 is a major upgrade that expands the project from a structural clone detector into a broader
66
**baseline-aware code-health and CI governance tool** for Python.
@@ -23,6 +23,23 @@ Compatibility remains a first-class concern in this release:
2323
This is a beta release intended to validate the new architecture, reporting surface, and performance profile before the
2424
final `2.0.0` release.
2525

26+
### Fixes (feat/2.0.0)
27+
28+
- Fixed scanner root-exclude short-circuit: only an explicitly excluded root
29+
directory is skipped; excluded segments in the parent path no longer suppress
30+
valid scans (prevents silent zero-file analysis for roots like `build/project`).
31+
- Optimized HTML snippet rendering path:
32+
- `_FileCache` now caches full file lines once per file and serves
33+
line-range slices without repeated full-file scans.
34+
- Pygments imports are cached per importer identity to avoid repeated
35+
dynamic import overhead in hot snippet loops while preserving testability.
36+
- Optimized block explainability AST stats:
37+
- added per-file statement index and range lookup via `bisect`,
38+
replacing repeated full `ast.walk()` scans per range.
39+
- Added scanner regression coverage for roots under excluded parent directories.
40+
- No baseline/cache/report schema contract changes; detector identity semantics
41+
and golden compatibility preserved.
42+
2643
### Architecture
2744

2845
- Refactored CLI orchestration into a stage-based pipeline (`codeclone/pipeline.py`) to isolate discovery, processing,

codeclone/_html_snippets.py

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import importlib
88
from dataclasses import dataclass
99
from functools import lru_cache
10+
from types import ModuleType
1011
from typing import NamedTuple, cast
1112

1213
from .errors import FileProcessingError
@@ -21,33 +22,19 @@ class _Snippet:
2122

2223

2324
class _FileCache:
24-
__slots__ = ("_get_lines_impl", "maxsize")
25+
__slots__ = ("_get_file_lines_impl", "maxsize")
2526

2627
def __init__(self, maxsize: int = 128) -> None:
2728
self.maxsize = maxsize
28-
self._get_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_range)
29+
self._get_file_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_lines)
2930

3031
@staticmethod
31-
def _read_file_range(
32-
filepath: str, start_line: int, end_line: int
33-
) -> tuple[str, ...]:
34-
if start_line < 1:
35-
start_line = 1
36-
if end_line < start_line:
37-
return ()
38-
32+
def _read_file_lines(filepath: str) -> tuple[str, ...]:
3933
try:
4034

4135
def _read_with_errors(errors: str) -> tuple[str, ...]:
42-
lines: list[str] = []
4336
with open(filepath, encoding="utf-8", errors=errors) as f:
44-
for lineno, line in enumerate(f, start=1):
45-
if lineno < start_line:
46-
continue
47-
if lineno > end_line:
48-
break
49-
lines.append(line.rstrip("\n"))
50-
return tuple(lines)
37+
return tuple(line.rstrip("\n") for line in f)
5138

5239
try:
5340
return _read_with_errors("strict")
@@ -59,7 +46,16 @@ def _read_with_errors(errors: str) -> tuple[str, ...]:
5946
def get_lines_range(
6047
self, filepath: str, start_line: int, end_line: int
6148
) -> tuple[str, ...]:
62-
return self._get_lines_impl(filepath, start_line, end_line)
49+
if start_line < 1:
50+
start_line = 1
51+
if end_line < start_line:
52+
return ()
53+
lines = self._get_file_lines_impl(filepath)
54+
start_index = start_line - 1
55+
if start_index >= len(lines):
56+
return ()
57+
end_index = min(len(lines), end_line)
58+
return lines[start_index:end_index]
6359

6460
class _CacheInfo(NamedTuple):
6561
hits: int
@@ -68,17 +64,47 @@ class _CacheInfo(NamedTuple):
6864
currsize: int
6965

7066
def cache_info(self) -> _CacheInfo:
71-
return cast(_FileCache._CacheInfo, self._get_lines_impl.cache_info())
67+
return cast(_FileCache._CacheInfo, self._get_file_lines_impl.cache_info())
7268

7369

74-
def _try_pygments(code: str) -> str | None:
70+
_PYGMENTS_IMPORTER_ID: int | None = None
71+
_PYGMENTS_API: tuple[ModuleType, ModuleType, ModuleType] | None = None
72+
73+
74+
def _load_pygments_api() -> tuple[ModuleType, ModuleType, ModuleType] | None:
75+
"""
76+
Load pygments modules once per import-function identity.
77+
78+
Tests monkeypatch `importlib.import_module`; tracking importer identity keeps
79+
behavior deterministic and allows import-error branches to stay testable.
80+
"""
81+
global _PYGMENTS_IMPORTER_ID
82+
global _PYGMENTS_API
83+
84+
importer_id = id(importlib.import_module)
85+
if importer_id != _PYGMENTS_IMPORTER_ID:
86+
_PYGMENTS_IMPORTER_ID = importer_id
87+
_PYGMENTS_API = None
88+
if _PYGMENTS_API is not None:
89+
return _PYGMENTS_API
90+
7591
try:
7692
pygments = importlib.import_module("pygments")
7793
formatters = importlib.import_module("pygments.formatters")
7894
lexers = importlib.import_module("pygments.lexers")
7995
except ImportError:
8096
return None
8197

98+
_PYGMENTS_API = (pygments, formatters, lexers)
99+
return _PYGMENTS_API
100+
101+
102+
def _try_pygments(code: str) -> str | None:
103+
pygments_api = _load_pygments_api()
104+
if pygments_api is None:
105+
return None
106+
pygments, formatters, lexers = pygments_api
107+
82108
highlight = pygments.highlight
83109
formatter_cls = formatters.HtmlFormatter
84110
lexer_cls = lexers.PythonLexer
@@ -91,10 +117,10 @@ def _pygments_css(style_name: str) -> str:
91117
Returns CSS for pygments tokens. Scoped to `.codebox` to avoid leaking styles.
92118
If Pygments is not available or style missing, returns "".
93119
"""
94-
try:
95-
formatters = importlib.import_module("pygments.formatters")
96-
except ImportError:
120+
pygments_api = _load_pygments_api()
121+
if pygments_api is None:
97122
return ""
123+
_, formatters, _ = pygments_api
98124

99125
try:
100126
formatter_cls = formatters.HtmlFormatter

codeclone/report/explain.py

Lines changed: 93 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from __future__ import annotations
55

66
import ast
7+
from bisect import bisect_left, bisect_right
8+
from dataclasses import dataclass
79
from pathlib import Path
810

911
from .explain_contract import (
@@ -18,6 +20,19 @@
1820
from .types import GroupItemsLike, GroupMapLike
1921

2022

23+
@dataclass(frozen=True, slots=True)
24+
class _StatementRecord:
25+
node: ast.stmt
26+
start_line: int
27+
end_line: int
28+
start_col: int
29+
end_col: int
30+
type_name: str
31+
32+
33+
_StatementIndex = tuple[tuple[_StatementRecord, ...], tuple[int, ...]]
34+
35+
2136
def signature_parts(group_key: str) -> list[str]:
2237
return [part for part in group_key.split("|") if part]
2338

@@ -50,6 +65,53 @@ def parsed_file_tree(
5065
return tree
5166

5267

68+
def _build_statement_index(tree: ast.AST) -> _StatementIndex:
69+
records = tuple(
70+
sorted(
71+
(
72+
_StatementRecord(
73+
node=node,
74+
start_line=int(getattr(node, "lineno", 0)),
75+
end_line=int(getattr(node, "end_lineno", 0)),
76+
start_col=int(getattr(node, "col_offset", 0)),
77+
end_col=int(getattr(node, "end_col_offset", 0)),
78+
type_name=type(node).__name__,
79+
)
80+
for node in ast.walk(tree)
81+
if isinstance(node, ast.stmt)
82+
),
83+
key=lambda record: (
84+
record.start_line,
85+
record.end_line,
86+
record.start_col,
87+
record.end_col,
88+
record.type_name,
89+
),
90+
)
91+
)
92+
start_lines = tuple(record.start_line for record in records)
93+
return records, start_lines
94+
95+
96+
def parsed_statement_index(
97+
filepath: str,
98+
*,
99+
ast_cache: dict[str, ast.AST | None],
100+
stmt_index_cache: dict[str, _StatementIndex | None],
101+
) -> _StatementIndex | None:
102+
if filepath in stmt_index_cache:
103+
return stmt_index_cache[filepath]
104+
105+
tree = parsed_file_tree(filepath, ast_cache=ast_cache)
106+
if tree is None:
107+
stmt_index_cache[filepath] = None
108+
return None
109+
110+
index = _build_statement_index(tree)
111+
stmt_index_cache[filepath] = index
112+
return index
113+
114+
53115
def is_assert_like_stmt(statement: ast.stmt) -> bool:
54116
if isinstance(statement, ast.Assert):
55117
return True
@@ -72,52 +134,50 @@ def assert_range_stats(
72134
start_line: int,
73135
end_line: int,
74136
ast_cache: dict[str, ast.AST | None],
137+
stmt_index_cache: dict[str, _StatementIndex | None],
75138
range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
76139
) -> tuple[int, int, int]:
77140
cache_key = (filepath, start_line, end_line)
78141
if cache_key in range_cache:
79142
return range_cache[cache_key]
80143

81-
tree = parsed_file_tree(filepath, ast_cache=ast_cache)
82-
if tree is None:
144+
statement_index = parsed_statement_index(
145+
filepath,
146+
ast_cache=ast_cache,
147+
stmt_index_cache=stmt_index_cache,
148+
)
149+
if statement_index is None:
83150
range_cache[cache_key] = (0, 0, 0)
84151
return 0, 0, 0
85152

86-
statements = [
87-
node
88-
for node in ast.walk(tree)
89-
if isinstance(node, ast.stmt)
90-
and int(getattr(node, "lineno", 0)) >= start_line
91-
and int(getattr(node, "end_lineno", 0)) <= end_line
92-
]
93-
if not statements:
153+
records, start_lines = statement_index
154+
if not records:
94155
range_cache[cache_key] = (0, 0, 0)
95156
return 0, 0, 0
96157

97-
ordered_statements = sorted(
98-
statements,
99-
key=lambda statement: (
100-
int(getattr(statement, "lineno", 0)),
101-
int(getattr(statement, "end_lineno", 0)),
102-
int(getattr(statement, "col_offset", 0)),
103-
int(getattr(statement, "end_col_offset", 0)),
104-
type(statement).__name__,
105-
),
106-
)
158+
left = bisect_left(start_lines, start_line)
159+
right = bisect_right(start_lines, end_line)
160+
if left >= right:
161+
range_cache[cache_key] = (0, 0, 0)
162+
return 0, 0, 0
107163

108-
total = len(ordered_statements)
109-
assert_like = 0
110-
max_consecutive = 0
111-
current_consecutive = 0
112-
for statement in ordered_statements:
113-
if is_assert_like_stmt(statement):
164+
total, assert_like, max_consecutive, current_consecutive = (0, 0, 0, 0)
165+
for record in records[left:right]:
166+
if record.end_line > end_line:
167+
continue
168+
total += 1
169+
if is_assert_like_stmt(record.node):
114170
assert_like += 1
115171
current_consecutive += 1
116172
if current_consecutive > max_consecutive:
117173
max_consecutive = current_consecutive
118174
else:
119175
current_consecutive = 0
120176

177+
if total == 0:
178+
range_cache[cache_key] = (0, 0, 0)
179+
return 0, 0, 0
180+
121181
stats = (total, assert_like, max_consecutive)
122182
range_cache[cache_key] = stats
123183
return stats
@@ -129,13 +189,15 @@ def is_assert_only_range(
129189
start_line: int,
130190
end_line: int,
131191
ast_cache: dict[str, ast.AST | None],
192+
stmt_index_cache: dict[str, _StatementIndex | None],
132193
range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
133194
) -> bool:
134195
total, assert_like, _ = assert_range_stats(
135196
filepath=filepath,
136197
start_line=start_line,
137198
end_line=end_line,
138199
ast_cache=ast_cache,
200+
stmt_index_cache=stmt_index_cache,
139201
range_cache=range_cache,
140202
)
141203
return total > 0 and total == assert_like
@@ -163,6 +225,7 @@ def enrich_with_assert_facts(
163225
facts: dict[str, str],
164226
items: GroupItemsLike,
165227
ast_cache: dict[str, ast.AST | None],
228+
stmt_index_cache: dict[str, _StatementIndex | None],
166229
range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
167230
) -> None:
168231
assert_only = True
@@ -187,6 +250,7 @@ def enrich_with_assert_facts(
187250
start_line=start_line,
188251
end_line=end_line,
189252
ast_cache=ast_cache,
253+
stmt_index_cache=stmt_index_cache,
190254
range_cache=range_cache,
191255
)
192256
total_statements += range_total
@@ -205,6 +269,7 @@ def enrich_with_assert_facts(
205269
start_line=start_line,
206270
end_line=end_line,
207271
ast_cache=ast_cache,
272+
stmt_index_cache=stmt_index_cache,
208273
range_cache=range_cache,
209274
)
210275
):
@@ -230,6 +295,7 @@ def build_block_group_facts(block_groups: GroupMapLike) -> dict[str, dict[str, s
230295
Renderers (HTML/TXT/JSON) should only display these facts.
231296
"""
232297
ast_cache: dict[str, ast.AST | None] = {}
298+
stmt_index_cache: dict[str, _StatementIndex | None] = {}
233299
range_cache: dict[tuple[str, int, int], tuple[int, int, int]] = {}
234300
facts_by_group: dict[str, dict[str, str]] = {}
235301

@@ -239,6 +305,7 @@ def build_block_group_facts(block_groups: GroupMapLike) -> dict[str, dict[str, s
239305
facts=facts,
240306
items=items,
241307
ast_cache=ast_cache,
308+
stmt_index_cache=stmt_index_cache,
242309
range_cache=range_cache,
243310
)
244311
group_arity = len(items)

codeclone/scanner.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,10 @@ def iter_py_files(
104104

105105
excludes_set = set(excludes)
106106

107-
# Keep legacy behavior: if root path already includes an excluded segment,
108-
# no files are yielded.
109-
if any(part in excludes_set for part in rootp.parts):
107+
# Keep legacy behavior only when the requested root directory itself is excluded
108+
# (e.g. scanning "<repo>/__pycache__"). Parent directories must not suppress
109+
# scanning, otherwise valid roots like ".../build/project" become empty.
110+
if rootp.name in excludes_set:
110111
return
111112

112113
# Collect and filter first, then sort for deterministic output.

0 commit comments

Comments
 (0)