fix(perf): harden scanner root filtering and optimize report snippet/explain paths

orenlab · orenlab · commit 4772c73820bd · 2026-03-14T17:54:52.000+05:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # Changelog
 
-## [2.0.0b1] - 2026-03-09
+## [2.0.0b1]
 
 CodeClone 2.0 is a major upgrade that expands the project from a structural clone detector into a broader
 **baseline-aware code-health and CI governance tool** for Python.
@@ -23,6 +23,23 @@ Compatibility remains a first-class concern in this release:
 This is a beta release intended to validate the new architecture, reporting surface, and performance profile before the
 final `2.0.0` release.
 
+### Fixes (feat/2.0.0)
+
+- Fixed scanner root-exclude short-circuit: only an explicitly excluded root
+  directory is skipped; excluded segments in a parent path no longer suppress
+  valid scans (prevents silent zero-file analysis for roots like `build/project`).
+- Optimized HTML snippet rendering path:
+  - `_FileCache` now caches full file lines once per file and serves
+    line-range slices without repeated full-file scans.
+  - Pygments imports are cached per importer identity to avoid repeated
+    dynamic import overhead in hot snippet loops while preserving testability.
+- Optimized block explainability AST stats:
+  - Added a per-file statement index with range lookup via `bisect`,
+    replacing repeated full `ast.walk()` scans per range.
+- Added scanner regression coverage for roots under excluded parent directories.
+- No baseline/cache/report schema contract changes; detector identity semantics
+  and golden compatibility preserved.
+
 ### Architecture
 
 - Refactored CLI orchestration into a stage-based pipeline (`codeclone/pipeline.py`) to isolate discovery, processing,
diff --git a/codeclone/_html_snippets.py b/codeclone/_html_snippets.py
@@ -7,6 +7,7 @@
 import importlib
 from dataclasses import dataclass
 from functools import lru_cache
+from types import ModuleType
 from typing import NamedTuple, cast
 
 from .errors import FileProcessingError
@@ -21,33 +22,19 @@ class _Snippet:
 
 
 class _FileCache:
-    __slots__ = ("_get_lines_impl", "maxsize")
+    __slots__ = ("_get_file_lines_impl", "maxsize")
 
     def __init__(self, maxsize: int = 128) -> None:
         self.maxsize = maxsize
-        self._get_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_range)
+        self._get_file_lines_impl = lru_cache(maxsize=maxsize)(self._read_file_lines)
 
     @staticmethod
-    def _read_file_range(
-        filepath: str, start_line: int, end_line: int
-    ) -> tuple[str, ...]:
-        if start_line < 1:
-            start_line = 1
-        if end_line < start_line:
-            return ()
-
+    def _read_file_lines(filepath: str) -> tuple[str, ...]:
         try:
 
             def _read_with_errors(errors: str) -> tuple[str, ...]:
-                lines: list[str] = []
                 with open(filepath, encoding="utf-8", errors=errors) as f:
-                    for lineno, line in enumerate(f, start=1):
-                        if lineno < start_line:
-                            continue
-                        if lineno > end_line:
-                            break
-                        lines.append(line.rstrip("\n"))
-                return tuple(lines)
+                    return tuple(line.rstrip("\n") for line in f)
 
             try:
                 return _read_with_errors("strict")
@@ -59,7 +46,16 @@ def _read_with_errors(errors: str) -> tuple[str, ...]:
     def get_lines_range(
         self, filepath: str, start_line: int, end_line: int
     ) -> tuple[str, ...]:
-        return self._get_lines_impl(filepath, start_line, end_line)
+        if start_line < 1:
+            start_line = 1
+        if end_line < start_line:
+            return ()
+        lines = self._get_file_lines_impl(filepath)
+        start_index = start_line - 1
+        if start_index >= len(lines):
+            return ()
+        end_index = min(len(lines), end_line)
+        return lines[start_index:end_index]
 
     class _CacheInfo(NamedTuple):
         hits: int
@@ -68,17 +64,47 @@ class _CacheInfo(NamedTuple):
         currsize: int
 
     def cache_info(self) -> _CacheInfo:
-        return cast(_FileCache._CacheInfo, self._get_lines_impl.cache_info())
+        return cast(_FileCache._CacheInfo, self._get_file_lines_impl.cache_info())
 
 
-def _try_pygments(code: str) -> str | None:
+_PYGMENTS_IMPORTER_ID: int | None = None
+_PYGMENTS_API: tuple[ModuleType, ModuleType, ModuleType] | None = None
+
+
+def _load_pygments_api() -> tuple[ModuleType, ModuleType, ModuleType] | None:
+    """
+    Load pygments modules once per import-function identity.
+
+    Tests monkeypatch `importlib.import_module`; tracking importer identity keeps
+    behavior deterministic and allows import-error branches to stay testable.
+    """
+    global _PYGMENTS_IMPORTER_ID
+    global _PYGMENTS_API
+
+    importer_id = id(importlib.import_module)
+    if importer_id != _PYGMENTS_IMPORTER_ID:
+        _PYGMENTS_IMPORTER_ID = importer_id
+        _PYGMENTS_API = None
+    if _PYGMENTS_API is not None:
+        return _PYGMENTS_API
+
     try:
         pygments = importlib.import_module("pygments")
         formatters = importlib.import_module("pygments.formatters")
         lexers = importlib.import_module("pygments.lexers")
     except ImportError:
         return None
 
+    _PYGMENTS_API = (pygments, formatters, lexers)
+    return _PYGMENTS_API
+
+
+def _try_pygments(code: str) -> str | None:
+    pygments_api = _load_pygments_api()
+    if pygments_api is None:
+        return None
+    pygments, formatters, lexers = pygments_api
+
     highlight = pygments.highlight
     formatter_cls = formatters.HtmlFormatter
     lexer_cls = lexers.PythonLexer
@@ -91,10 +117,10 @@ def _pygments_css(style_name: str) -> str:
     Returns CSS for pygments tokens. Scoped to `.codebox` to avoid leaking styles.
     If Pygments is not available or style missing, returns "".
     """
-    try:
-        formatters = importlib.import_module("pygments.formatters")
-    except ImportError:
+    pygments_api = _load_pygments_api()
+    if pygments_api is None:
         return ""
+    _, formatters, _ = pygments_api
 
     try:
         formatter_cls = formatters.HtmlFormatter
diff --git a/codeclone/report/explain.py b/codeclone/report/explain.py
@@ -4,6 +4,8 @@
 from __future__ import annotations
 
 import ast
+from bisect import bisect_left, bisect_right
+from dataclasses import dataclass
 from pathlib import Path
 
 from .explain_contract import (
@@ -18,6 +20,19 @@
 from .types import GroupItemsLike, GroupMapLike
 
 
+@dataclass(frozen=True, slots=True)
+class _StatementRecord:
+    node: ast.stmt
+    start_line: int
+    end_line: int
+    start_col: int
+    end_col: int
+    type_name: str
+
+
+_StatementIndex = tuple[tuple[_StatementRecord, ...], tuple[int, ...]]
+
+
 def signature_parts(group_key: str) -> list[str]:
     return [part for part in group_key.split("|") if part]
 
@@ -50,6 +65,53 @@ def parsed_file_tree(
     return tree
 
 
+def _build_statement_index(tree: ast.AST) -> _StatementIndex:
+    records = tuple(
+        sorted(
+            (
+                _StatementRecord(
+                    node=node,
+                    start_line=int(getattr(node, "lineno", 0)),
+                    end_line=int(getattr(node, "end_lineno", 0)),
+                    start_col=int(getattr(node, "col_offset", 0)),
+                    end_col=int(getattr(node, "end_col_offset", 0)),
+                    type_name=type(node).__name__,
+                )
+                for node in ast.walk(tree)
+                if isinstance(node, ast.stmt)
+            ),
+            key=lambda record: (
+                record.start_line,
+                record.end_line,
+                record.start_col,
+                record.end_col,
+                record.type_name,
+            ),
+        )
+    )
+    start_lines = tuple(record.start_line for record in records)
+    return records, start_lines
+
+
+def parsed_statement_index(
+    filepath: str,
+    *,
+    ast_cache: dict[str, ast.AST | None],
+    stmt_index_cache: dict[str, _StatementIndex | None],
+) -> _StatementIndex | None:
+    if filepath in stmt_index_cache:
+        return stmt_index_cache[filepath]
+
+    tree = parsed_file_tree(filepath, ast_cache=ast_cache)
+    if tree is None:
+        stmt_index_cache[filepath] = None
+        return None
+
+    index = _build_statement_index(tree)
+    stmt_index_cache[filepath] = index
+    return index
+
+
 def is_assert_like_stmt(statement: ast.stmt) -> bool:
     if isinstance(statement, ast.Assert):
         return True
@@ -72,52 +134,50 @@ def assert_range_stats(
     start_line: int,
     end_line: int,
     ast_cache: dict[str, ast.AST | None],
+    stmt_index_cache: dict[str, _StatementIndex | None],
     range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
 ) -> tuple[int, int, int]:
     cache_key = (filepath, start_line, end_line)
     if cache_key in range_cache:
         return range_cache[cache_key]
 
-    tree = parsed_file_tree(filepath, ast_cache=ast_cache)
-    if tree is None:
+    statement_index = parsed_statement_index(
+        filepath,
+        ast_cache=ast_cache,
+        stmt_index_cache=stmt_index_cache,
+    )
+    if statement_index is None:
         range_cache[cache_key] = (0, 0, 0)
         return 0, 0, 0
 
-    statements = [
-        node
-        for node in ast.walk(tree)
-        if isinstance(node, ast.stmt)
-        and int(getattr(node, "lineno", 0)) >= start_line
-        and int(getattr(node, "end_lineno", 0)) <= end_line
-    ]
-    if not statements:
+    records, start_lines = statement_index
+    if not records:
         range_cache[cache_key] = (0, 0, 0)
         return 0, 0, 0
 
-    ordered_statements = sorted(
-        statements,
-        key=lambda statement: (
-            int(getattr(statement, "lineno", 0)),
-            int(getattr(statement, "end_lineno", 0)),
-            int(getattr(statement, "col_offset", 0)),
-            int(getattr(statement, "end_col_offset", 0)),
-            type(statement).__name__,
-        ),
-    )
+    left = bisect_left(start_lines, start_line)
+    right = bisect_right(start_lines, end_line)
+    if left >= right:
+        range_cache[cache_key] = (0, 0, 0)
+        return 0, 0, 0
 
-    total = len(ordered_statements)
-    assert_like = 0
-    max_consecutive = 0
-    current_consecutive = 0
-    for statement in ordered_statements:
-        if is_assert_like_stmt(statement):
+    total, assert_like, max_consecutive, current_consecutive = (0, 0, 0, 0)
+    for record in records[left:right]:
+        if record.end_line > end_line:
+            continue
+        total += 1
+        if is_assert_like_stmt(record.node):
             assert_like += 1
             current_consecutive += 1
             if current_consecutive > max_consecutive:
                 max_consecutive = current_consecutive
         else:
             current_consecutive = 0
 
+    if total == 0:
+        range_cache[cache_key] = (0, 0, 0)
+        return 0, 0, 0
+
     stats = (total, assert_like, max_consecutive)
     range_cache[cache_key] = stats
     return stats
@@ -129,13 +189,15 @@ def is_assert_only_range(
     start_line: int,
     end_line: int,
     ast_cache: dict[str, ast.AST | None],
+    stmt_index_cache: dict[str, _StatementIndex | None],
     range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
 ) -> bool:
     total, assert_like, _ = assert_range_stats(
         filepath=filepath,
         start_line=start_line,
         end_line=end_line,
         ast_cache=ast_cache,
+        stmt_index_cache=stmt_index_cache,
         range_cache=range_cache,
     )
     return total > 0 and total == assert_like
@@ -163,6 +225,7 @@ def enrich_with_assert_facts(
     facts: dict[str, str],
     items: GroupItemsLike,
     ast_cache: dict[str, ast.AST | None],
+    stmt_index_cache: dict[str, _StatementIndex | None],
     range_cache: dict[tuple[str, int, int], tuple[int, int, int]],
 ) -> None:
     assert_only = True
@@ -187,6 +250,7 @@ def enrich_with_assert_facts(
                 start_line=start_line,
                 end_line=end_line,
                 ast_cache=ast_cache,
+                stmt_index_cache=stmt_index_cache,
                 range_cache=range_cache,
             )
             total_statements += range_total
@@ -205,6 +269,7 @@ def enrich_with_assert_facts(
                 start_line=start_line,
                 end_line=end_line,
                 ast_cache=ast_cache,
+                stmt_index_cache=stmt_index_cache,
                 range_cache=range_cache,
             )
         ):
@@ -230,6 +295,7 @@ def build_block_group_facts(block_groups: GroupMapLike) -> dict[str, dict[str, s
     Renderers (HTML/TXT/JSON) should only display these facts.
     """
     ast_cache: dict[str, ast.AST | None] = {}
+    stmt_index_cache: dict[str, _StatementIndex | None] = {}
     range_cache: dict[tuple[str, int, int], tuple[int, int, int]] = {}
     facts_by_group: dict[str, dict[str, str]] = {}
 
@@ -239,6 +305,7 @@ def build_block_group_facts(block_groups: GroupMapLike) -> dict[str, dict[str, s
             facts=facts,
             items=items,
             ast_cache=ast_cache,
+            stmt_index_cache=stmt_index_cache,
             range_cache=range_cache,
         )
         group_arity = len(items)
diff --git a/codeclone/scanner.py b/codeclone/scanner.py
@@ -104,9 +104,10 @@ def iter_py_files(
 
     excludes_set = set(excludes)
 
-    # Keep legacy behavior: if root path already includes an excluded segment,
-    # no files are yielded.
-    if any(part in excludes_set for part in rootp.parts):
+    # Keep legacy behavior only when the requested root directory itself is excluded
+    # (e.g. scanning "<repo>/__pycache__"). Parent directories must not suppress
+    # scanning, otherwise valid roots like ".../build/project" become empty.
+    if rootp.name in excludes_set:
         return
 
     # Collect and filter first, then sort for deterministic output.
diff --git a/tests/test_scanner_extra.py b/tests/test_scanner_extra.py