refactor(diffctx): replace reinvented wheels per audit

nikolay-e · nikolay-e · commit 9dbe9bccbfcb · 2026-04-11T01:33:09.000+02:00
- Python imports: replace regex parser with ast.parse + importlib.util.resolve_name
- Tarjan SCC: replace 48-line hand-rolled algorithm with nx.strongly_connected_components
- Identifier extraction: merge concepts_from_diff_text into extract_identifiers with extra_stopwords and min_length params
diff --git a/src/treemapper/diffctx/edges/semantic/python.py b/src/treemapper/diffctx/edges/semantic/python.py
@@ -1,7 +1,7 @@
 # pylint: disable=duplicate-code
 from __future__ import annotations
 
-import re
+import ast
 from collections import defaultdict
 from pathlib import Path
 
@@ -17,74 +17,62 @@
 _SYMBOL_REF_WEIGHT = _PY_WEIGHTS.symbol_ref
 _TYPE_REF_WEIGHT = _PY_WEIGHTS.type_ref
 
-_PY_IMPORT_RE = re.compile(
-    r"(?:from\s{1,20}(\.{0,3}[\w.]{0,200})\s{1,20}import|import\s{1,20}([\w.]{1,200}(?:\s*,\s*[\w.]{1,200})*))"
-)
-
 
 def _is_python_file(path: Path) -> bool:
     return path.suffix.lower() in _PYTHON_EXTS
 
 
-def _count_leading_dots(s: str) -> int:
-    return len(s) - len(s.lstrip("."))
-
-
-def _resolve_relative_import(imported: str, source_path: Path, repo_root: Path | None = None) -> str | None:
-    if not imported.startswith("."):
-        return imported
-
-    dots = _count_leading_dots(imported)
-    relative_module = imported[dots:]
-
-    if repo_root and source_path.is_absolute():
-        try:
-            source_path = source_path.relative_to(repo_root)
-        except ValueError:
-            pass
-
-    parent_parts = _strip_source_prefix(list(source_path.parent.parts))
-
-    if parent_parts and parent_parts[-1] == "__pycache__":
-        parent_parts = parent_parts[:-1]
-
-    for _ in range(dots - 1):
-        if parent_parts:
-            parent_parts.pop()
-
-    if relative_module:
-        parent_parts.extend(relative_module.split("."))
-
-    return ".".join(parent_parts) if parent_parts else None
-
-
 def _add_import_with_prefixes(imports: set[str], imported: str) -> None:
     imports.add(imported)
     parts = imported.split(".")
     for i in range(1, len(parts) + 1):
         imports.add(".".join(parts[:i]))
 
 
+def _resolve_relative(name: str, source_path: Path, repo_root: Path | None) -> str | None:
+    """Resolve relative import ``name`` against the package containing ``source_path``; None if unresolvable."""
+    import importlib.util
+
+    if repo_root and source_path.is_absolute():
+        try:
+            source_path = source_path.relative_to(repo_root)
+        except ValueError:
+            pass  # outside repo_root: fall back to the path as given
+    pkg_parts = _strip_source_prefix(list(source_path.parent.parts))
+    # Strip __pycache__ only after the path is final, so the repo-relative branch gets it too.
+    if pkg_parts and pkg_parts[-1] == "__pycache__":
+        pkg_parts = pkg_parts[:-1]
+    if not pkg_parts:
+        return None
+    try:
+        return importlib.util.resolve_name(name, ".".join(pkg_parts))
+    except (ImportError, ValueError):
+        return None  # e.g. more leading dots than package levels
+
+
 def _extract_imports_from_content(content: str, source_path: Path | None = None, repo_root: Path | None = None) -> set[str]:
     imports: set[str] = set()
-    for match in _PY_IMPORT_RE.finditer(content):
-        from_module = match.group(1)
-        bare_imports = match.group(2)
-
-        if from_module:
-            imported = from_module
-            if imported.startswith(".") and source_path:
-                resolved = _resolve_relative_import(imported, source_path, repo_root)
-                if resolved:
-                    imported = resolved
-                else:
-                    continue
-            _add_import_with_prefixes(imports, imported)
-        elif bare_imports:
-            for name in bare_imports.split(","):
-                name = name.strip()
-                if name:
-                    _add_import_with_prefixes(imports, name)
+    try:
+        tree = ast.parse(content)
+    except SyntaxError:
+        return imports  # unparseable content yields an empty set
+
+    for node in ast.walk(tree):  # visits every node, so nested (function-level) imports are included
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                if alias.name:
+                    _add_import_with_prefixes(imports, alias.name)
+        elif isinstance(node, ast.ImportFrom):
+            module = node.module or ""  # module is None for `from . import x`
+            if node.level and node.level > 0:  # level = number of leading dots (relative import)
+                dots = "." * node.level
+                relative = dots + module
+                if source_path:  # without a source path a relative import cannot be resolved and is dropped
+                    resolved = _resolve_relative(relative, source_path, repo_root)
+                    if resolved:
+                        _add_import_with_prefixes(imports, resolved)
+            elif module:
+                _add_import_with_prefixes(imports, module)
     return imports
 
 
diff --git a/src/treemapper/diffctx/graph_analytics.py b/src/treemapper/diffctx/graph_analytics.py
@@ -2,10 +2,11 @@
 
 import subprocess
 from collections import defaultdict
-from collections.abc import Iterator
 from dataclasses import dataclass, field
 from pathlib import Path
 
+import networkx as nx
+
 from .project_graph import ProjectGraph, _relative_path
 from .types import Fragment, FragmentId
 
@@ -123,53 +124,12 @@ def to_mermaid(qg: QuotientGraph, top_n: int = 20) -> str:
 
 
 def _tarjan_scc(adjacency: dict[str, set[str]]) -> list[list[str]]:
-    index_counter = [0]
-    stack: list[str] = []
-    on_stack: set[str] = set()
-    index: dict[str, int] = {}
-    lowlink: dict[str, int] = {}
-    result: list[list[str]] = []
-
-    for start in adjacency:
-        if start in index:
-            continue
-
-        index[start] = lowlink[start] = index_counter[0]
-        index_counter[0] += 1
-        stack.append(start)
-        on_stack.add(start)
-
-        work: list[tuple[str, Iterator[str]]] = [(start, iter(adjacency.get(start, set())))]
-
-        while work:
-            v, it = work[-1]
-            try:
-                w = next(it)
-                if w not in index:
-                    index[w] = lowlink[w] = index_counter[0]
-                    index_counter[0] += 1
-                    stack.append(w)
-                    on_stack.add(w)
-                    work.append((w, iter(adjacency.get(w, set()))))
-                elif w in on_stack:
-                    lowlink[v] = min(lowlink[v], index[w])
-            except StopIteration:
-                work.pop()
-                if work:
-                    parent = work[-1][0]
-                    lowlink[parent] = min(lowlink[parent], lowlink[v])
-                if lowlink[v] == index[v]:
-                    component: list[str] = []
-                    while True:
-                        w = stack.pop()
-                        on_stack.discard(w)
-                        component.append(w)
-                        if w == v:
-                            break
-                    if len(component) > 1:
-                        result.append(component)
-
-    return result
+    g = nx.DiGraph()
+    for node, neighbors in adjacency.items():
+        g.add_node(node)  # register keys even when they have no outgoing edges
+        for nbr in neighbors:
+            g.add_edge(node, nbr)
+    return [list(c) for c in nx.strongly_connected_components(g) if len(c) > 1]  # multi-node components only
 
 
 def detect_cycles(
diff --git a/src/treemapper/diffctx/types.py b/src/treemapper/diffctx/types.py
@@ -101,16 +101,21 @@ def extract_identifiers(
     *,
     skip_stopwords: bool = False,
     use_nlp: bool = False,
+    extra_stopwords: frozenset[str] | None = None,
+    min_length: int | None = None,
 ) -> frozenset[str]:
     if use_nlp and profile != "code":
         return _extract_tokens_nlp(text, profile=profile, use_nlp=True)
 
     raw = _IDENT_RE.findall(text)
-    min_len = TokenProfile.get_min_len(profile)
+    min_len = min_length if min_length is not None else TokenProfile.get_min_len(profile)
+    stopwords: frozenset[str] = frozenset()
     if skip_stopwords:
         stopwords = TokenProfile.get_stopwords(profile)
+    if extra_stopwords:
+        stopwords = stopwords | extra_stopwords
+    if stopwords:
         return frozenset({ident.lower() for ident in raw if len(ident) >= min_len and ident.lower() not in stopwords})
-    # Normalize to lowercase to match concepts (also lowercase)
     return frozenset({ident.lower() for ident in raw if len(ident) >= min_len})
 
 
diff --git a/src/treemapper/diffctx/utility.py b/src/treemapper/diffctx/utility.py
@@ -11,14 +11,13 @@
 from .edges.structural.testing import _is_test_file
 from .stopwords import _DOCS_STOPWORDS, CODE_STOPWORDS
 from .tokenizer import extract_tokens
-from .types import Fragment, FragmentId
+from .types import Fragment, FragmentId, extract_identifiers
 
 _EXPANSION_STOPWORDS = CODE_STOPWORDS | _DOCS_STOPWORDS
 
 if TYPE_CHECKING:
     from .graph import Graph
 
-_CONCEPT_RE = re.compile(r"[A-Za-z_]\w*")
 _CALL_RE = re.compile(r"(\w+)\s*\(")
 _TYPE_REF_RE = re.compile(r"(?::|->)\s*([A-Z]\w+)")
 _GENERIC_TYPE_RE = re.compile(r"[\[<,]\s*([A-Z]\w*)")
@@ -265,15 +264,13 @@ def concepts_from_diff_text(
     if use_nlp and profile != "code":
         return extract_tokens(text, profile=profile, use_nlp=True)
 
-    raw = _CONCEPT_RE.findall(text)
-    result: set[str] = set()
-    for ident in raw:
-        if len(ident) < 3:
-            continue
-        low = ident.lower()
-        if low not in _EXPANSION_STOPWORDS and low not in _LANGUAGE_BUILTINS:
-            result.add(low)
-    return frozenset(result)
+    return extract_identifiers(
+        text,
+        profile=profile,
+        skip_stopwords=True,
+        extra_stopwords=_EXPANSION_STOPWORDS | _LANGUAGE_BUILTINS,
+        min_length=3,
+    )
 
 
 _CLOSURE_EDGE_CATEGORIES = frozenset({"structural", "semantic"})