refactor: reduce cognitive complexity in ppr, python_semantics, git

nikolay-e · nikolay-e · commit d337bff84d48 · 2026-01-04T23:33:16.000+01:00
Extract helper functions to reduce SonarCloud cognitive complexity:
- ppr.py: extract PPR iteration and normalization helpers
- python_semantics.py: extract AST collection helpers
- git.py: extract diff parsing helpers
diff --git a/src/treemapper/diffctx/git.py b/src/treemapper/diffctx/git.py
@@ -41,54 +41,55 @@ def get_diff_text(repo_root: Path, diff_range: str) -> str:
     return run_git(repo_root, ["diff", diff_range])
 
 
+def _parse_hunk_header(match: re.Match[str], path: Path) -> DiffHunk:  # turn one hunk-header regex match into a DiffHunk for `path`
+    old_start = int(match.group(1))  # groups 1/2: old-side start and optional length
+    old_len_str = match.group(2)
+    old_len = int(old_len_str) if old_len_str else 1  # length group absent/empty -> 1
+    new_start = int(match.group(3))  # groups 3/4: new-side start and optional length
+    new_len_str = match.group(4)
+    new_len = int(new_len_str) if new_len_str else 1  # length group absent/empty -> 1
+
+    return DiffHunk(
+        path=path,
+        new_start=new_start,
+        new_len=new_len,
+        old_start=old_start,
+        old_len=old_len,
+    )
+
+
+def _parse_path_line(line: str, repo_root: Path) -> tuple[str, Path | None]:  # classify a ---/+++ file-header line
+    if line.startswith("--- a/"):
+        return "old", repo_root / line.removeprefix("--- a/").strip()  # old-side path, joined onto repo_root
+    if line.startswith("--- /dev/null"):
+        return "old", None  # old side is /dev/null: no old path
+    if line.startswith("+++ b/"):
+        return "new", repo_root / line.removeprefix("+++ b/").strip()  # new-side path, joined onto repo_root
+    if line.startswith("+++ /dev/null"):
+        return "new", None  # new side is /dev/null: no new path
+    return "", None  # not a file-header line; parse_diff falls through on ""
+
+
 def parse_diff(repo_root: Path, diff_range: str) -> list[DiffHunk]:
     output = run_git(repo_root, ["diff", "--unified=0", diff_range])
     hunks: list[DiffHunk] = []
     old_path: Path | None = None
     new_path: Path | None = None
 
     for line in output.splitlines():
-        if line.startswith("--- a/"):
-            rel_path = line.removeprefix("--- a/").strip()
-            old_path = repo_root / rel_path
-            continue
-
-        if line.startswith("--- /dev/null"):
-            old_path = None
+        path_type, path = _parse_path_line(line, repo_root)  # "old"/"new" for ---/+++ header lines, "" otherwise
+        if path_type == "old":  # remember the old-side path (None for /dev/null) for the hunks that follow
+            old_path = path
             continue
-
-        if line.startswith("+++ b/"):
-            rel_path = line.removeprefix("+++ b/").strip()
-            new_path = repo_root / rel_path
-            continue
-
-        if line.startswith("+++ /dev/null"):
-            new_path = None
+        if path_type == "new":  # remember the new-side path (None for /dev/null) for the hunks that follow
+            new_path = path
             continue
 
         match = _HUNK_RE.match(line)
         if match:
-            # For deletions, use old_path; for additions/modifications, use new_path
             current_path = new_path if new_path else old_path
-            if not current_path:
-                continue
-
-            old_start = int(match.group(1))
-            old_len_str = match.group(2)
-            old_len = int(old_len_str) if old_len_str else 1
-            new_start = int(match.group(3))
-            new_len_str = match.group(4)
-            new_len = int(new_len_str) if new_len_str else 1
-
-            hunks.append(
-                DiffHunk(
-                    path=current_path,
-                    new_start=new_start,
-                    new_len=new_len,
-                    old_start=old_start,
-                    old_len=old_len,
-                )
-            )
+            if current_path:  # old_path is the fallback when new_path is None; hunks with no path at all are dropped
+                hunks.append(_parse_hunk_header(match, current_path))
 
     return hunks
 
diff --git a/src/treemapper/diffctx/ppr.py b/src/treemapper/diffctx/ppr.py
@@ -4,6 +4,50 @@
 from .types import FragmentId
 
 
+def _initialize_ppr_scores(  # build the seed distribution `p` plus a separate copy used as the starting scores
+    nodes: list[FragmentId], valid_seeds: set[FragmentId]
+) -> tuple[dict[FragmentId, float], dict[FragmentId, float]]:
+    p = {n: (1.0 / len(valid_seeds) if n in valid_seeds else 0.0) for n in nodes}  # uniform over seeds, 0.0 elsewhere
+    return p, dict(p)  # second element is a shallow copy, so rebinding/mutating scores never touches p
+
+
+def _ppr_iteration(  # one update step: returns the next score dict; `scores` and `base` are only read
+    nodes: list[FragmentId],
+    graph: Graph,
+    scores: dict[FragmentId, float],
+    out_sum: dict[FragmentId, float],
+    base: dict[FragmentId, float],
+    p: dict[FragmentId, float],
+    alpha: float,
+) -> dict[FragmentId, float]:
+    new_scores: dict[FragmentId, float] = dict(base)  # start from a copy of `base` so it is not mutated
+    dangling_mass = 0.0  # total score held by nodes that cannot pass it along an edge
+
+    for src in nodes:
+        nbrs = graph.neighbors(src)
+        total = out_sum[src]
+        if total <= 0 or not nbrs:  # no neighbors or non-positive weight sum: treat src as dangling
+            dangling_mass += scores[src]
+            continue
+        contrib = alpha * scores[src]
+        for dst, w in nbrs.items():
+            new_scores[dst] += contrib * (w / total)  # share proportional to edge weight
+
+    if dangling_mass > 0:
+        add = alpha * dangling_mass
+        for n in nodes:
+            new_scores[n] += add * p[n]  # dangling mass is redistributed according to p
+
+    return new_scores
+
+
+def _normalize_scores(scores: dict[FragmentId, float]) -> dict[FragmentId, float]:  # rescale scores to sum to 1 when the sum is positive
+    total = sum(scores.values())
+    if total > 0:
+        return {n: s / total for n, s in scores.items()}  # new dict; the input is not mutated
+    return scores  # non-positive total: the same dict object is returned unchanged
+
+
 def personalized_pagerank(
     graph: Graph,
     seeds: set[FragmentId],
@@ -15,46 +59,19 @@ def personalized_pagerank(
         return {}
 
     nodes = list(graph.nodes)
-
-    # Filter seeds to only include nodes that exist in the graph
     valid_seeds = seeds & graph.nodes
     if not valid_seeds:
         return {n: 1.0 / len(nodes) for n in nodes}
 
-    p = {n: (1.0 / len(valid_seeds) if n in valid_seeds else 0.0) for n in nodes}
-    scores = dict(p)
-
+    p, scores = _initialize_ppr_scores(nodes, valid_seeds)
     out_sum = {n: sum(graph.neighbors(n).values()) for n in nodes}
-
     base = {n: (1.0 - alpha) * p[n] for n in nodes}
 
     for _ in range(max_iter):
-        new_scores: dict[FragmentId, float] = dict(base)
-
-        dangling_mass = 0.0
-        for src in nodes:
-            nbrs = graph.neighbors(src)
-            total = out_sum[src]
-            if total <= 0 or not nbrs:
-                dangling_mass += scores[src]
-                continue
-            contrib = alpha * scores[src]
-            for dst, w in nbrs.items():
-                new_scores[dst] += contrib * (w / total)
-
-        if dangling_mass > 0:
-            add = alpha * dangling_mass
-            for n in nodes:
-                new_scores[n] += add * p[n]
-
+        new_scores = _ppr_iteration(nodes, graph, scores, out_sum, base, p, alpha)
         delta = sum(abs(new_scores[n] - scores[n]) for n in nodes)
         scores = new_scores
         if delta < tol:
             break
 
-    total = sum(scores.values())
-    if total > 0:
-        for n in scores:
-            scores[n] /= total
-
-    return scores
+    return _normalize_scores(scores)
diff --git a/src/treemapper/diffctx/python_semantics.py b/src/treemapper/diffctx/python_semantics.py
@@ -25,31 +25,22 @@ def _names_from_expr(expr: ast.AST | None) -> set[str]:
     return out
 
 
-def analyze_python_fragment(code: str) -> PyFragmentInfo:
-    if not code.strip():
-        return PyFragmentInfo(frozenset(), frozenset(), frozenset(), frozenset())
-
-    dedented = textwrap.dedent(code)
-    try:
-        tree = ast.parse(dedented)
-    except SyntaxError:
-        return PyFragmentInfo(frozenset(), frozenset(), frozenset(), frozenset())
-
+def _collect_defines(tree: ast.Module) -> set[str]:  # names of functions/classes defined directly in tree.body (nested defs are not visited)
     defines: set[str] = set()
-    refs: set[str] = set()
-    calls: set[str] = set()
-    type_refs: set[str] = set()
-
-    # Only collect top-level definitions (not nested functions/classes)
     for stmt in tree.body:
         if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
             defines.add(stmt.name)
+    return defines  # mutable set; analyze_python_fragment freezes it
+
+
+def _collect_refs_and_calls(tree: ast.Module) -> tuple[set[str], set[str]]:
+    refs: set[str] = set()
+    calls: set[str] = set()
 
     for node in ast.walk(tree):
         if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
             refs.add(node.id)
-
-        if isinstance(node, ast.Attribute):
+        elif isinstance(node, ast.Attribute):
             refs.add(node.attr)
 
         if isinstance(node, ast.Call):
@@ -59,17 +50,46 @@ def analyze_python_fragment(code: str) -> PyFragmentInfo:
             elif isinstance(func, ast.Attribute):
                 calls.add(func.attr)
 
+    return refs, calls
+
+
+def _extract_func_type_refs(node: ast.FunctionDef | ast.AsyncFunctionDef) -> set[str]:  # names used in a function's return and parameter annotations
+    type_refs = _names_from_expr(node.returns)  # node.returns may be None; _names_from_expr is typed to accept that
+    for a in node.args.posonlyargs + node.args.args + node.args.kwonlyargs:  # posonlyargs is a separate list in ast.arguments; without it `def f(x: T, /)` is missed
+        type_refs |= _names_from_expr(a.annotation)
+    if node.args.vararg is not None:
+        type_refs |= _names_from_expr(node.args.vararg.annotation)  # *args annotation
+    if node.args.kwarg is not None:
+        type_refs |= _names_from_expr(node.args.kwarg.annotation)  # **kwargs annotation
+    return type_refs
+
+
+def _collect_type_refs(tree: ast.Module) -> set[str]:  # names used in function-signature and annotated-assignment annotations
+    type_refs: set[str] = set()
+    for node in ast.walk(tree):  # walks every node, so nested functions and assignments are included
         if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
-            type_refs |= _names_from_expr(node.returns)
-            for a in node.args.args + node.args.kwonlyargs:
-                type_refs |= _names_from_expr(a.annotation)
-            if node.args.vararg is not None:
-                type_refs |= _names_from_expr(node.args.vararg.annotation)
-            if node.args.kwarg is not None:
-                type_refs |= _names_from_expr(node.args.kwarg.annotation)
-
-        if isinstance(node, ast.AnnAssign):
+            type_refs |= _extract_func_type_refs(node)  # return + parameter annotations
+        elif isinstance(node, ast.AnnAssign):  # elif is safe: a node is never both a function def and an AnnAssign
             type_refs |= _names_from_expr(node.annotation)
+    return type_refs
+
+
+_EMPTY_INFO = PyFragmentInfo(frozenset(), frozenset(), frozenset(), frozenset())  # all-empty result that analyze_python_fragment returns for blank or unparseable code
+
+
+def analyze_python_fragment(code: str) -> PyFragmentInfo:
+    if not code.strip():
+        return _EMPTY_INFO
+
+    dedented = textwrap.dedent(code)
+    try:
+        tree = ast.parse(dedented)
+    except SyntaxError:
+        return _EMPTY_INFO
+
+    defines = _collect_defines(tree)
+    refs, calls = _collect_refs_and_calls(tree)
+    type_refs = _collect_type_refs(tree)
 
     return PyFragmentInfo(
         defines=frozenset(defines),