nikolay-e
diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
Lines changed: 4 additions & 2 deletions
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
Lines changed: 1 addition & 1 deletion
diff --git a/src/treemapper/diffctx/__init__.py b/src/treemapper/diffctx/__init__.py
Lines changed: 168 additions & 124 deletions
@@ -77,6 +77,7 @@ jobs:
         id: commit_version
         run: |
           git config user.name "github-actions[bot]"
+          # 41898282 is GitHub's bot user ID for github-actions[bot]
           git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
           git add src/treemapper/version.py
           if ! git diff --staged --quiet; then
@@ -112,7 +113,7 @@ jobs:
           git bundle create repo.bundle --all
 
       - name: Upload bundle as artifact
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         with:
           name: git-repo-bundle
           path: repo.bundle
@@ -234,7 +235,7 @@ jobs:
           fi
 
       - name: Upload artifact
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         with:
           name: ${{ matrix.asset_name }}-binary
           path: ./repo/dist/treemapper-*
@@ -342,6 +343,7 @@ jobs:
         working-directory: ./repo
         run: |
           git config user.name "github-actions[bot]"
+          # 41898282 is GitHub's bot user ID for github-actions[bot]
           git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
 
           # Push the version bump commit to main
 
@@ -135,7 +135,7 @@ jobs:
           verbose: true
 
       - name: Upload coverage for SonarCloud
-        uses: actions/upload-artifact@v6
+        uses: actions/upload-artifact@v7
         if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12'
         with:
           name: coverage-report
 
@@ -23,7 +23,7 @@
 from .ppr import personalized_pagerank
 from .render import build_partial_tree
 from .select import lazy_greedy_select
-from .types import Fragment, FragmentId, extract_identifiers
+from .types import DiffHunk, Fragment, FragmentId, extract_identifiers
 from .utility import concepts_from_diff_text
 
 __all__ = ["GitError", "build_diff_context"]
@@ -64,6 +64,100 @@ def _read_file_content(
 _MAX_FRAGMENTS = 200
 
 
+def _build_preferred_revs(base_rev: str | None, head_rev: str | None) -> list[str]:
+    """Return the usable revisions of a diff range, head first and then base.
+
+    Empty or ``None`` revisions are left out, and ``base_rev`` is also left
+    out when it equals ``head_rev``, so the result holds zero, one, or two
+    distinct revision strings.
+    """
+    revs: list[str] = []
+    if head_rev:
+        revs.append(head_rev)
+    # Skip the base when it would merely repeat the head.
+    if base_rev and base_rev != head_rev:
+        revs.append(base_rev)
+    return revs
+
+
+def _process_files_for_fragments(
+    files: list[Path],
+    root_dir: Path,
+    preferred_revs: list[str],
+    seen_frag_ids: set[FragmentId],
+) -> list[Fragment]:
+    """Fragment every readable file and return the fragments not seen before.
+
+    Args:
+        files: Paths to read and split into fragments.
+        root_dir: Passed through to ``_read_file_content``.
+        preferred_revs: Passed through to ``_read_file_content``.
+        seen_frag_ids: Ids of fragments already collected. This set is
+            mutated: the id of every fragment returned here is added to it,
+            so successive calls sharing one set never yield duplicates.
+
+    Returns:
+        The newly collected fragments, in the order they were produced.
+    """
+    fragments: list[Fragment] = []
+    for file_path in files:
+        content = _read_file_content(file_path, root_dir, preferred_revs)
+        # No content available for this file: contribute nothing for it.
+        if content is None:
+            continue
+        for frag in fragment_file(file_path, content):
+            if frag.id not in seen_frag_ids:
+                fragments.append(frag)
+                seen_frag_ids.add(frag.id)
+    return fragments
+
+
+def _find_core_for_hunk(
+    frags: list[Fragment],
+    h_start: int,
+    h_end: int,
+) -> set[FragmentId]:
+    """Pick the ids of the fragments that represent one hunk.
+
+    Strategies are tried in order and the first one that matches wins:
+
+    1. the fragment with the smallest ``line_count`` among those spanning the
+       whole ``h_start``..``h_end`` range;
+    2. every fragment that overlaps the range;
+    3. whatever ``enclosing_fragment(frags, h_start)`` returns;
+    4. the closest fragment ending before the range and the closest fragment
+       starting after it (either, both, or neither).
+
+    Args:
+        frags: Candidate fragments (the caller passes those of the hunk's file).
+        h_start: First line of the hunk.
+        h_end: Last line of the hunk.
+
+    Returns:
+        The chosen fragment ids; empty when ``frags`` is empty.
+    """
+    core: set[FragmentId] = set()
+
+    # 1. Fragments that fully cover the hunk: keep only the tightest one.
+    covering = [f for f in frags if f.start_line <= h_start and h_end <= f.end_line]
+    if covering:
+        best = min(covering, key=lambda f: f.line_count)
+        core.add(best.id)
+        return core
+
+    # 2. Partial coverage: every fragment touching the hunk becomes core.
+    overlapping = [f for f in frags if f.start_line <= h_end and f.end_line >= h_start]
+    if overlapping:
+        for f in overlapping:
+            core.add(f.id)
+        return core
+
+    # 3. Fallback to the helper's notion of the enclosing fragment.
+    enc = enclosing_fragment(frags, h_start)
+    if enc is not None:
+        core.add(enc.id)
+        return core
+
+    # 4. The hunk sits in a gap between fragments: take the nearest neighbour
+    #    on each side that exists.
+    before = [f for f in frags if f.end_line < h_start]
+    after = [f for f in frags if f.start_line > h_end]
+    if before:
+        core.add(max(before, key=lambda f: f.end_line).id)
+    if after:
+        core.add(min(after, key=lambda f: f.start_line).id)
+
+    return core
+
+
+def _select_full_mode(
+    all_fragments: list[Fragment],
+    changed_files: list[Path],
+) -> list[Fragment]:
+    """Return every fragment that belongs to a changed file.
+
+    The result is a new list ordered by ``(path, start_line)``;
+    ``all_fragments`` itself is not reordered.
+    """
+    # Set membership keeps the per-fragment path test cheap.
+    changed_paths = set(changed_files)
+    selected = [f for f in all_fragments if f.path in changed_paths]
+    selected.sort(key=lambda f: (f.path, f.start_line))
+    return selected
+
+
+def _select_with_ppr(
+    all_fragments: list[Fragment],
+    core_ids: set[FragmentId],
+    concepts: frozenset[str],
+    budget_tokens: int | None,
+    alpha: float,
+    tau: float,
+) -> tuple[list[Fragment], Any]:
+    """Score fragments with personalized PageRank, then run the greedy selector.
+
+    A graph is built from ``all_fragments``, ``personalized_pagerank`` is run
+    on it with ``core_ids`` and ``alpha``, and the resulting scores are handed
+    to ``lazy_greedy_select`` together with ``concepts``, ``tau`` and the
+    token budget.
+
+    Args:
+        all_fragments: Candidate fragments.
+        core_ids: Ids of the fragments tied directly to the diff hunks.
+        concepts: Forwarded to ``lazy_greedy_select``.
+        budget_tokens: Token budget; ``None`` means
+            ``_DEFAULT_BUDGET_TOKENS`` is used instead.
+        alpha: Forwarded to ``personalized_pagerank``.
+        tau: Forwarded to ``lazy_greedy_select``.
+
+    Returns:
+        ``(result.selected, result)`` where ``result`` is the object returned
+        by ``lazy_greedy_select`` (callers also read ``reason`` and
+        ``utility`` from it).
+    """
+    graph = build_graph(all_fragments)
+    rel_scores = personalized_pagerank(graph, core_ids, alpha=alpha)
+    effective_budget = budget_tokens if budget_tokens is not None else _DEFAULT_BUDGET_TOKENS
+
+    result = lazy_greedy_select(
+        fragments=all_fragments,
+        core_ids=core_ids,
+        rel=rel_scores,
+        concepts=concepts,
+        budget_tokens=effective_budget,
+        tau=tau,
+    )
+    return result.selected, result
+
+
 def build_diff_context(
     root_dir: Path,
     diff_range: str,
@@ -75,159 +169,109 @@ def build_diff_context(
     no_default_ignores: bool = False,
     full: bool = False,
 ) -> dict[str, Any]:
-    if not is_git_repo(root_dir):
-        raise GitError(f"'{root_dir}' is not a git repository")
-
-    if not (0.0 < alpha < 1.0):
-        raise ValueError(f"alpha must be in (0, 1), got {alpha}")
-    if tau < 0.0:
-        raise ValueError(f"tau must be >= 0, got {tau}")
-    if budget_tokens is not None and budget_tokens <= 0:
-        raise ValueError(f"budget_tokens must be > 0, got {budget_tokens}")
+    _validate_inputs(root_dir, alpha, tau, budget_tokens)
 
     hunks = parse_diff(root_dir, diff_range)
     if not hunks:
         return _empty_tree(root_dir)
 
     combined_spec = get_ignore_specs(root_dir, ignore_file, no_default_ignores, None)
-
     diff_text = get_diff_text(root_dir, diff_range)
     concepts = concepts_from_diff_text(diff_text)
 
     changed_files = get_changed_files(root_dir, diff_range)
     changed_files = _filter_ignored(changed_files, root_dir, combined_spec)
 
     base_rev, head_rev = split_diff_range(diff_range)
-    preferred_revs: list[str] = []
-    if head_rev:
-        preferred_revs.append(head_rev)
-    if base_rev and base_rev != head_rev:
-        preferred_revs.append(base_rev)
+    preferred_revs = _build_preferred_revs(base_rev, head_rev)
 
-    all_fragments: list[Fragment] = []
     seen_frag_ids: set[FragmentId] = set()
-
-    for file_path in changed_files:
-        content = _read_file_content(file_path, root_dir, preferred_revs)
-        if content is None:
-            continue
-        frags = fragment_file(file_path, content)
-        for frag in frags:
-            if frag.id not in seen_frag_ids:
-                all_fragments.append(frag)
-                seen_frag_ids.add(frag.id)
+    all_fragments = _process_files_for_fragments(changed_files, root_dir, preferred_revs, seen_frag_ids)
 
     expanded_files = _expand_universe_by_rare_identifiers(root_dir, concepts, changed_files, combined_spec)
-    for file_path in expanded_files:
-        content = _read_file_content(file_path, root_dir, preferred_revs)
-        if content is None:
-            continue
-        frags = fragment_file(file_path, content)
-        for frag in frags:
-            if frag.id not in seen_frag_ids:
-                all_fragments.append(frag)
-                seen_frag_ids.add(frag.id)
+    all_fragments.extend(_process_files_for_fragments(expanded_files, root_dir, preferred_revs, seen_frag_ids))
 
     for frag in all_fragments:
-        token_result = count_tokens(frag.content)
-        frag.token_count = token_result.count + _OVERHEAD_PER_FRAGMENT
+        frag.token_count = count_tokens(frag.content).count + _OVERHEAD_PER_FRAGMENT
 
-    frags_by_path: dict[Path, list[Fragment]] = defaultdict(list)
-    for frag in all_fragments:
-        frags_by_path[frag.path].append(frag)
-
-    core_ids: set[FragmentId] = set()
-    for h in hunks:
-        frags = frags_by_path.get(h.path, [])
-        if not frags:
-            continue
-        h_start = h.new_start
-        h_end = h.end_line
-
-        # Find fragments that fully cover the hunk
-        covering = [f for f in frags if f.start_line <= h_start and h_end <= f.end_line]
-
-        if covering:
-            # Select minimal covering fragment (smallest by line count)
-            best = min(covering, key=lambda f: f.line_count)
-            core_ids.add(best.id)
-        else:
-            # Check for fragments that OVERLAP with the hunk (partial coverage)
-            overlapping = [f for f in frags if f.start_line <= h_end and f.end_line >= h_start]
-            if overlapping:
-                # Add all overlapping fragments as core
-                for f in overlapping:
-                    core_ids.add(f.id)
-            elif (enc := enclosing_fragment(frags, h_start)) is not None:
-                # Fallback: use enclosing fragment
-                core_ids.add(enc.id)
-            else:
-                # For hunks in gaps between fragments, find nearest adjacent fragments
-                before = [f for f in frags if f.end_line < h_start]
-                after = [f for f in frags if f.start_line > h_end]
-                if before:
-                    nearest_before = max(before, key=lambda f: f.end_line)
-                    core_ids.add(nearest_before.id)
-                if after:
-                    nearest_after = min(after, key=lambda f: f.start_line)
-                    core_ids.add(nearest_after.id)
+    core_ids = _identify_core_fragments(hunks, all_fragments)
 
     if full:
-        changed_paths = set(changed_files)
-        selected = [f for f in all_fragments if f.path in changed_paths]
-        selected.sort(key=lambda f: (f.path, f.start_line))
-
-        try:
-            used = sum(f.token_count for f in selected)
-            logging.info(
-                "diffctx: full mode selected=%d from changed files used=%d tokens",
-                len(selected),
-                used,
-            )
-        except (TypeError, AttributeError) as e:
-            # nosemgrep: python-logger-credential-disclosure
-            logging.debug("diffctx: failed to compute token count: %s", e)
+        selected = _select_full_mode(all_fragments, changed_files)
+        _log_full_mode(selected)
     else:
-        graph = build_graph(all_fragments)
+        selected, result = _select_with_ppr(all_fragments, core_ids, concepts, budget_tokens, alpha, tau)
+        _log_ppr_mode(selected, core_ids, budget_tokens, result, alpha, tau)
 
-        rel_scores = personalized_pagerank(graph, core_ids, alpha=alpha)
+    if no_content:
+        for frag in selected:
+            frag.content = ""
 
-        effective_budget = budget_tokens if budget_tokens is not None else _DEFAULT_BUDGET_TOKENS
+    return build_partial_tree(root_dir, selected)
 
-        result = lazy_greedy_select(
-            fragments=all_fragments,
-            core_ids=core_ids,
-            rel=rel_scores,
-            concepts=concepts,
-            budget_tokens=effective_budget,
-            tau=tau,
-        )
 
-        selected = result.selected
+def _validate_inputs(root_dir: Path, alpha: float, tau: float, budget_tokens: int | None) -> None:
+    """Reject invalid arguments before any diff work starts.
+
+    Raises:
+        GitError: ``root_dir`` is not a git repository.
+        ValueError: ``alpha`` is not strictly between 0 and 1, ``tau`` is
+            negative, or ``budget_tokens`` is given but not positive.
+    """
+    if not is_git_repo(root_dir):
+        raise GitError(f"'{root_dir}' is not a git repository")
+    if not (0.0 < alpha < 1.0):
+        raise ValueError(f"alpha must be in (0, 1), got {alpha}")
+    if tau < 0.0:
+        raise ValueError(f"tau must be >= 0, got {tau}")
+    # None is allowed here; only an explicit non-positive budget is an error.
+    if budget_tokens is not None and budget_tokens <= 0:
+        raise ValueError(f"budget_tokens must be > 0, got {budget_tokens}")
 
-        try:
-            used = sum(f.token_count for f in selected)
-            budget_str = str(budget_tokens) if budget_tokens is not None else "unlimited"
-            logging.info(
-                "diffctx: selected=%d core=%d used=%d/%s reason=%s utility=%.4f alpha=%.3f tau=%.3f",
-                len(selected),
-                len(core_ids),
-                used,
-                budget_str,
-                result.reason,
-                result.utility,
-                alpha,
-                tau,
-            )
-        except (TypeError, AttributeError) as e:
-            # nosemgrep: python-logger-credential-disclosure
-            logging.debug("diffctx: failed to compute token count: %s", e)
 
-    if no_content:
-        for frag in selected:
-            frag.content = ""
+def _identify_core_fragments(hunks: list[DiffHunk], all_fragments: list[Fragment]) -> set[FragmentId]:
+    """Collect the core fragment ids for all hunks.
+
+    Fragments are grouped by path; each hunk is then matched against the
+    fragments of its own file via ``_find_core_for_hunk`` using the hunk's
+    ``new_start`` and ``end_line``. Hunks whose file has no fragments
+    contribute nothing.
+    """
+    frags_by_path: dict[Path, list[Fragment]] = defaultdict(list)
+    for frag in all_fragments:
+        frags_by_path[frag.path].append(frag)
 
-    return build_partial_tree(root_dir, selected)
+    core_ids: set[FragmentId] = set()
+    for h in hunks:
+        # .get() rather than indexing, so unknown paths do not create
+        # empty entries in the defaultdict.
+        frags = frags_by_path.get(h.path, [])
+        if frags:
+            core_ids.update(_find_core_for_hunk(frags, h.new_start, h.end_line))
+    return core_ids
+
+
+def _log_full_mode(selected: list[Fragment]) -> None:
+    """Log, at INFO level, how many fragments full mode kept and their token total.
+
+    Best effort: a ``TypeError`` or ``AttributeError`` raised while summing or
+    logging is reported at DEBUG level instead of propagating.
+    """
+    try:
+        used = sum(f.token_count for f in selected)
+        logging.info(
+            "diffctx: full mode selected=%d from changed files used=%d tokens",
+            len(selected),
+            used,
+        )
+    except (TypeError, AttributeError) as e:
+        # nosemgrep: python-logger-credential-disclosure
+        logging.debug("diffctx: failed to compute token count: %s", e)
+
+
+def _log_ppr_mode(
+    selected: list[Fragment],
+    core_ids: set[FragmentId],
+    budget_tokens: int | None,
+    result: Any,
+    alpha: float,
+    tau: float,
+) -> None:
+    """Log, at INFO level, a one-line summary of a PageRank-based selection.
+
+    The line reports the number of selected and core fragments, the token
+    total against the requested budget, ``result.reason``,
+    ``result.utility``, ``alpha`` and ``tau``.
+
+    Best effort: a ``TypeError`` or ``AttributeError`` raised while building
+    the summary is reported at DEBUG level instead of propagating.
+    """
+    try:
+        used = sum(f.token_count for f in selected)
+        # NOTE(review): a missing budget is printed as "unlimited", yet
+        # _select_with_ppr substitutes _DEFAULT_BUDGET_TOKENS in that case;
+        # confirm which wording is intended.
+        budget_str = str(budget_tokens) if budget_tokens is not None else "unlimited"
+        logging.info(
+            "diffctx: selected=%d core=%d used=%d/%s reason=%s utility=%.4f alpha=%.3f tau=%.3f",
+            len(selected),
+            len(core_ids),
+            used,
+            budget_str,
+            result.reason,
+            result.utility,
+            alpha,
+            tau,
+        )
+    except (TypeError, AttributeError) as e:
+        # nosemgrep: python-logger-credential-disclosure
+        logging.debug("diffctx: failed to compute token count: %s", e)
 
 
 _MAX_FILE_SIZE = 100_000  # 100KB