Skip to content

Commit 354d7e1

Browse files
committed
refactor: reduce cognitive complexity and fix code review issues
- Refactor select.py: extract helpers to reduce complexity (66→~15) - Refactor __init__.py: extract helpers for fragments/core selection - Refactor graph.py: extract _add_ref_edges, _extract_config_keys, test detectors - Add Scala and .kts support to language weights and test detection - Fix artifact action versions (all v7), add bot email comment - Fix unused parameters, regex optimizations, redundant exceptions
1 parent 24a32db commit 354d7e1

12 files changed

Lines changed: 546 additions & 415 deletions

File tree

.github/workflows/cd.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ jobs:
7777
id: commit_version
7878
run: |
7979
git config user.name "github-actions[bot]"
80+
# 41898282 is GitHub's bot user ID for github-actions[bot]
8081
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
8182
git add src/treemapper/version.py
8283
if ! git diff --staged --quiet; then
@@ -112,7 +113,7 @@ jobs:
112113
git bundle create repo.bundle --all
113114
114115
- name: Upload bundle as artifact
115-
uses: actions/upload-artifact@v6
116+
uses: actions/upload-artifact@v7
116117
with:
117118
name: git-repo-bundle
118119
path: repo.bundle
@@ -234,7 +235,7 @@ jobs:
234235
fi
235236
236237
- name: Upload artifact
237-
uses: actions/upload-artifact@v6
238+
uses: actions/upload-artifact@v7
238239
with:
239240
name: ${{ matrix.asset_name }}-binary
240241
path: ./repo/dist/treemapper-*
@@ -342,6 +343,7 @@ jobs:
342343
working-directory: ./repo
343344
run: |
344345
git config user.name "github-actions[bot]"
346+
# 41898282 is GitHub's bot user ID for github-actions[bot]
345347
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
346348
347349
# Push the version bump commit to main

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ jobs:
135135
verbose: true
136136

137137
- name: Upload coverage for SonarCloud
138-
uses: actions/upload-artifact@v6
138+
uses: actions/upload-artifact@v7
139139
if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12'
140140
with:
141141
name: coverage-report

src/treemapper/diffctx/__init__.py

Lines changed: 168 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from .ppr import personalized_pagerank
2424
from .render import build_partial_tree
2525
from .select import lazy_greedy_select
26-
from .types import Fragment, FragmentId, extract_identifiers
26+
from .types import DiffHunk, Fragment, FragmentId, extract_identifiers
2727
from .utility import concepts_from_diff_text
2828

2929
__all__ = ["GitError", "build_diff_context"]
@@ -64,6 +64,100 @@ def _read_file_content(
6464
_MAX_FRAGMENTS = 200
6565

6666

67+
def _build_preferred_revs(base_rev: str | None, head_rev: str | None) -> list[str]:
68+
revs: list[str] = []
69+
if head_rev:
70+
revs.append(head_rev)
71+
if base_rev and base_rev != head_rev:
72+
revs.append(base_rev)
73+
return revs
74+
75+
76+
def _process_files_for_fragments(
77+
files: list[Path],
78+
root_dir: Path,
79+
preferred_revs: list[str],
80+
seen_frag_ids: set[FragmentId],
81+
) -> list[Fragment]:
82+
fragments: list[Fragment] = []
83+
for file_path in files:
84+
content = _read_file_content(file_path, root_dir, preferred_revs)
85+
if content is None:
86+
continue
87+
for frag in fragment_file(file_path, content):
88+
if frag.id not in seen_frag_ids:
89+
fragments.append(frag)
90+
seen_frag_ids.add(frag.id)
91+
return fragments
92+
93+
94+
def _find_core_for_hunk(
95+
frags: list[Fragment],
96+
h_start: int,
97+
h_end: int,
98+
) -> set[FragmentId]:
99+
core: set[FragmentId] = set()
100+
101+
covering = [f for f in frags if f.start_line <= h_start and h_end <= f.end_line]
102+
if covering:
103+
best = min(covering, key=lambda f: f.line_count)
104+
core.add(best.id)
105+
return core
106+
107+
overlapping = [f for f in frags if f.start_line <= h_end and f.end_line >= h_start]
108+
if overlapping:
109+
for f in overlapping:
110+
core.add(f.id)
111+
return core
112+
113+
enc = enclosing_fragment(frags, h_start)
114+
if enc is not None:
115+
core.add(enc.id)
116+
return core
117+
118+
before = [f for f in frags if f.end_line < h_start]
119+
after = [f for f in frags if f.start_line > h_end]
120+
if before:
121+
core.add(max(before, key=lambda f: f.end_line).id)
122+
if after:
123+
core.add(min(after, key=lambda f: f.start_line).id)
124+
125+
return core
126+
127+
128+
def _select_full_mode(
129+
all_fragments: list[Fragment],
130+
changed_files: list[Path],
131+
) -> list[Fragment]:
132+
changed_paths = set(changed_files)
133+
selected = [f for f in all_fragments if f.path in changed_paths]
134+
selected.sort(key=lambda f: (f.path, f.start_line))
135+
return selected
136+
137+
138+
def _select_with_ppr(
139+
all_fragments: list[Fragment],
140+
core_ids: set[FragmentId],
141+
concepts: frozenset[str],
142+
budget_tokens: int | None,
143+
alpha: float,
144+
tau: float,
145+
) -> tuple[list[Fragment], Any]:
146+
graph = build_graph(all_fragments)
147+
rel_scores = personalized_pagerank(graph, core_ids, alpha=alpha)
148+
effective_budget = budget_tokens if budget_tokens is not None else _DEFAULT_BUDGET_TOKENS
149+
150+
result = lazy_greedy_select(
151+
fragments=all_fragments,
152+
core_ids=core_ids,
153+
rel=rel_scores,
154+
concepts=concepts,
155+
budget_tokens=effective_budget,
156+
tau=tau,
157+
)
158+
return result.selected, result
159+
160+
67161
def build_diff_context(
68162
root_dir: Path,
69163
diff_range: str,
@@ -75,159 +169,109 @@ def build_diff_context(
75169
no_default_ignores: bool = False,
76170
full: bool = False,
77171
) -> dict[str, Any]:
78-
if not is_git_repo(root_dir):
79-
raise GitError(f"'{root_dir}' is not a git repository")
80-
81-
if not (0.0 < alpha < 1.0):
82-
raise ValueError(f"alpha must be in (0, 1), got {alpha}")
83-
if tau < 0.0:
84-
raise ValueError(f"tau must be >= 0, got {tau}")
85-
if budget_tokens is not None and budget_tokens <= 0:
86-
raise ValueError(f"budget_tokens must be > 0, got {budget_tokens}")
172+
_validate_inputs(root_dir, alpha, tau, budget_tokens)
87173

88174
hunks = parse_diff(root_dir, diff_range)
89175
if not hunks:
90176
return _empty_tree(root_dir)
91177

92178
combined_spec = get_ignore_specs(root_dir, ignore_file, no_default_ignores, None)
93-
94179
diff_text = get_diff_text(root_dir, diff_range)
95180
concepts = concepts_from_diff_text(diff_text)
96181

97182
changed_files = get_changed_files(root_dir, diff_range)
98183
changed_files = _filter_ignored(changed_files, root_dir, combined_spec)
99184

100185
base_rev, head_rev = split_diff_range(diff_range)
101-
preferred_revs: list[str] = []
102-
if head_rev:
103-
preferred_revs.append(head_rev)
104-
if base_rev and base_rev != head_rev:
105-
preferred_revs.append(base_rev)
186+
preferred_revs = _build_preferred_revs(base_rev, head_rev)
106187

107-
all_fragments: list[Fragment] = []
108188
seen_frag_ids: set[FragmentId] = set()
109-
110-
for file_path in changed_files:
111-
content = _read_file_content(file_path, root_dir, preferred_revs)
112-
if content is None:
113-
continue
114-
frags = fragment_file(file_path, content)
115-
for frag in frags:
116-
if frag.id not in seen_frag_ids:
117-
all_fragments.append(frag)
118-
seen_frag_ids.add(frag.id)
189+
all_fragments = _process_files_for_fragments(changed_files, root_dir, preferred_revs, seen_frag_ids)
119190

120191
expanded_files = _expand_universe_by_rare_identifiers(root_dir, concepts, changed_files, combined_spec)
121-
for file_path in expanded_files:
122-
content = _read_file_content(file_path, root_dir, preferred_revs)
123-
if content is None:
124-
continue
125-
frags = fragment_file(file_path, content)
126-
for frag in frags:
127-
if frag.id not in seen_frag_ids:
128-
all_fragments.append(frag)
129-
seen_frag_ids.add(frag.id)
192+
all_fragments.extend(_process_files_for_fragments(expanded_files, root_dir, preferred_revs, seen_frag_ids))
130193

131194
for frag in all_fragments:
132-
token_result = count_tokens(frag.content)
133-
frag.token_count = token_result.count + _OVERHEAD_PER_FRAGMENT
195+
frag.token_count = count_tokens(frag.content).count + _OVERHEAD_PER_FRAGMENT
134196

135-
frags_by_path: dict[Path, list[Fragment]] = defaultdict(list)
136-
for frag in all_fragments:
137-
frags_by_path[frag.path].append(frag)
138-
139-
core_ids: set[FragmentId] = set()
140-
for h in hunks:
141-
frags = frags_by_path.get(h.path, [])
142-
if not frags:
143-
continue
144-
h_start = h.new_start
145-
h_end = h.end_line
146-
147-
# Find fragments that fully cover the hunk
148-
covering = [f for f in frags if f.start_line <= h_start and h_end <= f.end_line]
149-
150-
if covering:
151-
# Select minimal covering fragment (smallest by line count)
152-
best = min(covering, key=lambda f: f.line_count)
153-
core_ids.add(best.id)
154-
else:
155-
# Check for fragments that OVERLAP with the hunk (partial coverage)
156-
overlapping = [f for f in frags if f.start_line <= h_end and f.end_line >= h_start]
157-
if overlapping:
158-
# Add all overlapping fragments as core
159-
for f in overlapping:
160-
core_ids.add(f.id)
161-
elif (enc := enclosing_fragment(frags, h_start)) is not None:
162-
# Fallback: use enclosing fragment
163-
core_ids.add(enc.id)
164-
else:
165-
# For hunks in gaps between fragments, find nearest adjacent fragments
166-
before = [f for f in frags if f.end_line < h_start]
167-
after = [f for f in frags if f.start_line > h_end]
168-
if before:
169-
nearest_before = max(before, key=lambda f: f.end_line)
170-
core_ids.add(nearest_before.id)
171-
if after:
172-
nearest_after = min(after, key=lambda f: f.start_line)
173-
core_ids.add(nearest_after.id)
197+
core_ids = _identify_core_fragments(hunks, all_fragments)
174198

175199
if full:
176-
changed_paths = set(changed_files)
177-
selected = [f for f in all_fragments if f.path in changed_paths]
178-
selected.sort(key=lambda f: (f.path, f.start_line))
179-
180-
try:
181-
used = sum(f.token_count for f in selected)
182-
logging.info(
183-
"diffctx: full mode selected=%d from changed files used=%d tokens",
184-
len(selected),
185-
used,
186-
)
187-
except (TypeError, AttributeError) as e:
188-
# nosemgrep: python-logger-credential-disclosure
189-
logging.debug("diffctx: failed to compute token count: %s", e)
200+
selected = _select_full_mode(all_fragments, changed_files)
201+
_log_full_mode(selected)
190202
else:
191-
graph = build_graph(all_fragments)
203+
selected, result = _select_with_ppr(all_fragments, core_ids, concepts, budget_tokens, alpha, tau)
204+
_log_ppr_mode(selected, core_ids, budget_tokens, result, alpha, tau)
192205

193-
rel_scores = personalized_pagerank(graph, core_ids, alpha=alpha)
206+
if no_content:
207+
for frag in selected:
208+
frag.content = ""
194209

195-
effective_budget = budget_tokens if budget_tokens is not None else _DEFAULT_BUDGET_TOKENS
210+
return build_partial_tree(root_dir, selected)
196211

197-
result = lazy_greedy_select(
198-
fragments=all_fragments,
199-
core_ids=core_ids,
200-
rel=rel_scores,
201-
concepts=concepts,
202-
budget_tokens=effective_budget,
203-
tau=tau,
204-
)
205212

206-
selected = result.selected
213+
def _validate_inputs(root_dir: Path, alpha: float, tau: float, budget_tokens: int | None) -> None:
214+
if not is_git_repo(root_dir):
215+
raise GitError(f"'{root_dir}' is not a git repository")
216+
if not (0.0 < alpha < 1.0):
217+
raise ValueError(f"alpha must be in (0, 1), got {alpha}")
218+
if tau < 0.0:
219+
raise ValueError(f"tau must be >= 0, got {tau}")
220+
if budget_tokens is not None and budget_tokens <= 0:
221+
raise ValueError(f"budget_tokens must be > 0, got {budget_tokens}")
207222

208-
try:
209-
used = sum(f.token_count for f in selected)
210-
budget_str = str(budget_tokens) if budget_tokens is not None else "unlimited"
211-
logging.info(
212-
"diffctx: selected=%d core=%d used=%d/%s reason=%s utility=%.4f alpha=%.3f tau=%.3f",
213-
len(selected),
214-
len(core_ids),
215-
used,
216-
budget_str,
217-
result.reason,
218-
result.utility,
219-
alpha,
220-
tau,
221-
)
222-
except (TypeError, AttributeError) as e:
223-
# nosemgrep: python-logger-credential-disclosure
224-
logging.debug("diffctx: failed to compute token count: %s", e)
225223

226-
if no_content:
227-
for frag in selected:
228-
frag.content = ""
224+
def _identify_core_fragments(hunks: list[DiffHunk], all_fragments: list[Fragment]) -> set[FragmentId]:
225+
frags_by_path: dict[Path, list[Fragment]] = defaultdict(list)
226+
for frag in all_fragments:
227+
frags_by_path[frag.path].append(frag)
229228

230-
return build_partial_tree(root_dir, selected)
229+
core_ids: set[FragmentId] = set()
230+
for h in hunks:
231+
frags = frags_by_path.get(h.path, [])
232+
if frags:
233+
core_ids.update(_find_core_for_hunk(frags, h.new_start, h.end_line))
234+
return core_ids
235+
236+
237+
def _log_full_mode(selected: list[Fragment]) -> None:
238+
try:
239+
used = sum(f.token_count for f in selected)
240+
logging.info(
241+
"diffctx: full mode selected=%d from changed files used=%d tokens",
242+
len(selected),
243+
used,
244+
)
245+
except (TypeError, AttributeError) as e:
246+
# nosemgrep: python-logger-credential-disclosure
247+
logging.debug("diffctx: failed to compute token count: %s", e)
248+
249+
250+
def _log_ppr_mode(
251+
selected: list[Fragment],
252+
core_ids: set[FragmentId],
253+
budget_tokens: int | None,
254+
result: Any,
255+
alpha: float,
256+
tau: float,
257+
) -> None:
258+
try:
259+
used = sum(f.token_count for f in selected)
260+
budget_str = str(budget_tokens) if budget_tokens is not None else "unlimited"
261+
logging.info(
262+
"diffctx: selected=%d core=%d used=%d/%s reason=%s utility=%.4f alpha=%.3f tau=%.3f",
263+
len(selected),
264+
len(core_ids),
265+
used,
266+
budget_str,
267+
result.reason,
268+
result.utility,
269+
alpha,
270+
tau,
271+
)
272+
except (TypeError, AttributeError) as e:
273+
# nosemgrep: python-logger-credential-disclosure
274+
logging.debug("diffctx: failed to compute token count: %s", e)
231275

232276

233277
_MAX_FILE_SIZE = 100_000 # 100KB

0 commit comments

Comments
 (0)