Skip to content

Commit 35bc31e

Browse files
committed
fix: filter hub noise and config-generic false positives from diffctx
1 parent 41aef2a commit 35bc31e

44 files changed

Lines changed: 8435 additions & 4 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
__pycache__/
33
*.py[cod]
44
*$py.class
5+
test-repos
56

67
# C extensions
78
*.so

src/treemapper/diffctx/__init__.py

Lines changed: 119 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -179,6 +179,119 @@ def _apply_same_file_floor(
179179
rel[frag.id] = _SAME_FILE_FLOOR
180180

181181

182+
# A changed file must collect strictly more than this many distinct
# reverse-dependent files before those files are reported as hub noise.
_HUB_REVERSE_THRESHOLD = 5
183+
184+
185+
def _find_hub_noise_paths(
    graph: Graph,
    changed_paths: set[Path],
) -> set[Path]:
    """Collect unchanged files that cluster around a heavily referenced changed file.

    Only edges categorised as ``"semantic"`` that connect exactly one changed
    file with one unchanged file are inspected. For such an edge, the unchanged
    file is counted against the changed file when the adjacency weight going
    from the unchanged fragment to the changed fragment is strictly greater
    than the weight in the opposite direction (a missing weight counts as
    ``0.0``). Once a changed file has more than ``_HUB_REVERSE_THRESHOLD``
    distinct files counted against it, all of those files are reported.

    Args:
        graph: Object exposing ``edge_categories`` (``(src, dst)`` -> category)
            and ``adjacency`` (fragment -> fragment -> weight).
        changed_paths: Paths of the files touched by the diff.

    Returns:
        The set of unchanged file paths considered hub noise; empty when no
        changed file exceeds the threshold.
    """
    dependents_by_changed: dict[Path, set[Path]] = defaultdict(set)

    for (src, dst), category in graph.edge_categories.items():
        if category != "semantic":
            continue

        src_is_changed = src.path in changed_paths
        dst_is_changed = dst.path in changed_paths
        # Skip edges that lie entirely inside or entirely outside the change set.
        if src_is_changed == dst_is_changed:
            continue

        changed, other = (src, dst) if src_is_changed else (dst, src)

        outgoing = graph.adjacency.get(changed, {}).get(other, 0.0)
        incoming = graph.adjacency.get(other, {}).get(changed, 0.0)

        # NOTE(review): a heavier other -> changed weight presumably means the
        # other file is the one depending on the changed file -- confirm
        # against the edge builders' weighting scheme.
        if incoming > outgoing:
            dependents_by_changed[changed.path].add(other.path)

    return {
        path
        for dependents in dependents_by_changed.values()
        if len(dependents) > _HUB_REVERSE_THRESHOLD
        for path in dependents
    }
212+
213+
214+
def _find_config_generic_code_files(
    graph: Graph,
    changed_paths: set[Path],
) -> set[Path]:
    """Find code files tied to the change only through generic config edges.

    Every edge that joins exactly one changed file with one unchanged file is
    examined. The unchanged file is recorded as "generic" when the edge
    category is ``"config_generic"`` and as "real" when the category is
    ``"semantic"`` or ``"config"``; edges of any other category are ignored.

    Args:
        graph: Object exposing ``edge_categories`` (``(src, dst)`` -> category).
        changed_paths: Paths of the files touched by the diff.

    Returns:
        Paths that have at least one generic edge, no real edge, and whose
        lower-cased suffix is listed in ``CODE_EXTENSIONS``.
    """
    generic_linked: set[Path] = set()
    really_linked: set[Path] = set()

    for (src, dst), category in graph.edge_categories.items():
        src_is_changed = src.path in changed_paths
        dst_is_changed = dst.path in changed_paths
        # Only edges crossing the changed/unchanged boundary are relevant.
        if src_is_changed == dst_is_changed:
            continue

        neighbor = dst.path if src_is_changed else src.path
        if category == "config_generic":
            generic_linked.add(neighbor)
        elif category in ("semantic", "config"):
            really_linked.add(neighbor)

    return {
        path
        for path in generic_linked
        if path not in really_linked and path.suffix.lower() in CODE_EXTENSIONS
    }
233+
234+
235+
def _filter_unrelated_fragments(
    fragments: list[Fragment],
    core_ids: set[FragmentId],
    graph: Graph,
) -> list[Fragment]:
    """Drop fragments that belong to files judged unrelated to the change.

    The files to drop are the union of the results of
    ``_find_hub_noise_paths`` and ``_find_config_generic_code_files``, minus
    every path that appears in ``core_ids`` (changed files are never dropped).

    Args:
        fragments: Candidate fragments, in selection order.
        core_ids: Ids of the fragments touched by the diff; their ``path``
            values define the changed files.
        graph: Fragment graph handed to both path finders.

    Returns:
        The original ``fragments`` list object when there is nothing to drop,
        otherwise a new list holding the surviving fragments in their
        original order.
    """
    changed_paths = {fid.path for fid in core_ids}

    hub_noise = _find_hub_noise_paths(graph, changed_paths)
    generic_only = _find_config_generic_code_files(graph, changed_paths)
    unrelated = (hub_noise | generic_only) - changed_paths

    if not unrelated:
        return fragments

    survivors = [frag for frag in fragments if frag.path not in unrelated]
    dropped = len(fragments) - len(survivors)
    if dropped:
        # The second figure is the number of candidate paths, which is what
        # the original message reports as well.
        logging.debug(
            "diffctx: filtered %d fragments from %d unrelated files",
            dropped,
            len(unrelated),
        )
    return survivors
258+
259+
260+
# Upper bound on how many fragments a single file outside the changed paths
# may keep; _cap_context_fragments trims larger groups down to this size.
_MAX_CONTEXT_FRAGMENTS_PER_FILE = 10
261+
262+
263+
def _cap_context_fragments(
    fragments: list[Fragment],
    core_ids: set[FragmentId],
    rel: dict[FragmentId, float],
) -> list[Fragment]:
    """Limit how many fragments each non-changed file contributes.

    Fragments whose path belongs to a changed file (any path found in
    ``core_ids``) are always kept. The remaining fragments are grouped by
    path; a group larger than ``_MAX_CONTEXT_FRAGMENTS_PER_FILE`` is reduced
    to its highest-scoring members according to ``rel`` (fragments missing
    from ``rel`` score ``0.0``; ties keep their original relative order).

    Args:
        fragments: Candidate fragments, in selection order.
        core_ids: Ids of the fragments touched by the diff.
        rel: Relevance score per fragment id.

    Returns:
        A new list: first every changed-file fragment in input order, then the
        kept fragments of each other file, grouped by file in order of first
        appearance. Capped groups are ordered by descending score.
    """
    changed_paths = {fid.path for fid in core_ids}

    capped: list[Fragment] = [frag for frag in fragments if frag.path in changed_paths]

    context_groups: dict[Path, list[Fragment]] = defaultdict(list)
    for frag in fragments:
        if frag.path not in changed_paths:
            context_groups[frag.path].append(frag)

    limit = _MAX_CONTEXT_FRAGMENTS_PER_FILE
    for path, group in context_groups.items():
        if len(group) > limit:
            # sorted() is stable, so equally scored fragments stay in input order.
            ranked = sorted(group, key=lambda frag: rel.get(frag.id, 0.0), reverse=True)
            capped.extend(ranked[:limit])
            logging.debug(
                "diffctx: capped %s from %d to %d fragments",
                path,
                len(group),
                limit,
            )
            continue
        capped.extend(group)

    return capped
293+
294+
182295
def _select_with_ppr(
183296
all_fragments: list[Fragment],
184297
core_ids: set[FragmentId],
@@ -193,20 +306,23 @@ def _select_with_ppr(
193306
rel_scores = personalized_pagerank(graph, core_ids, alpha=alpha, seed_weights=seed_weights)
194307
_apply_same_file_floor(rel_scores, core_ids, all_fragments)
195308

196-
needs = needs_from_diff(all_fragments, core_ids, graph, diff_text)
309+
filtered_fragments = _filter_unrelated_fragments(all_fragments, core_ids, graph)
310+
filtered_fragments = _cap_context_fragments(filtered_fragments, core_ids, rel_scores)
311+
312+
needs = needs_from_diff(filtered_fragments, core_ids, graph, diff_text)
197313

198314
effective_budget = budget_tokens if budget_tokens is not None else _UNLIMITED_BUDGET
199315

200316
result = lazy_greedy_select(
201-
fragments=all_fragments,
317+
fragments=filtered_fragments,
202318
core_ids=core_ids,
203319
rel=rel_scores,
204320
needs=needs,
205321
budget_tokens=effective_budget,
206322
tau=tau,
207323
)
208324

209-
selected = _coherence_post_pass(result, all_fragments, graph, effective_budget)
325+
selected = _coherence_post_pass(result, filtered_fragments, graph, effective_budget)
210326
return selected.selected, selected
211327

212328

src/treemapper/diffctx/edges/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,10 +46,11 @@ def collect_all_edges(fragments: list[Fragment], repo_root: Path | None = None)
4646
for category, get_builders in _BUILDER_CATEGORIES:
4747
for cls in get_builders():
4848
builder = cls()
49+
cat = builder.category or category
4950
for (src, dst), weight in builder.build(fragments, repo_root).items():
5051
if weight > all_edges.get((src, dst), 0.0):
5152
all_edges[(src, dst)] = weight
52-
edge_categories[(src, dst)] = category
53+
edge_categories[(src, dst)] = cat
5354
return all_edges, edge_categories
5455

5556

src/treemapper/diffctx/edges/base.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -174,6 +174,7 @@ def add_semantic_edges(
174174
class EdgeBuilder(ABC):
175175
weight: float = 0.5
176176
reverse_weight_factor: float = 0.7
177+
category: str | None = None
177178

178179
@abstractmethod
179180
def build(self, fragments: list[Fragment], repo_root: Path | None = None) -> EdgeDict:

src/treemapper/diffctx/edges/config/generic.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,7 @@ def _is_code_file(path: Path) -> bool:
109109
class ConfigToCodeEdgeBuilder(EdgeBuilder):
110110
weight = 0.45
111111
reverse_weight_factor = 0.70
112+
category = "config_generic"
112113

113114
def discover_related_files(
114115
self,

0 commit comments

Comments
 (0)