@@ -179,6 +179,119 @@ def _apply_same_file_floor(
179179 rel [frag .id ] = _SAME_FILE_FLOOR
180180
181181
# A changed file must have MORE than this many distinct reverse-weighted
# neighbour files before those neighbours are reported as hub noise.
_HUB_REVERSE_THRESHOLD = 5


def _find_hub_noise_paths(
    graph: Graph,
    changed_paths: set[Path],
) -> set[Path]:
    """Return paths of files that only hang off a heavily referenced changed file.

    Only edges categorised as ``"semantic"`` with exactly one endpoint inside
    ``changed_paths`` are examined. For each such edge the adjacency weight
    from the unchanged fragment to the changed fragment is compared with the
    weight in the opposite direction; when the former is strictly larger, the
    unchanged file is recorded against the changed file.

    Args:
        graph: Object exposing ``edge_categories`` (mapping ``(src, dst)``
            fragment pairs to a category string) and ``adjacency`` (nested
            mapping fragment -> fragment -> weight).
        changed_paths: Paths of the files touched by the diff.

    Returns:
        The union of all recorded unchanged paths for every changed file that
        accumulated more than ``_HUB_REVERSE_THRESHOLD`` of them. Empty when no
        changed file crosses the threshold.
    """

    def edge_weight(origin, target):
        # A missing row or a missing cell both count as weight 0.0.
        return graph.adjacency.get(origin, {}).get(target, 0.0)

    dependents_by_changed: dict[Path, set[Path]] = defaultdict(set)

    for (src, dst), category in graph.edge_categories.items():
        if category != "semantic":
            continue

        src_hit = src.path in changed_paths
        dst_hit = dst.path in changed_paths
        if src_hit == dst_hit:
            # Both endpoints changed, or neither did: not a boundary edge.
            continue

        changed_frag, outside_frag = (src, dst) if src_hit else (dst, src)

        outgoing = edge_weight(changed_frag, outside_frag)
        incoming = edge_weight(outside_frag, changed_frag)

        # Presumably a heavier incoming weight means the outside file depends
        # on the changed one rather than the reverse -- confirm against how
        # adjacency weights are assigned.
        if outgoing < incoming:
            dependents_by_changed[changed_frag.path].add(outside_frag.path)

    return {
        dependent
        for dependents in dependents_by_changed.values()
        if len(dependents) > _HUB_REVERSE_THRESHOLD
        for dependent in dependents
    }
212+
213+
def _find_config_generic_code_files(
    graph: Graph,
    changed_paths: set[Path],
) -> set[Path]:
    """Return code files tied to the change set only by ``"config_generic"`` edges.

    Every edge with exactly one endpoint inside ``changed_paths`` is inspected
    and the path of its other endpoint is bucketed by edge category. A path is
    reported when it was reached through at least one ``"config_generic"``
    edge, was never reached through a ``"semantic"`` or ``"config"`` edge, and
    its lower-cased suffix is a member of ``CODE_EXTENSIONS``.

    Args:
        graph: Object exposing ``edge_categories``, a mapping from
            ``(src, dst)`` fragment pairs to a category string.
        changed_paths: Paths of the files touched by the diff.

    Returns:
        The set of qualifying paths; empty when there are none.
    """
    generic_linked: set[Path] = set()
    strongly_linked: set[Path] = set()

    for (src, dst), category in graph.edge_categories.items():
        src_hit = src.path in changed_paths
        dst_hit = dst.path in changed_paths
        if src_hit == dst_hit:
            # Skip edges fully inside or fully outside the change set.
            continue

        neighbour = (dst if src_hit else src).path

        if category == "config_generic":
            generic_linked.add(neighbour)
        elif category == "semantic" or category == "config":
            strongly_linked.add(neighbour)

    return {
        path
        for path in generic_linked
        if path not in strongly_linked
        and path.suffix.lower() in CODE_EXTENSIONS
    }
233+
234+
def _filter_unrelated_fragments(
    fragments: list[Fragment],
    core_ids: set[FragmentId],
    graph: Graph,
) -> list[Fragment]:
    """Drop fragments belonging to files flagged as unrelated to the change.

    The files to exclude are the union of the results of
    ``_find_hub_noise_paths`` and ``_find_config_generic_code_files``, minus
    any file that itself contains a core fragment.

    Args:
        fragments: Candidate fragments to filter.
        core_ids: Ids of the core fragments; their ``path`` values define the
            set of changed files.
        graph: Graph passed through to the two path-finding helpers.

    Returns:
        ``fragments`` itself (same object) when no file is excluded; otherwise
        a new list holding the fragments whose path is not excluded, in their
        original order.
    """
    changed_paths = {core_id.path for core_id in core_ids}

    hub_noise = _find_hub_noise_paths(graph, changed_paths)
    generic_only = _find_config_generic_code_files(graph, changed_paths)

    # Changed files are never excluded, whatever the heuristics report.
    excluded = (hub_noise | generic_only) - changed_paths
    if not excluded:
        return fragments

    survivors = [frag for frag in fragments if frag.path not in excluded]
    dropped = len(fragments) - len(survivors)
    if dropped > 0:
        logging.debug(
            "diffctx: filtered %d fragments from %d unrelated files",
            dropped,
            len(excluded),
        )
    return survivors
258+
259+
# Upper bound on how many fragments a single non-changed file may contribute.
_MAX_CONTEXT_FRAGMENTS_PER_FILE = 10


def _cap_context_fragments(
    fragments: list[Fragment],
    core_ids: set[FragmentId],
    rel: dict[FragmentId, float],
) -> list[Fragment]:
    """Limit the number of fragments taken from each non-changed file.

    Fragments whose path matches the path of any core id are always kept.
    Remaining fragments are grouped by path; a group larger than
    ``_MAX_CONTEXT_FRAGMENTS_PER_FILE`` is reduced to its highest-scoring
    members according to ``rel`` (a missing score counts as ``0.0``; ties keep
    their input order because the sort is stable).

    Args:
        fragments: Candidate fragments.
        core_ids: Ids of the core fragments; their ``path`` values define the
            set of changed files.
        rel: Relevance score per fragment id.

    Returns:
        A new list: first every changed-file fragment in input order, then the
        groups in order of first appearance. Groups within the cap keep input
        order; capped groups are ordered by descending score.
    """
    changed_paths = {core_id.path for core_id in core_ids}

    kept: list[Fragment] = []
    context_groups: dict[Path, list[Fragment]] = {}

    for frag in fragments:
        if frag.path in changed_paths:
            kept.append(frag)
            continue
        context_groups.setdefault(frag.path, []).append(frag)

    for path, group in context_groups.items():
        group_size = len(group)
        if group_size > _MAX_CONTEXT_FRAGMENTS_PER_FILE:
            ranked = sorted(
                group,
                key=lambda frag: rel.get(frag.id, 0.0),
                reverse=True,
            )
            kept.extend(ranked[:_MAX_CONTEXT_FRAGMENTS_PER_FILE])
            logging.debug(
                "diffctx: capped %s from %d to %d fragments",
                path,
                group_size,
                _MAX_CONTEXT_FRAGMENTS_PER_FILE,
            )
        else:
            kept.extend(group)

    return kept
293+
294+
182295def _select_with_ppr (
183296 all_fragments : list [Fragment ],
184297 core_ids : set [FragmentId ],
@@ -193,20 +306,23 @@ def _select_with_ppr(
193306 rel_scores = personalized_pagerank (graph , core_ids , alpha = alpha , seed_weights = seed_weights )
194307 _apply_same_file_floor (rel_scores , core_ids , all_fragments )
195308
196- needs = needs_from_diff (all_fragments , core_ids , graph , diff_text )
309+ filtered_fragments = _filter_unrelated_fragments (all_fragments , core_ids , graph )
310+ filtered_fragments = _cap_context_fragments (filtered_fragments , core_ids , rel_scores )
311+
312+ needs = needs_from_diff (filtered_fragments , core_ids , graph , diff_text )
197313
198314 effective_budget = budget_tokens if budget_tokens is not None else _UNLIMITED_BUDGET
199315
200316 result = lazy_greedy_select (
201- fragments = all_fragments ,
317+ fragments = filtered_fragments ,
202318 core_ids = core_ids ,
203319 rel = rel_scores ,
204320 needs = needs ,
205321 budget_tokens = effective_budget ,
206322 tau = tau ,
207323 )
208324
209- selected = _coherence_post_pass (result , all_fragments , graph , effective_budget )
325+ selected = _coherence_post_pass (result , filtered_fragments , graph , effective_budget )
210326 return selected .selected , selected
211327
212328
0 commit comments