2323from .ppr import personalized_pagerank
2424from .render import build_partial_tree
2525from .select import lazy_greedy_select
26- from .types import Fragment , FragmentId , extract_identifiers
26+ from .types import DiffHunk , Fragment , FragmentId , extract_identifiers
2727from .utility import concepts_from_diff_text
2828
2929__all__ = ["GitError" , "build_diff_context" ]
@@ -64,6 +64,100 @@ def _read_file_content(
6464_MAX_FRAGMENTS = 200
6565
6666
67+ def _build_preferred_revs (base_rev : str | None , head_rev : str | None ) -> list [str ]:
68+ revs : list [str ] = []
69+ if head_rev :
70+ revs .append (head_rev )
71+ if base_rev and base_rev != head_rev :
72+ revs .append (base_rev )
73+ return revs
74+
75+
76+ def _process_files_for_fragments (
77+ files : list [Path ],
78+ root_dir : Path ,
79+ preferred_revs : list [str ],
80+ seen_frag_ids : set [FragmentId ],
81+ ) -> list [Fragment ]:
82+ fragments : list [Fragment ] = []
83+ for file_path in files :
84+ content = _read_file_content (file_path , root_dir , preferred_revs )
85+ if content is None :
86+ continue
87+ for frag in fragment_file (file_path , content ):
88+ if frag .id not in seen_frag_ids :
89+ fragments .append (frag )
90+ seen_frag_ids .add (frag .id )
91+ return fragments
92+
93+
94+ def _find_core_for_hunk (
95+ frags : list [Fragment ],
96+ h_start : int ,
97+ h_end : int ,
98+ ) -> set [FragmentId ]:
99+ core : set [FragmentId ] = set ()
100+
101+ covering = [f for f in frags if f .start_line <= h_start and h_end <= f .end_line ]
102+ if covering :
103+ best = min (covering , key = lambda f : f .line_count )
104+ core .add (best .id )
105+ return core
106+
107+ overlapping = [f for f in frags if f .start_line <= h_end and f .end_line >= h_start ]
108+ if overlapping :
109+ for f in overlapping :
110+ core .add (f .id )
111+ return core
112+
113+ enc = enclosing_fragment (frags , h_start )
114+ if enc is not None :
115+ core .add (enc .id )
116+ return core
117+
118+ before = [f for f in frags if f .end_line < h_start ]
119+ after = [f for f in frags if f .start_line > h_end ]
120+ if before :
121+ core .add (max (before , key = lambda f : f .end_line ).id )
122+ if after :
123+ core .add (min (after , key = lambda f : f .start_line ).id )
124+
125+ return core
126+
127+
128+ def _select_full_mode (
129+ all_fragments : list [Fragment ],
130+ changed_files : list [Path ],
131+ ) -> list [Fragment ]:
132+ changed_paths = set (changed_files )
133+ selected = [f for f in all_fragments if f .path in changed_paths ]
134+ selected .sort (key = lambda f : (f .path , f .start_line ))
135+ return selected
136+
137+
138+ def _select_with_ppr (
139+ all_fragments : list [Fragment ],
140+ core_ids : set [FragmentId ],
141+ concepts : frozenset [str ],
142+ budget_tokens : int | None ,
143+ alpha : float ,
144+ tau : float ,
145+ ) -> tuple [list [Fragment ], Any ]:
146+ graph = build_graph (all_fragments )
147+ rel_scores = personalized_pagerank (graph , core_ids , alpha = alpha )
148+ effective_budget = budget_tokens if budget_tokens is not None else _DEFAULT_BUDGET_TOKENS
149+
150+ result = lazy_greedy_select (
151+ fragments = all_fragments ,
152+ core_ids = core_ids ,
153+ rel = rel_scores ,
154+ concepts = concepts ,
155+ budget_tokens = effective_budget ,
156+ tau = tau ,
157+ )
158+ return result .selected , result
159+
160+
67161def build_diff_context (
68162 root_dir : Path ,
69163 diff_range : str ,
@@ -75,159 +169,109 @@ def build_diff_context(
75169 no_default_ignores : bool = False ,
76170 full : bool = False ,
77171) -> dict [str , Any ]:
78- if not is_git_repo (root_dir ):
79- raise GitError (f"'{ root_dir } ' is not a git repository" )
80-
81- if not (0.0 < alpha < 1.0 ):
82- raise ValueError (f"alpha must be in (0, 1), got { alpha } " )
83- if tau < 0.0 :
84- raise ValueError (f"tau must be >= 0, got { tau } " )
85- if budget_tokens is not None and budget_tokens <= 0 :
86- raise ValueError (f"budget_tokens must be > 0, got { budget_tokens } " )
172+ _validate_inputs (root_dir , alpha , tau , budget_tokens )
87173
88174 hunks = parse_diff (root_dir , diff_range )
89175 if not hunks :
90176 return _empty_tree (root_dir )
91177
92178 combined_spec = get_ignore_specs (root_dir , ignore_file , no_default_ignores , None )
93-
94179 diff_text = get_diff_text (root_dir , diff_range )
95180 concepts = concepts_from_diff_text (diff_text )
96181
97182 changed_files = get_changed_files (root_dir , diff_range )
98183 changed_files = _filter_ignored (changed_files , root_dir , combined_spec )
99184
100185 base_rev , head_rev = split_diff_range (diff_range )
101- preferred_revs : list [str ] = []
102- if head_rev :
103- preferred_revs .append (head_rev )
104- if base_rev and base_rev != head_rev :
105- preferred_revs .append (base_rev )
186+ preferred_revs = _build_preferred_revs (base_rev , head_rev )
106187
107- all_fragments : list [Fragment ] = []
108188 seen_frag_ids : set [FragmentId ] = set ()
109-
110- for file_path in changed_files :
111- content = _read_file_content (file_path , root_dir , preferred_revs )
112- if content is None :
113- continue
114- frags = fragment_file (file_path , content )
115- for frag in frags :
116- if frag .id not in seen_frag_ids :
117- all_fragments .append (frag )
118- seen_frag_ids .add (frag .id )
189+ all_fragments = _process_files_for_fragments (changed_files , root_dir , preferred_revs , seen_frag_ids )
119190
120191 expanded_files = _expand_universe_by_rare_identifiers (root_dir , concepts , changed_files , combined_spec )
121- for file_path in expanded_files :
122- content = _read_file_content (file_path , root_dir , preferred_revs )
123- if content is None :
124- continue
125- frags = fragment_file (file_path , content )
126- for frag in frags :
127- if frag .id not in seen_frag_ids :
128- all_fragments .append (frag )
129- seen_frag_ids .add (frag .id )
192+ all_fragments .extend (_process_files_for_fragments (expanded_files , root_dir , preferred_revs , seen_frag_ids ))
130193
131194 for frag in all_fragments :
132- token_result = count_tokens (frag .content )
133- frag .token_count = token_result .count + _OVERHEAD_PER_FRAGMENT
195+ frag .token_count = count_tokens (frag .content ).count + _OVERHEAD_PER_FRAGMENT
134196
135- frags_by_path : dict [Path , list [Fragment ]] = defaultdict (list )
136- for frag in all_fragments :
137- frags_by_path [frag .path ].append (frag )
138-
139- core_ids : set [FragmentId ] = set ()
140- for h in hunks :
141- frags = frags_by_path .get (h .path , [])
142- if not frags :
143- continue
144- h_start = h .new_start
145- h_end = h .end_line
146-
147- # Find fragments that fully cover the hunk
148- covering = [f for f in frags if f .start_line <= h_start and h_end <= f .end_line ]
149-
150- if covering :
151- # Select minimal covering fragment (smallest by line count)
152- best = min (covering , key = lambda f : f .line_count )
153- core_ids .add (best .id )
154- else :
155- # Check for fragments that OVERLAP with the hunk (partial coverage)
156- overlapping = [f for f in frags if f .start_line <= h_end and f .end_line >= h_start ]
157- if overlapping :
158- # Add all overlapping fragments as core
159- for f in overlapping :
160- core_ids .add (f .id )
161- elif (enc := enclosing_fragment (frags , h_start )) is not None :
162- # Fallback: use enclosing fragment
163- core_ids .add (enc .id )
164- else :
165- # For hunks in gaps between fragments, find nearest adjacent fragments
166- before = [f for f in frags if f .end_line < h_start ]
167- after = [f for f in frags if f .start_line > h_end ]
168- if before :
169- nearest_before = max (before , key = lambda f : f .end_line )
170- core_ids .add (nearest_before .id )
171- if after :
172- nearest_after = min (after , key = lambda f : f .start_line )
173- core_ids .add (nearest_after .id )
197+ core_ids = _identify_core_fragments (hunks , all_fragments )
174198
175199 if full :
176- changed_paths = set (changed_files )
177- selected = [f for f in all_fragments if f .path in changed_paths ]
178- selected .sort (key = lambda f : (f .path , f .start_line ))
179-
180- try :
181- used = sum (f .token_count for f in selected )
182- logging .info (
183- "diffctx: full mode selected=%d from changed files used=%d tokens" ,
184- len (selected ),
185- used ,
186- )
187- except (TypeError , AttributeError ) as e :
188- # nosemgrep: python-logger-credential-disclosure
189- logging .debug ("diffctx: failed to compute token count: %s" , e )
200+ selected = _select_full_mode (all_fragments , changed_files )
201+ _log_full_mode (selected )
190202 else :
191- graph = build_graph (all_fragments )
203+ selected , result = _select_with_ppr (all_fragments , core_ids , concepts , budget_tokens , alpha , tau )
204+ _log_ppr_mode (selected , core_ids , budget_tokens , result , alpha , tau )
192205
193- rel_scores = personalized_pagerank (graph , core_ids , alpha = alpha )
206+ if no_content :
207+ for frag in selected :
208+ frag .content = ""
194209
195- effective_budget = budget_tokens if budget_tokens is not None else _DEFAULT_BUDGET_TOKENS
210+ return build_partial_tree ( root_dir , selected )
196211
197- result = lazy_greedy_select (
198- fragments = all_fragments ,
199- core_ids = core_ids ,
200- rel = rel_scores ,
201- concepts = concepts ,
202- budget_tokens = effective_budget ,
203- tau = tau ,
204- )
205212
206- selected = result .selected
213+ def _validate_inputs (root_dir : Path , alpha : float , tau : float , budget_tokens : int | None ) -> None :
214+ if not is_git_repo (root_dir ):
215+ raise GitError (f"'{ root_dir } ' is not a git repository" )
216+ if not (0.0 < alpha < 1.0 ):
217+ raise ValueError (f"alpha must be in (0, 1), got { alpha } " )
218+ if tau < 0.0 :
219+ raise ValueError (f"tau must be >= 0, got { tau } " )
220+ if budget_tokens is not None and budget_tokens <= 0 :
221+ raise ValueError (f"budget_tokens must be > 0, got { budget_tokens } " )
207222
208- try :
209- used = sum (f .token_count for f in selected )
210- budget_str = str (budget_tokens ) if budget_tokens is not None else "unlimited"
211- logging .info (
212- "diffctx: selected=%d core=%d used=%d/%s reason=%s utility=%.4f alpha=%.3f tau=%.3f" ,
213- len (selected ),
214- len (core_ids ),
215- used ,
216- budget_str ,
217- result .reason ,
218- result .utility ,
219- alpha ,
220- tau ,
221- )
222- except (TypeError , AttributeError ) as e :
223- # nosemgrep: python-logger-credential-disclosure
224- logging .debug ("diffctx: failed to compute token count: %s" , e )
225223
226- if no_content :
227- for frag in selected :
228- frag .content = ""
224+ def _identify_core_fragments (hunks : list [DiffHunk ], all_fragments : list [Fragment ]) -> set [FragmentId ]:
225+ frags_by_path : dict [Path , list [Fragment ]] = defaultdict (list )
226+ for frag in all_fragments :
227+ frags_by_path [frag .path ].append (frag )
229228
230- return build_partial_tree (root_dir , selected )
229+ core_ids : set [FragmentId ] = set ()
230+ for h in hunks :
231+ frags = frags_by_path .get (h .path , [])
232+ if frags :
233+ core_ids .update (_find_core_for_hunk (frags , h .new_start , h .end_line ))
234+ return core_ids
235+
236+
237+ def _log_full_mode (selected : list [Fragment ]) -> None :
238+ try :
239+ used = sum (f .token_count for f in selected )
240+ logging .info (
241+ "diffctx: full mode selected=%d from changed files used=%d tokens" ,
242+ len (selected ),
243+ used ,
244+ )
245+ except (TypeError , AttributeError ) as e :
246+ # nosemgrep: python-logger-credential-disclosure
247+ logging .debug ("diffctx: failed to compute token count: %s" , e )
248+
249+
250+ def _log_ppr_mode (
251+ selected : list [Fragment ],
252+ core_ids : set [FragmentId ],
253+ budget_tokens : int | None ,
254+ result : Any ,
255+ alpha : float ,
256+ tau : float ,
257+ ) -> None :
258+ try :
259+ used = sum (f .token_count for f in selected )
260+ budget_str = str (budget_tokens ) if budget_tokens is not None else "unlimited"
261+ logging .info (
262+ "diffctx: selected=%d core=%d used=%d/%s reason=%s utility=%.4f alpha=%.3f tau=%.3f" ,
263+ len (selected ),
264+ len (core_ids ),
265+ used ,
266+ budget_str ,
267+ result .reason ,
268+ result .utility ,
269+ alpha ,
270+ tau ,
271+ )
272+ except (TypeError , AttributeError ) as e :
273+ # nosemgrep: python-logger-credential-disclosure
274+ logging .debug ("diffctx: failed to compute token count: %s" , e )
231275
232276
233277_MAX_FILE_SIZE = 100_000 # 100KB
0 commit comments