@@ -193,6 +193,27 @@ def _is_generated_file(path: Path, content: str) -> bool:
193193 return False
194194
195195
def _truncate_generated_fragments(file_frags: list[Fragment]) -> list[Fragment]:
    """Cap every fragment of a generated file at ``_MAX_GENERATED_LINES`` lines.

    Fragments whose ``line_count`` is within the limit are passed through
    untouched.  Longer fragments are replaced by a new ``Fragment`` holding
    only the leading lines plus a trailing marker line stating how many
    lines were dropped; its id is rebuilt so the end line matches the kept
    lines, and its identifiers are re-extracted from the shortened text.

    Returns a new list; the input list is not modified.
    """
    result: list[Fragment] = []
    for fragment in file_frags:
        if fragment.line_count > _MAX_GENERATED_LINES:
            all_lines = fragment.content.splitlines()
            kept = all_lines[:_MAX_GENERATED_LINES]
            omitted = len(all_lines) - _MAX_GENERATED_LINES
            # NOTE(review): separator and marker literals are kept exactly as
            # found (newline followed by a space) -- confirm that is intended.
            shortened = "\n ".join(kept) + f"\n # ... [{omitted} more lines]"
            last_line = fragment.start_line + len(kept) - 1
            fragment = Fragment(
                id=FragmentId(fragment.path, fragment.start_line, last_line),
                kind=fragment.kind,
                content=shortened,
                identifiers=extract_identifiers(shortened),
            )
        result.append(fragment)
    return result
215+
216+
196217def _process_files_for_fragments (
197218 files : list [Path ],
198219 root_dir : Path ,
@@ -221,24 +242,7 @@ def _process_files_for_fragments(
221242 )
222243
223244 if is_generated :
224- truncated : list [Fragment ] = []
225- for frag in file_frags :
226- if frag .line_count > _MAX_GENERATED_LINES :
227- lines = frag .content .splitlines ()
228- remaining = len (lines ) - _MAX_GENERATED_LINES
229- lines = lines [:_MAX_GENERATED_LINES ]
230- truncated_content = "\n " .join (lines ) + f"\n # ... [{ remaining } more lines]"
231- truncated .append (
232- Fragment (
233- id = FragmentId (frag .path , frag .start_line , frag .start_line + len (lines ) - 1 ),
234- kind = frag .kind ,
235- content = truncated_content ,
236- identifiers = extract_identifiers (truncated_content ),
237- )
238- )
239- else :
240- truncated .append (frag )
241- file_frags = truncated
245+ file_frags = _truncate_generated_fragments (file_frags )
242246
243247 for frag in file_frags :
244248 fragments .append (frag )
@@ -303,10 +307,10 @@ def _apply_same_file_floor(
303307_HUB_REVERSE_THRESHOLD = 3
304308
305309
306- def _find_hub_noise_paths (
310+ def _classify_semantic_edges (
307311 graph : Graph ,
308312 changed_paths : set [Path ],
309- ) -> set [Path ]:
313+ ) -> tuple [ dict [ Path , set [Path ]], set [ Path ] ]:
310314 reverse_deps : dict [Path , set [Path ]] = defaultdict (set )
311315 direct_edge_paths : set [Path ] = set ()
312316 for (src , dst ), category in graph .edge_categories .items ():
@@ -327,6 +331,14 @@ def _find_hub_noise_paths(
327331 reverse_deps [changed_frag .path ].add (other_frag .path )
328332 else :
329333 direct_edge_paths .add (other_frag .path )
334+ return reverse_deps , direct_edge_paths
335+
336+
337+ def _find_hub_noise_paths (
338+ graph : Graph ,
339+ changed_paths : set [Path ],
340+ ) -> set [Path ]:
341+ reverse_deps , direct_edge_paths = _classify_semantic_edges (graph , changed_paths )
330342
331343 noise_counts : dict [Path , int ] = {}
332344 for deps in reverse_deps .values ():
@@ -440,6 +452,44 @@ def _filter_low_relevance_fragments(
440452 return kept
441453
442454
def _create_whole_file_fragment(
    path: Path,
    root_dir: Path,
    preferred_revs: list[str],
) -> Fragment | None:
    """Build a single ``"chunk"`` fragment covering the whole of ``path``.

    The text comes from ``_read_file_content``.  ``None`` is returned when
    that yields nothing or only whitespace.  When ``_is_generated_file``
    flags the file and it exceeds ``_MAX_GENERATED_LINES`` lines, the text is
    cut to that many lines and a marker line with the number of dropped
    lines is appended.

    The returned fragment starts at line 1, ends at the line count of the
    (possibly shortened) text, and has ``token_count`` set to the token
    count of the text plus ``_OVERHEAD_PER_FRAGMENT``.
    """
    text = _read_file_content(path, root_dir, preferred_revs)
    if not (text and text.strip()):
        return None

    if _is_generated_file(path, text):
        source_lines = text.splitlines()
        overflow = len(source_lines) - _MAX_GENERATED_LINES
        if overflow > 0:
            # NOTE(review): separator and marker literals are kept exactly as
            # found (newline followed by a space) -- confirm that is intended.
            text = "\n ".join(source_lines[:_MAX_GENERATED_LINES]) + f"\n # ... [{overflow} more lines]"

    # Count lines on the final text so the marker line is included.
    line_total = len(text.splitlines())
    whole = Fragment(
        id=FragmentId(path=path, start_line=1, end_line=line_total),
        kind="chunk",
        content=text,
        identifiers=extract_identifiers(text),
    )
    whole.token_count = count_tokens(text).count + _OVERHEAD_PER_FRAGMENT
    return whole
477+
478+
def _pick_smallest_fitting(
    candidates: list[Fragment],
    selected_ids: set[FragmentId],
    budget_left: int,
) -> Fragment | None:
    """Return the cheapest eligible candidate if it fits within ``budget_left``.

    A candidate is eligible when its ``token_count`` is positive and its id
    is not already in ``selected_ids``.  Among eligible candidates the one
    with the smallest ``token_count`` is chosen (the earliest in
    ``candidates`` wins ties).  ``None`` is returned when nothing is eligible
    or when even the cheapest eligible candidate exceeds the budget.
    """
    eligible = (
        cand
        for cand in candidates
        if cand.token_count > 0 and cand.id not in selected_ids
    )
    # If the cheapest eligible candidate does not fit, no larger one can,
    # so a single pass with min() replaces sorting the whole list.
    cheapest = min(eligible, key=lambda f: f.token_count, default=None)
    if cheapest is None or cheapest.token_count > budget_left:
        return None
    return cheapest
491+
492+
443493def _ensure_changed_files_represented (
444494 selected : list [Fragment ],
445495 all_fragments : list [Fragment ],
@@ -449,8 +499,7 @@ def _ensure_changed_files_represented(
449499 preferred_revs : list [str ],
450500) -> list [Fragment ]:
451501 selected_paths = {f .path for f in selected }
452- changed_paths = set (changed_files )
453- missing_paths = changed_paths - selected_paths
502+ missing_paths = set (changed_files ) - selected_paths
454503
455504 if not missing_paths :
456505 return selected
@@ -466,37 +515,11 @@ def _ensure_changed_files_represented(
466515
467516 for path in sorted (missing_paths ):
468517 candidates = frags_by_path .get (path , [])
469-
470518 if not candidates :
471- content = _read_file_content (path , root_dir , preferred_revs )
472- if content and content .strip ():
473- if _is_generated_file (path , content ):
474- lines = content .splitlines ()
475- if len (lines ) > _MAX_GENERATED_LINES :
476- remaining = len (lines ) - _MAX_GENERATED_LINES
477- content = "\n " .join (lines [:_MAX_GENERATED_LINES ]) + f"\n # ... [{ remaining } more lines]"
478- lines = content .splitlines ()
479- else :
480- lines = content .splitlines ()
481- frag = Fragment (
482- id = FragmentId (path = path , start_line = 1 , end_line = len (lines )),
483- kind = "chunk" ,
484- content = content ,
485- identifiers = extract_identifiers (content ),
486- )
487- frag .token_count = count_tokens (content ).count + _OVERHEAD_PER_FRAGMENT
488- candidates = [frag ]
519+ fallback = _create_whole_file_fragment (path , root_dir , preferred_revs )
520+ candidates = [fallback ] if fallback else []
489521
490- if not candidates :
491- continue
492- ranked = sorted (candidates , key = lambda f : f .token_count )
493- picked = None
494- for cand in ranked :
495- if cand .token_count <= 0 or cand .id in selected_ids :
496- continue
497- if cand .token_count <= budget_left :
498- picked = cand
499- break
522+ picked = _pick_smallest_fitting (candidates , selected_ids , budget_left )
500523 if picked is not None :
501524 added .append (picked )
502525 selected_ids .add (picked .id )
@@ -546,6 +569,25 @@ def _select_with_ppr(
546569 return selected .selected , selected
547570
548571
def _resolve_changed_files(
    root_dir: Path,
    diff_range: str,
    untracked: list[Path],
    combined_spec: pathspec.PathSpec,
    wl_spec: pathspec.PathSpec | None,
) -> list[Path]:
    """Assemble the list of changed files to consider for ``diff_range``.

    Steps, in order:
      1. take the paths reported by ``get_changed_files`` and normalise each
         one against ``root_dir``;
      2. append the ``untracked`` paths;
      3. pass the list through ``_filter_ignored`` and ``_filter_whitelist``;
      4. drop any path whose resolved form is reported by
         ``get_deleted_files`` or ``get_renamed_old_paths``.

    The ``untracked`` list itself is not modified.
    """
    candidates = [
        _normalize_path(raw, root_dir)
        for raw in get_changed_files(root_dir, diff_range)
    ]
    candidates += untracked
    candidates = _filter_ignored(candidates, root_dir, combined_spec)
    candidates = _filter_whitelist(candidates, root_dir, wl_spec)

    deleted = get_deleted_files(root_dir, diff_range)
    renamed_from = get_renamed_old_paths(root_dir, diff_range)
    gone = deleted | renamed_from
    if not gone:
        return candidates
    # Membership is tested on the resolved path, as in the exclusion sets.
    return [p for p in candidates if p.resolve() not in gone]
589+
590+
549591def build_diff_context (
550592 root_dir : Path ,
551593 diff_range : str ,
@@ -568,9 +610,8 @@ def build_diff_context(
568610 combined_spec = get_ignore_specs (root_dir , ignore_file , no_default_ignores , None )
569611 wl_spec = get_whitelist_spec (whitelist_file , root_dir )
570612
571- untracked : list [Path ] = []
572- if is_working_tree_diff :
573- untracked = _discover_untracked_files (root_dir , combined_spec )
613+ untracked = _discover_untracked_files (root_dir , combined_spec ) if is_working_tree_diff else []
614+ if untracked :
574615 hunks .extend (_synthetic_hunks (untracked ))
575616
576617 if not hunks :
@@ -581,15 +622,7 @@ def build_diff_context(
581622 if untracked :
582623 expansion_concepts = _enrich_concepts (expansion_concepts , untracked )
583624
584- changed_files = get_changed_files (root_dir , diff_range )
585- changed_files = [_normalize_path (p , root_dir ) for p in changed_files ]
586- changed_files .extend (untracked )
587- changed_files = _filter_ignored (changed_files , root_dir , combined_spec )
588- changed_files = _filter_whitelist (changed_files , root_dir , wl_spec )
589-
590- excluded_paths = get_deleted_files (root_dir , diff_range ) | get_renamed_old_paths (root_dir , diff_range )
591- if excluded_paths :
592- changed_files = [f for f in changed_files if f .resolve () not in excluded_paths ]
625+ changed_files = _resolve_changed_files (root_dir , diff_range , untracked , combined_spec , wl_spec )
593626
594627 preferred_revs = _build_preferred_revs (base_rev , head_rev )
595628
@@ -736,44 +769,75 @@ def _coherence_post_pass(
736769 )
737770
738771
739- def _compute_seed_weights (
772+ def _map_hunks_to_fragments (
740773 hunks : list [DiffHunk ],
741774 core_ids : set [FragmentId ],
742775 all_fragments : list [Fragment ],
743776) -> dict [FragmentId , float ]:
744- frag_hunk_lines : dict [FragmentId , float ] = {}
777+ result : dict [FragmentId , float ] = {}
745778 for h in hunks :
746779 h_start , h_end = h .core_selection_range
747780 hunk_size = max (1 , h_end - h_start + 1 )
748781 for frag in all_fragments :
749782 if frag .id not in core_ids or frag .path != h .path :
750783 continue
751784 if frag .start_line <= h_end and frag .end_line >= h_start :
752- frag_hunk_lines [frag .id ] = frag_hunk_lines .get (frag .id , 0 ) + hunk_size
753- if not frag_hunk_lines :
754- return {}
785+ result [frag .id ] = result .get (frag .id , 0 ) + hunk_size
786+ return result
787+
755788
def _add_container_weights(
    frag_hunk_lines: dict[FragmentId, float],
    core_ids: set[FragmentId],
    all_fragments: list[Fragment],
) -> None:
    """Give unweighted core container fragments the weight of what they enclose.

    A fragment is considered when its id is in ``core_ids``, it has no entry
    in ``frag_hunk_lines`` yet, and its kind is one of
    ``_CONTAINER_FRAGMENT_KINDS``.  Its weight is the sum of the weights of
    all ids already in ``frag_hunk_lines`` that share its path and lie
    entirely within its line span; the entry is only written when that sum
    is positive.

    ``frag_hunk_lines`` is updated in place, so entries written for earlier
    fragments are visible when later fragments are processed.
    """
    for container in all_fragments:
        cid = container.id
        needs_weight = cid in core_ids and cid not in frag_hunk_lines
        if not (needs_weight and container.kind in _CONTAINER_FRAGMENT_KINDS):
            continue

        enclosed_total = 0
        for inner_id, weight in frag_hunk_lines.items():
            if inner_id.path != container.path:
                continue
            if container.start_line <= inner_id.start_line and inner_id.end_line <= container.end_line:
                enclosed_total += weight

        if enclosed_total > 0:
            frag_hunk_lines[cid] = enclosed_total
767806
807+
def _best_hunk_size_for_path(hunks: list[DiffHunk], path: Path) -> int:
    """Return the largest ``core_selection_range`` length among hunks of ``path``.

    The length is inclusive (``end - start + 1``).  The result is never
    negative: 0 is returned when no hunk has this path or when every
    matching range has a non-positive length.
    """
    spans = (
        last - first + 1
        for first, last in (h.core_selection_range for h in hunks if h.path == path)
    )
    return max(0, max(spans, default=0))
815+
816+
def _fill_missing_core_weights(
    frag_hunk_lines: dict[FragmentId, float],
    core_ids: set[FragmentId],
    hunks: list[DiffHunk],
) -> None:
    """Assign a fallback weight to core ids that still have none.

    Every id in ``core_ids`` without an entry in ``frag_hunk_lines`` receives
    the value of ``_best_hunk_size_for_path`` for its path, provided that
    value is positive.  ``frag_hunk_lines`` is updated in place.
    """
    # The best hunk size depends only on the path, so compute it once per
    # path rather than rescanning all hunks for every missing fragment.
    best_by_path: dict[Path, int] = {}
    for fid in core_ids:
        if fid in frag_hunk_lines:
            continue
        path = fid.path
        if path not in best_by_path:
            best_by_path[path] = _best_hunk_size_for_path(hunks, path)
        best = best_by_path[path]
        if best > 0:
            frag_hunk_lines[fid] = best
828+
829+
def _compute_seed_weights(
    hunks: list[DiffHunk],
    core_ids: set[FragmentId],
    all_fragments: list[Fragment],
) -> dict[FragmentId, float]:
    """Compute per-fragment seed weights for the core fragments.

    The weights start from ``_map_hunks_to_fragments``.  If that mapping is
    empty, an empty dict is returned and nothing else runs.  Otherwise the
    mapping is extended in place, first by ``_add_container_weights`` and
    then by ``_fill_missing_core_weights``, and returned.
    """
    weights = _map_hunks_to_fragments(hunks, core_ids, all_fragments)
    if weights:
        # Order matters: container weights are derived before the per-path
        # fallback fills whatever is still missing.
        _add_container_weights(weights, core_ids, all_fragments)
        _fill_missing_core_weights(weights, core_ids, hunks)
        return weights
    return {}
779843
0 commit comments