@@ -15,6 +15,15 @@ class DiscoveryContext:
1515 all_candidate_files : list [Path ]
1616 diff_text : str
1717 expansion_concepts : frozenset [str ]
18+ file_cache : dict [Path , str ] | None = None
19+
20+ def read_file (self , path : Path ) -> str | None :
21+ if self .file_cache is not None and path in self .file_cache :
22+ return self .file_cache [path ]
23+ try :
24+ return path .read_text (encoding = "utf-8" )
25+ except (OSError , UnicodeDecodeError ):
26+ return None
1827
1928
2029class DiscoveryStrategy (ABC ):
@@ -43,6 +52,75 @@ def discover(self, ctx: DiscoveryContext) -> list[Path]:
4352 return list (dict .fromkeys (edge_discovered + expanded ))
4453
4554
class BM25Discovery(DiscoveryStrategy):
    """Rank unchanged candidate files against the diff text with BM25.

    The diff text is tokenized into lower-cased identifiers of at least
    three characters and used as the query. Every candidate file that is
    not itself a changed file and whose content can be read is a document.
    The ``top_k`` highest-scoring documents with a strictly positive score
    are returned.

    Args:
        top_k: Maximum number of files to return. Values ``<= 0`` yield an
            empty result.
        k1: BM25 term-frequency saturation parameter.
        b: BM25 document-length normalization parameter.
    """

    def __init__(self, top_k: int = 1, k1: float = 1.5, b: float = 0.75) -> None:
        self.top_k = top_k
        self.k1 = k1
        self.b = b

    def discover(self, ctx: DiscoveryContext) -> list[Path]:
        """Return up to ``top_k`` candidate paths, best BM25 match first.

        Reads ``ctx.diff_text``, ``ctx.changed_files`` and
        ``ctx.all_candidate_files``, and loads file content through
        ``ctx.read_file``. Files for which ``read_file`` returns ``None``
        are ignored. Ties keep the order of ``all_candidate_files``.
        """
        import math
        import re
        from collections import Counter

        limit = self.top_k
        if limit <= 0:
            # A negative slice bound would mean "all but the last k".
            return []

        token_re = re.compile(r"[A-Za-z_]\w{2,}")
        query_terms = {m.group().lower() for m in token_re.finditer(ctx.diff_text)}
        if not query_terms:
            return []

        changed = set(ctx.changed_files)
        paths: list[Path] = []
        doc_lens: list[int] = []
        # Per-document counts restricted to query terms; other tokens only
        # contribute to the document length.
        term_freqs: list[Counter[str]] = []
        for candidate in ctx.all_candidate_files:
            if candidate in changed:
                continue
            content = ctx.read_file(candidate)
            if content is None:
                continue
            tokens = [m.group().lower() for m in token_re.finditer(content)]
            paths.append(candidate)
            doc_lens.append(len(tokens))
            term_freqs.append(Counter(t for t in tokens if t in query_terms))

        if not paths:
            return []

        n_docs = len(paths)
        avgdl = sum(doc_lens) / n_docs

        # Document frequency: number of documents containing each query term.
        df: Counter[str] = Counter()
        for tf in term_freqs:
            df.update(tf.keys())
        idf = {
            term: math.log((n_docs - n + 0.5) / (n + 0.5) + 1.0)
            for term, n in df.items()
        }

        k1, b = self.k1, self.b
        scores: list[float] = []
        for tf, dl in zip(term_freqs, doc_lens):
            if not tf:
                # No query term present; also avoids dividing by avgdl == 0
                # when every document is empty.
                scores.append(0.0)
                continue
            norm = k1 * (1 - b + b * dl / avgdl)
            # Summing in first-occurrence order (not set order) keeps the
            # floating-point result independent of string hash seeds.
            scores.append(
                sum(
                    (idf[term] * (freq * (k1 + 1)) / (freq + norm) for term, freq in tf.items()),
                    0.0,
                )
            )

        # Stable descending sort: equal scores keep candidate order.
        ranked = sorted(range(n_docs), key=scores.__getitem__, reverse=True)
        return [paths[i] for i in ranked[:limit] if scores[i] > 0]
109+
110+
111+ class EnsembleDiscovery (DiscoveryStrategy ):
112+ def __init__ (self , strategies : list [DiscoveryStrategy ] | None = None ) -> None :
113+ self ._strategies = strategies or [DefaultDiscovery (), BM25Discovery ()]
114+
115+ def discover (self , ctx : DiscoveryContext ) -> list [Path ]:
116+ seen : dict [Path , None ] = {}
117+ for strategy in self ._strategies :
118+ for path in strategy .discover (ctx ):
119+ if path not in seen :
120+ seen [path ] = None
121+ return list (seen .keys ())
122+
123+
46124@dataclass
47125class ScoringResult :
48126 rel_scores : dict [FragmentId , float ]
0 commit comments