@@ -99,6 +99,66 @@ def create_smart_batches_wrapper(
9999
100100# ── RAG Context ───────────────────────────────────────────────
101101
# Language extension groups for cross-language filtering.
# Each key is a language-group name; each value is the set of lowercase file
# extensions (leading dot included) that belong to that group.
_LANG_GROUPS = {
    'php': {'.php', '.phtml'},
    'java': {'.java'},
    'py': {'.py'},
    'js': {'.js', '.jsx', '.ts', '.tsx', '.vue', '.svelte'},
    'rb': {'.rb', '.erb'},
    'go': {'.go'},
    'cs': {'.cs'},
    'xml': {'.xml', '.xsd'},
}
# Reverse lookup: extension → group key.
# Built with a comprehension so that no loop temporaries are left behind in
# the module namespace. If an extension were ever listed in two groups, the
# group that appears later in _LANG_GROUPS wins.
_EXT_TO_GROUP: Dict[str, str] = {
    ext: group_key
    for group_key, exts in _LANG_GROUPS.items()
    for ext in exts
}
118+
119+
def _detect_batch_language(
    file_paths: Optional[List[str]],
    *,
    min_share: float = 0.7,
) -> Optional[str]:
    """Detect the dominant language group of a batch by file extensions.

    Each path's extension is lowercased and looked up in ``_EXT_TO_GROUP``.
    Paths that are empty/``None``, have no extension, or have an extension
    that is not in the lookup table are ignored: they count neither for a
    group nor towards the total.

    Args:
        file_paths: Paths of the files in the batch. ``None`` or an empty
            list yields ``None``.
        min_share: Minimum fraction (0..1) of the *recognised* files that the
            most frequent group must reach to be considered dominant.

    Returns:
        The key of the dominant language group, or ``None`` when no file has
        a recognised extension or no group reaches ``min_share``.
    """
    counts: Dict[str, int] = {}
    for fp in file_paths or ():
        if not fp:
            # Tolerate None / empty entries instead of crashing in splitext.
            continue
        ext = os.path.splitext(fp)[1].lower()
        if not ext:
            # No extension → nothing to classify.
            continue
        group = _EXT_TO_GROUP.get(ext)
        if group:
            counts[group] = counts.get(group, 0) + 1
    if not counts:
        return None
    top = max(counts, key=counts.get)
    # Dominance is measured against recognised files only, so unclassified
    # files in the batch do not dilute the share.
    total = sum(counts.values())
    if counts[top] / total >= min_share:
        return top
    return None
136+
137+
# Extensions that are never filtered by language (config, markup, docs,
# templates). Hoisted to module level so the collection is built once rather
# than on every call.
_LANG_NEUTRAL_EXTS = frozenset({
    '.xml', '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg',
    '.properties', '.conf', '.env', '.md', '.txt', '.html',
    '.phtml', '.twig', '.hbs',
})


def _chunk_matches_language(chunk: Dict, batch_lang: Optional[str]) -> bool:
    """Return True if a chunk is compatible with the batch language group.

    The chunk's path is taken from ``chunk["metadata"]["path"]``, then
    ``chunk["file_path"]``, then ``chunk["path"]``. The filter is permissive:
    a chunk is rejected only when its extension maps to a known language
    group that differs from ``batch_lang``.

    Args:
        chunk: Chunk dict. A missing, ``None`` or non-dict ``"metadata"``
            value is treated as empty metadata.
        batch_lang: Language-group key of the batch, or ``None`` when it is
            unknown.

    Returns:
        ``True`` when ``batch_lang`` is falsy, when the chunk has no path or
        no extension, when the extension is a config/markup one, when the
        extension is not in ``_EXT_TO_GROUP``, or when its group equals
        ``batch_lang``. ``False`` otherwise.
    """
    if not batch_lang:
        return True
    # `chunk.get("metadata", {})` would hand back None when the key exists
    # with a None value, and the following `.get` would then raise.
    meta = chunk.get("metadata") or {}
    if not isinstance(meta, dict):
        meta = {}
    path = meta.get("path") or chunk.get("file_path") or chunk.get("path", "")
    if not path:
        return True
    ext = os.path.splitext(path)[1].lower()
    if not ext:
        return True
    # Config/markup files always pass (xml, json, yaml, etc.)
    if ext in _LANG_NEUTRAL_EXTS:
        return True
    chunk_group = _EXT_TO_GROUP.get(ext)
    if not chunk_group:
        return True  # Unknown extension → allow
    return chunk_group == batch_lang
161+
102162
103163async def fetch_batch_rag_context (
104164 rag_client ,
@@ -131,6 +191,11 @@ async def fetch_batch_rag_context(
131191
132192 context = None
133193
194+ # Detect batch language ONCE — used by deterministic, semantic, and duplication filters
195+ batch_lang = _detect_batch_language (batch_file_paths )
196+ if batch_lang :
197+ logger .info (f"Batch language detected: { batch_lang } (from { batch_file_paths } )" )
198+
134199 # 1. Deterministic lookup FIRST — structural deps are highest-value context
135200 try :
136201 deterministic_response = await rag_client .get_deterministic_context (
@@ -158,7 +223,14 @@ async def fetch_batch_rag_context(
158223 max_per_def = 2 , max_file_level = 2 ,
159224 )
160225
226+ lang_filtered = 0
227+
161228 for chunk in scoped :
229+ # Cross-language filter: skip JS chunks for PHP reviews etc.
230+ if not _chunk_matches_language (chunk , batch_lang ):
231+ lang_filtered += 1
232+ continue
233+
162234 merged = dict (chunk )
163235 is_diff_rel = chunk .get ("_diff_relevant" , True )
164236 merged ["score" ] = 0.95 if is_diff_rel else 0.85
@@ -175,8 +247,16 @@ async def fetch_batch_rag_context(
175247 merged .setdefault ("path" , meta .get ("path" , "" ))
176248 context ["relevant_code" ].append (merged )
177249
250+ # Hard cap: deterministic should never dominate the budget
251+ if len (context ["relevant_code" ]) >= 5 :
252+ break
253+
254+ if lang_filtered :
255+ logger .info (f"Cross-language filter: excluded { lang_filtered } chunks "
256+ f"(batch_lang={ batch_lang } )" )
178257 logger .info (f"Deterministic RAG: { len (context ['relevant_code' ])} chunks "
179- f"(diff-scoped from { len (related_defs )} definitions)" )
258+ f"(diff-scoped from { len (related_defs )} definitions, "
259+ f"capped at 5)" )
180260 except Exception as det_err :
181261 logger .debug (f"Deterministic RAG lookup failed: { det_err } " )
182262
@@ -208,11 +288,18 @@ async def fetch_batch_rag_context(
208288 if context is None :
209289 context = {"relevant_code" : []}
210290 added = 0
291+ sem_lang_filtered = 0
211292 for chunk in sem_chunks :
212293 if added >= semantic_fill :
213294 break
295+ # Cross-language filter for semantic results too
296+ if not _chunk_matches_language (chunk , batch_lang ):
297+ sem_lang_filtered += 1
298+ continue
214299 context ["relevant_code" ].append (chunk )
215300 added += 1
301+ if sem_lang_filtered :
302+ logger .info (f"Semantic cross-language filter: excluded { sem_lang_filtered } chunks" )
216303 logger .info (f"Semantic RAG: added { added } /{ len (sem_chunks )} chunks" )
217304 else :
218305 logger .info (f"Deterministic yielded { det_count } chunks — semantic search skipped" )
@@ -268,6 +355,9 @@ async def fetch_batch_rag_context(
268355 continue
269356 if dup_path in seen_paths :
270357 continue
358+ # Cross-language filter for duplication results
359+ if not _chunk_matches_language (dup , batch_lang ):
360+ continue
271361 seen_paths .add (dup_path )
272362
273363 context ["relevant_code" ].append ({
@@ -491,22 +581,46 @@ def _scope_deterministic_to_diff(
491581 if not related_defs :
492582 return []
493583
584+ # Common language builtins / keywords that match definitions everywhere.
585+ # These are too generic to be useful for diff-scoping — they produce false
586+ # positives against unrelated files (especially minified JS bundles).
587+ _DIFF_TOKEN_STOPWORDS = {
588+ # Python builtins & keywords
589+ 'set' , 'get' , 'add' , 'pop' , 'map' , 'len' , 'str' , 'int' , 'dict' , 'list' ,
590+ 'type' , 'key' , 'val' , 'var' , 'def' , 'for' , 'and' , 'not' , 'try' , 'has' ,
591+ 'self' , 'none' , 'true' , 'false' , 'from' , 'import' , 'class' , 'return' ,
592+ 'None' , 'True' , 'False' , 'with' , 'async' , 'await' , 'pass' , 'else' ,
593+ 'elif' , 'while' , 'break' , 'raise' , 'yield' , 'super' , 'init' , 'call' ,
594+ 'item' , 'items' , 'keys' , 'values' , 'update' , 'append' , 'extend' ,
595+ 'print' , 'open' , 'close' , 'read' , 'write' , 'name' , 'path' , 'file' ,
596+ 'data' , 'info' , 'text' , 'code' , 'test' , 'main' , 'args' , 'that' ,
597+ 'this' , 'then' , 'else' , 'each' , 'some' , 'more' , 'than' ,
598+ # Java / JS common
599+ 'new' , 'null' , 'void' , 'byte' , 'char' , 'long' , 'enum' , 'case' ,
600+ 'size' , 'next' , 'done' , 'push' , 'pull' , 'send' , 'save' , 'load' ,
601+ 'toString' , 'valueOf' , 'equals' , 'apply' , 'bind' ,
602+ }
603+
494604 # ── Extract identifiers from diff (both added and removed lines) ──
605+ # Minimum 4 chars to exclude generic 3-letter tokens (set, get, add, etc.)
606+ _TOKEN_RE = re .compile (r'\b([A-Za-z_][A-Za-z0-9_]{3,})\b' )
495607 diff_tokens = set ()
496608
497609 # Primary: raw diff gives us changed lines with +/- prefixes
498610 if batch_raw_diffs :
499611 for raw_diff in batch_raw_diffs :
500612 for line in raw_diff .splitlines ():
501613 if line .startswith (('+' , '-' )) and not line .startswith (('+++' , '---' , '@@' )):
502- for token in re .findall (r'\b([A-Za-z_][A-Za-z0-9_]{2,})\b' , line ):
503- diff_tokens .add (token )
614+ for token in _TOKEN_RE .findall (line ):
615+ if token .lower () not in _DIFF_TOKEN_STOPWORDS and token not in _DIFF_TOKEN_STOPWORDS :
616+ diff_tokens .add (token )
504617
505618 # Supplement: pre-processed diff snippets (added lines only)
506619 if batch_diff_snippets :
507620 snippet_text = " " .join (batch_diff_snippets )
508- for token in re .findall (r'\b([A-Za-z_][A-Za-z0-9_]{2,})\b' , snippet_text ):
509- diff_tokens .add (token )
621+ for token in _TOKEN_RE .findall (snippet_text ):
622+ if token .lower () not in _DIFF_TOKEN_STOPWORDS and token not in _DIFF_TOKEN_STOPWORDS :
623+ diff_tokens .add (token )
510624
511625 # ── Classify each definition ──
512626 diff_relevant = []
@@ -551,12 +665,14 @@ def _scope_deterministic_to_diff(
551665
552666 kept_fl = min (len (file_level ), max_file_level )
553667 dropped_fl = len (file_level ) - kept_fl
668+ # Log a sample of the tokens (first 30) for debuggability
669+ sample_tokens = sorted (diff_tokens )[:30 ]
554670 logger .info (
555671 f"Diff-scoped deterministic: "
556672 f"{ len (diff_relevant )} diff-relevant, "
557673 f"{ kept_fl } file-level kept, "
558674 f"{ dropped_fl } file-level dropped, "
559- f"{ len (diff_tokens )} diff tokens"
675+ f"{ len (diff_tokens )} diff tokens (sample: { sample_tokens } ) "
560676 )
561677
562678 return result
0 commit comments