Skip to content

Commit 85fa0c1

Browse files
committed
feat: Implement cross-language filtering for batch processing and enhance metadata extraction logic
1 parent 3b95d0b commit 85fa0c1

2 files changed

Lines changed: 129 additions & 11 deletions

File tree

python-ecosystem/inference-orchestrator/service/review/orchestrator/stage_1_file_review.py

Lines changed: 122 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,66 @@ def create_smart_batches_wrapper(
9999

100100
# ── RAG Context ───────────────────────────────────────────────
101101

102+
# Language extension groups for cross-language filtering.
# Keys are short group identifiers; values are the lowercase, dot-prefixed
# file extensions that belong to that group.
_LANG_GROUPS = {
    'php': {'.php', '.phtml'},
    'java': {'.java'},
    'py': {'.py'},
    'js': {'.js', '.jsx', '.ts', '.tsx', '.vue', '.svelte'},
    'rb': {'.rb', '.erb'},
    'go': {'.go'},
    'cs': {'.cs'},
    'xml': {'.xml', '.xsd'},
}
# Reverse lookup: extension → group key.
# Built with a comprehension so that no loop temporaries are left behind in
# the module namespace.
_EXT_TO_GROUP: Dict[str, str] = {
    ext: group
    for group, exts in _LANG_GROUPS.items()
    for ext in exts
}
118+
119+
120+
def _detect_batch_language(
    file_paths: List[str],
    dominance_threshold: float = 0.7,
) -> Optional[str]:
    """Detect the dominant language group of a batch by file extensions.

    Each path's extension is lower-cased and looked up in ``_EXT_TO_GROUP``.
    Paths that are empty, have no extension, or have an extension outside
    every known group are ignored and do not count towards the total.

    Args:
        file_paths: Paths of the files in the batch. ``None`` or an empty
            list yields ``None``.
        dominance_threshold: Minimum share (0..1) of the *recognised* files
            that the most frequent group must reach to be reported.
            Defaults to 0.7.

    Returns:
        The group key (e.g. ``'php'``) of the dominant language, or ``None``
        when no file is recognised or no group reaches the threshold.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    if not file_paths:
        return None

    counts = Counter()
    for fp in file_paths:
        if not fp:
            # Tolerate None / "" entries instead of crashing in splitext.
            continue
        ext = os.path.splitext(fp)[1].lower()
        if not ext:
            continue
        group = _EXT_TO_GROUP.get(ext)
        if group:
            counts[group] += 1

    if not counts:
        return None

    # most_common(1) keeps the first-seen group on ties, like max() over the
    # dict did.
    top, top_count = counts.most_common(1)[0]
    # The ratio is over recognised files only; unknown extensions are not in
    # the denominator.
    if top_count / sum(counts.values()) >= dominance_threshold:
        return top
    return None
136+
137+
138+
# Config/markup extensions that are accepted for every batch language.
# '.xsd' is listed next to '.xml' so that all XML-family files pass, as the
# function's docstring promises.
_LANGUAGE_NEUTRAL_EXTS = frozenset({
    '.xml', '.xsd', '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg',
    '.properties', '.conf', '.env', '.md', '.txt', '.html',
    '.phtml', '.twig', '.hbs',
})


def _chunk_matches_language(chunk: Dict, batch_lang: Optional[str]) -> bool:
    """Return True if a chunk is compatible with the batch language group.

    The chunk's path is read from ``chunk["metadata"]["path"]``, then
    ``chunk["file_path"]``, then ``chunk["path"]``. The decision is
    permissive: a chunk is rejected only when its extension maps to a known
    language group that differs from ``batch_lang``.

    Args:
        chunk: A context chunk dict. A missing or null ``metadata`` entry is
            treated as empty.
        batch_lang: Group key of the batch (a key of ``_LANG_GROUPS``), or
            ``None`` when the batch language is unknown.

    Returns:
        ``True`` when ``batch_lang`` is falsy, when the chunk has no path or
        no extension, when the extension is a config/markup one, when the
        extension is not in any known group, or when the chunk's group equals
        ``batch_lang``. ``False`` otherwise.

    NOTE(review): 'xml' is itself a language group, so for an XML-dominant
    batch every chunk from another known language is rejected here. Confirm
    that this is the intended behaviour.
    """
    if not batch_lang:
        return True

    # `or {}` also covers a "metadata" key that is present but None.
    meta = chunk.get("metadata") or {}
    path = meta.get("path") or chunk.get("file_path") or chunk.get("path", "")
    if not path:
        return True

    ext = os.path.splitext(path)[1].lower()
    if not ext:
        return True

    # Config/markup files always pass (xml, json, yaml, etc.)
    if ext in _LANGUAGE_NEUTRAL_EXTS:
        return True

    chunk_group = _EXT_TO_GROUP.get(ext)
    if not chunk_group:
        return True  # Unknown extension → allow
    return chunk_group == batch_lang
161+
102162

103163
async def fetch_batch_rag_context(
104164
rag_client,
@@ -131,6 +191,11 @@ async def fetch_batch_rag_context(
131191

132192
context = None
133193

194+
# Detect batch language ONCE — used by deterministic, semantic, and duplication filters
195+
batch_lang = _detect_batch_language(batch_file_paths)
196+
if batch_lang:
197+
logger.info(f"Batch language detected: {batch_lang} (from {batch_file_paths})")
198+
134199
# 1. Deterministic lookup FIRST — structural deps are highest-value context
135200
try:
136201
deterministic_response = await rag_client.get_deterministic_context(
@@ -158,7 +223,14 @@ async def fetch_batch_rag_context(
158223
max_per_def=2, max_file_level=2,
159224
)
160225

226+
lang_filtered = 0
227+
161228
for chunk in scoped:
229+
# Cross-language filter: skip JS chunks for PHP reviews etc.
230+
if not _chunk_matches_language(chunk, batch_lang):
231+
lang_filtered += 1
232+
continue
233+
162234
merged = dict(chunk)
163235
is_diff_rel = chunk.get("_diff_relevant", True)
164236
merged["score"] = 0.95 if is_diff_rel else 0.85
@@ -175,8 +247,16 @@ async def fetch_batch_rag_context(
175247
merged.setdefault("path", meta.get("path", ""))
176248
context["relevant_code"].append(merged)
177249

250+
# Hard cap: deterministic should never dominate the budget
251+
if len(context["relevant_code"]) >= 5:
252+
break
253+
254+
if lang_filtered:
255+
logger.info(f"Cross-language filter: excluded {lang_filtered} chunks "
256+
f"(batch_lang={batch_lang})")
178257
logger.info(f"Deterministic RAG: {len(context['relevant_code'])} chunks "
179-
f"(diff-scoped from {len(related_defs)} definitions)")
258+
f"(diff-scoped from {len(related_defs)} definitions, "
259+
f"capped at 5)")
180260
except Exception as det_err:
181261
logger.debug(f"Deterministic RAG lookup failed: {det_err}")
182262

@@ -208,11 +288,18 @@ async def fetch_batch_rag_context(
208288
if context is None:
209289
context = {"relevant_code": []}
210290
added = 0
291+
sem_lang_filtered = 0
211292
for chunk in sem_chunks:
212293
if added >= semantic_fill:
213294
break
295+
# Cross-language filter for semantic results too
296+
if not _chunk_matches_language(chunk, batch_lang):
297+
sem_lang_filtered += 1
298+
continue
214299
context["relevant_code"].append(chunk)
215300
added += 1
301+
if sem_lang_filtered:
302+
logger.info(f"Semantic cross-language filter: excluded {sem_lang_filtered} chunks")
216303
logger.info(f"Semantic RAG: added {added}/{len(sem_chunks)} chunks")
217304
else:
218305
logger.info(f"Deterministic yielded {det_count} chunks — semantic search skipped")
@@ -268,6 +355,9 @@ async def fetch_batch_rag_context(
268355
continue
269356
if dup_path in seen_paths:
270357
continue
358+
# Cross-language filter for duplication results
359+
if not _chunk_matches_language(dup, batch_lang):
360+
continue
271361
seen_paths.add(dup_path)
272362

273363
context["relevant_code"].append({
@@ -491,22 +581,46 @@ def _scope_deterministic_to_diff(
491581
if not related_defs:
492582
return []
493583

584+
# Common language builtins / keywords that match definitions everywhere.
585+
# These are too generic to be useful for diff-scoping — they produce false
586+
# positives against unrelated files (especially minified JS bundles).
587+
_DIFF_TOKEN_STOPWORDS = {
588+
# Python builtins & keywords
589+
'set', 'get', 'add', 'pop', 'map', 'len', 'str', 'int', 'dict', 'list',
590+
'type', 'key', 'val', 'var', 'def', 'for', 'and', 'not', 'try', 'has',
591+
'self', 'none', 'true', 'false', 'from', 'import', 'class', 'return',
592+
'None', 'True', 'False', 'with', 'async', 'await', 'pass', 'else',
593+
'elif', 'while', 'break', 'raise', 'yield', 'super', 'init', 'call',
594+
'item', 'items', 'keys', 'values', 'update', 'append', 'extend',
595+
'print', 'open', 'close', 'read', 'write', 'name', 'path', 'file',
596+
'data', 'info', 'text', 'code', 'test', 'main', 'args', 'that',
597+
'this', 'then', 'else', 'each', 'some', 'more', 'than',
598+
# Java / JS common
599+
'new', 'null', 'void', 'byte', 'char', 'long', 'enum', 'case',
600+
'size', 'next', 'done', 'push', 'pull', 'send', 'save', 'load',
601+
'toString', 'valueOf', 'equals', 'apply', 'bind',
602+
}
603+
494604
# ── Extract identifiers from diff (both added and removed lines) ──
605+
# Minimum 4 chars to exclude generic 3-letter tokens (set, get, add, etc.)
606+
_TOKEN_RE = re.compile(r'\b([A-Za-z_][A-Za-z0-9_]{3,})\b')
495607
diff_tokens = set()
496608

497609
# Primary: raw diff gives us changed lines with +/- prefixes
498610
if batch_raw_diffs:
499611
for raw_diff in batch_raw_diffs:
500612
for line in raw_diff.splitlines():
501613
if line.startswith(('+', '-')) and not line.startswith(('+++', '---', '@@')):
502-
for token in re.findall(r'\b([A-Za-z_][A-Za-z0-9_]{2,})\b', line):
503-
diff_tokens.add(token)
614+
for token in _TOKEN_RE.findall(line):
615+
if token.lower() not in _DIFF_TOKEN_STOPWORDS and token not in _DIFF_TOKEN_STOPWORDS:
616+
diff_tokens.add(token)
504617

505618
# Supplement: pre-processed diff snippets (added lines only)
506619
if batch_diff_snippets:
507620
snippet_text = " ".join(batch_diff_snippets)
508-
for token in re.findall(r'\b([A-Za-z_][A-Za-z0-9_]{2,})\b', snippet_text):
509-
diff_tokens.add(token)
621+
for token in _TOKEN_RE.findall(snippet_text):
622+
if token.lower() not in _DIFF_TOKEN_STOPWORDS and token not in _DIFF_TOKEN_STOPWORDS:
623+
diff_tokens.add(token)
510624

511625
# ── Classify each definition ──
512626
diff_relevant = []
@@ -551,12 +665,14 @@ def _scope_deterministic_to_diff(
551665

552666
kept_fl = min(len(file_level), max_file_level)
553667
dropped_fl = len(file_level) - kept_fl
668+
# Log a sample of the tokens (first 30) for debuggability
669+
sample_tokens = sorted(diff_tokens)[:30]
554670
logger.info(
555671
f"Diff-scoped deterministic: "
556672
f"{len(diff_relevant)} diff-relevant, "
557673
f"{kept_fl} file-level kept, "
558674
f"{dropped_fl} file-level dropped, "
559-
f"{len(diff_tokens)} diff tokens"
675+
f"{len(diff_tokens)} diff tokens (sample: {sample_tokens})"
560676
)
561677

562678
return result

python-ecosystem/rag-pipeline/src/rag_pipeline/services/deterministic_context.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -334,11 +334,13 @@ def _query_changed_file(
334334
all_chunks.append(chunk)
335335
changed_file_paths.add(payload.get("path", ""))
336336

337-
# Extract ALL tree-sitter metadata for step 2-4
338-
if isinstance(payload.get("semantic_names"), list):
339-
identifiers_to_find.update(payload["semantic_names"])
340-
if payload.get("primary_name"):
341-
identifiers_to_find.add(payload["primary_name"])
337+
# Extract tree-sitter metadata for step 2-4
338+
# NOTE: We deliberately do NOT add semantic_names or primary_name
339+
# to identifiers_to_find. Those are the file's OWN definitions
340+
# (e.g., __construct, getAliases, apply, _toHtml) and looking
341+
# them up via primary_name MatchAny finds hundreds of unrelated
342+
# files with the same boilerplate method names. Actual external
343+
# dependencies come from imports, extends, and enrichment.
342344
if payload.get("parent_class"):
343345
parent_classes.add(payload["parent_class"])
344346
if payload.get("namespace"):

0 commit comments

Comments
 (0)