diff --git a/scripts/code_hallucination/config.py b/scripts/code_hallucination/config.py index 7f71f99..85bfed4 100644 --- a/scripts/code_hallucination/config.py +++ b/scripts/code_hallucination/config.py @@ -36,6 +36,7 @@ HALLUCINATION_RATIO = 0.4 # 40% hallucinated, 60% clean MAX_FILE_CHARS = 12000 # Cap individual source file size MAX_CONTEXT7_CHARS = 4000 # Documentation fetch limit +MAX_PROMPT_CHARS = 24000 # ~6K tokens, leaves room for answer within 8K model context # === LLM Config === RETRY_DELAY = 2 diff --git a/scripts/code_hallucination/context7_docs.py b/scripts/code_hallucination/context7_docs.py index b287015..ef60850 100644 --- a/scripts/code_hallucination/context7_docs.py +++ b/scripts/code_hallucination/context7_docs.py @@ -228,7 +228,7 @@ def run(instances: list[dict]): if processed % 100 == 0: print( - f" Progress: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)" + f" Phase 4: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)" ) print( diff --git a/scripts/code_hallucination/format_builder.py b/scripts/code_hallucination/format_builder.py index 9d290f7..0f0a660 100644 --- a/scripts/code_hallucination/format_builder.py +++ b/scripts/code_hallucination/format_builder.py @@ -1,15 +1,21 @@ -"""Phase 5: Assign answer format to each instance.""" +"""Phase 5: Assign answer format to each instance. +Supports both sequential (remote API) and async batch (local vLLM) modes. +Set BATCH_SIZE>1 env var for parallel requests to local vLLM. +""" + +import asyncio import json import random import textwrap import time -from openai import OpenAI +from openai import AsyncOpenAI, OpenAI from .config import ( API_BASE_URL, API_KEY, + BATCH_SIZE, FORMAT_TYPES, FORMAT_WEIGHTS, FORMATS_PATH, @@ -26,27 +32,24 @@ that a developer would receive from an AI assistant. Your response MUST: - - Start with a brief explanation (1-3 sentences) of what the issue is and how to fix it + - Start with 1-2 sentences explaining what was wrong and how to fix it - Include the code in a properly formatted code block (```python) - - Optionally end with a short note about what changed or why + - Do NOT add anything after the code block Your response must NOT: - - Include phrases like "Here's the fix" or "I'll help you with that" — just explain directly - - Be longer than necessary — keep it concise + - Include phrases like "Here's the fix" or "I'll help you with that" + - Be longer than 2 sentences of explanation + the code block - Change the code in any way — use it exactly as provided - Add any imports or code not in the original - Example style: - The issue is that `process_data` uses `dict.items()` instead of iterating - over the sorted keys, which causes non-deterministic output. + Example: + The `process_data` function uses `dict.items()` instead of iterating over sorted keys, causing non-deterministic output. ```python def process_data(data): for key in sorted(data.keys()): yield key, data[key] ``` - - This ensures consistent ordering regardless of insertion order. """) @@ -75,7 +78,7 @@ def _generate_explanation( {"role": "user", "content": user_msg}, ], temperature=LLM_TEMPERATURE, - max_tokens=2000, + max_tokens=200, ) result = response.choices[0].message.content.strip() # Verify the code is actually in the response @@ -94,6 +97,47 @@ def _generate_explanation( return None +async def _generate_explanation_async( + aclient: AsyncOpenAI, model: str, code: str, query: str, context: str +) -> str | None: + """Async version of _generate_explanation for batch processing.""" + user_msg = f"""User's question: {query} + +Context (relevant source code): +{context[:3000]} + +Correct code fix: +```python +{code} +``` + +Write a natural AI assistant response that includes this exact code.""" + + for attempt in range(MAX_RETRIES): + try: + response = await aclient.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": EXPLANATION_SYSTEM_PROMPT}, + {"role": "user", "content": user_msg}, + ], + temperature=LLM_TEMPERATURE, + max_tokens=200, + ) + result = response.choices[0].message.content.strip() + if code[:50] in result or "```" in result: + return result + if attempt < MAX_RETRIES - 1: + continue + return None + except Exception: + if attempt < MAX_RETRIES - 1: + await asyncio.sleep(RETRY_DELAY * (attempt + 1)) + else: + return None + return None + + def assign_format(source_data: dict) -> tuple[str, str]: """Assign a format type and build the answer for an instance. @@ -169,7 +213,8 @@ def run( ): """Run Phase 5: Assign formats and build answers. - Returns list of dicts with instance_id, format_type, answer. + Uses async batch processing when BATCH_SIZE > 1 (for local vLLM). + Falls back to sequential processing for remote APIs (BATCH_SIZE=1). """ print("=" * 60) print("Phase 5: Answer Format Building") @@ -180,70 +225,187 @@ def run( if queries is None: queries = {} - # Only init LLM client if we'll need it (lazy) - client = None - - results = [] - format_counts = {fmt: 0 for fmt in FORMAT_TYPES} - skipped = 0 - explanation_failures = 0 - - for inst in instances: + # Load existing for resumability + existing = {} + if FORMATS_PATH.exists(): + with open(FORMATS_PATH) as f: + for line in f: + try: + entry = json.loads(line) + existing[entry["instance_id"]] = entry + except (json.JSONDecodeError, KeyError): + continue + print(f"Already processed: {len(existing)} formats") + + to_process = [inst for inst in instances if inst["instance_id"] not in existing] + print(f"Remaining: {len(to_process)} instances to process") + print(f"Batch size: {BATCH_SIZE}") + + # First pass: assign formats for all instances (no LLM needed) + # Collect which ones need explanation generation + needs_explanation = [] # (instance_id, code, query, context) + entries_no_llm = [] # entries that don't need LLM + + for inst in to_process: instance_id = inst["instance_id"] - # Load source data from cache cache_path = source_cache_dir / f"{instance_id}.json" if not cache_path.exists(): - skipped += 1 continue - with open(cache_path) as f: - source_data = json.load(f) + with open(cache_path) as fp: + source_data = json.load(fp) fmt, answer = assign_format(source_data) if fmt is None: - skipped += 1 continue - # Generate explanation wrapper for code_with_explanation format if fmt == "code_with_explanation": - if client is None: - client = OpenAI(api_key=api_key, base_url=base_url) - print(f" LLM client initialized for code_with_explanation ({base_url})") - query = queries.get(instance_id, inst.get("problem_statement", "")[:500]) context = source_data.get("patch_code", "") - explained = _generate_explanation(client, model, answer, query, context) + needs_explanation.append((instance_id, answer, query, context, fmt)) + else: + entries_no_llm.append( + { + "instance_id": instance_id, + "format_type": fmt, + "answer": answer, + } + ) + + # Write non-LLM entries immediately + results = list(existing.values()) + format_counts = {fmt: 0 for fmt in FORMAT_TYPES} + for entry in results: + fmt = entry.get("format_type") + if fmt in format_counts: + format_counts[fmt] += 1 + + processed = 0 + explanation_failures = 0 + + with open(FORMATS_PATH, "a") as f: + for entry in entries_no_llm: + f.write(json.dumps(entry) + "\n") + results.append(entry) + format_counts[entry["format_type"]] += 1 + processed += 1 + f.flush() + + print(f" Assigned {len(entries_no_llm)} non-LLM formats") + print(f" Need LLM explanation: {len(needs_explanation)} instances") + + # Second pass: generate explanations (batched or sequential) + if needs_explanation: + if BATCH_SIZE > 1: + explanation_failures = _run_explanations_batched( + needs_explanation, format_counts, results, api_key, base_url, model + ) + else: + explanation_failures = _run_explanations_sequential( + needs_explanation, format_counts, results, api_key, base_url, model + ) + + processed += len(needs_explanation) + + print(f"\nAssigned formats for {len(results)} instances") + if explanation_failures: + print(f" Explanation generation failures (fell back to fragment): {explanation_failures}") + for fmt, count in format_counts.items(): + pct = count * 100 // max(len(results), 1) + print(f" {fmt}: {count} ({pct}%)") + + return results + + +def _run_explanations_sequential( + needs_explanation, format_counts, results, api_key, base_url, model +): + """Generate explanations sequentially (for remote APIs).""" + client = OpenAI(api_key=api_key, base_url=base_url) + explanation_failures = 0 + processed = 0 + + with open(FORMATS_PATH, "a") as f: + for instance_id, code, query, context, _ in needs_explanation: + explained = _generate_explanation(client, model, code, query, context) if explained is None: - # Fallback: use raw code as fragment fmt = "fragment" + answer = code explanation_failures += 1 else: + fmt = "code_with_explanation" answer = explained - results.append( - { + entry = { "instance_id": instance_id, "format_type": fmt, "answer": answer, } - ) - format_counts[fmt] += 1 - - # Save - with open(FORMATS_PATH, "w") as f: - for entry in results: f.write(json.dumps(entry) + "\n") + f.flush() + results.append(entry) + format_counts[fmt] += 1 + processed += 1 - print(f"\nAssigned formats for {len(results)} instances (skipped {skipped})") - if explanation_failures: - print(f" Explanation generation failures (fell back to fragment): {explanation_failures}") - for fmt, count in format_counts.items(): - pct = count * 100 // max(len(results), 1) - print(f" {fmt}: {count} ({pct}%)") + if processed % 100 == 0: + print( + f" Phase 5 (explanations): {processed}/{len(needs_explanation)} " + f"(failures: {explanation_failures})" + ) - return results + return explanation_failures + + +def _run_explanations_batched(needs_explanation, format_counts, results, api_key, base_url, model): + """Generate explanations with async batching (for local vLLM).""" + aclient = AsyncOpenAI(api_key=api_key, base_url=base_url) + explanation_failures = 0 + processed = 0 + + async def process_batches(): + nonlocal explanation_failures, processed + + with open(FORMATS_PATH, "a") as f: + for batch_start in range(0, len(needs_explanation), BATCH_SIZE): + batch = needs_explanation[batch_start : batch_start + BATCH_SIZE] + + tasks = [] + for instance_id, code, query, context, _ in batch: + tasks.append(_generate_explanation_async(aclient, model, code, query, context)) + + batch_results = await asyncio.gather(*tasks, return_exceptions=True) + + for (instance_id, code, query, context, _), explained in zip(batch, batch_results): + if isinstance(explained, Exception) or explained is None: + fmt = "fragment" + answer = code + explanation_failures += 1 + else: + fmt = "code_with_explanation" + answer = explained + + entry = { + "instance_id": instance_id, + "format_type": fmt, + "answer": answer, + } + f.write(json.dumps(entry) + "\n") + results.append(entry) + format_counts[fmt] += 1 + processed += 1 + + f.flush() + + if processed % 100 == 0 or batch_start + BATCH_SIZE >= len(needs_explanation): + print( + f" Phase 5 (explanations): {processed}/{len(needs_explanation)} " + f"(failures: {explanation_failures})" + ) + + asyncio.run(process_batches()) + return explanation_failures if __name__ == "__main__": diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py index 941315c..46d676d 100644 --- a/scripts/code_hallucination/hallucination_injector.py +++ b/scripts/code_hallucination/hallucination_injector.py @@ -19,6 +19,7 @@ HALLUCINATED_PATH, HALLUCINATION_TEMPERATURE, HALLUCINATION_TYPES, + MAX_PROMPT_CHARS, MAX_RETRIES, MODEL, RETRY_DELAY, @@ -28,24 +29,30 @@ You are a code hallucination injector for building a hallucination detection dataset. Given a correct answer (which may be pure code OR code with natural language explanation) - and context, create a hallucinated version with specific types of errors. + and SOURCE CODE CONTEXT, create a hallucinated version with specific types of errors. + + CRITICAL: Every error you inject MUST BE DETECTABLE by comparing the answer against + the provided source code context AND/OR the user's request. A human reading the + source files and user query must be able to spot that the hallucinated part + contradicts what's in the source or what the user asked for. Do NOT inject errors + that require running the code or external knowledge to detect. Hallucination types: - - STRUCTURAL: Change a function call, import, or parameter to something that - doesn't exist or is wrong. Code should still parse but reference non-existent - APIs, wrong methods, or invented parameters. - - BEHAVIORAL: Use correct APIs but with wrong values or logic. Wrong defaults, - off-by-one errors, swapped conditions, wrong argument values. - - SEMANTIC: Code that looks like it addresses the user's request but does - something subtly different or opposite. The code parses, uses real APIs, - but fails to do what was asked. If library documentation is provided, - you can make the code contradict the documented API (wrong parameter names, - wrong return types, deprecated usage, etc.). - For answers with explanations, you may also make the explanation contradict - the code or describe incorrect behavior. + - STRUCTURAL: Change a function/method/class name, import, or parameter to something + that does NOT exist in the source context. For example, rename a method call to one + that isn't defined in the provided source files, or add a parameter that the function + doesn't accept according to the source. + - BEHAVIORAL: Use correct API names from the source but with wrong values or logic + that contradicts the source. Wrong default values (different from source), swapped + conditions, wrong argument order compared to the function signature in source. + - SEMANTIC: Code that contradicts the source's behavior, the user's request, or the + explanation contradicts what the source code actually does. For example: claim a + function returns X when the source shows it returns Y, describe wrong control flow, + or solve a different problem than what the user asked for. Rules: - Make 2-3 DISTINCT changes spread across different parts of the answer + - Each change MUST contradict something visible in the source code or user request - Each changed span must be 20-150 characters long (not too short, not too long) - Total hallucinated text must be LESS THAN 40% of the original answer length - Keep most of the answer CORRECT — do NOT rewrite the entire thing @@ -53,11 +60,7 @@ - Make changes PLAUSIBLE — something an LLM would realistically generate - Changes must be SUBTLE, not obviously broken - The code in the hallucinated answer must still be syntactically valid - - Do NOT add comments explaining or hinting at the hallucination (no "# wrong", - "# error", "# typo", "# nonexistent", etc.) — the errors must be invisible - to someone skimming the answer - - If the answer contains both code and explanation, inject errors in BOTH parts - (e.g. wrong API in code + misleading description in text) + - Do NOT add comments explaining or hinting at the hallucination - Preserve the overall structure: keep markdown formatting, code blocks, etc. Respond in this exact JSON format (no markdown, no code blocks): @@ -67,7 +70,7 @@ { "original": "exact original text that was changed", "hallucinated": "what you changed it to", - "explanation": "why this is a hallucination" + "explanation": "why this is wrong — what does the source code or user request actually say?" } ] } @@ -77,10 +80,25 @@ - "original" must be an exact substring of the correct answer - "hallucinated" must be an exact substring of your hallucinated answer - Each "hallucinated" value must be at least 20 characters long + - Each "explanation" must reference what the source code or user request actually says - Return ONLY valid JSON, nothing else """) +def build_source_context(source_data: dict) -> str: + """Build source code context string from cached source data. + + Truncates to MAX_PROMPT_CHARS so the final sample fits in 8K model context. + """ + parts = [] + for filepath, content in source_data.get("source_files", {}).items(): + parts.append(f"File: {filepath}\n```python\n{content}\n```") + context = "\n\n".join(parts) + if len(context) > MAX_PROMPT_CHARS: + context = context[:MAX_PROMPT_CHARS] + return context + + def inject_hallucination( client: OpenAI, model: str, @@ -318,6 +336,7 @@ def run( formats: dict[str, dict], queries: dict[str, str], docs: dict[str, dict] | None = None, + source_cache: dict[str, dict] | None = None, api_key: str = API_KEY, base_url: str = API_BASE_URL, model: str = MODEL, @@ -333,6 +352,8 @@ def run( if docs is None: docs = {} + if source_cache is None: + source_cache = {} HALLUCINATED_PATH.parent.mkdir(parents=True, exist_ok=True) @@ -350,9 +371,13 @@ def run( print(f"Remaining: {len(to_process)} instances to inject") if BATCH_SIZE > 1: - results = _run_batched(to_process, formats, queries, docs, api_key, base_url, model) + results = _run_batched( + to_process, formats, queries, docs, source_cache, api_key, base_url, model + ) else: - results = _run_sequential(to_process, formats, queries, docs, api_key, base_url, model) + results = _run_sequential( + to_process, formats, queries, docs, source_cache, api_key, base_url, model + ) # Stats type_counts = {} @@ -372,7 +397,7 @@ def run( return results -def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model): +def _run_sequential(to_process, formats, queries, docs, source_cache, api_key, base_url, model): """Sequential processing for remote APIs (rate-limited).""" client = OpenAI(api_key=api_key, base_url=base_url) processed = 0 @@ -391,7 +416,12 @@ def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model hall_type = HALLUCINATION_TYPES[i % len(HALLUCINATION_TYPES)] query = queries.get(instance_id, "") - context = inst.get("problem_statement", "") + source_data = source_cache.get(instance_id, {}) + context = ( + build_source_context(source_data) + if source_data + else inst.get("problem_statement", "") + ) instance_docs = docs.get(instance_id, {}) # Try injection with up to 2 quality retries @@ -421,14 +451,14 @@ def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model results.append(entry) processed += 1 - if processed % 50 == 0: - print(f" Progress: {processed}/{len(to_process)} (failed: {failed})") + if processed % 100 == 0: + print(f" Phase 6: {processed}/{len(to_process)} (failed: {failed})") print(f"\nDone: {processed} injected, {failed} failed ({no_spans} had no matchable spans)") return results -def _run_batched(to_process, formats, queries, docs, api_key, base_url, model): +def _run_batched(to_process, formats, queries, docs, source_cache, api_key, base_url, model): """Async batch processing for local vLLM (no rate limiting needed).""" aclient = AsyncOpenAI(api_key=api_key, base_url=base_url) processed = 0 @@ -457,7 +487,12 @@ async def process_batches(): hall_type = HALLUCINATION_TYPES[global_idx % len(HALLUCINATION_TYPES)] query = queries.get(instance_id, "") - context = inst.get("problem_statement", "") + source_data = source_cache.get(instance_id, {}) + context = ( + build_source_context(source_data) + if source_data + else inst.get("problem_statement", "") + ) instance_docs = docs.get(instance_id, {}) tasks.append( @@ -497,11 +532,9 @@ async def process_batches(): results.append(entry) processed += 1 - if processed % 50 == 0 or batch_start + BATCH_SIZE >= len(to_process): + if processed % 100 == 0 or batch_start + BATCH_SIZE >= len(to_process): total = processed + failed - print( - f" Progress: {total}/{len(to_process)} ({processed} ok, {failed} failed)" - ) + print(f" Phase 6: {total}/{len(to_process)} ({processed} ok, {failed} failed)") asyncio.run(process_batches()) print(f"\nDone: {processed} injected, {failed} failed ({no_spans} had no matchable spans)") diff --git a/scripts/code_hallucination/pipeline.py b/scripts/code_hallucination/pipeline.py index ea7fc23..59d8d26 100644 --- a/scripts/code_hallucination/pipeline.py +++ b/scripts/code_hallucination/pipeline.py @@ -29,9 +29,21 @@ HALLUCINATED_PATH, MODEL, QUERIES_PATH, + SOURCE_CACHE_DIR, ) +def load_source_cache(instance_ids: list[str]) -> dict[str, dict]: + """Load source cache for given instance IDs.""" + cache = {} + for iid in instance_ids: + cache_path = SOURCE_CACHE_DIR / f"{iid}.json" + if cache_path.exists(): + with open(cache_path) as f: + cache[iid] = json.load(f) + return cache + + def load_jsonl_dict(path, key="instance_id", value_key=None) -> dict: """Load a JSONL file into a dict keyed by instance_id.""" result = {} @@ -110,8 +122,16 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m formats = load_jsonl_dict(FORMATS_PATH) docs = load_jsonl_dict(DOCS_PATH, value_key="docs") to_inject = [i for i in selected if i["instance_id"] in targets] + sc = load_source_cache([i["instance_id"] for i in to_inject]) run_inject( - to_inject, formats, queries_dict, docs=docs, api_key=api_key, base_url=base_url, model=model + to_inject, + formats, + queries_dict, + docs=docs, + source_cache=sc, + api_key=api_key, + base_url=base_url, + model=model, ) # Phase 7: Assemble @@ -210,11 +230,13 @@ def main(): docs = load_jsonl_dict(DOCS_PATH, value_key="docs") targets = select_hallucination_targets(instances) to_inject = [i for i in instances if i["instance_id"] in targets] + sc = load_source_cache([i["instance_id"] for i in to_inject]) run( to_inject, formats, queries, docs=docs, + source_cache=sc, api_key=args.api_key, base_url=args.base_url, model=args.model, diff --git a/scripts/code_hallucination/query_rewriter.py b/scripts/code_hallucination/query_rewriter.py index 966c70f..65f3a16 100644 --- a/scripts/code_hallucination/query_rewriter.py +++ b/scripts/code_hallucination/query_rewriter.py @@ -123,8 +123,8 @@ def run( f.flush() processed += 1 - if processed % 50 == 0: - print(f" Progress: {processed}/{len(to_process)} (failed: {failed})") + if processed % 100 == 0: + print(f" Phase 3: {processed}/{len(to_process)} (failed: {failed})") except Exception as e: print(f" ERROR {instance_id}: {e}") failed += 1 diff --git a/scripts/code_hallucination/sample_assembler.py b/scripts/code_hallucination/sample_assembler.py index bdcbd61..7bf6157 100644 --- a/scripts/code_hallucination/sample_assembler.py +++ b/scripts/code_hallucination/sample_assembler.py @@ -2,7 +2,7 @@ import json -from .config import DATASET_PATH, METADATA_PATH, SOURCE_CACHE_DIR +from .config import DATASET_PATH, MAX_PROMPT_CHARS, METADATA_PATH, SOURCE_CACHE_DIR def build_prompt( @@ -24,7 +24,10 @@ def build_prompt( parts.append(f"User request: {user_query}") - return "\n\n".join(parts) + prompt = "\n\n".join(parts) + if len(prompt) > MAX_PROMPT_CHARS: + prompt = prompt[:MAX_PROMPT_CHARS] + return prompt def assemble_samples( diff --git a/scripts/code_hallucination/source_fetcher.py b/scripts/code_hallucination/source_fetcher.py index 0fc25da..fa24c4e 100644 --- a/scripts/code_hallucination/source_fetcher.py +++ b/scripts/code_hallucination/source_fetcher.py @@ -5,6 +5,7 @@ import re import subprocess import tempfile +import warnings from pathlib import Path import requests @@ -43,7 +44,7 @@ def clone_repo(repo: str, repos_dir: Path = REPOS_DIR) -> Path | None: ["git", "clone", "--bare", f"https://github.com/{repo}.git", str(repo_dir)], capture_output=True, text=True, - timeout=1800, # 30 min for large repos + timeout=60, # 1 min timeout, fall back to GitHub API ) if result.returncode != 0: print(f" ERROR cloning {repo}: {result.stderr[:200]}") @@ -143,7 +144,9 @@ def extract_modified_functions(original_source: str, patched_source: str) -> lis def get_functions(source: str) -> dict[str, str]: """Parse source and extract function name -> source mapping.""" try: - tree = ast.parse(source) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", SyntaxWarning) + tree = ast.parse(source) except SyntaxError: return {} @@ -460,10 +463,7 @@ def fetch_source_for_instance( for filepath in changed_files: if filepath not in source_files: continue - if repo_dir is not None: - patched_source = apply_patch_and_get_file(repo_dir, commit, patch, filepath) - else: - patched_source = apply_patch_in_memory(source_files[filepath], patch, filepath) + patched_source = apply_patch_in_memory(source_files[filepath], patch, filepath) if patched_source: funcs = extract_modified_functions(source_files[filepath], patched_source) for func in funcs: @@ -492,25 +492,25 @@ def run(instances: list[dict], use_github_api: bool = False): print("Phase 2: Source File Fetching") print("=" * 60) + # Suppress SyntaxWarning from ast.parse on third-party source files + warnings.filterwarnings("ignore", category=SyntaxWarning) + SOURCE_CACHE_DIR.mkdir(parents=True, exist_ok=True) + REPOS_DIR.mkdir(parents=True, exist_ok=True) - if not use_github_api: - REPOS_DIR.mkdir(parents=True, exist_ok=True) - # Group by repo for efficient cloning - repos = set(inst["repo"] for inst in instances) - print(f"Need to clone {len(repos)} repos") - for repo in sorted(repos): - clone_repo(repo) - else: + if use_github_api: print("Using GitHub raw API (no cloning)") + # Track repos that failed to clone so we don't retry + clone_failed_repos: set[str] = set() + # Fetch sources per instance results = [] failed = 0 for i, instance in enumerate(instances): - if (i + 1) % 100 == 0: - print(f" Progress: {i + 1}/{len(instances)} ({len(results)} success, {failed} failed)") + if (i + 1) % 100 == 0 or (i + 1) == len(instances): + print(f" Phase 2: {i + 1}/{len(instances)} ({len(results)} success, {failed} failed)") # Skip if already cached cache_path = SOURCE_CACHE_DIR / f"{instance['instance_id']}.json" @@ -519,10 +519,21 @@ def run(instances: list[dict], use_github_api: bool = False): results.append(json.load(f)) continue - result = fetch_source_for_instance(instance, use_github_api=use_github_api) + # Try clone first, fall back to GitHub API + repo = instance["repo"] + use_api_for_this = use_github_api + if not use_api_for_this and repo not in clone_failed_repos: + repo_dir = clone_repo(repo) + if repo_dir is None: + clone_failed_repos.add(repo) + use_api_for_this = True + print(f" Falling back to GitHub API for {repo}") + + result = fetch_source_for_instance( + instance, use_github_api=use_api_for_this or repo in clone_failed_repos + ) if result: results.append(result) - # Cache result cache_path = SOURCE_CACHE_DIR / f"{instance['instance_id']}.json" with open(cache_path, "w") as f: json.dump(result, f) diff --git a/scripts/evaluate_code_hallucination.py b/scripts/evaluate_code_hallucination.py index 87a68fd..2df98c0 100644 --- a/scripts/evaluate_code_hallucination.py +++ b/scripts/evaluate_code_hallucination.py @@ -5,12 +5,13 @@ Supports Groq API with any OpenAI-compatible model. Usage: - # With Groq + Kimi + # With Groq OPENAI_API_KEY=gsk_... OPENAI_API_BASE=https://api.groq.com/openai/v1 \ python scripts/evaluate_code_hallucination.py \ --model moonshotai/kimi-k2-instruct-0905 \ --data_path data/code_hallucination_lettucedetect_v2.json \ - --evaluation_type example_level + --evaluation_type example_level \ + --split test """ import argparse @@ -210,9 +211,12 @@ def main(): help="Limit number of test samples (for quick testing)", ) parser.add_argument( - "--test_ratio", type=float, default=0.3, help="Fraction of data to use as test set" + "--split", + type=str, + default="test", + choices=["train", "dev", "test"], + help="Which split to evaluate on (uses the split field from the dataset)", ) - parser.add_argument("--seed", type=int, default=42) args = parser.parse_args() @@ -243,14 +247,13 @@ def main(): ) ) - # Split into test set - import random - - random.seed(args.seed) - random.shuffle(samples) + # Filter to the requested split + test_samples = [s for s in samples if s.split == args.split] - test_size = int(len(samples) * args.test_ratio) - test_samples = samples[:test_size] + if not test_samples: + available_splits = set(s.split for s in samples) + print(f"No samples found for split '{args.split}'. Available splits: {available_splits}") + return if args.max_samples: test_samples = test_samples[: args.max_samples] @@ -260,7 +263,9 @@ def main(): print(f"Dataset: {data_path}") print(f"Total samples: {len(samples)}") - print(f"Test samples: {len(test_samples)} (positive: {n_positive}, negative: {n_negative})") + print( + f"Evaluating on '{args.split}' split: {len(test_samples)} samples (positive: {n_positive}, negative: {n_negative})" + ) print(f"Model: {args.model}") print(f"API base: {os.getenv('OPENAI_API_BASE', 'https://api.openai.com/v1')}")