diff --git a/scripts/code_hallucination/config.py b/scripts/code_hallucination/config.py
index 7f71f99..85bfed4 100644
--- a/scripts/code_hallucination/config.py
+++ b/scripts/code_hallucination/config.py
@@ -36,6 +36,7 @@
 HALLUCINATION_RATIO = 0.4  # 40% hallucinated, 60% clean
 MAX_FILE_CHARS = 12000  # Cap individual source file size
 MAX_CONTEXT7_CHARS = 4000  # Documentation fetch limit
+MAX_PROMPT_CHARS = 24000  # ~6K tokens, leaves room for answer within 8K model context
 
 # === LLM Config ===
 RETRY_DELAY = 2
diff --git a/scripts/code_hallucination/context7_docs.py b/scripts/code_hallucination/context7_docs.py
index b287015..ef60850 100644
--- a/scripts/code_hallucination/context7_docs.py
+++ b/scripts/code_hallucination/context7_docs.py
@@ -228,7 +228,7 @@ def run(instances: list[dict]):
 
             if processed % 100 == 0:
                 print(
-                    f"  Progress: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
+                    f"  Phase 4: {processed}/{len(to_process)} ({with_docs} with docs, {skipped_by_ratio} skipped)"
                 )
 
     print(
diff --git a/scripts/code_hallucination/format_builder.py b/scripts/code_hallucination/format_builder.py
index 9d290f7..0f0a660 100644
--- a/scripts/code_hallucination/format_builder.py
+++ b/scripts/code_hallucination/format_builder.py
@@ -1,15 +1,21 @@
-"""Phase 5: Assign answer format to each instance."""
+"""Phase 5: Assign answer format to each instance.
 
+Supports both sequential (remote API) and async batch (local vLLM) modes.
+Set BATCH_SIZE>1 env var for parallel requests to local vLLM.
+"""
+
+import asyncio
 import json
 import random
 import textwrap
 import time
 
-from openai import OpenAI
+from openai import AsyncOpenAI, OpenAI
 
 from .config import (
     API_BASE_URL,
     API_KEY,
+    BATCH_SIZE,
     FORMAT_TYPES,
     FORMAT_WEIGHTS,
     FORMATS_PATH,
@@ -26,27 +32,24 @@
     that a developer would receive from an AI assistant.
 
     Your response MUST:
-    - Start with a brief explanation (1-3 sentences) of what the issue is and how to fix it
+    - Start with 1-2 sentences explaining what was wrong and how to fix it
     - Include the code in a properly formatted code block (```python)
-    - Optionally end with a short note about what changed or why
+    - Do NOT add anything after the code block
 
     Your response must NOT:
-    - Include phrases like "Here's the fix" or "I'll help you with that" — just explain directly
-    - Be longer than necessary — keep it concise
+    - Include phrases like "Here's the fix" or "I'll help you with that"
+    - Be longer than 2 sentences of explanation + the code block
     - Change the code in any way — use it exactly as provided
     - Add any imports or code not in the original
 
-    Example style:
-    The issue is that `process_data` uses `dict.items()` instead of iterating
-    over the sorted keys, which causes non-deterministic output.
+    Example:
+    The `process_data` function uses `dict.items()` instead of iterating over sorted keys, causing non-deterministic output.
 
     ```python
     def process_data(data):
         for key in sorted(data.keys()):
             yield key, data[key]
     ```
-
-    This ensures consistent ordering regardless of insertion order.
 """)
 
 
@@ -75,7 +78,7 @@ def _generate_explanation(
                     {"role": "user", "content": user_msg},
                 ],
                 temperature=LLM_TEMPERATURE,
-                max_tokens=2000,
+                max_tokens=200,
             )
             result = response.choices[0].message.content.strip()
             # Verify the code is actually in the response
@@ -94,6 +97,47 @@ def _generate_explanation(
     return None
 
 
+async def _generate_explanation_async(
+    aclient: AsyncOpenAI, model: str, code: str, query: str, context: str
+) -> str | None:
+    """Async counterpart of _generate_explanation, used for batched requests.
+
+    Returns the model's reply, or None once MAX_RETRIES attempts are exhausted.
+    """
+    user_msg = f"""User's question: {query}
+
+Context (relevant source code):
+{context[:3000]}
+
+Correct code fix:
+```python
+{code}
+```
+
+Write a natural AI assistant response that includes this exact code."""
+
+    chat = [
+        {"role": "system", "content": EXPLANATION_SYSTEM_PROMPT},
+        {"role": "user", "content": user_msg},
+    ]
+    final_attempt = MAX_RETRIES - 1
+    for attempt in range(MAX_RETRIES):
+        try:
+            response = await aclient.chat.completions.create(
+                model=model, messages=chat, temperature=LLM_TEMPERATURE, max_tokens=200
+            )
+            reply = response.choices[0].message.content.strip()
+            # Accept when the reply holds the start of the code or any code fence.
+            if code[:50] in reply or "```" in reply:
+                return reply
+        except Exception:
+            # Any failure counts as a failed attempt; wait longer before each retry.
+            if attempt < final_attempt:
+                await asyncio.sleep(RETRY_DELAY * (attempt + 1))
+    # Every attempt either failed or produced a reply without the code.
+    return None
+
+
 def assign_format(source_data: dict) -> tuple[str, str]:
     """Assign a format type and build the answer for an instance.
 
@@ -169,7 +213,8 @@ def run(
 ):
     """Run Phase 5: Assign formats and build answers.
 
-    Returns list of dicts with instance_id, format_type, answer.
+    Uses async batch processing when BATCH_SIZE > 1 (for local vLLM).
+    Falls back to sequential processing for remote APIs (BATCH_SIZE=1).
     """
     print("=" * 60)
     print("Phase 5: Answer Format Building")
@@ -180,70 +225,187 @@ def run(
     if queries is None:
         queries = {}
 
-    # Only init LLM client if we'll need it (lazy)
-    client = None
-
-    results = []
-    format_counts = {fmt: 0 for fmt in FORMAT_TYPES}
-    skipped = 0
-    explanation_failures = 0
-
-    for inst in instances:
+    # Load existing for resumability
+    existing = {}
+    if FORMATS_PATH.exists():
+        with open(FORMATS_PATH) as f:
+            for line in f:
+                try:
+                    entry = json.loads(line)
+                    existing[entry["instance_id"]] = entry
+                except (json.JSONDecodeError, KeyError):
+                    continue
+    print(f"Already processed: {len(existing)} formats")
+
+    to_process = [inst for inst in instances if inst["instance_id"] not in existing]
+    print(f"Remaining: {len(to_process)} instances to process")
+    print(f"Batch size: {BATCH_SIZE}")
+
+    # First pass: assign formats for all instances (no LLM needed)
+    # Collect which ones need explanation generation
+    needs_explanation = []  # (instance_id, code, query, context, fmt)
+    entries_no_llm = []  # entries that don't need LLM
+
+    for inst in to_process:
         instance_id = inst["instance_id"]
 
-        # Load source data from cache
         cache_path = source_cache_dir / f"{instance_id}.json"
         if not cache_path.exists():
-            skipped += 1
             continue
 
-        with open(cache_path) as f:
-            source_data = json.load(f)
+        with open(cache_path) as fp:
+            source_data = json.load(fp)
 
         fmt, answer = assign_format(source_data)
         if fmt is None:
-            skipped += 1
             continue
 
-        # Generate explanation wrapper for code_with_explanation format
         if fmt == "code_with_explanation":
-            if client is None:
-                client = OpenAI(api_key=api_key, base_url=base_url)
-                print(f"  LLM client initialized for code_with_explanation ({base_url})")
-
             query = queries.get(instance_id, inst.get("problem_statement", "")[:500])
             context = source_data.get("patch_code", "")
-            explained = _generate_explanation(client, model, answer, query, context)
+            needs_explanation.append((instance_id, answer, query, context, fmt))
+        else:
+            entries_no_llm.append(
+                {
+                    "instance_id": instance_id,
+                    "format_type": fmt,
+                    "answer": answer,
+                }
+            )
+
+    # Seed results/counts from resumed entries, then write non-LLM entries immediately
+    results = list(existing.values())
+    format_counts = {fmt: 0 for fmt in FORMAT_TYPES}
+    for entry in results:
+        fmt = entry.get("format_type")
+        if fmt in format_counts:
+            format_counts[fmt] += 1
+
+    processed = 0
+    explanation_failures = 0
+
+    with open(FORMATS_PATH, "a") as f:
+        for entry in entries_no_llm:
+            f.write(json.dumps(entry) + "\n")
+            results.append(entry)
+            format_counts[entry["format_type"]] += 1
+            processed += 1
+        f.flush()
+
+    print(f"  Assigned {len(entries_no_llm)} non-LLM formats")
+    print(f"  Need LLM explanation: {len(needs_explanation)} instances")
+
+    # Second pass: generate explanations (batched or sequential)
+    if needs_explanation:
+        if BATCH_SIZE > 1:
+            explanation_failures = _run_explanations_batched(
+                needs_explanation, format_counts, results, api_key, base_url, model
+            )
+        else:
+            explanation_failures = _run_explanations_sequential(
+                needs_explanation, format_counts, results, api_key, base_url, model
+            )
+
+    processed += len(needs_explanation)
+
+    print(f"\nAssigned formats for {len(results)} instances")
+    if explanation_failures:
+        print(f"  Explanation generation failures (fell back to fragment): {explanation_failures}")
+    for fmt, count in format_counts.items():
+        pct = count * 100 // max(len(results), 1)
+        print(f"  {fmt}: {count} ({pct}%)")
+
+    return results
+
+
+def _run_explanations_sequential(
+    needs_explanation, format_counts, results, api_key, base_url, model
+):
+    """Generate explanations sequentially (for remote APIs)."""
+    client = OpenAI(api_key=api_key, base_url=base_url)
+    explanation_failures = 0
+    processed = 0
+
+    with open(FORMATS_PATH, "a") as f:
+        for instance_id, code, query, context, _ in needs_explanation:
+            explained = _generate_explanation(client, model, code, query, context)
 
             if explained is None:
-                # Fallback: use raw code as fragment
                 fmt = "fragment"
+                answer = code
                 explanation_failures += 1
             else:
+                fmt = "code_with_explanation"
                 answer = explained
 
-        results.append(
-            {
+            entry = {
                 "instance_id": instance_id,
                 "format_type": fmt,
                 "answer": answer,
             }
-        )
-        format_counts[fmt] += 1
-
-    # Save
-    with open(FORMATS_PATH, "w") as f:
-        for entry in results:
             f.write(json.dumps(entry) + "\n")
+            f.flush()
+            results.append(entry)
+            format_counts[fmt] += 1
+            processed += 1
 
-    print(f"\nAssigned formats for {len(results)} instances (skipped {skipped})")
-    if explanation_failures:
-        print(f"  Explanation generation failures (fell back to fragment): {explanation_failures}")
-    for fmt, count in format_counts.items():
-        pct = count * 100 // max(len(results), 1)
-        print(f"  {fmt}: {count} ({pct}%)")
+            if processed % 100 == 0:
+                print(
+                    f"  Phase 5 (explanations): {processed}/{len(needs_explanation)} "
+                    f"(failures: {explanation_failures})"
+                )
 
-    return results
+    return explanation_failures
+
+
+def _run_explanations_batched(needs_explanation, format_counts, results, api_key, base_url, model):
+    """Generate explanations in concurrent batches of BATCH_SIZE (for local vLLM).
+
+    Appends one JSON line per instance to FORMATS_PATH, extends `results` and
+    `format_counts` in place, and returns the number of fallbacks to "fragment".
+    """
+    aclient = AsyncOpenAI(api_key=api_key, base_url=base_url)
+    total = len(needs_explanation)
+
+    async def process_batches() -> int:
+        failures = 0
+        done = 0
+        with open(FORMATS_PATH, "a") as out:
+            for start in range(0, total, BATCH_SIZE):
+                chunk = needs_explanation[start : start + BATCH_SIZE]
+                # Fire the whole chunk at once; gather keeps results in chunk order.
+                replies = await asyncio.gather(
+                    *(
+                        _generate_explanation_async(aclient, model, code, query, context)
+                        for _, code, query, context, _ in chunk
+                    ),
+                    return_exceptions=True,
+                )
+
+                for item, reply in zip(chunk, replies):
+                    instance_id, code = item[0], item[1]
+                    # A raised exception or a None reply falls back to the bare code.
+                    failed = isinstance(reply, Exception) or reply is None
+                    entry = {
+                        "instance_id": instance_id,
+                        "format_type": "fragment" if failed else "code_with_explanation",
+                        "answer": code if failed else reply,
+                    }
+                    out.write(json.dumps(entry) + "\n")
+                    results.append(entry)
+                    format_counts[entry["format_type"]] += 1
+                    failures += failed
+                    done += 1
+
+                out.flush()
+                if done % 100 == 0 or start + BATCH_SIZE >= total:
+                    print(
+                        f"  Phase 5 (explanations): {done}/{total} "
+                        f"(failures: {failures})"
+                    )
+        return failures
+
+    return asyncio.run(process_batches())
 
 
 if __name__ == "__main__":
diff --git a/scripts/code_hallucination/hallucination_injector.py b/scripts/code_hallucination/hallucination_injector.py
index 941315c..46d676d 100644
--- a/scripts/code_hallucination/hallucination_injector.py
+++ b/scripts/code_hallucination/hallucination_injector.py
@@ -19,6 +19,7 @@
     HALLUCINATED_PATH,
     HALLUCINATION_TEMPERATURE,
     HALLUCINATION_TYPES,
+    MAX_PROMPT_CHARS,
     MAX_RETRIES,
     MODEL,
     RETRY_DELAY,
@@ -28,24 +29,30 @@
     You are a code hallucination injector for building a hallucination detection dataset.
 
     Given a correct answer (which may be pure code OR code with natural language explanation)
-    and context, create a hallucinated version with specific types of errors.
+    and SOURCE CODE CONTEXT, create a hallucinated version with specific types of errors.
+
+    CRITICAL: Every error you inject MUST BE DETECTABLE by comparing the answer against
+    the provided source code context AND/OR the user's request. A human reading the
+    source files and user query must be able to spot that the hallucinated part
+    contradicts what's in the source or what the user asked for. Do NOT inject errors
+    that require running the code or external knowledge to detect.
 
     Hallucination types:
-    - STRUCTURAL: Change a function call, import, or parameter to something that
-      doesn't exist or is wrong. Code should still parse but reference non-existent
-      APIs, wrong methods, or invented parameters.
-    - BEHAVIORAL: Use correct APIs but with wrong values or logic. Wrong defaults,
-      off-by-one errors, swapped conditions, wrong argument values.
-    - SEMANTIC: Code that looks like it addresses the user's request but does
-      something subtly different or opposite. The code parses, uses real APIs,
-      but fails to do what was asked. If library documentation is provided,
-      you can make the code contradict the documented API (wrong parameter names,
-      wrong return types, deprecated usage, etc.).
-      For answers with explanations, you may also make the explanation contradict
-      the code or describe incorrect behavior.
+    - STRUCTURAL: Change a function/method/class name, import, or parameter to something
+      that does NOT exist in the source context. For example, rename a method call to one
+      that isn't defined in the provided source files, or add a parameter that the function
+      doesn't accept according to the source.
+    - BEHAVIORAL: Use correct API names from the source but with wrong values or logic
+      that contradicts the source. Wrong default values (different from source), swapped
+      conditions, wrong argument order compared to the function signature in source.
+    - SEMANTIC: Code that contradicts the source's behavior, the user's request, or the
+      explanation contradicts what the source code actually does. For example: claim a
+      function returns X when the source shows it returns Y, describe wrong control flow,
+      or solve a different problem than what the user asked for.
 
     Rules:
     - Make 2-3 DISTINCT changes spread across different parts of the answer
+    - Each change MUST contradict something visible in the source code or user request
     - Each changed span must be 20-150 characters long (not too short, not too long)
     - Total hallucinated text must be LESS THAN 40% of the original answer length
     - Keep most of the answer CORRECT — do NOT rewrite the entire thing
@@ -53,11 +60,7 @@
     - Make changes PLAUSIBLE — something an LLM would realistically generate
     - Changes must be SUBTLE, not obviously broken
     - The code in the hallucinated answer must still be syntactically valid
-    - Do NOT add comments explaining or hinting at the hallucination (no "# wrong",
-      "# error", "# typo", "# nonexistent", etc.) — the errors must be invisible
-      to someone skimming the answer
-    - If the answer contains both code and explanation, inject errors in BOTH parts
-      (e.g. wrong API in code + misleading description in text)
+    - Do NOT add comments explaining or hinting at the hallucination
     - Preserve the overall structure: keep markdown formatting, code blocks, etc.
 
     Respond in this exact JSON format (no markdown, no code blocks):
@@ -67,7 +70,7 @@
             {
                 "original": "exact original text that was changed",
                 "hallucinated": "what you changed it to",
-                "explanation": "why this is a hallucination"
+                "explanation": "why this is wrong — what does the source code or user request actually say?"
             }
         ]
     }
@@ -77,10 +80,25 @@
     - "original" must be an exact substring of the correct answer
     - "hallucinated" must be an exact substring of your hallucinated answer
     - Each "hallucinated" value must be at least 20 characters long
+    - Each "explanation" must reference what the source code or user request actually says
     - Return ONLY valid JSON, nothing else
 """)
 
 
+def build_source_context(source_data: dict) -> str:
+    """Render the cached source files as a single context string.
+
+    Every entry of ``source_data["source_files"]`` becomes a ``File: <path>`` header
+    plus a fenced python block. The joined text is cut to MAX_PROMPT_CHARS characters
+    (so the final sample fits in 8K model context); the cut may land mid-file.
+    """
+    files = source_data.get("source_files", {})
+    blocks = [f"File: {path}\n```python\n{body}\n```" for path, body in files.items()]
+    context = "\n\n".join(blocks)
+    # Slicing leaves text that is already within the cap unchanged.
+    return context[:MAX_PROMPT_CHARS]
+
+
 def inject_hallucination(
     client: OpenAI,
     model: str,
@@ -318,6 +336,7 @@ def run(
     formats: dict[str, dict],
     queries: dict[str, str],
     docs: dict[str, dict] | None = None,
+    source_cache: dict[str, dict] | None = None,
     api_key: str = API_KEY,
     base_url: str = API_BASE_URL,
     model: str = MODEL,
@@ -333,6 +352,8 @@ def run(
 
     if docs is None:
         docs = {}
+    if source_cache is None:
+        source_cache = {}
 
     HALLUCINATED_PATH.parent.mkdir(parents=True, exist_ok=True)
 
@@ -350,9 +371,13 @@ def run(
     print(f"Remaining: {len(to_process)} instances to inject")
 
     if BATCH_SIZE > 1:
-        results = _run_batched(to_process, formats, queries, docs, api_key, base_url, model)
+        results = _run_batched(
+            to_process, formats, queries, docs, source_cache, api_key, base_url, model
+        )
     else:
-        results = _run_sequential(to_process, formats, queries, docs, api_key, base_url, model)
+        results = _run_sequential(
+            to_process, formats, queries, docs, source_cache, api_key, base_url, model
+        )
 
     # Stats
     type_counts = {}
@@ -372,7 +397,7 @@ def run(
     return results
 
 
-def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model):
+def _run_sequential(to_process, formats, queries, docs, source_cache, api_key, base_url, model):
     """Sequential processing for remote APIs (rate-limited)."""
     client = OpenAI(api_key=api_key, base_url=base_url)
     processed = 0
@@ -391,7 +416,12 @@ def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model
 
             hall_type = HALLUCINATION_TYPES[i % len(HALLUCINATION_TYPES)]
             query = queries.get(instance_id, "")
-            context = inst.get("problem_statement", "")
+            source_data = source_cache.get(instance_id, {})
+            context = (
+                build_source_context(source_data)
+                if source_data
+                else inst.get("problem_statement", "")
+            )
             instance_docs = docs.get(instance_id, {})
 
             # Try injection with up to 2 quality retries
@@ -421,14 +451,14 @@ def _run_sequential(to_process, formats, queries, docs, api_key, base_url, model
             results.append(entry)
             processed += 1
 
-            if processed % 50 == 0:
-                print(f"  Progress: {processed}/{len(to_process)} (failed: {failed})")
+            if processed % 100 == 0:
+                print(f"  Phase 6: {processed}/{len(to_process)} (failed: {failed})")
 
     print(f"\nDone: {processed} injected, {failed} failed ({no_spans} had no matchable spans)")
     return results
 
 
-def _run_batched(to_process, formats, queries, docs, api_key, base_url, model):
+def _run_batched(to_process, formats, queries, docs, source_cache, api_key, base_url, model):
     """Async batch processing for local vLLM (no rate limiting needed)."""
     aclient = AsyncOpenAI(api_key=api_key, base_url=base_url)
     processed = 0
@@ -457,7 +487,12 @@ async def process_batches():
 
                     hall_type = HALLUCINATION_TYPES[global_idx % len(HALLUCINATION_TYPES)]
                     query = queries.get(instance_id, "")
-                    context = inst.get("problem_statement", "")
+                    source_data = source_cache.get(instance_id, {})
+                    context = (
+                        build_source_context(source_data)
+                        if source_data
+                        else inst.get("problem_statement", "")
+                    )
                     instance_docs = docs.get(instance_id, {})
 
                     tasks.append(
@@ -497,11 +532,9 @@ async def process_batches():
                     results.append(entry)
                     processed += 1
 
-                if processed % 50 == 0 or batch_start + BATCH_SIZE >= len(to_process):
+                if processed % 100 == 0 or batch_start + BATCH_SIZE >= len(to_process):
                     total = processed + failed
-                    print(
-                        f"  Progress: {total}/{len(to_process)} ({processed} ok, {failed} failed)"
-                    )
+                    print(f"  Phase 6: {total}/{len(to_process)} ({processed} ok, {failed} failed)")
 
     asyncio.run(process_batches())
     print(f"\nDone: {processed} injected, {failed} failed ({no_spans} had no matchable spans)")
diff --git a/scripts/code_hallucination/pipeline.py b/scripts/code_hallucination/pipeline.py
index ea7fc23..59d8d26 100644
--- a/scripts/code_hallucination/pipeline.py
+++ b/scripts/code_hallucination/pipeline.py
@@ -29,9 +29,21 @@
     HALLUCINATED_PATH,
     MODEL,
     QUERIES_PATH,
+    SOURCE_CACHE_DIR,
 )
 
 
+def load_source_cache(instance_ids: list[str]) -> dict[str, dict]:
+    """Read the cached source JSON for each ID; IDs without a cache file are omitted."""
+    loaded: dict[str, dict] = {}
+    for instance_id in instance_ids:
+        path = SOURCE_CACHE_DIR / f"{instance_id}.json"
+        if not path.exists():
+            continue
+        loaded[instance_id] = json.loads(path.read_text())
+    return loaded
+
+
 def load_jsonl_dict(path, key="instance_id", value_key=None) -> dict:
     """Load a JSONL file into a dict keyed by instance_id."""
     result = {}
@@ -110,8 +122,16 @@ def run_test(n: int = 5, api_key: str = API_KEY, base_url: str = API_BASE_URL, m
     formats = load_jsonl_dict(FORMATS_PATH)
     docs = load_jsonl_dict(DOCS_PATH, value_key="docs")
     to_inject = [i for i in selected if i["instance_id"] in targets]
+    sc = load_source_cache([i["instance_id"] for i in to_inject])
     run_inject(
-        to_inject, formats, queries_dict, docs=docs, api_key=api_key, base_url=base_url, model=model
+        to_inject,
+        formats,
+        queries_dict,
+        docs=docs,
+        source_cache=sc,
+        api_key=api_key,
+        base_url=base_url,
+        model=model,
     )
 
     # Phase 7: Assemble
@@ -210,11 +230,13 @@ def main():
             docs = load_jsonl_dict(DOCS_PATH, value_key="docs")
             targets = select_hallucination_targets(instances)
             to_inject = [i for i in instances if i["instance_id"] in targets]
+            sc = load_source_cache([i["instance_id"] for i in to_inject])
             run(
                 to_inject,
                 formats,
                 queries,
                 docs=docs,
+                source_cache=sc,
                 api_key=args.api_key,
                 base_url=args.base_url,
                 model=args.model,
diff --git a/scripts/code_hallucination/query_rewriter.py b/scripts/code_hallucination/query_rewriter.py
index 966c70f..65f3a16 100644
--- a/scripts/code_hallucination/query_rewriter.py
+++ b/scripts/code_hallucination/query_rewriter.py
@@ -123,8 +123,8 @@ def run(
                 f.flush()
                 processed += 1
 
-                if processed % 50 == 0:
-                    print(f"  Progress: {processed}/{len(to_process)} (failed: {failed})")
+                if processed % 100 == 0:
+                    print(f"  Phase 3: {processed}/{len(to_process)} (failed: {failed})")
             except Exception as e:
                 print(f"  ERROR {instance_id}: {e}")
                 failed += 1
diff --git a/scripts/code_hallucination/sample_assembler.py b/scripts/code_hallucination/sample_assembler.py
index bdcbd61..7bf6157 100644
--- a/scripts/code_hallucination/sample_assembler.py
+++ b/scripts/code_hallucination/sample_assembler.py
@@ -2,7 +2,7 @@
 
 import json
 
-from .config import DATASET_PATH, METADATA_PATH, SOURCE_CACHE_DIR
+from .config import DATASET_PATH, MAX_PROMPT_CHARS, METADATA_PATH, SOURCE_CACHE_DIR
 
 
 def build_prompt(
@@ -24,7 +24,10 @@ def build_prompt(
 
     parts.append(f"User request: {user_query}")
 
-    return "\n\n".join(parts)
+    prompt = "\n\n".join(parts)
+    if len(prompt) > MAX_PROMPT_CHARS:
+        prompt = prompt[:MAX_PROMPT_CHARS]
+    return prompt
 
 
 def assemble_samples(
diff --git a/scripts/code_hallucination/source_fetcher.py b/scripts/code_hallucination/source_fetcher.py
index 0fc25da..fa24c4e 100644
--- a/scripts/code_hallucination/source_fetcher.py
+++ b/scripts/code_hallucination/source_fetcher.py
@@ -5,6 +5,7 @@
 import re
 import subprocess
 import tempfile
+import warnings
 from pathlib import Path
 
 import requests
@@ -43,7 +44,7 @@ def clone_repo(repo: str, repos_dir: Path = REPOS_DIR) -> Path | None:
             ["git", "clone", "--bare", f"https://github.com/{repo}.git", str(repo_dir)],
             capture_output=True,
             text=True,
-            timeout=1800,  # 30 min for large repos
+            timeout=60,  # 1 min timeout, fall back to GitHub API
         )
         if result.returncode != 0:
             print(f"  ERROR cloning {repo}: {result.stderr[:200]}")
@@ -143,7 +144,9 @@ def extract_modified_functions(original_source: str, patched_source: str) -> lis
     def get_functions(source: str) -> dict[str, str]:
         """Parse source and extract function name -> source mapping."""
         try:
-            tree = ast.parse(source)
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", SyntaxWarning)
+                tree = ast.parse(source)
         except SyntaxError:
             return {}
 
@@ -460,10 +463,7 @@ def fetch_source_for_instance(
     for filepath in changed_files:
         if filepath not in source_files:
             continue
-        if repo_dir is not None:
-            patched_source = apply_patch_and_get_file(repo_dir, commit, patch, filepath)
-        else:
-            patched_source = apply_patch_in_memory(source_files[filepath], patch, filepath)
+        patched_source = apply_patch_in_memory(source_files[filepath], patch, filepath)
         if patched_source:
             funcs = extract_modified_functions(source_files[filepath], patched_source)
             for func in funcs:
@@ -492,25 +492,25 @@ def run(instances: list[dict], use_github_api: bool = False):
     print("Phase 2: Source File Fetching")
     print("=" * 60)
 
+    # Suppress SyntaxWarning from ast.parse on third-party source files
+    warnings.filterwarnings("ignore", category=SyntaxWarning)
+
     SOURCE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    REPOS_DIR.mkdir(parents=True, exist_ok=True)
 
-    if not use_github_api:
-        REPOS_DIR.mkdir(parents=True, exist_ok=True)
-        # Group by repo for efficient cloning
-        repos = set(inst["repo"] for inst in instances)
-        print(f"Need to clone {len(repos)} repos")
-        for repo in sorted(repos):
-            clone_repo(repo)
-    else:
+    if use_github_api:
         print("Using GitHub raw API (no cloning)")
 
+    # Track repos that failed to clone so we don't retry
+    clone_failed_repos: set[str] = set()
+
     # Fetch sources per instance
     results = []
     failed = 0
 
     for i, instance in enumerate(instances):
-        if (i + 1) % 100 == 0:
-            print(f"  Progress: {i + 1}/{len(instances)} ({len(results)} success, {failed} failed)")
+        if (i + 1) % 100 == 0 or (i + 1) == len(instances):
+            print(f"  Phase 2: {i + 1}/{len(instances)} ({len(results)} success, {failed} failed)")
 
         # Skip if already cached
         cache_path = SOURCE_CACHE_DIR / f"{instance['instance_id']}.json"
@@ -519,10 +519,21 @@ def run(instances: list[dict], use_github_api: bool = False):
                 results.append(json.load(f))
             continue
 
-        result = fetch_source_for_instance(instance, use_github_api=use_github_api)
+        # Try clone first, fall back to GitHub API
+        repo = instance["repo"]
+        use_api_for_this = use_github_api
+        if not use_api_for_this and repo not in clone_failed_repos:
+            repo_dir = clone_repo(repo)
+            if repo_dir is None:
+                clone_failed_repos.add(repo)
+                use_api_for_this = True
+                print(f"  Falling back to GitHub API for {repo}")
+
+        result = fetch_source_for_instance(
+            instance, use_github_api=use_api_for_this or repo in clone_failed_repos
+        )
         if result:
             results.append(result)
-            # Cache result
             cache_path = SOURCE_CACHE_DIR / f"{instance['instance_id']}.json"
             with open(cache_path, "w") as f:
                 json.dump(result, f)
diff --git a/scripts/evaluate_code_hallucination.py b/scripts/evaluate_code_hallucination.py
index 87a68fd..2df98c0 100644
--- a/scripts/evaluate_code_hallucination.py
+++ b/scripts/evaluate_code_hallucination.py
@@ -5,12 +5,13 @@
 Supports Groq API with any OpenAI-compatible model.
 
 Usage:
-    # With Groq + Kimi
+    # With Groq
     OPENAI_API_KEY=gsk_... OPENAI_API_BASE=https://api.groq.com/openai/v1 \
         python scripts/evaluate_code_hallucination.py \
         --model moonshotai/kimi-k2-instruct-0905 \
         --data_path data/code_hallucination_lettucedetect_v2.json \
-        --evaluation_type example_level
+        --evaluation_type example_level \
+        --split test
 """
 
 import argparse
@@ -210,9 +211,12 @@ def main():
         help="Limit number of test samples (for quick testing)",
     )
     parser.add_argument(
-        "--test_ratio", type=float, default=0.3, help="Fraction of data to use as test set"
+        "--split",
+        type=str,
+        default="test",
+        choices=["train", "dev", "test"],
+        help="Which split to evaluate on (uses the split field from the dataset)",
     )
-    parser.add_argument("--seed", type=int, default=42)
 
     args = parser.parse_args()
 
@@ -243,14 +247,13 @@ def main():
             )
         )
 
-    # Split into test set
-    import random
-
-    random.seed(args.seed)
-    random.shuffle(samples)
+    # Filter to the requested split
+    test_samples = [s for s in samples if s.split == args.split]
 
-    test_size = int(len(samples) * args.test_ratio)
-    test_samples = samples[:test_size]
+    if not test_samples:
+        available_splits = set(s.split for s in samples)
+        print(f"No samples found for split '{args.split}'. Available splits: {available_splits}")
+        return
 
     if args.max_samples:
         test_samples = test_samples[: args.max_samples]
@@ -260,7 +263,9 @@ def main():
 
     print(f"Dataset: {data_path}")
     print(f"Total samples: {len(samples)}")
-    print(f"Test samples: {len(test_samples)} (positive: {n_positive}, negative: {n_negative})")
+    print(
+        f"Evaluating on '{args.split}' split: {len(test_samples)} samples (positive: {n_positive}, negative: {n_negative})"
+    )
     print(f"Model: {args.model}")
     print(f"API base: {os.getenv('OPENAI_API_BASE', 'https://api.openai.com/v1')}")