|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Fill placeholder distractors in evaluation specs using Claude API. |
| 4 | +
|
| 5 | +Reads specs with PLACEHOLDER_A/C/D options and asks Claude to generate |
| 6 | +plausible but wrong distractors based on the anchor's domain. |
| 7 | +
|
| 8 | +Usage: |
| 9 | + python3 evaluations/fill-distractors.py # Fill all placeholders |
| 10 | + python3 evaluations/fill-distractors.py --dry-run # List specs needing distractors (no API calls, no writes) |
| 11 | + python3 evaluations/fill-distractors.py --anchor arc42 # Single anchor |
| 12 | +""" |
| 13 | + |
| 14 | +import argparse |
| 15 | +import json |
| 16 | +import os |
| 17 | +import sys |
| 18 | +from pathlib import Path |
| 19 | + |
| 20 | +try: |
| 21 | + import yaml |
| 22 | +except ImportError: |
| 23 | + print("PyYAML required: pip install pyyaml") |
| 24 | + sys.exit(1) |
| 25 | + |
| 26 | +SPECS_DIR = Path(__file__).parent / "specs" |
| 27 | + |
| 28 | + |
def needs_distractors(spec):
    """Return True if the spec's recognition question has placeholder options.

    Looks at ``spec["questions"]["recognition"]["options"]`` and reports
    whether the string form of any option value contains ``"PLACEHOLDER"``.

    Args:
        spec: Parsed spec (normally a dict loaded from YAML). Anything that
            is not a mapping, or that lacks the expected nested mappings
            (including keys present with a null value), is treated as having
            no placeholders.

    Returns:
        bool: True when at least one option value contains "PLACEHOLDER".
    """
    node = spec
    # Walk the path defensively: a YAML key that exists with a null value
    # yields None, which a plain `.get(key, {})` chain would not guard against.
    for key in ("questions", "recognition", "options"):
        if not isinstance(node, dict):
            return False
        node = node.get(key)
    if not isinstance(node, dict):
        return False
    return any("PLACEHOLDER" in str(value) for value in node.values())
| 34 | + |
| 35 | + |
def _parse_distractors(text):
    """Extract and validate the A/C/D distractor mapping from model output.

    Accepts bare JSON, JSON wrapped in a Markdown code fence, or JSON
    surrounded by stray prose: the outermost ``{...}`` span is parsed.

    Raises:
        ValueError: if no JSON object is present, the JSON is invalid
            (``json.JSONDecodeError`` is a ``ValueError``), or any of the
            keys "A", "C", "D" is missing or not a non-empty string.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in model response: {text!r}")

    data = json.loads(text[start:end + 1])

    keys = ("A", "C", "D")
    bad = [k for k in keys if not isinstance(data.get(k), str) or not data[k].strip()]
    if bad:
        raise ValueError(
            f"Model response is missing usable distractors for: {', '.join(bad)}"
        )
    return {k: data[k] for k in keys}


def generate_distractors(spec, client=None):
    """Use the Claude API to generate 3 plausible distractors for a spec.

    Builds a prompt from the spec's recognition question (the correct answer
    is option "B") and asks the model for replacement options "A", "C", "D".

    Args:
        spec: Parsed spec dict. Must contain ``questions.recognition`` with
            ``question`` and ``options["B"]``; ``anchor`` is used as the
            title when the question text contains no double-quoted title.
            Optional ``_related`` / ``_proponents`` hints are added to the
            prompt when present.
        client: Optional object exposing ``messages.create(...)`` in the
            style of ``anthropic.Anthropic``. When omitted, the ``anthropic``
            package is imported and a default client is created (the process
            exits with status 1 if the package is not installed).

    Returns:
        dict: ``{"A": str, "C": str, "D": str}`` with the generated
        distractors.

    Raises:
        KeyError: if the spec lacks the required question fields.
        ValueError: if the model response is empty or does not contain a
            valid JSON object with non-empty "A", "C" and "D" strings.
    """
    if client is None:
        try:
            import anthropic
        except ImportError:
            print("anthropic package required: pip install anthropic")
            sys.exit(1)
        client = anthropic.Anthropic()

    q = spec["questions"]["recognition"]
    correct = q["options"]["B"]
    question = q["question"]
    # The title is the first double-quoted span of the question, if any.
    title = question.strip().split('"')[1] if '"' in question else spec["anchor"]
    related = q.get("_related", [])
    proponents = q.get("_proponents", "")

    # A scalar `_related` would otherwise be joined character by character.
    if isinstance(related, str):
        related = [related]
    related_line = (
        f"- Related anchors for inspiration: {', '.join(map(str, related))}"
        if related
        else ""
    )
    proponents_line = (
        f"- The correct answer is associated with: {proponents}" if proponents else ""
    )

    prompt = f"""Generate 3 plausible but WRONG multiple-choice distractors for this question:

Question: Which of the following best describes "{title}"?
Correct answer: {correct}

Requirements for distractors:
- Each distractor should be a one-sentence description of a DIFFERENT but related concept
- They must be wrong but sound plausible to someone unfamiliar with the topic
- All 4 options (correct + 3 distractors) should be similar in length
- Do NOT include the correct concept in any distractor
- Draw distractors from adjacent concepts in software engineering, architecture, or methodology
{related_line}
{proponents_line}

Return ONLY a JSON object with keys "A", "C", "D" containing the 3 distractor strings. No explanation."""

    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=300,
        temperature=0.7,  # some creativity for diverse distractors
        messages=[{"role": "user", "content": prompt}],
    )

    if not response.content:
        raise ValueError("Model returned an empty response")

    return _parse_distractors(response.content[0].text.strip())
| 83 | + |
| 84 | + |
def main():
    """Fill placeholder distractors in every matching spec file.

    Scans ``SPECS_DIR`` for ``*.yaml`` specs, keeps those that still contain
    placeholder options (optionally restricted to ``--anchor``), and for each
    one asks ``generate_distractors`` for new "A", "C", "D" options, removes
    the helper keys (``_note``, ``_related``, ``_proponents``,
    ``_also_known_as``) and rewrites the spec file in place.

    With ``--dry-run`` the matching anchors are listed and nothing is
    generated or written.

    Files that cannot be parsed, or that are not a mapping with an
    ``anchor`` key, are skipped with a warning on stderr. A failure on one
    spec does not stop the others, but the process exits with status 1 if
    any spec failed.
    """
    parser = argparse.ArgumentParser(description="Fill placeholder distractors using Claude API")
    parser.add_argument("--dry-run", action="store_true", help="Preview without writing")
    parser.add_argument("--anchor", help="Process single anchor")
    args = parser.parse_args()

    specs_to_fill = []
    for f in sorted(SPECS_DIR.glob("*.yaml")):
        try:
            spec = yaml.safe_load(f.read_text(encoding="utf-8"))
        except yaml.YAMLError as e:
            print(f"Skipping {f.name}: invalid YAML ({e})", file=sys.stderr)
            continue
        # An empty file loads as None; anything without an anchor is not a spec.
        if not isinstance(spec, dict) or "anchor" not in spec:
            print(f"Skipping {f.name}: not a spec mapping with an 'anchor' key", file=sys.stderr)
            continue
        if args.anchor and spec["anchor"] != args.anchor:
            continue
        if needs_distractors(spec):
            specs_to_fill.append((f, spec))

    print(f"Found {len(specs_to_fill)} specs needing distractors")

    failures = 0
    for filepath, spec in specs_to_fill:
        anchor_id = spec["anchor"]
        print(f"  {anchor_id}...", end=" ", flush=True)

        if args.dry_run:
            print("(dry run)")
            continue

        try:
            distractors = generate_distractors(spec)
            # Read all three before mutating so a missing key cannot leave
            # the in-memory spec half-updated.
            new_options = {key: distractors[key] for key in ("A", "C", "D")}

            q = spec["questions"]["recognition"]
            q["options"].update(new_options)

            # Remove helper notes
            for helper_key in ("_note", "_related", "_proponents", "_also_known_as"):
                q.pop(helper_key, None)

            # Serialize fully before touching the file: opening it for
            # writing first would truncate the spec if dumping failed.
            rendered = yaml.dump(
                spec, default_flow_style=False, allow_unicode=True, sort_keys=False
            )
            filepath.write_text(rendered, encoding="utf-8")
            print("OK")

        except Exception as e:  # keep going; one bad spec must not stop the batch
            failures += 1
            print(f"ERROR: {e}")

    print("\nDone. Review the generated distractors before running evaluations!")

    if failures:
        print(f"{failures} spec(s) failed; see errors above.", file=sys.stderr)
        sys.exit(1)
| 130 | + |
| 131 | + |
| 132 | +if __name__ == "__main__": |
| 133 | + main() |
0 commit comments