diff --git a/evaluations/.gitignore b/evaluations/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/evaluations/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/evaluations/README.adoc b/evaluations/README.adoc
new file mode 100644
index 0000000..8547c49
--- /dev/null
+++ b/evaluations/README.adoc
@@ -0,0 +1,158 @@
+= Semantic Anchor Evaluations
+:toc:
+
+== Overview
+
+Multiple-choice evaluation framework for testing whether semantic anchors work across different LLMs.
+See the link:../docs/anchor-evaluations.adoc[full concept document] for background and methodology.
+
+== Quick Start
+
+=== Prerequisites
+
+* Python 3.10+
+* `pyyaml` package: `pip install pyyaml`
+* At least one of:
+** Claude Code CLI (authenticated), or `ANTHROPIC_API_KEY` for the `claude` API backend
+** OpenAI API key (`OPENAI_API_KEY` environment variable)
+** Ollama running locally
+
+=== Running the Pilot
+
+[source,bash]
+----
+cd website
+
+# Claude Sonnet (default, via CLI)
+python3 evaluations/pilot.py
+
+# Claude Haiku
+python3 evaluations/pilot.py --model claude-haiku
+
+# GPT-4o-mini (requires OPENAI_API_KEY)
+python3 evaluations/pilot.py --model openai
+
+# Ollama (requires local server + model)
+ollama serve &                               # start server if not running
+ollama pull qwen3:4b                         # pull model (once)
+python3 evaluations/pilot.py --model ollama  # uses qwen3:4b by default
+python3 evaluations/pilot.py --model ollama --ollama-model mistral  # other model
+
+# Multiple models at once
+python3 evaluations/pilot.py --model claude-cli claude-haiku openai
+
+# Dry run (show prompts without sending)
+python3 evaluations/pilot.py --dry-run
+----
+
+=== Available Models
+
+[cols="1,1,2"]
+|===
+|Flag |Model |Notes
+
+|`claude-cli`
+|Claude Sonnet (via CLI)
+|Default. Requires `claude` CLI authenticated.
+
+|`claude-haiku`
+|Claude Haiku (via CLI)
+|Smallest Claude model. Good lower-bound test.
+
+|`openai`
+|GPT-4o-mini (via API)
+|Requires `OPENAI_API_KEY`.
+
+|`claude`
+|Claude Sonnet (via API)
+|Requires `ANTHROPIC_API_KEY`. Alternative to CLI.
+
+|`ollama`
+|Local model (via Ollama)
+|Requires Ollama server on `localhost:11434`. Default: `qwen3:4b`, override with `--ollama-model`.
+|===
+
+== Directory Structure
+
+[source]
+----
+evaluations/
+├── README.adoc          # This file
+├── pilot.py             # Evaluation runner script
+├── specs/               # Question specs (YAML)
+│   ├── arc42.yaml
+│   ├── docs-as-code.yaml
+│   ├── mece.yaml
+│   ├── tdd-london-school.yaml
+│   └── timtowtdi.yaml
+└── results/             # Raw results (JSON, timestamped)
+    └── pilot-*.json
+----
+
+== Question Spec Format
+
+Each anchor has a YAML file with multiple-choice questions:
+
+[source,yaml]
+----
+anchor: tdd-london-school
+tier: 3
+
+questions:
+  recognition:           # Level 1: Does the model identify the anchor?
+    question: |
+      Which of the following best describes "TDD, London School"?
+    options:
+      A: ...             # Distractor (e.g., Chicago School description)
+      B: ...             # Correct answer
+      C: ...             # Distractor (e.g., BDD description)
+      D: ...             # Distractor
+    correct: B
+
+  application:           # Level 2: Does it change behavior?
+    scenario: |
+      You are reviewing a PR. ...
+    anchor_prompt: "using TDD, London School principles"
+    paraphrase_prompt: "Write isolated tests for the service layer"
+    options: ...
+    correct: B
+
+  consistency:           # Level 4: Same answer across aliases/languages?
+    variants:
+      - 'Question with canonical name'
+      - 'Question with alias'
+    language_variant: 'Frage auf Deutsch'
+    options: ...
+    correct: B
+----
+
+== Scoring
+
+* Each question runs *4 times* with randomized option order (position bias mitigation)
+* Score = percentage of correct answers across the 4 runs
+* Response parsing: extracts first capital letter A–D from response
+* Results saved as timestamped JSON in `results/`
+
+== Pilot Results (2026-03-24)
+
+[cols="1,1,1,1"]
+|===
+|Model |Average |Best |Worst
+
+|Claude Sonnet 4.6
+|100%
+|all 100%
+|—
+
+|Claude Haiku 4.5
+|100%
+|all 100%
+|—
+
+|GPT-4o-mini
+|81%
+|Recognition: arc42, MECE, TIMTOWTDI (100%)
+|TDD London School Recognition (25%)
+|===
+
+Key finding: *Position bias is real.* GPT-4o-mini answers the "TDD, London School" recognition question correctly in only 25% of runs -- that is, in just one of the four option orderings, when the correct answer happens to sit in a favorable position.
diff --git a/evaluations/fill-distractors.py b/evaluations/fill-distractors.py
new file mode 100644
index 0000000..eeb002d
--- /dev/null
+++ b/evaluations/fill-distractors.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+"""
+Fill placeholder distractors in evaluation specs using Claude API.
+
+Reads specs with PLACEHOLDER_A/C/D options and asks Claude to generate
+plausible but wrong distractors based on the anchor's domain.
+
+Usage:
+  python3 evaluations/fill-distractors.py              # Fill all placeholders
+  python3 evaluations/fill-distractors.py --dry-run     # List affected anchors (no API calls, no writes)
+  python3 evaluations/fill-distractors.py --anchor arc42 # Single anchor
+"""
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    print("PyYAML required: pip install pyyaml")
+    sys.exit(1)
+
+SPECS_DIR = Path(__file__).parent / "specs"  # "specs" directory next to this script; main() globs *.yaml in it
+
+
+def needs_distractors(spec):
+    """Check if spec has placeholder distractors."""
+    q = spec.get("questions", {}).get("recognition", {})
+    options = q.get("options", {})
+    return any("PLACEHOLDER" in str(v) for v in options.values())
+
+
+def generate_distractors(spec):
+    """Ask the Claude API for distractors A, C and D for the recognition question; return the parsed JSON."""
+    try:
+        import anthropic
+    except ImportError:
+        print("anthropic package required: pip install anthropic")
+        sys.exit(1)
+
+    q = spec["questions"]["recognition"]
+    correct = q["options"]["B"]  # option B is sent to the model as the correct answer
+    title = q["question"].strip().split('"')[1] if '"' in q["question"] else spec["anchor"]  # first quoted text, else anchor id
+    related = q.get("_related", [])  # optional hint keys; each adds a prompt line only when non-empty
+    proponents = q.get("_proponents", "")
+
+    prompt = f"""Generate 3 plausible but WRONG multiple-choice distractors for this question:
+
+Question: Which of the following best describes "{title}"?
+Correct answer: {correct}
+
+Requirements for distractors:
+- Each distractor should be a one-sentence description of a DIFFERENT but related concept
+- They must be wrong but sound plausible to someone unfamiliar with the topic
+- All 4 options (correct + 3 distractors) should be similar in length
+- Do NOT include the correct concept in any distractor
+- Draw distractors from adjacent concepts in software engineering, architecture, or methodology
+{f"- Related anchors for inspiration: {', '.join(related)}" if related else ""}
+{f"- The correct answer is associated with: {proponents}" if proponents else ""}
+
+Return ONLY a JSON object with keys "A", "C", "D" containing the 3 distractor strings. No explanation."""
+
+    client = anthropic.Anthropic()
+    response = client.messages.create(
+        model="claude-sonnet-4-20250514",
+        max_tokens=300,
+        temperature=0.7,  # some creativity for diverse distractors
+        messages=[{"role": "user", "content": prompt}],
+    )
+
+    text = response.content[0].text.strip()  # only the first content block is read
+    # Parse JSON from response (might be wrapped in ```json ... ```)
+    if "```" in text:
+        text = text.split("```")[1]  # segment right after the first fence marker
+        if text.startswith("json"):
+            text = text[4:]  # drop the "json" language tag
+        text = text.strip()
+
+    return json.loads(text)  # raises on malformed JSON; the caller's except clause reports it
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Fill placeholder distractors using Claude API")
+    parser.add_argument("--dry-run", action="store_true", help="Preview without writing")
+    parser.add_argument("--anchor", help="Process single anchor")
+    args = parser.parse_args()
+
+    specs_to_fill = []
+    for f in sorted(SPECS_DIR.glob("*.yaml")):
+        spec = yaml.safe_load(f.read_text(encoding="utf-8"))
+        if args.anchor and spec["anchor"] != args.anchor:
+            continue
+        if needs_distractors(spec):
+            specs_to_fill.append((f, spec))
+
+    print(f"Found {len(specs_to_fill)} specs needing distractors")
+
+    for filepath, spec in specs_to_fill:
+        anchor_id = spec["anchor"]
+        print(f"  {anchor_id}...", end=" ", flush=True)
+
+        if args.dry_run:
+            print("(dry run)")
+            continue
+
+        try:
+            distractors = generate_distractors(spec)
+            q = spec["questions"]["recognition"]
+            q["options"]["A"] = distractors["A"]
+            q["options"]["C"] = distractors["C"]
+            q["options"]["D"] = distractors["D"]
+
+            # Remove helper notes
+            q.pop("_note", None)
+            q.pop("_related", None)
+            q.pop("_proponents", None)
+            q.pop("_also_known_as", None)
+
+            with open(filepath, "w", encoding="utf-8") as fh:
+                yaml.dump(spec, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)
+            print("OK")
+
+        except Exception as e:
+            print(f"ERROR: {e}")
+
+    print("\nDone. Review the generated distractors before running evaluations!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evaluations/generate-l1-specs.py b/evaluations/generate-l1-specs.py
new file mode 100644
index 0000000..198746d
--- /dev/null
+++ b/evaluations/generate-l1-specs.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+Generate Level 1 (Recognition) evaluation specs from .adoc anchor metadata.
+
+Reads each anchor's Core Concepts and Related Anchors to produce:
+- A correct answer from the anchor's core description
+- 3 placeholder distractors (A, C, D), plus related-anchor hints for filling them in later
+
+Output: YAML specs in evaluations/specs/ (only recognition section).
+Existing specs are preserved — only missing anchors are generated.
+
+Usage:
+  python3 evaluations/generate-l1-specs.py              # Generate all Tier 3
+  python3 evaluations/generate-l1-specs.py --dry-run     # Preview without writing
+  python3 evaluations/generate-l1-specs.py --anchor arc42  # Single anchor
+"""
+
+import argparse
+import os
+import re
+import sys
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    print("PyYAML required: pip install pyyaml")
+    sys.exit(1)
+
+ANCHORS_DIR = Path(__file__).parent.parent / "docs" / "anchors"  # anchor .adoc sources, relative to this script
+SPECS_DIR = Path(__file__).parent / "specs"  # generated YAML specs are written here
+
+# Anchors excluded from generation (templates, meta, sub-patterns handled by umbrella): prefix matches plus exact IDs
+SKIP_PREFIXES = ["_template", "gof-", "solid-", "test-double-"]
+SKIP_EXACT = ["what-qualifies-as-a-semantic-anchor", "gof-design-patterns",
+              "solid-principles", "test-double-meszaros"]
+
+
+def parse_adoc(filepath):
+    """Extract metadata from an .adoc anchor file."""
+    content = filepath.read_text(encoding="utf-8")
+    lines = content.split("\n")
+
+    result = {
+        "id": filepath.stem,
+        "title": "",
+        "tier": None,
+        "categories": "",
+        "related": [],
+        "proponents": "",
+        "also_known_as": "",
+        "core_concepts": [],
+        "when_to_use": [],
+    }
+
+    # Parse attributes
+    for line in lines:
+        if line.startswith("= "):
+            result["title"] = line[2:].strip()
+        elif line.startswith(":tier:"):
+            result["tier"] = int(line.split(":tier:")[1].strip())
+        elif line.startswith(":categories:"):
+            result["categories"] = line.split(":categories:")[1].strip()
+        elif line.startswith(":related:"):
+            result["related"] = [r.strip() for r in line.split(":related:")[1].strip().split(",")]
+        elif line.startswith(":proponents:"):
+            result["proponents"] = line.split(":proponents:")[1].strip()
+
+    # Parse core concepts (definition list items)
+    in_core = False
+    in_when = False
+    for line in lines:
+        if "Core Concepts" in line:
+            in_core = True
+            in_when = False
+            continue
+        if "When to Use" in line:
+            in_core = False
+            in_when = True
+            continue
+        if "Related" in line or "Contrast" in line or "Technical" in line:
+            in_core = False
+            in_when = False
+            continue
+
+        if in_core and "::" in line:
+            term = line.split("::")[0].strip()
+            desc = line.split("::", 1)[1].strip() if "::" in line else ""
+            if term and not term.startswith("[") and not term.startswith("Key Proponent"):
+                result["core_concepts"].append({"term": term, "desc": desc})
+        elif in_when and line.strip().startswith("*"):
+            result["when_to_use"].append(line.strip().lstrip("* "))
+
+    # Also known as
+    for line in lines:
+        if "Also known as::" in line:
+            result["also_known_as"] = line.split("Also known as::")[1].strip()
+
+    return result
+
+
+def build_correct_answer(anchor):
+    """Build a one-sentence correct answer from core concepts."""
+    concepts = anchor["core_concepts"][:4]
+    if not concepts:
+        return None
+
+    parts = []
+    for c in concepts:
+        if c["desc"]:
+            parts.append(c["desc"].rstrip("."))
+        else:
+            parts.append(c["term"])
+
+    if len(parts) >= 2:
+        return f"{parts[0]}; {parts[1].lower()}"
+    return parts[0]
+
+
+def generate_spec(anchor, all_anchors):
+    """Generate a YAML spec dict for one anchor."""
+    correct = build_correct_answer(anchor)
+    if not correct:
+        return None
+
+    spec = {
+        "anchor": anchor["id"],
+        "tier": anchor["tier"],
+        "questions": {
+            "recognition": {
+                "question": f'Which of the following best describes "{anchor["title"]}"?\n',
+                "options": {
+                    "A": "PLACEHOLDER_A",
+                    "B": correct,
+                    "C": "PLACEHOLDER_C",
+                    "D": "PLACEHOLDER_D",
+                },
+                "correct": "B",
+                "_note": "REVIEW NEEDED: Distractors are placeholders. Replace A, C, D with plausible wrong answers from related anchors.",
+                "_related": anchor["related"],
+                "_proponents": anchor["proponents"],
+                "_also_known_as": anchor["also_known_as"],
+            }
+        }
+    }
+    return spec
+
+
+def should_skip(anchor_id):
+    """Check if anchor should be skipped."""
+    if anchor_id in SKIP_EXACT:
+        return True
+    for prefix in SKIP_PREFIXES:
+        if anchor_id.startswith(prefix) and anchor_id not in SKIP_EXACT:
+            return True
+    return False
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate L1 evaluation specs from .adoc metadata")
+    parser.add_argument("--dry-run", action="store_true", help="Preview without writing files")
+    parser.add_argument("--anchor", help="Generate for a single anchor ID")
+    parser.add_argument("--force", action="store_true", help="Overwrite existing specs")
+    args = parser.parse_args()
+
+    # Parse all anchors
+    all_anchors = {}
+    for f in sorted(ANCHORS_DIR.glob("*.adoc")):
+        if f.stem.endswith(".de") or f.stem == "_template":
+            continue
+        anchor = parse_adoc(f)
+        all_anchors[anchor["id"]] = anchor
+
+    # Filter to Tier 3, skip sub-patterns
+    targets = []
+    for aid, anchor in all_anchors.items():
+        if args.anchor and aid != args.anchor:
+            continue
+        if anchor["tier"] != 3:
+            continue
+        if should_skip(aid):
+            continue
+        targets.append(anchor)
+
+    print(f"Found {len(targets)} Tier 3 anchors to process")
+
+    generated = 0
+    skipped = 0
+    for anchor in targets:
+        spec_file = SPECS_DIR / f"{anchor['id']}.yaml"
+
+        if spec_file.exists() and not args.force:
+            skipped += 1
+            continue
+
+        spec = generate_spec(anchor, all_anchors)
+        if not spec:
+            print(f"  SKIP {anchor['id']}: no core concepts found")
+            continue
+
+        if args.dry_run:
+            print(f"\n--- {anchor['id']} ---")
+            print(yaml.dump(spec, default_flow_style=False, allow_unicode=True))
+        else:
+            SPECS_DIR.mkdir(parents=True, exist_ok=True)
+            with open(spec_file, "w", encoding="utf-8") as fh:
+                yaml.dump(spec, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)
+            print(f"  WROTE {spec_file.name}")
+            generated += 1
+
+    print(f"\nDone: {generated} generated, {skipped} skipped (already exist)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evaluations/generate-l2-specs.py b/evaluations/generate-l2-specs.py
new file mode 100644
index 0000000..554b119
--- /dev/null
+++ b/evaluations/generate-l2-specs.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""
+Generate Level 2 (Application) questions for evaluation specs using Claude API.
+
+For each anchor that has a recognition question but no application question,
+generates a realistic scenario with anchor prompt, paraphrase, and MC options.
+
+Usage:
+  python3 evaluations/generate-l2-specs.py              # Fill all missing L2
+  python3 evaluations/generate-l2-specs.py --dry-run     # Preview
+  python3 evaluations/generate-l2-specs.py --anchor arc42 # Single anchor
+"""
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    print("PyYAML required: pip install pyyaml")
+    sys.exit(1)
+
+SPECS_DIR = Path(__file__).parent / "specs"  # YAML specs that are read and rewritten in place
+ANCHORS_DIR = Path(__file__).parent.parent / "docs" / "anchors"  # anchor .adoc files used as prompt context
+
+SKIP_ANCHORS = {"sanity-check", "negative-control"}  # main() never generates application questions for these
+
+
+def load_anchor_context(anchor_id):
+    """Load anchor .adoc file for context."""
+    adoc = ANCHORS_DIR / f"{anchor_id}.adoc"
+    if adoc.exists():
+        return adoc.read_text(encoding="utf-8")[:2000]
+    return ""
+
+
+def needs_application(spec):
+    """Return True when the spec's "questions" mapping has no "application" key (or is absent)."""
+    return "application" not in spec.get("questions", {})
+
+
+def generate_application(spec):
+    """Ask the Claude API for an application question for the spec's anchor; return the parsed JSON."""
+    try:
+        import anthropic
+    except ImportError:
+        print("anthropic package required: pip install anthropic")
+        sys.exit(1)
+
+    anchor_id = spec["anchor"]
+    title = spec["questions"]["recognition"]["question"].split('"')[1] if '"' in spec["questions"]["recognition"]["question"] else anchor_id  # first quoted text, else anchor id
+    context = load_anchor_context(anchor_id)  # start of the anchor's .adoc text, "" if the file is missing
+
+    prompt = f"""Generate a Level 2 Application multiple-choice question for the semantic anchor "{title}".
+
+The question tests whether an LLM can APPLY the methodology, not just describe it.
+
+Anchor definition (from .adoc file):
+{context}
+
+Requirements:
+1. Write a realistic SCENARIO (2-3 sentences) describing a concrete software engineering situation where this anchor applies.
+2. Write an ANCHOR_PROMPT — a short phrase like "using {title}" that would be added to the scenario.
+3. Write a PARAPHRASE_PROMPT — describes the GOAL without naming the methodology or hinting at the correct answer. Must be fair: not too specific (leaks answer) and not too vague.
+4. Write 4 OPTIONS (A, B, C, D) — one correct answer that reflects the methodology, three plausible alternatives.
+5. All options should be similar in length.
+6. The correct answer should reflect what a practitioner of this methodology would recommend.
+
+Return ONLY a JSON object with this exact structure:
+{{
+  "scenario": "...",
+  "anchor_prompt": "using {title}",
+  "paraphrase_prompt": "...",
+  "options": {{
+    "A": "...",
+    "B": "...",
+    "C": "...",
+    "D": "..."
+  }},
+  "correct": "B"
+}}
+
+Make B the correct answer. No explanation outside the JSON."""
+
+    client = anthropic.Anthropic()
+    response = client.messages.create(
+        model="claude-sonnet-4-20250514",
+        max_tokens=500,
+        temperature=0.7,
+        messages=[{"role": "user", "content": prompt}],
+    )
+
+    text = response.content[0].text.strip()  # only the first content block is read
+    if "```" in text:  # the reply may be wrapped in a ```json ... ``` fence
+        text = text.split("```")[1]  # segment right after the first fence marker
+        if text.startswith("json"):
+            text = text[4:]  # drop the "json" language tag
+        text = text.strip()
+
+    return json.loads(text)  # raises on malformed JSON; the result is not validated against the requested structure
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate L2 Application questions using Claude API")
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--anchor", help="Process single anchor")
+    args = parser.parse_args()
+
+    specs_to_fill = []
+    for f in sorted(SPECS_DIR.glob("*.yaml")):
+        spec = yaml.safe_load(f.read_text(encoding="utf-8"))
+        if spec["anchor"] in SKIP_ANCHORS:
+            continue
+        if args.anchor and spec["anchor"] != args.anchor:
+            continue
+        if needs_application(spec):
+            specs_to_fill.append((f, spec))
+
+    print(f"Found {len(specs_to_fill)} specs needing L2 Application questions")
+
+    for filepath, spec in specs_to_fill:
+        anchor_id = spec["anchor"]
+        print(f"  {anchor_id}...", end=" ", flush=True)
+
+        if args.dry_run:
+            print("(dry run)")
+            continue
+
+        try:
+            app = generate_application(spec)
+            spec["questions"]["application"] = app
+
+            with open(filepath, "w", encoding="utf-8") as fh:
+                yaml.dump(spec, fh, default_flow_style=False, allow_unicode=True, sort_keys=False)
+            print("OK")
+
+        except Exception as e:
+            print(f"ERROR: {e}")
+
+    print("\nDone. Review the generated scenarios before running evaluations!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evaluations/generate-report.py b/evaluations/generate-report.py
new file mode 100644
index 0000000..a6be43e
--- /dev/null
+++ b/evaluations/generate-report.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+"""
+Generate an HTML report from evaluation results.
+
+Reads all result JSON files and produces a static HTML report with:
+- Summary table (model × average score)
+- Heatmap (anchor × model)
+- Failure details per model (questions scoring below 100%)
+- Controls (sanity check, negative control) shown separately
+
+Usage:
+  python3 evaluations/generate-report.py
+  python3 evaluations/generate-report.py --output evaluations/report.html
+"""
+
+import argparse
+from html import escape as h
+import json
+from collections import defaultdict
+from pathlib import Path
+
+RESULTS_DIR = Path(__file__).parent / "results"  # pilot-*.json result files are read from here
+SPECS_DIR = Path(__file__).parent / "specs"
+
+# Display names for the model keys found in result files (unknown keys are shown as-is)
+MODEL_DISPLAY = {
+    "claude": "Claude Sonnet",
+    "claude-cli": "Claude Sonnet (CLI)",
+    "claude-haiku": "Claude Haiku",
+    "openai": "GPT-4o",  # NOTE(review): evaluations/README.adoc lists this backend as GPT-4o-mini; confirm
+    "mistral": "Mistral Large",
+    "ollama": "Ollama (local)",
+}
+
+CONTROL_ANCHORS = {"sanity-check", "negative-control"}  # label prefixes reported apart from the anchor scores
+
+
+def load_best_results():
+    """Load the latest result with the most questions per model."""
+    results = {}
+    for f in sorted(RESULTS_DIR.glob("pilot-*.json")):
+        d = json.load(open(f, encoding="utf-8"))
+        for m, r in d["models"].items():
+            if m not in results or len(r) >= len(results[m]["data"]):
+                results[m] = {
+                    "data": r,
+                    "file": f.name,
+                    "config": d.get("config", {}),
+                    "duration": d.get("duration_seconds", 0),
+                    "timestamp": d.get("timestamp", ""),
+                }
+    return results
+
+
+def score_color(score):
+    if score >= 0.8:
+        return "#22c55e"  # green
+    elif score >= 0.5:
+        return "#eab308"  # yellow
+    else:
+        return "#ef4444"  # red
+
+
+def score_bg(score):
+    if score >= 0.8:
+        return "#dcfce7"
+    elif score >= 0.5:
+        return "#fef9c3"
+    else:
+        return "#fee2e2"
+
+
+def generate_html(results, output_path):
+    # Collect all anchors and questions
+    all_questions = defaultdict(dict)  # anchor/label -> {model: score}
+    model_names = []
+
+    # Prefer full runs (75 questions) over pilot runs
+    for m in ["claude", "openai", "mistral"]:
+        if m in results and len(results[m]["data"]) >= 60:
+            model_names.append(m)
+
+    # Add smaller runs if no full run exists
+    for m in ["claude-cli", "claude-haiku", "ollama"]:
+        if m in results and m not in model_names:
+            model_names.append(m)
+
+    for m in model_names:
+        for q in results[m]["data"]:
+            label = q["label"]
+            all_questions[label][m] = q["score"]
+
+    # Separate controls from anchors
+    anchor_questions = {k: v for k, v in all_questions.items()
+                        if not any(k.startswith(c) for c in CONTROL_ANCHORS)}
+    control_questions = {k: v for k, v in all_questions.items()
+                         if any(k.startswith(c) for c in CONTROL_ANCHORS)}
+
+    # Group by anchor
+    anchor_groups = defaultdict(list)
+    for label in sorted(anchor_questions.keys()):
+        anchor_id = label.split("/")[0]
+        anchor_groups[anchor_id].append(label)
+
+    # Model averages (excluding controls)
+    model_avgs = {}
+    for m in model_names:
+        scores = [anchor_questions[label].get(m) for label in anchor_questions
+                  if anchor_questions[label].get(m) is not None]
+        model_avgs[m] = sum(scores) / len(scores) if scores else 0
+
+    html = f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Semantic Anchor Evaluation Report</title>
+<style>
+  * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+  body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; background: #f8fafc; color: #1e293b; padding: 2rem; }}
+  h1 {{ font-size: 1.75rem; margin-bottom: 0.5rem; }}
+  h2 {{ font-size: 1.25rem; margin: 2rem 0 1rem; border-bottom: 2px solid #e2e8f0; padding-bottom: 0.5rem; }}
+  h3 {{ font-size: 1rem; margin: 1.5rem 0 0.5rem; color: #475569; }}
+  .subtitle {{ color: #64748b; margin-bottom: 2rem; }}
+  .summary-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-bottom: 2rem; }}
+  .summary-card {{ background: white; border-radius: 0.5rem; padding: 1.25rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }}
+  .summary-card .model-name {{ font-weight: 600; font-size: 0.875rem; color: #475569; }}
+  .summary-card .score {{ font-size: 2rem; font-weight: 700; margin: 0.5rem 0; }}
+  .summary-card .detail {{ font-size: 0.75rem; color: #94a3b8; }}
+  table {{ width: 100%; border-collapse: collapse; background: white; border-radius: 0.5rem; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 1.5rem; }}
+  th {{ background: #f1f5f9; padding: 0.625rem 0.75rem; text-align: left; font-weight: 600; font-size: 0.8125rem; color: #475569; border-bottom: 2px solid #e2e8f0; }}
+  td {{ padding: 0.5rem 0.75rem; border-bottom: 1px solid #f1f5f9; font-size: 0.8125rem; }}
+  tr:hover {{ background: #f8fafc; }}
+  .score-cell {{ text-align: center; font-weight: 600; border-radius: 0.25rem; }}
+  .anchor-group {{ font-weight: 600; background: #f8fafc; }}
+  .question-label {{ padding-left: 1.5rem; color: #64748b; }}
+  .check {{ color: #22c55e; }}
+  .controls {{ opacity: 0.7; }}
+  .legend {{ display: flex; gap: 1.5rem; margin: 1rem 0; font-size: 0.8125rem; }}
+  .legend-item {{ display: flex; align-items: center; gap: 0.375rem; }}
+  .legend-dot {{ width: 12px; height: 12px; border-radius: 2px; }}
+  .meta {{ background: #f1f5f9; border-radius: 0.5rem; padding: 1rem; font-size: 0.75rem; color: #64748b; margin-top: 2rem; }}
+  .meta dt {{ font-weight: 600; display: inline; }}
+  .meta dd {{ display: inline; margin-right: 1.5rem; }}
+  .fail-list {{ margin-top: 1rem; }}
+  .fail-item {{ display: flex; justify-content: space-between; padding: 0.25rem 0; font-size: 0.8125rem; border-bottom: 1px solid #f1f5f9; }}
+</style>
+</head>
+<body>
+<h1>Semantic Anchor Evaluation Report</h1>
+<p class="subtitle">Multiple-choice recognition test across {len(model_names)} LLMs — {len(anchor_questions)} questions, {len(anchor_groups)} anchors</p>
+
+<div class="legend">
+  <div class="legend-item"><div class="legend-dot" style="background:#dcfce7"></div> &ge;80%</div>
+  <div class="legend-item"><div class="legend-dot" style="background:#fef9c3"></div> 50–79%</div>
+  <div class="legend-item"><div class="legend-dot" style="background:#fee2e2"></div> &lt;50%</div>
+</div>
+
+<h2>Model Summary</h2>
+<div class="summary-grid">
+"""
+
+    for m in model_names:
+        avg = model_avgs.get(m, 0)
+        display = MODEL_DISPLAY.get(m, m)
+        n = len([1 for l in anchor_questions if anchor_questions[l].get(m) is not None])
+        info = results[m]
+        html += f"""  <div class="summary-card">
+    <div class="model-name">{display}</div>
+    <div class="score" style="color: {score_color(avg)}">{avg:.0%}</div>
+    <div class="detail">{n} questions · {info['file']}</div>
+  </div>
+"""
+
+    html += """</div>
+
+<h2>Heatmap: Anchor × Model</h2>
+<table>
+<thead><tr>
+  <th>Anchor / Question</th>
+"""
+
+    for m in model_names:
+        html += f"  <th style='text-align:center'>{MODEL_DISPLAY.get(m, m)}</th>\n"
+    html += "</tr></thead>\n<tbody>\n"
+
+    for anchor_id in sorted(anchor_groups.keys()):
+        labels = anchor_groups[anchor_id]
+        # Anchor group row with average
+        anchor_scores = {}
+        for m in model_names:
+            scores = [anchor_questions[l].get(m) for l in labels if anchor_questions[l].get(m) is not None]
+            anchor_scores[m] = sum(scores) / len(scores) if scores else None
+
+        html += f'<tr class="anchor-group"><td>{h(anchor_id)}</td>'
+        for m in model_names:
+            s = anchor_scores.get(m)
+            if s is not None:
+                bg = score_bg(s)
+                text = "✓" if s == 1.0 else f"{s:.0%}"
+                html += f'<td class="score-cell" style="background:{bg}">{text}</td>'
+            else:
+                html += '<td class="score-cell" style="color:#cbd5e1">—</td>'
+        html += "</tr>\n"
+
+        # Individual question rows (only show if there are multiple or if score < 100%)
+        if len(labels) > 1:
+            for label in labels:
+                short = label.split("/", 1)[1] if "/" in label else label
+                html += f'<tr><td class="question-label">{h(short)}</td>'
+                for m in model_names:
+                    s = anchor_questions[label].get(m)
+                    if s is not None:
+                        bg = score_bg(s)
+                        text = "✓" if s == 1.0 else f"{s:.0%}"
+                        html += f'<td class="score-cell" style="background:{bg}">{text}</td>'
+                    else:
+                        html += '<td class="score-cell" style="color:#cbd5e1">—</td>'
+                html += "</tr>\n"
+
+    html += "</tbody></table>\n"
+
+    # Controls section
+    if control_questions:
+        html += '<h2>Control Questions</h2>\n<table class="controls">\n<thead><tr><th>Control</th>'
+        for m in model_names:
+            html += f"<th style='text-align:center'>{MODEL_DISPLAY.get(m, m)}</th>"
+        html += "</tr></thead>\n<tbody>\n"
+        for label in sorted(control_questions.keys()):
+            short = label.replace("/recognition", "")
+            html += f"<tr><td>{short}</td>"
+            for m in model_names:
+                s = control_questions[label].get(m)
+                if s is not None:
+                    bg = score_bg(s) if "sanity" not in label else ("#dcfce7" if s == 0 else "#fee2e2")
+                    text = f"{s:.0%}"
+                    html += f'<td class="score-cell" style="background:{bg}">{text}</td>'
+                else:
+                    html += '<td class="score-cell" style="color:#cbd5e1">—</td>'
+            html += "</tr>\n"
+        html += "</tbody></table>\n"
+
+    # Failures detail
+    html += "<h2>Failures Detail</h2>\n"
+    for m in model_names:
+        fails = [(q["label"], q["score"]) for q in results[m]["data"]
+                 if q["score"] < 1.0 and not any(q["label"].startswith(c) for c in CONTROL_ANCHORS)]
+        if not fails:
+            html += f"<h3>{MODEL_DISPLAY.get(m, m)}: no failures</h3>\n"
+        else:
+            html += f'<h3>{MODEL_DISPLAY.get(m, m)}: {len(fails)} failures</h3>\n<div class="fail-list">\n'
+            for label, score in sorted(fails):
+                html += f'<div class="fail-item"><span>{h(label)}</span><span style="color:{score_color(score)};font-weight:600">{score:.0%}</span></div>\n'
+            html += "</div>\n"
+
+    # Metadata
+    html += """
+<div class="meta">
+<h3>Run Metadata</h3>
+<dl>
+"""
+    for m in model_names:
+        info = results[m]
+        dur = info["duration"]
+        html += f"<dt>{MODEL_DISPLAY.get(m, m)}:</dt><dd>{info['file']} · {int(dur//60)}m {int(dur%60)}s · {info['timestamp'][:19]}</dd><br>"
+
+    html += """
+</dl>
+<p style="margin-top:0.75rem">Generated by <code>evaluations/generate-report.py</code> · Position bias mitigation: 4 permutations per question · Scoring: deterministic MC (no LLM judge)</p>
+</div>
+
+</body>
+</html>"""
+
+    output_path.write_text(html, encoding="utf-8")
+    print(f"Report written to {output_path}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate HTML evaluation report")
+    parser.add_argument("--output", default="evaluations/report.html",
+                        help="Output HTML file (default: evaluations/report.html)")
+    args = parser.parse_args()
+
+    results = load_best_results()
+    print(f"Loaded results for {len(results)} models")
+    for m, info in results.items():
+        print(f"  {m}: {len(info['data'])} questions from {info['file']}")
+
+    generate_html(results, Path(args.output))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evaluations/pilot.py b/evaluations/pilot.py
new file mode 100644
index 0000000..c921ad0
--- /dev/null
+++ b/evaluations/pilot.py
@@ -0,0 +1,502 @@
+#!/usr/bin/env python3
+"""
+Pilot evaluation runner for semantic anchor multiple-choice tests.
+Reads YAML specs, sends questions to LLMs, scores responses.
+
+Usage:
+  python3 pilot.py --model claude      # Claude Sonnet via Anthropic API
+  python3 pilot.py --model ollama      # Local model via Ollama (OpenAI-compatible)
+  python3 pilot.py --model claude ollama  # Both
+  python3 pilot.py --dry-run           # Show prompts without sending
+"""
+
+import argparse
+import json
+import os
+import random
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    print("PyYAML required: pip install pyyaml")
+    sys.exit(1)
+
+SPECS_DIR = Path(__file__).parent / "specs"  # load_specs() reads every *.yaml file from here
+RESULTS_DIR = Path(__file__).parent / "results"  # run_pilot() writes pilot-<timestamp>.json here
+POSITION_PERMUTATIONS = [  # entry i is the index of the original option shown at display slot i
+    [0, 1, 2, 3],  # A B C D (original)
+    [1, 2, 3, 0],  # B C D A
+    [2, 3, 0, 1],  # C D A B
+    [3, 0, 1, 2],  # D A B C
+]
+LETTERS = ["A", "B", "C", "D"]  # display letters; build_prompt() also uses them as option keys
+
+
+def load_specs():
+    """Parse every *.yaml file in SPECS_DIR (sorted by path) and return the list of documents."""
+    def _read(spec_path):
+        with open(spec_path, encoding="utf-8") as handle:
+            return yaml.safe_load(handle)
+    return [_read(spec_path) for spec_path in sorted(SPECS_DIR.glob("*.yaml"))]
+
+
+def build_prompt(question_text, options, permutation):
+    """Render the question followed by its options, reordered by `permutation`.
+
+    Display slot i shows options[LETTERS[permutation[i]]] under the letter LETTERS[i].
+    """
+    choices = [
+        f"{LETTERS[slot]}) {options[LETTERS[source]]}"
+        for slot, source in enumerate(permutation)
+    ]
+    return "\n".join([question_text.strip(), "", *choices, "", "Answer with the letter only."])
+
+
+def correct_letter_for_permutation(original_correct, permutation):
+    """Translate the spec's correct letter into the letter shown for it under `permutation`.
+
+    The sentinel "X" (sanity check with no correct answer) is returned unchanged;
+    None is returned when the permutation never references the correct option.
+    """
+    if original_correct == "X":
+        return "X"
+    target = LETTERS.index(original_correct)
+    return next((LETTERS[slot] for slot, source in enumerate(permutation) if source == target), None)
+
+
+def parse_response(text):
+    """Extract the answer letter (A-D) from a model response, or None if absent.
+
+    <think>...</think> blocks (qwen3, DeepSeek R1, etc.) are stripped first.
+    Priority: a line that is only a letter, then the first capital A-D standing
+    alone as a word ("Answer: B" -> "B"). Letters inside words are never matched,
+    so "ERROR: Connection refused" yields None rather than "C".
+    """
+    import re
+    cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()
+    if not cleaned:  # nothing but a thinking block: fall back to the raw text
+        cleaned = text.strip()
+    # Strongest signal: a line that is just the letter (e.g. "B", "B)", "**B**", "b").
+    for line in cleaned.split('\n'):
+        candidate = line.strip().strip('*').strip('.').strip(')').strip()
+        if candidate.upper() in ("A", "B", "C", "D"):
+            return candidate.upper()
+    # Fallback: a bare "first A-D character" scan would read "Answer: B" as "A".
+    standalone = re.search(r'(?<![A-Za-z0-9])[A-D](?![A-Za-z0-9])', cleaned)
+    return standalone.group(0) if standalone else None
+
+
+# Global sampling temperature read by the API and Ollama callers — set via --temperature flag
+TEMPERATURE = 0.0
+
+
+def set_temperature(t):
+    global TEMPERATURE  # rebind the module-level value that later calls read
+    TEMPERATURE = t
+
+
+def call_claude_api(prompt, model="claude-sonnet-4-20250514"):
+    """Ask Claude through the Anthropic API and return (reply_text, model).
+
+    Exits the process with status 1 if the `anthropic` package is not installed.
+    """
+    try:
+        import anthropic
+    except ImportError:
+        print("anthropic package required: pip install anthropic")
+        sys.exit(1)
+    request = {
+        "model": model, "max_tokens": 10, "temperature": TEMPERATURE,
+        "messages": [{"role": "user", "content": prompt}],
+    }
+    reply = anthropic.Anthropic().messages.create(**request)
+    return reply.content[0].text, model
+
+
+def call_claude_cli(prompt, model="claude-cli"):
+    """Run `claude -p <prompt>` and return (stdout, model).
+
+    On a non-zero exit status the text is "ERROR: <stderr>" instead.
+    Note: temperature cannot be controlled via CLI. A run longer than
+    60 seconds raises subprocess.TimeoutExpired.
+    """
+    import subprocess
+    proc = subprocess.run(["claude", "-p", prompt], capture_output=True, text=True, timeout=60)
+    reply = proc.stdout.strip() if proc.returncode == 0 else f"ERROR: {proc.stderr.strip()}"
+    return reply, model
+
+
+def call_claude_haiku(prompt, model="claude-haiku"):
+    """Run `claude -p <prompt> --model haiku` and return (stdout, model).
+
+    On a non-zero exit status the text is "ERROR: <stderr>" instead.
+    Note: temperature cannot be controlled via CLI. A run longer than
+    60 seconds raises subprocess.TimeoutExpired.
+    """
+    import subprocess
+    proc = subprocess.run(["claude", "-p", prompt, "--model", "haiku"], capture_output=True, text=True, timeout=60)
+    reply = proc.stdout.strip() if proc.returncode == 0 else f"ERROR: {proc.stderr.strip()}"
+    return reply, model
+
+
+def make_openai_caller(openai_model):
+    """Build a caller bound to one OpenAI chat model."""
+    def call_openai(prompt, model=openai_model):
+        """Send one user message; return (reply_text, model)."""
+        try:
+            import openai
+        except ImportError:
+            print("openai package required: pip install openai")
+            sys.exit(1)
+
+        client = openai.OpenAI()
+        request = {"model": model, "messages": [{"role": "user", "content": prompt}]}
+        # Model names containing gpt-5 / o3 / o4 (GPT-5+ and reasoning models) take
+        # max_completion_tokens and get no explicit temperature, because GPT-5
+        # only supports temperature=1. Everything else keeps the short-answer setup.
+        if any(marker in model for marker in ("gpt-5", "o3", "o4")):
+            request["max_completion_tokens"] = 2048
+        else:
+            request.update(max_tokens=10, temperature=TEMPERATURE)
+        reply = client.chat.completions.create(**request)
+        return reply.choices[0].message.content.strip(), model
+    return call_openai
+
+
+def make_mistral_caller(mistral_model):
+    """Build a caller bound to one Mistral model (OpenAI client pointed at api.mistral.ai)."""
+    def call_mistral(prompt, model=mistral_model):
+        """Send one user message; return (reply_text, model)."""
+        try:
+            import openai
+        except ImportError:
+            print("openai package required: pip install openai")
+            sys.exit(1)
+
+        # The key comes from MISTRAL_API_KEY; when the variable is unset an
+        # empty string is passed to the client.
+        api_key = os.environ.get("MISTRAL_API_KEY", "")
+        client = openai.OpenAI(base_url="https://api.mistral.ai/v1", api_key=api_key)
+        request = {
+            "model": model, "max_tokens": 10, "temperature": TEMPERATURE,
+            "messages": [{"role": "user", "content": prompt}],
+        }
+        reply = client.chat.completions.create(**request)
+        return reply.choices[0].message.content.strip(), model
+    return call_mistral
+
+
+def make_deepseek_caller(deepseek_model):
+    """Build a caller bound to one DeepSeek model (OpenAI client pointed at api.deepseek.com)."""
+    def call_deepseek(prompt, model=deepseek_model):
+        """Send one user message; return (reply_text, model)."""
+        try:
+            import openai
+        except ImportError:
+            print("openai package required: pip install openai")
+            sys.exit(1)
+
+        # The key comes from DEEPSEEK_API_KEY; when the variable is unset an
+        # empty string is passed to the client.
+        api_key = os.environ.get("DEEPSEEK_API_KEY", "")
+        client = openai.OpenAI(base_url="https://api.deepseek.com", api_key=api_key)
+        request = {
+            "model": model, "max_tokens": 10, "temperature": TEMPERATURE,
+            "messages": [{"role": "user", "content": prompt}],
+        }
+        reply = client.chat.completions.create(**request)
+        return reply.choices[0].message.content.strip(), model
+    return call_deepseek
+
+
+def make_ollama_caller(ollama_model, no_think=False, base_url="http://localhost:11434"):
+    """Build a caller that posts chat requests to an Ollama server's /api/chat endpoint.
+
+    With no_think=True every request carries "think": false.
+    """
+    def call_ollama(prompt, model=ollama_model):
+        """Send one non-streaming user message; return (reply_text, "ollama/<model>")."""
+        import urllib.request
+        payload = {
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "stream": False,
+            "options": {"temperature": TEMPERATURE},
+        }
+        if no_think:
+            payload["think"] = False
+        request = urllib.request.Request(
+            f"{base_url}/api/chat",
+            data=json.dumps(payload).encode("utf-8"),
+            headers={"Content-Type": "application/json"},
+        )
+        with urllib.request.urlopen(request, timeout=300) as http_reply:
+            decoded = json.loads(http_reply.read())
+        # A reply without message/content yields an empty string.
+        return decoded.get("message", {}).get("content", ""), f"ollama/{model}"
+    return call_ollama
+
+
+def run_question(question_data, call_fn, label, context="", verbose=False):
+    """Ask one question once per entry of POSITION_PERMUTATIONS and score the answers.
+
+    `question_data` needs the keys "question", "options" and "correct"; `context`,
+    if given, is prepended to the question. `call_fn(prompt)` must return
+    (response_text, model_id). Returns {"label", "score", "results"} where score
+    is the fraction of permutations answered correctly.
+    """
+    question_text = question_data["question"]
+    if context:
+        question_text = f"{context}\n{question_text}"
+    options = question_data["options"]
+    original_correct = question_data["correct"]
+    results = []
+    for i, perm in enumerate(POSITION_PERMUTATIONS):
+        prompt = build_prompt(question_text, options, perm)
+        expected = correct_letter_for_permutation(original_correct, perm)
+
+        try:
+            response_text, _model_id = call_fn(prompt)
+        except Exception as e:  # best effort: one failed call must not abort the run
+            response_text = f"ERROR: {e}"
+            if verbose:
+                print(f"\n    [ERROR] {e}")
+        # Error text (raised here, or returned as "ERROR: ..." by the CLI callers) is
+        # never parsed: a capital letter in it would otherwise count as an answer.
+        answer = None if response_text.startswith("ERROR:") else parse_response(response_text)
+        if verbose and i == 0:  # show first permutation only
+            print(f"\n    [RAW] expected={expected} parsed={answer} response={repr(response_text[:200])}")
+        results.append({
+            "permutation": [LETTERS[p] for p in perm],
+            "expected": expected,
+            "answer": answer,
+            "correct": answer == expected,
+            "raw_response": response_text.strip()[:500],
+        })
+        time.sleep(0.5)  # rate limiting
+
+    score = sum(1 for r in results if r["correct"]) / len(results)
+    return {"label": label, "score": score, "results": results}
+
+
+def save_results(all_results, out_file):
+    """Overwrite out_file with all results so far (called after every question)."""
+    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+    with open(out_file, mode="w", encoding="utf-8") as sink:
+        sink.write(json.dumps(all_results, indent=2, ensure_ascii=False))
+
+
+def run_pilot(models, dry_run=False, verbose=False, ollama_model="qwen3:4b", no_think=False,
+              ollama_url="http://localhost:11434", openai_model="gpt-4o-mini",
+              mistral_model="mistral-large-latest", deepseek_model="deepseek-chat"):
+    """Run all spec questions against each model in `models`, then print a summary.
+    Results go to RESULTS_DIR/pilot-<local timestamp>.json, rewritten after every question.
+    With dry_run=True only first-permutation prompts are printed (application: anchor
+    variant only); nothing is sent, saved, or summarized."""
+    start_time = time.time()
+    specs = load_specs()
+    print(f"Loaded {len(specs)} anchor specs")
+    print(f"Models: {', '.join(models)}")
+    print(f"Temperature: {TEMPERATURE}")
+    if "openai" in models:
+        print(f"OpenAI model: {openai_model}")
+    if "mistral" in models:
+        print(f"Mistral model: {mistral_model}")
+    if "deepseek" in models:
+        print(f"DeepSeek model: {deepseek_model}")
+    if "ollama" in models:
+        print(f"Ollama model: {ollama_model}")
+        print(f"Ollama URL: {ollama_url}")
+        print(f"No-think: {no_think}")
+    print(f"Dry run: {dry_run}")
+    print()
+
+    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
+    out_file = RESULTS_DIR / f"pilot-{ts}.json"
+
+    all_results = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "config": {
+            "models": models,
+            "openai_model": openai_model if "openai" in models else None,
+            "mistral_model": mistral_model if "mistral" in models else None,
+            "deepseek_model": deepseek_model if "deepseek" in models else None,
+            "ollama_model": ollama_model if "ollama" in models else None,
+            "ollama_url": ollama_url if "ollama" in models else None,
+            "no_think": no_think if "ollama" in models else None,
+            "temperature": TEMPERATURE,
+        },
+        "models": {},
+    }
+
+    # Count total questions for progress display (same for every model, so done once)
+    total_q = 0
+    for spec in specs:
+        questions = spec.get("questions", {})
+        if "recognition" in questions: total_q += 1
+        if "application" in questions: total_q += 2  # anchor + paraphrase
+        if "consistency" in questions:
+            cons = questions["consistency"]
+            total_q += len(cons.get("variants", []))
+            if cons.get("language_variant"): total_q += 1
+
+    for model_name in models:
+        if model_name == "claude":
+            call_fn = call_claude_api
+        elif model_name == "claude-cli":
+            call_fn = call_claude_cli
+        elif model_name == "claude-haiku":
+            call_fn = call_claude_haiku
+        elif model_name == "openai":
+            call_fn = make_openai_caller(openai_model)
+        elif model_name == "mistral":
+            call_fn = make_mistral_caller(mistral_model)
+        elif model_name == "deepseek":
+            call_fn = make_deepseek_caller(deepseek_model)
+        elif model_name == "ollama":
+            call_fn = make_ollama_caller(ollama_model, no_think=no_think, base_url=ollama_url)
+        else:
+            print(f"Unknown model: {model_name}")
+            continue
+
+        print(f"=== {model_name.upper()} ({total_q} questions) ===")
+        model_results = []
+        all_results["models"][model_name] = model_results
+
+        def append_and_save(r):
+            model_results.append(r)
+            if not dry_run:
+                save_results(all_results, out_file)
+
+        for spec in specs:
+            anchor = spec["anchor"]
+            questions = spec.get("questions", {})
+
+            # Level 1: Recognition
+            if "recognition" in questions:
+                q = questions["recognition"]
+                if dry_run:
+                    prompt = build_prompt(q["question"], q["options"], POSITION_PERMUTATIONS[0])
+                    print(f"\n[DRY RUN] {anchor} / recognition:")
+                    print(prompt)
+                else:
+                    print(f"  [{len(model_results)+1}/{total_q}] {anchor} / recognition...", end=" ", flush=True)
+                    result = run_question(q, call_fn, f"{anchor}/recognition", verbose=verbose)
+                    print(f"{result['score']:.0%}")
+                    append_and_save(result)
+
+            # Level 2: Application (anchor variant)
+            if "application" in questions:
+                app = questions["application"]
+                anchor_q = {
+                    "question": f"{app['scenario'].strip()}\n{app['anchor_prompt']}",
+                    "options": app["options"],
+                    "correct": app["correct"],
+                }
+                para_q = {
+                    "question": f"{app['scenario'].strip()}\n{app['paraphrase_prompt']}",
+                    "options": app["options"],
+                    "correct": app["correct"],
+                }
+                if dry_run:
+                    prompt = build_prompt(anchor_q["question"], anchor_q["options"], POSITION_PERMUTATIONS[0])
+                    print(f"\n[DRY RUN] {anchor} / application (anchor):")
+                    print(prompt)
+                else:
+                    print(f"  [{len(model_results)+1}/{total_q}] {anchor} / application (anchor)...", end=" ", flush=True)
+                    result_a = run_question(anchor_q, call_fn, f"{anchor}/application-anchor", verbose=verbose)
+                    print(f"{result_a['score']:.0%}")
+                    append_and_save(result_a)
+
+                    print(f"  [{len(model_results)+1}/{total_q}] {anchor} / application (paraphrase)...", end=" ", flush=True)
+                    result_p = run_question(para_q, call_fn, f"{anchor}/application-paraphrase", verbose=verbose)
+                    print(f"{result_p['score']:.0%}")
+                    append_and_save(result_p)
+
+            # Level 4: Consistency
+            if "consistency" in questions:
+                cons = questions["consistency"]
+                variants = cons.get("variants", [])
+                lang = cons.get("language_variant")
+                if lang:
+                    variants = variants + [lang]
+
+                for i, variant in enumerate(variants):
+                    variant_q = {
+                        "question": variant,
+                        "options": cons["options"],
+                        "correct": cons["correct"],
+                    }
+                    variant_label = f"variant-{i+1}" if i < len(cons.get("variants", [])) else "language"
+                    if dry_run:
+                        prompt = build_prompt(variant_q["question"], variant_q["options"], POSITION_PERMUTATIONS[0])
+                        print(f"\n[DRY RUN] {anchor} / consistency ({variant_label}):")
+                        print(prompt)
+                    else:
+                        print(f"  [{len(model_results)+1}/{total_q}] {anchor} / consistency ({variant_label})...", end=" ", flush=True)
+                        result = run_question(variant_q, call_fn, f"{anchor}/consistency-{variant_label}", verbose=verbose)
+                        print(f"{result['score']:.0%}")
+                        append_and_save(result)
+
+    elapsed = time.time() - start_time
+    all_results["duration_seconds"] = round(elapsed, 1)
+
+    if not dry_run:
+        save_results(all_results, out_file)
+        print(f"\nResults saved to {out_file}")
+
+        # Summary
+        print("\n=== SUMMARY ===")
+        print(f"Models: {', '.join(models)}")
+        print(f"Temperature: {TEMPERATURE}")
+        if "openai" in models:
+            print(f"OpenAI: {openai_model}")
+        if "mistral" in models:
+            print(f"Mistral: {mistral_model}")
+        if "deepseek" in models:
+            print(f"DeepSeek: {deepseek_model}")
+        if "ollama" in models:
+            print(f"Ollama: {ollama_model} @ {ollama_url} (no-think={no_think})")
+        minutes, seconds = divmod(int(elapsed), 60)
+        print(f"Duration: {minutes}m {seconds}s")
+        print()
+        for model_name, results in all_results["models"].items():
+            scores = [r["score"] for r in results]
+            avg = sum(scores) / len(scores) if scores else 0
+            print(f"{model_name}: {avg:.0%} average ({len(scores)} questions)")
+            for r in results:
+                status = "✓" if r["score"] == 1.0 else f"{r['score']:.0%}"
+                print(f"  {r['label']}: {status}")
+
+
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser(description="Pilot evaluation runner")
+    cli.add_argument("--model", nargs="+", default=["claude-cli"],
+                     choices=["claude", "claude-cli", "claude-haiku", "openai", "mistral", "deepseek", "ollama"],
+                     help="Models to evaluate (default: claude-cli)")
+    cli.add_argument("--openai-model", default="gpt-4o-mini",
+                     help="OpenAI model name (default: gpt-4o-mini). Try: gpt-5, gpt-5-mini, gpt-4o")
+    cli.add_argument("--mistral-model", default="mistral-large-latest",
+                     help="Mistral model name (default: mistral-large-latest)")
+    cli.add_argument("--deepseek-model", default="deepseek-chat",
+                     help="DeepSeek model name (default: deepseek-chat)")
+    cli.add_argument("--ollama-model", default="qwen3:4b",
+                     help="Ollama model name (default: qwen3:4b)")
+    cli.add_argument("--ollama-url", default="http://localhost:11434",
+                     help="Ollama API base URL (default: http://localhost:11434)")
+    cli.add_argument("--temperature", type=float, default=0.0,
+                     help="Sampling temperature (default: 0.0). Note: claude-cli/claude-haiku ignore this.")
+    cli.add_argument("--no-think", action="store_true",
+                     help="Disable reasoning/thinking for Ollama models (faster, fewer tokens)")
+    cli.add_argument("--dry-run", action="store_true", help="Show prompts without sending")
+    cli.add_argument("--verbose", action="store_true", help="Print raw responses for debugging")
+    opts = cli.parse_args()
+    # The temperature is module-global state, so it is set before the run starts.
+    set_temperature(opts.temperature)
+    run_pilot(opts.model, dry_run=opts.dry_run, verbose=opts.verbose, ollama_model=opts.ollama_model,
+              no_think=opts.no_think, ollama_url=opts.ollama_url, openai_model=opts.openai_model,
+              mistral_model=opts.mistral_model, deepseek_model=opts.deepseek_model)
diff --git a/evaluations/report.html b/evaluations/report.html
new file mode 100644
index 0000000..0fd9fb2
--- /dev/null
+++ b/evaluations/report.html
@@ -0,0 +1,388 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Semantic Anchor Evaluation Report</title>
+<style>
+  * { margin: 0; padding: 0; box-sizing: border-box; } /* reset: every spacing value below is explicit */
+  body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; background: #f8fafc; color: #1e293b; padding: 2rem; }
+  h1 { font-size: 1.75rem; margin-bottom: 0.5rem; }
+  h2 { font-size: 1.25rem; margin: 2rem 0 1rem; border-bottom: 2px solid #e2e8f0; padding-bottom: 0.5rem; } /* section headings, underlined */
+  h3 { font-size: 1rem; margin: 1.5rem 0 0.5rem; color: #475569; }
+  .subtitle { color: #64748b; margin-bottom: 2rem; } /* one-line run description directly under the h1 */
+  .summary-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-bottom: 2rem; } /* one card per model; as many columns of at least 200px as fit */
+  .summary-card { background: white; border-radius: 0.5rem; padding: 1.25rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
+  .summary-card .model-name { font-weight: 600; font-size: 0.875rem; color: #475569; }
+  .summary-card .score { font-size: 2rem; font-weight: 700; margin: 0.5rem 0; } /* large overall percentage; its text color is set inline on the element */
+  .summary-card .detail { font-size: 0.75rem; color: #94a3b8; } /* small print: question count and result file name */
+  table { width: 100%; border-collapse: collapse; background: white; border-radius: 0.5rem; overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.1); margin-bottom: 1.5rem; } /* overflow: hidden lets the rounded corners clip the cell backgrounds */
+  th { background: #f1f5f9; padding: 0.625rem 0.75rem; text-align: left; font-weight: 600; font-size: 0.8125rem; color: #475569; border-bottom: 2px solid #e2e8f0; } /* model columns override text-align inline */
+  td { padding: 0.5rem 0.75rem; border-bottom: 1px solid #f1f5f9; font-size: 0.8125rem; }
+  tr:hover { background: #f8fafc; }
+  .score-cell { text-align: center; font-weight: 600; border-radius: 0.25rem; } /* heatmap cell; the green/yellow/red background is set inline per cell */
+  .anchor-group { font-weight: 600; background: #f8fafc; } /* row that names an anchor and precedes its per-question rows */
+  .question-label { padding-left: 1.5rem; color: #64748b; } /* indented first cell of a per-question row */
+  .check { color: #22c55e; } /* NOTE(review): not referenced by the markup visible in this part of the report - confirm it is still used */
+  .controls { opacity: 0.7; } /* NOTE(review): not referenced by the markup visible in this part of the report - confirm it is still used */
+  .legend { display: flex; gap: 1.5rem; margin: 1rem 0; font-size: 0.8125rem; } /* horizontal color key shown above the summary */
+  .legend-item { display: flex; align-items: center; gap: 0.375rem; }
+  .legend-dot { width: 12px; height: 12px; border-radius: 2px; } /* color swatch; its background is set inline */
+  .meta { background: #f1f5f9; border-radius: 0.5rem; padding: 1rem; font-size: 0.75rem; color: #64748b; margin-top: 2rem; } /* NOTE(review): presumably a run-metadata footer further down the report - confirm */
+  .meta dt { font-weight: 600; display: inline; } /* dt and dd are both inline, so each term/value pair flows on one line */
+  .meta dd { display: inline; margin-right: 1.5rem; }
+  .fail-list { margin-top: 1rem; } /* NOTE(review): presumably a list of failed questions further down the report - confirm */
+  .fail-item { display: flex; justify-content: space-between; padding: 0.25rem 0; font-size: 0.8125rem; border-bottom: 1px solid #f1f5f9; }
+</style>
+</head>
+<body>
+<h1>Semantic Anchor Evaluation Report</h1>
+<p class="subtitle">Multiple-choice recognition test across 3 LLMs — 191 questions, 61 anchors</p>
+
+<div class="legend">
+  <div class="legend-item"><div class="legend-dot" style="background:#dcfce7"></div> &ge;80%</div>
+  <div class="legend-item"><div class="legend-dot" style="background:#fef9c3"></div> 50–79%</div>
+  <div class="legend-item"><div class="legend-dot" style="background:#fee2e2"></div> &lt;50%</div>
+</div>
+
+<h2>Model Summary</h2>
+<div class="summary-grid">
+  <div class="summary-card">
+    <div class="model-name">Claude Sonnet</div>
+    <div class="score" style="color: #22c55e">99%</div>
+    <div class="detail">191 questions · pilot-20260324-174404.json</div>
+  </div>
+  <div class="summary-card">
+    <div class="model-name">GPT-4o</div>
+    <div class="score" style="color: #22c55e">98%</div>
+    <div class="detail">191 questions · pilot-20260324-192413.json</div>
+  </div>
+  <div class="summary-card">
+    <div class="model-name">Mistral Large</div>
+    <div class="score" style="color: #22c55e">96%</div>
+    <div class="detail">191 questions · pilot-20260324-190600.json</div>
+  </div>
+</div>
+
+<h2>Heatmap: Anchor × Model</h2>
+<table>
+<thead><tr>
+  <th>Anchor / Question</th>
+  <th style='text-align:center'>Claude Sonnet</th>
+  <th style='text-align:center'>GPT-4o</th>
+  <th style='text-align:center'>Mistral Large</th>
+</tr></thead>
+<tbody>
+<tr class="anchor-group"><td>adr-according-to-nygard</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">92%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr class="anchor-group"><td>arc42</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">consistency-language</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">consistency-variant-1</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">consistency-variant-2</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">consistency-variant-3</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>atam</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>bdd-given-when-then</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">83%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">50%</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>bem-methodology</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>bluf</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>c4-diagrams</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>chain-of-thought</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>clean-architecture</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>control-chart-shewhart</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">92%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>conventional-commits</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>cqrs</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>cynefin-framework</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>definition-of-done</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>devils-advocate</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>diataxis-framework</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>docs-as-code</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>domain-driven-design</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>ears-requirements</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">92%</td><td class="score-cell" style="background:#dcfce7">83%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr class="anchor-group"><td>event-driven-architecture</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>fagan-inspection</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>feynman-technique</td><td class="score-cell" style="background:#fef9c3">67%</td><td class="score-cell" style="background:#fef9c3">67%</td><td class="score-cell" style="background:#dcfce7">92%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#fee2e2">0%</td><td class="score-cell" style="background:#fee2e2">0%</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>five-whys</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>fowler-patterns</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>gherkin</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>github-flow</td><td class="score-cell" style="background:#dcfce7">92%</td><td class="score-cell" style="background:#dcfce7">92%</td><td class="score-cell" style="background:#dcfce7">92%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>gutes-deutsch-wolf-schneider</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>hexagonal-architecture</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>iec-61508-sil-levels</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">92%</td><td class="score-cell" style="background:#dcfce7">83%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">50%</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>impact-mapping</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>invest</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>iso-25010</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">83%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>jobs-to-be-done</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>lasr</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">92%</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#fee2e2">25%</td></tr>
+<tr class="anchor-group"><td>linddun</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>llm-evaluations</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>madr</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>mece</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>morphological-box</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>moscow</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">92%</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#fee2e2">25%</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>mutation-testing</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>nelson-rules</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>owasp-top-10</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>plain-english-strunk-white</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>prd</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">92%</td><td class="score-cell" style="background:#fef9c3">67%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#fee2e2">0%</td></tr>
+<tr class="anchor-group"><td>problem-space-nvc</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">83%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>property-based-testing</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">83%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>pyramid-principle</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>semantic-versioning</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">50%</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>socratic-method</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>sota</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>spc</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>stride</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>swot</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>tdd-chicago-school</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">92%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>tdd-london-school</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">89%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">consistency-language</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">consistency-variant-1</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">consistency-variant-2</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">consistency-variant-3</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">50%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">75%</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>testing-pyramid</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>timtowtdi</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>todotxt-flavoured-markdown</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">83%</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#fef9c3">50%</td></tr>
+<tr class="anchor-group"><td>user-story-mapping</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr class="anchor-group"><td>wardley-mapping</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-anchor</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">application-paraphrase</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+<tr><td class="question-label">recognition</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td><td class="score-cell" style="background:#dcfce7">✓</td></tr>
+</tbody></table>
+<h2>Control Questions</h2>
+<table class="controls">
+<thead><tr><th>Control</th><th style='text-align:center'>Claude Sonnet</th><th style='text-align:center'>GPT-4o</th><th style='text-align:center'>Mistral Large</th></tr></thead>
+<tbody>
+<tr><td>negative-control</td><td class="score-cell" style="background:#dcfce7">100%</td><td class="score-cell" style="background:#dcfce7">100%</td><td class="score-cell" style="background:#fef9c3">75%</td></tr>
+<tr><td>sanity-check</td><td class="score-cell" style="background:#dcfce7">0%</td><td class="score-cell" style="background:#dcfce7">0%</td><td class="score-cell" style="background:#dcfce7">0%</td></tr>
+</tbody></table>
+<h2>Failures Detail</h2>
+<h3>Claude Sonnet: 2 failures</h3>
+<div class="fail-list">
+<div class="fail-item"><span>feynman-technique/application-paraphrase</span><span style="color:#ef4444;font-weight:600">0%</span></div>
+<div class="fail-item"><span>github-flow/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+</div>
+<h3>GPT-4o: 13 failures</h3>
+<div class="fail-list">
+<div class="fail-item"><span>control-chart-shewhart/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>ears-requirements/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>feynman-technique/application-paraphrase</span><span style="color:#ef4444;font-weight:600">0%</span></div>
+<div class="fail-item"><span>github-flow/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>iec-61508-sil-levels/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>lasr/recognition</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>moscow/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>prd/recognition</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>property-based-testing/application-anchor</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>property-based-testing/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>tdd-chicago-school/recognition</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>tdd-london-school/consistency-variant-3</span><span style="color:#eab308;font-weight:600">50%</span></div>
+<div class="fail-item"><span>tdd-london-school/recognition</span><span style="color:#eab308;font-weight:600">75%</span></div>
+</div>
+<h3>Mistral Large: 17 failures</h3>
+<div class="fail-list">
+<div class="fail-item"><span>adr-according-to-nygard/recognition</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>bdd-given-when-then/application-paraphrase</span><span style="color:#eab308;font-weight:600">50%</span></div>
+<div class="fail-item"><span>ears-requirements/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>ears-requirements/recognition</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>feynman-technique/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>github-flow/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>iec-61508-sil-levels/application-anchor</span><span style="color:#eab308;font-weight:600">50%</span></div>
+<div class="fail-item"><span>iso-25010/application-anchor</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>iso-25010/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>lasr/recognition</span><span style="color:#ef4444;font-weight:600">25%</span></div>
+<div class="fail-item"><span>moscow/application-paraphrase</span><span style="color:#ef4444;font-weight:600">25%</span></div>
+<div class="fail-item"><span>prd/recognition</span><span style="color:#ef4444;font-weight:600">0%</span></div>
+<div class="fail-item"><span>problem-space-nvc/application-anchor</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>problem-space-nvc/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>semantic-versioning/application-anchor</span><span style="color:#eab308;font-weight:600">50%</span></div>
+<div class="fail-item"><span>semantic-versioning/application-paraphrase</span><span style="color:#eab308;font-weight:600">75%</span></div>
+<div class="fail-item"><span>todotxt-flavoured-markdown/recognition</span><span style="color:#eab308;font-weight:600">50%</span></div>
+</div>
+
+<div class="meta">
+<h3>Run Metadata</h3>
+<dl>
+<dt>Claude Sonnet:</dt><dd>pilot-20260324-174404.json · 81m 2s · 2026-03-24T17:44:04</dd><br><dt>GPT-4o:</dt><dd>pilot-20260324-192413.json · 15m 38s · 2026-03-24T19:24:13</dd><br><dt>Mistral Large:</dt><dd>pilot-20260324-190600.json · 16m 58s · 2026-03-24T19:06:00</dd><br>
+</dl>
+<p style="margin-top:0.75rem">Generated by <code>evaluations/generate-report.py</code> · Position bias mitigation: 4 permutations per question · Scoring: deterministic MC (no LLM judge)</p>
+</div>
+
+</body>
+</html>
\ No newline at end of file
diff --git a/evaluations/results/pilot-20260324-174404.json b/evaluations/results/pilot-20260324-174404.json
new file mode 100644
index 0000000..110f33a
--- /dev/null
+++ b/evaluations/results/pilot-20260324-174404.json
@@ -0,0 +1,10442 @@
+{
+  "timestamp": "2026-03-24T17:44:04.891380+00:00",
+  "config": {
+    "models": [
+      "claude"
+    ],
+    "openai_model": null,
+    "mistral_model": null,
+    "deepseek_model": null,
+    "ollama_model": null,
+    "ollama_url": null,
+    "no_think": null,
+    "temperature": 0.0
+  },
+  "models": {
+    "claude": [
+      {
+        "label": "adr-according-to-nygard/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "adr-according-to-nygard/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "adr-according-to-nygard/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "arc42/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-variant-1",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-variant-2",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-variant-3",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-language",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "atam/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "atam/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "atam/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bdd-given-when-then/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bdd-given-when-then/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bdd-given-when-then/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bem-methodology/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bem-methodology/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bem-methodology/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bluf/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bluf/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bluf/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "c4-diagrams/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "c4-diagrams/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "c4-diagrams/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "chain-of-thought/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "chain-of-thought/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "chain-of-thought/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "clean-architecture/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "clean-architecture/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "clean-architecture/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "control-chart-shewhart/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "control-chart-shewhart/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "control-chart-shewhart/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "conventional-commits/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "conventional-commits/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "conventional-commits/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cqrs/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cqrs/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cqrs/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cynefin-framework/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cynefin-framework/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cynefin-framework/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "definition-of-done/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "definition-of-done/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "definition-of-done/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "devils-advocate/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "devils-advocate/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "devils-advocate/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "diataxis-framework/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "diataxis-framework/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "diataxis-framework/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "docs-as-code/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "docs-as-code/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "docs-as-code/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "domain-driven-design/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "domain-driven-design/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "domain-driven-design/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "ears-requirements/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "ears-requirements/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "ears-requirements/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "event-driven-architecture/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "event-driven-architecture/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "event-driven-architecture/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fagan-inspection/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fagan-inspection/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fagan-inspection/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "feynman-technique/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "feynman-technique/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "feynman-technique/application-paraphrase",
+        "score": 0.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D"
+          }
+        ]
+      },
+      {
+        "label": "five-whys/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "five-whys/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "five-whys/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fowler-patterns/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fowler-patterns/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fowler-patterns/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gherkin/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "gherkin/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gherkin/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "github-flow/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "github-flow/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "github-flow/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gutes-deutsch-wolf-schneider/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gutes-deutsch-wolf-schneider/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "hexagonal-architecture/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "hexagonal-architecture/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "hexagonal-architecture/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iec-61508-sil-levels/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iec-61508-sil-levels/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iec-61508-sil-levels/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "impact-mapping/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "impact-mapping/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "impact-mapping/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "invest/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "invest/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "invest/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iso-25010/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iso-25010/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iso-25010/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "jobs-to-be-done/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "jobs-to-be-done/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "jobs-to-be-done/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "lasr/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "lasr/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "lasr/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "linddun/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "linddun/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "linddun/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "llm-evaluations/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "llm-evaluations/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "llm-evaluations/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "madr/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "madr/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "madr/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mece/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mece/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mece/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "morphological-box/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "morphological-box/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "morphological-box/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "moscow/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "moscow/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "moscow/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mutation-testing/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mutation-testing/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mutation-testing/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "negative-control/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          }
+        ]
+      },
+      {
+        "label": "nelson-rules/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "nelson-rules/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "nelson-rules/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "owasp-top-10/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "owasp-top-10/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "owasp-top-10/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "plain-english-strunk-white/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "plain-english-strunk-white/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "plain-english-strunk-white/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "prd/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "prd/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "prd/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "problem-space-nvc/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "problem-space-nvc/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "problem-space-nvc/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "property-based-testing/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "property-based-testing/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "property-based-testing/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "pyramid-principle/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "pyramid-principle/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "pyramid-principle/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sanity-check/recognition",
+        "score": 0.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "X",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "X",
+            "answer": null,
+            "correct": false,
+            "raw_response": "None of the options provided match the famous answer from"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "X",
+            "answer": null,
+            "correct": false,
+            "raw_response": "None of the options provided match the correct answer from"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "X",
+            "answer": null,
+            "correct": false,
+            "raw_response": "None of the options provided match the famous answer from"
+          }
+        ]
+      },
+      {
+        "label": "semantic-versioning/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "semantic-versioning/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "semantic-versioning/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "socratic-method/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "socratic-method/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "socratic-method/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sota/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sota/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sota/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "spc/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "spc/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "spc/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "stride/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "stride/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "stride/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "swot/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "swot/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "swot/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-chicago-school/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-chicago-school/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-chicago-school/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-variant-1",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-variant-2",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-variant-3",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-language",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "testing-pyramid/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "testing-pyramid/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "testing-pyramid/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "timtowtdi/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "timtowtdi/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "timtowtdi/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "todotxt-flavoured-markdown/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "todotxt-flavoured-markdown/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "todotxt-flavoured-markdown/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "user-story-mapping/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "user-story-mapping/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "user-story-mapping/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "wardley-mapping/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "wardley-mapping/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "wardley-mapping/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      }
+    ]
+  },
+  "duration_seconds": 4862.1
+}
\ No newline at end of file
diff --git a/evaluations/results/pilot-20260324-190600.json b/evaluations/results/pilot-20260324-190600.json
new file mode 100644
index 0000000..613af48
--- /dev/null
+++ b/evaluations/results/pilot-20260324-190600.json
@@ -0,0 +1,10442 @@
+{
+  "timestamp": "2026-03-24T19:06:00.394684+00:00",
+  "config": {
+    "models": [
+      "mistral"
+    ],
+    "openai_model": null,
+    "mistral_model": "mistral-large-latest",
+    "deepseek_model": null,
+    "ollama_model": null,
+    "ollama_url": null,
+    "no_think": null,
+    "temperature": 0.0
+  },
+  "models": {
+    "mistral": [
+      {
+        "label": "adr-according-to-nygard/recognition",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "adr-according-to-nygard/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "adr-according-to-nygard/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "arc42/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-variant-1",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-variant-2",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-variant-3",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-language",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "atam/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "atam/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "atam/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bdd-given-when-then/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bdd-given-when-then/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bdd-given-when-then/application-paraphrase",
+        "score": 0.5,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": null,
+            "correct": false,
+            "raw_response": "ERROR: Error code: 429 - {'object': 'error', 'message': 'Rate limit exceeded', 'type': 'rate_limited', 'param': None, 'code': '1300', 'raw_status_code': 429}"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": null,
+            "correct": false,
+            "raw_response": "ERROR: Error code: 429 - {'object': 'error', 'message': 'Rate limit exceeded', 'type': 'rate_limited', 'param': None, 'code': '1300', 'raw_status_code': 429}"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bem-methodology/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bem-methodology/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bem-methodology/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bluf/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bluf/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bluf/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "c4-diagrams/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "c4-diagrams/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "c4-diagrams/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "chain-of-thought/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "chain-of-thought/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "chain-of-thought/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "clean-architecture/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "clean-architecture/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "clean-architecture/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "control-chart-shewhart/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "control-chart-shewhart/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "control-chart-shewhart/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "conventional-commits/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "conventional-commits/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "conventional-commits/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cqrs/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cqrs/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cqrs/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cynefin-framework/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cynefin-framework/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cynefin-framework/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "definition-of-done/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "definition-of-done/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "definition-of-done/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "devils-advocate/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "devils-advocate/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "devils-advocate/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "diataxis-framework/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "diataxis-framework/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "diataxis-framework/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "docs-as-code/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "docs-as-code/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "docs-as-code/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "domain-driven-design/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "domain-driven-design/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "domain-driven-design/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "ears-requirements/recognition",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "ears-requirements/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "ears-requirements/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "event-driven-architecture/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "event-driven-architecture/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "event-driven-architecture/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fagan-inspection/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fagan-inspection/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fagan-inspection/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "feynman-technique/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "feynman-technique/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "feynman-technique/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "five-whys/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "five-whys/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "five-whys/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fowler-patterns/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fowler-patterns/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fowler-patterns/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gherkin/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "gherkin/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gherkin/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "github-flow/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "github-flow/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "github-flow/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gutes-deutsch-wolf-schneider/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gutes-deutsch-wolf-schneider/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "hexagonal-architecture/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "hexagonal-architecture/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "hexagonal-architecture/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iec-61508-sil-levels/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iec-61508-sil-levels/application-anchor",
+        "score": 0.5,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D"
+          }
+        ]
+      },
+      {
+        "label": "iec-61508-sil-levels/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "impact-mapping/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "impact-mapping/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "impact-mapping/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "invest/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "invest/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "invest/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iso-25010/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iso-25010/application-anchor",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iso-25010/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "jobs-to-be-done/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "jobs-to-be-done/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "jobs-to-be-done/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "lasr/recognition",
+        "score": 0.25,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          }
+        ]
+      },
+      {
+        "label": "lasr/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "lasr/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "linddun/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "linddun/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "linddun/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "llm-evaluations/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "llm-evaluations/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "llm-evaluations/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "madr/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "madr/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "madr/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mece/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mece/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mece/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "morphological-box/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "morphological-box/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "morphological-box/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "moscow/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "moscow/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "moscow/application-paraphrase",
+        "score": 0.25,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "mutation-testing/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mutation-testing/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mutation-testing/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "negative-control/recognition",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "D",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          }
+        ]
+      },
+      {
+        "label": "nelson-rules/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "nelson-rules/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "nelson-rules/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "owasp-top-10/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "owasp-top-10/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "owasp-top-10/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "plain-english-strunk-white/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "plain-english-strunk-white/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "plain-english-strunk-white/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "prd/recognition",
+        "score": 0.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          }
+        ]
+      },
+      {
+        "label": "prd/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "prd/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "problem-space-nvc/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "problem-space-nvc/application-anchor",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "problem-space-nvc/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "property-based-testing/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "property-based-testing/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "property-based-testing/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "pyramid-principle/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "pyramid-principle/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "pyramid-principle/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sanity-check/recognition",
+        "score": 0.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "X",
+            "answer": null,
+            "correct": false,
+            "raw_response": "None of the above options is correct, but the"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "X",
+            "answer": null,
+            "correct": false,
+            "raw_response": "None of the options provided is correct, but the"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "X",
+            "answer": null,
+            "correct": false,
+            "raw_response": "None of the options provided is correct, but the"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "X",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "semantic-versioning/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "semantic-versioning/application-anchor",
+        "score": 0.5,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D"
+          }
+        ]
+      },
+      {
+        "label": "semantic-versioning/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D"
+          }
+        ]
+      },
+      {
+        "label": "socratic-method/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "socratic-method/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "socratic-method/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sota/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sota/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sota/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "spc/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "spc/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "spc/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "stride/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "None of the options perfectly describe the **STRIDE"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "stride/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "stride/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "swot/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "swot/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) SWOT Analysis\nB) Value Chain"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "swot/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-chicago-school/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-chicago-school/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-chicago-school/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-variant-1",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-variant-2",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-variant-3",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-language",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "testing-pyramid/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "testing-pyramid/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "testing-pyramid/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "timtowtdi/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "timtowtdi/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "timtowtdi/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "todotxt-flavoured-markdown/recognition",
+        "score": 0.5,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "todotxt-flavoured-markdown/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "todotxt-flavoured-markdown/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "user-story-mapping/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "user-story-mapping/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "user-story-mapping/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "wardley-mapping/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "wardley-mapping/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "wardley-mapping/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      }
+    ]
+  },
+  "duration_seconds": 1018.4
+}
\ No newline at end of file
diff --git a/evaluations/results/pilot-20260324-192413.json b/evaluations/results/pilot-20260324-192413.json
new file mode 100644
index 0000000..8c5f85e
--- /dev/null
+++ b/evaluations/results/pilot-20260324-192413.json
@@ -0,0 +1,10442 @@
+{
+  "timestamp": "2026-03-24T19:24:13.551875+00:00",
+  "config": {
+    "models": [
+      "openai"
+    ],
+    "openai_model": "gpt-4o",
+    "mistral_model": null,
+    "deepseek_model": null,
+    "ollama_model": null,
+    "ollama_url": null,
+    "no_think": null,
+    "temperature": 0.0
+  },
+  "models": {
+    "openai": [
+      {
+        "label": "adr-according-to-nygard/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "adr-according-to-nygard/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "adr-according-to-nygard/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "arc42/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) A 12-section template for standardized software"
+          }
+        ]
+      },
+      {
+        "label": "arc42/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) External interfaces in Section 3 (Context"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-variant-1",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Gernot Starke"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Gernot Starke"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Gernot Starke"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Gernot Starke"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-variant-2",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Gernot Starke"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Gernot Starke"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Gernot Starke"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Gernot Starke"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-variant-3",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Gernot Starke"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Gernot Starke"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Gernot Starke"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Gernot Starke"
+          }
+        ]
+      },
+      {
+        "label": "arc42/consistency-language",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "atam/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "atam/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "atam/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Build a utility tree to prioritize quality scenarios"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Build a utility tree to prioritize quality scenarios"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Build a utility tree to prioritize quality scenarios"
+          }
+        ]
+      },
+      {
+        "label": "bdd-given-when-then/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bdd-given-when-then/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bdd-given-when-then/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Organize discovery workshops with the three amigos"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bem-methodology/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bem-methodology/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Use structured class names like `.navbar`,"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Use structured class names like `.navbar`,"
+          }
+        ]
+      },
+      {
+        "label": "bem-methodology/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Use structured class names like `.navbar`,"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Use structured class names like `.navbar`,"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Use structured class names like `.navbar`,"
+          }
+        ]
+      },
+      {
+        "label": "bluf/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bluf/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "bluf/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "c4-diagrams/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Four levels of abstraction; : system in"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Four levels of abstraction; : system in"
+          }
+        ]
+      },
+      {
+        "label": "c4-diagrams/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "c4-diagrams/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "chain-of-thought/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "chain-of-thought/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "chain-of-thought/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "clean-architecture/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "clean-architecture/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Define payment processing use cases in the core"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "clean-architecture/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "control-chart-shewhart/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Measured value plotted over time; process"
+          }
+        ]
+      },
+      {
+        "label": "control-chart-shewhart/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "control-chart-shewhart/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "conventional-commits/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "conventional-commits/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "conventional-commits/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cqrs/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cqrs/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cqrs/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Create separate optimized data models: a normalized"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Create separate optimized data models: a normalized"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Create separate optimized data models: a normalized"
+          }
+        ]
+      },
+      {
+        "label": "cynefin-framework/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Five domains; : best practices apply,"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Five domains; : best practices apply,"
+          }
+        ]
+      },
+      {
+        "label": "cynefin-framework/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "cynefin-framework/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "definition-of-done/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "definition-of-done/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "definition-of-done/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Collaboratively create a single, team-wide"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "devils-advocate/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Present opposing viewpoints even if not personally held"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "devils-advocate/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "devils-advocate/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Systematically argue against your own design by"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "diataxis-framework/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Four documentation types; : learning-oriented,"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "diataxis-framework/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Develop four distinct documentation sections: beginner tutorials"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "diataxis-framework/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Develop four distinct documentation sections: beginner tutorials"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Develop four distinct documentation sections: beginner tutorials"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "docs-as-code/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "docs-as-code/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "docs-as-code/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "domain-driven-design/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "domain-driven-design/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "domain-driven-design/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Establish a ubiquitous language by working closely with"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "ears-requirements/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "ears-requirements/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Structure requirements using specific templates: 'The"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Structure requirements using specific templates: 'The"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "ears-requirements/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Structure requirements using specific templates: 'The"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B) Create a comprehensive requirements specification document with functional"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Structure requirements using specific templates: 'The"
+          }
+        ]
+      },
+      {
+        "label": "event-driven-architecture/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "event-driven-architecture/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "event-driven-architecture/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "fagan-inspection/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fagan-inspection/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fagan-inspection/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "feynman-technique/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "feynman-technique/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "feynman-technique/application-paraphrase",
+        "score": 0.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          }
+        ]
+      },
+      {
+        "label": "five-whys/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "five-whys/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "five-whys/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fowler-patterns/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "fowler-patterns/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "fowler-patterns/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Use a Domain Model pattern for complex business"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Use a Domain Model pattern for complex business"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Use a Domain Model pattern for complex business"
+          }
+        ]
+      },
+      {
+        "label": "gherkin/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "gherkin/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gherkin/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "github-flow/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Workflow steps"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Workflow steps"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Workflow steps"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "github-flow/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "github-flow/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "gutes-deutsch-wolf-schneider/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gutes-deutsch-wolf-schneider/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "gutes-deutsch-wolf-schneider/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "hexagonal-architecture/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "hexagonal-architecture/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "hexagonal-architecture/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iec-61508-sil-levels/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iec-61508-sil-levels/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iec-61508-sil-levels/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D"
+          }
+        ]
+      },
+      {
+        "label": "impact-mapping/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Goal → Actors → Impacts → Deliver"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "impact-mapping/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Map the retention goal to key actors ("
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Map the retention goal to key actors ("
+          }
+        ]
+      },
+      {
+        "label": "impact-mapping/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Map the retention goal to key actors ("
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "invest/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "invest/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "invest/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Split this into multiple smaller stories with specific"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Split this into multiple smaller stories with specific"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "iso-25010/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "iso-25010/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "iso-25010/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "jobs-to-be-done/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "jobs-to-be-done/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Interview users about the specific circumstances that led"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "jobs-to-be-done/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Interview users about the specific circumstances that led"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Interview users about the specific circumstances that led"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Interview users about the specific circumstances that led"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Interview users about the specific circumstances that led"
+          }
+        ]
+      },
+      {
+        "label": "lasr/recognition",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          }
+        ]
+      },
+      {
+        "label": "lasr/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "lasr/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "linddun/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "linddun/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "linddun/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Systematically analyze the system against seven privacy"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Systematically analyze the system against seven privacy"
+          }
+        ]
+      },
+      {
+        "label": "llm-evaluations/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "llm-evaluations/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "llm-evaluations/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "madr/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "madr/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Document the decision with sections for context,"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Document the decision with sections for context,"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "madr/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Document the decision with sections for context,"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "mece/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Structuring categories so they do not overlap"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mece/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Organize by business capability: User Service"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Organize by business capability: User Service"
+          }
+        ]
+      },
+      {
+        "label": "mece/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Organize by business capability: User Service"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Organize by business capability: User Service"
+          }
+        ]
+      },
+      {
+        "label": "morphological-box/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Break complex problem into independent parameters/dim"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Break complex problem into independent parameters/dim"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Break complex problem into independent parameters/dim"
+          }
+        ]
+      },
+      {
+        "label": "morphological-box/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "morphological-box/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "moscow/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "moscow/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "moscow/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mutation-testing/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mutation-testing/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "mutation-testing/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "negative-control/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          }
+        ]
+      },
+      {
+        "label": "nelson-rules/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "nelson-rules/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "nelson-rules/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "owasp-top-10/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "owasp-top-10/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "owasp-top-10/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "plain-english-strunk-white/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "plain-english-strunk-white/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "plain-english-strunk-white/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "prd/recognition",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          }
+        ]
+      },
+      {
+        "label": "prd/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "prd/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Write a comprehensive document that defines the problem"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Write a comprehensive document that defines the problem"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "problem-space-nvc/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Concrete, objective facts without evaluation or judgment"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "problem-space-nvc/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "problem-space-nvc/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "property-based-testing/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Invariants that should always hold; automatic"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "property-based-testing/application-anchor",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Define mathematical invariants like 'interest calculations"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "property-based-testing/application-paraphrase",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D) Write comprehensive unit tests covering typical financial scenarios"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Define mathematical invariants like 'interest calculations"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "pyramid-principle/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "pyramid-principle/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "pyramid-principle/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sanity-check/recognition",
+        "score": 0.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "X",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "None of the options provided are correct. The Answer"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "X",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "None of the options provided correspond to the Answer to"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "X",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "None of the options provided correspond to the Answer to"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "X",
+            "answer": "A",
+            "correct": false,
+            "raw_response": "None of the options provided correspond to the Answer to"
+          }
+        ]
+      },
+      {
+        "label": "semantic-versioning/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "semantic-versioning/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) 3.0.0 - because"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "semantic-versioning/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "socratic-method/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "socratic-method/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "socratic-method/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sota/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sota/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "sota/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "spc/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "spc/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "spc/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "stride/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "None of the options provided accurately describe the STRIDE"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "stride/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "stride/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "swot/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "swot/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Analyze internal strengths and weaknesses of your current"
+          }
+        ]
+      },
+      {
+        "label": "swot/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Analyze internal strengths and weaknesses of your current"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "tdd-chicago-school/recognition",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-chicago-school/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "tdd-chicago-school/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Begin with tests for the core pricing calculations"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/recognition",
+        "score": 0.75,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "D",
+            "correct": false,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Write a test that mocks PaymentGateway and"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-variant-1",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Steve Freeman"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Steve Freeman"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Steve Freeman"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Steve Freeman"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-variant-2",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Steve Freeman"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Steve Freeman"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Steve Freeman"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Steve Freeman"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-variant-3",
+        "score": 0.5,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "C",
+            "correct": false,
+            "raw_response": "C) Dan North"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "B",
+            "correct": false,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Steve Freeman"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C) Steve Freeman"
+          }
+        ]
+      },
+      {
+        "label": "tdd-london-school/consistency-language",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D) Steve Freeman"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "testing-pyramid/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Three layers; more unit tests, fewer"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A) Three layers; more unit tests, fewer"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "testing-pyramid/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "testing-pyramid/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "timtowtdi/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          }
+        ]
+      },
+      {
+        "label": "timtowtdi/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "timtowtdi/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "todotxt-flavoured-markdown/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "todotxt-flavoured-markdown/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "todotxt-flavoured-markdown/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "user-story-mapping/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "user-story-mapping/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "user-story-mapping/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A)"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "wardley-mapping/recognition",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C)"
+          }
+        ]
+      },
+      {
+        "label": "wardley-mapping/application-anchor",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B)"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      },
+      {
+        "label": "wardley-mapping/application-paraphrase",
+        "score": 1.0,
+        "results": [
+          {
+            "permutation": [
+              "A",
+              "B",
+              "C",
+              "D"
+            ],
+            "expected": "B",
+            "answer": "B",
+            "correct": true,
+            "raw_response": "B) Map the payment processing component's position on"
+          },
+          {
+            "permutation": [
+              "B",
+              "C",
+              "D",
+              "A"
+            ],
+            "expected": "A",
+            "answer": "A",
+            "correct": true,
+            "raw_response": "A"
+          },
+          {
+            "permutation": [
+              "C",
+              "D",
+              "A",
+              "B"
+            ],
+            "expected": "D",
+            "answer": "D",
+            "correct": true,
+            "raw_response": "D)"
+          },
+          {
+            "permutation": [
+              "D",
+              "A",
+              "B",
+              "C"
+            ],
+            "expected": "C",
+            "answer": "C",
+            "correct": true,
+            "raw_response": "C"
+          }
+        ]
+      }
+    ]
+  },
+  "duration_seconds": 938.4
+}
\ No newline at end of file
diff --git a/evaluations/specs/adr-according-to-nygard.yaml b/evaluations/specs/adr-according-to-nygard.yaml
new file mode 100644
index 0000000..95a2e80
--- /dev/null
+++ b/evaluations/specs/adr-according-to-nygard.yaml
@@ -0,0 +1,39 @@
+anchor: adr-according-to-nygard
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "ADR according to Nygard"?
+
+      '
+    options:
+      A: Comprehensive architecture documentation with twelve standardized sections
+        covering context, building blocks, runtime, and deployment views
+      B: Lightweight records that capture a single architecture decision with its
+        context, decision, status, and consequences in a short, focused format
+      C: Collaborative review process where stakeholders evaluate architecture tradeoffs
+        through scenario-based analysis and quality attribute workshops
+      D: Visual modeling technique that represents software architecture at four levels
+        of abstraction from system context down to code
+    correct: B
+  application:
+    scenario: Your team is building a microservices platform and needs to choose between
+      REST APIs and GraphQL for service communication. The decision involves trade-offs
+      between performance, complexity, team expertise, and future scalability requirements.
+    anchor_prompt: using ADR according to Nygard
+    paraphrase_prompt: How should you document this architectural choice to ensure
+      future team members understand the reasoning and can make informed decisions
+      about related architecture changes?
+    options:
+      A: Create a comprehensive design document with detailed technical specifications,
+        implementation guidelines, and performance benchmarks that can be updated
+        as requirements change.
+      B: Write a concise record with the decision title, current status, context explaining
+        the forces at play, the chosen solution, and both positive and negative consequences,
+        then store it immutably with the codebase.
+      C: Document the decision in the project wiki with a detailed comparison matrix,
+        stakeholder approval signatures, and a change log for future modifications
+        to the decision.
+      D: Prepare a formal architecture review presentation covering the evaluation
+        criteria, alternative solutions considered, and implementation roadmap, then
+        archive it in the project repository.
+    correct: B
diff --git a/evaluations/specs/arc42.yaml b/evaluations/specs/arc42.yaml
new file mode 100644
index 0000000..7532be9
--- /dev/null
+++ b/evaluations/specs/arc42.yaml
@@ -0,0 +1,40 @@
+anchor: arc42
+tier: 3
+
+questions:
+  recognition:
+    question: |
+      Which of the following best describes "arc42"?
+    options:
+      A: A 12-section template for standardized software architecture documentation
+      B: A framework for automated architecture compliance checking in CI pipelines
+      C: A UML-based notation for modeling software components and connectors
+      D: A methodology for evaluating architecture trade-offs against quality attributes
+    correct: A
+
+  application:
+    scenario: |
+      You are starting architecture documentation for a new microservices project.
+      The team needs to understand the system's external interfaces and deployment.
+      Where do you document these two concerns?
+    anchor_prompt: "following arc42"
+    paraphrase_prompt: "Create structured architecture documentation covering all relevant aspects"
+    options:
+      A: Both in a single Architecture Overview chapter
+      B: External interfaces in Section 3 (Context and Scope), deployment in Section 7 (Deployment View)
+      C: External interfaces in the API specification, deployment in the runbook
+      D: Both in Section 5 (Building Block View) at different abstraction levels
+    correct: B
+
+  consistency:
+    variants:
+      - 'Which of the following persons is most closely associated with "arc42"?'
+      - 'Who created the arc42 architecture documentation template?'
+      - 'Which author is known for the arc42 documentation framework?'
+    language_variant: 'Welche Person wird am engsten mit "arc42" assoziiert?'
+    options:
+      A: Martin Fowler
+      B: Gernot Starke
+      C: Robert C. Martin
+      D: Simon Brown
+    correct: B
diff --git a/evaluations/specs/atam.yaml b/evaluations/specs/atam.yaml
new file mode 100644
index 0000000..a04100b
--- /dev/null
+++ b/evaluations/specs/atam.yaml
@@ -0,0 +1,38 @@
+anchor: atam
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "ATAM"?
+
+      '
+    options:
+      A: Systematic documentation framework for software architecture decisions using
+        standardized templates that capture context, decision drivers, and consequences
+      B: Iterative risk-driven methodology for evaluating software architectures through
+        stakeholder workshops that identify risks, sensitivity points, and tradeoffs
+        using scenario-based analysis
+      C: Comprehensive template-based approach for documenting software architecture
+        across twelve standardized sections from context to deployment
+      D: Structured code review process with defined roles and phases that inspects
+        software artifacts against checklists and specifications
+    correct: B
+  application:
+    scenario: Your team is designing a financial trading platform where stakeholders
+      demand sub-100ms response times for trade execution, 99.99% uptime, and bank-level
+      security compliance. The architecture team has proposed using microservices
+      with event sourcing, but concerns have been raised about whether this approach
+      can simultaneously meet all quality requirements.
+    anchor_prompt: using ATAM
+    paraphrase_prompt: What systematic approach would best help evaluate whether the
+      proposed architecture can achieve the conflicting quality requirements?
+    options:
+      A: Conduct load testing on a prototype implementation to measure actual performance
+        metrics and identify bottlenecks before making architectural decisions.
+      B: Build a utility tree to prioritize quality scenarios, then analyze how microservices
+        and event sourcing create tradeoff points between performance, availability,
+        and security.
+      C: Create detailed architecture documentation with UML diagrams and have senior
+        architects review the design against established enterprise patterns.
+      D: Implement proof-of-concept services for critical components and run security
+        penetration tests to validate compliance requirements early.
+    correct: B
diff --git a/evaluations/specs/bdd-given-when-then.yaml b/evaluations/specs/bdd-given-when-then.yaml
new file mode 100644
index 0000000..4dc2139
--- /dev/null
+++ b/evaluations/specs/bdd-given-when-then.yaml
@@ -0,0 +1,44 @@
+anchor: bdd-given-when-then
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "BDD (Behavior-Driven Development)"?
+
+      '
+    options:
+      A: Test-first development approach where unit tests are written before implementation
+        code, focusing on isolated component testing with mock objects to verify interactions
+        between system boundaries
+      B: Structured scenario format — Given a precondition, When an action occurs,
+        Then an expected outcome results; concrete examples as executable specifications
+        that define system behavior
+      C: Agile requirement gathering technique that maps user activities and tasks
+        in chronological order to identify system features and prioritize development
+        based on user journey workflows
+      D: Iterative testing methodology that emphasizes state-based verification through
+        direct assertions on system outputs, promoting comprehensive test coverage
+        without external dependencies or test doubles
+    correct: B
+  application:
+    scenario: Your team is building an e-commerce checkout system where business stakeholders
+      are concerned about complex discount rules and payment validation logic. The
+      product owner, developers, and QA engineers have different interpretations of
+      how promotional codes should work with various payment methods.
+    anchor_prompt: using BDD (Behavior-Driven Development)
+    paraphrase_prompt: to ensure all stakeholders share the same understanding of
+      system behavior and create executable documentation
+    options:
+      A: Write detailed technical specifications first, then have developers implement
+        unit tests that verify the discount calculation algorithms work correctly
+        for each payment method combination.
+      B: Organize discovery workshops with the three amigos to write Given-When-Then
+        scenarios like 'Given a customer has a 20% discount code, When they checkout
+        with a credit card, Then the discount applies before payment processing' that
+        become executable tests.
+      C: Create comprehensive user stories with acceptance criteria, then have QA
+        engineers write end-to-end test scripts that validate the complete checkout
+        workflow from the user interface perspective.
+      D: Develop a prototype of the checkout system quickly, then gather feedback
+        from stakeholders through usability testing sessions to refine the discount
+        and payment features iteratively.
+    correct: B
diff --git a/evaluations/specs/bem-methodology.yaml b/evaluations/specs/bem-methodology.yaml
new file mode 100644
index 0000000..e6a7dfd
--- /dev/null
+++ b/evaluations/specs/bem-methodology.yaml
@@ -0,0 +1,42 @@
+anchor: bem-methodology
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "BEM Methodology"?
+
+      '
+    options:
+      A: A software architecture pattern that separates business logic, event handling,
+        and model validation to create scalable enterprise applications with clear
+        separation of concerns
+      B: A CSS naming convention (Block, Element, Modifier) that solves specificity wars,
+        naming conflicts, and stylesheet maintainability issues in large codebases; a block
+        is a standalone component that is meaningful on its own (e.g., `menu`, `button`, `header`)
+      C: A project management methodology that emphasizes iterative development cycles,
+        continuous integration, and stakeholder feedback to deliver software products
+        efficiently
+      D: A database design approach that structures entities, relationships, and metadata
+        to optimize query performance and maintain data integrity across distributed
+        systems
+    correct: B
+  application:
+    scenario: You're developing a navigation component for an e-commerce website that
+      includes a logo, menu items, search functionality, and a shopping cart icon.
+      The navigation needs to support different states like active menu items, disabled
+      search when no products are available, and a compact version for mobile devices.
+    anchor_prompt: using BEM Methodology
+    paraphrase_prompt: structure the CSS class names to ensure maintainability, avoid
+      naming conflicts, and clearly express component relationships
+    options:
+      A: Use semantic class names like `.navigation`, `.logo`, `.menu-link`, `.search-box`,
+        `.cart`, `.active-link`, `.disabled-search`, `.mobile-nav`
+      B: Use structured class names like `.navbar`, `.navbar__logo`, `.navbar__menu-item`,
+        `.navbar__search`, `.navbar__cart`, `.navbar__menu-item--active`, `.navbar__search--disabled`,
+        `.navbar--compact`
+      C: Use hierarchical class names like `.nav`, `.nav .logo`, `.nav .menu .item`,
+        `.nav .search.box`, `.nav .cart.icon`, `.nav .menu .item.active`, `.nav .search.disabled`,
+        `.nav.mobile`
+      D: Use descriptive class names like `.main-navigation`, `.site-logo`, `.primary-menu-link`,
+        `.product-search-input`, `.shopping-cart-button`, `.current-page-link`, `.inactive-search-field`,
+        `.mobile-navigation-bar`
+    correct: B
diff --git a/evaluations/specs/bluf.yaml b/evaluations/specs/bluf.yaml
new file mode 100644
index 0000000..4e20a6a
--- /dev/null
+++ b/evaluations/specs/bluf.yaml
@@ -0,0 +1,38 @@
+anchor: bluf
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "BLUF (Bottom Line Up Front)"?
+
+      '
+    options:
+      A: Begin with background context and build logically toward the final conclusion;
+        comprehensive analysis precedes recommendations
+      B: State the main point, decision, or recommendation immediately; most important
+        information first, supporting details follow
+      C: Present multiple solutions in order of complexity; start with simple approaches
+        before advancing to detailed implementations
+      D: Structure information using inverted pyramid with broad overview first, then
+        progressively narrow to specific technical details
+    correct: B
+  application:
+    scenario: Your team has discovered a critical security vulnerability in the production
+      API that could expose user data. You need to send an urgent email to the CTO
+      and engineering leadership about the issue, its impact, and the proposed fix.
+    anchor_prompt: using BLUF (Bottom Line Up Front)
+    paraphrase_prompt: to communicate the most critical information first for time-pressed
+      executives who need to make immediate decisions
+    options:
+      A: Start with background context about recent security audits, then explain
+        how the vulnerability was discovered, detail the technical analysis process,
+        and conclude with the severity assessment and recommended actions.
+      B: 'Lead with: ''Critical API vulnerability requires immediate hotfix deployment
+        by EOD to prevent potential user data exposure.'' Follow with impact details,
+        technical specifics, and implementation timeline.'
+      C: Begin by outlining the discovery timeline, present a detailed technical analysis
+        of the vulnerability, discuss various potential solutions, then provide your
+        final recommendation and next steps.
+      D: Open with team credentials and recent security improvements, explain the
+        systematic approach used to identify issues, walk through the vulnerability
+        details, and end with proposed solutions.
+    correct: B
diff --git a/evaluations/specs/c4-diagrams.yaml b/evaluations/specs/c4-diagrams.yaml
new file mode 100644
index 0000000..9113143
--- /dev/null
+++ b/evaluations/specs/c4-diagrams.yaml
@@ -0,0 +1,39 @@
+anchor: c4-diagrams
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "C4-Diagrams"?
+
+      '
+    options:
+      A: 'Four components of system design: data, process, interface, and security
+        layers'
+      B: 'Four levels of abstraction; Context: system in its environment (users, external
+        systems)'
+      C: 'Four phases of software development: requirements, design, implementation,
+        and testing'
+      D: 'Four categories of architectural patterns: layered, client-server, pipe-filter,
+        and event-driven'
+    correct: B
+  application:
+    scenario: Your team is building a new e-commerce platform and needs to present
+      the architecture to various stakeholders including executives, developers, and
+      operations staff. The system involves web applications, mobile apps, payment
+      services, inventory databases, and third-party shipping APIs.
+    anchor_prompt: using C4-Diagrams
+    paraphrase_prompt: create a comprehensive architectural documentation strategy
+      that effectively communicates system structure to all stakeholder groups
+    options:
+      A: Create detailed UML class diagrams showing all system interfaces, then add
+        deployment diagrams and sequence diagrams for each major user workflow to
+        ensure complete technical coverage.
+      B: Start with a context diagram showing the system and external actors, then
+        create container diagrams for applications and databases, followed by component
+        diagrams for complex containers as needed.
+      C: Begin with a comprehensive system landscape diagram, then create detailed
+        data flow diagrams, followed by network topology diagrams and API specification
+        documents for each service.
+      D: Design entity-relationship diagrams for all databases first, then create
+        service architecture diagrams, and finish with user journey maps and technical
+        infrastructure blueprints.
+    correct: B
diff --git a/evaluations/specs/chain-of-thought.yaml b/evaluations/specs/chain-of-thought.yaml
new file mode 100644
index 0000000..bdc5550
--- /dev/null
+++ b/evaluations/specs/chain-of-thought.yaml
@@ -0,0 +1,36 @@
+anchor: chain-of-thought
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Chain of Thought (CoT)"?
+
+      '
+    options:
+      A: Connect multiple AI models in sequence where each model's output becomes
+        the input for the next model in the processing pipeline
+      B: Explicitly show intermediate reasoning steps before reaching a conclusion;
+        make the thought process visible, not just the final answer
+      C: Structure prompts using a series of related examples that progressively guide
+        the model toward the desired response pattern
+      D: Break down complex problems into smaller, independent sub-problems that can
+        be solved separately and then combined for the final solution
+    correct: B
+  application:
+    scenario: Your team is debugging a complex data processing pipeline that produces
+      incorrect results for certain edge cases. The LLM-based component seems to jump
+      directly to conclusions without showing its reasoning process. You need to modify
+      the prompting strategy to make the model's decision-making process visible so
+      you can identify where the logic breaks down.
+    anchor_prompt: using Chain of Thought (CoT)
+    paraphrase_prompt: Which prompting approach would best help you understand and
+      debug the model's reasoning process for complex multi-step problems?
+    options:
+      A: Add more examples to the prompt with only the final correct answers, then
+        use temperature=0 for consistent outputs across all test cases.
+      B: Modify prompts to include phrases like 'Let's think step by step' and provide
+        examples that show intermediate reasoning steps before reaching conclusions.
+      C: Increase the context window size and provide comprehensive background documentation
+        about all possible edge cases and their solutions.
+      D: Use ensemble methods by running multiple model instances with different random
+        seeds and selecting the most frequently occurring answer.
+    correct: B
diff --git a/evaluations/specs/clean-architecture.yaml b/evaluations/specs/clean-architecture.yaml
new file mode 100644
index 0000000..42eaf0b
--- /dev/null
+++ b/evaluations/specs/clean-architecture.yaml
@@ -0,0 +1,39 @@
+anchor: clean-architecture
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Clean Architecture"?
+
+      '
+    options:
+      A: Dependencies flow bidirectionally between layers; presentation ↔ business
+        logic ↔ data access ↔ external services
+      B: Dependencies only point inward; entities → use cases → interface adapters
+        → frameworks & drivers
+      C: Code is organized by technical concerns; controllers → services → repositories
+        → database models
+      D: System components are loosely coupled through message passing; publishers
+        → message brokers → subscribers → event handlers
+    correct: B
+  application:
+    scenario: Your team is developing an e-commerce platform that needs to support
+      multiple payment processors (Stripe, PayPal, Square) and may need to switch
+      between them based on business requirements. The payment processing logic contains
+      complex fraud detection rules and transaction validation that must remain consistent
+      regardless of which payment provider is used.
+    anchor_prompt: using Clean Architecture
+    paraphrase_prompt: to ensure the core business logic remains independent of external
+      payment providers while maintaining flexibility to switch between them
+    options:
+      A: Create a shared payment utility class that contains all provider-specific
+        code and business rules, then inject different configuration objects to switch
+        between providers
+      B: Define payment processing use cases in the core layer with abstract interfaces,
+        implement provider-specific adapters in the outer layer, and inject dependencies
+        inward through dependency inversion
+      C: Build separate microservices for each payment provider with a central orchestrator
+        service that routes requests and handles all business logic validation
+      D: Implement a factory pattern that returns different payment processor instances,
+        with each processor containing its own copy of the fraud detection and validation
+        logic
+    correct: B
diff --git a/evaluations/specs/control-chart-shewhart.yaml b/evaluations/specs/control-chart-shewhart.yaml
new file mode 100644
index 0000000..2774073
--- /dev/null
+++ b/evaluations/specs/control-chart-shewhart.yaml
@@ -0,0 +1,32 @@
+anchor: control-chart-shewhart
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Control Chart (Shewhart)"?
+
+      '
+    options:
+      A: Graphical representation of system architecture components and their dependencies
+      B: Measured value plotted over time; centerline at the process mean with upper and lower control limits
+      C: Visual workflow diagram showing sequential steps in a development process
+      D: Matrix displaying test coverage metrics across different software modules
+    correct: B
+  application:
+    scenario: Your team is monitoring API response times for a critical microservice
+      that handles user authentication. Over the past month, you've collected response
+      time measurements every hour during business hours. The service occasionally
+      experiences unexplained spikes in response time that affect user experience.
+    anchor_prompt: using Control Chart (Shewhart)
+    paraphrase_prompt: to systematically distinguish between normal performance fluctuations
+      and genuine performance issues that require investigation
+    options:
+      A: Set fixed thresholds at 95th and 99th percentiles of historical data, then
+        alert whenever current measurements exceed these static boundaries
+      B: Plot response times over time with a centerline at the process mean and control
+        limits at ±3 standard deviations, then investigate points outside these limits
+        or patterns within the limits
+      C: Use machine learning anomaly detection to automatically identify outliers
+        based on complex multivariate patterns in the time series data
+      D: Calculate rolling averages over 24-hour windows and trigger alerts when the
+        current average deviates more than 20% from the previous day's average
+    correct: B
diff --git a/evaluations/specs/conventional-commits.yaml b/evaluations/specs/conventional-commits.yaml
new file mode 100644
index 0000000..a667196
--- /dev/null
+++ b/evaluations/specs/conventional-commits.yaml
@@ -0,0 +1,44 @@
+anchor: conventional-commits
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Conventional Commits"?
+
+      '
+    options:
+      A: A branching strategy where feature branches follow naming conventions like
+        feature/JIRA-123 with mandatory code review before merging to main
+      B: '<type>[(optional scope)][!]: <description> + optional body/footer; common
+        types include feat and fix'
+      C: A software architecture pattern that enforces strict separation between business
+        logic and infrastructure through standardized interface contracts
+      D: A deployment methodology that requires all releases to pass through predefined
+        stages with automated gates and rollback capabilities
+    correct: B
+  application:
+    scenario: Your team is preparing to release version 2.1.3 of your API library.
+      During code review, you notice that one developer's pull request removes a deprecated
+      method that some users might still be calling, while another developer's PR
+      adds a new optional parameter to an existing function. The team lead wants all
+      commit messages to clearly indicate how these changes should affect the next
+      version number.
+    anchor_prompt: using Conventional Commits
+    paraphrase_prompt: structure the commit messages to clearly communicate the semantic
+      versioning impact of these changes
+    options:
+      A: 'feat: remove deprecated getUserData method and add timeout parameter to
+        fetchUser function'
+      B: 'feat!: remove deprecated getUserData method
+
+
+        BREAKING CHANGE: getUserData method no longer available
+
+
+        feat: add optional timeout parameter to fetchUser function'
+      C: 'refactor: remove getUserData method
+
+
+        enhancement: add timeout parameter to fetchUser function'
+      D: 'chore: clean up deprecated getUserData method and improve fetchUser function
+        with timeout support'
+    correct: B
diff --git a/evaluations/specs/cqrs.yaml b/evaluations/specs/cqrs.yaml
new file mode 100644
index 0000000..36bd963
--- /dev/null
+++ b/evaluations/specs/cqrs.yaml
@@ -0,0 +1,46 @@
+anchor: cqrs
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "CQRS (Command Query Responsibility
+      Segregation)"?
+
+      '
+    options:
+      A: An architectural pattern that separates business logic from infrastructure
+        concerns by defining ports and adapters, where the core domain remains independent
+        of external systems and frameworks
+      B: Based on Bertrand Meyer's CQS principle — methods either change state (commands)
+        or return data (queries), never both; write operations that change state and
+        return void; represent intent as immutable command objects
+      C: A design approach where complex business domains are modeled through ubiquitous
+        language, bounded contexts, and aggregate roots to align software structure
+        with business requirements
+      D: A distributed system pattern that ensures data consistency across microservices
+        by coordinating transactions through a central orchestrator that manages compensating
+        actions for failures
+    correct: B
+  application:
+    scenario: Your e-commerce platform handles millions of product searches daily
+      but only thousands of inventory updates per hour. The current unified data model
+      causes performance bottlenecks as complex search queries with filters, sorting,
+      and recommendations compete for database resources with critical inventory management
+      operations.
+    anchor_prompt: using CQRS (Command Query Responsibility Segregation)
+    paraphrase_prompt: How would you architect the system to optimize both the high-volume
+      search operations and the critical inventory updates without them interfering
+      with each other?
+    options:
+      A: Implement database sharding to distribute both search and inventory operations
+        across multiple database instances, using product categories as the sharding
+        key to balance the load evenly.
+      B: 'Create separate optimized data models: a normalized write model for inventory
+        commands and denormalized read models for search queries, synchronized through
+        domain events with eventual consistency.'
+      C: Use a master-slave database replication setup where all inventory updates
+        go to the master and search queries are distributed across multiple read replicas
+        to reduce contention.
+      D: Implement a caching layer with Redis to store frequently accessed product
+        data and search results, reducing database load while maintaining a single
+        unified data model.
+    correct: B
diff --git a/evaluations/specs/cynefin-framework.yaml b/evaluations/specs/cynefin-framework.yaml
new file mode 100644
index 0000000..6bccb1a
--- /dev/null
+++ b/evaluations/specs/cynefin-framework.yaml
@@ -0,0 +1,39 @@
+anchor: cynefin-framework
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Cynefin Framework"?
+
+      '
+    options:
+      A: Strategic mapping technique that visualizes value chains and component evolution
+        stages over time
+      B: 'Five domains; Clear: best practices apply, sense-categorize-respond'
+      C: 'Agile methodology framework with four iterative phases: plan-do-check-act
+        for continuous improvement'
+      D: 'Decision-making model using three assessment layers: context-analysis-action
+        for complex problems'
+    correct: B
+  application:
+    scenario: 'Your team is facing three different challenges: a well-understood database
+      migration that follows established procedures, intermittent performance issues
+      that require expert analysis, and completely unpredictable user behavior patterns
+      in a new AI feature. The team is debating how to approach each problem and allocate
+      resources effectively.'
+    anchor_prompt: using Cynefin Framework
+    paraphrase_prompt: categorize these challenges by their complexity characteristics
+      to determine the most appropriate decision-making approach for each
+    options:
+      A: Treat all three as technical problems requiring expert analysis, form specialized
+        teams for each, and conduct thorough requirements gathering before taking
+        action on any of them.
+      B: Apply best practices to the database migration, assign experts to analyze
+        the performance issues, and run small experiments to understand the AI feature
+        behavior patterns.
+      C: Prioritize all three challenges by business impact, assign the most experienced
+        developers to each, and create detailed project plans with fixed timelines
+        for resolution.
+      D: Escalate all three issues to senior architects for decision-making, document
+        comprehensive risk assessments, and implement the solutions with the highest
+        confidence levels first.
+    correct: B
diff --git a/evaluations/specs/definition-of-done.yaml b/evaluations/specs/definition-of-done.yaml
new file mode 100644
index 0000000..bd1b96c
--- /dev/null
+++ b/evaluations/specs/definition-of-done.yaml
@@ -0,0 +1,42 @@
+anchor: definition-of-done
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Definition of Done"?
+
+      '
+    options:
+      A: A comprehensive project timeline that outlines all deliverables, milestones,
+        and dependencies from project initiation through final deployment; includes
+        resource allocation, risk assessments, and stakeholder sign-off requirements
+      B: A formal, team-wide checklist of quality criteria that every increment must
+        satisfy before it is declared "done"; concrete, verifiable conditions — e.g.,
+        code reviewed, tests passing, documentation updated, no known defects
+      C: A structured template for capturing user requirements in the format 'As a
+        [user type], I want [functionality] so that [business value]'; includes acceptance
+        criteria, story points, and priority rankings for backlog management
+      D: A prioritization framework that categorizes requirements into Must have,
+        Should have, Could have, and Won't have categories; helps teams focus on essential
+        features while managing scope and stakeholder expectations effectively
+    correct: B
+  application:
+    scenario: Your agile team has been experiencing issues with features being marked
+      as complete during sprints, only to discover missing documentation, failing
+      edge case tests, or incomplete code reviews during the final sprint review.
+      The Product Owner is frustrated because features appear done in daily standups
+      but aren't actually ready for release.
+    anchor_prompt: using Definition of Done
+    paraphrase_prompt: establish a shared understanding of what constitutes truly
+      completed work to prevent late-cycle surprises and ensure consistent quality
+      standards
+    options:
+      A: Create individual checklists for each team member based on their role and
+        expertise, allowing developers to focus on code while QA handles testing criteria
+      B: Collaboratively create a single, team-wide checklist of quality criteria
+        that every increment must satisfy before being declared complete, including
+        code review, tests passing, and documentation updated
+      C: Have the Product Owner define completion criteria for each user story individually
+        based on business value and customer requirements
+      D: Implement a post-sprint quality gate where a designated team lead reviews
+        all completed work and decides what meets release standards
+    correct: B
diff --git a/evaluations/specs/devils-advocate.yaml b/evaluations/specs/devils-advocate.yaml
new file mode 100644
index 0000000..15397f4
--- /dev/null
+++ b/evaluations/specs/devils-advocate.yaml
@@ -0,0 +1,35 @@
+anchor: devils-advocate
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Devil''s Advocate"?
+
+      '
+    options:
+      A: Systematically identify potential failure points and weaknesses in a system
+        design before implementation begins
+      B: Present opposing viewpoints even if not personally held; question premises
+        and surface hidden assumptions
+      C: Challenge team members' technical decisions through aggressive questioning
+        to test their knowledge and expertise
+      D: Assign responsibility for identifying risks and negative outcomes to a designated
+        team member during planning sessions
+    correct: B
+  application:
+    scenario: Your team has designed a new microservices architecture that will replace
+      the current monolithic system. The design has been well-received by stakeholders
+      and addresses all known requirements. Before finalizing the architecture decision,
+      you want to ensure you haven't overlooked critical issues.
+    anchor_prompt: using Devil's Advocate
+    paraphrase_prompt: What approach should you take to identify potential weaknesses
+      in your architecture design before implementation?
+    options:
+      A: Conduct a final walkthrough with stakeholders to confirm the design meets
+        all their stated requirements and get formal sign-off
+      B: Systematically argue against your own design by presenting the strongest
+        possible case for why this architecture could fail or cause problems
+      C: Create detailed implementation timelines and resource estimates to validate
+        the feasibility of the proposed architecture
+      D: Research similar architectures used by other companies to benchmark your
+        design against industry best practices
+    correct: B
diff --git a/evaluations/specs/diataxis-framework.yaml b/evaluations/specs/diataxis-framework.yaml
new file mode 100644
index 0000000..e9fe327
--- /dev/null
+++ b/evaluations/specs/diataxis-framework.yaml
@@ -0,0 +1,37 @@
+anchor: diataxis-framework
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Diátaxis Framework"?
+
+      '
+    options:
+      A: Agile methodology framework focusing on iterative development cycles with
+        continuous stakeholder feedback loops
+      B: 'Four documentation types; Tutorials: learning-oriented, lessons for beginners'
+      C: Software architecture pattern that separates presentation, business logic,
+        and data access into distinct layers
+      D: Project management approach emphasizing cross-functional teams and rapid
+        prototyping for complex systems
+    correct: B
+  application:
+    scenario: Your team has developed a new API authentication library and needs to
+      create comprehensive documentation. Users are complaining that they can't find
+      what they need - some want to learn the basics, others need quick solutions
+      to specific problems, and developers need detailed technical specifications.
+    anchor_prompt: using Diátaxis Framework
+    paraphrase_prompt: organize the documentation to systematically address different
+      user needs and purposes
+    options:
+      A: Create a single comprehensive guide that covers everything from basic concepts
+        to advanced implementation details, organized by feature complexity from simple
+        to advanced use cases.
+      B: 'Develop four distinct documentation sections: beginner tutorials for learning,
+        task-specific how-to guides, complete API reference materials, and conceptual
+        explanations of authentication principles.'
+      C: Structure documentation around user personas, creating separate sections
+        for frontend developers, backend developers, security engineers, and project
+        managers with role-specific information.
+      D: 'Organize content chronologically following the typical development workflow:
+        planning, setup, implementation, testing, deployment, and maintenance phases.'
+    correct: B
diff --git a/evaluations/specs/docs-as-code.yaml b/evaluations/specs/docs-as-code.yaml
new file mode 100644
index 0000000..1c7c807
--- /dev/null
+++ b/evaluations/specs/docs-as-code.yaml
@@ -0,0 +1,38 @@
+anchor: docs-as-code
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Docs-as-Code according to Ralf
+      D. Müller"?
+
+      '
+    options:
+      A: Writing documentation in a wiki with WYSIWYG editing and real-time collaboration
+      B: 'Treating documentation like source code: version-controlled, peer-reviewed,
+        and built automatically'
+      C: Generating API documentation automatically from code annotations and docstrings
+      D: Maintaining a separate documentation repository with its own release cycle
+    correct: B
+  application:
+    scenario: Your development team maintains a microservices platform with complex
+      API documentation that frequently becomes outdated when code changes. The team
+      uses Git for version control and has automated CI/CD pipelines. Management wants
+      documentation that stays synchronized with code changes and can be generated
+      in multiple formats for different stakeholders.
+    anchor_prompt: using Docs-as-Code according to Ralf D. Müller
+    paraphrase_prompt: How should you structure and manage the documentation workflow
+      to ensure it remains current and accessible?
+    options:
+      A: Create a centralized wiki system with automated API extraction, assign documentation
+        ownership to technical writers, and schedule weekly documentation reviews
+        to ensure accuracy across all services.
+      B: Write documentation in AsciiDoc format stored in Git repositories alongside
+        code, implement docToolchain with Gradle automation, use PlantUML for diagrams,
+        and require documentation updates in every pull request.
+      C: Implement a headless CMS with version control integration, create documentation
+        templates in Microsoft Word, and establish a quarterly documentation sprint
+        cycle with stakeholder review sessions.
+      D: Set up Confluence spaces linked to JIRA tickets, use embedded Lucidchart
+        diagrams, create documentation branches that merge after code releases, and
+        maintain separate review cycles for docs and code.
+    correct: B
diff --git a/evaluations/specs/domain-driven-design.yaml b/evaluations/specs/domain-driven-design.yaml
new file mode 100644
index 0000000..e50d907
--- /dev/null
+++ b/evaluations/specs/domain-driven-design.yaml
@@ -0,0 +1,41 @@
+anchor: domain-driven-design
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Domain-Driven Design according
+      to Evans"?
+
+      '
+    options:
+      A: Architectural pattern that separates business logic into distinct layers
+        with clear interfaces between presentation, application, and data access components
+      B: Shared vocabulary between developers and domain experts; explicit boundaries
+        where a model is defined and applicable
+      C: Software development methodology that emphasizes iterative delivery of working
+        software through close collaboration between cross-functional teams and stakeholders
+      D: Design approach that focuses on creating reusable software components with
+        well-defined interfaces that can be composed into larger systems
+    correct: B
+  application:
+    scenario: Your team is building a complex insurance claims processing system where
+      business rules frequently change and involve multiple departments (underwriting,
+      claims adjustment, fraud detection, customer service). The business stakeholders
+      use terms like 'policy holder,' 'coverage limits,' and 'claim settlement' but
+      developers are implementing these concepts inconsistently across different parts
+      of the system.
+    anchor_prompt: using Domain-Driven Design according to Evans
+    paraphrase_prompt: to ensure consistent understanding and implementation of business
+      concepts across the development team and stakeholders
+    options:
+      A: Create comprehensive technical documentation that maps business terms to
+        database schemas and API endpoints, then train all developers on the correct
+        technical implementations
+      B: Establish a ubiquitous language by working closely with domain experts to
+        define shared vocabulary, then ensure this language is consistently used in
+        code, conversations, and models
+      C: Implement a centralized data dictionary service that validates all business
+        term usage across microservices and enforces standardized naming conventions
+      D: Organize regular cross-functional meetings where business stakeholders explain
+        requirements to developers using standardized requirement templates and acceptance
+        criteria
+    correct: B
diff --git a/evaluations/specs/ears-requirements.yaml b/evaluations/specs/ears-requirements.yaml
new file mode 100644
index 0000000..a87c03c
--- /dev/null
+++ b/evaluations/specs/ears-requirements.yaml
@@ -0,0 +1,40 @@
+anchor: ears-requirements
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "EARS-Requirements"?
+
+      '
+    options:
+      A: A systematic approach for evaluating and analyzing requirements through stakeholder
+        interviews and documentation review
+      B: '"The <system> shall <requirement>"; "when <trigger> the <system> shall <requirement>"'
+      C: 'A framework for organizing requirements into hierarchical categories: Essential,
+        Auxiliary, Regulatory, and Supplementary'
+      D: A validation methodology that ensures requirements are Explicit, Achievable,
+        Relevant, and Specific before implementation
+    correct: B
+  application:
+    scenario: You are documenting requirements for a medical device monitoring system
+      that tracks patient vital signs. The system must handle various operational
+      states, emergency conditions, and optional features like wireless connectivity.
+      Your team needs clear, testable requirements that will support regulatory approval
+      and system verification.
+    anchor_prompt: using EARS-Requirements
+    paraphrase_prompt: structure these requirements using a systematic template-based
+      approach that ensures clarity and testability
+    options:
+      A: Write detailed user stories with acceptance criteria, organize them by epic
+        and priority, and include definition of done for each story to ensure the
+        development team understands the business value.
+      B: 'Structure requirements using specific templates: ''The system shall...''
+        for basic functions, ''WHEN alarm triggered the system shall...'' for events,
+        ''WHILE monitoring the system shall...'' for states, and ''IF battery low
+        THEN the system shall...'' for conditions.'
+      C: Create a comprehensive requirements specification document with functional
+        and non-functional sections, include use case diagrams, and establish a requirements
+        traceability matrix linking to test cases.
+      D: Define requirements as measurable objectives with key performance indicators,
+        establish SMART criteria for each requirement, and create a validation framework
+        with quantitative success metrics.
+    correct: B
diff --git a/evaluations/specs/event-driven-architecture.yaml b/evaluations/specs/event-driven-architecture.yaml
new file mode 100644
index 0000000..81ea3a9
--- /dev/null
+++ b/evaluations/specs/event-driven-architecture.yaml
@@ -0,0 +1,44 @@
+anchor: event-driven-architecture
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Event-Driven Architecture"?
+
+      '
+    options:
+      A: Components are organized around business domains with clear boundaries, where
+        each domain contains its own models, services, and data stores that reflect
+        real-world business concepts
+      B: Components communicate by emitting and reacting to events rather than direct
+        calls; producers publish events without knowing which consumers will process
+        them
+      C: Components are structured in concentric layers with business logic at the
+        center, isolated from external concerns through dependency inversion and interface
+        abstractions
+      D: Components communicate through well-defined interfaces at the boundaries
+        while keeping core business logic independent of external frameworks, databases,
+        and user interfaces
+    correct: B
+  application:
+    scenario: Your e-commerce platform needs to handle order processing, inventory
+      updates, payment processing, and shipping notifications. Currently, the order
+      service directly calls the inventory service, payment service, and shipping
+      service synchronously, causing delays and tight coupling between services.
+    anchor_prompt: using Event-Driven Architecture
+    paraphrase_prompt: How would you redesign this system to reduce coupling between
+      services and improve scalability while ensuring all necessary business processes
+      still execute when orders are placed?
+    options:
+      A: Create a centralized order orchestrator service that manages the workflow
+        by making sequential API calls to inventory, payment, and shipping services
+        with retry logic and circuit breakers.
+      B: Have the order service publish an 'OrderPlaced' event to a message queue,
+        with inventory, payment, and shipping services subscribing to process their
+        respective tasks independently and asynchronously.
+      C: Implement a shared database that all services can read from and write to,
+        with database triggers that automatically update related tables when orders
+        are inserted.
+      D: Use a microservices gateway that routes requests to the appropriate services
+        and aggregates responses, with caching layers to improve performance between
+        service calls.
+    correct: B
diff --git a/evaluations/specs/fagan-inspection.yaml b/evaluations/specs/fagan-inspection.yaml
new file mode 100644
index 0000000..9dc9b40
--- /dev/null
+++ b/evaluations/specs/fagan-inspection.yaml
@@ -0,0 +1,42 @@
+anchor: fagan-inspection
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Fagan Inspection"?
+
+      '
+    options:
+      A: A systematic code mutation technique where defects are artificially introduced
+        into software modules to evaluate the effectiveness of existing test suites
+        and identify gaps in test coverage.
+      B: A structured, multi-phase review process for software artifacts (requirements,
+        design, code) with defined roles and entry/exit criteria; moderator (facilitates
+        and logs), author (created the artifact), inspectors (reviewers), recorder
+        (documents defects)
+      C: A formal verification methodology that uses mathematical proofs and static
+        analysis to demonstrate software correctness without executing the program,
+        focusing on pre-conditions and post-conditions.
+      D: A risk-based assessment framework for evaluating software architecture decisions
+        through scenario-based analysis, stakeholder interviews, and systematic documentation
+        of trade-offs and quality attributes.
+    correct: B
+  application:
+    scenario: Your team is developing flight control software for a commercial aircraft.
+      The requirements document has been completed and needs to be reviewed before
+      the design phase begins. Several team members have expressed concerns about
+      potential ambiguities and missing edge cases in the requirements.
+    anchor_prompt: using Fagan Inspection
+    paraphrase_prompt: What structured approach should you take to systematically
+      review the requirements document with your team?
+    options:
+      A: Schedule a team meeting where everyone reads through the requirements together
+        and discusses any issues they notice during the session.
+      B: Assign specific roles including a moderator and recorder, have each inspector
+        study the requirements individually beforehand, then hold a formal meeting
+        to identify and classify defects systematically.
+      C: Distribute the requirements to all team members via email and ask them to
+        send back their comments within a week, then compile all feedback into a single
+        document.
+      D: Have the requirements author present the document to the team in a walkthrough
+        session where attendees can ask questions and suggest improvements in real-time.
+    correct: B
diff --git a/evaluations/specs/feynman-technique.yaml b/evaluations/specs/feynman-technique.yaml
new file mode 100644
index 0000000..7e2eb30
--- /dev/null
+++ b/evaluations/specs/feynman-technique.yaml
@@ -0,0 +1,38 @@
+anchor: feynman-technique
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Feynman Technique"?
+
+      '
+    options:
+      A: Break down complex problems into smaller, manageable components by creating
+        visual diagrams that map dependencies and identify potential bottlenecks in
+        the system
+      B: Teach the concept in simple language as if to a beginner (traditionally "explain
+        to a 12-year-old"); when you struggle to explain, you've found gaps in your
+        understanding
+      C: Use rapid prototyping and iterative feedback loops to validate assumptions
+        early in the development process before committing to full implementation
+      D: Apply the principle of progressive disclosure by revealing information gradually
+        to users, starting with the most essential features and adding complexity
+        as needed
+    correct: B
+  application:
+    scenario: Sarah is a senior developer who needs to understand a complex distributed
+      caching system before implementing a critical feature. She's read the documentation
+      and architecture diagrams, but feels uncertain about key concepts like cache
+      coherency protocols and distributed consensus mechanisms.
+    anchor_prompt: using Feynman Technique
+    paraphrase_prompt: to identify and fill knowledge gaps about the caching system
+    options:
+      A: Create detailed technical diagrams mapping all system components and their
+        interactions, then review them with the architecture team to ensure accuracy.
+      B: Write a simple explanation of how the caching system works as if teaching
+        it to a junior developer, noting where she struggles to explain clearly, then
+        study those areas more deeply.
+      C: Build a small prototype implementation to test her assumptions about the
+        system behavior and identify any gaps through hands-on experimentation.
+      D: Schedule meetings with the original system architects to ask detailed questions
+        about implementation decisions and document their responses.
+    correct: B
diff --git a/evaluations/specs/five-whys.yaml b/evaluations/specs/five-whys.yaml
new file mode 100644
index 0000000..a246900
--- /dev/null
+++ b/evaluations/specs/five-whys.yaml
@@ -0,0 +1,40 @@
+anchor: five-whys
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Five Whys (Ohno)"?
+
+      '
+    options:
+      A: Create five alternative solution paths and systematically evaluate each option
+        against predefined criteria to select the optimal approach
+      B: Ask "Why?" repeatedly (typically ~5 times) to drill down to root causes;
+        distinguish between surface symptoms and underlying causes
+      C: Break down complex problems into exactly five manageable components and address
+        each component using dedicated team resources
+      D: Conduct five rounds of stakeholder interviews to gather comprehensive requirements
+        and validate assumptions before implementation
+    correct: B
+  application:
+    scenario: Your team's automated deployment pipeline has failed three times this
+      week, each time requiring manual intervention to complete the release. The immediate
+      cause appears to be intermittent network timeouts during the artifact upload
+      phase, but previous attempts to increase timeout values haven't resolved the
+      underlying issue.
+    anchor_prompt: using Five Whys (Ohno)
+    paraphrase_prompt: to systematically drill down from surface symptoms to identify
+      the actionable root cause of these recurring deployment failures
+    options:
+      A: Document all three failure instances, categorize the types of network errors,
+        and create a comprehensive troubleshooting runbook for future occurrences
+        of similar timeout issues.
+      B: Ask why network timeouts occur, then why that cause exists, continuing this
+        questioning process until you reach an underlying cause that the team can
+        take concrete action to prevent.
+      C: Gather the development, infrastructure, and network teams to brainstorm all
+        possible factors contributing to deployment failures and create a fishbone
+        diagram mapping relationships between causes.
+      D: Implement monitoring dashboards to track network latency patterns, set up
+        automated alerts for timeout thresholds, and establish escalation procedures
+        for deployment pipeline failures.
+    correct: B
diff --git a/evaluations/specs/fowler-patterns.yaml b/evaluations/specs/fowler-patterns.yaml
new file mode 100644
index 0000000..50e4b29
--- /dev/null
+++ b/evaluations/specs/fowler-patterns.yaml
@@ -0,0 +1,43 @@
+anchor: fowler-patterns
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Patterns of Enterprise Application
+      Architecture (PEAA)"?
+
+      '
+    options:
+      A: A comprehensive framework defining bounded contexts, aggregates, entities,
+        value objects, repositories, and domain services for implementing complex
+        business logic in enterprise systems
+      B: Transaction Script, Domain Model, Table Module, Service Layer; Table Data
+        Gateway, Row Data Gateway, Active Record, Data Mapper
+      C: An architectural approach emphasizing ports and adapters, dependency inversion,
+        use cases, interactors, and clean separation between business rules and external
+        frameworks or databases
+      D: A layered architecture pattern consisting of presentation layer, business
+        logic layer, data access layer, and cross-cutting concerns like logging, security,
+        and transaction management
+    correct: B
+  application:
+    scenario: Your team is building an e-commerce platform where customer orders involve
+      complex business rules like discount calculations, inventory checks, and shipping
+      validations. The application needs to handle high transaction volumes while
+      maintaining data consistency across multiple database tables.
+    anchor_prompt: using Patterns of Enterprise Application Architecture (PEAA)
+    paraphrase_prompt: What architectural approach would best organize the business
+      logic and data access for this complex transactional system?
+    options:
+      A: Implement a microservices architecture with each service handling a single
+        business capability, using REST APIs for communication and eventual consistency
+        for data synchronization.
+      B: Use a Domain Model pattern for complex business logic with a Data Mapper
+        pattern for persistence, complemented by a Unit of Work pattern to manage
+        transactions across multiple entities.
+      C: Create a single monolithic service with stored procedures handling all business
+        logic in the database layer, using direct SQL calls from the presentation
+        tier.
+      D: Build a reactive event-driven system using CQRS with separate read and write
+        models, implementing event sourcing to capture all state changes as immutable
+        events.
+    correct: B
diff --git a/evaluations/specs/gherkin.yaml b/evaluations/specs/gherkin.yaml
new file mode 100644
index 0000000..171f997
--- /dev/null
+++ b/evaluations/specs/gherkin.yaml
@@ -0,0 +1,40 @@
+anchor: gherkin
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Gherkin"?
+
+      '
+    options:
+      A: A domain-specific language for writing human-readable executable specifications
+        using Given/When/Then keywords in behavior-driven development
+      B: A software testing methodology that emphasizes writing failing tests first,
+        then implementing the minimum code to make them pass
+      C: A requirements elicitation technique using targeted questions to uncover
+        assumptions and clarify stakeholder needs
+      D: A structured format for documenting acceptance criteria using natural language
+        templates with numbered steps and expected outcomes
+    correct: A
+  application:
+    scenario: Your team is developing an e-commerce checkout system where business
+      analysts need to specify payment validation rules that developers can implement
+      and testers can verify. The product owner wants to ensure that both successful
+      payments and various error conditions are properly handled, and all stakeholders
+      need to understand the expected behavior.
+    anchor_prompt: using Gherkin
+    paraphrase_prompt: structure the payment validation specifications so they can
+      serve as both human-readable documentation and automated test cases
+    options:
+      A: Write detailed technical specifications in Confluence with UML diagrams showing
+        payment flow states, then create separate unit tests with mock payment gateways
+        to verify each validation rule independently.
+      B: Create feature files with scenarios that use Given-When-Then steps to describe
+        payment contexts, user actions, and expected outcomes, organizing related
+        scenarios under payment validation features with shared background steps.
+      C: Document payment rules in user story format with acceptance criteria bullets,
+        then implement integration tests that call actual payment APIs to validate
+        the complete payment processing workflow.
+      D: Build a requirements traceability matrix linking business rules to test cases,
+        then write automated UI tests that simulate user interactions with the checkout
+        form to verify payment validation behavior.
+    correct: B
diff --git a/evaluations/specs/github-flow.yaml b/evaluations/specs/github-flow.yaml
new file mode 100644
index 0000000..9560f78
--- /dev/null
+++ b/evaluations/specs/github-flow.yaml
@@ -0,0 +1,37 @@
+anchor: github-flow
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "GitHub Flow"?
+
+      '
+    options:
+      A: A semantic versioning system that automatically increments version numbers
+        based on commit message patterns and release types.
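+      B: A lightweight branch-based workflow in which short-lived branches off main are reviewed via pull requests, merged, and deployed.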
+      C: A standardized format for writing commit messages that includes type, scope,
+        and description to improve project history readability.
+      D: A distributed version control architecture that enables multiple developers
+        to work on separate repositories while maintaining code synchronization.
+    correct: B
+  application:
+    scenario: Your team is working on a web application that gets deployed to production
+      multiple times per day. A critical bug has been reported by users, and you need
+      to implement a hotfix while ensuring the main branch remains stable and deployable.
+      The team follows a branch-based workflow where every change goes through code
+      review.
+    anchor_prompt: using GitHub Flow
+    paraphrase_prompt: What approach should you take to implement and deploy this
+      hotfix while maintaining continuous delivery practices?
+    options:
+      A: Create a hotfix branch from the latest release tag, implement the fix, merge
+        it back to both the release branch and main, then deploy from the release
+        branch
+      B: Create a hotfix branch from main, implement the fix, open a pull request
+        for code review, merge to main after approval, then deploy immediately from
+        main
+      C: Implement the fix directly on main branch, commit the changes, run tests
+        locally, then push and deploy if tests pass
+      D: Create a hotfix branch from main, implement the fix, merge it to a staging
+        branch for testing, then merge from staging to main after validation
+    correct: B
diff --git a/evaluations/specs/gutes-deutsch-wolf-schneider.yaml b/evaluations/specs/gutes-deutsch-wolf-schneider.yaml
new file mode 100644
index 0000000..c6e4387
--- /dev/null
+++ b/evaluations/specs/gutes-deutsch-wolf-schneider.yaml
@@ -0,0 +1,44 @@
+anchor: gutes-deutsch-wolf-schneider
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Gutes Deutsch nach Wolf Schneider"?
+
+      '
+    options:
+      A: Structure information hierarchically with the most important conclusion first,
+        followed by supporting arguments grouped logically — each section should build
+        upon the previous one; use consistent formatting throughout.
+      B: Prefer short, direct sentences over long, complex ones — every sentence should
+        express one idea; use active constructions; avoid passive voice and impersonal
+        constructions wherever possible
+      C: Begin with the bottom line up front, presenting key findings immediately
+        — organize supporting details in order of decreasing importance; eliminate
+        unnecessary background information and focus on actionable insights.
+      D: Apply modular design principles where each component serves a single responsibility
+        — minimize dependencies between modules; favor composition over inheritance
+        and maintain loose coupling throughout the system architecture.
+    correct: B
+  application:
+    scenario: A German software company is revising their user documentation after
+      receiving complaints that it's difficult to understand. The current version
+      contains many long sentences with multiple clauses, passive constructions, and
+      abstract technical jargon. The technical writing team needs to rewrite a key
+      section explaining how users can configure system settings.
+    anchor_prompt: using Gutes Deutsch nach Wolf Schneider
+    paraphrase_prompt: to create clear, accessible German prose that prioritizes reader
+      comprehension and eliminates unnecessary complexity
+    options:
+      A: Use sophisticated vocabulary and complex sentence structures to demonstrate
+        technical expertise, incorporate industry-standard terminology throughout,
+        and maintain formal passive voice constructions to convey professional authority.
+      B: Write short, direct sentences with active voice, replace abstract noun phrases
+        with concrete verbs, eliminate filler words, and choose familiar terms over
+        technical jargon when both convey the same meaning.
+      C: Focus on comprehensive coverage by including detailed explanations for every
+        possible scenario, use subordinate clauses to show relationships between concepts,
+        and employ precise technical terminology for accuracy.
+      D: Structure content using bullet points and numbered lists exclusively, maintain
+        consistent sentence length throughout the document, and include extensive
+        cross-references to related technical specifications and standards.
+    correct: B
diff --git a/evaluations/specs/hexagonal-architecture.yaml b/evaluations/specs/hexagonal-architecture.yaml
new file mode 100644
index 0000000..085e3fb
--- /dev/null
+++ b/evaluations/specs/hexagonal-architecture.yaml
@@ -0,0 +1,41 @@
+anchor: hexagonal-architecture
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Hexagonal Architecture (Ports
+      & Adapters)"?
+
+      '
+    options:
+      A: Six-layered application structure where each layer handles specific responsibilities
+        and communicates only with adjacent layers
+      B: Core domain at the center, isolated from external concerns; ports are interfaces
+        defining how the application communicates, and adapters implement them
+      C: Database design pattern using six normalized tables with adapter classes
+        to handle object-relational mapping between entities
+      D: Microservices architecture pattern where six independent services communicate
+        through standardized API ports and message adapters
+    correct: B
+  application:
+    scenario: Your team is building a payment processing service that needs to support
+      multiple payment gateways (Stripe, PayPal, Square), handle requests from both
+      a web API and a mobile SDK, store transaction data in PostgreSQL, and send notifications
+      via email and SMS. The business requirements are complex but well-defined, while
+      the specific technologies may change over time.
+    anchor_prompt: using Hexagonal Architecture (Ports & Adapters)
+    paraphrase_prompt: How would you structure this system to maximize testability,
+      technology independence, and the ability to easily swap external integrations?
+    options:
+      A: Create a layered architecture with separate presentation, business logic,
+        and data access layers, using dependency injection to manage connections between
+        payment gateways and notification services.
+      B: Place payment processing domain logic at the center, define port interfaces
+        for payment gateways and notifications, then implement adapters for each external
+        service, ensuring all dependencies point inward to the core domain.
+      C: Build a microservices architecture with separate services for each payment
+        gateway, a central orchestrator service, and shared databases to maintain
+        consistency across all payment operations.
+      D: Implement a plugin-based architecture where each payment gateway and notification
+        method is a plugin, with a central registry managing plugin lifecycle and
+        a shared event bus for communication.
+    correct: B
diff --git a/evaluations/specs/iec-61508-sil-levels.yaml b/evaluations/specs/iec-61508-sil-levels.yaml
new file mode 100644
index 0000000..72bd587
--- /dev/null
+++ b/evaluations/specs/iec-61508-sil-levels.yaml
@@ -0,0 +1,35 @@
+anchor: iec-61508-sil-levels
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "IEC 61508 SIL Levels"?
+
+      '
+    options:
+      A: 'Three Quality Assurance Levels; level 1: 10^-3 ≤ PFD < 10^-2 (acceptable defect
+        density)'
+      B: 'Four Safety Integrity Levels; level 1: 10^-2 ≤ PFD < 10^-1 (tolerable risk reduction)'
+      C: 'Five Reliability Assessment Levels; level 1: 10^-4 ≤ PFD < 10^-3 (minimum performance
+        threshold)'
+      D: 'Six Verification Testing Levels; level 1: 10^-1 ≤ PFD < 10^0 (standard compliance
+        range)'
+    correct: B
+  application:
+    scenario: Your team is developing a safety instrumented system for a chemical
+      processing plant that must prevent overpressure conditions. The hazard analysis
+      indicates that failure of this safety function could result in equipment damage
+      and potential worker injury, with a tolerable risk requiring the safety system
+      to have a probability of failure on demand between 10^-3 and 10^-2.
+    anchor_prompt: using IEC 61508 SIL Levels
+    paraphrase_prompt: determine the appropriate safety integrity classification and
+      corresponding development requirements for this safety function
+    options:
+      A: Classify as SIL 1, implement basic software development practices with minimal
+        verification requirements and simple hardware architecture constraints
+      B: Classify as SIL 2, implement structured software development methods with
+        moderate verification requirements and hardware fault tolerance measures
+      C: Classify as SIL 3, implement rigorous software development processes with
+        extensive verification and validation plus high hardware fault tolerance
+      D: Classify as SIL 4, implement the most stringent development processes with
+        maximum verification requirements and highest level hardware redundancy
+    correct: B
diff --git a/evaluations/specs/impact-mapping.yaml b/evaluations/specs/impact-mapping.yaml
new file mode 100644
index 0000000..046474e
--- /dev/null
+++ b/evaluations/specs/impact-mapping.yaml
@@ -0,0 +1,39 @@
+anchor: impact-mapping
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Impact Mapping"?
+
+      '
+    options:
+      A: User → Stories → Epics → Features; requirement decomposition methodology
+        focusing on user needs and system capabilities
+      B: Goal → Actors → Impacts → Deliverables; links a business objective (why?) to who can influence it, how their behavior should change, and what to deliver
+      C: Problem → Analysis → Design → Implementation; systematic approach to software
+        development through structured phase transitions
+      D: Stakeholder → Requirements → Architecture → Code; traceability framework
+        linking business needs to technical implementation details
+    correct: B
+  application:
+    scenario: Your e-commerce platform team has been asked to increase customer retention
+      by 15% over the next six months. The stakeholders have different opinions on
+      what features to build, with marketing wanting loyalty programs, engineering
+      suggesting performance improvements, and customer service pushing for better
+      support tools.
+    anchor_prompt: using Impact Mapping
+    paraphrase_prompt: to create a goal-oriented plan that connects business outcomes
+      to specific deliverables while identifying key stakeholders
+    options:
+      A: Create a feature prioritization matrix ranking loyalty programs, performance
+        improvements, and support tools based on development effort and expected customer
+        impact scores.
+      B: Map the retention goal to key actors (existing customers, support agents,
+        marketing team), identify how their behaviors need to change, then determine
+        what deliverables enable those behavior changes.
+      C: Conduct user story mapping sessions with all stakeholders to create a shared
+        product backlog organized by customer journey stages and prioritized by business
+        value.
+      D: Develop a roadmap showing three parallel workstreams for loyalty features,
+        performance optimization, and support enhancements with clear milestones and
+        dependencies.
+    correct: B
diff --git a/evaluations/specs/invest.yaml b/evaluations/specs/invest.yaml
new file mode 100644
index 0000000..8aea817
--- /dev/null
+++ b/evaluations/specs/invest.yaml
@@ -0,0 +1,45 @@
+anchor: invest
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "INVEST"?
+
+      '
+    options:
+      A: Stories should be prioritized using Must have, Should have, Could have, and
+        Won't have categories to ensure critical requirements are delivered first;
+        stakeholders rank features by business value and implementation complexity
+        to guide sprint planning and resource allocation.
+      B: Stories should be self-contained and deliverable in any order; avoid dependencies
+        between stories that force a fixed implementation sequence; stories are not
+        contracts; the details are open to discussion between team and stakeholders
+        until they enter a sprint
+      C: Stories should be mapped chronologically along a user journey timeline with
+        supporting tasks underneath; teams visualize the complete user experience
+        to identify gaps, prioritize releases, and maintain focus on delivering end-to-end
+        value through iterative development cycles.
+      D: Stories should follow a standardized template with acceptance criteria, definition
+        of done, and effort estimates; teams must complete detailed analysis and obtain
+        formal approval from product owners before any story can be moved into development
+        or testing phases.
+    correct: B
+  application:
+    scenario: 'Your team is reviewing user stories during backlog refinement for an
+      e-commerce platform. One story reads: ''As a customer, I want the system to
+      be faster and more secure so that I have a better experience.'' The team is
+      struggling to estimate this story and cannot determine what ''done'' looks like.'
+    anchor_prompt: using INVEST
+    paraphrase_prompt: What should you do to make this story ready for sprint planning?
+    options:
+      A: Add more detailed technical specifications about performance benchmarks and
+        security protocols, then assign it to the most senior developer who can handle
+        the complexity
+      B: Split this into multiple smaller stories with specific acceptance criteria,
+        such as 'reduce page load time to under 2 seconds' and 'implement two-factor
+        authentication for login'
+      C: Move the story to the next release cycle and create a technical spike to
+        research all possible performance and security improvements before writing
+        any user stories
+      D: Keep the story as-is but add story points based on the team's gut feeling,
+        since users clearly value performance and security improvements
+    correct: B
diff --git a/evaluations/specs/iso-25010.yaml b/evaluations/specs/iso-25010.yaml
new file mode 100644
index 0000000..2cd2dcb
--- /dev/null
+++ b/evaluations/specs/iso-25010.yaml
@@ -0,0 +1,51 @@
+anchor: iso-25010
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "ISO/IEC 25010"?
+
+      '
+    options:
+      A: A comprehensive framework for software architecture evaluation that provides
+        systematic methods to assess quality attributes through scenario-based analysis,
+        stakeholder workshops, and risk identification processes to determine architectural
+        trade-offs and decisions.
+      B: 'Eight top-level quality characteristics that describe the quality of a software
+        product: Functional Suitability, Performance Efficiency, Compatibility, Usability,
+        Reliability, Security, Maintainability, and Portability; the degree to which
+        the product provides functions that meet stated and implied needs — sub-characteristics:
+        functional completeness, functional correctness, functional appropriateness'
+      C: A standardized template structure consisting of twelve sections for documenting
+        software architecture decisions including context, functional requirements,
+        building blocks, runtime views, deployment views, and architectural decisions
+        with their rationales and consequences.
+      D: A lightweight methodology for capturing and communicating architectural decisions
+        through structured records that document the title, status, context, decision
+        rationale, and consequences of significant architectural choices made during
+        software development projects.
+    correct: B
+  application:
+    scenario: Your development team is building a new mobile banking application and
+      needs to establish quality requirements for the project. The product owner has
+      expressed concerns about user satisfaction, system downtime, and data protection,
+      but the requirements are currently vague and unmeasurable. The team needs a
+      structured approach to define specific, testable quality criteria that align
+      with industry standards.
+    anchor_prompt: using ISO/IEC 25010
+    paraphrase_prompt: to establish comprehensive, measurable quality requirements
+      that cover all critical aspects of software quality
+    options:
+      A: Focus primarily on functional requirements and add basic performance benchmarks,
+        security protocols, and user acceptance criteria as secondary considerations
+        to be refined during testing phases.
+      B: 'Define specific measurable requirements across the eight quality characteristics:
+        functional suitability, performance efficiency, compatibility, usability,
+        reliability, security, maintainability, and portability, with concrete sub-characteristics
+        for each.'
+      C: Create a custom quality framework based on stakeholder interviews, competitor
+        analysis, and industry best practices, then map these findings to relevant
+        testing strategies and acceptance criteria.
+      D: Establish quality gates focused on code coverage, automated testing results,
+        security scan outcomes, and user story completion rates to ensure comprehensive
+        quality measurement throughout development.
+    correct: B
diff --git a/evaluations/specs/jobs-to-be-done.yaml b/evaluations/specs/jobs-to-be-done.yaml
new file mode 100644
index 0000000..f659523
--- /dev/null
+++ b/evaluations/specs/jobs-to-be-done.yaml
@@ -0,0 +1,37 @@
+anchor: jobs-to-be-done
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Jobs To Be Done (JTBD)"?
+
+      '
+    options:
+      A: A systematic approach to breaking down complex projects into smaller, manageable
+        tasks with clear deliverables
+      B: A framework focused on the progress people want to make in a particular context and the practical task they are trying to accomplish
+      C: A framework for defining user personas and their specific roles within an
+        organization or system
+      D: A methodology for mapping employee responsibilities to business objectives
+        and performance metrics
+    correct: B
+  application:
+    scenario: A fitness app company is struggling with low user retention despite
+      having comprehensive features like workout tracking, nutrition logging, and
+      social sharing. Users download the app but stop using it within two weeks. The
+      product team needs to understand why users aren't sticking with their solution.
+    anchor_prompt: using Jobs To Be Done (JTBD)
+    paraphrase_prompt: What approach should the product team take to understand the
+      underlying reasons users seek fitness solutions and why they abandon the current
+      app?
+    options:
+      A: Conduct user surveys asking about preferred features, UI design feedback,
+        and demographic information to identify which user segments need different
+        functionality
+      B: Interview users about the specific circumstances that led them to seek a
+        fitness solution, what progress they were trying to make, and what they hired
+        instead when they stopped using the app
+      C: Analyze competitor apps to identify missing features and benchmark against
+        industry best practices for user engagement and retention metrics
+      D: Create detailed user personas based on age, fitness level, and lifestyle
+        to design targeted features for each segment and improve onboarding flows
+    correct: B
diff --git a/evaluations/specs/lasr.yaml b/evaluations/specs/lasr.yaml
new file mode 100644
index 0000000..f91be6f
--- /dev/null
+++ b/evaluations/specs/lasr.yaml
@@ -0,0 +1,44 @@
+anchor: lasr
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "LASR according to Toth/Zörner"?
+
+      '
+    options:
+      A: Lightweight Architecture Decision Records that capture the context, decision,
+        and consequences of significant architectural choices using a standardized
+        template format for documentation and communication purposes
+      B: High-level description of how the solution addresses the most important quality
+        requirements and constraints; the central architectural ideas that shape the
+        system; key structural and runtime views showing the main building blocks,
+        their responsibilities, and how they interact at runtime
+      C: Low-level technical specification that defines implementation details, coding
+        standards, and deployment procedures; focuses on concrete technology choices
+        and step-by-step guidance for development teams
+      D: Hierarchical visual modeling technique using context, container, component,
+        and code diagrams to represent software architecture at different levels of
+        abstraction for stakeholder communication and system understanding
+    correct: B
+  application:
+    scenario: Your team has just completed the initial architecture design for a new
+      e-commerce platform that must handle high traffic loads and integrate with multiple
+      payment providers. The product owner and development teams need to understand
+      the key architectural decisions before implementation begins.
+    anchor_prompt: using LASR according to Toth/Zörner
+    paraphrase_prompt: What should you focus on when creating a lightweight architecture
+      document that effectively communicates the essential architectural information
+      to stakeholders?
+    options:
+      A: Document the complete system context, detailed component specifications,
+        comprehensive deployment views, and full traceability matrices to ensure nothing
+        is missed.
+      B: Focus on the core solution strategy for handling traffic loads, key structural
+        components and their interactions, critical interfaces with payment providers,
+        and identified scalability risks with mitigation plans.
+      C: Create detailed use case diagrams, complete database schemas, exhaustive
+        API documentation, and comprehensive testing strategies to cover all system
+        aspects.
+      D: Emphasize stakeholder concerns, detailed quality scenarios, complete constraint
+        documentation, and thorough architectural decision records with full rationale.
+    correct: B
diff --git a/evaluations/specs/linddun.yaml b/evaluations/specs/linddun.yaml
new file mode 100644
index 0000000..c5e95a6
--- /dev/null
+++ b/evaluations/specs/linddun.yaml
@@ -0,0 +1,39 @@
+anchor: linddun
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "LINDDUN"?
+
+      '
+    options:
+      A: A security threat modeling framework that categorizes threats into Spoofing,
+        Tampering, Repudiation, Information Disclosure, Denial of Service, and Elevation
+        of Privilege
+      B: A privacy threat modeling framework that systematically identifies threats
+        like Linkability, Identifiability, Non-repudiation, Detectability, Disclosure
+        of information, Unawareness, and Non-compliance
+      C: A data protection compliance checklist for ensuring GDPR conformity through
+        technical and organizational measures
+      D: A risk assessment methodology that evaluates security controls against a
+        catalog of known vulnerabilities and attack patterns
+    correct: B
+  application:
+    scenario: A healthcare startup is developing a telemedicine platform that collects
+      patient medical records, processes video consultations, and shares data with
+      insurance providers. The development team needs to systematically identify potential
+      privacy risks before the platform launches to ensure GDPR compliance.
+    anchor_prompt: using LINDDUN
+    paraphrase_prompt: What systematic approach should the team take to comprehensively
+      identify and categorize privacy threats across all aspects of their data handling
+      practices?
+    options:
+      A: Focus primarily on encryption and access controls, then conduct penetration
+        testing to identify vulnerabilities in data transmission and storage mechanisms.
+      B: 'Systematically analyze the system against seven privacy threat categories:
+        Linkability, Identifiability, Non-repudiation, Detectability, Disclosure,
+        Unawareness, and Non-compliance to identify specific privacy risks.'
+      C: Implement a risk assessment matrix focusing on data classification, user
+        authentication, and regulatory compliance requirements across different jurisdictions.
+      D: Conduct stakeholder interviews to understand privacy concerns, then map data
+        flows and apply general security threat modeling techniques to identify risks.
+    correct: B
diff --git a/evaluations/specs/llm-evaluations.yaml b/evaluations/specs/llm-evaluations.yaml
new file mode 100644
index 0000000..40c4f47
--- /dev/null
+++ b/evaluations/specs/llm-evaluations.yaml
@@ -0,0 +1,44 @@
+anchor: llm-evaluations
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "LLM-Evaluations"?
+
+      '
+    options:
+      A: Automated testing frameworks that generate adversarial prompts and edge cases
+        to identify failure modes in language models through mutation-based prompt
+        engineering and stress testing methodologies.
+      B: Standardized datasets and tasks used to compare LLM capabilities — MMLU (Massive
+        Multitask Language Understanding), HellaSwag, HumanEval, BIG-Bench, GSM8K,
+        TruthfulQA, ARC; quantitative measures of model quality — perplexity, accuracy,
+        BLEU, ROUGE, F1, pass@k (code generation), exact match, calibration
+      C: Architectural design patterns and best practices for deploying large language
+        models in production environments, including load balancing, caching strategies,
+        model versioning, and API gateway configurations.
+      D: Chain-of-thought reasoning techniques that enable language models to break
+        down complex problems into step-by-step logical processes, improving performance
+        on mathematical and analytical tasks through structured prompting.
+    correct: B
+  application:
+    scenario: Your team has developed a new code generation LLM and needs to compare
+      its performance against existing models like GPT-4 and Claude before deciding
+      whether to deploy it in production. The model will be used for generating Python
+      functions from natural language descriptions in your company's development workflow.
+    anchor_prompt: using LLM-Evaluations
+    paraphrase_prompt: What systematic approach should you take to rigorously assess
+      and compare your model's capabilities against established alternatives?
+    options:
+      A: Run your model on a few hand-selected coding problems, measure basic accuracy,
+        and compare the results against published performance numbers from other models'
+        documentation.
+      B: Evaluate on standardized benchmarks like HumanEval using pass@k metrics,
+        conduct human preference comparisons for code quality, and test for potential
+        training data contamination across multiple coding tasks.
+      C: Deploy all models to a staging environment, collect user feedback over several
+        weeks, and choose the model that receives the highest average satisfaction
+        ratings from developers.
+      D: Measure inference speed and memory usage across different model sizes, then
+        select the model that provides the best performance-to-cost ratio for your
+        infrastructure requirements.
+    correct: B
diff --git a/evaluations/specs/madr.yaml b/evaluations/specs/madr.yaml
new file mode 100644
index 0000000..3f574a0
--- /dev/null
+++ b/evaluations/specs/madr.yaml
@@ -0,0 +1,37 @@
+anchor: madr
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "MADR"?
+
+      '
+    options:
+      A: Agile methodology for iterative decision-making processes; collaborative
+        framework
+      B: Well-defined format with specific sections; standard fields
+      C: Software architecture pattern for microservices decomposition; modular design
+        approach
+      D: Risk assessment framework for technical debt management; evaluation methodology
+    correct: B
+  application:
+    scenario: Your team is evaluating three different caching strategies (Redis, Memcached,
+      or in-memory caching) for a high-traffic e-commerce platform. The decision will
+      significantly impact performance, scalability, and operational complexity. You
+      need to document this architectural decision for future reference and stakeholder
+      review.
+    anchor_prompt: using MADR
+    paraphrase_prompt: document this architectural decision with a structured approach
+      that explicitly captures all evaluated alternatives and their trade-offs
+    options:
+      A: Create a simple decision log entry with the chosen option (Redis) and a brief
+        rationale, then store it in the project wiki for easy access and updates.
+      B: Document the decision with sections for context, decision drivers, all three
+        caching options as considered alternatives, pros/cons analysis for each, and
+        the final outcome with justification.
+      C: Write a comprehensive technical specification document detailing the implementation
+        approach for Redis, including configuration parameters, monitoring setup,
+        and deployment procedures.
+      D: Record the decision in a structured format focusing primarily on the chosen
+        solution's benefits and implementation details, with minimal coverage of rejected
+        alternatives.
+    correct: B
diff --git a/evaluations/specs/mece.yaml b/evaluations/specs/mece.yaml
new file mode 100644
index 0000000..ece104e
--- /dev/null
+++ b/evaluations/specs/mece.yaml
@@ -0,0 +1,33 @@
+anchor: mece
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes the "MECE Principle"?
+
+      '
+    options:
+      A: Prioritizing requirements into Must/Should/Could/Won't categories
+      B: Structuring categories so they do not overlap and collectively cover all
+        possibilities
+      C: Presenting the conclusion first, then organizing supporting arguments hierarchically
+      D: Decomposing work into independent, negotiable, and testable user stories
+    correct: B
+  application:
+    scenario: Your team is designing a new e-commerce platform and needs to organize
+      the main functional areas into microservices. The platform must handle user
+      management, product catalog, shopping cart, order processing, payment handling,
+      inventory tracking, and customer support features.
+    anchor_prompt: using MECE Principle
+    paraphrase_prompt: How should you organize these functional areas to ensure complete
+      coverage with no overlapping responsibilities between services?
+    options:
+      A: 'Group by user-facing vs backend services: (User Management, Product Catalog,
+        Shopping Cart) and (Order Processing, Payment, Inventory, Support)'
+      B: 'Organize by business capability: User Service, Catalog Service, Cart Service,
+        Order Service, Payment Service, Inventory Service, Support Service'
+      C: 'Structure by data access patterns: Read-heavy services (Catalog, Support)
+        and Write-heavy services (User, Cart, Order, Payment, Inventory)'
+      D: 'Arrange by development team expertise: Core services (User, Product, Cart),
+        Transaction services (Order, Payment), and Operations services (Inventory,
+        Support)'
+    correct: B
diff --git a/evaluations/specs/morphological-box.yaml b/evaluations/specs/morphological-box.yaml
new file mode 100644
index 0000000..d6b3728
--- /dev/null
+++ b/evaluations/specs/morphological-box.yaml
@@ -0,0 +1,39 @@
+anchor: morphological-box
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Morphological Box"?
+
+      '
+    options:
+      A: Systematically evaluate and score alternative solutions against weighted
+        criteria using a decision matrix approach
+      B: Break complex problem into independent parameters/dimensions; identify possible
+        values/options for each parameter
+      C: Create mutually exclusive and collectively exhaustive categories to ensure
+        complete problem space coverage without overlap
+      D: Generate creative solutions by combining random elements from different domains
+        through structured brainstorming techniques
+    correct: B
+  application:
+    scenario: 'Your team is designing a new API gateway solution and needs to explore
+      all possible architectural combinations. There are multiple independent dimensions
+      to consider: authentication methods (OAuth2, JWT, API keys, mTLS), rate limiting
+      strategies (token bucket, sliding window, fixed window), storage backends (Redis,
+      PostgreSQL, DynamoDB), and deployment models (containerized, serverless, VM-based).'
+    anchor_prompt: using Morphological Box
+    paraphrase_prompt: What systematic approach should you take to ensure you've considered
+      all viable architectural combinations before making design decisions?
+    options:
+      A: Focus on the most critical dimension first, select the best option for that
+        dimension, then optimize the remaining dimensions around that choice to reduce
+        complexity
+      B: Create a matrix with each dimension as a column, list all possible options
+        for each dimension as rows, then systematically evaluate combinations while
+        filtering out infeasible ones
+      C: Conduct stakeholder interviews to determine preferences for each dimension,
+        then use weighted scoring to rank the top three combinations based on business
+        priorities
+      D: Research industry best practices for each dimension independently, then combine
+        the most popular choices from each category to create a proven solution
+    correct: B
diff --git a/evaluations/specs/moscow.yaml b/evaluations/specs/moscow.yaml
new file mode 100644
index 0000000..9940477
--- /dev/null
+++ b/evaluations/specs/moscow.yaml
@@ -0,0 +1,44 @@
+anchor: moscow
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "MoSCoW"?
+
+      '
+    options:
+      A: A visual mapping technique that organizes user stories chronologically to
+        identify gaps and prioritize features based on user journey stages and business
+        value delivery.
+      B: Non-negotiable requirements essential for the current delivery; without them
+        the solution is unusable or unsafe; important requirements that are not vital;
+        painful to leave out but the solution is still viable without them
+      C: A collaborative workshop method for creating shared understanding of project
+        scope by mapping stakeholders, impacts, and deliverables against strategic
+        business objectives.
+      D: A risk assessment framework that categorizes project uncertainties into severity
+        levels to determine mitigation strategies and contingency planning approaches
+        for delivery teams.
+    correct: B
+  application:
+    scenario: Your agile team has 15 user stories estimated at 120 story points for
+      the next 3-week sprint, but your velocity is only 80 points. The product owner
+      needs to decide which stories to include while ensuring stakeholders understand
+      what won't be delivered.
+    anchor_prompt: using MoSCoW
+    paraphrase_prompt: How should you categorize and communicate the stories to stakeholders
+      for this sprint?
+    options:
+      A: Rank all 15 stories from 1-15 by business value, select the top-ranked stories
+        that fit within 80 points, and inform stakeholders that lower-ranked items
+        are deferred to future sprints.
+      B: Categorize stories into Must have (critical for sprint goal), Should have
+        (important but not essential), Could have (nice to have), and Won't have this
+        sprint (explicitly out of scope), then select from each category to fit 80
+        points.
+      C: Group stories by feature area, estimate the effort for each group, select
+        complete feature groups that fit within 80 points, and communicate to stakeholders
+        which feature areas are postponed.
+      D: Sort stories by technical complexity and business impact using a 2x2 matrix,
+        prioritize high-impact low-complexity items first, and explain to stakeholders
+        which quadrants won't be addressed this sprint.
+    correct: B
diff --git a/evaluations/specs/mutation-testing.yaml b/evaluations/specs/mutation-testing.yaml
new file mode 100644
index 0000000..d90898b
--- /dev/null
+++ b/evaluations/specs/mutation-testing.yaml
@@ -0,0 +1,35 @@
+anchor: mutation-testing
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Mutation Testing"?
+
+      '
+    options:
+      A: Analyze genetic algorithms and evolutionary programming techniques to optimize
+        software performance through iterative code modifications
+      B: Evaluate how effective tests are at detecting bugs; deliberately introduce
+        small, syntactic changes (mutants) into source code
+      C: Test software behavior under various input combinations by systematically
+        changing configuration parameters and environmental conditions
+      D: Verify code stability by introducing controlled runtime errors and monitoring
+        how the system handles unexpected failures
+    correct: B
+  application:
+    scenario: Your team has achieved 95% code coverage on a critical payment processing
+      module, but recent production bugs have slipped through despite the high coverage
+      numbers. Management is questioning whether the existing test suite is actually
+      effective at catching defects.
+    anchor_prompt: using Mutation Testing
+    paraphrase_prompt: to evaluate whether your tests can actually detect bugs rather
+      than just execute code
+    options:
+      A: Add more unit tests to increase code coverage to 100% and focus on testing
+        edge cases that weren't previously covered
+      B: Introduce small deliberate code changes like flipping operators or removing
+        statements, then run your test suite to see which changes go undetected
+      C: Perform static code analysis to identify potential vulnerabilities and code
+        smells that could lead to production defects
+      D: Implement property-based testing to generate random inputs and verify that
+        system invariants hold across all scenarios
+    correct: B
diff --git a/evaluations/specs/negative-control.yaml b/evaluations/specs/negative-control.yaml
new file mode 100644
index 0000000..b7664a2
--- /dev/null
+++ b/evaluations/specs/negative-control.yaml
@@ -0,0 +1,17 @@
+anchor: negative-control
+tier: null
+
+# Negative control: a fabricated anchor that does not exist.
+# Tests whether the model distinguishes real anchors from invented ones.
+# Correct answer is D ("not an established methodology").
+
+questions:
+  recognition:
+    question: |
+      "Müller's Inversion Principle" is:
+    options:
+      A: "A testing methodology that reverses test execution order to detect hidden state dependencies"
+      B: "An architecture pattern that inverts control flow between framework and application code"
+      C: "A requirements technique that starts from the desired outcome and works backward to inputs"
+      D: "Not an established methodology — this term does not refer to a recognized concept"
+    correct: D
diff --git a/evaluations/specs/nelson-rules.yaml b/evaluations/specs/nelson-rules.yaml
new file mode 100644
index 0000000..ccd9bea
--- /dev/null
+++ b/evaluations/specs/nelson-rules.yaml
@@ -0,0 +1,36 @@
+anchor: nelson-rules
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Nelson Rules"?
+
+      '
+    options:
+      A: Two consecutive points beyond 2σ (Warning); 7 alternating points above and
+        below the mean (oscillation)
+      B: One point beyond 3σ (Outlier); 9 consecutive points on the same side of the
+        mean (shift/bias)
+      C: Four points in a row increasing or decreasing (Trend); 6 consecutive points
+        within 1σ (Clustering)
+      D: Eight points beyond 1σ on alternating sides (Variance); 5 consecutive points
+        forming a pattern (Sequence)
+    correct: B
+  application:
+    scenario: Your team is monitoring API response times using a control chart with
+      mean=200ms and standard deviation=50ms. Over the past 15 data points, you notice
+      that 14 consecutive measurements have been alternating between values above
+      and below the mean (e.g., 220ms, 180ms, 230ms, 170ms, etc.), creating a zigzag
+      pattern.
+    anchor_prompt: using Nelson Rules
+    paraphrase_prompt: to systematically detect whether this alternating pattern indicates
+      a non-random process issue that requires investigation
+    options:
+      A: Focus only on whether any individual points exceed 3 standard deviations
+        from the mean, as this is the most reliable indicator of process problems
+      B: Flag this as a special cause violation since 14 alternating points up and
+        down indicates systematic oscillation that suggests an assignable cause
+      C: Wait for at least 9 consecutive points on the same side of the mean before
+        concluding there is a process shift requiring attention
+      D: Check if 4 out of 5 consecutive points fall beyond 1 standard deviation on
+        the same side before taking corrective action
+    correct: B
diff --git a/evaluations/specs/owasp-top-10.yaml b/evaluations/specs/owasp-top-10.yaml
new file mode 100644
index 0000000..9d8c85a
--- /dev/null
+++ b/evaluations/specs/owasp-top-10.yaml
@@ -0,0 +1,42 @@
+anchor: owasp-top-10
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "OWASP Top 10"?
+
+      '
+    options:
+      A: A regularly updated list of the ten most critical web application security
+        risks, covering threats like injection, broken access control, and cryptographic
+        failures
+      B: A comprehensive framework for assessing software reliability and safety integrity
+        levels in critical systems with four distinct SIL classifications
+      C: An industry standard methodology for evaluating cybersecurity maturity across
+        organizational processes with five progressive capability levels
+      D: A systematic approach to privacy threat modeling that identifies linkability,
+        identifiability, and disclosure risks in software systems
+    correct: A
+  application:
+    scenario: Your team is conducting a security review of a web application that
+      handles customer financial data. During the assessment, you discover that user
+      input from web forms is directly concatenated into SQL queries without validation,
+      the application uses default database credentials, and sensitive customer data
+      is stored in plain text. The development team needs to prioritize which security
+      issues to address first.
+    anchor_prompt: using OWASP Top 10
+    paraphrase_prompt: prioritize these security vulnerabilities based on established
+      web application security risk categories
+    options:
+      A: Focus on the plain text storage issue first since data encryption is the
+        foundation of all security, then address input validation, and finally update
+        default credentials during the next major release cycle.
+      B: Address the SQL injection vulnerability first (A03 - Injection), then fix
+        the plain text storage (A02 - Cryptographic Failures), and finally remediate
+        the default credentials (A05 - Security Misconfiguration).
+      C: Implement comprehensive logging and monitoring capabilities first to detect
+        future attacks, then gradually address the technical vulnerabilities based
+        on development team availability and sprint capacity.
+      D: Prioritize fixing the default credentials first since they provide the easiest
+        attack vector, then address data encryption, and finally implement input validation
+        as a long-term security enhancement.
+    correct: B
diff --git a/evaluations/specs/plain-english-strunk-white.yaml b/evaluations/specs/plain-english-strunk-white.yaml
new file mode 100644
index 0000000..812b902
--- /dev/null
+++ b/evaluations/specs/plain-english-strunk-white.yaml
@@ -0,0 +1,50 @@
+anchor: plain-english-strunk-white
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Plain English according to Strunk
+      & White"?
+
+      '
+    options:
+      A: Write with a clear hierarchical structure where conclusions come first followed
+        by supporting arguments, using logical groupings that allow readers to understand
+        the main point before diving into details — start with the answer, then provide
+        the reasoning that led to that conclusion.
+      B: Every word in a sentence should serve a purpose; cut words that add bulk
+        without adding meaning — "the fact that" → "that", "owing to the fact that"
+        → "since"; prefer active constructions over passive; active voice is more
+        direct, vigorous, and concise — "the dog bit the man" not "the man was bitten
+        by the dog"
+      C: Use simple, everyday vocabulary and short sentences that can be understood
+        by the general public, avoiding technical jargon, complex grammatical structures,
+        and industry-specific terminology — choose 'help' over 'facilitate', 'use'
+        over 'utilize', and 'show' over 'demonstrate'.
+      D: Organize information in a bottom-line-up-front approach where the most important
+        message appears at the beginning, followed by supporting details arranged
+        in decreasing order of importance — present key findings first, then provide
+        the analysis and background that supports those conclusions.
+    correct: B
+  application:
+    scenario: A software engineer is reviewing API documentation that describes error
+      handling procedures. The current draft contains several sentences that feel
+      wordy and unclear, making it difficult for developers to quickly understand
+      what actions to take when errors occur.
+    anchor_prompt: using Plain English according to Strunk & White
+    paraphrase_prompt: to make the error handling documentation as clear, direct,
+      and concise as possible for developer users
+    options:
+      A: Add more detailed explanations and qualifying phrases like 'generally speaking'
+        and 'in most cases' to ensure developers understand the nuanced conditions
+        under which different error handling approaches might be considered appropriate.
+      B: Remove unnecessary words, use active voice, and place the most important
+        action at the end of each sentence. Replace 'In the event that an error occurs'
+        with 'When an error occurs' and 'The system will be restarted by the administrator'
+        with 'The administrator restarts the system.'
+      C: Include comprehensive background context about why each error might occur,
+        using rich descriptive language and multiple adjectives to paint a complete
+        picture of potential system states and failure modes.
+      D: Restructure sentences to use passive voice consistently, add transitional
+        phrases between concepts, and include apologetic language like 'unfortunately'
+        and 'regrettably' to acknowledge the inconvenience of errors.
+    correct: B
diff --git a/evaluations/specs/prd.yaml b/evaluations/specs/prd.yaml
new file mode 100644
index 0000000..d25c7ce
--- /dev/null
+++ b/evaluations/specs/prd.yaml
@@ -0,0 +1,41 @@
+anchor: prd
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "PRD"?
+
+      '
+    options:
+      A: Systematic prioritization framework using Must have, Should have, Could have,
+        and Won't have categories to rank feature requirements based on business value
+        and urgency
+      B: Clear articulation of the problem to be solved and the target users; measurable
+        outcomes that define what "done" looks like (KPIs, OKRs)
+      C: Visual representation of user activities and tasks arranged chronologically
+        to identify gaps, priorities, and release planning opportunities for product
+        development
+      D: Structured template format capturing user needs as role-based scenarios with
+        acceptance criteria to ensure requirements are testable and implementable
+        by development teams
+    correct: B
+  application:
+    scenario: Your startup is building a new mobile app for freelance project management.
+      The engineering team keeps asking clarifying questions about features, the design
+      team is unsure about user workflows, and marketing needs to understand the target
+      audience. Stakeholders have conflicting ideas about what should be included
+      in the first release.
+    anchor_prompt: using PRD
+    paraphrase_prompt: What document should you create to align all teams and provide
+      a comprehensive foundation for product development?
+    options:
+      A: Create a technical architecture document that outlines the system components,
+        database schema, and API specifications to guide the engineering team's implementation
+        decisions.
+      B: Write a comprehensive document that defines the problem statement, target
+        users, success metrics, functional requirements, scope boundaries, and constraints
+        to align all stakeholders.
+      C: Develop a project timeline with detailed user stories, acceptance criteria,
+        and sprint planning to coordinate development activities across all teams.
+      D: Conduct stakeholder interviews and create a competitive analysis report with
+        market research findings to inform strategic product positioning decisions.
+    correct: B
diff --git a/evaluations/specs/problem-space-nvc.yaml b/evaluations/specs/problem-space-nvc.yaml
new file mode 100644
index 0000000..fb99de0
--- /dev/null
+++ b/evaluations/specs/problem-space-nvc.yaml
@@ -0,0 +1,47 @@
+anchor: problem-space-nvc
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Nonviolent Communication (Rosenberg)"?
+
+      '
+    options:
+      A: A structured approach to code reviews that emphasizes constructive feedback
+        through specific examples and actionable suggestions rather than general criticism.
+      B: Concrete, objective facts without evaluation or judgment. "The deploy failed
+        three times this week" instead of "The deploy always fails."; emotions arising
+        from observations. "I feel frustrated" instead of "This is frustrating."
+      C: A conflict resolution methodology that focuses on identifying root causes
+        of team disagreements and establishing clear communication protocols between
+        stakeholders.
+      D: An agile communication framework that prioritizes transparent status updates
+        and eliminates ambiguous language in sprint retrospectives and daily standups.
+    correct: B
+  application:
+    scenario: 'A product manager sends an email to the development team: ''The search
+      feature is completely broken and users are complaining constantly. You developers
+      never test anything properly before releasing. This is unacceptable and needs
+      to be fixed immediately.'' The team lead wants to respond in a way that addresses
+      the concerns while maintaining a collaborative relationship.'
+    anchor_prompt: using Nonviolent Communication (Rosenberg)
+    paraphrase_prompt: Write the team lead's reply using a structured approach that
+      separates facts from judgments, acknowledges emotions, identifies underlying
+      needs, and makes specific actionable requests.
+    options:
+      A: I understand you're frustrated with the search feature. However, saying we
+        'never test properly' isn't accurate - we do have testing procedures. Let's
+        schedule a meeting to discuss how we can improve our QA process and address
+        the user complaints more systematically.
+      B: I notice the search feature has generated 15 user complaints this week. I
+        feel concerned because I value delivering quality software that meets user
+        needs. Could we schedule a 30-minute meeting tomorrow to review the specific
+        issues and create an action plan together?
+      C: Thanks for bringing this to our attention. You're right that the search feature
+        has issues and we need to address them quickly. I'll have the team prioritize
+        this as our top bug fix and we'll provide daily updates on our progress until
+        it's resolved.
+      D: I appreciate your passion for quality, and I share your concern about user
+        experience. While the search feature does have problems, let's focus on solutions
+        rather than blame. What specific search scenarios are failing, and what would
+        success look like from your perspective?
+    correct: B
diff --git a/evaluations/specs/property-based-testing.yaml b/evaluations/specs/property-based-testing.yaml
new file mode 100644
index 0000000..4330600
--- /dev/null
+++ b/evaluations/specs/property-based-testing.yaml
@@ -0,0 +1,36 @@
+anchor: property-based-testing
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Property-Based Testing"?
+
+      '
+    options:
+      A: Testing software by validating that object properties and attributes maintain
+        expected values throughout execution
+      B: Invariants that should always hold; automatic test data creation
+      C: A testing methodology that focuses on verifying ownership and access rights
+        of system resources and data
+      D: Unit testing approach that examines individual class properties and their
+        getter/setter method implementations
+    correct: B
+  application:
+    scenario: You're developing a financial calculator library with functions for
+      compound interest, loan payments, and currency conversions. The library will
+      be used by multiple client applications, and accuracy is critical since even
+      small rounding errors could accumulate into significant financial discrepancies
+      over time.
+    anchor_prompt: using Property-Based Testing
+    paraphrase_prompt: What testing approach would best validate that your financial
+      calculations maintain mathematical correctness across all possible input ranges?
+    options:
+      A: Write comprehensive unit tests covering typical financial scenarios like
+        30-year mortgages, common interest rates, and standard loan amounts
+      B: Define mathematical invariants like 'a round-trip currency conversion returns
+        the original amount within rounding tolerance' and generate thousands of
+        random valid inputs to verify these properties always hold
+      C: Create integration tests that simulate real user workflows by testing complete
+        financial scenarios from input to final calculation output
+      D: Implement regression tests using historical financial data from previous
+        system versions to ensure calculations remain consistent over time
+    correct: B
diff --git a/evaluations/specs/pyramid-principle.yaml b/evaluations/specs/pyramid-principle.yaml
new file mode 100644
index 0000000..3343a5d
--- /dev/null
+++ b/evaluations/specs/pyramid-principle.yaml
@@ -0,0 +1,42 @@
+anchor: pyramid-principle
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Pyramid Principle according
+      to Barbara Minto"?
+
+      '
+    options:
+      A: Hierarchical software architecture pattern where base components support
+        higher-level modules; follows dependency inversion with abstractions at the
+        top layer
+      B: Single key message at the top of the pyramid; situation → complication →
+        question → answer structure for setting context
+      C: Project management framework organizing tasks in ascending priority levels;
+        uses risk assessment → resource allocation → timeline planning → execution
+        phases
+      D: Information architecture methodology structuring content from broad categories
+        to specific details; applies user journey mapping with navigation flow optimization
+    correct: B
+  application:
+    scenario: Your engineering team has discovered a critical security vulnerability
+      in the production system that requires immediate attention and significant resources
+      to fix. The CTO has requested a 10-minute presentation to the executive team
+      explaining the situation and recommending next steps.
+    anchor_prompt: using Pyramid Principle according to Barbara Minto
+    paraphrase_prompt: How should you structure your presentation to maximize clarity
+      and executive buy-in for your recommended solution?
+    options:
+      A: Start with technical details of the vulnerability, explain how it was discovered,
+        walk through potential attack vectors, then conclude with your recommended
+        fix and resource requirements.
+      B: Lead with your recommendation to allocate resources for immediate patching,
+        then explain the current security risk situation, the complications it creates
+        for business operations, and supporting evidence for your proposed solution.
+      C: Present three possible solutions with pros and cons for each, provide detailed
+        technical analysis of the vulnerability, then ask the executives to vote on
+        which approach they prefer.
+      D: Begin by establishing credibility through your team's security expertise,
+        chronologically explain how the vulnerability was discovered, detail the investigation
+        process, then present findings and recommendations.
+    correct: B
diff --git a/evaluations/specs/sanity-check.yaml b/evaluations/specs/sanity-check.yaml
new file mode 100644
index 0000000..c0b7c31
--- /dev/null
+++ b/evaluations/specs/sanity-check.yaml
@@ -0,0 +1,17 @@
+anchor: sanity-check
+tier: null
+
+# Sanity check: no option is correct (the answer is 42), so `correct` is X,
+# a letter that matches none of the options A-D. Every model MUST score 0%.
+# If any model scores >0%, the scoring pipeline has a bug.
+
+questions:
+  recognition:
+    question: |
+      What is the Answer to the Ultimate Question of Life, the Universe, and Everything?
+    options:
+      A: "17"
+      B: "23"
+      C: "99"
+      D: "256"
+    correct: X
diff --git a/evaluations/specs/semantic-versioning.yaml b/evaluations/specs/semantic-versioning.yaml
new file mode 100644
index 0000000..0bc881e
--- /dev/null
+++ b/evaluations/specs/semantic-versioning.yaml
@@ -0,0 +1,32 @@
+anchor: semantic-versioning
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Semantic Versioning (SemVer)"?
+
+      '
+    options:
+      A: Version control system that automatically tracks semantic changes in code
+        structure and meaning across different branches
+      B: A versioning scheme using MAJOR.MINOR.PATCH where MAJOR signals breaking
+        changes, MINOR signals new features, and PATCH signals bug fixes
+      C: Development methodology that prioritizes meaningful variable and function
+        naming conventions to improve code readability
+      D: Documentation standard that requires detailed explanations of API functionality
+        and business logic for each software release
+    correct: B
+  application:
+    scenario: You maintain a JavaScript authentication library that currently has
+      version 2.3.1. You need to release an update that adds a new optional parameter
+      to an existing login method, includes several bug fixes for token validation,
+      and removes a deprecated method that was marked for removal six months ago.
+    anchor_prompt: using Semantic Versioning (SemVer)
+    paraphrase_prompt: determine the appropriate version number for this release that
+      properly communicates the impact of changes to library consumers
+    options:
+      A: 2.3.2 - since the new parameter is optional and doesn't break existing code
+      B: 3.0.0 - because removing the deprecated method constitutes a breaking change
+      C: 2.4.0 - to reflect the addition of new functionality with the optional parameter
+      D: 2.3.1-update.1 - using pre-release notation to indicate multiple types of
+        changes
+    correct: B
diff --git a/evaluations/specs/socratic-method.yaml b/evaluations/specs/socratic-method.yaml
new file mode 100644
index 0000000..3d21b8f
--- /dev/null
+++ b/evaluations/specs/socratic-method.yaml
@@ -0,0 +1,35 @@
+anchor: socratic-method
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Socratic Method"?
+
+      '
+    options:
+      A: Systematic approach to software development that emphasizes iterative refinement
+        through structured peer review and collaborative problem-solving sessions
+      B: Lead learners to insights through questions rather than direct instruction;
+        cross-examination technique to expose contradictions in beliefs
+      C: Teaching methodology that breaks complex problems into smaller components
+        and builds understanding through sequential presentation of foundational concepts
+      D: Architectural pattern that separates concerns by organizing code into distinct
+        layers with well-defined interfaces and dependency injection principles
+    correct: B
+  application:
+    scenario: During a code review, a senior developer notices that a junior developer
+      has implemented a caching solution that could cause data consistency issues
+      in a distributed system. The junior developer seems confident in their approach
+      and hasn't considered the potential problems.
+    anchor_prompt: using Socratic Method
+    paraphrase_prompt: to help the junior developer discover the potential issues
+      through guided inquiry rather than direct criticism
+    options:
+      A: Point out the specific data consistency problems and explain why the current
+        caching approach won't work in a distributed environment.
+      B: Ask questions like 'What happens when multiple services update the same cached
+        data?' and 'How does your cache handle network partitions?'
+      C: Suggest they research distributed caching patterns and come back with alternative
+        solutions before proceeding with the implementation.
+      D: Approve the code for now but schedule a follow-up meeting to discuss distributed
+        systems architecture and caching strategies.
+    correct: B
diff --git a/evaluations/specs/sota.yaml b/evaluations/specs/sota.yaml
new file mode 100644
index 0000000..0d22440
--- /dev/null
+++ b/evaluations/specs/sota.yaml
@@ -0,0 +1,36 @@
+anchor: sota
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "SOTA (State-of-the-Art)"?
+
+      '
+    options:
+      A: A standardized framework for documenting software architecture decisions
+        and technical specifications across development teams
+      B: Focus on the most current, cutting-edge methods and techniques; reference
+        current research papers, benchmarks, and empirical results
+      C: A methodology for systematic testing and validation of software systems against
+        predefined quality assurance benchmarks
+      D: An agile development approach that emphasizes iterative prototyping and continuous
+        integration of emerging technologies
+    correct: B
+  application:
+    scenario: Your team is building a new document search system for a legal firm
+      that needs to handle complex queries across millions of legal documents. The
+      current keyword-based search is inadequate, and you need to implement semantic
+      search capabilities that can understand legal terminology and context.
+    anchor_prompt: using SOTA (State-of-the-Art)
+    paraphrase_prompt: What approach would ensure you're implementing the most current
+      and highest-performing solution based on recent research and benchmarks?
+    options:
+      A: Implement a well-documented TF-IDF approach with legal domain customizations,
+        as it's proven reliable and easier to maintain than newer experimental methods.
+      B: Research recent papers on semantic search benchmarks, compare transformer-based
+        embedding models like BGE and E5, and implement the approach showing best
+        performance on legal document retrieval tasks.
+      C: Use the same semantic search architecture that worked well in your previous
+        project, making minor adjustments for the legal domain and document types.
+      D: Follow the semantic search tutorial from the framework documentation, as
+        it represents the vendor's recommended best practices for production systems.
+    correct: B
diff --git a/evaluations/specs/spc.yaml b/evaluations/specs/spc.yaml
new file mode 100644
index 0000000..13aefc9
--- /dev/null
+++ b/evaluations/specs/spc.yaml
@@ -0,0 +1,35 @@
+anchor: spc
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "SPC (Statistical Process Control)"?
+
+      '
+    options:
+      A: Structured programming methodology that controls code execution flow through
+        systematic elimination of goto statements and unstructured branching
+      B: Systematic statistical monitoring of running processes; common-cause variation
+        is inherent, random fluctuation — stable and predictable
+      C: Statistical performance computing framework that optimizes system resources
+        by analyzing computational workload patterns and predicting bottlenecks
+      D: Software process certification standard that validates development methodologies
+        through rigorous documentation and compliance verification procedures
+    correct: B
+  application:
+    scenario: Your web application's API response times have been averaging 250ms
+      over the past month, but this week you've noticed some responses taking 400-500ms.
+      The development team wants to determine if this represents a real performance
+      degradation that needs investigation or just normal fluctuation.
+    anchor_prompt: using SPC (Statistical Process Control)
+    paraphrase_prompt: to systematically distinguish between normal process variation
+      and signals that indicate a real change requiring intervention
+    options:
+      A: Set a fixed threshold at 300ms and alert whenever any single response exceeds
+        this limit, then investigate each alert individually
+      B: Plot response times on a control chart with calculated control limits, then
+        apply detection rules to identify when the process shows special cause variation
+      C: Compare this week's average response time to last week's using a t-test and
+        investigate if the difference is statistically significant
+      D: Monitor the 95th percentile response time and trigger an investigation whenever
+        it increases by more than 10% from the baseline
+    correct: B
diff --git a/evaluations/specs/stride.yaml b/evaluations/specs/stride.yaml
new file mode 100644
index 0000000..69b9e74
--- /dev/null
+++ b/evaluations/specs/stride.yaml
@@ -0,0 +1,41 @@
+anchor: stride
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "STRIDE Threat Model"?
+
+      '
+    options:
+      A: A systematic approach for identifying and categorizing the top ten most critical
+        web application security risks; maintained by OWASP foundation; focuses on
+        injection flaws, broken authentication, and sensitive data exposure vulnerabilities
+      B: Impersonating another user, process, or system to gain unauthorized access;
+        mitigated by strong authentication; unauthorized modification of data in transit
+        or at rest; mitigated by integrity controls, digital signatures, and access
+        controls
+      C: A defensive security framework that assumes breach scenarios and implements
+        zero-trust principles; emphasizes continuous verification, least privilege
+        access, and micro-segmentation to limit lateral movement within networks
+      D: A risk assessment methodology for evaluating security controls in regulated
+        environments; provides quantitative scoring based on asset criticality, threat
+        likelihood, and business impact to prioritize remediation efforts
+    correct: B
+  application:
+    scenario: Your team is designing a new online banking application that handles
+      user authentication, financial transactions, and account data. During the security
+      design review, you need to systematically identify potential security threats
+      that could affect different components of the system.
+    anchor_prompt: using STRIDE Threat Model
+    paraphrase_prompt: systematically categorize the security threats by analyzing
+      each component against six fundamental threat categories
+    options:
+      A: Focus primarily on external attack vectors like SQL injection and cross-site
+        scripting, then assess the likelihood and business impact of each vulnerability
+        type
+      B: Examine each system component for Spoofing, Tampering, Repudiation, Information
+        Disclosure, Denial of Service, and Elevation of Privilege threats
+      C: Create attack trees starting from high-value assets, then trace backward
+        through all possible attack paths that could compromise those assets
+      D: Map all system entry points and data flows, then apply a risk rating matrix
+        based on confidentiality, integrity, and availability requirements
+    correct: B
diff --git a/evaluations/specs/swot.yaml b/evaluations/specs/swot.yaml
new file mode 100644
index 0000000..ef6edee
--- /dev/null
+++ b/evaluations/specs/swot.yaml
@@ -0,0 +1,45 @@
+anchor: swot
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "SWOT"?
+
+      '
+    options:
+      A: A prioritization framework that categorizes requirements into Should have,
+        Won't have this time, Optional features, and Time-critical deliverables to
+        manage project scope effectively
+      B: Internal positive attributes and resources that give the subject an advantage
+        over others; internal negative attributes or limitations that place the subject
+        at a disadvantage relative to others
+      C: A visual mapping technique that plots the evolution of components along a
+        value chain from genesis to commodity to identify strategic positioning and
+        dependencies
+      D: A decision-making matrix that systematically evaluates multiple alternatives
+        against weighted criteria by scoring each option to determine the optimal
+        solution objectively
+    correct: B
+  application:
+    scenario: Your team is evaluating whether to migrate from a monolithic architecture
+      to microservices for your e-commerce platform. The monolith has served you well
+      for 3 years but scaling challenges are emerging. You need to present a comprehensive
+      analysis to stakeholders covering all key factors that could influence this
+      architectural decision.
+    anchor_prompt: using SWOT
+    paraphrase_prompt: What framework should you use to systematically evaluate both
+      internal capabilities and external factors that could impact this architectural
+      migration decision?
+    options:
+      A: Create a decision matrix listing technical requirements as rows and architecture
+        options as columns, scoring each combination on feasibility and impact to
+        determine the optimal choice.
+      B: Analyze internal strengths and weaknesses of your current capabilities alongside
+        external opportunities and threats in the market to create a comprehensive
+        strategic assessment.
+      C: Map out the current value chain and evolution stages of each system component
+        to identify which parts are commodities versus differentiators before making
+        architectural changes.
+      D: Categorize all migration requirements into must-have, should-have, could-have,
+        and won't-have priorities to focus development effort on the most critical
+        architectural changes first.
+    correct: B
diff --git a/evaluations/specs/tdd-chicago-school.yaml b/evaluations/specs/tdd-chicago-school.yaml
new file mode 100644
index 0000000..0370501
--- /dev/null
+++ b/evaluations/specs/tdd-chicago-school.yaml
@@ -0,0 +1,39 @@
+anchor: tdd-chicago-school
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "TDD, Chicago School"?
+
+      '
+    options:
+      A: Write tests first, then implement code to pass those tests; focus on behavior
+        verification through extensive mocking of all dependencies
+      B: Verify the state of objects after operations; use real objects whenever possible;
+        mock only external dependencies
+      C: Design software architecture by defining interfaces first; use dependency
+        injection to isolate components and enable comprehensive unit testing
+      D: Develop code in small iterations with continuous refactoring; emphasize pair
+        programming and collective code ownership practices
+    correct: B
+  application:
+    scenario: You're developing an e-commerce order processing system with complex
+      business rules for discounts, taxes, and inventory management. The team needs
+      to implement the core OrderCalculator class that handles pricing logic, integrating
+      with external payment and inventory services.
+    anchor_prompt: using TDD, Chicago School
+    paraphrase_prompt: What approach should you take to develop and test this core
+      business logic component?
+    options:
+      A: Start by mocking all dependencies including the payment service, inventory
+        service, and database connections, then write tests that verify method calls
+        and interactions between components.
+      B: Begin with tests for the core pricing calculations using real domain objects,
+        mock only the external payment and inventory services, and let the design
+        emerge through refactoring cycles.
+      C: Create comprehensive interface definitions and mock implementations for all
+        collaborating objects first, then build the OrderCalculator by specifying
+        expected behavior through interaction testing.
+      D: Write integration tests that cover the entire order flow from UI to database,
+        then extract unit tests for individual components based on the integration
+        test scenarios.
+    correct: B
diff --git a/evaluations/specs/tdd-london-school.yaml b/evaluations/specs/tdd-london-school.yaml
new file mode 100644
index 0000000..56efbbb
--- /dev/null
+++ b/evaluations/specs/tdd-london-school.yaml
@@ -0,0 +1,40 @@
+anchor: tdd-london-school
+tier: 3
+
+questions:
+  recognition:
+    question: |
+      Which of the following best describes "TDD, London School"?
+    options:
+      A: State-based testing with real collaborating objects and minimal mocking
+      B: Outside-in development with mock-heavy, interaction-based testing
+      C: Acceptance testing using Given/When/Then scenario specifications
+      D: Exploratory testing focused on edge cases and unspecified behavior
+    correct: B
+
+  application:
+    scenario: |
+      You are reviewing a pull request. The code adds a new OrderService
+      that calls PaymentGateway and InventoryService.
+      What is your primary testing recommendation?
+    anchor_prompt: "using TDD, London School principles"
+    paraphrase_prompt: "Write isolated tests for the service layer"
+    options:
+      A: Write a test that processes a real order end-to-end through all three services
+      B: Write a test that mocks PaymentGateway and InventoryService to verify OrderService interactions
+      C: Write a test that checks the database state after processing an order
+      D: Skip unit tests and write an integration test with a test database
+    correct: B
+
+  consistency:
+    variants:
+      - 'Which proponent is most closely associated with "TDD, London School"?'
+      - 'Which proponent is most closely associated with "Mockist TDD"?'
+      - 'Which proponent is most closely associated with "Outside-In TDD"?'
+    language_variant: 'Welcher Proponent wird am engsten mit "TDD, London School" assoziiert?'
+    options:
+      A: Kent Beck
+      B: Steve Freeman
+      C: Dan North
+      D: Martin Fowler
+    correct: B
diff --git a/evaluations/specs/testing-pyramid.yaml b/evaluations/specs/testing-pyramid.yaml
new file mode 100644
index 0000000..77ff27c
--- /dev/null
+++ b/evaluations/specs/testing-pyramid.yaml
@@ -0,0 +1,35 @@
+anchor: testing-pyramid
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Testing Pyramid"?
+
+      '
+    options:
+      A: A hierarchical structure where system tests form the base, integration tests
+        the middle, and unit tests the top layer
+      B: Three layers; more unit tests, fewer e2e tests
+      C: A risk assessment framework that categorizes software defects into three
+        priority levels based on severity impact
+      D: A test organization model where manual testing supports automated testing
+        which supports exploratory testing at the apex
+    correct: B
+  application:
+    scenario: Your team is developing an e-commerce platform and currently has 20
+      unit tests, 50 integration tests, and 80 end-to-end tests. The CI/CD pipeline
+      takes 45 minutes to run, and developers are frustrated with slow feedback on
+      their commits. Management wants to improve development velocity while maintaining
+      quality.
+    anchor_prompt: using Testing Pyramid
+    paraphrase_prompt: What test distribution strategy would best optimize feedback
+      speed while maintaining comprehensive coverage?
+    options:
+      A: Increase all test types proportionally to 40 unit tests, 100 integration
+        tests, and 160 end-to-end tests for better coverage
+      B: Restructure to 200 unit tests, 40 integration tests, and 15 end-to-end tests,
+        moving logic validation to faster test layers
+      C: Focus primarily on integration tests with 30 unit tests, 120 integration
+        tests, and 20 end-to-end tests for balanced coverage
+      D: Maintain current ratios but optimize each test type for speed without changing
+        the overall distribution strategy
+    correct: B
diff --git a/evaluations/specs/timtowtdi.yaml b/evaluations/specs/timtowtdi.yaml
new file mode 100644
index 0000000..e545ac1
--- /dev/null
+++ b/evaluations/specs/timtowtdi.yaml
@@ -0,0 +1,36 @@
+anchor: timtowtdi
+tier: 1
+questions:
+  recognition:
+    question: 'Which of the following best describes "TIMTOWTDI"?
+
+      '
+    options:
+      A: A principle that problems can have multiple equally valid solutions, favoring
+        flexibility over prescription
+      B: A testing strategy that combines multiple test types to maximize coverage
+      C: A design pattern that delegates decisions to the most informed component
+        at runtime
+      D: A refactoring approach that transforms complex code into simpler equivalent
+        forms step by step
+    correct: A
+  application:
+    scenario: 'Your team is implementing user authentication for a web application.
+      Three developers have proposed different approaches: JWT tokens with Redis caching,
+      session-based authentication with database storage, and OAuth integration with
+      a third-party provider. All three solutions meet the technical requirements
+      and security standards.'
+    anchor_prompt: using TIMTOWTDI
+    paraphrase_prompt: How should the team handle this situation where multiple valid
+      technical approaches exist?
+    options:
+      A: Select the most popular industry standard approach to ensure long-term maintainability
+        and reduce technical risk.
+      B: Evaluate each approach's trade-offs in your specific context, discuss the
+        implications with the team, and choose based on your constraints rather than
+        dismissing valid alternatives.
+      C: Choose the approach proposed by the most senior developer to maintain team
+        hierarchy and avoid lengthy technical debates.
+      D: Implement the simplest solution first, then refactor to a more sophisticated
+        approach once you have more data about user requirements.
+    correct: B
diff --git a/evaluations/specs/todotxt-flavoured-markdown.yaml b/evaluations/specs/todotxt-flavoured-markdown.yaml
new file mode 100644
index 0000000..658d26e
--- /dev/null
+++ b/evaluations/specs/todotxt-flavoured-markdown.yaml
@@ -0,0 +1,42 @@
+anchor: todotxt-flavoured-markdown
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "todo.txt-flavoured Markdown"?
+
+      '
+    options:
+      A: A markup syntax that extends standard Markdown with project management features
+        using `@context` and `+project` tags, prioritized by numerical prefixes like
+        `1.`, `2.`, `3.`
+      B: Standard GitHub-flavoured markdown syntax (`- [ ]` uncompleted, `- [x]` completed);
+        uses todo.txt priority notation `(A)`, `(B)`, `(C)` where `(A)` is highest
+        priority
+      C: A documentation format that combines reStructuredText syntax with Kanban-style
+        workflow markers (`TODO:`, `DOING:`, `DONE:`) and uses hashtag priority levels
+        `#high`, `#medium`, `#low`
+      D: An issue tracking notation that merges JIRA-style ticket formatting with
+        plain text using bracketed status indicators `[OPEN]`, `[CLOSED]` and priority
+        weights expressed as `{P1}`, `{P2}`, `{P3}`
+    correct: B
+  application:
+    scenario: Your team is managing multiple feature development streams and bug fixes
+      across different projects. Team members need to track tasks that vary in priority,
+      belong to different projects, require specific tools or contexts, and have various
+      deadlines.
+    anchor_prompt: using todo.txt-flavoured Markdown
+    paraphrase_prompt: How should you structure your task list to combine readable
+      markdown formatting with systematic priority levels, project groupings, context
+      indicators, and searchable metadata?
+    options:
+      A: 'Use standard bullet points with custom formatting like `* HIGH: [Website]
+        Fix login bug - Computer work - Due: Feb 5th` and mark completed items by
+        moving them to a separate section'
+      B: Use checkbox syntax with priority letters, plus-prefixed project tags, at-prefixed
+        contexts, and key:value pairs like `- [ ] (A) Fix login bug +website @computer
+        due:2024-02-05`
+      C: Create separate markdown files for each priority level and use YAML frontmatter
+        to specify project, context, and due dates for each task list
+      D: 'Use numbered lists with emoji indicators for priority (🔥⚡⏰) and hashtag-style
+        tags like `1. 🔥 Fix login bug #website #computer #due-feb-5`'
+    correct: B
diff --git a/evaluations/specs/user-story-mapping.yaml b/evaluations/specs/user-story-mapping.yaml
new file mode 100644
index 0000000..08618d9
--- /dev/null
+++ b/evaluations/specs/user-story-mapping.yaml
@@ -0,0 +1,38 @@
+anchor: user-story-mapping
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "User Story Mapping"?
+
+      '
+    options:
+      A: Visual representation of user personas mapped to specific system requirements
+        and acceptance criteria
+      B: Horizontal arrangement of user activities; high-level tasks users perform
+      C: Hierarchical breakdown of software features organized by technical complexity
+        and development priority
+      D: Sequential workflow diagram showing user interactions and system responses
+        throughout the application lifecycle
+    correct: B
+  application:
+    scenario: Your team is building a new e-commerce mobile app and has collected
+      47 user stories in the backlog. The product owner is struggling to explain the
+      release strategy to stakeholders, and developers are confused about how individual
+      stories connect to the overall user experience.
+    anchor_prompt: using User Story Mapping
+    paraphrase_prompt: What approach would best help the team visualize the complete
+      user journey and plan incremental releases?
+    options:
+      A: Group stories by technical complexity and implement the easiest ones first,
+        then present a demo to stakeholders showing completed features in order of
+        development difficulty.
+      B: Arrange stories horizontally by user activities in chronological order, then
+        stack them vertically by priority to identify thin slices of end-to-end functionality
+        for each release.
+      C: Create a detailed project timeline with all stories assigned to specific
+        sprints, then hold stakeholder meetings to review the Gantt chart and adjust
+        dates based on feedback.
+      D: Categorize stories by user role and estimate story points for each category,
+        then create a burndown chart to track progress and communicate velocity to
+        stakeholders.
+    correct: B
diff --git a/evaluations/specs/wardley-mapping.yaml b/evaluations/specs/wardley-mapping.yaml
new file mode 100644
index 0000000..818df78
--- /dev/null
+++ b/evaluations/specs/wardley-mapping.yaml
@@ -0,0 +1,38 @@
+anchor: wardley-mapping
+tier: 3
+questions:
+  recognition:
+    question: 'Which of the following best describes "Wardley Mapping"?
+
+      '
+    options:
+      A: Map system dependencies from infrastructure up; requirements → design → implementation
+        → deployment
+      B: Map components from user needs down; genesis → custom → product → commodity
+      C: Map stakeholder relationships outward; internal → partners → customers →
+        market segments
+      D: Map technical debt from legacy systems; identified → prioritized → refactored
+        → modernized
+    correct: B
+  application:
+    scenario: Your fintech startup is deciding whether to build a custom payment processing
+      system, integrate with an existing payment API like Stripe, or partner with
+      a traditional payment processor. The team is debating the strategic implications
+      of each approach for the company's long-term competitive position.
+    anchor_prompt: using Wardley Mapping
+    paraphrase_prompt: What strategic approach should guide this build-vs-buy-vs-partner
+      decision?
+    options:
+      A: Conduct a cost-benefit analysis comparing the total cost of ownership for
+        each option over a 3-year period, then select the lowest-cost solution that
+        meets current technical requirements.
+      B: Map the payment processing component's position on the evolution axis from
+        genesis to commodity, then choose build for genesis/custom stages and buy/partner
+        for product/commodity stages.
+      C: Survey competitors to see what payment solutions they use, then select the
+        same approach as the most successful competitor to ensure market alignment
+        and reduce strategic risk.
+      D: Evaluate each option based on development team capacity and timeline constraints,
+        prioritizing the approach that can be implemented fastest while maintaining
+        acceptable quality standards.
+    correct: B